feat(mp-quiz): add MP quiz tab and DB helpers; add design and plan docs

main
Sven Geboers 1 month ago
parent b09e580f65
commit eb73275f32
  1. 388  database.py
  2. 287  explorer.py
  3. 61   pipeline/text_pipeline.py
  4. 2    pyproject.toml
  5. 19   scripts/rerun_embeddings.py
  6. 188  scripts/sync_motion_content.py
  7. 130  tests/test_syncfeed_parsers.py
  8. 119  thoughts/blog-post-political-compass.html
  9. 117  thoughts/blog-post-political-compass.md
  10. 125  thoughts/explorer/top_svd_importance.json
  11. 605  thoughts/explorer/top_svd_top_motions.json
  12. 51   thoughts/ledgers/CONTINUITY_stemwijzer.md
  13. 549  thoughts/ledgers/audit_events.json
  14. 29   thoughts/ledgers/qa_similarity_20260323T194335Z.json
  15. 113  thoughts/shared/designs/2026-03-24-welk-tweede-kamerlid-ben-jij-design.md
  16. 197  thoughts/shared/plans/2026-03-24-welk-tweede-kamerlid-ben-jij-plan.md
  17. 88   uv.lock

@ -59,11 +59,24 @@ class MotionDatabase:
winning_margin FLOAT,
controversy_score FLOAT,
layman_explanation TEXT,
externe_identifier TEXT,
body_text TEXT,
url TEXT UNIQUE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (id)
)
""")
# Ensure older databases get new columns added without recreating table
try:
conn.execute(
"ALTER TABLE motions ADD COLUMN IF NOT EXISTS externe_identifier TEXT"
)
conn.execute("ALTER TABLE motions ADD COLUMN IF NOT EXISTS body_text TEXT")
except Exception:
# Best-effort: if ALTER fails for any reason, continue without stopping app startup
_logger.debug(
"Could not ALTER motions table to add new columns (may already exist or unsupported)."
)
conn.execute("""
CREATE TABLE IF NOT EXISTS user_sessions (
@ -167,40 +180,9 @@ class MotionDatabase:
id INTEGER DEFAULT nextval('similarity_cache_id_seq'),
source_motion_id INTEGER NOT NULL,
target_motion_id INTEGER NOT NULL,
score REAL NOT NULL,
vector_type TEXT NOT NULL,
window_id TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (id)
)
""")
# Embeddings table and sequence (stores vectors as JSON)
conn.execute("""
CREATE SEQUENCE IF NOT EXISTS embeddings_id_seq START 1
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS embeddings (
id INTEGER DEFAULT nextval('embeddings_id_seq'),
motion_id INTEGER NOT NULL,
model TEXT,
vector JSON NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (id)
)
""")
# Similarity cache and sequence (stores only ids and score, no vectors)
conn.execute("""
CREATE SEQUENCE IF NOT EXISTS similarity_cache_id_seq START 1
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS similarity_cache (
id INTEGER DEFAULT nextval('similarity_cache_id_seq'),
source_motion_id INTEGER NOT NULL,
target_motion_id INTEGER NOT NULL,
score FLOAT NOT NULL,
vector_type TEXT NOT NULL,
window_id TEXT,
score FLOAT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (id)
)
@ -230,6 +212,89 @@ class MotionDatabase:
except Exception:
pass
def append_audit_event(
self,
actor_id: Optional[str],
action: str,
target_type: Optional[str] = None,
target_id: Optional[str] = None,
metadata: Optional[Dict] = None,
) -> bool:
"""Record an audit event.
Tries to write to an audit_events table in DuckDB. If that fails (no DB,
missing table, or any other error), it falls back to appending the event to
thoughts/ledgers/audit_events.json for durable inspection.
Returns True when the event was recorded somewhere, False otherwise.
"""
event_id = str(uuid.uuid4())
now = datetime.utcnow().isoformat() + "Z"
payload = {
"id": event_id,
"actor_id": actor_id,
"action": action,
"target_type": target_type,
"target_id": target_id,
"metadata": metadata or {},
"created_at": now,
}
# If duckdb is available try to write to DB first.
try:
if duckdb is not None:
conn = duckdb.connect(self.db_path)
try:
conn.execute(
"INSERT INTO audit_events (id, actor_id, action, target_type, target_id, metadata, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)",
(
payload["id"],
payload["actor_id"],
payload["action"],
payload["target_type"],
payload["target_id"],
json.dumps(payload["metadata"]),
payload["created_at"],
),
)
conn.close()
return True
except Exception as e:
_logger.debug("Could not write audit event to DB: %s", e)
try:
conn.close()
except Exception:
pass
# fall back to ledger file
except Exception:
_logger.debug("DuckDB unavailable when appending audit event")
# Ledger fallback: append to thoughts/ledgers/audit_events.json
try:
from pathlib import Path
ledger_dir = Path("thoughts") / "ledgers"
ledger_dir.mkdir(parents=True, exist_ok=True)
ledger_path = ledger_dir / "audit_events.json"
if ledger_path.exists():
try:
data = json.loads(ledger_path.read_text(encoding="utf-8"))
if not isinstance(data, list):
data = []
except Exception:
data = []
else:
data = []
data.append(payload)
ledger_path.write_text(
json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"
)
return True
except Exception as e:
_logger.error("Failed to record audit event to ledger: %s", e)
return False
def insert_motion(self, motion_data: Dict) -> bool:
"""Insert a new motion into database"""
try:
@ -299,10 +364,12 @@ class MotionDatabase:
return True
except Exception as e:
print(f"Error inserting motion: {e}")
if "conn" in locals():
except Exception:
_logger.exception("Error inserting motion")
try:
conn.close()
except Exception:
pass
return False
def batch_insert_motions(self, motions_data: List[Dict]) -> Tuple[int, int]:
@ -434,10 +501,42 @@ class MotionDatabase:
conn.close()
return [dict(zip(columns, row)) for row in result]
except Exception as e:
print(f"Error querying motions: {e}")
except Exception:
_logger.exception("Error querying motions")
try:
conn.close()
except Exception:
pass
return []
def get_titles_for_ids(self, ids: List[int]) -> Dict[int, Optional[str]]:
"""Return a mapping of motion id -> title for the given ids.
If DuckDB is not available, fall back to an empty mapping.
"""
out: Dict[int, Optional[str]] = {}
try:
if duckdb is None:
return out
conn = duckdb.connect(self.db_path)
placeholders = ",".join("?" for _ in ids)
rows = conn.execute(
f"SELECT id, title FROM motions WHERE id IN ({placeholders})", ids
).fetchall()
conn.close()
for r in rows:
try:
out[int(r[0])] = r[1]
except Exception:
continue
return out
except Exception:
try:
conn.close()
except Exception:
pass
_logger.exception("Error fetching titles for ids")
return out
def create_session(self, total_motions: int = 10) -> str:
"""Create new user session"""
@ -558,6 +657,215 @@ class MotionDatabase:
return sorted(results, key=lambda x: x["agreement_percentage"], reverse=True)
def match_mps_for_votes(
self, user_votes: Dict[int, str], limit: int = 50
) -> List[Dict]:
"""Return per-MP agreement against provided user_votes.
Args:
user_votes: mapping motion_id -> vote token (UI or canonical).
limit: max number of MPs to return.
Returns:
List of dicts: {mp_name, party, matched, overlap, agreement_pct}
Notes:
- Normalizes common UI tokens to canonical DB tokens: 'voor','tegen','onthouden','afwezig'.
- Excludes MPs with overlap == 0.
- Requires DuckDB to be available (raises RuntimeError otherwise).
"""
if not user_votes:
raise ValueError("user_votes must be a non-empty dict")
# Normalization mapping (UI variants -> canonical)
def _norm(v: str) -> Optional[str]:
if v is None:
return None
s = str(v).strip()
if not s:
return None
s_low = s.lower()
if s_low in ("voor", "v", "yes"):
return "voor"
if s_low in ("tegen", "t", "no"):
return "tegen"
if s_low in ("onthouden", "abstain", "abstained"):
return "onthouden"
if s_low in ("geen stem", "afwezig", "absent", "no vote"):
return "afwezig"
# already canonical?
if s_low in ("voor", "tegen", "onthouden", "afwezig"):
return s_low
# fallback: try Dutch keywords
if "voor" in s_low:
return "voor"
if "tegen" in s_low:
return "tegen"
if "onthouden" in s_low:
return "onthouden"
if "afwezig" in s_low:
return "afwezig"
return None
# Build normalized mapping and DataFrame for DuckDB
import pandas as pd
rows = []
for mid, v in user_votes.items():
try:
mid_i = int(mid)
except Exception:
raise ValueError(f"motion id must be integer-like: {mid}")
nv = _norm(v)
if nv is None:
# treat as abstain/skip (do not include in user votes)
continue
rows.append({"motion_id": mid_i, "user_vote": nv})
if not rows:
raise ValueError("After normalization no valid user votes remain")
if duckdb is None:
raise RuntimeError("DuckDB is required for match_mps_for_votes")
conn = duckdb.connect(self.db_path)
try:
uv = pd.DataFrame(rows)
# register as temporary relation
conn.register("_user_votes", uv)
q = (
"SELECT mp.mp_name, mp.party, "
"SUM(CASE WHEN lower(mp.vote)=_user_votes.user_vote THEN 1 ELSE 0 END) AS matched, "
"COUNT(*) AS overlap "
"FROM mp_votes mp JOIN _user_votes ON mp.motion_id = _user_votes.motion_id "
"WHERE mp.mp_name LIKE '%,%' "
"GROUP BY mp.mp_name, mp.party "
"HAVING COUNT(*) > 0 "
"ORDER BY (matched*1.0/overlap) DESC, matched DESC, mp.mp_name ASC "
f"LIMIT {int(limit)}"
)
rows_out = conn.execute(q).fetchall()
# columns: mp_name, party, matched, overlap
results = []
for r in rows_out:
try:
matched = int(r[2])
overlap = int(r[3])
pct = round((matched / overlap) * 100.0, 1) if overlap else 0.0
except Exception:
matched = int(r[2]) if r[2] is not None else 0
overlap = int(r[3]) if r[3] is not None else 0
pct = 0.0
results.append(
{
"mp_name": r[0],
"party": r[1],
"matched": matched,
"overlap": overlap,
"agreement_pct": pct,
}
)
conn.unregister("_user_votes")
conn.close()
return results
except Exception:
try:
conn.close()
except Exception:
pass
_logger.exception("Error in match_mps_for_votes")
return []
def choose_discriminating_motions(
self, candidates: List[str], excluded_motion_ids: List[int], k: int = 1
) -> List[int]:
"""Return top-k motion ids that best split the candidate MPs.
Scoring: Shannon entropy over vote distribution among candidate MPs for each motion.
Ties broken by higher controversy_score then lower motion id.
"""
if not candidates:
raise ValueError("candidates must be non-empty")
if duckdb is None:
raise RuntimeError("DuckDB is required for choose_discriminating_motions")
conn = duckdb.connect(self.db_path)
try:
# Prepare candidate names as a temp table
import pandas as pd
cand_df = pd.DataFrame({"mp_name": candidates})
conn.register("_candidates", cand_df)
# Build excluded list SQL fragment
excl_clause = ""
params = []
if excluded_motion_ids:
excl_clause = (
"AND mp.motion_id NOT IN ("
+ ",".join(str(int(x)) for x in excluded_motion_ids)
+ ")"
)
# Aggregate counts per motion by vote token
q = f"""
SELECT
m.id as motion_id,
m.controversy_score,
SUM(CASE WHEN lower(mp.vote) = 'voor' THEN 1 ELSE 0 END) as cnt_voor,
SUM(CASE WHEN lower(mp.vote) = 'tegen' THEN 1 ELSE 0 END) as cnt_tegen,
SUM(CASE WHEN lower(mp.vote) = 'onthouden' THEN 1 ELSE 0 END) as cnt_onthouden,
SUM(CASE WHEN lower(mp.vote) = 'afwezig' THEN 1 ELSE 0 END) as cnt_afwezig,
COUNT(*) as total_votes
FROM mp_votes mp
JOIN _candidates c ON mp.mp_name = c.mp_name
JOIN motions m ON m.id = mp.motion_id
WHERE 1=1
{excl_clause}
GROUP BY m.id, m.controversy_score
HAVING COUNT(*) > 0
"""
rows = conn.execute(q).fetchall()
conn.unregister("_candidates")
conn.close()
if not rows:
return []
import math
scored = []
for r in rows:
motion_id = int(r[0])
controversy = r[1] or 0.0
counts = [int(r[2]), int(r[3]), int(r[4]), int(r[5])]
total = int(r[6])
if total <= 0:
entropy = 0.0
else:
entropy = 0.0
for c in counts:
if c <= 0:
continue
p = c / total
entropy -= p * math.log2(p)
scored.append((motion_id, entropy, controversy))
# sort by entropy desc, controversy desc, motion_id asc
scored.sort(key=lambda x: (-x[1], -x[2], x[0]))
return [m[0] for m in scored[: int(k)]]
except Exception:
try:
conn.close()
except Exception:
pass
_logger.exception("Error in choose_discriminating_motions")
return []
def store_embedding(self, motion_id: int, model: str, vector: List[float]) -> int:
"""Store an embedding for a motion. Returns inserted row id or -1 on failure."""
try:
@ -631,8 +939,8 @@ class MotionDatabase:
results.sort(key=lambda x: x["score"], reverse=True)
return results[:top_k]
except Exception as e:
print(f"Error searching embeddings: {e}")
except Exception:
_logger.exception("Error searching embeddings")
try:
conn.close()
except Exception:

@ -405,14 +405,6 @@ def build_trajectories_tab(db_path: str, window_size: str) -> None:
default=default_parties,
)
# Note about partial data years
if "2023-Q1" in windows and not any(
w.startswith("2023-Q") and w != "2023-Q1" for w in windows
):
st.caption(
" 2023 heeft alleen data voor Q1 — pipeline draaide niet door in dat jaar."
)
fig = go.Figure()
for party in selected_parties:
if party not in centroids:
@ -425,11 +417,10 @@ def build_trajectories_tab(db_path: str, window_size: str) -> None:
go.Scatter(
x=xs,
y=ys,
mode="lines+markers+text",
mode="lines+markers",
name=party,
text=[w.replace("-Q4", "") for w in wids_sorted],
textposition="top center",
line=dict(color=colour),
text=wids_sorted, # full window ID for hover
line=dict(color=colour, shape="spline", smoothing=1.3),
marker=dict(color=colour, size=8),
hovertemplate=(
f"<b>{party}</b><br>"
@ -632,6 +623,228 @@ def build_browser_tab(db_path: str, show_rejected: bool) -> None:
st.caption("_Nog geen vergelijkbare moties beschikbaar voor deze motie_")
def build_svd_components_tab(db_path: str) -> None:
"""New tab: show top motions contributing to top SVD components.
Reads thoughts/explorer/top_svd_top_motions.json and displays a selector
for components 1..10 and a detail pane for the selected motion.
"""
st.subheader("🔬 SVD Components — top contributing motions")
json_path = os.path.join("thoughts", "explorer", "top_svd_top_motions.json")
if not os.path.exists(json_path):
st.warning(
f"Top-SVD data not found at {json_path}. Run the importance job to generate it."
)
return
try:
with open(json_path, "r", encoding="utf-8") as fh:
j = json.load(fh)
except Exception as e:
st.error(f"Failed to load SVD importance JSON: {e}")
return
window = j.get("window")
rows = j.get("rows", [])
if not rows:
st.info("Geen top-moties in dataset")
return
st.caption(f"Top SVD contributors computed for window: {window}")
# Build mapping component -> list of motions (deduplicate by motion_id per component)
comp_map: dict[int, list] = {}
for r in rows:
comp = int(r.get("component", 0))
bucket = comp_map.setdefault(comp, [])
existing_ids = {m.get("motion_id") for m in bucket}
if r.get("motion_id") not in existing_ids:
bucket.append(r)
comp_options = sorted(comp_map.keys())
comp_sel = st.selectbox("Component", options=comp_options, index=0)
col1, col2 = st.columns([1, 2])
with col1:
st.markdown("**Top motions (title)**")
motions = comp_map.get(comp_sel, [])
for m in motions:
mid = m.get("motion_id")
title = m.get("title") or f"Motie #{mid}"
if st.button(f"{mid}: {title[:80]}", key=f"btn_{comp_sel}_{mid}"):
st.session_state["svd_selected_mid"] = mid
with col2:
sel_mid = st.session_state.get("svd_selected_mid")
if not sel_mid and motions:
sel_mid = motions[0].get("motion_id")
if sel_mid:
# fetch motion metadata from DB for completeness
try:
con = duckdb.connect(database=db_path, read_only=True)
row = con.execute(
"SELECT id, title, date, policy_area, url, body_text FROM motions WHERE id=?",
[int(sel_mid)],
).fetchone()
con.close()
except Exception:
row = None
if row:
st.markdown(f"### {row[1] or f'Motie #{row[0]}'}")
try:
date_str = str(row[2])[:10]
except Exception:
date_str = "?"
st.caption(f"📅 {date_str} | {row[3]}")
if row[4] and str(row[4]).startswith("http"):
st.markdown(f"[🔗 Bekijk op Tweede Kamer]({row[4]})")
if row[5]:
with st.expander("Show body text"):
st.write(row[5])
else:
st.info(f"Metadata not found in DB for motion {sel_mid}")
def build_mp_quiz_tab(db_path: str) -> None:
"""Interactive quiz: narrow MPs by asking motion vote questions.
Minimal viable flow:
- seed with top-N controversial motions (SEED_MOTIONS)
- present one question at a time, store answers in st.session_state['mp_quiz_votes']
- after each answer call MotionDatabase.match_mps_for_votes to rank MPs
- if multiple candidates remain, call choose_discriminating_motions to pick next question
- stop when unique MP found or no discriminating motions remain
"""
st.subheader("🧑 Welk tweede kamerlid ben jij?")
st.markdown(
"Beantwoord een paar eenvoudige ja/nee/onthoud vragen over moties om te zien welk Kamerlid het meest op jou lijkt."
)
SEED_MOTIONS = 8
MAX_QUESTIONS = 20
# initialize session state
if "mp_quiz_votes" not in st.session_state:
st.session_state["mp_quiz_votes"] = {}
if "mp_quiz_asked" not in st.session_state:
st.session_state["mp_quiz_asked"] = []
df = load_motions_df(db_path)
if df.empty:
st.warning("Geen moties beschikbaar om de quiz te starten.")
return
# seed motions by controversy_score, prefer those with layman_explanation
candidates_df = df[df["layman_explanation"].notna()]
if candidates_df.empty:
candidates_df = df
seed = (
candidates_df.sort_values(by="controversy_score", ascending=False)
.head(SEED_MOTIONS)
.copy()
)
seed_ids = [int(x) for x in seed["id"].tolist()]
# Determine next motion to ask
def _next_motion_id():
# prefer seed motions not yet asked
for mid in seed_ids:
if str(mid) not in st.session_state["mp_quiz_votes"]:
return mid
# otherwise ask discriminating motion based on remaining candidate MPs
# compute current candidate set
from database import db as global_db
try:
user_votes = {
int(k): v for k, v in st.session_state["mp_quiz_votes"].items()
}
ranked = global_db.match_mps_for_votes(user_votes, limit=200)
except Exception:
ranked = []
candidates = [r["mp_name"] for r in ranked]
excluded = [int(k) for k in st.session_state["mp_quiz_votes"].keys()]
if not candidates:
return None
try:
next_ids = global_db.choose_discriminating_motions(
candidates, excluded, k=1
)
return next_ids[0] if next_ids else None
except Exception:
return None
# show progress and controls
col1, col2 = st.columns([3, 1])
with col2:
st.caption(
f"Vragen beantwoord: {len(st.session_state['mp_quiz_votes'])}/{MAX_QUESTIONS}"
)
if st.button("Reset quiz"):
st.session_state["mp_quiz_votes"] = {}
st.session_state["mp_quiz_asked"] = []
st.experimental_rerun()
# main question loop (single question per render)
next_mid = _next_motion_id()
if next_mid is None:
st.info("Geen nieuwe vragen beschikbaar om kandidaten te scheiden.")
else:
motion_row = df[df["id"] == next_mid].iloc[0]
st.markdown(f"### {motion_row.get('title') or f'Motie #{next_mid}'}")
if motion_row.get("layman_explanation"):
st.info(motion_row.get("layman_explanation"))
choice = st.radio(
"Wat zou jij stemmen?",
options=["Voor", "Tegen", "Onthouden", "Geen stem"],
index=3,
key=f"mp_quiz_choice_{next_mid}",
)
if st.button("Beantwoord en verder", key=f"mp_quiz_submit_{next_mid}"):
st.session_state["mp_quiz_votes"][str(next_mid)] = choice
st.session_state["mp_quiz_asked"].append(next_mid)
st.experimental_rerun()
# display current ranking
from database import db as global_db
try:
user_votes = {int(k): v for k, v in st.session_state["mp_quiz_votes"].items()}
ranking = global_db.match_mps_for_votes(user_votes, limit=50)
except Exception:
ranking = []
if ranking:
st.markdown("**Top kandidaten**")
# show as table
import pandas as pd
rdf = pd.DataFrame(ranking)
st.dataframe(rdf.head(10), use_container_width=True)
# check uniqueness
top_pct = ranking[0]["agreement_pct"] if ranking else 0.0
top_matches = [r for r in ranking if r["agreement_pct"] == top_pct]
if len(top_matches) == 1 and top_matches[0]["overlap"] > 0:
st.success(
f"Unieke match gevonden: {top_matches[0]['mp_name']} ({top_matches[0]['party']})"
)
else:
if len(st.session_state["mp_quiz_asked"]) >= MAX_QUESTIONS:
st.warning(
"Maximaal aantal vragen beantwoord. Je hebt meerdere vergelijkbare kandidaten."
)
else:
st.info("Nog geen unieke match — vraag meer om verder te verfijnen.")
else:
st.info("Nog geen antwoorden of geen overlapping met bestaande stemdata.")
# ---------------------------------------------------------------------------
# App entry
# ---------------------------------------------------------------------------
@ -668,17 +881,45 @@ def run_app() -> None:
st.warning(f"DB niet bereikbaar: {e}")
# Main tabs
tab1, tab2, tab3, tab4 = st.tabs(
["🧭 Politiek Kompas", "📈 Trajectories", "🔍 Motie Zoeken", "📋 Motie Browser"]
)
with tab1:
build_compass_tab(db_path, window_size)
with tab2:
build_trajectories_tab(db_path, window_size)
with tab3:
build_search_tab(db_path, show_rejected)
with tab4:
build_browser_tab(db_path, show_rejected)
# Streamlit tabs compatibility: some older/newer Streamlit builds expose different APIs.
tab_labels = [
"🧭 Politiek Kompas",
"📈 Trajectories",
"🔍 Motie Zoeken",
"📋 Motie Browser",
"🧑 Welk tweede kamerlid ben jij?",
"🔬 SVD Components",
]
if hasattr(st, "tabs") and callable(getattr(st, "tabs")):
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(tab_labels)
with tab1:
build_compass_tab(db_path, window_size)
with tab2:
build_trajectories_tab(db_path, window_size)
with tab3:
build_search_tab(db_path, show_rejected)
with tab4:
build_browser_tab(db_path, show_rejected)
with tab5:
build_mp_quiz_tab(db_path)
with tab6:
build_svd_components_tab(db_path)
else:
# Fallback for environments where `st.tabs` is not available: use a radio selector
selection = st.radio("Tab", tab_labels)
if selection == tab_labels[0]:
build_compass_tab(db_path, window_size)
elif selection == tab_labels[1]:
build_trajectories_tab(db_path, window_size)
elif selection == tab_labels[2]:
build_search_tab(db_path, show_rejected)
elif selection == tab_labels[3]:
build_browser_tab(db_path, show_rejected)
elif selection == tab_labels[4]:
build_mp_quiz_tab(db_path)
else:
build_svd_components_tab(db_path)
if __name__ == "__main__":

@ -63,23 +63,28 @@ def _select_text(
def ensure_text_embeddings(
db_path: Optional[str] = None,
model: Optional[str] = None,
batch_size: int = 50,
batch_size: int = 128,
db=None,
embedder=None,
min_batch_size: int = 4,
max_batch_size: int = 512,
growth_factor: float = 1.5,
) -> Tuple[int, int, int, int, list]:
"""Ensure all motions have text embeddings for `model`.
Uses batched API calls (batch_size texts per HTTP request) for speed.
Returns tuple (stored_count, skipped_existing, skipped_no_text, errors).
Uses AIMD batch sizing to maximise throughput:
- After each fully-successful batch: grow by growth_factor (probe upward).
- After any batch failure: halve (back off from API limits).
This converges on the largest batch the provider can reliably handle.
Returns tuple (stored_count, skipped_existing, skipped_no_text, errors, failed_ids).
"""
model = model or DEFAULT_MODEL
if db is None:
db = MotionDatabase(db_path) if db_path else default_db
# motions to process
to_process = _select_text(db, model)
# how many already exist
if duckdb is None:
total_motions = 0
existing = 0
@ -89,7 +94,6 @@ def ensure_text_embeddings(
total_motions = conn.execute("SELECT COUNT(*) FROM motions").fetchone()[0]
except Exception:
total_motions = 0
try:
existing = conn.execute(
"SELECT COUNT(DISTINCT motion_id) FROM embeddings WHERE model = ?",
@ -97,7 +101,6 @@ def ensure_text_embeddings(
).fetchone()[0]
except Exception:
existing = 0
conn.close()
stored = 0
@ -105,7 +108,6 @@ def ensure_text_embeddings(
errors = 0
failed_ids: list = []
# Separate motions with text from those without
with_text: List[Tuple[int, str]] = []
for motion_id, text in to_process:
if not text:
@ -114,17 +116,23 @@ def ensure_text_embeddings(
else:
with_text.append((motion_id, text))
current_batch_size = max(min_batch_size, min(max_batch_size, batch_size))
_logger.info(
"Processing %d motions in batches of %d (%d skipped no text, %d already exist)",
"Processing %d motions (initial_batch=%d, min=%d, max=%d, growth=%.1fx"
"%d skipped no text, %d already exist)",
len(with_text),
batch_size,
current_batch_size,
min_batch_size,
max_batch_size,
growth_factor,
skipped_no_text,
existing,
)
# Process in batches
for batch_start in range(0, len(with_text), batch_size):
batch = with_text[batch_start : batch_start + batch_size]
i = 0
n = len(with_text)
while i < n:
batch = with_text[i : i + current_batch_size]
batch_ids = [mid for mid, _ in batch]
batch_texts = [txt for _, txt in batch]
@ -132,20 +140,21 @@ def ensure_text_embeddings(
batch_texts,
motion_ids=batch_ids,
model=model,
batch_size=batch_size,
batch_size=current_batch_size,
embedder=embedder,
)
batch_stored = 0
batch_errors = 0
for (motion_id, _text), vec in zip(batch, vecs):
if not isinstance(vec, list):
_logger.warning(
"Embedding provider returned non-list for motion %s", motion_id
)
errors += 1
batch_errors += 1
failed_ids.append(motion_id)
continue
try:
res = db.store_embedding(motion_id, model, vec)
if res and res > 0:
@ -158,24 +167,40 @@ def ensure_text_embeddings(
res,
)
errors += 1
batch_errors += 1
failed_ids.append(motion_id)
except Exception as exc:
_logger.error(
"Error storing embedding for motion %s: %s", motion_id, exc
)
errors += 1
batch_errors += 1
failed_ids.append(motion_id)
# AIMD: grow on full success, halve on any failure
prev_batch_size = current_batch_size
if batch_errors == 0:
current_batch_size = min(
max_batch_size, int(current_batch_size * growth_factor)
)
else:
current_batch_size = max(min_batch_size, current_batch_size // 2)
_logger.info(
"Batch %d-%d: stored %d/%d (total: %d/%d)",
batch_start,
batch_start + len(batch),
"Batch %d-%d: stored %d/%d errors %d — batch_size %d%d (total: %d/%d)",
i,
i + len(batch),
batch_stored,
len(batch),
batch_errors,
prev_batch_size,
current_batch_size,
stored + existing,
total_motions,
)
i += len(batch)
skipped_existing = int(existing)
return stored, skipped_existing, skipped_no_text, errors, failed_ids

@ -16,4 +16,6 @@ dependencies = [
"schedule>=1.2.2",
"streamlit>=1.48.0",
"scikit-learn>=1.8.0",
"beautifulsoup4>=4.14.3",
"lxml>=6.0.2",
]

@ -88,7 +88,10 @@ def _clear_embeddings(db_path: str) -> int:
def rerun_embeddings(
db_path: str, model: str = None, retry_missing: bool = False
db_path: str,
model: str = None,
retry_missing: bool = False,
growth_factor: float = 1.5,
) -> dict:
"""Full rerun: clear → embed → fuse → similarity for all windows.
@ -105,7 +108,9 @@ def rerun_embeddings(
# (stored, skipped_existing, skipped_no_text, errors) or a 5-tuple that
# includes failed_ids as the fifth element. Support both shapes for
# backward-compatibility.
result = text_pipeline.ensure_text_embeddings(db_path=db_path, model=model)
result = text_pipeline.ensure_text_embeddings(
db_path=db_path, model=model, growth_factor=growth_factor
)
if isinstance(result, tuple) and len(result) == 5:
stored, skipped_existing, skipped_no_text, emb_errors, failed_ids = result
elif isinstance(result, tuple) and len(result) == 4:
@ -207,8 +212,16 @@ def _main():
default=None,
help="Embedding model name (default: text_pipeline default)",
)
parser.add_argument(
"--growth-factor",
type=float,
default=1.5,
help="AIMD growth factor for batch-size tuning (default: 1.5)",
)
args = parser.parse_args()
summary = rerun_embeddings(args.db_path, model=args.model)
summary = rerun_embeddings(
args.db_path, model=args.model, growth_factor=args.growth_factor
)
print(f"cleared_rows: {summary['cleared_rows']}")
print(f"embeddings_stored: {summary['embeddings_stored']}")
print(f"embeddings_skipped_no_text: {summary['embeddings_skipped_no_text']}")

@ -252,43 +252,67 @@ def walk_syncfeed(
# ---------------------------------------------------------------------------
def _extract_motion_text(html: str) -> Optional[str]:
"""Extract clean motion text from Overheid.nl HTML.
Targets <div id="broodtekst"> which contains the actual kamerstuk body.
Falls back to <div id="content"> if broodtekst is absent.
Returns plain text with normalised whitespace, capped at 32 000 chars.
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
# Primary target: the kamerstuk body div
node = soup.find("div", id="broodtekst")
if node is None:
# Fallback: main content area
node = soup.find("div", id="content")
if node is None:
# Last resort: whole <article> if present
node = soup.find("article")
if node is None:
# Final fallback: strip all tags from the full body
node = soup.body or soup
text = node.get_text(separator=" ")
# Collapse whitespace
text = re.sub(r"\s+", " ", text).strip()
return text[:32_000] if text else None
def _fetch_body_text(
ext_id: str, session: requests.Session, retries: int = 3
) -> Optional[str]:
"""Fetch plain text body from officielebekendmakingen.nl for ext_id.
"""Fetch plain motion text from officielebekendmakingen.nl for ext_id.
Uses BeautifulSoup to extract only the <div id="broodtekst"> element,
avoiding JavaScript, navigation, and cookie-banner noise.
Retries on network errors and on HTTP 5xx or 429 responses using
exponential backoff starting at 0.5s. On permanent failure returns None
and records an audit event via database.db.append_audit_event(...).
Retries on network errors and HTTP 5xx / 429 with exponential backoff
starting at 0.5 s. Returns None on permanent failure.
"""
import time
import re
from requests import exceptions as req_exceptions
import database
url = BODY_TEXT_BASE.format(ext_id=ext_id)
attempt = 0
backoff = 0.5
last_exc = None
last_exc: Optional[Exception] = None
while attempt < retries:
attempt += 1
try:
resp = session.get(url, timeout=30)
# treat 5xx and 429 as transient
status = getattr(resp, "status_code", None)
if status == 429 or (status is not None and 500 <= status < 600):
last_exc = Exception(f"HTTP {status}")
raise req_exceptions.RequestException(f"HTTP {status}")
resp.raise_for_status()
# Very simple text extraction: strip tags
text = re.sub(r"<[^>]+>", " ", resp.text)
text = re.sub(r"\s+", " ", text).strip()
return text[:32_000] if text else None
text = _extract_motion_text(resp.text)
return text
except req_exceptions.RequestException as exc:
last_exc = exc
# retry for transient errors unless we've exhausted attempts
if attempt < retries:
_logger.info(
"Transient body fetch error for %s (attempt %d/%d): %s; retrying in %.1fs",
@ -298,51 +322,32 @@ def _fetch_body_text(
exc,
backoff,
)
try:
time.sleep(backoff)
except Exception:
pass
time.sleep(backoff)
backoff *= 2
continue
# exhausted retries => permanent failure
_logger.warning(
"Body text fetch permanently failed for %s: %s", ext_id, exc
)
metadata = {"attempts": attempt, "error": str(exc)}
try:
# MotionDatabase.append_audit_event signature: (actor_id, action, ...)
import database
database.db.append_audit_event(
None,
"body_fetch_failed",
target_type="document",
target_id=ext_id,
metadata=metadata,
metadata={"attempts": attempt, "error": str(exc)},
)
except Exception:
_logger.exception(
"Failed to write audit event for body fetch failure %s", ext_id
)
pass
return None
except Exception as exc: # pragma: no cover - unexpected errors
except Exception as exc: # pragma: no cover
_logger.exception(
"Unexpected error fetching body text for %s: %s", ext_id, exc
)
last_exc = exc
break
# If we fall through here, ensure audit event is recorded
try:
database.db.append_audit_event(
None,
"body_fetch_failed",
target_type="document",
target_id=ext_id,
metadata={"attempts": retries, "error": str(last_exc)},
)
except Exception:
_logger.exception(
"Failed to write audit event for body fetch failure %s", ext_id
)
return None
@ -572,6 +577,86 @@ def sync_motion_content(db_path: str, skip_body_text: bool = False) -> Dict:
}
# ---------------------------------------------------------------------------
# Body-only re-scrape (uses stored externe_identifier; no SyncFeed walk needed)
# ---------------------------------------------------------------------------
def rescrape_body_texts(
db_path: str,
max_workers: int = MAX_BODY_WORKERS,
batch_size: int = 500,
) -> Dict:
"""Re-fetch and overwrite body_text for every motion that has an externe_identifier.
Reads externe_identifier directly from the DB; no SyncFeed walk is needed.
Fetches in parallel (max_workers threads) and commits in batches of batch_size
to limit memory use and provide progress checkpoints.
Returns summary dict with counts.
"""
conn = duckdb.connect(db_path, read_only=True)
rows = conn.execute(
"SELECT id, externe_identifier FROM motions WHERE externe_identifier IS NOT NULL"
).fetchall()
conn.close()
total = len(rows)
_logger.info(
"Re-scraping body_text for %d motions (workers=%d) ...", total, max_workers
)
session = requests.Session()
session.headers["User-Agent"] = "stemwijzer-scraper/1.0"
fetched = 0
failed = 0
committed = 0
# Process in batches so we can commit progress and log along the way
for batch_start in range(0, total, batch_size):
batch = rows[batch_start : batch_start + batch_size]
with ThreadPoolExecutor(max_workers=max_workers) as pool:
future_to_row = {
pool.submit(_fetch_body_text, ext_id, session): (mid, ext_id)
for mid, ext_id in batch
}
updates: List[Tuple[int, Optional[str], Optional[str], Optional[str]]] = []
for future in as_completed(future_to_row):
mid, ext_id = future_to_row[future]
try:
text = future.result()
except Exception as exc:
_logger.warning(
"Future failed for motion %d (%s): %s", mid, ext_id, exc
)
text = None
if text:
fetched += 1
updates.append((mid, None, text, None))
else:
failed += 1
_update_motions(db_path, updates)
committed += len(updates)
done = batch_start + len(batch)
_logger.info(
"Progress: %d/%d done — %d fetched, %d failed, %d committed",
done,
total,
fetched,
failed,
committed,
)
_logger.info(
"Re-scrape complete. fetched=%d, failed=%d, total=%d", fetched, failed, total
)
return {"total": total, "fetched": fetched, "failed": failed}
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
@ -582,7 +667,6 @@ def _main():
level=logging.INFO,
format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
# allow overriding MAX_BODY_WORKERS from CLI
parser = argparse.ArgumentParser(description="Sync motion content from SyncFeed")
parser.add_argument("--db-path", required=True, help="Path to motions.db")
parser.add_argument(
@ -590,22 +674,28 @@ def _main():
action="store_true",
help="Skip fetching body text from officielebekendmakingen.nl",
)
parser.add_argument(
"--body-only",
action="store_true",
help=(
"Skip SyncFeed walk; re-fetch and overwrite body_text for all motions "
"that already have an externe_identifier in the DB."
),
)
parser.add_argument(
"--max-body-workers",
type=int,
default=MAX_BODY_WORKERS,
help=f"Maximum concurrent workers for fetching body text (default: {MAX_BODY_WORKERS})",
)
# Use a local copy for the default to avoid referencing the name after assignment
args = parser.parse_args()
# Set module-level MAX_BODY_WORKERS based on CLI
try:
MAX_BODY_WORKERS = (
int(args.max_body_workers) if args.max_body_workers else MAX_BODY_WORKERS
)
except Exception:
pass
summary = sync_motion_content(args.db_path, skip_body_text=args.skip_body_text)
max_workers = args.max_body_workers or MAX_BODY_WORKERS
if args.body_only:
summary = rescrape_body_texts(args.db_path, max_workers=max_workers)
else:
summary = sync_motion_content(args.db_path, skip_body_text=args.skip_body_text)
for k, v in summary.items():
print(f" {k}: {v}")

@ -0,0 +1,130 @@
"""Tests for sync_motion_content.py XML parsers and join builders.
Fixtures use the real SyncFeed XML format:
- Entity ID is an attribute: id="..."
- tk:verwijderd is a namespaced attribute
- zaak refs are child elements with ref="..." attributes
- Zaak onderwerp/soort are child elements with text content
- DocumentVersie uses <document ref="..."/> and <externeIdentifier> child elements
"""
import scripts.sync_motion_content as smc
NS_TK = "http://www.tweedekamer.nl/xsd/tkData/v1-0"
NS_PREFIX = f'xmlns:tk="{NS_TK}" xmlns="{NS_TK}"'
def test_parse_besluit_simple():
xml = (
f'<besluit {NS_PREFIX} id="B1" tk:verwijderd="false">'
' <zaak ref="Z1" />'
' <zaak ref="Z2" />'
" <besluitTekst>Aangenomen.</besluitTekst>"
"</besluit>"
)
out = smc.parse_besluit(xml)
assert out["id"] == "B1"
assert out["verwijderd"] is False
assert out["zaak_refs"] == ["Z1", "Z2"]
def test_parse_besluit_deleted():
xml = (
f'<besluit {NS_PREFIX} id="B2" tk:verwijderd="true">'
' <zaak ref="Z3" />'
"</besluit>"
)
out = smc.parse_besluit(xml)
assert out["verwijderd"] is True
assert out["zaak_refs"] == ["Z3"]
def test_parse_zaak_and_title_map():
zxml = (
f'<zaak {NS_PREFIX} id="Z1" tk:verwijderd="false">'
" <onderwerp>My title</onderwerp>"
" <soort>Motie</soort>"
"</zaak>"
)
z = smc.parse_zaak(zxml)
assert z["id"] == "Z1"
assert z["verwijderd"] is False
assert z["onderwerp"] == "My title"
assert z["soort"] == "Motie"
besluit_index = {"B1": {"zaak_refs": ["Z1"]}}
zaak_index = {"Z1": z}
tm = smc.build_title_map(besluit_index, zaak_index)
assert tm["B1"] == "My title"
def test_build_title_map_prefers_motie():
"""When a Besluit links multiple Zaak records, prefer soort==Motie."""
zaak_index = {
"Z1": {
"id": "Z1",
"verwijderd": False,
"onderwerp": "Other title",
"soort": "Amendement",
},
"Z2": {
"id": "Z2",
"verwijderd": False,
"onderwerp": "Motion title",
"soort": "Motie",
},
}
besluit_index = {"B1": {"zaak_refs": ["Z1", "Z2"]}}
tm = smc.build_title_map(besluit_index, zaak_index)
assert tm["B1"] == "Motion title"
def test_parse_document():
dxml = (
f'<document {NS_PREFIX} id="D1" tk:verwijderd="false">'
' <zaak ref="Z1" />'
"</document>"
)
doc = smc.parse_document(dxml)
assert doc["id"] == "D1"
assert doc["verwijderd"] is False
assert doc["zaak_refs"] == ["Z1"]
def test_parse_documentversie():
dvxml = (
f'<documentVersie {NS_PREFIX} id="DV1" tk:verwijderd="false">'
' <document ref="D1" />'
" <externeIdentifier>kst-12345-678</externeIdentifier>"
" <extensie>html</extensie>"
"</documentVersie>"
)
dv = smc.parse_documentversie(dvxml)
assert dv["id"] == "DV1"
assert dv["verwijderd"] is False
assert dv["document_id"] == "D1"
assert dv["externe_identifier"] == "kst-12345-678"
assert dv["extensie"] == "html"
def test_parse_document_and_docversie_and_extid_map():
dxml = (
f'<document {NS_PREFIX} id="D1" tk:verwijderd="false">'
' <zaak ref="Z1" />'
"</document>"
)
dvxml = (
f'<documentVersie {NS_PREFIX} id="DV1" tk:verwijderd="false">'
' <document ref="D1" />'
" <externeIdentifier>EXT-123</externeIdentifier>"
" <extensie>html</extensie>"
"</documentVersie>"
)
doc = smc.parse_document(dxml)
dv = smc.parse_documentversie(dvxml)
besluit_index = {"B1": {"zaak_refs": ["Z1"]}}
zaak_index = {"Z1": {"id": "Z1", "onderwerp": "t", "soort": "Motie"}}
doc_index = {"D1": doc}
dv_index = {"DV1": dv}
extmap = smc.build_ext_id_map(besluit_index, zaak_index, doc_index, dv_index)
assert extmap["B1"] == "EXT-123"

@ -0,0 +1,119 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Mapping Dutch Democracy: Building a Political Compass</title>
<style>
body{font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial; max-width:900px; margin:40px auto; line-height:1.6; color:#111}
pre{background:#f6f8fa;padding:12px;border-radius:6px;overflow:auto}
code{background:#f2f4f6;padding:2px 4px;border-radius:4px}
h1,h2,h3{color:#0b3d91}
ul{margin-left:1.2rem}
</style>
</head>
<body>
<h1>Mapping Dutch Democracy: Building a Political Compass from 28,000+ Parliamentary Votes</h1>
<p><em>What if you could take every motion voted on in the Dutch Parliament over the past decade and automatically plot parties and MPs on a political map — with zero manual labeling?</em></p>
<p>That's exactly what this project does. Here's how I built it, what I had to solve along the way, and what it revealed about Dutch political dynamics.</p>
<p>---</p>
<h2>The Starting Point: Open Data, Hidden Structure</h2>
<p>The Dutch Parliament publishes every vote — every <em>motie</em>, every <em>amendement</em>, every <em>besluit</em> — in an open OData API. We're talking over <strong>28,000 motions</strong> spanning 2016 to 2026, with a record of how every individual MP voted: <em>voor</em> (for), <em>tegen</em> (against), <em>onthouden</em> (abstained), or <em>afwezig</em> (absent). That's 506,000 individual vote records.</p>
<p>This is an extraordinary dataset. But in raw form it's just a table of votes. The interesting question is: can we extract <em>structure</em> — left vs. right, progressive vs. conservative, governing vs. opposition — purely from the pattern of who votes with whom?</p>
<p>The answer is yes, and the method is surprisingly elegant.</p>
<p>---</p>
<h2>Step 1: Turning Votes into Geometry</h2>
<p>Each motion is a snapshot of political alignment. For each motion, we know which MPs voted together and which voted apart. If every PvdA and GroenLinks MP votes the same way almost every time, that tells us something. If PVV and CDA MPs diverge consistently, that tells us something too.</p>
<p>I represent this with <strong>Singular Value Decomposition (SVD)</strong> on the MP × motion matrix:</p>
<ul><li>Rows: individual MPs (and party actors for collective votes)</li><li>Columns: motions</li><li>Values: +1 (voor), -1 (tegen), 0 (absent/abstain)</li></ul>
<p>SVD finds the dominant axes of variation — the directions along which the chamber disagrees most. The first component almost always corresponds to a left-right axis. The second typically captures something like progressive-traditionalist or libertarian-authoritarian. The key point: <strong>the axes emerge from the math, not from any labeling on my part.</strong></p>
<p>I request 50 SVD dimensions per window — but the actual dimensionality is constrained by <code>min(n_MPs, n_motions) - 1</code>. Sparse windows (early years, partial quarters) produce fewer meaningful dimensions. The pipeline handles this gracefully, storing whatever <code>k_used</code> is for each window so downstream fusion always works with the actual vector length.</p>
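<p>As a rough illustration of that step (not the project's actual <code>svd_pipeline.py</code>, and assuming hypothetical column names <code>mp_name</code>, <code>motion_id</code>, <code>vote</code>), a per-window decomposition could look like this:</p>
<pre><code># Sketch only: build the MP x motion matrix for one window and truncate the SVD.
import numpy as np
import pandas as pd

VOTE_VALUE = {"voor": 1.0, "tegen": -1.0, "onthouden": 0.0, "afwezig": 0.0}

def window_svd(votes: pd.DataFrame, requested_k: int = 50):
    """votes holds one window's rows: mp_name, motion_id, vote."""
    values = votes["vote"].str.lower().map(VOTE_VALUE).fillna(0.0)
    matrix = votes.assign(value=values).pivot_table(
        index="mp_name", columns="motion_id", values="value", fill_value=0.0
    )
    # Effective rank is capped by the smaller side of the window's matrix.
    k_used = min(requested_k, min(matrix.shape) - 1)
    u, s, vt = np.linalg.svd(matrix.to_numpy(), full_matrices=False)
    mp_coords = u[:, :k_used] * s[:k_used]        # MP positions in the reduced space
    motion_loadings = vt[:k_used].T * s[:k_used]  # motion loadings on the same axes
    return matrix.index.tolist(), mp_coords, motion_loadings, k_used</code></pre>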
<h3>Making Windows Comparable: Procrustes Alignment</h3>
<p>Running SVD independently per window creates a subtle problem: SVD axes are <strong>arbitrarily oriented</strong>. The "left-right" axis from 2020-Q3 and the "left-right" axis from 2021-Q1 might point in completely different directions — even if the underlying politics barely changed. You can't just stack the coordinates and call it a trajectory.</p>
<p>The fix is <strong>Procrustes alignment</strong>: given two sets of party/MP positions across consecutive windows, find the rotation matrix R that best maps one onto the other (minimizing the Frobenius norm of the difference), using MPs who appear in both windows as anchors:</p>
<pre><code>R = argmin_R ||A - B @ R||_F, subject to R'R = I</code></pre>
<p>This is solved cleanly via SVD of the cross-covariance matrix (a nice piece of mathematical symmetry — SVD to build the space, SVD to align it). The result: a continuous track for every party from 2019 to 2026, where position changes reflect genuine political movement rather than axis flips.</p>
<p>High Procrustes disparity between consecutive windows — where alignment is poor even with the best rotation — is itself a signal: it suggests a structural political shift, not just individual drift.</p>
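<p>A minimal sketch of that rotation step in plain NumPy (illustrative, not the pipeline's own alignment code), including the disparity measure:</p>
<pre><code># Orthogonal Procrustes: rotate window B's coordinates onto window A's,
# using only the rows (anchor MPs) present in both windows.
import numpy as np

def procrustes_align(A, B):
    u, _, vt = np.linalg.svd(B.T @ A)   # SVD of the cross-covariance matrix
    R = u @ vt                          # optimal rotation, R'R = I
    aligned = B @ R
    # Disparity: residual misfit after the best rotation (high = structural shift)
    disparity = np.linalg.norm(A - aligned) / max(np.linalg.norm(A), 1e-12)
    return R, aligned, disparity</code></pre>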
<p>---</p>
<h2>Step 2: What Each Motion Is Actually About</h2>
<p>Voting patterns tell us <em>who</em> agrees, but not <em>why</em>. For that, I add <strong>text embeddings</strong> — dense vector representations of each motion's content using a language model.</p>
<p>I use <strong><code>qwen/qwen3-embedding-4b</code></strong> via OpenRouter — a 4-billion parameter multilingual model that produces 2560-dimensional vectors with strong Dutch-language support. For each motion, I embed the richest text available: full parliamentary body text when we have it (94% of the 28,172 motions after an enrichment pass against the Tweede Kamer API), falling back to the summary description or title otherwise.</p>
<p>This lets us do something powerful: find motions that are genuinely similar in <em>topic</em>, not just in voting pattern. Two motions about nitrogen policy from 2020 and 2023 might have very different vote splits (different coalitions, different political moment) but near-identical text embeddings. That's a meaningful connection.</p>
<p>---</p>
<h2>Step 3: Fused Embeddings — The Best of Both Worlds</h2>
<p>SVD gives the political-structural signal: <em>how does this motion split the chamber?</em> Text embeddings give the semantic signal: <em>what is this motion about?</em></p>
<p>I concatenate both into a <strong>fused vector</strong> per motion per window:</p>
<pre><code>fused = [svd_dims (typically 50)] + [text_dims (2560)] = typically 2610 dimensions</code></pre>
<p>The actual dimension varies slightly because SVD dimensionality adapts to window density — the code stores <code>svd_dims</code> and <code>text_dims</code> per row so nothing downstream has to assume a fixed size.</p>
<p>This fused representation powers the similarity search. Two motions are "close" only if they're about a similar <em>topic</em> <strong>and</strong> they produce a similar <em>political split</em>. This filters out spurious matches — two motions might both be controversial (close 50/50 votes) but on completely unrelated things, and the text component separates them.</p>
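<p>The fusion itself is little more than concatenation plus bookkeeping. A hedged sketch, with the <code>svd_dims</code>/<code>text_dims</code> fields taken from the description above (the stored layout may differ):</p>
<pre><code># Concatenate one motion's SVD loading and text embedding into a fused vector,
# keeping both lengths so later stages never assume a fixed 2610 dimensions.
import numpy as np

def fuse_vectors(svd_vec, text_vec):
    fused = np.concatenate([
        np.asarray(svd_vec, dtype=np.float32),   # typically ~50 dims, varies per window
        np.asarray(text_vec, dtype=np.float32),  # 2560 dims from qwen3-embedding-4b
    ])
    return {"svd_dims": len(svd_vec), "text_dims": len(text_vec), "vector": fused}</code></pre>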
<p>---</p>
<h2>The Numbers: What We're Working With</h2>
<p>After the full pipeline run:</p>
<table>
  <thead><tr><th>Year</th><th>Motions</th></tr></thead>
  <tbody>
    <tr><td>2016</td><td>132</td></tr>
    <tr><td>2017</td><td>30</td></tr>
    <tr><td>2018</td><td>100</td></tr>
    <tr><td>2019</td><td>3,374</td></tr>
    <tr><td>2020</td><td>4,228</td></tr>
    <tr><td>2021</td><td>4,289</td></tr>
    <tr><td>2022</td><td>4,116</td></tr>
    <tr><td>2023</td><td>3,272</td></tr>
    <tr><td>2024</td><td>3,968</td></tr>
    <tr><td>2025</td><td>3,715</td></tr>
    <tr><td>2026</td><td>948</td></tr>
    <tr><td><strong>Total</strong></td><td><strong>28,172</strong></td></tr>
  </tbody>
</table>
<p>The 2022 spike is striking — over 4,000 motions in a single year. This was the year the Rutte IV coalition took office amid intense debates on energy prices, housing, the war in Ukraine, and the ongoing nitrogen crisis. 2023 is similarly dense at 3,272 motions, culminating in the November election that brought PVV to its historic first-place finish.</p>
<p>Early years (2016–2018) use annual windows because the data is too sparse for meaningful quarterly SVD. From 2019 onwards, everything runs quarterly, giving us 38 windows in total.</p>
<p>The similarity cache holds <strong>405,216 precomputed pairs</strong> — top 10 neighbors per motion per window — making lookup instant at query time.</p>
<p>---</p>
<h2>Interesting Findings</h2>
<h3>The 2022–2023 Polarization Surge</h3>
<p>2022 and 2023 together account for more than a quarter of all motions in the dataset. In the SVD positions for 2022, the distance between the governing coalition (VVD, D66, CDA, CU) and the opposition (PVV, SP, FvD) is near its maximum. The nitrogen crisis and energy policy debates forced unusually sharp coalition discipline — which shows up geometrically as well-separated clusters.</p>
<p>2023 continued the intensity, and the Procrustes-aligned trajectory shows the party positions in 2023-Q4 and 2024-Q1 shifting noticeably as the new coalition began to form.</p>
<h3>BBB's Geometric Arrival</h3>
<p>When BBB (BoerBurgerBeweging) entered parliament in 2023 with a historic 16 seats, their SVD position placed them between PVV and CDA — exactly matching their policy profile: agrarian-nationalist populism with Catholic-provincial roots. The model found this without being told. That's a good sanity check that the geometry is capturing something real.</p>
<h3>The Strange Case of "Verworpen."</h3>
<p>Motions rejected without debate are recorded with the title "Verworpen." (Rejected.). There are hundreds of these. Because they share a 9-character title, their text embeddings are <strong>identical</strong> — cosine similarity 1.0 to every other "Verworpen." in the cache. Technically correct; semantically meaningless. The UI layer filters these out.</p>
<p>It's a reminder that <strong>data quality surprises emerge at scale</strong>. I found three or four similar pathologies (motions withdrawn mid-session, duplicate API records) that required explicit handling.</p>
<h3>Party Cohesion as a Signal</h3>
<p>Party cohesion — how often all MPs of a party vote identically — varies enormously. SGP and CU are near-perfect blocs. PvdA/GroenLinks (post-2023 merger) is similarly tight. VVD shows the most internal variation, which tracks with what you'd expect from a governing party managing coalition discipline across conflicting wings.</p>
<p>In earlier years (2019–2020), before the GroenLinks-PvdA merger, GroenLinks occasionally splits on security and defense policy — visible in the SVD as individual MP positions diverging from the party centroid.</p>
<p>---</p>
<h2>The Pipeline Architecture</h2>
<p>Single DuckDB database, modular Python pipeline, no cloud infrastructure:</p>
<pre><code>API (Tweede Kamer OData)
→ download_past_year.py
→ motions table (28,172 rows)

motions
→ extract_mp_votes.py → mp_votes table (506,336 rows)
→ sync_motion_content.py → body_text enrichment (26,447 motions, ~94%)
→ text_pipeline.py → embeddings table (28,172 rows, qwen3-embedding-4b via OpenRouter)
→ svd_pipeline.py → svd_vectors table (54,150 rows, 38 windows)

svd_vectors + embeddings
→ fusion.py → fused_embeddings table (40,522 rows)

fused_embeddings
→ similarity/compute.py → similarity_cache table (405,216 rows, top-10 per window)</code></pre>
<p>The similarity computation is pure NumPy: load all fused vectors for a window, pad to uniform length, L2-normalize, compute the full <code>N×N</code> cosine similarity matrix via a single matrix multiply (<code>normalized @ normalized.T</code>), then extract top-k neighbors per row with <code>np.argpartition</code>. For a 4,000-motion quarter, that's a 4000×4000 matrix operation — fast enough that it's not worth batching.</p>
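<p>A compact sketch of that pass (illustrative only; the real implementation lives in <code>similarity/compute.py</code>):</p>
<pre><code># Per-window similarity: pad to a uniform length, L2-normalize, one matmul, top-k.
import numpy as np

def top_k_neighbours(vectors, k=10):
    """vectors: dict mapping motion_id to a 1-D numpy array (fused embedding)."""
    ids = list(vectors)
    dim = max(v.shape[0] for v in vectors.values())
    mat = np.zeros((len(ids), dim), dtype=np.float32)
    for i, mid in enumerate(ids):
        v = vectors[mid]
        mat[i, : v.shape[0]] = v                  # zero-pad shorter vectors
    mat /= np.linalg.norm(mat, axis=1, keepdims=True) + 1e-12
    sims = mat @ mat.T                            # full N x N cosine matrix
    np.fill_diagonal(sims, -np.inf)               # a motion never matches itself
    neighbours = {}
    for i, mid in enumerate(ids):
        top = np.argpartition(sims[i], -k)[-k:]   # unordered indices of the k best
        top = top[np.argsort(sims[i][top])[::-1]] # sort those k by score, descending
        neighbours[mid] = [(ids[j], float(sims[i][j])) for j in top]
    return neighbours</code></pre>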
<p>The database sits at 15 GB on disk — up from ~3 GB before body text enrichment. The full parliamentary text for 26,000+ motions accounts for most of that growth.</p>
<p>---</p>
<h2>What's Next</h2>
<p><strong>Motion explorer</strong>: Given a motion, retrieve the 10 most politically and semantically similar ones from across the decade. Trace how a policy debate evolved — who championed it, how the coalitions shifted.</p>
<p><strong>Party trajectory animation</strong>: Procrustes-aligned positions, animated year by year. Watch D66 drift post-2021, watch PVV consolidate its flank, watch new parties arrive and find their geometric home.</p>
<p><strong>Cross-party coalition patterns</strong>: The fused embeddings let us ask which topics produce unusual coalition configurations — motions where the normal left-right split breaks down and unexpected alliances form.</p>
<p><strong>The controversy index</strong>: <code>1 - winning_margin</code> gives a controversy score per motion. The most contested votes — close margins, high-salience topics — tell a different story than the headline political narratives.</p>
<p>---</p>
<h2>Reproducibility</h2>
<pre><code># Download historical data
python scripts/download_past_year.py --start-date 2016-01-01 --end-date 2026-01-01

# Run full pipeline (SVD, text embeddings, fusion, similarity cache)
python -m pipeline.run_pipeline --db-path data/motions.db \
    --start-date 2016-01-01 --end-date 2026-01-01 \
    --window-size quarterly --text-batch-size 200

# Enrich with full motion body text
python scripts/sync_motion_content.py --db-path data/motions.db</code></pre>
<p>The DB grows to ~15 GB for the full dataset including body text. All computation — SVD, fusion, similarity — runs locally on a single machine.</p>
<p>Democracy is more legible than it looks.</p>
</body>
</html>

@ -1,14 +1,14 @@
# Mapping Dutch Democracy: Building a Political Compass from 25,000+ Parliamentary Votes
# Mapping Dutch Democracy: Building a Political Compass from 28,000+ Parliamentary Votes
*What if you could take every motion voted on in the Dutch Parliament over the past decade and automatically plot parties and MPs on a political map — with zero manual labeling?*
That's exactly what this project does. Here's how we built it, what surprised us, and what it revealed about Dutch political dynamics.
That's exactly what this project does. Here's how I built it, what I had to solve along the way, and what it revealed about Dutch political dynamics.
---
## The Starting Point: Open Data, Hidden Structure
The Dutch Parliament publishes every vote — every *motie*, every *amendement*, every *besluit* — in an open OData API. We're talking over **25,500 motions** spanning 2016 to 2026, each with a record of how every party (and in many cases every individual MP) voted: *voor* (for), *tegen* (against), *onthouden* (abstained), or *afwezig* (absent).
The Dutch Parliament publishes every vote — every *motie*, every *amendement*, every *besluit* — in an open OData API. We're talking over **28,000 motions** spanning 2016 to 2026, with a record of how every individual MP voted: *voor* (for), *tegen* (against), *onthouden* (abstained), or *afwezig* (absent). That's 506,000 individual vote records.
This is an extraordinary dataset. But in raw form it's just a table of votes. The interesting question is: can we extract *structure* — left vs. right, progressive vs. conservative, governing vs. opposition — purely from the pattern of who votes with whom?
@ -18,53 +18,57 @@ The answer is yes, and the method is surprisingly elegant.
## Step 1: Turning Votes into Geometry
Each motion is a snapshot of political alignment. For each motion, we know which parties voted together and which voted apart. If PvdA and GroenLinks almost always vote the same way, that tells us something. If PVV and CDA frequently diverge, that tells us something too.
Each motion is a snapshot of political alignment. For each motion, we know which MPs voted together and which voted apart. If every PvdA and GroenLinks MP votes the same way almost every time, that tells us something. If PVV and CDA MPs diverge consistently, that tells us something too.
We represent this with **Singular Value Decomposition (SVD)** on the party-vote matrix:
I represent this with **Singular Value Decomposition (SVD)** on the MP × motion matrix:
- Rows: parties (VVD, PVV, D66, CDA, PvdA, GroenLinks, SP, CU, SGP, FvD, BBB, ...)
- Rows: individual MPs (and party actors for collective votes)
- Columns: motions
- Values: vote encoded as +1 (voor), -1 (tegen), 0 (absent/abstain)
- Values: +1 (voor), -1 (tegen), 0 (absent/abstain)
SVD finds the dominant axes of variation — the directions along which parties disagree most strongly. The first dimension almost always corresponds to a left-right axis. The second dimension typically captures something like a libertarian-authoritarian or progressive-traditionalist axis.
SVD finds the dominant axes of variation — the directions along which the chamber disagrees most. The first component almost always corresponds to a left-right axis. The second typically captures something like progressive-traditionalist or libertarian-authoritarian. The key point: **the axes emerge from the math, not from any labeling on my part.**
We run this **per quarterly window** (2019-Q1, 2019-Q2, ..., 2024-Q4) so we can track how positions shift over time at fine resolution.
I request 50 SVD dimensions per window — but the actual dimensionality is constrained by `min(n_MPs, n_motions) - 1`. Sparse windows (early years, partial quarters) produce fewer meaningful dimensions. The pipeline handles this gracefully, storing whatever `k_used` is for each window so downstream fusion always works with the actual vector length.
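A minimal sketch of that encoding and the rank cap (illustrative, not the pipeline's real code):

```
# Map a vote token to its matrix entry, and cap the requested SVD rank.
def vote_value(vote: str) -> float:
    return {"voor": 1.0, "tegen": -1.0}.get(vote.lower(), 0.0)  # abstain/absent -> 0

def effective_k(requested_k: int, n_mps: int, n_motions: int) -> int:
    return min(requested_k, min(n_mps, n_motions) - 1)
```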
### The Result: A 2D Political Compass
### Making Windows Comparable: Procrustes Alignment
The output is coordinates for every party in 2D space — computed purely from voting behavior, with no labels or assumptions from us. When you plot it, recognizable structure emerges immediately:
Running SVD independently per window creates a subtle problem: SVD axes are **arbitrarily oriented**. The "left-right" axis from 2020-Q3 and the "left-right" axis from 2021-Q1 might point in completely different directions — even if the underlying politics barely changed. You can't just stack the coordinates and call it a trajectory.
- **Left bloc** (PvdA, GroenLinks, SP) cluster tightly together
- **Right-liberal** (VVD, D66) sit in a distinct quadrant
- **Religious right** (SGP, CU) form their own coherent group
- **Populist right** (PVV, FvD in later years) occupy a distant extreme
- **BBB** (Farmer's party, 2022 onwards) drops into an interesting position between PVV and CDA
The fix is **Procrustes alignment**: given two sets of party/MP positions across consecutive windows, find the rotation matrix R that best maps one onto the other (minimizing the Frobenius norm of the difference), using MPs who appear in both windows as anchors:
The political axis emerges from the math — not our intuitions.
```
R = argmin_R ||A - B @ R||_F, subject to R'R = I
```
This is solved cleanly via SVD of the cross-covariance matrix (a nice piece of mathematical symmetry — SVD to build the space, SVD to align it). The result: a continuous track for every party from 2019 to 2026, where position changes reflect genuine political movement rather than axis flips.
High Procrustes disparity between consecutive windows — where alignment is poor even with the best rotation — is itself a signal: it suggests a structural political shift, not just individual drift.
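Here is a minimal sketch of the alignment step, under the same caveat: the names are illustrative, but the math matches the formula above.

```python
# Align window t+1 onto window t using MPs present in both windows as anchors.
# A: anchor-MP coordinates in the reference window, B: the same MPs in the next window.
import numpy as np

def procrustes_rotation(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Return the orthogonal R minimizing ||A - B @ R||_F (no scaling, no translation)."""
    # SVD of the cross-covariance matrix gives the optimal rotation
    U, _, Vt = np.linalg.svd(B.T @ A)
    return U @ Vt

def disparity(A: np.ndarray, B: np.ndarray, R: np.ndarray) -> float:
    """Residual misfit after the best rotation; high values hint at structural shifts."""
    return float(np.linalg.norm(A - B @ R) / np.linalg.norm(A))

# Usage: rotate every point in the next window, not just the anchors
# R = procrustes_rotation(anchors_prev, anchors_next)
# aligned_next = all_coords_next @ R
```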
---
## Step 2: What Each Motion Is Actually About
Voting patterns tell us *who* agrees, but not *why*. For that, we add **text embeddings** — dense vector representations of each motion's title and description using a language model.
Voting patterns tell us *who* agrees, but not *why*. For that, I add **text embeddings** — dense vector representations of each motion's content using a language model.
This lets us do something powerful: if a new motion comes in about nitrogen emissions, we can find the 20 most similar past motions (by meaning, not just keywords). If a motion uses identical party-line voting as another motion from 2022, the text embedding can confirm they're genuinely related — or reveal that the voting pattern is coincidental (parties split on unrelated issues for similar structural reasons).
I use **`qwen/qwen3-embedding-4b`** via OpenRouter — a 4-billion-parameter multilingual model that produces 2560-dimensional vectors with strong Dutch-language support. For each motion, I embed the richest text available: full parliamentary body text when available (94% of the 28,172 motions after an enrichment pass against the Tweede Kamer API), falling back to the summary description or title otherwise.
We compute these using **OpenAI-compatible embeddings** via OpenRouter, processing 25,640 motions in batches of 200.
This lets us do something powerful: find motions that are genuinely similar in *topic*, not just in voting pattern. Two motions about nitrogen policy from 2020 and 2023 might have very different vote splits (different coalitions, different political moment) but near-identical text embeddings. That's a meaningful connection.
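For reference, a hedged sketch of what one batched embedding call looks like, assuming the OpenAI-compatible client pointed at OpenRouter as described above; the helper name and environment variable are placeholders.

```python
# Sketch of a batched embedding request against OpenRouter's OpenAI-compatible endpoint.
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],  # placeholder variable name
)

def embed_batch(texts: list[str], model: str = "qwen/qwen3-embedding-4b") -> list[list[float]]:
    """Embed a batch of motion texts (roughly 200 at a time); one vector per text."""
    response = client.embeddings.create(model=model, input=texts)
    # Responses preserve input order; each item carries the vector under .embedding
    return [item.embedding for item in response.data]
```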
---
## Step 3: Fused Embeddings — The Best of Both Worlds
SVD gives us the political-structural signal: *how does this motion split the chamber?* Text embeddings give us semantic signal: *what is this motion about?*
SVD gives the political-structural signal: *how does this motion split the chamber?* Text embeddings give the semantic signal: *what is this motion about?*
We concatenate both into a **fused vector** per motion per window:
I concatenate both into a **fused vector** per motion per window:
```
fused = [svd_dims (50)] + [text_dims (2560)] = 2610 dimensions
fused = [svd_dims (typically 50)] + [text_dims (2560)] = typically 2610 dimensions
```
This fused representation powers the similarity search. Two motions are considered "close" if they're both about a similar topic *and* they produce a similar political split. This filters out spurious matches — two motions might both be controversial (splitting 50/50) but about completely unrelated things.
The actual dimension varies slightly because SVD dimensionality adapts to window density — the code stores `svd_dims` and `text_dims` per row so nothing downstream has to assume a fixed size.
This fused representation powers the similarity search. Two motions are "close" only if they're about a similar *topic* **and** they produce a similar *political split*. This filters out spurious matches — two motions might both be controversial (close 50/50 votes) but on completely unrelated things, and the text component separates them.
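The fusion step itself is tiny. A sketch of what gets stored per motion, with illustrative names rather than the exact schema:

```python
# Sketch of the per-motion fusion step; the real pipeline also records svd_dims and
# text_dims per row so downstream code never assumes a fixed vector length.
import numpy as np

def fuse(svd_vec: np.ndarray, text_vec: np.ndarray) -> dict:
    fused = np.concatenate([svd_vec, text_vec])
    return {
        "vector": fused,
        "svd_dims": int(svd_vec.shape[0]),    # typically 50, fewer in sparse windows
        "text_dims": int(text_vec.shape[0]),  # 2560 for qwen3-embedding-4b
    }
```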
---
@ -81,94 +85,101 @@ After the full pipeline run:
| 2020 | 4,228 |
| 2021 | 4,289 |
| 2022 | 4,116 |
| 2023 | 621 |
| 2023 | 3,272 |
| 2024 | 3,968 |
| 2025 | 3,715 |
| 2026 | 948 |
| **Total** | **28,172** |
The 2022 spike is striking — over 4,000 motions in a single year. This was the year the Rutte IV coalition took office amid intense debates on energy prices, housing, the war in Ukraine, and the ongoing nitrogen crisis. 2023 is similarly dense at 3,272 motions, culminating in the November election that brought PVV to its historic first-place finish.
The 2022 spike is striking — over 4,000 motions in a single year. This was the year the Rutte IV coalition took office amid intense debates on energy prices, housing, the war in Ukraine, and the ongoing nitrogen crisis.
Early years (2016–2018) use annual windows because the data is too sparse for meaningful quarterly SVD. From 2019 onwards, everything runs quarterly, giving us 38 windows in total.
Our similarity cache now holds **627,272 precomputed pairs** (top 20 neighbors per motion per window), making similarity lookup instant at query time.
The similarity cache holds **405,216 precomputed pairs** — top 10 neighbors per motion per window — making lookup instant at query time.
---
## Interesting Findings
### The 2022 Polarization Surge
### The 2022–2023 Polarization Surge
The 2022 cohort dominates the dataset. Looking at the SVD positions for that year, the distance between the governing coalition (VVD, D66, CDA, CU) and the opposition (PVV, SP, FvD) is near its maximum. The nitrogen crisis and energy policy debates forced unusually sharp coalition discipline.
2022 and 2023 together account for more than a quarter of all motions in the dataset. In the SVD positions for 2022, the distance between the governing coalition (VVD, D66, CDA, CU) and the opposition (PVV, SP, FvD) is near its maximum. The nitrogen crisis and energy policy debates forced unusually sharp coalition discipline — which shows up geometrically as well-separated clusters.
2023 continued the intensity, and the Procrustes-aligned trajectory shows the party positions in 2023-Q4 and 2024-Q1 shifting noticeably as the new coalition began to form.
### BBB's Geometric Arrival
When BBB (BoerBurgerBeweging) entered parliament in 2023 with a historic 16 seats, their SVD position placed them between PVV and CDA — exactly as expected from their policy profile: agrarian-nationalist populism with Catholic-provincial roots. The model found this without being told.
When BBB (BoerBurgerBeweging) entered parliament in 2023 with a historic 16 seats, their SVD position placed them between PVV and CDA — exactly matching their policy profile: agrarian-nationalist populism with Catholic-provincial roots. The model found this without being told. That's a good sanity check that the geometry is capturing something real.
### The Strange Case of "Verworpen."
Motions that are rejected without debate are recorded with the title "Verworpen." (Rejected.). There are hundreds of these. Because they share a single 9-character title, their text embeddings are identical — meaning every "Verworpen." has cosine similarity 1.0 to every other "Verworpen." This is technically correct (they are textually identical) but semantically meaningless. The similarity cache contains these spurious pairs, which the UI layer needs to filter out.
Motions rejected without debate are recorded with the title "Verworpen." (Rejected.). There are hundreds of these. Because they share a 9-character title, their text embeddings are **identical** — cosine similarity 1.0 to every other "Verworpen." in the cache. Technically correct; semantically meaningless. The UI layer filters these out.
It's a good reminder that **data quality surprises emerge at scale**.
It's a reminder that **data quality surprises emerge at scale**. I found three or four similar pathologies (motions withdrawn mid-session, duplicate API records) that required explicit handling.
### Party Cohesion as a Signal
A subtle finding: party cohesion (how often all members of a party vote the same way) varies enormously. SGP and CU have near-perfect cohesion — they vote as a bloc on almost everything. PvdA/GroenLinks (post-merger) has similarly high cohesion. But in earlier years (2019-2020), before the merger, GroenLinks occasionally splits on specific issues around security policy.
Party cohesion — how often all MPs of a party vote identically — varies enormously. SGP and CU are near-perfect blocs. PvdA/GroenLinks (post-2023 merger) is similarly tight. VVD shows the most internal variation, which tracks with what you'd expect from a governing party managing coalition discipline across conflicting wings.
VVD shows the most internal variation — governing parties develop fissures.
In earlier years (2019–2020), before the GroenLinks-PvdA merger, GroenLinks occasionally split on security and defense policy — visible in the SVD as individual MP positions diverging from the party centroid.
---
## The Pipeline Architecture
The system is built around a single DuckDB database and a modular Python pipeline:
Single DuckDB database, modular Python pipeline, no cloud infrastructure:
```
API (Tweede Kamer OData)
→ download_past_year.py
→ motions table (25,500+ rows)
→ motions table (28,172 rows)
motions
→ extract_mp_votes.py → mp_votes table (200k rows)
→ text_pipeline.py → embeddings table (25,640 rows, via OpenRouter)
→ svd_pipeline.py → svd_vectors table (50,779 rows, quarterly windows)
→ extract_mp_votes.py → mp_votes table (506,336 rows)
→ sync_motion_content.py → body_text enrichment (26,447 motions, ~94%)
→ text_pipeline.py → embeddings table (28,172 rows, qwen3-embedding-4b via OpenRouter)
→ svd_pipeline.py → svd_vectors table (54,150 rows, 38 windows)
svd_vectors + embeddings
→ fusion.py → fused_embeddings table (35,872 rows)
→ fusion.py → fused_embeddings table (40,522 rows)
fused_embeddings
→ similarity/compute.py → similarity_cache table (627k rows, top-20 per window)
→ similarity/compute.py → similarity_cache table (405,216 rows, top-10 per window)
```
Everything runs locally. The only external call is to the OpenRouter API for text embeddings. The similarity computation (627k pairs) is pure NumPy — load vectors, normalize, matrix multiply, take top-k. For 4,000 motions in a quarter, that's a 4000×4000 cosine similarity matrix computed in seconds.
The similarity computation is pure NumPy: load all fused vectors for a window, pad to uniform length, L2-normalize, compute the full `N×N` cosine similarity matrix via a single matrix multiply (`normalized @ normalized.T`), then extract top-k neighbors per row with `np.argpartition`. For a 4,000-motion quarter, that's a 4000×4000 matrix operation — fast enough that it's not worth batching.
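A sketch of that per-window pass, with illustrative names (the real code loads fused vectors from DuckDB rather than taking a Python list):

```python
# Sketch of the per-window similarity pass: pad, normalize, one matmul, top-k per row.
import numpy as np

def top_k_neighbors(vectors: list[np.ndarray], k: int = 10):
    # Pad to uniform length so windows with different svd_dims still stack cleanly
    width = max(v.shape[0] for v in vectors)
    X = np.zeros((len(vectors), width))
    for i, v in enumerate(vectors):
        X[i, : v.shape[0]] = v

    # L2-normalize, then one matrix multiply gives the full cosine similarity matrix
    X /= np.linalg.norm(X, axis=1, keepdims=True) + 1e-12
    sims = X @ X.T
    np.fill_diagonal(sims, -np.inf)  # a motion is not its own neighbor

    # argpartition pulls the k best indices per row without a full sort
    idx = np.argpartition(-sims, k, axis=1)[:, :k]
    scores = np.take_along_axis(sims, idx, axis=1)
    return idx, scores
```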
The database sits at 15 GB on disk — up from ~3 GB before body text enrichment. The full parliamentary text for 26,000+ motions accounts for most of that growth.
---
## What's Next
The similarity cache and political compass open up several directions:
**Motion explorer**: Given a motion you care about, find the 20 most politically and semantically similar motions from across the decade. Trace how a policy debate evolved from 2019 to 2025.
**Motion explorer**: Given a motion, retrieve the 10 most politically and semantically similar ones from across the decade. Trace how a policy debate evolved — who championed it, how the coalitions shifted.
**Party trajectory plots**: Animate party positions on the 2D compass year by year. Watch D66 drift, watch PVV consolidate, watch the new parties arrive and find their position.
**Party trajectory animation**: Procrustes-aligned positions, animated year by year. Watch D66 drift post-2021, watch PVV consolidate its flank, watch new parties arrive and find their geometric home.
**Cross-party coalition predictor**: Given a new motion's text and expected vote split, predict which parties will support it based on past patterns.
**Cross-party coalition patterns**: The fused embeddings let us ask which topics produce unusual coalition configurations — motions where the normal left-right split breaks down and unexpected alliances form.
**The "controversy index"**: We already compute `1 - winning_margin` as a controversy score. The most controversial motions (close votes, high stakes topics) tell a story about where Dutch politics is genuinely undecided vs. where it's performing conflict for the cameras.
**The controversy index**: `1 - winning_margin` gives a controversy score per motion. The most contested votes — close margins, high-salience topics — tell a different story than the headline political narratives.
---
## Reproducibility
The full pipeline is open and runs on a single machine with no cloud infrastructure:
```bash
# Download historical data
python scripts/download_past_year.py --start-date 2016-01-01 --end-date 2026-01-01
# Run full pipeline (extract votes, compute SVD, embed text, fuse, build similarity cache)
# Run full pipeline (SVD, text embeddings, fusion, similarity cache)
python -m pipeline.run_pipeline --db-path data/motions.db \
--start-date 2016-01-01 --end-date 2026-01-01 \
--window-size annual --text-batch-size 200
--window-size quarterly --text-batch-size 200
# Enrich with full motion body text
python scripts/sync_motion_content.py --db-path data/motions.db
```
The DB grows to ~3.6GB for the full dataset (mostly embeddings and vote records). Everything else — the SVD, fusion, similarity cache — fits comfortably in memory during computation.
The DB grows to ~15 GB for the full dataset including body text. All computation — SVD, fusion, similarity — runs locally on a single machine.
Democracy is more legible than it looks.

@ -0,0 +1,125 @@
{
"window": "2025",
"important": {
"0": [
"2185",
"1354",
"145",
"1983",
"3111",
"1299",
"3246",
"1967",
"3061",
"1682"
],
"1": [
"164",
"799",
"792",
"3536",
"3120",
"3001",
"3011",
"3013",
"1814",
"103"
],
"2": [
"1958",
"1958",
"1432",
"1432",
"1584",
"1584",
"1139",
"1259",
"1139",
"1172"
],
"3": [
"2539",
"2539",
"2541",
"2541",
"2452",
"2452",
"929",
"1625",
"929",
"509"
],
"4": [
"1456",
"1456",
"2466",
"2466",
"1344",
"1344",
"2313",
"2313",
"1446",
"1446"
],
"5": [
"906",
"906",
"3244",
"3244",
"2060",
"2060",
"2967",
"2967",
"1817",
"1817"
],
"6": [
"1826",
"1826",
"2333",
"2333",
"2002",
"2002",
"879",
"3518",
"3518",
"879"
],
"7": [
"1964",
"1964",
"3370",
"3370",
"3216",
"3216",
"2379",
"2379",
"3517",
"3517"
],
"8": [
"719",
"719",
"3281",
"3281",
"3530",
"3626",
"3530",
"3626",
"2001",
"2001"
],
"9": [
"720",
"720",
"408",
"408",
"730",
"730",
"732",
"732",
"3443",
"3443"
]
}
}

File diff suppressed because one or more lines are too long

@ -1,9 +1,8 @@
# Session: stemwijzer — Parliamentary Embedding Pipeline
Updated: 2026-03-22T16:00:00Z
# Session: stemwijzer
Updated: 2026-03-23T09:00:00Z
## Goal
2D political compass + motion similarity search from parliamentary votes + motion text.
Full historical coverage 2016–2026, precomputed similarity cache, fused (SVD + text) embeddings.
2D political compass + motion similarity search from parliamentary votes + motion text. Full historical coverage 2016–2026, precomputed similarity cache, fused (SVD + text) embeddings.
## Constraints
- DuckDB only (`data/motions.db`); open/close `duckdb.connect(self.db_path)` per method
@ -77,3 +76,47 @@ Full historical coverage 2016–2026, precomputed similarity cache, fused (SVD +
## Branch
`main`
## Progress
### Done
- [x] All items listed under "Completed This Session" above
### In Progress
- [ ] Rerun embeddings: started scripts/rerun_embeddings.py against `data/motions.db`
- Start time: 2026-03-23T01:42:00Z (approx)
- Current progress: embeddings stored = 950 / total motions = 28,172
- fused_embeddings = 0 (not started)
- similarity_cache = 0 (not started)
### Blocked
- Not fully blocked, but encountering provider failures and warnings that slow progress:
- Batch 951..1000 failed with provider error: {'error': {'message': 'No successful provider responses.', 'code': 404}} (recorded)
- Occasional connection pool warnings during earlier body fetch phase (logged)
- Provider failures are transient but may require retries or provider change if repeated
## Key Decisions
- **Retry strategy on provider failure**: On repeated provider failures, retry embedding batches with a smaller batch_size (e.g. 50 -> 20) or switch provider. Rationale: smaller batches reduce per-request risk and increase the chance of partial success; switch providers if failures persist. (UNCONFIRMED)
## Next Steps
1. Continue the rerun_embeddings job until completion; monitor batches closely
2. If provider failures repeat, retry failed batches with smaller batch_size (50 -> 20) or switch provider (as above)
3. On completion, update ledger with final counts and list any failed motion IDs
4. If fused_embeddings / similarity_cache remain 0 after embeddings finished, run fusion and similarity recompute pipelines
## File Operations
### Read
- `data/motions.db`
- `scripts/rerun_embeddings.py` (invoked)
### Modified
- `thoughts/ledgers/CONTINUITY_stemwijzer.md` (this file)
## Critical Context
- Rerun started 2026-03-23T01:42Z; current embeddings stored = 950 of 28,172 total motions.
- Recent error: Batch 951..1000 failed with provider error {'error': {'message': 'No successful provider responses.', 'code': 404}} — these batch numbers and error payload should be retried.
- ETA: approx 1.5–2.5 hours remaining at current rate (UNCONFIRMED, depends on provider stability)
- Earlier stage produced occasional connection pool warnings while fetching motion bodies; these did not stop progress but may indicate transient network instability.
## Working Set
- Branch: `main`
- Key files: `data/motions.db`, `scripts/rerun_embeddings.py`, `thoughts/ledgers/CONTINUITY_stemwijzer.md`

@ -0,0 +1,549 @@
[
{
"id": "91d3a66a-5542-4325-8fc0-f0715b570e5b",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:01:06.889693Z"
},
{
"id": "1e1bcee0-5f2a-4337-b57f-ca83ac93da7e",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:01:07.327717Z"
},
{
"id": "51fec578-84ae-4d69-85df-b415a2b6c752",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:01:07.827925Z"
},
{
"id": "38c9ba12-5829-410c-ac38-b992a9b22652",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:01:11.681524Z"
},
{
"id": "1e69741a-d0ff-41c9-b5e7-e4e2e8475836",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:01:12.061577Z"
},
{
"id": "c2fa7d58-958b-4efb-b670-d044de0db357",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:01:12.104491Z"
},
{
"id": "4bd245c7-9e6c-41dc-bce0-f36d3b675ef8",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:03:14.657886Z"
},
{
"id": "e885a2da-48f3-4130-b62f-413ae2670b9c",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:03:14.997464Z"
},
{
"id": "29949f88-0739-4029-964a-d1a9be3d1030",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:03:15.057155Z"
},
{
"id": "b08b870b-1923-4cc7-8384-6a4fd5a5ae63",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:08:18.540282Z"
},
{
"id": "acb1d1ef-1a2f-4256-b23c-dc7272e6cda8",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:08:18.992755Z"
},
{
"id": "a2e7f741-ce46-4533-a4a1-98d202ad5ba9",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:16:44.733143Z"
},
{
"id": "208fd3d6-dcaa-408a-b7ef-054703083756",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:16:45.983301Z"
},
{
"id": "9112851e-ba85-4498-8aa9-4f71aa91d6ec",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:16:46.060827Z"
},
{
"id": "2a5bffe4-c75b-46f2-baf1-164ac87953d6",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:18:02.707023Z"
},
{
"id": "834a7419-6d1b-48e0-825c-08c3ea780c94",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:18:04.158612Z"
},
{
"id": "9d27c575-c186-4cc6-b202-1e7d44600983",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:18:04.213958Z"
},
{
"id": "b03f2ce4-6ac0-41a5-8f32-36dc86db4048",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:19:07.187178Z"
},
{
"id": "4589cfd1-16c9-4743-b2e9-15be42e121e7",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:19:07.560463Z"
},
{
"id": "b2e8fa30-b3ca-42f1-9138-c188b2683723",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:19:07.631447Z"
},
{
"id": "492bd375-b002-446b-b424-4dc3cf40ea44",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:20:12.734568Z"
},
{
"id": "e9349d9b-7962-489b-8a06-1753f1606048",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:20:13.089162Z"
},
{
"id": "a8af51f1-2126-4b57-87e3-6913ada4643b",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:20:13.152945Z"
},
{
"id": "999cccc1-dcbe-48c7-80f0-229351780823",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:20:26.037664Z"
},
{
"id": "03acdcb0-da76-4ac7-bc7d-ea5a314734b8",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:20:27.339457Z"
},
{
"id": "964943f7-c431-484a-a2a3-070327287d90",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:20:27.382680Z"
},
{
"id": "4d4ffec0-83c9-48e6-8724-df04de5cf741",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:25:42.534010Z"
},
{
"id": "56511987-38b1-4286-a8ac-9771c297051f",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:25:42.861164Z"
},
{
"id": "89e29b19-8926-47c8-8a52-922a65da4189",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:25:44.542652Z"
},
{
"id": "a430eb8d-c36e-4ad1-b5c3-01c1fc5f3be5",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:30:11.148013Z"
},
{
"id": "2b108662-6d4b-48b6-88ba-9f3111491217",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:30:12.361529Z"
},
{
"id": "18aad6fc-567f-4d7d-8569-28357c6c301d",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:30:12.429871Z"
},
{
"id": "730467ab-2c1d-47ec-a29e-9d02803c8b1f",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T20:31:09.659557Z"
},
{
"id": "e87170fe-c03b-42a6-bd96-d45a26717359",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T20:31:10.991569Z"
},
{
"id": "cf5c8957-c893-478d-924b-ce77d9e53a41",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T20:31:11.039826Z"
},
{
"id": "d0aa2441-6fc8-4df9-bf37-621f8d9e5351",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T21:37:36.823732Z"
},
{
"id": "e55be18c-0fb5-4576-aa51-24559148916e",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T21:37:38.061523Z"
},
{
"id": "b8f7ee6c-4ce5-4415-97bc-e00e02b2c851",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T21:37:38.105562Z"
},
{
"id": "75e76c14-28e5-4f5b-a2d4-52c9dba5efc4",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T21:53:11.948658Z"
},
{
"id": "0e515c39-ef8a-4abf-87fb-1c96e669e7f5",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T21:53:13.215965Z"
},
{
"id": "836ba937-d26f-4869-9661-36d5744f649a",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T21:53:13.259045Z"
},
{
"id": "95606fed-c023-4fb1-98a4-5c5707e95056",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T21:54:11.170757Z"
},
{
"id": "e188e203-932c-4f79-bed9-af1ec1336102",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T21:54:12.444776Z"
},
{
"id": "591eb30e-a91f-448b-b9a0-9a3d75a35790",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T21:54:12.491990Z"
},
{
"id": "75d70cad-41f4-49f8-ae8a-08b1bafb02b4",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T21:58:08.120685Z"
},
{
"id": "a0cc28c1-28f6-409e-ad5c-8a974ff28e00",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T21:58:08.847188Z"
},
{
"id": "84e5699d-64d9-4179-99e2-f79cb9b27b33",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T21:58:08.888009Z"
},
{
"id": "f0e47b0a-4791-4475-9036-9e87a5e00be8",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T22:31:18.698163Z"
},
{
"id": "50006ac9-d6b5-4198-9cca-c937dee60eab",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T22:31:19.938465Z"
},
{
"id": "d44d09e5-fa20-40ea-8829-dfa044addf57",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T22:31:19.997516Z"
},
{
"id": "c1876f8b-79bd-4fd0-9d0f-68eb8dd3321d",
"actor_id": null,
"action": "embedding_failed",
"target_type": "motion",
"target_id": "99",
"metadata": {
"error": "RuntimeError(\"Simulated embedding failure for index 0: 'failing motion'\")"
},
"created_at": "2026-03-23T22:52:46.531868Z"
},
{
"id": "39fb4426-e348-455c-b4f7-0e77c6a72dc7",
"actor_id": null,
"action": "test_action",
"target_type": "unit",
"target_id": "u1",
"metadata": {
"k": 1
},
"created_at": "2026-03-23T22:52:47.772599Z"
},
{
"id": "fc04bbb8-1378-460c-9914-c923cd1a45f8",
"actor_id": null,
"action": "another_action",
"target_type": "motion",
"target_id": null,
"metadata": {},
"created_at": "2026-03-23T22:52:47.836920Z"
}
]

@ -0,0 +1,29 @@
{
"timestamp": "2026-03-23T19:43:35.098568Z",
"sample_size": 2,
"top_k": 3,
"results": [
{
"motion_id": 1,
"top_k": 3,
"suspicious": 1
},
{
"motion_id": 2,
"top_k": 3,
"suspicious": 1
}
],
"motions": {
"1": {
"motion_id": 1,
"top_k": 3,
"suspicious": 1
},
"2": {
"motion_id": 2,
"top_k": 3,
"suspicious": 1
}
}
}

@ -0,0 +1,113 @@
---
date: 2026-03-24
topic: "Welk tweede kamerlid ben jij?"
status: draft
---
## Problem Statement
We need a new Streamlit tab in explorer.py titled **"Welk tweede kamerlid ben jij?"** that interactively narrows the list of 2026 MPs by asking the user a sequence of yes/no/abstain questions (motions). The goal: find the minimal set of motions (questions) that uniquely identifies a single MP, or determine that no unique MP exists because two or more MPs have identical voting records.
**Why:** This is a guided identification quiz that helps users discover which MP they most resemble by iteratively comparing their answers to historic MP votes.
## Constraints
- Work inside the existing Streamlit explorer (single-file UI: **explorer.py**).
- Use existing data models/tables: **mp_votes**, **mp_metadata**, **motions** (DuckDB / MotionDatabase). No new external services.
- Keep reads read-only: do not modify the DB from the UI flow.
- YAGNI: minimal viable UX first (linear question flow, basic results table), extensible later.
## Approach (chosen)
I recommend a two-stage approach that balances simplicity and correctness:
- **Stage A (Batch-match + ranking):** Ask the user a small curated set of motions (e.g., high-controversy / high-discriminative score). Collect answers into a map motion_id -> vote and compute per-MP agreement counts using a new read-only DB helper. Show ranked candidates and whether any are unique.
- **Stage B (Minimal distinguishing set):** If multiple candidates tie (or more than one remain), compute a minimal discriminating set of additional motions by greedily selecting motions that best split the remaining candidate set and present them as follow-up questions until a unique MP or impossibility is reached.
Alternatives considered (rejected):
- Asking motions adaptively from the start using an information-gain search over the entire motion space. Rejected because it’s heavier to implement and harder to explain to users; we can implement a greedy information-gain variant later.
- Building a full decision tree offline for all MPs. Rejected for now because the dataset and party churn make maintenance cumbersome.
Effort estimate (rough):
- Backend: add one DB method to MotionDatabase (match_mps_for_votes) + helper to compute split scores — 2–4 hours.
- Frontend: add new Streamlit builder, UI state, and wiring into tabs — 2–4 hours.
- Testing & polish: 2–3 hours.
Risks & dependencies
- **Data quality:** If mp_votes.party or mp_metadata are incomplete, matching may be imperfect. We rely on existing backfill scripts to improve party fields.
- **Performance:** Joins over mp_votes can be large; we'll limit candidate motion set and use read-only DuckDB queries, with caching where appropriate.
## Architecture
High-level components (all in-process Streamlit app):
- **Explorer UI (explorer.py)** — new tab builder **build_mp_quiz_tab**. Presents questions and displays results.
- **MotionDatabase (database.py)** — new read-only method **match_mps_for_votes(user_votes, limit)** that returns per-MP agreement and overlap counts. Also a helper **choose_discriminating_motions(candidates, excluded_motion_ids, k=1)** that scores motions by how well they split candidate MPs.
- **DuckDB (data)** — existing tables: motions, mp_votes, mp_metadata.
All calls stay local — the Streamlit UI instantiates MotionDatabase(db_path) and calls the new read methods.
## Components and Responsibilities
- **build_mp_quiz_tab (explorer.py)**
- Render intro and instructions.
- Load an initial pool of candidate motions (curated by controversy or SVD components via existing load_motions_df).
- Present one question at a time, store answers in st.session_state (motion_id -> vote).
- After each answer (or on demand), call MotionDatabase.match_mps_for_votes to get ranked candidates.
- If multiple candidates remain, call the discriminating-motion helper to pick the next question.
- Show final result (unique MP or note that multiple MPs are indistinguishable).
- **MotionDatabase.match_mps_for_votes (database.py)**
- Input: user_votes dict {motion_id: vote_str}
- Output: ordered list of {mp_name, party, matched, total, agreement_pct}
- Implementation: create an in-memory relation of user_votes, join with mp_votes where mp_name LIKE '%,%' and aggregate matched / overlap counts. Order by agreement_pct, matched desc. A rough sketch follows this list.
- **MotionDatabase.choose_discriminating_motions (database.py)**
- Input: remaining candidate mp_names, excluded_motion_ids
- Output: motion_id(s) ranked by split-score (e.g., entropy or max-min split)
- Implementation: for a small candidate set, compute how many MPs vote 'voor'/'tegen'/'onthouden' on each motion and pick motion with best split.
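Rough sketch of the matching aggregation (illustrative only, not the final implementation; it assumes mp_votes has columns motion_id, mp_name, party, vote):

```python
# Sketch: register the user's answers as an in-memory relation and aggregate per MP.
import duckdb
import pandas as pd

def match_mps_for_votes_sketch(db_path: str, user_votes: dict[int, str], limit: int = 50):
    answers = pd.DataFrame({
        "motion_id": list(user_votes),
        "user_vote": [v.lower() for v in user_votes.values()],
    })
    con = duckdb.connect(db_path, read_only=True)
    try:
        con.register("user_votes", answers)  # in-memory relation, no DB writes
        return con.execute(
            """
            SELECT mv.mp_name,
                   min(mv.party) AS party,
                   SUM(CASE WHEN lower(mv.vote) = uv.user_vote THEN 1 ELSE 0 END) AS matched,
                   COUNT(*) AS overlap,
                   ROUND(100.0 * SUM(CASE WHEN lower(mv.vote) = uv.user_vote THEN 1 ELSE 0 END)
                         / COUNT(*), 1) AS agreement_pct
            FROM mp_votes mv
            JOIN user_votes uv USING (motion_id)
            GROUP BY mv.mp_name
            ORDER BY agreement_pct DESC, matched DESC, mv.mp_name
            LIMIT ?
            """,
            [limit],
        ).fetchall()
    finally:
        con.close()
```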
Files to modify (concrete)
- explorer.py
- Add function: build_mp_quiz_tab(...) near other build_*_tab functions (e.g., after build_svd_components_tab).
- Add new tab label to the tab_labels list and wire into the st.tabs and fallback radio branches. (See existing tab pattern at explorer.py around lines ~626-779.)
- database.py
- Add methods: match_mps_for_votes and choose_discriminating_motions near calculate_party_matches / mp_votes helpers.
## Data Flow
1. UI loads candidate motion list via existing load_motions_df(db_path).
2. User answers a question => stored in st.session_state['mp_quiz_votes'] mapping motion_id -> vote_token.
3. UI calls MotionDatabase.match_mps_for_votes(user_votes) (read-only DuckDB). Returns sorted candidate MPs with matched/total/agreement_pct.
4. If >1 candidate remains, UI calls MotionDatabase.choose_discriminating_motions(candidates, excluded) to pick next motion(s).
5. Repeat until one candidate remains OR no motion splits candidates (tie by identical voting histories).
## Error Handling
- Validation: normalize UI votes to the canonical tokens used in mp_votes (lowercase Dutch tokens like 'voor','tegen','onthouden','afwezig').
- Empty or missing data: if user_votes is empty or no overlaps exist, show helpful message and fall back to top-ranked MPs by similarity.
- Division-by-zero: in match computations, treat zero-overlap MPs as excluded from ranking and surface a clear message.
- Timeouts / heavy queries: restrict candidate set and use read-only DuckDB and caching (@st.cache_data) to avoid repeated heavy queries.
## Testing Strategy
- Unit tests for database methods (new tests/test_match_mps.py):
- small synthetic mp_votes fixture to assert matched/total/agreement_pct logic.
- tests for choose_discriminating_motions producing expected splits.
- Integration test for explorer tab (tests/test_explorer_quiz.py): render the builder function in a headless mode and assert UI state updates and DB calls succeed (similar to existing tests/test_explorer_import.py).
## Open Questions
1. Do we want an initial curated motion set (top-10 controversial), or start fully adaptive? I'll implement a small curated seed and make adaptive/discovery optional.
2. UX: Should we let users skip a question (abstain) and count abstain as a valid token? I assume yes and will treat abstain as a normal vote that matches mp_votes 'onthouden' or 'afwezig' values.
3. Performance limits: how many motions should we allow the user to answer (arbitrary cap e.g., 20)? I suggest 20 to keep interactions snappy.
## Next steps
I'm proceeding to create the design doc file at thoughts/shared/designs/2026-03-24-welk-tweede-kamerlid-ben-jij-design.md and commit it. Interrupt if you want changes. After that I'll spawn the planner to create a detailed implementation plan based on this design.

@ -0,0 +1,197 @@
# "Welk tweede kamerlid ben jij?" Implementation Plan
**Goal:** Add a Streamlit quiz tab that interactively asks the user motion (vote) questions and narrows the set of 2026 MPs to find the best-matching MP. Implement two DB helpers (matching + discriminating-motion selection), the UI builder and tab wiring, and tests. Minimal viable changes only — no UX bells & whistles.
**Design:** thoughts/shared/designs/2026-03-24-welk-tweede-kamerlid-ben-jij-design.md
---
## Dependency Graph
```
Batch 1 (parallel): 1.1 [foundation - no deps], 1.2 (plan file) [none]
Batch 2 (parallel): 2.1 [explorer UI - depends: 1.1]
Batch 3 (parallel): 3.1 [integration tests - depends: 1.1,2.1]
```
---
## Summary of implementation decisions (gap-filling)
- MotionDatabase.match_mps_for_votes: implement as a read-only DuckDB-backed method on the existing MotionDatabase class (database.py). It accepts user_votes: Dict[int, str] where keys are motion ids and values are UI vote tokens. I will implement vote normalization inside the method (mapping UI tokens to canonical DB tokens) to avoid touching other modules. Rationale: keeps surface changes minimal and avoids creating new modules.
- MotionDatabase.choose_discriminating_motions: implement in the same file. For a small candidate set (expected << 200 MPs), fetch mp_votes for candidate MPs across candidate motions (excluding already-answered motion ids). Score candidate motions by information-entropy of vote distribution among remaining candidates (higher entropy = better split). Ties broken by controversy_score then motion id.
- Explorer UI changes: add build_mp_quiz_tab(db_path) to explorer.py and wire it into the tabs list and fallback radio. Use st.session_state['mp_quiz_votes'] to store answers as mapping str(motion_id)->UI token. Use @st.cache_data on any expensive DB-calls in the UI layer.
- Vote token normalization: UI will present choices: "Voor", "Tegen", "Onthouden", "Afwezig / Geen stem". The DB stores lowercase tokens like 'voor', 'tegen', 'onthouden', 'afwezig'. match_mps_for_votes will normalize case and a small set of variants (e.g., 'Geen stem' -> 'afwezig', 'Abstain' -> 'onthouden') — explicit list included in tests.
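A minimal sketch of that normalization mapping, assuming the tokens listed above (the authoritative variant list lives in the tests; the extra variant below is hypothetical):

```python
# Sketch of the vote-token normalization helper; not the final variant list.
from typing import Optional

_CANONICAL = {"voor", "tegen", "onthouden", "afwezig"}
_VARIANTS = {
    "geen stem": "afwezig",
    "afwezig / geen stem": "afwezig",
    "abstain": "onthouden",
    "niet deelgenomen": "afwezig",  # hypothetical variant, shown for illustration
}

def normalize_vote(token: str) -> Optional[str]:
    """Map a UI or DB vote token to one of the four canonical lowercase tokens."""
    t = token.strip().lower()
    if t in _CANONICAL:
        return t
    return _VARIANTS.get(t)  # None for unrecognized tokens; callers decide how to handle
```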
---
## BATCH 1: Foundation (parallel - N implementers)
All tasks in this batch have NO dependencies and can run simultaneously.
### Task 1.1: Add DB helpers to MotionDatabase
**File:** `database.py` (modify existing)
**Test:** `tests/test_match_mps.py`
**Depends:** none
Description / Acceptance criteria:
- Add two new public methods to MotionDatabase:
- match_mps_for_votes(user_votes: Dict[int, str], limit: int = 50) -> List[Dict]
- Returns an ordered list (desc by agreement_pct) of dicts with keys: mp_name, party, matched (int), overlap (int), agreement_pct (float 0-100).
- Behavior: for each mp present in mp_votes for any of the provided motions compute:
- overlap = number of motions where MP has a recorded vote AND the user provided a non-empty vote (i.e., not "Geen stem").
- matched = number of those overlaps where normalized(mp_vote) == normalized(user_vote).
- agreement_pct = matched / overlap * 100 rounded to 1 decimal. MPs with overlap==0 are excluded from the returned list.
- Ordering: agreement_pct desc, then matched desc, then mp_name asc.
- choose_discriminating_motions(candidates: List[str], excluded_motion_ids: List[int], k: int = 1) -> List[int]
- For the provided candidate mp_names, compute vote distributions per motion (voor/tegen/onthouden/afwezig) excluding motion ids in excluded_motion_ids.
- Score each motion by Shannon entropy of the vote distribution among the candidate MPs (treating 'afwezig' as a separate bucket). Higher entropy preferred (a sketch appears after the implementation notes below).
- Return top-k motion ids as a list, tiebreakers: higher controversy_score (motions table) then lower motion id.
Implementation notes & decisions:
- Implement normalization inside these methods. Normalization mapping (DB vote -> canonical): map DB votes lowercased to one of {'voor','tegen','onthouden','afwezig'}. UI inputs (Voor/Tegen/Onthouden/Geen stem) normalized to these same tokens.
- For performance, implement SQL queries that select mp_votes filtered by motion_id IN (...) and mp_name IN (candidates) and aggregate via GROUP BY mp_name and vote. For small candidate sets and a limited set of motion_ids this will be fast. If duckdb is not available, fall back to in-Python aggregates using the file-backed JSON format already present in MotionDatabase._init_database.
- Add docstrings and basic parameter validation (raise ValueError for empty user_votes or empty candidates input). Tests will cover expected exceptions.
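Illustrative sketch of the entropy scoring described in the acceptance criteria (not the final implementation; the real helper adds the controversy_score and motion-id tiebreakers):

```python
# Sketch: rank motions by how evenly they split the remaining candidate MPs.
# `candidate_votes` is an iterable of (motion_id, vote) rows restricted to those MPs.
import math
from collections import Counter, defaultdict

def rank_motions_by_entropy(candidate_votes, excluded_motion_ids=()):
    excluded = set(excluded_motion_ids)
    by_motion: dict[int, Counter] = defaultdict(Counter)
    for motion_id, vote in candidate_votes:
        if motion_id not in excluded:
            by_motion[motion_id][vote] += 1

    def entropy(counts: Counter) -> float:
        total = sum(counts.values())
        return -sum((n / total) * math.log2(n / total) for n in counts.values() if n)

    # Highest entropy first; the production helper would break ties on controversy_score,
    # then lower motion id, per the criteria above.
    return sorted(by_motion, key=lambda m: entropy(by_motion[m]), reverse=True)
```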
Test outline (tests/test_match_mps.py):
- Setup: create a temporary MotionDatabase using a temp db_path (MotionDatabase.reset_database() can be used if duckdb available; otherwise use file-backed mode). Insert a small set of motions and mp_votes via insert_motion / insert_mp_vote. Create at least 3 MPs with overlapping but distinct vote patterns across 4-6 motions.
- Tests:
1) test_match_basic_counts: user_votes covering 3 motions returns expected matched/overlap/agreement_pct per MP.
2) test_match_excludes_zero_overlap: MPs with no recorded votes for provided motions are excluded.
3) test_choose_discriminating_motions_entropy_ranking: with a small candidate set, the chosen motion(s) split candidates as expected (assert returned motion id is one of known good splitters)
4) test_invalid_input: calling match_mps_for_votes with empty user_votes raises ValueError.
Verify: `pytest -q tests/test_match_mps.py`
Commit message: `feat(database): add match_mps_for_votes and choose_discriminating_motions`
Estimated time: 3.0 - 4.5 hours
---
### Task 1.2: Add plan file (this document)
**File:** `thoughts/shared/plans/2026-03-24-welk-tweede-kamerlid-ben-jij-plan.md` (this file)
**Test:** none
**Depends:** none
Description: Add the implementation plan (this document) to the repo to provide step-by-step microtasks to implementers. No tests.
Verify: visually review file in repo. No test run.
Commit message: `docs(plans): add plan for 'Welk tweede kamerlid ben jij?'`
Estimated time: 0.25 - 0.5 hours
---
## BATCH 2: Core UI (parallel - depends on Batch 1)
All tasks in this batch assume the DB methods from Task 1.1 exist.
### Task 2.1: Add Streamlit quiz tab & wiring
**File:** `explorer.py` (modify existing)
**Test:** `tests/test_explorer_quiz.py`
**Depends:** 1.1
Description / Acceptance criteria:
- Add a function `build_mp_quiz_tab(db_path: str) -> None` placed near other build_*_tab functions (as described in the design, e.g., after build_svd_components_tab or near the top of the tab builders). The function must:
- Render a short intro/instructions.
- Load an initial pool of candidate motions using existing `load_motions_df(db_path)` and pick a seed set (top N by controversy_score). Decision: seed N = 8 (configurable constant in the function: SEED_MOTIONS = 8) — this is small and fast.
- Present questions one at a time: show motion title + layman_explanation (if available) and a radio with choices: "Voor", "Tegen", "Onthouden", "Geen stem" and a "Skip"/"Niet zeker" optional button mapped to "Geen stem". Choice stored to `st.session_state['mp_quiz_votes']` as mapping with keys str(motion_id) -> UI token.
- After each answer, call MotionDatabase.match_mps_for_votes(user_votes) to fetch ranked candidates and display a small DataFrame (top 10) with columns: MP name, party, matched, overlap, agreement_pct. Use st.dataframe.
- If more than 1 candidate remains with top agreement_pct tied, call MotionDatabase.choose_discriminating_motions(candidates, excluded_motion_ids) to pick the next question to ask and continue until one unique MP remains or choose_discriminating_motions returns an empty list (tie / indistinguishable). Cap total questions at 20 (SESS_CAP = 20).
- When a unique MP is found (agreement_pct == 100, overlap > 0, and only one MP holds the highest agreement), show a final MP summary (name, party) and their matching motions count.
- Use caching: wrap any repeated DB lookups (e.g., load_motions_df already cached) and mark heavy updates via @st.cache_data where appropriate.
Implementation notes & decisions:
- Keep all UI state local to st.session_state with keys prefixed `mp_quiz_` to avoid collisions.
- Normalize UI tokens before sending to DB helper (but DB methods will also normalize; duplication is defensive).
- Keep the UI function self-contained in explorer.py (do not create new modules for this minimal MVP). A rough skeleton of the builder follows these notes.
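A rough skeleton of the builder under the decisions above (not the final implementation; `load_motions_df` is the existing cached helper in explorer.py, and the seed/cap constants follow the acceptance criteria):

```python
# Sketch of build_mp_quiz_tab as it would sit inside explorer.py; widget keys are placeholders.
import pandas as pd
import streamlit as st

from database import MotionDatabase

SEED_MOTIONS = 8
SESS_CAP = 20

def build_mp_quiz_tab(db_path: str) -> None:
    st.subheader("Welk tweede kamerlid ben jij?")
    votes = st.session_state.setdefault("mp_quiz_votes", {})  # str(motion_id) -> UI token

    # Seed pool: most controversial motions from the existing cached loader in explorer.py
    motions = load_motions_df(db_path).nlargest(SEED_MOTIONS, "controversy_score")
    pending = [m for m in motions.itertuples() if str(m.id) not in votes]

    if pending and len(votes) < SESS_CAP:
        motion = pending[0]
        choice = st.radio(
            motion.title,
            ["Voor", "Tegen", "Onthouden", "Geen stem"],
            key=f"mp_quiz_choice_{motion.id}",
        )
        if st.button("Volgende", key=f"mp_quiz_next_{motion.id}"):
            votes[str(motion.id)] = choice

    if votes:
        ranked = MotionDatabase(db_path).match_mps_for_votes(
            {int(mid): tok for mid, tok in votes.items()}
        )
        st.dataframe(pd.DataFrame(ranked).head(10))
```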
Test outline (tests/test_explorer_quiz.py):
- Use monkeypatching to inject a MotionDatabase mock into explorer module or run in a test DB using MotionDatabase with temp db_path. The test must be import-safe (explorer.py imports many heavy libs), so follow pattern used by existing tests/test_explorer_import.py: import the module and assert `build_mp_quiz_tab` exists and is callable.
- Functional assertions:
1) test_builder_exists: import explorer, assert callable(build_mp_quiz_tab)
2) test_ui_state_update_simulation: simulate st.session_state by creating a fake session dict (use monkeypatch to set st.session_state to a dict-like object) and calling build_mp_quiz_tab with a small temp DB where motions and mp_votes are prepared. Assert that after calling the builder with pre-filled votes the DataFrame block would display ranked candidates (test inspects returned structure if builder returns it, or else monkeypatch MotionDatabase.match_mps_for_votes to verify it was called with expected mapping).
Verification: `pytest -q tests/test_explorer_quiz.py`
Commit message: `feat(ui): add 'Welk tweede kamerlid ben jij?' tab and wiring in explorer.py`
Estimated time: 2.0 - 4.0 hours
---
## BATCH 3: Integration & Tests (parallel - depends on Batches 1+2)
### Task 3.1: Add integration test for quiz flow
**File:** `tests/test_explorer_quiz_integration.py`
**Test:** this file
**Depends:** 1.1, 2.1
Description / Acceptance criteria:
- Create an end-to-end-ish headless test that:
- Sets up a temporary MotionDatabase instance (temp file path) and inserts a small controlled dataset: ~6 motions, 4 MPs with distinct votes.
- Calls build_mp_quiz_tab via explorer with monkeypatched st.session_state (or with a minimal wrapper) and simulates a sequence of user answers by pre-populating st.session_state['mp_quiz_votes'].
- Asserts that final candidate set matches expectations: either a unique MP (when answers match exactly one MP) or that the function properly identifies indistinguishable MPs (when two MPs have identical votes).
Testing details & choices:
- Avoid launching Streamlit server; tests only import explorer module and call the builder function in the same way other explorer tests do. Use monkeypatch to stub expensive functions (plotly, query_similar) where required.
Verify: `pytest -q tests/test_explorer_quiz_integration.py`
Commit message: `test(ui): add integration tests for mp quiz tab flow`
Estimated time: 2.0 - 3.0 hours
---
## Verification & CI
- Local verification commands (per task) use pytest. Example:
- `pytest -q tests/test_match_mps.py`
- `pytest -q tests/test_explorer_quiz.py`
- `pytest -q tests/test_explorer_quiz_integration.py`
- CI expectations: run full test suite. The new tests should be lightweight and use temporary DBs / monkeypatching to avoid depending on large production DB.
---
## Commit & PR Strategy
- Work in a feature branch `feat/mp-quiz-2026-03-24`.
- Make small focused commits per task (messages suggested above). Each micro-task should be one commit.
- PR organization:
- PR #1 (Batch 1): database.py changes + tests/test_match_mps.py — target only DB helpers and their unit tests. Keep this PR small so backend logic can be reviewed independently.
- PR #2 (Batch 2): explorer.py UI builder + tests/test_explorer_quiz.py — depends on PR #1; rebase after PR #1 merges or open as stacked PR (base=feat/mp-quiz-2026-03-24).
- PR #3 (Batch 3): integration+polish tests (tests/test_explorer_quiz_integration.py) and any small fixes discovered during integration testing.
- Review checklist for each PR:
- Tests covering edge cases (zero-overlap MPs, empty inputs)
- DB queries use read_only DuckDB connections
- UI uses st.session_state and @st.cache_data appropriately
- No production DB writes, no schema changes
---
## Risks & Mitigations (short)
- Performance: selecting motions across the entire motions table could be heavy. Mitigation: seed with top-N controversial motions and limit choose_discriminating_motions to motions that have mp_votes rows for the candidate MPs only.
- Data quality: MPs with identical votes will remain indistinguishable — surface clearly to user. Tests include that scenario.
---
## Task checklist for implementers (copy/paste friendly)
- [ ] Task 1.1: Modify database.py — implement match_mps_for_votes & choose_discriminating_motions. Add tests in tests/test_match_mps.py. (3.0–4.5h)
- [ ] Task 1.2: Add this plan file. (0.25–0.5h)
- [ ] Task 2.1: Modify explorer.py — add build_mp_quiz_tab and wire into tabs. Add tests in tests/test_explorer_quiz.py. (2.0–4.0h)
- [ ] Task 3.1: Add integration test tests/test_explorer_quiz_integration.py to exercise quiz flow. (2.0–3.0h)
---
If you run into ambiguous input normalization details or DB edge-cases, follow the choices documented above (explicit normalization mapping, exclude zero-overlap MPs, use entropy scoring). If you encounter a blocker (e.g. missing mp_votes data in test fixtures), create small test fixtures using MotionDatabase.insert_motion and insert_mp_vote in the test setup.
Good luck — keep PRs small and tests fast.

@ -58,6 +58,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload-time = "2025-03-13T11:10:21.14Z" },
]
[[package]]
name = "beautifulsoup4"
version = "4.14.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "soupsieve" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
]
[[package]]
name = "blinker"
version = "1.9.0"
@ -369,6 +382,68 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/30/a8/e61a8c2b3cc7a597073d9cde1fcbb567e9d827f1db30c93cf80422eac70d/llvmlite-0.46.0-cp314-cp314-win_amd64.whl", hash = "sha256:7821eda3ec1f18050f981819756631d60b6d7ab1a6cf806d9efefbe3f4082d61", size = 39153056, upload-time = "2025-12-08T18:15:33.938Z" },
]
[[package]]
name = "lxml"
version = "6.0.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/aa/88/262177de60548e5a2bfc46ad28232c9e9cbde697bd94132aeb80364675cb/lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62", size = 4073426, upload-time = "2025-09-22T04:04:59.287Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/53/fd/4e8f0540608977aea078bf6d79f128e0e2c2bba8af1acf775c30baa70460/lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77", size = 8648494, upload-time = "2025-09-22T04:01:54.242Z" },
{ url = "https://files.pythonhosted.org/packages/5d/f4/2a94a3d3dfd6c6b433501b8d470a1960a20ecce93245cf2db1706adf6c19/lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f", size = 4661146, upload-time = "2025-09-22T04:01:56.282Z" },
{ url = "https://files.pythonhosted.org/packages/25/2e/4efa677fa6b322013035d38016f6ae859d06cac67437ca7dc708a6af7028/lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452", size = 4946932, upload-time = "2025-09-22T04:01:58.989Z" },
{ url = "https://files.pythonhosted.org/packages/ce/0f/526e78a6d38d109fdbaa5049c62e1d32fdd70c75fb61c4eadf3045d3d124/lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048", size = 5100060, upload-time = "2025-09-22T04:02:00.812Z" },
{ url = "https://files.pythonhosted.org/packages/81/76/99de58d81fa702cc0ea7edae4f4640416c2062813a00ff24bd70ac1d9c9b/lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df", size = 5019000, upload-time = "2025-09-22T04:02:02.671Z" },
{ url = "https://files.pythonhosted.org/packages/b5/35/9e57d25482bc9a9882cb0037fdb9cc18f4b79d85df94fa9d2a89562f1d25/lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1", size = 5348496, upload-time = "2025-09-22T04:02:04.904Z" },
{ url = "https://files.pythonhosted.org/packages/a6/8e/cb99bd0b83ccc3e8f0f528e9aa1f7a9965dfec08c617070c5db8d63a87ce/lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916", size = 5643779, upload-time = "2025-09-22T04:02:06.689Z" },
{ url = "https://files.pythonhosted.org/packages/d0/34/9e591954939276bb679b73773836c6684c22e56d05980e31d52a9a8deb18/lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd", size = 5244072, upload-time = "2025-09-22T04:02:08.587Z" },
{ url = "https://files.pythonhosted.org/packages/8d/27/b29ff065f9aaca443ee377aff699714fcbffb371b4fce5ac4ca759e436d5/lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6", size = 4718675, upload-time = "2025-09-22T04:02:10.783Z" },
{ url = "https://files.pythonhosted.org/packages/2b/9f/f756f9c2cd27caa1a6ef8c32ae47aadea697f5c2c6d07b0dae133c244fbe/lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a", size = 5255171, upload-time = "2025-09-22T04:02:12.631Z" },
{ url = "https://files.pythonhosted.org/packages/61/46/bb85ea42d2cb1bd8395484fd72f38e3389611aa496ac7772da9205bbda0e/lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679", size = 5057175, upload-time = "2025-09-22T04:02:14.718Z" },
{ url = "https://files.pythonhosted.org/packages/95/0c/443fc476dcc8e41577f0af70458c50fe299a97bb6b7505bb1ae09aa7f9ac/lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659", size = 4785688, upload-time = "2025-09-22T04:02:16.957Z" },
{ url = "https://files.pythonhosted.org/packages/48/78/6ef0b359d45bb9697bc5a626e1992fa5d27aa3f8004b137b2314793b50a0/lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484", size = 5660655, upload-time = "2025-09-22T04:02:18.815Z" },
{ url = "https://files.pythonhosted.org/packages/ff/ea/e1d33808f386bc1339d08c0dcada6e4712d4ed8e93fcad5f057070b7988a/lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2", size = 5247695, upload-time = "2025-09-22T04:02:20.593Z" },
{ url = "https://files.pythonhosted.org/packages/4f/47/eba75dfd8183673725255247a603b4ad606f4ae657b60c6c145b381697da/lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314", size = 5269841, upload-time = "2025-09-22T04:02:22.489Z" },
{ url = "https://files.pythonhosted.org/packages/76/04/5c5e2b8577bc936e219becb2e98cdb1aca14a4921a12995b9d0c523502ae/lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2", size = 3610700, upload-time = "2025-09-22T04:02:24.465Z" },
{ url = "https://files.pythonhosted.org/packages/fe/0a/4643ccc6bb8b143e9f9640aa54e38255f9d3b45feb2cbe7ae2ca47e8782e/lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7", size = 4010347, upload-time = "2025-09-22T04:02:26.286Z" },
{ url = "https://files.pythonhosted.org/packages/31/ef/dcf1d29c3f530577f61e5fe2f1bd72929acf779953668a8a47a479ae6f26/lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf", size = 3671248, upload-time = "2025-09-22T04:02:27.918Z" },
{ url = "https://files.pythonhosted.org/packages/03/15/d4a377b385ab693ce97b472fe0c77c2b16ec79590e688b3ccc71fba19884/lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe", size = 8659801, upload-time = "2025-09-22T04:02:30.113Z" },
{ url = "https://files.pythonhosted.org/packages/c8/e8/c128e37589463668794d503afaeb003987373c5f94d667124ffd8078bbd9/lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d", size = 4659403, upload-time = "2025-09-22T04:02:32.119Z" },
{ url = "https://files.pythonhosted.org/packages/00/ce/74903904339decdf7da7847bb5741fc98a5451b42fc419a86c0c13d26fe2/lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d", size = 4966974, upload-time = "2025-09-22T04:02:34.155Z" },
{ url = "https://files.pythonhosted.org/packages/1f/d3/131dec79ce61c5567fecf82515bd9bc36395df42501b50f7f7f3bd065df0/lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5", size = 5102953, upload-time = "2025-09-22T04:02:36.054Z" },
{ url = "https://files.pythonhosted.org/packages/3a/ea/a43ba9bb750d4ffdd885f2cd333572f5bb900cd2408b67fdda07e85978a0/lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0", size = 5055054, upload-time = "2025-09-22T04:02:38.154Z" },
{ url = "https://files.pythonhosted.org/packages/60/23/6885b451636ae286c34628f70a7ed1fcc759f8d9ad382d132e1c8d3d9bfd/lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba", size = 5352421, upload-time = "2025-09-22T04:02:40.413Z" },
{ url = "https://files.pythonhosted.org/packages/48/5b/fc2ddfc94ddbe3eebb8e9af6e3fd65e2feba4967f6a4e9683875c394c2d8/lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0", size = 5673684, upload-time = "2025-09-22T04:02:42.288Z" },
{ url = "https://files.pythonhosted.org/packages/29/9c/47293c58cc91769130fbf85531280e8cc7868f7fbb6d92f4670071b9cb3e/lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d", size = 5252463, upload-time = "2025-09-22T04:02:44.165Z" },
{ url = "https://files.pythonhosted.org/packages/9b/da/ba6eceb830c762b48e711ded880d7e3e89fc6c7323e587c36540b6b23c6b/lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37", size = 4698437, upload-time = "2025-09-22T04:02:46.524Z" },
{ url = "https://files.pythonhosted.org/packages/a5/24/7be3f82cb7990b89118d944b619e53c656c97dc89c28cfb143fdb7cd6f4d/lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9", size = 5269890, upload-time = "2025-09-22T04:02:48.812Z" },
{ url = "https://files.pythonhosted.org/packages/1b/bd/dcfb9ea1e16c665efd7538fc5d5c34071276ce9220e234217682e7d2c4a5/lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917", size = 5097185, upload-time = "2025-09-22T04:02:50.746Z" },
{ url = "https://files.pythonhosted.org/packages/21/04/a60b0ff9314736316f28316b694bccbbabe100f8483ad83852d77fc7468e/lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f", size = 4745895, upload-time = "2025-09-22T04:02:52.968Z" },
{ url = "https://files.pythonhosted.org/packages/d6/bd/7d54bd1846e5a310d9c715921c5faa71cf5c0853372adf78aee70c8d7aa2/lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8", size = 5695246, upload-time = "2025-09-22T04:02:54.798Z" },
{ url = "https://files.pythonhosted.org/packages/fd/32/5643d6ab947bc371da21323acb2a6e603cedbe71cb4c99c8254289ab6f4e/lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a", size = 5260797, upload-time = "2025-09-22T04:02:57.058Z" },
{ url = "https://files.pythonhosted.org/packages/33/da/34c1ec4cff1eea7d0b4cd44af8411806ed943141804ac9c5d565302afb78/lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c", size = 5277404, upload-time = "2025-09-22T04:02:58.966Z" },
{ url = "https://files.pythonhosted.org/packages/82/57/4eca3e31e54dc89e2c3507e1cd411074a17565fa5ffc437c4ae0a00d439e/lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b", size = 3670072, upload-time = "2025-09-22T04:03:38.05Z" },
{ url = "https://files.pythonhosted.org/packages/e3/e0/c96cf13eccd20c9421ba910304dae0f619724dcf1702864fd59dd386404d/lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed", size = 4080617, upload-time = "2025-09-22T04:03:39.835Z" },
{ url = "https://files.pythonhosted.org/packages/d5/5d/b3f03e22b3d38d6f188ef044900a9b29b2fe0aebb94625ce9fe244011d34/lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8", size = 3754930, upload-time = "2025-09-22T04:03:41.565Z" },
{ url = "https://files.pythonhosted.org/packages/5e/5c/42c2c4c03554580708fc738d13414801f340c04c3eff90d8d2d227145275/lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d", size = 8910380, upload-time = "2025-09-22T04:03:01.645Z" },
{ url = "https://files.pythonhosted.org/packages/bf/4f/12df843e3e10d18d468a7557058f8d3733e8b6e12401f30b1ef29360740f/lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba", size = 4775632, upload-time = "2025-09-22T04:03:03.814Z" },
{ url = "https://files.pythonhosted.org/packages/e4/0c/9dc31e6c2d0d418483cbcb469d1f5a582a1cd00a1f4081953d44051f3c50/lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601", size = 4975171, upload-time = "2025-09-22T04:03:05.651Z" },
{ url = "https://files.pythonhosted.org/packages/e7/2b/9b870c6ca24c841bdd887504808f0417aa9d8d564114689266f19ddf29c8/lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed", size = 5110109, upload-time = "2025-09-22T04:03:07.452Z" },
{ url = "https://files.pythonhosted.org/packages/bf/0c/4f5f2a4dd319a178912751564471355d9019e220c20d7db3fb8307ed8582/lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37", size = 5041061, upload-time = "2025-09-22T04:03:09.297Z" },
{ url = "https://files.pythonhosted.org/packages/12/64/554eed290365267671fe001a20d72d14f468ae4e6acef1e179b039436967/lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338", size = 5306233, upload-time = "2025-09-22T04:03:11.651Z" },
{ url = "https://files.pythonhosted.org/packages/7a/31/1d748aa275e71802ad9722df32a7a35034246b42c0ecdd8235412c3396ef/lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9", size = 5604739, upload-time = "2025-09-22T04:03:13.592Z" },
{ url = "https://files.pythonhosted.org/packages/8f/41/2c11916bcac09ed561adccacceaedd2bf0e0b25b297ea92aab99fd03d0fa/lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd", size = 5225119, upload-time = "2025-09-22T04:03:15.408Z" },
{ url = "https://files.pythonhosted.org/packages/99/05/4e5c2873d8f17aa018e6afde417c80cc5d0c33be4854cce3ef5670c49367/lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d", size = 4633665, upload-time = "2025-09-22T04:03:17.262Z" },
{ url = "https://files.pythonhosted.org/packages/0f/c9/dcc2da1bebd6275cdc723b515f93edf548b82f36a5458cca3578bc899332/lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9", size = 5234997, upload-time = "2025-09-22T04:03:19.14Z" },
{ url = "https://files.pythonhosted.org/packages/9c/e2/5172e4e7468afca64a37b81dba152fc5d90e30f9c83c7c3213d6a02a5ce4/lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e", size = 5090957, upload-time = "2025-09-22T04:03:21.436Z" },
{ url = "https://files.pythonhosted.org/packages/a5/b3/15461fd3e5cd4ddcb7938b87fc20b14ab113b92312fc97afe65cd7c85de1/lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d", size = 4764372, upload-time = "2025-09-22T04:03:23.27Z" },
{ url = "https://files.pythonhosted.org/packages/05/33/f310b987c8bf9e61c4dd8e8035c416bd3230098f5e3cfa69fc4232de7059/lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec", size = 5634653, upload-time = "2025-09-22T04:03:25.767Z" },
{ url = "https://files.pythonhosted.org/packages/70/ff/51c80e75e0bc9382158133bdcf4e339b5886c6ee2418b5199b3f1a61ed6d/lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272", size = 5233795, upload-time = "2025-09-22T04:03:27.62Z" },
{ url = "https://files.pythonhosted.org/packages/56/4d/4856e897df0d588789dd844dbed9d91782c4ef0b327f96ce53c807e13128/lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f", size = 5257023, upload-time = "2025-09-22T04:03:30.056Z" },
{ url = "https://files.pythonhosted.org/packages/0f/85/86766dfebfa87bea0ab78e9ff7a4b4b45225df4b4d3b8cc3c03c5cd68464/lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312", size = 3911420, upload-time = "2025-09-22T04:03:32.198Z" },
{ url = "https://files.pythonhosted.org/packages/fe/1a/b248b355834c8e32614650b8008c69ffeb0ceb149c793961dd8c0b991bb3/lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca", size = 4406837, upload-time = "2025-09-22T04:03:34.027Z" },
{ url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" },
]

[[package]]
name = "markdown-it-py"
version = "4.0.0"
@ -1035,6 +1110,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]

[[package]]
name = "soupsieve"
version = "2.8.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" },
]

[[package]]
name = "sqlglot"
version = "27.6.0"
@ -1049,8 +1133,10 @@ name = "stemwijzer"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "duckdb" },
{ name = "ibis-framework", extra = ["duckdb"] },
{ name = "lxml" },
{ name = "openai" },
{ name = "plotly" },
{ name = "pytest" },
@ -1064,8 +1150,10 @@ dependencies = [

[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.14.3" },
{ name = "duckdb", specifier = ">=1.3.2" },
{ name = "ibis-framework", extras = ["duckdb"], specifier = ">=10.8.0" },
{ name = "lxml", specifier = ">=6.0.2" },
{ name = "openai", specifier = ">=1.99.7" },
{ name = "plotly", specifier = ">=5.0" },
{ name = "pytest", specifier = ">=9.0.2" },
