|
|
|
|
@ -14,9 +14,10 @@ Both modes return a dict mapping mp_name → scalar score for the given window. |
|
|
|
|
|
|
|
|
|
import json |
|
|
|
|
import logging |
|
|
|
|
from typing import Dict, List, Optional |
|
|
|
|
from typing import Dict, List, Optional, Tuple |
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
|
|
from . import trajectory as _trajectory |
|
|
|
|
import duckdb |
|
|
|
|
|
|
|
|
|
_logger = logging.getLogger(__name__) |
|
|
|
|
@ -125,3 +126,162 @@ def compute_anchor_axis( |
|
|
|
|
axis = axis / norm |
|
|
|
|
|
|
|
|
|
return {name: float(np.dot(vec, axis)) for name, vec in mp_vecs.items()} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_2d_axes(
    db_path: str,
    window_ids: Optional[List[str]] = None,
    method: str = "pca",
    anchor_kwargs: Optional[Dict] = None,
) -> Tuple[Dict[str, Dict[str, Tuple[float, float]]], Dict[str, np.ndarray]]:
    """Compute 2D coordinates for MPs per window.

    Per-window vectors are loaded and aligned through the ``trajectory``
    helpers, then every vector is projected onto one pair of axes shared by
    all windows.

    Args:
        db_path: path to duckdb
        window_ids: optional ordered list of windows (defaults to all)
        method: 'pca' or 'anchor'
        anchor_kwargs: when method=='anchor' must provide
            {
                'left_parties': List[str],
                'right_parties': List[str],
                'prog_parties': List[str],
                'cons_parties': List[str],
            }

    Returns:
        positions_by_window, axis_def
        - positions_by_window: {window_id: {mp_name: (x,y)}}
        - axis_def: {'x_axis': np.ndarray, 'y_axis': np.ndarray, 'method': str}

        ``({}, {})`` is returned instead when no vectors were loaded, when
        the SVD of the 'pca' branch fails, or when any of the four anchor
        groups matches no vector.

    Raises:
        ValueError: if ``method`` is neither 'pca' nor 'anchor'; if
            method=='anchor' and ``anchor_kwargs`` is missing or empty; if
            the left-right axis has near-zero norm; or if the
            progressive-conservative axis vanishes once its left-right
            component is removed.

    Notes:
        This function expects aligned SVD vectors produced by
        trajectory._procrustes_align_windows. It will call trajectory helpers
        to load and align windows so the returned coordinates are consistent
        across windows.

        'pca' coordinates are centred on the global mean of all vectors;
        'anchor' coordinates are plain dot products with the (unit) axes and
        are not centred.
    """
    if window_ids is None:
        window_ids = _trajectory._load_window_ids(db_path)

    # Load per-window raw vectors using the trajectory helper and align them
    raw_window_vecs: Dict[str, Dict[str, np.ndarray]] = {
        wid: _trajectory._load_mp_vectors_for_window(db_path, wid)
        for wid in window_ids
    }
    aligned_window_vecs = _trajectory._procrustes_align_windows(raw_window_vecs)

    # Flatten every (window, entity) vector; entity_index runs parallel to
    # all_vecs so rows of the stacked matrix can be mapped back.
    all_vecs: List[np.ndarray] = []
    entity_index: List[Tuple[str, str]] = []
    for wid, vecs in aligned_window_vecs.items():
        for ent, vec in vecs.items():
            all_vecs.append(vec)
            entity_index.append((wid, ent))

    if not all_vecs:
        _logger.info("No vectors loaded for windows %s", window_ids)
        return ({}, {})

    M = np.vstack(all_vecs)

    if method == "pca":
        # centre globally (the same mean is reused for the projection below)
        global_mean = M.mean(axis=0)
        Mc = M - global_mean
        try:
            _, _, Vt = np.linalg.svd(Mc, full_matrices=False)
        except np.linalg.LinAlgError:
            _logger.exception("SVD failed in compute_2d_axes (pca)")
            return ({}, {})

        # take top-2 components as axes; with a single component available
        # the y axis degrades to the zero vector, so every y coordinate is 0
        comp1 = Vt[0]
        comp2 = Vt[1] if Vt.shape[0] > 1 else np.zeros_like(comp1)
        axes = {
            # the epsilon keeps the division finite for an all-zero component
            "x_axis": comp1 / (np.linalg.norm(comp1) + 1e-12),
            "y_axis": comp2 / (np.linalg.norm(comp2) + 1e-12),
            "method": "pca",
        }

        positions_by_window: Dict[str, Dict[str, Tuple[float, float]]] = {
            wid: {} for wid in window_ids
        }
        for (wid, ent), v_centered in zip(entity_index, Mc):
            x = float(np.dot(v_centered, axes["x_axis"]))
            y = float(np.dot(v_centered, axes["y_axis"]))
            # setdefault tolerates a window id coming back from alignment
            # that was not listed in window_ids (cannot be ruled out here)
            positions_by_window.setdefault(wid, {})[ent] = (x, y)

        return positions_by_window, axes

    if method == "anchor":
        if not anchor_kwargs:
            raise ValueError("anchor_kwargs required for method='anchor'")

        # The party lookup is identical for all four groups: query it once.
        mp_party_rows = _load_mp_party_rows(db_path)

        group_vecs = {
            key: _collect_anchor_vectors(
                aligned_window_vecs, mp_party_rows, set(anchor_kwargs.get(key, []))
            )
            for key in ("left_parties", "right_parties", "prog_parties", "cons_parties")
        }
        if not all(group_vecs.values()):
            _logger.warning("Insufficient anchor vectors for requested parties")
            return ({}, {})

        left_centroid = np.mean(np.vstack(group_vecs["left_parties"]), axis=0)
        right_centroid = np.mean(np.vstack(group_vecs["right_parties"]), axis=0)
        prog_centroid = np.mean(np.vstack(group_vecs["prog_parties"]), axis=0)
        cons_centroid = np.mean(np.vstack(group_vecs["cons_parties"]), axis=0)

        lr = right_centroid - left_centroid
        pc = cons_centroid - prog_centroid

        # Gram-Schmidt: make pc orthogonal to lr
        lr_norm = np.linalg.norm(lr)
        if lr_norm < 1e-12:
            raise ValueError("Left-right anchor axis has near-zero norm")
        lr_hat = lr / lr_norm

        # remove projection of pc on lr
        pc = pc - np.dot(pc, lr_hat) * lr_hat
        pc_norm = np.linalg.norm(pc)
        if pc_norm < 1e-12:
            raise ValueError(
                "Progressive-conservative anchor axis degenerate after orthogonalisation"
            )
        pc_hat = pc / pc_norm

        axes = {"x_axis": lr_hat, "y_axis": pc_hat, "method": "anchor"}

        positions_by_window = {wid: {} for wid in window_ids}
        for wid, vecs in aligned_window_vecs.items():
            window_positions = positions_by_window.setdefault(wid, {})
            for ent, vec in vecs.items():
                x = float(np.dot(vec, lr_hat))
                y = float(np.dot(vec, pc_hat))
                window_positions[ent] = (x, y)

        return positions_by_window, axes

    raise ValueError("Unknown method '%s'" % method)


def _load_mp_party_rows(db_path: str) -> List[Tuple[str, str]]:
    """Return every (mp_name, party) row of the mp_metadata table."""
    conn = duckdb.connect(db_path)
    try:
        return conn.execute("SELECT mp_name, party FROM mp_metadata").fetchall()
    finally:
        # close even when the query raises so the connection is never leaked
        conn.close()


def _collect_anchor_vectors(
    aligned_window_vecs: Dict[str, Dict[str, np.ndarray]],
    mp_party_rows: List[Tuple[str, str]],
    party_set: set,
) -> List[np.ndarray]:
    """Gather, across all windows, the vectors that belong to party_set."""
    # party-level entities (entity id equals party name)
    collected: List[np.ndarray] = [
        vec
        for vecs in aligned_window_vecs.values()
        for ent, vec in vecs.items()
        if ent in party_set
    ]
    # MP-level entities, matched through their mp_metadata party affiliation;
    # an MP contributes one vector per window in which it appears
    for mp_name, party in mp_party_rows:
        if party not in party_set:
            continue
        for vecs in aligned_window_vecs.values():
            if mp_name in vecs:
                collected.append(vecs[mp_name])
    return collected
|
|
|
|
|