You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
189 lines
5.7 KiB
189 lines
5.7 KiB
"""Content validation primitives for agent operation.
|
|
|
|
Tools for validating data quality, coverage, and content correctness.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from agent_tools.database import query_motions, query_svd_vectors
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def validate_motion_coverage(
    db_path: str,
    start_date: str,
    end_date: str,
) -> Dict[str, Any]:
    """Report windows inside a date range that contain no motions.

    Up to 10000 motions are fetched with ``query_motions`` and the range
    ``[start_date, end_date)`` is walked in consecutive windows of at most
    31 days (fixed-length steps, not calendar months). A window with no
    motion dated inside it is reported as a gap.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        start_date: Inclusive range start; must be accepted by
            ``datetime.fromisoformat``.
        end_date: Exclusive range end; must be accepted by
            ``datetime.fromisoformat``.

    Returns:
        A dict with:
          * ``gaps``: list of ``{"start", "end"}`` ISO strings, one per empty
            window.
          * ``coverage_rate``: share of whole days in the range not covered by
            a gap, rounded to four decimals (0.0 when the range spans no
            whole day).
          * ``total_motions``: number of motions fetched.
          * ``date_range``: the ``start_date`` / ``end_date`` inputs, echoed.
        If anything raises, the exception is logged and
        ``{"gaps": [], "coverage_rate": 0.0, "error": <message>}`` is returned.
    """
    try:
        motions = query_motions(db_path, limit=10000)

        if not motions:
            # Nothing stored at all: the entire requested range is one gap.
            return {
                "gaps": [{"start": start_date, "end": end_date}],
                "coverage_rate": 0.0,
                "total_motions": 0,
                "date_range": {"start": start_date, "end": end_date},
            }

        start = datetime.fromisoformat(start_date)
        end = datetime.fromisoformat(end_date)

        # Parse every motion date exactly once, instead of once per window.
        # Rows with a missing or unparseable date cannot prove coverage, so
        # they are skipped rather than aborting the whole report.
        motion_dates: List[datetime] = []
        skipped = 0
        for motion in motions:
            raw_date = motion.get("date")
            if raw_date is None:
                skipped += 1
                continue
            try:
                motion_dates.append(datetime.fromisoformat(str(raw_date)))
            except ValueError:
                skipped += 1
        if skipped:
            logger.warning(
                "validate_motion_coverage: ignored %d motion(s) without a usable date",
                skipped,
            )

        gaps = []
        gap_days = 0
        current = start
        while current < end:
            # The last window is clipped so it never runs past the range end.
            window_end = min(current + timedelta(days=31), end)
            if not any(current <= d < window_end for d in motion_dates):
                gaps.append({
                    "start": current.isoformat(),
                    "end": window_end.isoformat(),
                })
                gap_days += (window_end - current).days
            current = window_end

        total_days = (end - start).days
        coverage_rate = (
            round((total_days - gap_days) / total_days, 4) if total_days > 0 else 0.0
        )

        return {
            "gaps": gaps,
            "coverage_rate": coverage_rate,
            "total_motions": len(motions),
            "date_range": {"start": start_date, "end": end_date},
        }
    except Exception as e:
        logger.exception("validate_motion_coverage failed")
        return {"gaps": [], "coverage_rate": 0.0, "error": str(e)}
|
|
|
|
|
|
def validate_layman_explanations(
    db_path: str,
    sample_size: int = 100,
) -> Dict[str, Any]:
    """Measure how many sampled motions carry a non-blank layman explanation.

    Up to ``sample_size`` motions are fetched with ``query_motions``. A motion
    counts as explained when its ``layman_explanation`` value is truthy and is
    not whitespace-only once converted to a string.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        sample_size: Value handed to ``query_motions`` as ``limit``.

    Returns:
        A dict with ``sample_size`` (rows actually fetched), ``coverage``
        (explained / fetched, rounded to four decimals), ``empty_count`` and
        ``total_in_db``. When no rows come back, only the first three keys are
        present, all zero. If anything raises, the exception is logged and
        ``{"sample_size": 0, "coverage": 0.0, "error": <message>}`` is returned.
    """
    try:
        sampled = query_motions(db_path, limit=sample_size)
        if not sampled:
            return {"sample_size": 0, "coverage": 0.0, "empty_count": 0}

        fetched = len(sampled)
        explained = 0
        for motion in sampled:
            text = motion.get("layman_explanation")
            if text and str(text).strip():
                explained += 1

        report: Dict[str, Any] = {}
        report["sample_size"] = fetched
        report["coverage"] = round(explained / fetched, 4)
        report["empty_count"] = fetched - explained
        # NOTE: this is the number of fetched rows, not a database-wide count.
        report["total_in_db"] = fetched
        return report
    except Exception as exc:
        logger.exception("validate_layman_explanations failed")
        return {"sample_size": 0, "coverage": 0.0, "error": str(exc)}
|
|
|
|
|
|
def suggest_svd_label(
    db_path: str,
    component: int,
    top_n: int = 10,
    window_id: str = "current_parliament",
) -> Dict[str, Any]:
    """Collect the motions that score most extremely on one SVD component.

    No label text is generated here; the function returns the motions at both
    ends of the component so a caller can derive a label from them.

    Args:
        db_path: Database location, passed through to ``query_svd_vectors``.
        component: 1-based component number; ``vector[component - 1]`` is the
            score that is read from each row.
        top_n: Maximum number of motions to return per pole. Values below 1
            yield empty poles.
        window_id: Window whose motion vectors are loaded. Defaults to
            ``"current_parliament"``.

    Returns:
        On success, a dict with ``component``, ``suggestion`` (holding
        ``negative_pole`` sorted ascending by score and ``positive_pole``
        sorted descending, each a list of ``{"motion_id", "score"}``),
        ``top_positive_ids`` and ``top_negative_ids``. On failure (invalid
        ``component``, no vectors found, or any exception, which is logged),
        ``{"component": component, "error": <message>}``.
    """
    # Kept function-local: the module's visible top-level imports do not
    # include json.
    import json

    try:
        if component < 1:
            # Without this guard, component <= 0 becomes a negative index and
            # silently reads a score from the END of every vector.
            return {
                "component": component,
                "error": f"component must be >= 1, got {component}",
            }

        rows = query_svd_vectors(db_path, window_id, entity_type="motion")

        if not rows:
            return {
                "component": component,
                "error": f"No SVD vectors found for {window_id}",
            }

        index = component - 1
        scored = []
        for row in rows:
            vec = row.get("vector")
            if isinstance(vec, str):
                # Vectors may arrive JSON-encoded as text.
                vec = json.loads(vec)
            # Rows whose vector is not a list, or is too short for this
            # component, are left out.
            if isinstance(vec, list) and index < len(vec):
                scored.append({
                    "motion_id": row.get("entity_id"),
                    "score": vec[index],
                })

        scored.sort(key=lambda item: item["score"])

        # Clamp so top_n <= 0 gives empty poles; scored[-0:] would otherwise
        # return the entire list as the positive pole.
        count = max(top_n, 0)
        negative = scored[:count]
        positive = scored[::-1][:count]

        return {
            "component": component,
            "suggestion": {
                "negative_pole": negative,
                "positive_pole": positive,
            },
            "top_positive_ids": [m["motion_id"] for m in positive],
            "top_negative_ids": [m["motion_id"] for m in negative],
        }
    except Exception as e:
        logger.exception("suggest_svd_label failed")
        return {"component": component, "error": str(e)}
|
|
|
|
|
|
def check_embedding_quality(
    db_path: str,
    window_id: str,
    healthy_threshold: float = 0.8,
) -> Dict[str, Any]:
    """Compare the number of stored motion vectors with the number of motions.

    Motion vectors for ``window_id`` are loaded with ``query_svd_vectors`` and
    up to 100000 motions with ``query_motions``; coverage is the ratio of the
    two row counts.

    Args:
        db_path: Database location, passed through to both queries.
        window_id: Window whose motion vectors are counted.
        healthy_threshold: Coverage must be strictly greater than this value
            for the result to be flagged healthy. Defaults to 0.8.

    Returns:
        A dict with ``window_id``, ``total_motions``, ``with_embeddings``,
        ``coverage`` (rounded to four decimals, 0.0 when there are no
        motions), ``healthy`` and ``healthy_threshold``. If anything raises,
        the exception is logged and
        ``{"window_id": window_id, "coverage": 0.0, "error": <message>}`` is
        returned.
    """
    try:
        vector_rows = query_svd_vectors(db_path, window_id, entity_type="motion")
        motion_rows = query_motions(db_path, limit=100000)

        motion_count = len(motion_rows)
        embedded_count = len(vector_rows)

        if motion_count > 0:
            ratio = round(embedded_count / motion_count, 4)
        else:
            # No motions to compare against: report zero rather than divide.
            ratio = 0.0

        is_healthy = ratio > healthy_threshold
        return {
            "window_id": window_id,
            "total_motions": motion_count,
            "with_embeddings": embedded_count,
            "coverage": ratio,
            "healthy": is_healthy,
            "healthy_threshold": healthy_threshold,
        }
    except Exception as exc:
        logger.exception("check_embedding_quality failed")
        return {"window_id": window_id, "coverage": 0.0, "error": str(exc)}
|
|
|