You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
motief/agent_tools/content.py

189 lines
5.7 KiB

"""Content validation primitives for agent operation.
Tools for validating data quality, coverage, and content correctness.
"""
from __future__ import annotations
import logging
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from agent_tools.database import query_motions, query_svd_vectors
logger = logging.getLogger(__name__)
def _parse_motion_date(value: Any) -> Optional[datetime]:
    """Return *value* parsed as an ISO-8601 datetime, or ``None`` if unusable."""
    if value is None:
        return None
    try:
        return datetime.fromisoformat(str(value))
    except ValueError:
        return None


def validate_motion_coverage(
    db_path: str,
    start_date: str,
    end_date: str,
) -> Dict[str, Any]:
    """Validate motion coverage for a date range.

    The range ``[start_date, end_date)`` is walked in consecutive windows of
    31 days (the last window is truncated at ``end_date``). A window that
    contains no motion is reported as a gap.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        start_date: Inclusive range start, ISO-8601 string.
        end_date: Exclusive range end, ISO-8601 string.

    Returns:
        A dict with:
          * ``gaps``: list of ``{"start", "end"}`` ISO strings, one per empty
            window.
          * ``coverage_rate``: share of days in the range not inside a gap,
            rounded to 4 decimals (0.0 when the range spans no full day).
          * ``total_motions``: number of motions fetched (at most 10000),
            regardless of whether they fall inside the range.
          * ``date_range``: the requested start and end, echoed back.
        On any failure the dict is
        ``{"gaps": [], "coverage_rate": 0.0, "error": <message>}``.
    """
    try:
        motions = query_motions(db_path, limit=10000)
        if not motions:
            # Nothing stored at all: the entire requested range is one gap.
            return {
                "gaps": [{"start": start_date, "end": end_date}],
                "coverage_rate": 0.0,
                "total_motions": 0,
                "date_range": {"start": start_date, "end": end_date},
            }

        start = datetime.fromisoformat(start_date)
        end = datetime.fromisoformat(end_date)

        # Parse every motion date exactly once. A motion whose date is
        # missing, NULL or malformed is skipped so that a single bad row
        # cannot abort the whole coverage check.
        parsed = [_parse_motion_date(m.get("date")) for m in motions]
        motion_dates = [d for d in parsed if d is not None]
        skipped = len(parsed) - len(motion_dates)
        if skipped:
            logger.warning(
                "validate_motion_coverage: skipped %d motion(s) with missing "
                "or unparseable dates",
                skipped,
            )

        gaps: List[Dict[str, str]] = []
        gap_days = 0
        window_start = start
        while window_start < end:
            # Fixed 31-day windows, not calendar months.
            window_end = min(window_start + timedelta(days=31), end)
            # NOTE: comparing a timezone-aware motion date with naive range
            # bounds raises TypeError, which is reported via the error path.
            if not any(window_start <= d < window_end for d in motion_dates):
                gaps.append({
                    "start": window_start.isoformat(),
                    "end": window_end.isoformat(),
                })
                gap_days += (window_end - window_start).days
            window_start = window_end

        total_days = (end - start).days
        if total_days > 0:
            coverage_rate = round((total_days - gap_days) / total_days, 4)
        else:
            coverage_rate = 0.0

        return {
            "gaps": gaps,
            "coverage_rate": coverage_rate,
            "total_motions": len(motions),
            "date_range": {"start": start_date, "end": end_date},
        }
    except Exception as e:
        logger.exception("validate_motion_coverage failed")
        return {"gaps": [], "coverage_rate": 0.0, "error": str(e)}
def validate_layman_explanations(
    db_path: str,
    sample_size: int = 100,
) -> Dict[str, Any]:
    """Measure how many sampled motions carry a layman explanation.

    Fetches up to ``sample_size`` motions via ``query_motions`` and counts
    those whose ``layman_explanation`` is truthy and not whitespace-only.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        sample_size: Maximum number of motions to fetch.

    Returns:
        A dict with ``sample_size`` (rows actually fetched), ``coverage``
        (explained / fetched, rounded to 4 decimals), ``empty_count`` and
        ``total_in_db``. Note that ``total_in_db`` is the number of fetched
        rows, i.e. the same value as ``sample_size``. When nothing is
        fetched only the first three keys are present, all zero. On failure
        the dict is ``{"sample_size": 0, "coverage": 0.0, "error": <message>}``.
    """
    try:
        sample = query_motions(db_path, limit=sample_size)
        if not sample:
            return {"sample_size": 0, "coverage": 0.0, "empty_count": 0}

        explained = 0
        for motion in sample:
            text = motion.get("layman_explanation")
            # Whitespace-only explanations count as missing.
            if text and str(text).strip():
                explained += 1

        fetched = len(sample)
        return {
            "sample_size": fetched,
            "coverage": round(explained / fetched, 4),
            "empty_count": fetched - explained,
            "total_in_db": fetched,
        }
    except Exception as exc:
        logger.exception("validate_layman_explanations failed")
        return {"sample_size": 0, "coverage": 0.0, "error": str(exc)}
def suggest_svd_label(
    db_path: str,
    component: int,
    top_n: int = 10,
    window_id: str = "current_parliament",
) -> Dict[str, Any]:
    """Collect the extreme motions on one SVD component to help label it.

    Args:
        db_path: Database location, passed through to ``query_svd_vectors``.
        component: 1-based index of the component to inspect.
        top_n: Number of motions to return per pole; values below zero are
            treated as zero.
        window_id: Window whose motion vectors are queried. Defaults to
            ``"current_parliament"``.

    Returns:
        A dict with ``component``, ``suggestion`` (``negative_pole`` sorted
        ascending by score, ``positive_pole`` sorted descending), plus
        ``top_positive_ids`` and ``top_negative_ids``. When fewer than
        ``2 * top_n`` motions are scored the two poles overlap. On invalid
        input, missing vectors or any failure the dict is
        ``{"component": component, "error": <message>}``.
    """
    import json

    try:
        if component < 1:
            # A non-positive component would otherwise index from the end of
            # each vector and silently report the wrong component.
            return {
                "component": component,
                "error": f"component must be >= 1, got {component}",
            }

        rows = query_svd_vectors(db_path, window_id, entity_type="motion")
        if not rows:
            return {
                "component": component,
                "error": f"No SVD vectors found for {window_id}",
            }

        index = component - 1
        scored = []
        malformed = 0
        for row in rows:
            vec = row.get("vector")
            if isinstance(vec, str):
                # Vectors may be stored as JSON text; skip rows that do not
                # decode instead of failing the whole request.
                try:
                    vec = json.loads(vec)
                except ValueError:
                    malformed += 1
                    continue
            if isinstance(vec, list) and index < len(vec):
                scored.append({
                    "motion_id": row.get("entity_id"),
                    "score": vec[index],
                })
        if malformed:
            logger.warning(
                "suggest_svd_label: skipped %d row(s) with undecodable vectors",
                malformed,
            )

        scored.sort(key=lambda item: item["score"])

        # Guard the slices: ``scored[-0:]`` is the whole list, and a negative
        # count would slice from the wrong end.
        limit = max(top_n, 0)
        negative = scored[:limit]
        positive = scored[-limit:][::-1] if limit else []

        return {
            "component": component,
            "suggestion": {
                "negative_pole": negative,
                "positive_pole": positive,
            },
            "top_positive_ids": [m["motion_id"] for m in positive],
            "top_negative_ids": [m["motion_id"] for m in negative],
        }
    except Exception as e:
        logger.exception("suggest_svd_label failed")
        return {"component": component, "error": str(e)}
def check_embedding_quality(
    db_path: str,
    window_id: str,
    healthy_threshold: float = 0.8,
) -> Dict[str, Any]:
    """Report how many motions have a vector in the given window.

    Compares the number of motion vectors returned by ``query_svd_vectors``
    for ``window_id`` with the number of motions returned by
    ``query_motions`` (fetched with a limit of 100000).

    Args:
        db_path: Database location, passed through to both queries.
        window_id: Window whose motion vectors are counted.
        healthy_threshold: Coverage ratio that must be strictly exceeded for
            the result to be flagged healthy. Defaults to 0.8.

    Returns:
        A dict with ``window_id``, ``total_motions``, ``with_embeddings``,
        ``coverage`` (rounded to 4 decimals, 0.0 when there are no motions),
        ``healthy`` and ``healthy_threshold``. On failure the dict is
        ``{"window_id": window_id, "coverage": 0.0, "error": <message>}``.
    """
    try:
        vector_rows = query_svd_vectors(db_path, window_id, entity_type="motion")
        motion_rows = query_motions(db_path, limit=100000)

        motion_count = len(motion_rows)
        embedded_count = len(vector_rows)

        if motion_count > 0:
            ratio = round(embedded_count / motion_count, 4)
        else:
            # No motions at all: report zero coverage rather than dividing by zero.
            ratio = 0.0

        report: Dict[str, Any] = {
            "window_id": window_id,
            "total_motions": motion_count,
            "with_embeddings": embedded_count,
            "coverage": ratio,
            "healthy": ratio > healthy_threshold,
            "healthy_threshold": healthy_threshold,
        }
        return report
    except Exception as exc:
        logger.exception("check_embedding_quality failed")
        return {"window_id": window_id, "coverage": 0.0, "error": str(exc)}