You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
motief/agent_tools/content.py

133 lines
4.0 KiB

"""Content validation primitives for agent operation.
Tools for validating data quality, coverage, and content correctness.
"""
from __future__ import annotations
import logging
from datetime import datetime, timedelta
from typing import Any, Dict
from agent_tools.database import query_motions, query_svd_vectors
logger = logging.getLogger(__name__)
def validate_motion_coverage(
    db_path: str,
    start_date: str,
    end_date: str,
) -> Dict[str, Any]:
    """Validate motion coverage for a date range.

    The half-open range ``[start_date, end_date)`` is walked in consecutive
    windows of at most 31 days (fixed-length windows, not calendar months).
    Every window that contains no motion date is reported as a gap.

    Args:
        db_path: Database path, passed through to ``query_motions``.
        start_date: Start of the range (inclusive), in a format accepted by
            ``datetime.fromisoformat``.
        end_date: End of the range (exclusive), same format as ``start_date``.

    Returns:
        On success, a dict with:
            ``gaps``: list of ``{"start", "end"}`` ISO strings, one per empty
                window.
            ``coverage_rate``: share of whole days in the range not covered by
                a gap, rounded to 4 decimals (``0.0`` for an empty range).
            ``total_motions``: number of motions fetched (up to 10000),
                regardless of whether they fall inside the range.
            ``date_range``: the ``start_date`` / ``end_date`` arguments.
        When no motions are returned at all, the whole range is reported as a
        single gap and ``date_range`` is omitted.
        On any exception, ``{"gaps": [], "coverage_rate": 0.0, "error": ...}``.
    """
    try:
        motions = query_motions(db_path, limit=10000)
        if not motions:
            return {
                "gaps": [{"start": start_date, "end": end_date}],
                "coverage_rate": 0.0,
                "total_motions": 0,
            }

        start = datetime.fromisoformat(start_date)
        end = datetime.fromisoformat(end_date)

        # Parse every motion date exactly once, up front. Re-parsing inside
        # the window loop costs (windows x motions) fromisoformat calls.
        # A motion without a "date" key falls back to the epoch placeholder;
        # a value that cannot be parsed (e.g. None) is skipped so one bad row
        # does not invalidate the whole report.
        motion_dates = []
        unparseable = 0
        for motion in motions:
            try:
                motion_dates.append(
                    datetime.fromisoformat(str(motion.get("date", "1970-01-01")))
                )
            except ValueError:
                unparseable += 1
        if unparseable:
            logger.warning(
                "validate_motion_coverage: skipped %d motion(s) with unparseable dates",
                unparseable,
            )

        gaps = []
        gap_days = 0
        current = start
        while current < end:
            # The last window is clipped so it never runs past the range end.
            window_end = min(current + timedelta(days=31), end)
            if not any(current <= d < window_end for d in motion_dates):
                gaps.append({
                    "start": current.isoformat(),
                    "end": window_end.isoformat(),
                })
                # Whole days only, same truncation as the total below.
                gap_days += (window_end - current).days
            current = window_end

        total_days = (end - start).days
        coverage_rate = (
            round((total_days - gap_days) / total_days, 4) if total_days > 0 else 0.0
        )
        return {
            "gaps": gaps,
            "coverage_rate": coverage_rate,
            "total_motions": len(motions),
            "date_range": {"start": start_date, "end": end_date},
        }
    except Exception as e:
        # Top-level boundary for an agent tool: report the failure in the
        # result instead of raising, and keep the traceback in the log.
        logger.exception("validate_motion_coverage failed")
        return {"gaps": [], "coverage_rate": 0.0, "error": str(e)}
def validate_layman_explanations(
    db_path: str,
    sample_size: int = 100,
) -> Dict[str, Any]:
    """Measure how many sampled motions carry a layman explanation.

    Fetches up to ``sample_size`` motions via ``query_motions`` and counts
    those whose ``layman_explanation`` value is truthy and not
    whitespace-only.

    Returns:
        A dict with ``sample_size`` (motions actually fetched), ``coverage``
        (explained / fetched, rounded to 4 decimals), ``empty_count`` and
        ``total_in_db``. Note that ``total_in_db`` is set to the number of
        fetched motions, i.e. the same value as ``sample_size``.
        When nothing is fetched, only ``sample_size``, ``coverage`` and
        ``empty_count`` are returned, all zero.
        On any exception, ``{"sample_size": 0, "coverage": 0.0, "error": ...}``.
    """
    try:
        sampled = query_motions(db_path, limit=sample_size)
        if not sampled:
            return {
                "sample_size": 0,
                "coverage": 0.0,
                "empty_count": 0,
            }

        fetched = len(sampled)
        explained = 0
        for motion in sampled:
            text = motion.get("layman_explanation")
            # Truthy but blank strings ("   ") do not count as an explanation.
            if text and str(text).strip():
                explained += 1

        return {
            "sample_size": fetched,
            "coverage": round(explained / fetched, 4),
            "empty_count": fetched - explained,
            "total_in_db": fetched,
        }
    except Exception as exc:
        logger.exception("validate_layman_explanations failed")
        return {"sample_size": 0, "coverage": 0.0, "error": str(exc)}
def check_embedding_quality(
    db_path: str,
    window_id: str,
) -> Dict[str, Any]:
    """Report embedding coverage for one window.

    Compares the number of motion vectors returned by ``query_svd_vectors``
    for ``window_id`` with the number of motions returned by
    ``query_motions`` (fetched with a limit of 100000). Only raw numbers are
    returned; judging whether the coverage is acceptable is left to the agent.

    Returns:
        A dict with ``window_id``, ``total_motions``, ``with_embeddings`` and
        ``coverage`` (vectors / motions rounded to 4 decimals, ``0.0`` when
        there are no motions).
        On any exception, ``{"window_id": ..., "coverage": 0.0, "error": ...}``.
    """
    try:
        embedded_count = len(query_svd_vectors(db_path, window_id, entity_type="motion"))
        motion_count = len(query_motions(db_path, limit=100000))

        if motion_count > 0:
            ratio = round(embedded_count / motion_count, 4)
        else:
            # No motions at all: avoid dividing by zero.
            ratio = 0.0

        return {
            "window_id": window_id,
            "total_motions": motion_count,
            "with_embeddings": embedded_count,
            "coverage": ratio,
        }
    except Exception as exc:
        logger.exception("check_embedding_quality failed")
        return {"window_id": window_id, "coverage": 0.0, "error": str(exc)}