You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
4.0 KiB
133 lines
4.0 KiB
"""Content validation primitives for agent operation.
|
|
|
|
Tools for validating data quality, coverage, and content correctness.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Dict
|
|
|
|
from agent_tools.database import query_motions, query_svd_vectors
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def validate_motion_coverage(
    db_path: str,
    start_date: str,
    end_date: str,
) -> Dict[str, Any]:
    """Report the 31-day windows of a date range that contain no motions.

    The half-open range ``[start_date, end_date)`` is walked in consecutive
    31-day windows (the last window is cut off at ``end_date``). A window is
    reported as a gap when no fetched motion is dated inside it.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        start_date: Inclusive start of the range, in a format accepted by
            ``datetime.fromisoformat``.
        end_date: Exclusive end of the range, same format as ``start_date``.

    Returns:
        A dict with these keys:

        * ``gaps``: list of ``{"start": ..., "end": ...}`` dicts. Window
          bounds are ISO strings; when no motions were fetched the single
          gap carries ``start_date`` and ``end_date`` unchanged.
        * ``coverage_rate``: share of whole days in the range that are not
          inside a gap, rounded to 4 places; ``0.0`` when the range spans
          less than one whole day or no motions were fetched.
        * ``total_motions``: number of motions fetched (the query is issued
          with ``limit=10000``).
        * ``skipped_motions``: fetched motions ignored because their
          ``date`` was missing, ``None`` or not ISO-parseable.
        * ``date_range``: the ``start_date`` / ``end_date`` arguments.

        If anything raises, the exception is logged and
        ``{"gaps": [], "coverage_rate": 0.0, "error": str(e)}`` is returned.
    """
    try:
        motions = query_motions(db_path, limit=10000)

        if not motions:
            return {
                "gaps": [{"start": start_date, "end": end_date}],
                "coverage_rate": 0.0,
                "total_motions": 0,
                "skipped_motions": 0,
                "date_range": {"start": start_date, "end": end_date},
            }

        start = datetime.fromisoformat(start_date)
        end = datetime.fromisoformat(end_date)
        window = timedelta(days=31)

        # Parse every motion date exactly once and record which window it
        # lands in, instead of re-parsing all dates for every window.
        covered_windows = set()
        skipped = 0
        for motion in motions:
            raw_date = motion.get("date")
            if raw_date is None:
                # An undated motion cannot prove coverage of any window.
                skipped += 1
                continue
            try:
                motion_date = datetime.fromisoformat(str(raw_date))
            except ValueError:
                # One malformed row must not abort the whole validation.
                skipped += 1
                continue
            if start <= motion_date < end:
                # Window k spans [start + k*window, start + (k+1)*window).
                covered_windows.add((motion_date - start) // window)

        if skipped:
            logger.warning(
                "validate_motion_coverage skipped %d motions with missing "
                "or unparseable dates",
                skipped,
            )

        gaps = []
        gap_days = 0
        window_index = 0
        current = start
        while current < end:
            window_end = min(current + window, end)
            if window_index not in covered_windows:
                gaps.append({
                    "start": current.isoformat(),
                    "end": window_end.isoformat(),
                })
                # Whole days only, matching how total_days is measured.
                gap_days += (window_end - current).days
            current = window_end
            window_index += 1

        total_days = (end - start).days
        if total_days > 0:
            coverage_rate = round((total_days - gap_days) / total_days, 4)
        else:
            coverage_rate = 0.0

        return {
            "gaps": gaps,
            "coverage_rate": coverage_rate,
            "total_motions": len(motions),
            "skipped_motions": skipped,
            "date_range": {"start": start_date, "end": end_date},
        }
    except Exception as e:
        logger.exception("validate_motion_coverage failed")
        return {"gaps": [], "coverage_rate": 0.0, "error": str(e)}
|
|
|
|
|
|
def validate_layman_explanations(
    db_path: str,
    sample_size: int = 100,
) -> Dict[str, Any]:
    """Measure how many sampled motions carry a layman explanation.

    Up to ``sample_size`` motions are fetched through ``query_motions``. A
    motion counts as explained when its ``layman_explanation`` value is
    truthy and is not blank once converted to a string and stripped.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        sample_size: Forwarded to ``query_motions`` as ``limit``.

    Returns:
        A dict with ``sample_size`` (motions fetched), ``coverage``
        (explained share, rounded to 4 places), ``empty_count`` (motions
        without an explanation) and ``total_in_db``. When nothing is
        fetched only the first three keys are present, all zero. If
        anything raises, the exception is logged and ``sample_size`` 0,
        ``coverage`` 0.0 and an ``error`` message are returned.
    """
    try:
        sampled = query_motions(db_path, limit=sample_size)

        if not sampled:
            return {
                "sample_size": 0,
                "coverage": 0.0,
                "empty_count": 0,
            }

        fetched = len(sampled)
        explained = 0
        for motion in sampled:
            text = motion.get("layman_explanation")
            # Whitespace-only explanations count as missing.
            if text and str(text).strip():
                explained += 1

        report: Dict[str, Any] = {}
        report["sample_size"] = fetched
        report["coverage"] = round(explained / fetched, 4)
        report["empty_count"] = fetched - explained
        # NOTE(review): this is the number of rows fetched for the sample,
        # not a database-wide count.
        report["total_in_db"] = fetched
        return report
    except Exception as e:
        logger.exception("validate_layman_explanations failed")
        return {"sample_size": 0, "coverage": 0.0, "error": str(e)}
|
|
|
|
|
|
def check_embedding_quality(
    db_path: str,
    window_id: str,
) -> Dict[str, Any]:
    """Compare the number of motion vectors in a window with the motion count.

    Only raw numbers are reported; judging whether the coverage is good
    enough is left to the caller.

    Args:
        db_path: Database location, passed through to the query helpers.
        window_id: Window whose vectors are fetched via
            ``query_svd_vectors`` with ``entity_type="motion"``.

    Returns:
        A dict with ``window_id``, ``total_motions`` (motions fetched with
        ``limit=100000``), ``with_embeddings`` (vectors fetched for the
        window) and ``coverage`` (their ratio rounded to 4 places, or
        ``0.0`` when no motions were fetched). If anything raises, the
        exception is logged and ``window_id``, ``coverage`` 0.0 and an
        ``error`` message are returned.
    """
    try:
        svd_rows = query_svd_vectors(db_path, window_id, entity_type="motion")
        motion_rows = query_motions(db_path, limit=100000)

        motion_count = len(motion_rows)
        embedded_count = len(svd_rows)

        # Guard the division: an empty motion list yields zero coverage.
        if motion_count > 0:
            ratio = round(embedded_count / motion_count, 4)
        else:
            ratio = 0.0

        stats: Dict[str, Any] = {
            "window_id": window_id,
            "total_motions": motion_count,
            "with_embeddings": embedded_count,
            "coverage": ratio,
        }
        return stats
    except Exception as e:
        logger.exception("check_embedding_quality failed")
        return {"window_id": window_id, "coverage": 0.0, "error": str(e)}
|
|
|