# motief/agent_tools/content.py

"""Content validation primitives for agent operation.

Tools for validating data quality, coverage, and content correctness.
"""

from __future__ import annotations

import logging
from datetime import datetime, timedelta
from typing import Any, Dict

from agent_tools.database import query_motions, query_svd_vectors

logger = logging.getLogger(__name__)


def validate_motion_coverage(
    db_path: str,
    start_date: str,
    end_date: str,
) -> Dict[str, Any]:
    """Report which parts of a date range contain no motions.

    The half-open range ``[start_date, end_date)`` is walked in consecutive
    31-day windows (the last window is clipped to ``end_date``). These are
    fixed-length windows, not calendar months. A window with no motion dated
    inside it is reported as a gap.

    Motions whose ``date`` is missing or cannot be parsed by
    ``datetime.fromisoformat`` are ignored for gap detection (and logged once
    as a count) instead of aborting the whole validation.

    Args:
        db_path: Database location, passed through to ``query_motions``.
        start_date: Inclusive range start, in ``datetime.fromisoformat`` format.
        end_date: Exclusive range end, in the same format.

    Returns:
        A dict with:
          * ``gaps``: list of ``{"start", "end"}`` dicts describing the empty
            windows (ISO strings).
          * ``coverage_rate``: share of days in the range not inside a gap,
            rounded to 4 places; ``0.0`` when the range spans no whole day.
          * ``total_motions``: number of motions returned by
            ``query_motions(db_path, limit=10000)``; this count is not
            restricted to the requested date range.
          * ``date_range``: the ``start_date`` / ``end_date`` as passed in.
        On any exception the failure is logged and the result is
        ``{"gaps": [], "coverage_rate": 0.0, "error": <message>}``.
    """
    try:
        motions = query_motions(db_path, limit=10000)

        if not motions:
            # Nothing stored at all: the entire requested range is one gap.
            return {
                "gaps": [{"start": start_date, "end": end_date}],
                "coverage_rate": 0.0,
                "total_motions": 0,
                "date_range": {"start": start_date, "end": end_date},
            }

        start = datetime.fromisoformat(start_date)
        end = datetime.fromisoformat(end_date)

        # Parse every motion date exactly once, up front. dict.get() only
        # applies a default when the key is absent, so a stored None would
        # otherwise turn into the string "None" and raise ValueError.
        motion_dates = []
        skipped = 0
        for motion in motions:
            raw_date = motion.get("date")
            if raw_date is None:
                skipped += 1
                continue
            try:
                motion_dates.append(datetime.fromisoformat(str(raw_date)))
            except ValueError:
                skipped += 1
        if skipped:
            logger.warning(
                "validate_motion_coverage: ignored %d motion(s) without a parseable date",
                skipped,
            )

        # Walk the range in fixed 31-day windows, keeping the datetime bounds
        # of each empty window so gap lengths need no re-parsing later.
        window = timedelta(days=31)
        gap_windows = []
        current = start
        while current < end:
            window_end = min(current + window, end)
            if not any(current <= d < window_end for d in motion_dates):
                gap_windows.append((current, window_end))
            current = window_end

        total_days = (end - start).days
        gap_days = sum((gap_end - gap_start).days for gap_start, gap_end in gap_windows)
        if total_days > 0:
            coverage_rate = round((total_days - gap_days) / total_days, 4)
        else:
            # Empty, inverted or sub-day range: avoid dividing by zero.
            coverage_rate = 0.0

        return {
            "gaps": [
                {"start": gap_start.isoformat(), "end": gap_end.isoformat()}
                for gap_start, gap_end in gap_windows
            ],
            "coverage_rate": coverage_rate,
            "total_motions": len(motions),
            "date_range": {"start": start_date, "end": end_date},
        }
    except Exception as e:
        # Tool boundary: report the failure to the agent instead of raising.
        logger.exception("validate_motion_coverage failed")
        return {"gaps": [], "coverage_rate": 0.0, "error": str(e)}


def validate_layman_explanations(
    db_path: str,
    sample_size: int = 100,
) -> Dict[str, Any]:
    """Measure how many sampled motions carry a layman explanation.

    Fetches motions via ``query_motions(db_path, limit=sample_size)`` and
    counts those whose ``layman_explanation`` is truthy and not just
    whitespace.

    Returns:
        A dict with ``sample_size`` (motions inspected), ``coverage``
        (explained share, rounded to 4 places), ``empty_count`` (motions
        without an explanation) and ``total_in_db``. Note that
        ``total_in_db`` is simply the number of rows returned by the query,
        i.e. the same value as ``sample_size``.
        When no motions come back, only the first three keys are returned,
        all zero. On any exception the failure is logged and the result is
        ``{"sample_size": 0, "coverage": 0.0, "error": <message>}``.
    """
    try:
        sampled = query_motions(db_path, limit=sample_size)

        if not sampled:
            return {"sample_size": 0, "coverage": 0.0, "empty_count": 0}

        # Count motions whose explanation has real (non-whitespace) content.
        explained = 0
        for motion in sampled:
            text = motion.get("layman_explanation")
            if text and str(text).strip():
                explained += 1

        inspected = len(sampled)
        return {
            "sample_size": inspected,
            "coverage": round(explained / inspected, 4),
            "empty_count": inspected - explained,
            "total_in_db": inspected,
        }
    except Exception as exc:
        # Tool boundary: report the failure to the agent instead of raising.
        logger.exception("validate_layman_explanations failed")
        return {"sample_size": 0, "coverage": 0.0, "error": str(exc)}


def check_embedding_quality(
    db_path: str,
    window_id: str,
) -> Dict[str, Any]:
    """Compare the number of motion embeddings in a window to the motion count.

    Only raw numbers are reported; judging whether the coverage is good
    enough is left to the calling agent.

    Returns:
        A dict with ``window_id``, ``total_motions`` (rows returned by
        ``query_motions(db_path, limit=100000)``), ``with_embeddings`` (rows
        returned by ``query_svd_vectors`` for ``entity_type="motion"``) and
        ``coverage`` (their ratio rounded to 4 places, ``0.0`` when there are
        no motions). On any exception the failure is logged and the result is
        ``{"window_id": window_id, "coverage": 0.0, "error": <message>}``.
    """
    try:
        embedded_rows = query_svd_vectors(db_path, window_id, entity_type="motion")
        motion_rows = query_motions(db_path, limit=100000)

        motion_count = len(motion_rows)
        embedded_count = len(embedded_rows)

        if motion_count > 0:
            ratio = round(embedded_count / motion_count, 4)
        else:
            # No motions at all: report zero rather than dividing by zero.
            ratio = 0.0

        stats: Dict[str, Any] = {
            "window_id": window_id,
            "total_motions": motion_count,
            "with_embeddings": embedded_count,
            "coverage": ratio,
        }
        return stats
    except Exception as exc:
        # Tool boundary: report the failure to the agent instead of raising.
        logger.exception("check_embedding_quality failed")
        return {"window_id": window_id, "coverage": 0.0, "error": str(exc)}