# motief/agent_tools/pipeline.py

"""Pipeline control primitives for agent operation.

Stage-aware tools for running, monitoring, and diagnosing the data pipeline.
"""

from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional

from agent_tools.database import query_pipeline_status

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

# Stage names accepted by the stage-aware tools in this module. This is a set,
# so it defines membership only and carries no execution order.
VALID_STAGES = {"ingestion", "votes", "svd", "text_embeddings", "fusion", "similarity"}


def pipeline_run_stage(
    db_path: str,
    stage: str,
    window_id: Optional[str] = None,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Run a single pipeline stage.

    Execution is not wired up yet: a non-dry run only logs the request and
    reports ``"not_implemented"``.

    Args:
        db_path: Path to DuckDB database (not used by this function yet)
        stage: One of VALID_STAGES
        window_id: Optional window identifier (e.g., "2024", "current_parliament")
        dry_run: If True, return planned actions without executing

    Returns:
        dict with ``stage``, ``window_id``, ``dry_run`` and ``status``.
        ``status`` is ``"planned"`` for a dry run and ``"not_implemented"``
        otherwise. When ``stage`` is not in VALID_STAGES, ``status`` is
        ``"error"`` and an ``error`` message is included as well.
    """
    if stage not in VALID_STAGES:
        # Carry the same keys as a normal result so callers can always read
        # "status" without first checking for "error".
        return {
            "stage": stage,
            "window_id": window_id,
            "dry_run": dry_run,
            "status": "error",
            "error": f"Invalid stage '{stage}'. Valid stages: {sorted(VALID_STAGES)}",
        }

    result = {
        "stage": stage,
        "window_id": window_id,
        "dry_run": dry_run,
        "status": "planned" if dry_run else "not_implemented",
    }

    if dry_run:
        return result

    # Actual execution would delegate to pipeline/run_pipeline.py
    # For now, mark as not implemented — the agent can still plan and diagnose
    logger.info("pipeline_run_stage: %s (dry_run=%s)", stage, dry_run)
    return result


def pipeline_run_full(
    db_path: str,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Invoke every pipeline stage, one after another, in a fixed order.

    Each stage is handed to ``pipeline_run_stage`` with the same ``db_path``
    and ``dry_run`` flag; no window identifier is passed.

    Args:
        db_path: Path to DuckDB database
        dry_run: If True, return planned actions without executing

    Returns:
        dict with the per-stage results under ``stages``, the ``dry_run``
        flag, and an overall ``status`` (``"planned"`` for a dry run,
        ``"partial"`` otherwise).
    """
    # Order matters here, which is why this is a sequence and not the
    # VALID_STAGES set.
    ordered_stages = (
        "ingestion",
        "votes",
        "svd",
        "text_embeddings",
        "fusion",
        "similarity",
    )
    stage_reports = [
        pipeline_run_stage(db_path, name, dry_run=dry_run)
        for name in ordered_stages
    ]

    if dry_run:
        overall = "planned"
    else:
        overall = "partial"

    return {
        "stages": stage_reports,
        "dry_run": dry_run,
        "status": overall,
    }


def pipeline_check_health(db_path: str) -> Dict[str, Any]:
    """Check pipeline health and return structured report.

    Runs the ``check_motion_freshness`` and ``check_embedding_coverage``
    checks from ``health.checks`` and then fetches the pipeline status via
    ``query_pipeline_status``. This function does not raise: every failure
    is reported in the returned dict.

    Args:
        db_path: Path passed unchanged to each check and to the status query

    Returns:
        On success: ``{"healthy", "checks", "pipeline_status"}``, where
        ``healthy`` is True only if every check reported itself healthy.
        Each entry of ``checks`` has ``name`` and ``healthy`` plus either
        ``details`` (the check's own result) or ``error`` (if it raised).

        If ``health.checks`` cannot be imported:
        ``{"healthy": False, "checks": [], "error"}``.

        If the status query fails:
        ``{"healthy": False, "checks": <results collected so far>, "error"}``.
    """
    try:
        # Imported lazily so an unavailable health module is reported as an
        # unhealthy result instead of breaking import of this module.
        from health.checks import check_motion_freshness, check_embedding_coverage
    except Exception as e:
        logger.exception("pipeline_check_health failed")
        return {
            "healthy": False,
            "checks": [],
            "error": str(e),
        }

    checks: List[Dict[str, Any]] = []
    healthy = True

    for name, check in (
        ("motion_freshness", check_motion_freshness),
        ("embedding_coverage", check_embedding_coverage),
    ):
        # Each check is isolated so one failing check cannot hide the other.
        try:
            details = check(db_path)
            passed = details.get("healthy", False)
            checks.append({"name": name, "healthy": passed, "details": details})
        except Exception as e:
            checks.append({"name": name, "healthy": False, "error": str(e)})
            passed = False
        if not passed:
            healthy = False

    try:
        status = query_pipeline_status(db_path)
    except Exception as e:
        # Keep the check results gathered above; only the status is missing.
        logger.exception("pipeline_check_health failed")
        return {
            "healthy": False,
            "checks": checks,
            "error": str(e),
        }

    return {
        "healthy": healthy,
        "checks": checks,
        "pipeline_status": status,
    }


def pipeline_get_logs(
    db_path: str,
    stage: Optional[str] = None,
    lines: int = 50,
) -> List[str]:
    """Return recent log lines for a stage (placeholder).

    No log store is consulted yet: the request is logged and an empty list
    is always returned. A full implementation would read from a structured
    log store or log files.

    Args:
        db_path: Accepted for interface symmetry; not used here
        stage: Stage whose logs are requested, or None
        lines: Number of lines requested

    Returns:
        An empty list.
    """
    # Nothing to read from yet, so the result is always empty.
    recent_lines: List[str] = []
    logger.info("pipeline_get_logs requested for stage=%s lines=%d", stage, lines)
    return recent_lines


def pipeline_validate_output(
    db_path: str,
    stage: str,
) -> Dict[str, Any]:
    """Validate that a stage's output looks reasonable.

    A stage counts as valid when the count associated with it in the result
    of ``query_pipeline_status`` is greater than zero. This function does not
    raise: failures are reported in the returned dict.

    Args:
        db_path: Path to DuckDB database
        stage: Pipeline stage to validate

    Returns:
        dict that always contains ``valid`` and ``stage``. On a completed
        validation it also contains ``pipeline_status``; for an unknown stage
        or a failed status query it contains ``error`` instead.
    """
    if stage not in VALID_STAGES:
        return {
            "valid": False,
            "stage": stage,
            "error": f"Invalid stage '{stage}'",
        }

    # Which status count is used as evidence that a stage produced output.
    # Several stages share a count, so this is a coarse check.
    count_key_by_stage = {
        "svd": "svd_window_count",
        "similarity": "embedding_count",
        "ingestion": "motion_count",
        "votes": "motion_count",
        "text_embeddings": "embedding_count",
        "fusion": "embedding_count",
    }

    try:
        status = query_pipeline_status(db_path)

        count_key = count_key_by_stage.get(stage)
        # A stage with no mapping, a missing key, or a None count all mean
        # "no output", not an error.
        count = (status.get(count_key) or 0) if count_key is not None else 0

        return {
            "valid": count > 0,
            "stage": stage,
            "pipeline_status": status,
        }
    except Exception as e:
        logger.exception("pipeline_validate_output failed")
        return {"valid": False, "stage": stage, "error": str(e)}