"""Pipeline control primitives for agent operation. Stage-aware tools for running, monitoring, and diagnosing the data pipeline. """ from __future__ import annotations import logging from typing import Any, Dict, List, Optional from agent_tools.database import query_pipeline_status logger = logging.getLogger(__name__) VALID_STAGES = {"ingestion", "votes", "svd", "text_embeddings", "fusion", "similarity"} def pipeline_run_stage( db_path: str, stage: str, window_id: Optional[str] = None, dry_run: bool = False, ) -> Dict[str, Any]: """Run a single pipeline stage. Args: db_path: Path to DuckDB database stage: One of VALID_STAGES window_id: Optional window identifier (e.g., "2024", "current_parliament") dry_run: If True, return planned actions without executing Returns: dict with status and metadata """ if stage not in VALID_STAGES: return { "error": f"Invalid stage '{stage}'. Valid stages: {sorted(VALID_STAGES)}", } result = { "stage": stage, "window_id": window_id, "dry_run": dry_run, "status": "planned" if dry_run else "not_implemented", } if dry_run: return result # Actual execution would delegate to pipeline/run_pipeline.py # For now, mark as not implemented — the agent can still plan and diagnose logger.info("pipeline_run_stage: %s (dry_run=%s)", stage, dry_run) return result def pipeline_run_full( db_path: str, dry_run: bool = False, ) -> Dict[str, Any]: """Run all pipeline stages in dependency order. Args: db_path: Path to DuckDB database dry_run: If True, return planned actions without executing Returns: dict with stage statuses """ stages = ["ingestion", "votes", "svd", "text_embeddings", "fusion", "similarity"] results = [] for stage in stages: result = pipeline_run_stage(db_path, stage, dry_run=dry_run) results.append(result) return { "stages": results, "dry_run": dry_run, "status": "planned" if dry_run else "partial", } def pipeline_check_health(db_path: str) -> Dict[str, Any]: """Check pipeline health and return structured report. Reuses the health/ module and database queries. """ try: from health.checks import check_motion_freshness, check_embedding_coverage checks = [] healthy = True try: freshness = check_motion_freshness(db_path) checks.append({ "name": "motion_freshness", "healthy": freshness.get("healthy", False), "details": freshness, }) if not freshness.get("healthy", False): healthy = False except Exception as e: checks.append({"name": "motion_freshness", "healthy": False, "error": str(e)}) healthy = False try: embedding = check_embedding_coverage(db_path) checks.append({ "name": "embedding_coverage", "healthy": embedding.get("healthy", False), "details": embedding, }) if not embedding.get("healthy", False): healthy = False except Exception as e: checks.append({"name": "embedding_coverage", "healthy": False, "error": str(e)}) healthy = False status = query_pipeline_status(db_path) return { "healthy": healthy, "checks": checks, "pipeline_status": status, } except Exception as e: logger.exception("pipeline_check_health failed") return { "healthy": False, "checks": [], "error": str(e), } def pipeline_get_logs( db_path: str, stage: Optional[str] = None, lines: int = 50, ) -> List[str]: """Return recent log lines for a stage. Note: This is a placeholder. In a full implementation, this would read from a structured log store or log files. """ # Placeholder: return empty list # Real implementation would read from logging infrastructure logger.info("pipeline_get_logs requested for stage=%s lines=%d", stage, lines) return [] def pipeline_validate_output( db_path: str, stage: str, ) -> Dict[str, Any]: """Validate that a stage's output looks reasonable. Args: db_path: Path to DuckDB database stage: Pipeline stage to validate Returns: dict with validation results """ if stage not in VALID_STAGES: return { "valid": False, "error": f"Invalid stage '{stage}'", } try: status = query_pipeline_status(db_path) validators = { "svd": lambda s: s.get("svd_window_count", 0) > 0, "similarity": lambda s: s.get("embedding_count", 0) > 0, "ingestion": lambda s: s.get("motion_count", 0) > 0, "votes": lambda s: s.get("motion_count", 0) > 0, "text_embeddings": lambda s: s.get("embedding_count", 0) > 0, "fusion": lambda s: s.get("embedding_count", 0) > 0, } is_valid = validators.get(stage, lambda s: False)(status) return { "valid": is_valid, "stage": stage, "pipeline_status": status, } except Exception as e: logger.exception("pipeline_validate_output failed") return {"valid": False, "stage": stage, "error": str(e)}