You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
motief/agent_tools/pipeline.py

192 lines
5.5 KiB

"""Pipeline control primitives for agent operation.
Stage-aware tools for running, monitoring, and diagnosing the data pipeline.
"""
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
from agent_tools.database import query_pipeline_status
# Module-level logger used by the pipeline tools defined below.
logger = logging.getLogger(__name__)
# Stage names accepted by pipeline_run_stage and pipeline_validate_output.
# This is a set, so it carries no ordering; the order in which stages are run
# is spelled out separately inside pipeline_run_full.
VALID_STAGES = {"ingestion", "votes", "svd", "text_embeddings", "fusion", "similarity"}
def pipeline_run_stage(
    db_path: str,
    stage: str,
    window_id: Optional[str] = None,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Run (or plan) a single pipeline stage.

    Execution is not wired up yet: a non-dry-run call only logs the request
    and reports ``"not_implemented"``.

    Args:
        db_path: Path to DuckDB database. Not used by this function yet.
        stage: One of VALID_STAGES.
        window_id: Optional window identifier (e.g., "2024",
            "current_parliament"). Echoed back in the result.
        dry_run: If True, return the planned action without executing.

    Returns:
        For an unknown stage, a dict with a single ``"error"`` key.
        Otherwise a dict with ``"stage"``, ``"window_id"``, ``"dry_run"`` and
        ``"status"``; status is ``"planned"`` for a dry run and
        ``"not_implemented"`` otherwise.
    """
    if stage not in VALID_STAGES:
        return {
            "error": f"Invalid stage '{stage}'. Valid stages: {sorted(VALID_STAGES)}",
        }

    # Log before branching on dry_run. Logging after the dry-run early return
    # meant the flag in the message was always False and planned runs left no
    # trace in the log at all.
    logger.info("pipeline_run_stage: %s (dry_run=%s)", stage, dry_run)

    # Actual execution would delegate to pipeline/run_pipeline.py.
    # For now, mark as not implemented — the agent can still plan and diagnose.
    result: Dict[str, Any] = {
        "stage": stage,
        "window_id": window_id,
        "dry_run": dry_run,
        "status": "planned" if dry_run else "not_implemented",
    }
    return result
def pipeline_run_full(
    db_path: str,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Invoke pipeline_run_stage once per stage, in dependency order.

    Args:
        db_path: Path to DuckDB database, forwarded to every stage call.
        dry_run: If True, each stage only reports its planned action.

    Returns:
        dict with ``"stages"`` (the per-stage results, in execution order),
        ``"dry_run"`` and an overall ``"status"``. The overall status is
        ``"planned"`` for a dry run and ``"partial"`` otherwise; it is not
        derived from the individual stage results.
    """
    # Explicit ordering: later stages depend on the output of earlier ones.
    ordered_stages = (
        "ingestion",
        "votes",
        "svd",
        "text_embeddings",
        "fusion",
        "similarity",
    )
    stage_reports = [
        pipeline_run_stage(db_path, name, dry_run=dry_run)
        for name in ordered_stages
    ]
    overall = "planned" if dry_run else "partial"
    return {"stages": stage_reports, "dry_run": dry_run, "status": overall}
def _run_health_check(name: str, check: Any, db_path: str) -> Dict[str, Any]:
    """Run one health check callable and wrap its outcome as a report entry."""
    try:
        details = check(db_path)
        return {
            "name": name,
            "healthy": details.get("healthy", False),
            "details": details,
        }
    except Exception as e:
        # A failing check is reported as unhealthy rather than aborting the
        # whole health report.
        return {"name": name, "healthy": False, "error": str(e)}


def pipeline_check_health(db_path: str) -> Dict[str, Any]:
    """Check pipeline health and return a structured report.

    Runs ``check_motion_freshness`` and ``check_embedding_coverage`` from the
    health/ module, then attaches the result of ``query_pipeline_status``.

    Args:
        db_path: Path to DuckDB database, passed to each check and to the
            status query.

    Returns:
        On success, a dict with ``"healthy"`` (False if any check reported
        unhealthy or raised), ``"checks"`` (one entry per check) and
        ``"pipeline_status"``.
        If the health module cannot be imported or the status query raises,
        a dict with ``"healthy": False``, ``"error"`` and whatever entries
        had been collected in ``"checks"`` up to that point (empty when the
        import itself failed).
    """
    # Created outside the try so that results gathered before a late failure
    # (e.g. in the status query) are still returned instead of discarded.
    checks: List[Dict[str, Any]] = []
    try:
        # Imported lazily so an unavailable health module is reported as an
        # unhealthy result instead of breaking import of this module.
        from health.checks import check_motion_freshness, check_embedding_coverage

        healthy = True
        for name, check in (
            ("motion_freshness", check_motion_freshness),
            ("embedding_coverage", check_embedding_coverage),
        ):
            entry = _run_health_check(name, check, db_path)
            checks.append(entry)
            if not entry["healthy"]:
                healthy = False

        status = query_pipeline_status(db_path)
        return {
            "healthy": healthy,
            "checks": checks,
            "pipeline_status": status,
        }
    except Exception as e:
        logger.exception("pipeline_check_health failed")
        return {
            "healthy": False,
            "checks": checks,
            "error": str(e),
        }
def pipeline_get_logs(
    db_path: str,
    stage: Optional[str] = None,
    lines: int = 50,
) -> List[str]:
    """Return recent log lines for a stage (placeholder).

    No log store is wired up yet: the request is logged and an empty list is
    returned. A full implementation would read from a structured log store
    or log files.

    Args:
        db_path: Path to DuckDB database. Not used by the placeholder.
        stage: Stage whose logs are requested, or None; only echoed in the
            log message.
        lines: Number of lines requested; only echoed in the log message.

    Returns:
        Always an empty list.
    """
    logger.info("pipeline_get_logs requested for stage=%s lines=%d", stage, lines)
    # Nothing to read from yet, so there are never any lines to hand back.
    no_lines: List[str] = []
    return no_lines
def pipeline_validate_output(
    db_path: str,
    stage: str,
) -> Dict[str, Any]:
    """Check whether a stage appears to have produced output.

    Each stage is judged by a single counter in the status returned by
    ``query_pipeline_status``: the stage is valid when that counter is
    greater than zero.

    Args:
        db_path: Path to DuckDB database, passed to the status query.
        stage: Pipeline stage to validate; must be in VALID_STAGES.

    Returns:
        For an unknown stage, ``{"valid": False, "error": ...}``.
        If the status query or the comparison raises,
        ``{"valid": False, "stage": ..., "error": ...}``.
        Otherwise ``{"valid": <bool>, "stage": ..., "pipeline_status": ...}``.
    """
    if stage not in VALID_STAGES:
        return {"valid": False, "error": f"Invalid stage '{stage}'"}

    # Which status counter must be positive for each stage. Several stages
    # share a counter, so this is a coarse sanity check, not a per-stage one.
    counter_for_stage = {
        "svd": "svd_window_count",
        "similarity": "embedding_count",
        "ingestion": "motion_count",
        "votes": "motion_count",
        "text_embeddings": "embedding_count",
        "fusion": "embedding_count",
    }

    try:
        report = query_pipeline_status(db_path)
        counter = counter_for_stage.get(stage)
        if counter is None:
            # A stage accepted by VALID_STAGES but missing from the table
            # is treated as not validated.
            passed = False
        else:
            passed = report.get(counter, 0) > 0
    except Exception as exc:
        logger.exception("pipeline_validate_output failed")
        return {"valid": False, "stage": stage, "error": str(exc)}

    return {"valid": passed, "stage": stage, "pipeline_status": report}