You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
192 lines
5.5 KiB
192 lines
5.5 KiB
"""Pipeline control primitives for agent operation.

Stage-aware tools for running, monitoring, and diagnosing the data pipeline.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from agent_tools.database import query_pipeline_status
|
|
|
|
# Module-level logger shared by every tool in this file.
logger = logging.getLogger(__name__)

# Stage names accepted by the stage-aware tools below. This is a set, so it
# carries no ordering; code that needs a run order must spell it out itself.
VALID_STAGES = {
    "ingestion",
    "votes",
    "svd",
    "text_embeddings",
    "fusion",
    "similarity",
}
|
|
|
|
|
|
def pipeline_run_stage(
    db_path: str,
    stage: str,
    window_id: Optional[str] = None,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Run a single pipeline stage.

    Execution is not wired up yet: a real run would delegate to
    pipeline/run_pipeline.py. Until then a non-dry run reports
    ``"not_implemented"`` so the agent can still plan and diagnose.

    Args:
        db_path: Path to DuckDB database (not used by this function yet).
        stage: One of VALID_STAGES.
        window_id: Optional window identifier (e.g., "2024", "current_parliament").
        dry_run: If True, return planned actions without executing.

    Returns:
        dict with the keys ``stage``, ``window_id``, ``dry_run`` and
        ``status``. ``status`` is ``"planned"`` for a dry run,
        ``"not_implemented"`` for a real run, and ``"error"`` when ``stage``
        is unknown; in the error case an ``error`` message is included too.
    """
    # Every response shares these fields so callers can read them without
    # first checking which branch produced the dict.
    result: Dict[str, Any] = {
        "stage": stage,
        "window_id": window_id,
        "dry_run": dry_run,
    }

    if stage not in VALID_STAGES:
        result["status"] = "error"
        result["error"] = f"Invalid stage '{stage}'. Valid stages: {sorted(VALID_STAGES)}"
        return result

    if dry_run:
        result["status"] = "planned"
        return result

    # Actual execution would delegate to pipeline/run_pipeline.py.
    # For now, mark as not implemented — the agent can still plan and diagnose.
    logger.info("pipeline_run_stage: %s (dry_run=%s)", stage, dry_run)
    result["status"] = "not_implemented"
    return result
|
|
|
|
|
|
def pipeline_run_full(
    db_path: str,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """Invoke every pipeline stage, one after another, in dependency order.

    Each stage is handed to ``pipeline_run_stage`` with the same ``db_path``
    and ``dry_run`` flag; the per-stage dicts are collected in run order.

    Args:
        db_path: Path to DuckDB database
        dry_run: If True, return planned actions without executing

    Returns:
        dict with the list of per-stage results under ``stages``, the
        ``dry_run`` flag echoed back, and an overall ``status`` that is
        ``"planned"`` for a dry run and ``"partial"`` otherwise.
    """
    ordered_stages = ("ingestion", "votes", "svd", "text_embeddings", "fusion", "similarity")

    stage_reports = [
        pipeline_run_stage(db_path, name, dry_run=dry_run)
        for name in ordered_stages
    ]

    if dry_run:
        overall = "planned"
    else:
        overall = "partial"

    return {"stages": stage_reports, "dry_run": dry_run, "status": overall}
|
|
|
|
|
|
def _run_health_check(name: str, check: Any, db_path: str) -> Dict[str, Any]:
    """Run one health check callable and normalise its outcome into a report entry."""
    try:
        details = check(db_path)
        return {
            "name": name,
            "healthy": details.get("healthy", False),
            "details": details,
        }
    except Exception as e:
        # A failing check must not abort the whole report; record it as
        # unhealthy together with the reason.
        return {"name": name, "healthy": False, "error": str(e)}


def pipeline_check_health(db_path: str) -> Dict[str, Any]:
    """Check pipeline health and return structured report.

    Reuses the health/ module and database queries.

    Args:
        db_path: Path passed unchanged to each health check and to
            ``query_pipeline_status``.

    Returns:
        On success, a dict with ``healthy`` (True only if every check
        reported a truthy ``healthy`` value), ``checks`` (one entry per
        check, in run order) and ``pipeline_status``. If the health module
        cannot be imported or the status query raises, a dict with
        ``healthy`` False, an ``error`` message, and whatever ``checks`` had
        already been collected before the failure.
    """
    checks: List[Dict[str, Any]] = []

    try:
        # Imported lazily so that a missing health module is reported through
        # the returned dict instead of failing at module import time.
        from health.checks import check_motion_freshness, check_embedding_coverage

        checks.append(_run_health_check("motion_freshness", check_motion_freshness, db_path))
        checks.append(_run_health_check("embedding_coverage", check_embedding_coverage, db_path))

        status = query_pipeline_status(db_path)
    except Exception as e:
        logger.exception("pipeline_check_health failed")
        return {
            "healthy": False,
            # Keep the results gathered so far; they are still useful for
            # diagnosis even though the overall report is incomplete.
            "checks": checks,
            "error": str(e),
        }

    return {
        "healthy": all(entry["healthy"] for entry in checks),
        "checks": checks,
        "pipeline_status": status,
    }
|
|
|
|
|
|
def pipeline_get_logs(
    db_path: str,
    stage: Optional[str] = None,
    lines: int = 50,
) -> List[str]:
    """Fetch the most recent log lines for a pipeline stage.

    Placeholder only: the request is logged and an empty list is returned.
    A complete implementation would read from a structured log store or
    from log files.

    Args:
        db_path: Accepted for interface parity with the other tools; not
            used by the placeholder.
        stage: Stage whose logs are requested, or None.
        lines: Number of log lines requested.

    Returns:
        Always an empty list for now.
    """
    logger.info("pipeline_get_logs requested for stage=%s lines=%d", stage, lines)

    # Nothing to read from yet; a real implementation would query the
    # logging infrastructure here.
    no_logs: List[str] = []
    return no_logs
|
|
|
|
|
|
# For each stage, the pipeline-status field that must hold a positive count
# for that stage's output to be considered present.
_STAGE_STATUS_KEYS: Dict[str, str] = {
    "ingestion": "motion_count",
    "votes": "motion_count",
    "svd": "svd_window_count",
    "text_embeddings": "embedding_count",
    "fusion": "embedding_count",
    "similarity": "embedding_count",
}


def pipeline_validate_output(
    db_path: str,
    stage: str,
) -> Dict[str, Any]:
    """Validate that a stage's output looks reasonable.

    The check is a coarse presence test: the status field associated with
    the stage must be greater than zero.

    Args:
        db_path: Path to DuckDB database
        stage: Pipeline stage to validate

    Returns:
        dict with ``valid`` and ``stage`` in every case, plus
        ``pipeline_status`` when the status query succeeded, or ``error``
        when the stage is unknown or the query raised.
    """
    if stage not in VALID_STAGES:
        return {
            "valid": False,
            "stage": stage,
            "error": f"Invalid stage '{stage}'",
        }

    try:
        status = query_pipeline_status(db_path)

        # A stage without a mapped status field is treated as not validated.
        count_key = _STAGE_STATUS_KEYS.get(stage)
        is_valid = count_key is not None and status.get(count_key, 0) > 0
    except Exception as e:
        logger.exception("pipeline_validate_output failed")
        return {"valid": False, "stage": stage, "error": str(e)}

    return {
        "valid": is_valid,
        "stage": stage,
        "pipeline_status": status,
    }
|
|
|