- Create health/ package with HealthStatus, HealthCheck, HealthReport
- Add check_motion_freshness, check_embedding_coverage, check_llm_coverage
- Add scripts/health_check.py CLI with text/JSON output and exit codes
- Add comprehensive tests for core, checks, and CLI

P4-005: Pipeline health checks
main
parent
04cc62ea06
commit
e352d7c7bc
@ -0,0 +1,42 @@ |
|||||||
|
from dataclasses import dataclass |
||||||
|
from enum import Enum |
||||||
|
from typing import List |
||||||
|
|
||||||
|
|
||||||
|
class HealthStatus(Enum):
    """Severity level attached to a health check or to a whole report.

    The string values are what gets compared and serialized; the members are
    listed from least to most severe.
    """

    OK = "ok"
    WARNING = "warning"
    CRITICAL = "critical"
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class HealthCheck:
    """Outcome of a single health check."""

    # Identifier of the check (the checks in this package use names such as
    # "motion_freshness").
    name: str
    # Severity the check concluded with.
    status: HealthStatus
    # Human-readable summary of the outcome.
    message: str
    # Supporting numbers behind the message; may be empty.
    details: dict
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class HealthReport:
    """Collection of check results with an aggregated status and exit code."""

    # Individual results, in the order they were supplied.
    checks: List[HealthCheck]

    @property
    def status(self) -> HealthStatus:
        """Return the most severe status among the checks.

        CRITICAL wins over WARNING, which wins over OK. A report without any
        checks is OK.
        """
        worst = HealthStatus.OK
        for check in self.checks:
            if check.status == HealthStatus.CRITICAL:
                # Nothing can outrank CRITICAL, so stop scanning.
                return HealthStatus.CRITICAL
            if check.status == HealthStatus.WARNING:
                worst = HealthStatus.WARNING
        return worst

    @property
    def exit_code(self) -> int:
        """Return a process exit code: 2 for CRITICAL, 1 for WARNING, else 0."""
        # Evaluate the aggregate once; each access to ``status`` rescans checks.
        overall = self.status
        if overall == HealthStatus.CRITICAL:
            return 2
        if overall == HealthStatus.WARNING:
            return 1
        return 0
||||||
|
|
||||||
|
|
||||||
|
def run_checks(checks: List[HealthCheck]) -> HealthReport:
    """Bundle already-evaluated check results into a HealthReport.

    The list is stored as given; no check is executed here.
    """
    return HealthReport(checks=checks)
||||||
@ -0,0 +1,140 @@ |
|||||||
|
from datetime import datetime, timedelta |
||||||
|
from typing import Any, Dict, Optional |
||||||
|
|
||||||
|
from health import HealthCheck, HealthStatus |
||||||
|
|
||||||
|
|
||||||
|
def check_motion_freshness(
    conn: Any,
    max_age_days: int = 7,
    min_motions: int = 100,
) -> HealthCheck:
    """Check that enough motions were ingested recently.

    Counts rows of ``motions`` whose ``date`` is within the last
    ``max_age_days`` days.

    Args:
        conn: Connection object exposing ``execute(sql, params).fetchone()``.
        max_age_days: Size of the look-back window in days.
        min_motions: Minimum number of motions expected in the window.

    Returns:
        CRITICAL if the query fails or no motion is in the window, WARNING if
        fewer than ``min_motions`` are found, otherwise OK.
    """
    check_name = "motion_freshness"

    try:
        # NOTE(review): the cutoff is a naive local-time datetime; confirm this
        # matches how the ``date`` column is stored.
        cutoff = datetime.now() - timedelta(days=max_age_days)
        row = conn.execute(
            "SELECT COUNT(*) FROM motions WHERE date >= ?",
            [cutoff],
        ).fetchone()
        count = row[0] if row else 0
    except Exception as e:
        # Any failure to query is reported as a CRITICAL result, not raised.
        return HealthCheck(
            name=check_name,
            status=HealthStatus.CRITICAL,
            message=f"Could not query motion freshness: {e}",
            details={},
        )

    if count == 0:
        return HealthCheck(
            name=check_name,
            status=HealthStatus.CRITICAL,
            message=f"No motions in last {max_age_days} days",
            details={"count": 0, "threshold": max_age_days},
        )

    if count < min_motions:
        return HealthCheck(
            name=check_name,
            status=HealthStatus.WARNING,
            message=f"Only {count} motions in last {max_age_days} days (expected >= {min_motions})",
            details={"count": count, "threshold": max_age_days, "min_expected": min_motions},
        )

    return HealthCheck(
        name=check_name,
        status=HealthStatus.OK,
        message=f"{count} motions in last {max_age_days} days",
        details={"count": count, "threshold": max_age_days},
    )
||||||
|
|
||||||
|
|
||||||
|
def check_embedding_coverage(
    conn: Any,
    min_coverage: float = 0.95,
) -> HealthCheck:
    """Check what share of motions have a fused embedding.

    Coverage is the number of distinct ``motion_id`` values in
    ``fused_embeddings`` divided by the row count of ``motions``.

    Args:
        conn: Connection object exposing ``execute(sql).fetchone()``.
        min_coverage: Lowest acceptable coverage as a fraction (0.95 = 95%).

    Returns:
        CRITICAL if a query fails or ``motions`` is empty, WARNING if coverage
        is below ``min_coverage``, otherwise OK.
    """
    check_name = "embedding_coverage"

    try:
        total_row = conn.execute("SELECT COUNT(*) FROM motions").fetchone()
        total = total_row[0] if total_row else 0

        if total == 0:
            # Bail out before dividing by zero; an empty table is itself a failure.
            return HealthCheck(
                name=check_name,
                status=HealthStatus.CRITICAL,
                message="No motions in database",
                details={"total": 0, "with_embeddings": 0, "coverage": 0.0},
            )

        # NOTE(review): this counts every motion_id in fused_embeddings, so ids
        # that no longer exist in motions would push coverage above 100% --
        # confirm whether orphans are possible.
        embed_row = conn.execute(
            "SELECT COUNT(DISTINCT motion_id) FROM fused_embeddings"
        ).fetchone()
        with_embeddings = embed_row[0] if embed_row else 0
        coverage = with_embeddings / total
    except Exception as e:
        # Any failure to query is reported as a CRITICAL result, not raised.
        return HealthCheck(
            name=check_name,
            status=HealthStatus.CRITICAL,
            message=f"Could not query embedding coverage: {e}",
            details={},
        )

    details = {"total": total, "with_embeddings": with_embeddings, "coverage": coverage}

    if coverage < min_coverage:
        return HealthCheck(
            name=check_name,
            status=HealthStatus.WARNING,
            message=f"Embedding coverage {coverage:.1%} (expected >= {min_coverage:.0%})",
            details=details,
        )

    return HealthCheck(
        name=check_name,
        status=HealthStatus.OK,
        message=f"Embedding coverage {coverage:.1%}",
        details=details,
    )
||||||
|
|
||||||
|
|
||||||
|
def check_llm_coverage(
    conn: Any,
    max_missing_ratio: float = 0.15,
    warn_missing_ratio: float = 0.05,
) -> HealthCheck:
    """Check how many motions lack a layman explanation.

    A motion counts as missing when ``layman_explanation`` is NULL or an empty
    string.

    Args:
        conn: Connection object exposing ``execute(sql).fetchone()``.
        max_missing_ratio: Missing fraction above which the check is CRITICAL.
        warn_missing_ratio: Missing fraction above which the check is WARNING.
            Defaults to 0.05, the threshold that used to be hard-coded.

    Returns:
        CRITICAL if a query fails, ``motions`` is empty, or the missing ratio
        exceeds ``max_missing_ratio``; WARNING if it exceeds
        ``warn_missing_ratio``; otherwise OK.
    """
    check_name = "llm_coverage"

    try:
        total_row = conn.execute("SELECT COUNT(*) FROM motions").fetchone()
        total = total_row[0] if total_row else 0

        if total == 0:
            # Bail out before dividing by zero; an empty table is itself a failure.
            return HealthCheck(
                name=check_name,
                status=HealthStatus.CRITICAL,
                message="No motions in database",
                details={"total": 0, "missing": 0, "missing_ratio": 0.0},
            )

        missing_row = conn.execute(
            "SELECT COUNT(*) FROM motions WHERE layman_explanation IS NULL OR layman_explanation = ''"
        ).fetchone()
        missing = missing_row[0] if missing_row else 0
        missing_ratio = missing / total
    except Exception as e:
        # Any failure to query is reported as a CRITICAL result, not raised.
        return HealthCheck(
            name=check_name,
            status=HealthStatus.CRITICAL,
            message=f"Could not query LLM coverage: {e}",
            details={},
        )

    # The CRITICAL test runs first, so it wins if the two thresholds overlap.
    if missing_ratio > max_missing_ratio:
        status = HealthStatus.CRITICAL
    elif missing_ratio > warn_missing_ratio:
        status = HealthStatus.WARNING
    else:
        status = HealthStatus.OK

    # Message and details are the same for every severity; only status differs.
    return HealthCheck(
        name=check_name,
        status=status,
        message=f"{missing_ratio:.1%} missing layman explanations ({missing}/{total})",
        details={"total": total, "missing": missing, "missing_ratio": missing_ratio},
    )
||||||
@ -0,0 +1,69 @@ |
|||||||
|
import sys |
||||||
|
from unittest.mock import MagicMock, patch |
||||||
|
|
||||||
|
import pytest |
||||||
|
|
||||||
|
from scripts.health_check import main |
||||||
|
|
||||||
|
|
||||||
|
class TestHealthCheckCLI:
    """Tests for the ``scripts.health_check`` command-line entry point."""

    @staticmethod
    def _connection_returning(rows):
        """Build a mock connection whose successive ``fetchone()`` calls yield *rows*."""
        mock_conn = MagicMock()
        mock_conn.execute.return_value.fetchone.side_effect = rows
        return mock_conn

    @patch("scripts.health_check.duckdb.connect")
    @patch("scripts.health_check.config")
    def test_all_ok_exits_0(self, mock_config, mock_connect):
        """All checks healthy -> exit code 0."""
        mock_config.DATABASE_PATH = "/fake/db"
        mock_connect.return_value = self._connection_returning(
            [
                [150],  # motion count
                [100],  # total motions
                [100],  # embeddings count
                [100],  # total motions
                [0],  # missing explanations
            ]
        )

        with patch.object(sys, "argv", ["health_check"]):
            exit_code = main()

        assert exit_code == 0

    @patch("scripts.health_check.duckdb.connect")
    @patch("scripts.health_check.config")
    def test_critical_exits_2(self, mock_config, mock_connect):
        """One CRITICAL check (no recent motions) -> exit code 2."""
        mock_config.DATABASE_PATH = "/fake/db"
        mock_connect.return_value = self._connection_returning(
            [
                [0],  # no recent motions
                [100],  # total motions
                [100],  # embeddings count
                [100],  # total motions
                [0],  # missing explanations
            ]
        )

        with patch.object(sys, "argv", ["health_check"]):
            exit_code = main()

        assert exit_code == 2

    @patch("scripts.health_check.duckdb.connect")
    @patch("scripts.health_check.config")
    def test_json_format(self, mock_config, mock_connect, capsys):
        """``--format json`` prints the overall status and exit code."""
        mock_config.DATABASE_PATH = "/fake/db"
        mock_connect.return_value = self._connection_returning(
            [[150], [100], [100], [100], [0]]
        )

        with patch.object(sys, "argv", ["health_check", "--format", "json"]):
            exit_code = main()

        captured = capsys.readouterr()
        assert '"status": "ok"' in captured.out
        assert '"exit_code": 0' in captured.out
        # The returned code must agree with the one printed in the payload.
        assert exit_code == 0

    @patch("scripts.health_check.duckdb.connect")
    @patch("scripts.health_check.config")
    def test_db_connect_failure_exits_2(self, mock_config, mock_connect):
        """A failing database connection -> exit code 2."""
        # Patch config like the other tests so this one does not depend on the
        # real project configuration.
        mock_config.DATABASE_PATH = "/fake/db"
        mock_connect.side_effect = Exception("cannot open")

        with patch.object(sys, "argv", ["health_check"]):
            exit_code = main()

        assert exit_code == 2
||||||
@ -0,0 +1,92 @@ |
|||||||
|
from datetime import datetime, timedelta |
||||||
|
from unittest.mock import MagicMock |
||||||
|
|
||||||
|
import pytest |
||||||
|
|
||||||
|
from health import HealthStatus |
||||||
|
from health.checks import check_embedding_coverage, check_llm_coverage, check_motion_freshness |
||||||
|
|
||||||
|
|
||||||
|
class TestCheckMotionFreshness:
    """Tests for ``check_motion_freshness`` against a mocked connection."""

    def test_recent_motions_ok(self):
        """A count at or above ``min_motions`` is OK and is echoed in details."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.return_value = [150]

        result = check_motion_freshness(conn, max_age_days=7, min_motions=100)

        assert result.status == HealthStatus.OK
        assert result.details["count"] == 150

    def test_no_motions_critical(self):
        """Zero motions in the window is CRITICAL."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.return_value = [0]

        result = check_motion_freshness(conn, max_age_days=7, min_motions=100)

        assert result.status == HealthStatus.CRITICAL

    def test_low_count_warning(self):
        """A non-zero count below ``min_motions`` is a WARNING."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.return_value = [50]

        result = check_motion_freshness(conn, max_age_days=7, min_motions=100)

        assert result.status == HealthStatus.WARNING

    def test_query_error_critical(self):
        """A failing query is CRITICAL and the error text reaches the message."""
        conn = MagicMock()
        conn.execute.side_effect = Exception("db down")

        result = check_motion_freshness(conn)

        assert result.status == HealthStatus.CRITICAL
        assert "db down" in result.message
||||||
|
|
||||||
|
|
||||||
|
class TestCheckEmbeddingCoverage:
    """Tests for ``check_embedding_coverage`` against a mocked connection."""

    def test_full_coverage_ok(self):
        """Every motion embedded -> OK with coverage 1.0."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.side_effect = [[100], [100]]

        result = check_embedding_coverage(conn, min_coverage=0.95)

        assert result.status == HealthStatus.OK
        # Coverage is a computed float; compare with a tolerance.
        assert result.details["coverage"] == pytest.approx(1.0)

    def test_low_coverage_warning(self):
        """Coverage below ``min_coverage`` -> WARNING."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.side_effect = [[100], [80]]

        result = check_embedding_coverage(conn, min_coverage=0.95)

        assert result.status == HealthStatus.WARNING
        assert result.details["coverage"] == pytest.approx(0.8)

    def test_empty_db_critical(self):
        """An empty ``motions`` table -> CRITICAL."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.side_effect = [[0], [0]]

        result = check_embedding_coverage(conn)

        assert result.status == HealthStatus.CRITICAL

    def test_query_error_critical(self):
        """A failing query -> CRITICAL."""
        conn = MagicMock()
        conn.execute.side_effect = Exception("db down")

        result = check_embedding_coverage(conn)

        assert result.status == HealthStatus.CRITICAL
||||||
|
|
||||||
|
|
||||||
|
class TestCheckLLMCoverage:
    """Tests for ``check_llm_coverage`` against a mocked connection."""

    def test_full_coverage_ok(self):
        """No missing explanations -> OK with ratio 0.0."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.side_effect = [[100], [0]]

        result = check_llm_coverage(conn)

        assert result.status == HealthStatus.OK
        # The ratio is a computed float; compare with a tolerance.
        assert result.details["missing_ratio"] == pytest.approx(0.0)

    def test_some_missing_warning(self):
        """10% missing falls between the warning and critical thresholds."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.side_effect = [[100], [10]]

        result = check_llm_coverage(conn)

        assert result.status == HealthStatus.WARNING
        assert result.details["missing_ratio"] == pytest.approx(0.1)

    def test_too_many_missing_critical(self):
        """20% missing exceeds ``max_missing_ratio`` -> CRITICAL."""
        conn = MagicMock()
        conn.execute.return_value.fetchone.side_effect = [[100], [20]]

        result = check_llm_coverage(conn, max_missing_ratio=0.15)

        assert result.status == HealthStatus.CRITICAL
        assert result.details["missing_ratio"] == pytest.approx(0.2)

    def test_query_error_critical(self):
        """A failing query -> CRITICAL."""
        conn = MagicMock()
        conn.execute.side_effect = Exception("db down")

        result = check_llm_coverage(conn)

        assert result.status == HealthStatus.CRITICAL
||||||
@ -0,0 +1,54 @@ |
|||||||
|
import pytest |
||||||
|
|
||||||
|
from health import HealthCheck, HealthReport, HealthStatus, run_checks |
||||||
|
|
||||||
|
|
||||||
|
class TestHealthStatus:
    """Pins the string values of ``HealthStatus`` members."""

    def test_ok_less_than_warning(self):
        """Pin the value of OK (despite the name, no ordering is asserted)."""
        assert HealthStatus.OK.value == "ok"

    def test_warning_less_than_critical(self):
        """Pin the value of WARNING (despite the name, no ordering is asserted)."""
        assert HealthStatus.WARNING.value == "warning"

    def test_critical_value(self):
        """Pin the value of CRITICAL, which the other tests leave uncovered."""
        assert HealthStatus.CRITICAL.value == "critical"
||||||
|
|
||||||
|
|
||||||
|
class TestHealthReport:
    """Tests for status and exit-code aggregation in ``HealthReport``."""

    def test_all_ok_returns_ok(self):
        """Only OK checks -> OK, exit code 0."""
        checks = [
            HealthCheck("a", HealthStatus.OK, "fine", {}),
            HealthCheck("b", HealthStatus.OK, "fine", {}),
        ]

        report = run_checks(checks)

        assert report.status == HealthStatus.OK
        assert report.exit_code == 0

    def test_one_warning_returns_warning(self):
        """A single WARNING among OK checks -> WARNING, exit code 1."""
        checks = [
            HealthCheck("a", HealthStatus.OK, "fine", {}),
            HealthCheck("b", HealthStatus.WARNING, "hmm", {}),
        ]

        report = run_checks(checks)

        assert report.status == HealthStatus.WARNING
        assert report.exit_code == 1

    def test_one_critical_returns_critical(self):
        """A single CRITICAL among OK checks -> CRITICAL, exit code 2."""
        checks = [
            HealthCheck("a", HealthStatus.OK, "fine", {}),
            HealthCheck("b", HealthStatus.CRITICAL, "bad", {}),
        ]

        report = run_checks(checks)

        assert report.status == HealthStatus.CRITICAL
        assert report.exit_code == 2

    def test_critical_trumps_warning(self):
        """CRITICAL outranks WARNING when both are present."""
        checks = [
            HealthCheck("a", HealthStatus.WARNING, "hmm", {}),
            HealthCheck("b", HealthStatus.CRITICAL, "bad", {}),
        ]

        report = run_checks(checks)

        assert report.status == HealthStatus.CRITICAL
        assert report.exit_code == 2

    def test_empty_checks_returns_ok(self):
        """A report with no checks is OK, exit code 0."""
        report = run_checks([])

        assert report.status == HealthStatus.OK
        assert report.exit_code == 0
||||||
Loading…
Reference in new issue