feat(mindmodel): add report-only validator skeleton, types, and tests

1 month ago · f091846dc8
parent dbd8cc801a
commit f091846dc8
4 changed files with 309 additions and 0 deletions
--- a/src/validators/mindmodel_validator.py
+++ b/src/validators/mindmodel_validator.py
@ -0,0 +1,133 @@
+"""Conservative, report-only mindmodel/manifest validator.
+
+This module provides a small validator that reads a manifest (YAML if
+PyYAML is available, otherwise a tiny fallback parser) and reports
+potential issues without making changes.
+
+The returned report contains the keys:
+- missing_files: list of file paths referenced in the manifest that don't exist
+- truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated
+- potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets
+
+The manifest is expected to contain a top-level `files` list with
+entries that are mappings and have at least a `path` (or `file_path`)
+and optionally `evidence_excerpt`.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import List, Dict, Any
+
+
+def _load_yaml_native(path: str) -> Dict[str, Any]:
+    """Load the manifest at ``path`` with PyYAML and return its top-level mapping.
+
+    Raises ImportError when PyYAML is not installed. I/O errors from
+    ``open`` and parse errors from ``yaml.safe_load`` propagate unchanged,
+    so the caller decides whether to fall back to the tiny parser.
+
+    A document whose root is not a mapping (empty file, bare list, scalar)
+    yields ``{}`` so callers can always use ``.get`` on the result.
+    """
+    # Imported lazily: PyYAML is an optional dependency.
+    import yaml  # type: ignore
+
+    with open(path, "r", encoding="utf-8") as f:
+        loaded = yaml.safe_load(f)
+
+    # safe_load returns whatever the document root is (list, str, None, ...).
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _load_yaml_fallback(path: str) -> Dict[str, Any]:
+    """Tiny YAML-ish fallback parser that understands a minimal manifest.
+
+    It only supports a top-level `files:` key and a sequence of simple
+    mappings with `-` list items and `key: value` pairs indented.
+    This is intentionally conservative and fragile; it's only used when
+    PyYAML is not available.
+
+    Values are returned as strings with surrounding whitespace and
+    surrounding single/double quotes removed, so `path: "a.txt"` yields
+    `a.txt` (as a real YAML parser would) rather than `"a.txt"`.
+
+    Returns `{"files": [...]}` when at least one list item was seen,
+    otherwise an empty dict.
+    """
+    files: List[Dict[str, Any]] = []
+    current: Dict[str, Any] | None = None
+
+    with open(path, "r", encoding="utf-8") as f:
+        for raw in f:
+            line = raw.rstrip("\n")
+            stripped = line.lstrip()
+            if not stripped or stripped.startswith("#"):
+                continue
+            if stripped.startswith("files:") and line.startswith(stripped):
+                # top-level marker (unindented), skip
+                continue
+            if stripped.startswith("- "):
+                # start new item, flushing the previous one
+                if current is not None:
+                    files.append(current)
+                current = {}
+                # possible inline key: - path: something
+                pair = stripped[2:].strip()
+            elif current is not None:
+                # key: value line belonging to the open item
+                pair = stripped
+            else:
+                # content before the first list item is ignored
+                continue
+
+            key, sep, value = pair.partition(":")
+            if sep:
+                # Drop surrounding quotes so quoted scalars compare equal to
+                # what PyYAML would return for the same manifest.
+                current[key.strip()] = value.strip().strip("'\"")
+
+    if current is not None:
+        files.append(current)
+    return {"files": files} if files else {}
+
+
+def _normalize_entry(entry: Any) -> Dict[str, Any]:
+    """Return ``entry`` as a mapping that uses ``path`` as its location key.
+
+    Non-mapping entries are wrapped as ``{"path": str(entry)}``. A mapping
+    that has ``file_path`` but no ``path`` is copied with the key renamed;
+    any other mapping is returned as-is (same object).
+    """
+    if isinstance(entry, dict):
+        needs_rename = "file_path" in entry and "path" not in entry
+        if not needs_rename:
+            return entry
+        # Work on a copy so the caller's mapping is left untouched.
+        renamed = dict(entry)
+        renamed["path"] = renamed.pop("file_path")
+        return renamed
+    return {"path": str(entry)}
+
+
+def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
+    """Validate a minimal mindmodel manifest and return a report.
+
+    Nothing is modified: the manifest is only read and the referenced
+    paths are only checked for existence.
+
+    Parameters
+    - manifest_path: path to the YAML manifest file
+    - report_only: unused flag for now; kept to emphasise this is report-only
+
+    Returns a dict with keys:
+    - missing_files: `path` values for which os.path.exists() is false
+      (relative paths are resolved against the current working directory)
+    - truncated_evidence: {"path", "evidence_excerpt"} dicts whose evidence
+      is longer than 1000 characters or ends with "..."
+    - potential_secrets: {"path", "evidence_excerpt"} dicts whose evidence
+      mentions PASSWORD, SECRET or PRIVATE KEY (case-insensitive)
+
+    Raises FileNotFoundError when `manifest_path` does not exist.
+    """
+    if not os.path.exists(manifest_path):
+        raise FileNotFoundError(manifest_path)
+
+    # attempt to use PyYAML if available, otherwise fallback.
+    # NOTE: the broad catch is deliberate best-effort; it also routes YAML
+    # parse errors (not just a missing PyYAML) to the fallback parser.
+    try:
+        manifest = _load_yaml_native(manifest_path)
+    except Exception:
+        manifest = _load_yaml_fallback(manifest_path)
+
+    # A manifest whose root is not a mapping, or whose `files` value is not
+    # a list, has no entries to check. Iterating a string or mapping here
+    # would otherwise produce one bogus entry per character/key.
+    files = manifest.get("files") if isinstance(manifest, dict) else None
+    if not isinstance(files, list):
+        files = []
+
+    report = {"missing_files": [], "truncated_evidence": [], "potential_secrets": []}
+
+    for raw in files:
+        entry = _normalize_entry(raw)
+        path = entry.get("path")
+        # os.path.exists() treats an int as an open file descriptor and
+        # raises TypeError for other non-path objects, so compare as text.
+        if path is not None and not isinstance(path, str):
+            path = str(path)
+        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""
+
+        # missing files
+        if path and not os.path.exists(path):
+            report["missing_files"].append(path)
+
+        # Non-string evidence (e.g. a nested list) is not inspected.
+        if not isinstance(evidence, str):
+            continue
+
+        # truncated evidence heuristics
+        if len(evidence) > 1000 or evidence.strip().endswith("..."):
+            report["truncated_evidence"].append(
+                {"path": path, "evidence_excerpt": evidence}
+            )
+
+        # potential secrets heuristics; "PRIVATE KEY" covers every PEM
+        # private-key label (RSA, EC, OPENSSH, ENCRYPTED, plain PKCS#8).
+        up = evidence.upper()
+        if any(marker in up for marker in ("PASSWORD", "SECRET", "PRIVATE KEY")):
+            report["potential_secrets"].append(
+                {"path": path, "evidence_excerpt": evidence}
+            )
+
+    return report
--- a/src/validators/types.py
+++ b/src/validators/types.py
@ -0,0 +1,107 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+
+@dataclass
+class EvidencePointer:
+    """Minimal placeholder for pointing at a piece of evidence."""
+
+    # Location string; presumably a file path like the manifest's `path`
+    # entries -- TODO confirm once this type has callers.
+    path: str
+    # Optional excerpt text; defaults to None when no excerpt is given.
+    excerpt: str | None = None
+
+
+@dataclass
+class Constraint:
+    """Minimal placeholder for a constraint expressed as a key/value pair."""
+
+    # Name identifying the constraint.
+    key: str
+    # Associated value of any type; defaults to None.
+    value: Any = None
+
+
+@dataclass
+class Manifest:
+    """Parsed manifest: the list of file-entry mappings it declares."""
+
+    # One mapping per manifest entry; parse_manifest() stores entries here
+    # after renaming `file_path` to `path` where needed.
+    files: List[Dict[str, Any]]
+
+
+def _load_yaml_native(path: str) -> dict:
+    # Parse the file with PyYAML (imported lazily, so a missing install
+    # surfaces as ImportError at call time). Anything other than a mapping
+    # at the document root is replaced by an empty mapping so callers get
+    # a stable dict-shaped result.
+    import yaml  # type: ignore
+
+    with open(path, "r", encoding="utf-8") as handle:
+        document = yaml.safe_load(handle)
+
+    return document if isinstance(document, dict) else {}
+
+
+def _load_yaml_fallback(path: str) -> dict:
+    # Stand-in parser for when PyYAML cannot be used. It understands an
+    # unindented `files:` marker, `- ` list items and `key: value` lines;
+    # values are stored as strings with surrounding quotes removed.
+    # Everything else is skipped without complaint.
+    entries: List[Dict[str, Any]] = []
+    entry: Dict[str, Any] | None = None
+
+    with open(path, "r", encoding="utf-8") as handle:
+        for raw_line in handle:
+            text = raw_line.rstrip("\n")
+            body = text.lstrip()
+            # blank lines and comments carry no data
+            if body == "" or body.startswith("#"):
+                continue
+            # the list header only counts when it is not indented
+            if text.startswith("files:"):
+                continue
+
+            if body.startswith("- "):
+                # a dash opens a new entry; flush the one in progress
+                if entry is not None:
+                    entries.append(entry)
+                entry = {}
+                pair = body[2:].strip()
+            elif entry is not None:
+                pair = body
+            else:
+                # nothing to attach this line to yet
+                continue
+
+            name, colon, value = pair.partition(":")
+            if colon:
+                entry[name.strip()] = value.strip().strip("'\"")
+
+    if entry is not None:
+        entries.append(entry)
+    return {"files": entries} if entries else {}
+
+
+def parse_manifest(manifest_path: str) -> Manifest:
+    """Parse a minimal manifest file and return a Manifest dataclass.
+
+    The function will attempt to use PyYAML (yaml.safe_load) when available;
+    otherwise it falls back to a tiny parser that recognises a top-level
+    `files:` list and simple mappings. The returned Manifest normalises
+    `file_path` -> `path` when present.
+
+    A manifest whose root is not a mapping, or whose `files` value is not a
+    list, yields a Manifest with no entries. Scalar list items are coerced
+    to `{"path": str(item)}`. Entry mappings that need the key rename are
+    copied first, so the loaded data is not mutated.
+
+    Errors raised by the fallback parser (e.g. FileNotFoundError for a
+    missing manifest) propagate to the caller.
+    """
+    # NOTE: the broad catch is deliberate best-effort; it covers a missing
+    # PyYAML as well as YAML parse errors.
+    try:
+        manifest = _load_yaml_native(manifest_path)
+    except Exception:
+        manifest = _load_yaml_fallback(manifest_path)
+
+    # Be defensive: ensure we always operate on a mapping
+    if not isinstance(manifest, dict):
+        manifest = {}
+
+    # ...and on a list of entries. Iterating a string or mapping here would
+    # otherwise yield one bogus entry per character/key.
+    files = manifest.get("files")
+    if not isinstance(files, list):
+        files = []
+
+    normalized: List[Dict[str, Any]] = []
+    for entry in files:
+        if not isinstance(entry, dict):
+            # coerce simple scalar entries
+            entry = {"path": str(entry)}
+        # prefer path over file_path
+        if "file_path" in entry and "path" not in entry:
+            entry = dict(entry)
+            entry["path"] = entry.pop("file_path")
+        normalized.append(entry)
+
+    return Manifest(files=normalized)
+
+
+__all__ = ["Manifest", "Constraint", "EvidencePointer", "parse_manifest"]
--- a/tests/validators/test_mindmodel_validator.py
+++ b/tests/validators/test_mindmodel_validator.py
@ -0,0 +1,45 @@
+import os
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from src.validators.mindmodel_validator import validate_manifest
+
+
+def _write_temp_manifest(contents: str) -> str:
+    """Write ``contents`` to a new ``manifest_*.yaml`` temp file and return its path.
+
+    The file is not removed automatically; the caller deletes it.
+    """
+    handle, path = tempfile.mkstemp(prefix="manifest_", suffix=".yaml")
+    # Wrap the descriptor mkstemp already opened instead of reopening by name.
+    with os.fdopen(handle, "w", encoding="utf-8") as out:
+        out.write(contents)
+    return path
+
+
+def test_validator_reports_missing_file(tmp_path):
+    """A manifest entry whose path does not exist is listed under missing_files."""
+    # tmp_path is freshly created, so this file is guaranteed to be absent.
+    absent = str(tmp_path / "no_such_file.txt")
+    manifest_file = _write_temp_manifest(f"\nfiles:\n  - path: {absent}\n")
+    try:
+        report = validate_manifest(manifest_file)
+    finally:
+        # Remove the temporary manifest whether or not validation raised.
+        os.remove(manifest_file)
+
+    assert "missing_files" in report
+    assert absent in report["missing_files"]
+
+
+def test_validator_detects_potential_secret(tmp_path):
+    """Evidence text containing PASSWORD is reported under potential_secrets."""
+    secret_text = "This shows a PASSWORD=hunter2 in the output"
+    manifest_file = _write_temp_manifest(
+        f'files:\n  - path: some_file.txt\n    evidence_excerpt: "{secret_text}"\n'
+    )
+    try:
+        report = validate_manifest(manifest_file)
+    finally:
+        # Remove the temporary manifest whether or not validation raised.
+        os.remove(manifest_file)
+
+    assert "potential_secrets" in report
+    flagged = [item.get("evidence_excerpt") or "" for item in report["potential_secrets"]]
+    # Substring match: the fallback parser may keep surrounding quotes.
+    assert any(secret_text in excerpt for excerpt in flagged)
--- a/tests/validators/test_types.py
+++ b/tests/validators/test_types.py
@ -0,0 +1,24 @@
+import os
+from pathlib import Path
+
+import pytest
+
+from src.validators.types import parse_manifest, Manifest
+
+
+def test_manifest_model_parses_sample(tmp_path: Path):
+    """parse_manifest returns both entries, with file_path exposed as path."""
+    manifest_file = tmp_path / "manifest.yaml"
+    manifest_file.write_text(
+        "\n"
+        "files:\n"
+        "  - path: data/file1.txt\n"
+        '    evidence_excerpt: "some evidence"\n'
+        "  - file_path: data/file2.txt\n"
+        '    evidence_excerpt: "other evidence"\n',
+        encoding="utf-8",
+    )
+
+    parsed = parse_manifest(str(manifest_file))
+
+    assert isinstance(parsed, Manifest)
+    # Checks the entry count and the normalised path of each entry at once.
+    assert [item["path"] for item in parsed.files] == [
+        "data/file1.txt",
+        "data/file2.txt",
+    ]