feat(mindmodel): add report-only validator skeleton, types, and tests

1 month ago · f091846dc8
parent dbd8cc801a
commit f091846dc8
4 changed files with 309 additions and 0 deletions
--- a/src/validators/mindmodel_validator.py
+++ b/src/validators/mindmodel_validator.py
@ -0,0 +1,133 @@
 """Conservative, report-only mindmodel/manifest validator.
 This module provides a small validator that reads a manifest (YAML if
 PyYAML is available, otherwise a tiny fallback parser) and reports
 potential issues without making changes.
 The returned report contains the keys:
 - missing_files: list of file paths referenced in the manifest that don't exist
 - truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated
 - potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets
 The manifest is expected to contain a top-level `files` list with
 entries that are mappings and have at least a `path` (or `file_path`)
 and optionally `evidence_excerpt`.
 """
 from __future__ import annotations
 import os
 from typing import List, Dict, Any
 def _load_yaml_native(path: str) -> Dict[str, Any]:
    """Load *path* with PyYAML and return its top-level mapping.

    The import is local so that a missing PyYAML surfaces as ImportError
    at call time; validate_manifest catches that and switches to the
    fallback parser.

    Returns an empty dict when the document is empty or when its root is
    not a mapping (for example a bare list or scalar), so callers can
    always call ``.get`` on the result.

    Raises ImportError if PyYAML is not installed, OSError if the file
    cannot be opened, and PyYAML's own error if the document is invalid.
    """
    import yaml  # type: ignore

    with open(path, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)
    # A list/scalar root would break callers that expect a mapping.
    return loaded if isinstance(loaded, dict) else {}
 def _load_yaml_fallback(path: str) -> Dict[str, Any]:
    """Tiny YAML-ish fallback parser that understands a minimal manifest.

    It only supports a top-level `files:` key holding a sequence of simple
    mappings written as `-` list items with `key: value` pairs. It is
    intentionally conservative and fragile; it is only used when PyYAML
    is not available (or fails).

    Behaviour worth knowing:
    - one pair of matching surrounding quotes is removed from values,
      mirroring what a real YAML loader returns;
    - an unindented `key:` line ends the current section, so keys that
      follow the list are not merged into the last entry and list items
      under other top-level keys are ignored;
    - `-` lines indented deeper than the entries (nested sequences) do not
      start a new entry;
    - manifests that omit the `files:` marker and consist only of list
      items are still accepted.

    Returns ``{"files": [...]}`` when at least one entry was found,
    otherwise an empty dict.
    """

    def _unquote(value: str) -> str:
        # Remove exactly one pair of matching surrounding quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            return value[1:-1]
        return value

    files: List[Dict[str, Any]] = []
    current = None
    in_files = True  # tolerate manifests without an explicit `files:` line
    item_indent = None  # indentation of the entries in the current section
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            body = raw.strip()
            if not body or body.startswith("#"):
                continue
            indent = len(raw) - len(raw.lstrip())
            is_item = body == "-" or body.startswith("- ")
            if indent == 0 and not is_item:
                # An unindented key closes whatever entry was being built.
                if current is not None:
                    files.append(current)
                    current = None
                in_files = body.startswith("files:")
                item_indent = None
                continue
            if not in_files:
                continue
            if is_item:
                if item_indent is None:
                    item_indent = indent
                elif indent > item_indent:
                    # element of a nested sequence, not a new file entry
                    continue
                if current is not None:
                    files.append(current)
                current = {}
                # possible inline key: - path: something
                body = body[1:].strip()
            if ":" in body and current is not None:
                key, value = body.split(":", 1)
                current[key.strip()] = _unquote(value.strip())
    if current is not None:
        files.append(current)
    return {"files": files} if files else {}
 def _normalize_entry(entry: Any) -> Dict[str, Any]:
    """Return a manifest entry as a mapping keyed by `path`.

    Non-mapping entries are wrapped as ``{"path": str(entry)}``. A mapping
    that has `file_path` but no `path` is copied with the key renamed;
    any other mapping is returned unchanged (same object).
    """
    if isinstance(entry, dict):
        needs_rename = "path" not in entry and "file_path" in entry
        if not needs_rename:
            return entry
        # Build a copy so the caller's mapping is left untouched.
        renamed = {k: v for k, v in entry.items() if k != "file_path"}
        renamed["path"] = entry["file_path"]
        return renamed
    return {"path": str(entry)}
 def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
    """Validate a minimal mindmodel manifest and return a report.

    Parameters
    - manifest_path: path to the YAML manifest file
    - report_only: unused flag for now; kept to emphasise this is report-only

    Returns a dict with keys: missing_files, truncated_evidence, potential_secrets

    Raises FileNotFoundError when manifest_path does not exist.

    Entry paths are checked with os.path.exists exactly as written, so
    relative paths are resolved against the current working directory.
    """
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(manifest_path)
    # attempt to use PyYAML if available, otherwise fallback
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        # Deliberately broad: PyYAML may be missing or may reject the
        # document; either way the tiny parser gets a chance.
        manifest = _load_yaml_fallback(manifest_path)
    # A list/scalar document root has no `files` key to look up.
    if not isinstance(manifest, dict):
        manifest = {}
    files = manifest.get("files") or []
    if isinstance(files, (str, dict)):
        # A lone scalar/mapping is one entry; iterating it would yield
        # characters or keys instead.
        files = [files]
    elif not isinstance(files, (list, tuple)):
        files = []
    report = {"missing_files": [], "truncated_evidence": [], "potential_secrets": []}
    for raw in files:
        entry = _normalize_entry(raw)
        path = entry.get("path")
        if path and not isinstance(path, str):
            # os.path.exists treats ints as file descriptors and rejects
            # lists; report such values by their text form instead.
            path = str(path)
        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""
        # missing files
        if path and not os.path.exists(path):
            report["missing_files"].append(path)
        if not isinstance(evidence, str):
            continue
        # truncated evidence heuristics
        if len(evidence) > 1000 or evidence.strip().endswith(("...", "\u2026")):
            report["truncated_evidence"].append(
                {"path": path, "evidence_excerpt": evidence}
            )
        # potential secrets heuristics (case-insensitive; "PRIVATE KEY"
        # also covers headers such as "BEGIN RSA PRIVATE KEY")
        up = evidence.upper()
        if "PASSWORD" in up or "SECRET" in up or "PRIVATE KEY" in up:
            report["potential_secrets"].append(
                {"path": path, "evidence_excerpt": evidence}
            )
    return report
--- a/src/validators/types.py
+++ b/src/validators/types.py
@ -0,0 +1,107 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any, Dict, List
@dataclass
 class EvidencePointer:
    """Pointer to a piece of evidence: a path plus an optional excerpt."""
    # minimal placeholder for evidence pointing
    # NOTE(review): `path` is presumably the file the evidence comes from;
    # nothing in this module constructs the class yet, so confirm with callers.
    path: str
    # excerpt may be omitted; it then defaults to None
    excerpt: str | None = None
@dataclass
 class Constraint:
    """Key/value constraint record (placeholder)."""
    # minimal placeholder for constraints
    key: str
    # value is untyped and defaults to None when omitted
    value: Any = None
@dataclass
 class Manifest:
    """Parsed manifest holding the list of file entries."""
    # Each entry is a mapping; parse_manifest fills this list after renaming
    # `file_path` to `path` on entries that have no `path` key.
    files: List[Dict[str, Any]]
 def _load_yaml_native(path: str) -> dict:
    # Parse the file with PyYAML's safe loader. The import stays local so a
    # missing PyYAML raises ImportError here, where the caller can catch it.
    # Any document whose root is not a mapping (empty file, list, scalar)
    # is reported as an empty mapping so callers get a stable shape.
    import yaml  # type: ignore

    with open(path, "r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document if isinstance(document, dict) else {}
 def _load_yaml_fallback(path: str) -> dict:
    # Very small fallback that recognises a top-level files: list and
    # simple key: value lines. It intentionally is tiny and forgiving.
    #
    # - One pair of matching surrounding quotes is removed from values;
    #   quotes inside the value are kept.
    # - An unindented `key:` line ends the current section, so keys after
    #   the list are not merged into the last entry and list items under
    #   other top-level keys are ignored.
    # - `-` lines indented deeper than the entries (nested sequences) do
    #   not start a new entry.
    # - Manifests without a `files:` line that consist only of list items
    #   are still accepted.
    #
    # Returns {"files": [...]} when at least one entry was found, else {}.

    def _unquote(value: str) -> str:
        # Remove exactly one pair of matching surrounding quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            return value[1:-1]
        return value

    files = []
    current = None
    in_files = True  # tolerate manifests without an explicit `files:` line
    item_indent = None  # indentation of the entries in the current section
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            body = raw.strip()
            if not body or body.startswith("#"):
                continue
            indent = len(raw) - len(raw.lstrip())
            is_item = body == "-" or body.startswith("- ")
            if indent == 0 and not is_item:
                # An unindented key closes whatever entry was being built.
                if current is not None:
                    files.append(current)
                    current = None
                in_files = body.startswith("files:")
                item_indent = None
                continue
            if not in_files:
                continue
            if is_item:
                if item_indent is None:
                    item_indent = indent
                elif indent > item_indent:
                    # element of a nested sequence, not a new file entry
                    continue
                if current is not None:
                    files.append(current)
                current = {}
                body = body[1:].strip()
            if ":" in body and current is not None:
                key, value = body.split(":", 1)
                current[key.strip()] = _unquote(value.strip())
    if current is not None:
        files.append(current)
    return {"files": files} if files else {}
 def parse_manifest(manifest_path: str) -> Manifest:
    """Parse a minimal manifest file and return a Manifest dataclass.

    The function will attempt to use PyYAML (yaml.safe_load) when available;
    otherwise it falls back to a tiny parser that recognises a top-level
    `files:` list and simple mappings. The returned Manifest normalises
    `file_path` -> `path` when present.

    A `files` value that is a single scalar or mapping is treated as one
    entry; any other non-list value yields an empty Manifest. Scalar
    entries are coerced to ``{"path": str(entry)}``. Input mappings are
    never mutated: renamed entries are copies.
    """
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        # Deliberately broad: PyYAML may be missing or may reject the
        # document; either way the tiny parser gets a chance.
        manifest = _load_yaml_fallback(manifest_path)
    # Be defensive: ensure we always operate on a mapping
    if not isinstance(manifest, dict):
        manifest = {}
    files = manifest.get("files") or []
    if isinstance(files, (str, dict)):
        # Iterating a lone scalar/mapping would yield characters or keys.
        files = [files]
    elif not isinstance(files, (list, tuple)):
        files = []
    normalized: List[Dict[str, Any]] = []
    for entry in files:
        if not isinstance(entry, dict):
            # coerce simple scalar entries
            entry = {"path": str(entry)}
        elif "file_path" in entry and "path" not in entry:
            # prefer path over file_path; copy so the source is untouched
            entry = dict(entry)
            entry["path"] = entry.pop("file_path")
        normalized.append(entry)
    return Manifest(files=normalized)
 # Public API: the three dataclasses and parse_manifest. The _load_yaml_*
 # helpers are left out on purpose and stay private.
 __all__ = ["Manifest", "Constraint", "EvidencePointer", "parse_manifest"]
--- a/tests/validators/test_mindmodel_validator.py
+++ b/tests/validators/test_mindmodel_validator.py
@ -0,0 +1,45 @@
 import os
 import tempfile
 from pathlib import Path
 import pytest
 from src.validators.mindmodel_validator import validate_manifest
 def _write_temp_manifest(contents: str) -> str:
    """Write *contents* to a fresh ``manifest_*.yaml`` temp file.

    Returns the file's path; the caller is responsible for deleting it.
    """
    handle, manifest_path = tempfile.mkstemp(suffix=".yaml", prefix="manifest_")
    # Wrap the descriptor mkstemp already opened instead of reopening by name.
    with os.fdopen(handle, "w", encoding="utf-8") as out:
        out.write(contents)
    return manifest_path
 def test_validator_reports_missing_file(tmp_path):
    """A manifest path that does not exist on disk is listed as missing."""
    # tmp_path is empty, so this file is guaranteed not to exist
    missing = str(tmp_path / "no_such_file.txt")
    manifest = f"""
 files:
  - path: {missing}
 """
    manifest_file = _write_temp_manifest(manifest)
    try:
        outcome = validate_manifest(manifest_file)
    finally:
        # always remove the temporary manifest, even if validation raised
        os.remove(manifest_file)
    assert "missing_files" in outcome
    assert missing in outcome["missing_files"]
 def test_validator_detects_potential_secret(tmp_path):
    """Evidence mentioning PASSWORD shows up under potential_secrets."""
    evidence = "This shows a PASSWORD=hunter2 in the output"
    manifest = f'files:\n  - path: some_file.txt\n    evidence_excerpt: "{evidence}"\n'
    manifest_file = _write_temp_manifest(manifest)
    try:
        outcome = validate_manifest(manifest_file)
    finally:
        # always remove the temporary manifest, even if validation raised
        os.remove(manifest_file)
    assert "potential_secrets" in outcome
    # Substring match: the excerpt may or may not keep its YAML quotes.
    excerpts = [item.get("evidence_excerpt") or "" for item in outcome["potential_secrets"]]
    assert any(evidence in excerpt for excerpt in excerpts)
--- a/tests/validators/test_types.py
+++ b/tests/validators/test_types.py
@ -0,0 +1,24 @@
 import os
 from pathlib import Path
 import pytest
 from src.validators.types import parse_manifest, Manifest
 def test_manifest_model_parses_sample(tmp_path: Path):
    """parse_manifest returns a Manifest with `file_path` renamed to `path`."""
    sample = """
 files:
  - path: data/file1.txt
    evidence_excerpt: "some evidence"
  - file_path: data/file2.txt
    evidence_excerpt: "other evidence"
 """
    manifest_file = tmp_path / "manifest.yaml"
    manifest_file.write_text(sample, encoding="utf-8")
    parsed = parse_manifest(str(manifest_file))
    assert isinstance(parsed, Manifest)
    assert len(parsed.files) == 2
    # the second entry only had `file_path`; it must now expose `path`
    paths = [entry["path"] for entry in parsed.files]
    assert paths == ["data/file1.txt", "data/file2.txt"]