From f091846dc8c27ad89773945c51e28f1e1b8d90b5 Mon Sep 17 00:00:00 2001
From: Sven Geboers
Date: Tue, 24 Mar 2026 22:41:15 +0100
Subject: [PATCH] feat(mindmodel): add report-only validator skeleton, types, and tests

---
Review notes: the line structure of the patch was restored, and the
validator now reuses parse_manifest instead of duplicating the loaders.
Blob ids ("index" lines) are omitted because the contents changed.

 src/validators/mindmodel_validator.py        |  99 ++++++++++++++
 src/validators/types.py                      | 165 ++++++++++++++++++++++++
 tests/validators/test_mindmodel_validator.py |  61 +++++++++
 tests/validators/test_types.py               |  49 +++++++
 4 files changed, 374 insertions(+)
 create mode 100644 src/validators/mindmodel_validator.py
 create mode 100644 src/validators/types.py
 create mode 100644 tests/validators/test_mindmodel_validator.py
 create mode 100644 tests/validators/test_types.py

diff --git a/src/validators/mindmodel_validator.py b/src/validators/mindmodel_validator.py
new file mode 100644
--- /dev/null
+++ b/src/validators/mindmodel_validator.py
@@ -0,0 +1,99 @@
+"""Conservative, report-only mindmodel/manifest validator.
+
+This module reads a manifest (YAML if PyYAML is available, otherwise a
+tiny fallback parser; see ``parse_manifest``) and reports potential
+issues without making changes.
+
+The returned report contains the keys:
+- missing_files: list of file paths referenced in the manifest that don't exist
+- truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated
+- potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets
+
+The manifest is expected to contain a top-level `files` list with
+entries that are mappings and have at least a `path` (or `file_path`)
+and optionally `evidence_excerpt`.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from typing import Any, Dict, List
+
+from .types import parse_manifest
+
+# Evidence longer than this many characters is assumed to have been cut off.
+MAX_EVIDENCE_LENGTH = 1000
+
+# Trailing markers that suggest an excerpt was cut short (ASCII and Unicode).
+_ELLIPSES = ("...", "\u2026")
+
+# Keywords (matched case-insensitively) that hint at credentials.
+_SECRET_WORDS = ("PASSWORD", "SECRET")
+
+# PEM headers: "BEGIN PRIVATE KEY", "BEGIN RSA PRIVATE KEY", ...
+_PRIVATE_KEY_RE = re.compile(r"BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY")
+
+
+def _looks_truncated(evidence: str) -> bool:
+    """Return True if *evidence* is overly long or ends with an ellipsis."""
+    if len(evidence) > MAX_EVIDENCE_LENGTH:
+        return True
+    return evidence.strip().endswith(_ELLIPSES)
+
+
+def _looks_secret(evidence: str) -> bool:
+    """Return True if *evidence* mentions a credential keyword or a PEM key."""
+    upper = evidence.upper()
+    if any(word in upper for word in _SECRET_WORDS):
+        return True
+    return _PRIVATE_KEY_RE.search(evidence) is not None
+
+
+def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
+    """Validate a minimal mindmodel manifest and return a report.
+
+    Nothing is modified on disk; the function only inspects the manifest
+    and the paths it references. Relative paths are checked against the
+    current working directory.
+
+    Parameters
+    - manifest_path: path to the YAML manifest file
+    - report_only: unused flag for now; kept to emphasise this is report-only
+
+    Returns a dict with keys: missing_files, truncated_evidence, potential_secrets
+
+    Raises FileNotFoundError if the manifest itself does not exist. Errors
+    raised while parsing a malformed manifest with PyYAML propagate.
+    """
+    if not os.path.exists(manifest_path):
+        raise FileNotFoundError(manifest_path)
+
+    report: Dict[str, List[Any]] = {
+        "missing_files": [],
+        "truncated_evidence": [],
+        "potential_secrets": [],
+    }
+
+    for entry in parse_manifest(manifest_path).files:
+        path = entry.get("path")
+        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""
+
+        # str() matters: os.path.exists(3) would probe file descriptor 3
+        if path and not os.path.exists(str(path)):
+            report["missing_files"].append(path)
+
+        # only textual evidence can be checked by the heuristics below
+        if not isinstance(evidence, str):
+            continue
+
+        if _looks_truncated(evidence):
+            report["truncated_evidence"].append(
+                {"path": path, "evidence_excerpt": evidence}
+            )
+        if _looks_secret(evidence):
+            report["potential_secrets"].append(
+                {"path": path, "evidence_excerpt": evidence}
+            )
+
+    return report
diff --git a/src/validators/types.py b/src/validators/types.py
new file mode 100644
--- /dev/null
+++ b/src/validators/types.py
@@ -0,0 +1,165 @@
+"""Manifest data types and a minimal manifest parser."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+
+
+@dataclass
+class EvidencePointer:
+    """Minimal placeholder for pointing at evidence inside a file."""
+
+    path: str
+    excerpt: str | None = None
+
+
+@dataclass
+class Constraint:
+    """Minimal placeholder for a key/value constraint."""
+
+    key: str
+    value: Any = None
+
+
+@dataclass
+class Manifest:
+    """Parsed manifest holding the normalised ``files`` entries."""
+
+    files: List[Dict[str, Any]]
+
+
+def _load_yaml_native(path: str) -> dict:
+    """Load *path* with PyYAML (``yaml.safe_load``).
+
+    If the loaded document is not a mapping, an empty mapping is returned
+    to keep the API stable for callers. Raises ImportError when PyYAML is
+    not installed.
+    """
+    import yaml  # type: ignore
+
+    with open(path, "r", encoding="utf-8") as f:
+        loaded = yaml.safe_load(f)
+
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _clean_scalar(text: str) -> str:
+    """Return *text* without surrounding quotes or a trailing comment."""
+    text = text.strip()
+    if text[:1] in ("'", '"'):
+        closing = text.find(text[0], 1)
+        if closing != -1:
+            return text[1:closing]
+    if text.startswith("#"):
+        return ""
+    # in YAML a comment starts at a "#" that follows whitespace
+    return text.split(" #", 1)[0].rstrip()
+
+
+def _split_key_value(text: str) -> Tuple[str, str] | None:
+    """Split ``key: value`` (or ``key:``); return None for a plain scalar.
+
+    The colon must be followed by a space or end the text, as in YAML, so
+    scalars such as ``C:\\data\\x.txt`` are not mistaken for mappings.
+    """
+    if text[:1] in ("'", '"'):
+        return None
+    key, sep, value = text.partition(": ")
+    if not sep:
+        if not text.endswith(":"):
+            return None
+        key = text[:-1]
+    return key.strip(), _clean_scalar(value)
+
+
+def _load_yaml_fallback(path: str) -> dict:
+    """Parse a minimal manifest without PyYAML.
+
+    Only a top-level ``files:`` key holding a list is understood. Items
+    may be plain scalars (``- a.txt``) or flat mappings of ``key: value``
+    pairs; nested structures are not supported. List items and keys that
+    belong to other top-level sections are ignored.
+    """
+    files: List[Any] = []
+    current: Dict[str, Any] | None = None
+    in_files = False
+
+    with open(path, "r", encoding="utf-8") as f:
+        for raw in f:
+            stripped = raw.strip()
+            if not stripped or stripped.startswith("#"):
+                continue
+
+            is_item = stripped == "-" or stripped.startswith("- ")
+            if not raw[0].isspace() and not is_item:
+                # an unindented key opens or closes the ``files`` section
+                in_files = _split_key_value(stripped) == ("files", "")
+                current = None
+                continue
+
+            if is_item:
+                current = None
+                if not in_files:
+                    continue
+                rest = stripped[1:].strip()
+                pair = _split_key_value(rest) if rest else None
+                if rest and pair is None:
+                    # plain scalar item, e.g. ``- data/file.txt``
+                    files.append(_clean_scalar(rest))
+                else:
+                    current = {} if pair is None else {pair[0]: pair[1]}
+                    files.append(current)
+                continue
+
+            if current is not None:
+                pair = _split_key_value(stripped)
+                if pair is not None:
+                    current[pair[0]] = pair[1]
+
+    # a bare ``-`` with no keys loads as null in YAML; drop such items
+    files = [item for item in files if item != {}]
+    return {"files": files} if files else {}
+
+
+def parse_manifest(manifest_path: str) -> Manifest:
+    """Parse a minimal manifest file and return a Manifest dataclass.
+
+    PyYAML (``yaml.safe_load``) is used when it is installed; otherwise a
+    tiny parser that recognises a top-level `files:` list of scalars and
+    simple mappings takes over. Scalar entries are coerced to
+    ``{"path": ...}``, null entries are skipped and `file_path` is
+    normalised to `path` when `path` is absent.
+
+    Raises OSError if the manifest cannot be read. With PyYAML installed,
+    a malformed document raises ``yaml.YAMLError`` instead of being
+    silently re-read by the weaker fallback parser.
+    """
+    try:
+        manifest = _load_yaml_native(manifest_path)
+    except ImportError:
+        # PyYAML is not installed
+        manifest = _load_yaml_fallback(manifest_path)
+
+    files = manifest.get("files")
+    if not isinstance(files, (list, tuple)):
+        # e.g. ``files: a.txt``: iterating a string would yield characters
+        files = []
+
+    normalized: List[Dict[str, Any]] = []
+    for entry in files:
+        if entry is None:
+            continue
+        if not isinstance(entry, dict):
+            # coerce simple scalar entries
+            entry = {"path": str(entry)}
+        elif "file_path" in entry and "path" not in entry:
+            # copy before renaming so the loaded document is not mutated
+            entry = dict(entry)
+            entry["path"] = entry.pop("file_path")
+        normalized.append(entry)
+
+    return Manifest(files=normalized)
+
+
+__all__ = ["Manifest", "Constraint", "EvidencePointer", "parse_manifest"]
diff --git a/tests/validators/test_mindmodel_validator.py b/tests/validators/test_mindmodel_validator.py
new file mode 100644
--- /dev/null
+++ b/tests/validators/test_mindmodel_validator.py
@@ -0,0 +1,61 @@
+from src.validators.mindmodel_validator import validate_manifest
+
+
+def _write_manifest(tmp_path, contents: str) -> str:
+    # pytest removes tmp_path for us, so no manual cleanup is needed
+    manifest = tmp_path / "manifest.yaml"
+    manifest.write_text(contents, encoding="utf-8")
+    return str(manifest)
+
+
+def _manifest_with_evidence(tmp_path, evidence: str) -> str:
+    return _write_manifest(
+        tmp_path,
+        f'files:\n  - path: some_file.txt\n    evidence_excerpt: "{evidence}"\n',
+    )
+
+
+def test_validator_reports_missing_file(tmp_path):
+    # manifest referencing a non-existent file
+    missing = str(tmp_path / "no_such_file.txt")
+    mpath = _write_manifest(tmp_path, f"files:\n  - path: '{missing}'\n")
+
+    report = validate_manifest(mpath)
+
+    assert report["missing_files"] == [missing]
+
+
+def test_validator_ignores_existing_file(tmp_path):
+    existing = tmp_path / "present.txt"
+    existing.write_text("data", encoding="utf-8")
+    mpath = _write_manifest(tmp_path, f"files:\n  - path: '{existing}'\n")
+
+    assert validate_manifest(mpath)["missing_files"] == []
+
+
+def test_validator_detects_potential_secret(tmp_path):
+    # manifest with evidence_excerpt containing PASSWORD
+    evidence = "This shows a PASSWORD=hunter2 in the output"
+    mpath = _manifest_with_evidence(tmp_path, evidence)
+
+    items = validate_manifest(mpath)["potential_secrets"]
+
+    assert [item["evidence_excerpt"] for item in items] == [evidence]
+
+
+def test_validator_detects_typed_private_key_header(tmp_path):
+    evidence = "-----BEGIN RSA PRIVATE KEY-----"
+    mpath = _manifest_with_evidence(tmp_path, evidence)
+
+    items = validate_manifest(mpath)["potential_secrets"]
+
+    assert [item["evidence_excerpt"] for item in items] == [evidence]
+
+
+def test_validator_flags_truncated_evidence(tmp_path):
+    mpath = _manifest_with_evidence(tmp_path, "output was cut off here...")
+
+    report = validate_manifest(mpath)
+
+    assert len(report["truncated_evidence"]) == 1
+    assert report["potential_secrets"] == []
diff --git a/tests/validators/test_types.py b/tests/validators/test_types.py
new file mode 100644
--- /dev/null
+++ b/tests/validators/test_types.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+
+from src.validators.types import Manifest, _load_yaml_fallback, parse_manifest
+
+
+def test_manifest_model_parses_sample(tmp_path: Path):
+    sample = """
+files:
+  - path: data/file1.txt
+    evidence_excerpt: "some evidence"
+  - file_path: data/file2.txt
+    evidence_excerpt: "other evidence"
+"""
+    p = tmp_path / "manifest.yaml"
+    p.write_text(sample, encoding="utf-8")
+
+    manifest = parse_manifest(str(p))
+    assert isinstance(manifest, Manifest)
+    assert len(manifest.files) == 2
+    assert manifest.files[0]["path"] == "data/file1.txt"
+    assert manifest.files[1]["path"] == "data/file2.txt"
+
+
+def test_scalar_files_value_is_not_split_into_characters(tmp_path: Path):
+    p = tmp_path / "manifest.yaml"
+    p.write_text("files: data/file1.txt\n", encoding="utf-8")
+
+    assert parse_manifest(str(p)).files == []
+
+
+def test_fallback_parser_keeps_scalars_and_stops_at_next_section(tmp_path: Path):
+    sample = """
+files:
+  - data/file1.txt
+  - path: "data/file2.txt"  # quoted value
+    evidence_excerpt: 'a: b'
+version: 1
+extras:
+  - not_a_file
+"""
+    p = tmp_path / "manifest.yaml"
+    p.write_text(sample, encoding="utf-8")
+
+    assert _load_yaml_fallback(str(p)) == {
+        "files": [
+            "data/file1.txt",
+            {"path": "data/file2.txt", "evidence_excerpt": "a: b"},
+        ]
+    }