feat(mindmodel): add report-only validator skeleton, types, and tests

main
Sven Geboers 1 month ago
parent dbd8cc801a
commit f091846dc8
  1. 133
      src/validators/mindmodel_validator.py
  2. 107
      src/validators/types.py
  3. 45
      tests/validators/test_mindmodel_validator.py
  4. 24
      tests/validators/test_types.py

@ -0,0 +1,133 @@
"""Conservative, report-only mindmodel/manifest validator.
This module provides a small validator that reads a manifest (YAML if
PyYAML is available, otherwise a tiny fallback parser) and reports
potential issues without making changes.
The returned report contains the keys:
- missing_files: list of file paths referenced in the manifest that don't exist
- truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated
- potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets
The manifest is expected to contain a top-level `files` list with
entries that are mappings and have at least a `path` (or `file_path`)
and optionally `evidence_excerpt`.
"""
from __future__ import annotations
import os
from typing import List, Dict, Any
def _load_yaml_native(path: str) -> Dict[str, Any]:
try:
import yaml # type: ignore
with open(path, "r", encoding="utf-8") as f:
return yaml.safe_load(f) or {}
except Exception:
raise
def _load_yaml_fallback(path: str) -> Dict[str, Any]:
"""Tiny YAML-ish fallback parser that understands a minimal manifest.
It only supports a top-level `files:` key and a sequence of simple
mappings with `-` list items and `key: value` pairs indented.
This is intentionally conservative and fragile; it's only used when
PyYAML is not available.
"""
result: Dict[str, Any] = {}
files: List[Dict[str, Any]] = []
current: Dict[str, Any] | None = None
with open(path, "r", encoding="utf-8") as f:
for raw in f:
line = raw.rstrip("\n")
stripped = line.lstrip()
if not stripped or stripped.startswith("#"):
continue
if stripped.startswith("files:") and line.startswith(stripped):
# top-level marker, skip
continue
if stripped.startswith("- "):
# start new item
if current is not None:
files.append(current)
current = {}
# possible inline key: - path: something
rest = stripped[2:].strip()
if rest:
if ":" in rest:
k, v = rest.split(":", 1)
current[k.strip()] = v.strip()
continue
# key: value lines (indented)
if ":" in stripped and current is not None:
k, v = stripped.split(":", 1)
current[k.strip()] = v.strip()
if current is not None:
files.append(current)
if files:
result["files"] = files
return result
def _normalize_entry(entry: Any) -> Dict[str, Any]:
if not isinstance(entry, dict):
return {"path": str(entry)}
# prefer path or file_path
if "file_path" in entry and "path" not in entry:
entry = dict(entry)
entry["path"] = entry.pop("file_path")
return entry
def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
    """Validate a minimal mindmodel manifest and return a report.

    Nothing is modified on disk; the function only inspects the manifest
    and checks whether the referenced paths exist.

    Parameters
    - manifest_path: path to the YAML manifest file
    - report_only: unused flag for now; kept to emphasise this is report-only

    Returns a dict with keys:
    - missing_files: paths from the manifest for which ``os.path.exists``
      is false (relative paths are resolved against the current working
      directory)
    - truncated_evidence: ``{"path", "evidence_excerpt"}`` items whose
      evidence is longer than 1000 characters or ends with ``...``
    - potential_secrets: ``{"path", "evidence_excerpt"}`` items whose
      evidence mentions PASSWORD, SECRET, or a PEM private-key header
      (case-insensitive)

    Raises:
        FileNotFoundError: ``manifest_path`` does not exist.
    """
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(manifest_path)

    # Best effort by design: any failure on the PyYAML path (library not
    # installed, malformed YAML) falls through to the tiny fallback parser.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    # A YAML document may be a list or a scalar; only a mapping can carry
    # a `files` key, so anything else is treated as an empty manifest.
    if not isinstance(manifest, dict):
        manifest = {}
    files = manifest.get("files") or []
    # A lone entry (`files: a.txt` or `files: {path: a.txt}`) is one item,
    # not something to iterate character by character or key by key.
    if not isinstance(files, (list, tuple)):
        files = [files]

    # PEM labels that introduce private-key material, compared in upper case.
    secret_markers = (
        "PASSWORD",
        "SECRET",
        "BEGIN PRIVATE KEY",
        "BEGIN ENCRYPTED PRIVATE KEY",
        "BEGIN RSA PRIVATE KEY",
        "BEGIN EC PRIVATE KEY",
        "BEGIN DSA PRIVATE KEY",
        "BEGIN OPENSSH PRIVATE KEY",
    )

    report: dict = {
        "missing_files": [],
        "truncated_evidence": [],
        "potential_secrets": [],
    }
    for raw in files:
        entry = _normalize_entry(raw)
        path = entry.get("path")
        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""

        # str() keeps a numeric YAML value from being probed as an open
        # file descriptor, which is what os.path.exists does with an int.
        if path and not os.path.exists(str(path)):
            report["missing_files"].append(path)

        # The heuristics below only make sense for textual evidence.
        if not isinstance(evidence, str):
            continue

        if len(evidence) > 1000 or evidence.strip().endswith("..."):
            report["truncated_evidence"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

        upper = evidence.upper()
        if any(marker in upper for marker in secret_markers):
            report["potential_secrets"].append(
                {"path": path, "evidence_excerpt": evidence}
            )
    return report

@ -0,0 +1,107 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, List
@dataclass
class EvidencePointer:
    """Minimal placeholder for pointing at a piece of evidence."""

    # Required; no default.
    path: str
    # Optional text; None when not supplied.
    excerpt: str | None = None
@dataclass
class Constraint:
    """Minimal placeholder for a constraint, stored as a key/value pair."""

    # Required; no default.
    key: str
    # Any value is accepted; None when not supplied.
    value: Any = None
@dataclass
class Manifest:
    """Parsed manifest: the list of file entries, each a plain mapping."""

    # Required; one mapping per manifest entry.
    files: List[Dict[str, Any]]
def _load_yaml_native(path: str) -> dict:
# Use PyYAML when available. If the loaded document is not a mapping,
# return an empty mapping to keep the API stable for callers.
import yaml # type: ignore
with open(path, "r", encoding="utf-8") as f:
loaded = yaml.safe_load(f)
if not isinstance(loaded, dict):
return {}
return loaded
def _load_yaml_fallback(path: str) -> dict:
# very small fallback that recognises a top-level files: list and
# simple key: value lines. It intentionally is tiny and forgiving.
result: dict = {}
files: List[Dict[str, Any]] = []
current: Dict[str, Any] | None = None
with open(path, "r", encoding="utf-8") as f:
for raw in f:
line = raw.rstrip("\n")
stripped = line.lstrip()
if not stripped or stripped.startswith("#"):
continue
if stripped.startswith("files:") and line.startswith(stripped):
continue
if stripped.startswith("- "):
if current is not None:
files.append(current)
current = {}
rest = stripped[2:].strip()
if rest and ":" in rest:
k, v = rest.split(":", 1)
current[k.strip()] = v.strip().strip("'\"")
continue
if ":" in stripped and current is not None:
k, v = stripped.split(":", 1)
current[k.strip()] = v.strip().strip("'\"")
if current is not None:
files.append(current)
if files:
result["files"] = files
return result
def parse_manifest(manifest_path: str) -> Manifest:
    """Parse a minimal manifest file and return a Manifest dataclass.

    PyYAML (``yaml.safe_load``) is tried first; on any failure the tiny
    fallback parser is used instead, which recognises a top-level
    ``files:`` list of simple mappings.

    Each entry of ``files`` is normalised:
    - a non-mapping entry becomes ``{"path": str(entry)}``
    - a mapping with ``file_path`` but no ``path`` is copied with the key
      renamed to ``path`` (the loaded mapping itself is not modified)

    A ``files`` value that is not a list is treated as a single entry.

    Raises:
        OSError: the manifest cannot be opened by the fallback parser.
    """
    # Best effort by design: any failure on the PyYAML path (library not
    # installed, malformed YAML) falls through to the fallback parser.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    # Be defensive: ensure we always operate on a mapping.
    if not isinstance(manifest, dict):
        manifest = {}

    files = manifest.get("files") or []
    # A lone scalar or mapping is one entry; iterating it directly would
    # yield characters or keys instead.
    if not isinstance(files, (list, tuple)):
        files = [files]

    normalized: List[Dict[str, Any]] = []
    for entry in files:
        if not isinstance(entry, dict):
            # Coerce simple scalar entries.
            entry = {"path": str(entry)}
        elif "file_path" in entry and "path" not in entry:
            # Prefer `path`; rename on a copy so the input stays untouched.
            entry = dict(entry)
            entry["path"] = entry.pop("file_path")
        normalized.append(entry)
    return Manifest(files=normalized)
# Names exported by `from ... import *`; the `_load_yaml_*` helpers are
# deliberately left out.
__all__ = ["Manifest", "Constraint", "EvidencePointer", "parse_manifest"]

@ -0,0 +1,45 @@
import os
import tempfile
from pathlib import Path
import pytest
from src.validators.mindmodel_validator import validate_manifest
def _write_temp_manifest(contents: str) -> str:
fd, path = tempfile.mkstemp(prefix="manifest_", suffix=".yaml")
os.close(fd)
with open(path, "w", encoding="utf-8") as f:
f.write(contents)
return path
def test_validator_reports_missing_file(tmp_path):
    """A path listed in the manifest that does not exist is reported."""
    # manifest referencing a non-existent file
    # tmp_path is a fresh directory, so this file cannot exist.
    missing = str(tmp_path / "no_such_file.txt")
    manifest = f"""
files:
- path: {missing}
"""
    mpath = _write_temp_manifest(manifest)
    try:
        report = validate_manifest(mpath)
        assert "missing_files" in report
        assert missing in report["missing_files"]
    finally:
        # The manifest lives outside tmp_path, so remove it explicitly.
        Path(mpath).unlink()
def test_validator_detects_potential_secret(tmp_path):
    """Evidence containing PASSWORD is listed under potential_secrets."""
    # manifest with evidence_excerpt containing PASSWORD
    # NOTE(review): tmp_path is requested but never used in this test.
    evidence = "This shows a PASSWORD=hunter2 in the output"
    manifest = f'files:\n - path: some_file.txt\n evidence_excerpt: "{evidence}"\n'
    mpath = _write_temp_manifest(manifest)
    try:
        report = validate_manifest(mpath)
        assert "potential_secrets" in report
        items = report["potential_secrets"]
        # Substring match, so the check passes whether or not the parser
        # strips the surrounding quotes from the value.
        assert any(evidence in (item.get("evidence_excerpt") or "") for item in items)
    finally:
        # The manifest is created with mkstemp, so remove it explicitly.
        Path(mpath).unlink()

@ -0,0 +1,24 @@
import os
from pathlib import Path
import pytest
from src.validators.types import parse_manifest, Manifest
def test_manifest_model_parses_sample(tmp_path: Path):
    """parse_manifest returns a Manifest and maps file_path to path."""
    # Two entries: one keyed by `path`, one by `file_path`.
    sample = """
files:
- path: data/file1.txt
evidence_excerpt: "some evidence"
- file_path: data/file2.txt
evidence_excerpt: "other evidence"
"""
    p = tmp_path / "manifest.yaml"
    p.write_text(sample, encoding="utf-8")
    manifest = parse_manifest(str(p))
    assert isinstance(manifest, Manifest)
    assert len(manifest.files) == 2
    assert manifest.files[0]["path"] == "data/file1.txt"
    # The second entry used `file_path`; it must be exposed as `path`.
    assert manifest.files[1]["path"] == "data/file2.txt"
Loading…
Cancel
Save