parent
dbd8cc801a
commit
f091846dc8
@ -0,0 +1,133 @@ |
|||||||
|
"""Conservative, report-only mindmodel/manifest validator. |
||||||
|
|
||||||
|
This module provides a small validator that reads a manifest (YAML if |
||||||
|
PyYAML is available, otherwise a tiny fallback parser) and reports |
||||||
|
potential issues without making changes. |
||||||
|
|
||||||
|
The returned report contains the keys: |
||||||
|
- missing_files: list of file paths referenced in the manifest that don't exist |
||||||
|
- truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated |
||||||
|
- potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets |
||||||
|
|
||||||
|
The manifest is expected to contain a top-level `files` list with |
||||||
|
entries that are mappings and have at least a `path` (or `file_path`) |
||||||
|
and optionally `evidence_excerpt`. |
||||||
|
""" |
||||||
|
|
||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
import os |
||||||
|
from typing import List, Dict, Any |
||||||
|
|
||||||
|
|
||||||
|
def _load_yaml_native(path: str) -> Dict[str, Any]: |
||||||
|
try: |
||||||
|
import yaml # type: ignore |
||||||
|
|
||||||
|
with open(path, "r", encoding="utf-8") as f: |
||||||
|
return yaml.safe_load(f) or {} |
||||||
|
except Exception: |
||||||
|
raise |
||||||
|
|
||||||
|
|
||||||
|
def _load_yaml_fallback(path: str) -> Dict[str, Any]: |
||||||
|
"""Tiny YAML-ish fallback parser that understands a minimal manifest. |
||||||
|
|
||||||
|
It only supports a top-level `files:` key and a sequence of simple |
||||||
|
mappings with `-` list items and `key: value` pairs indented. |
||||||
|
This is intentionally conservative and fragile; it's only used when |
||||||
|
PyYAML is not available. |
||||||
|
""" |
||||||
|
result: Dict[str, Any] = {} |
||||||
|
files: List[Dict[str, Any]] = [] |
||||||
|
current: Dict[str, Any] | None = None |
||||||
|
|
||||||
|
with open(path, "r", encoding="utf-8") as f: |
||||||
|
for raw in f: |
||||||
|
line = raw.rstrip("\n") |
||||||
|
stripped = line.lstrip() |
||||||
|
if not stripped or stripped.startswith("#"): |
||||||
|
continue |
||||||
|
if stripped.startswith("files:") and line.startswith(stripped): |
||||||
|
# top-level marker, skip |
||||||
|
continue |
||||||
|
if stripped.startswith("- "): |
||||||
|
# start new item |
||||||
|
if current is not None: |
||||||
|
files.append(current) |
||||||
|
current = {} |
||||||
|
# possible inline key: - path: something |
||||||
|
rest = stripped[2:].strip() |
||||||
|
if rest: |
||||||
|
if ":" in rest: |
||||||
|
k, v = rest.split(":", 1) |
||||||
|
current[k.strip()] = v.strip() |
||||||
|
continue |
||||||
|
# key: value lines (indented) |
||||||
|
if ":" in stripped and current is not None: |
||||||
|
k, v = stripped.split(":", 1) |
||||||
|
current[k.strip()] = v.strip() |
||||||
|
|
||||||
|
if current is not None: |
||||||
|
files.append(current) |
||||||
|
if files: |
||||||
|
result["files"] = files |
||||||
|
return result |
||||||
|
|
||||||
|
|
||||||
|
def _normalize_entry(entry: Any) -> Dict[str, Any]: |
||||||
|
if not isinstance(entry, dict): |
||||||
|
return {"path": str(entry)} |
||||||
|
# prefer path or file_path |
||||||
|
if "file_path" in entry and "path" not in entry: |
||||||
|
entry = dict(entry) |
||||||
|
entry["path"] = entry.pop("file_path") |
||||||
|
return entry |
||||||
|
|
||||||
|
|
||||||
|
def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
    """Validate a minimal mindmodel manifest and return a report.

    Nothing is modified; the function only inspects the manifest and the
    file system.

    Parameters
    - manifest_path: path to the YAML manifest file
    - report_only: unused flag for now; kept to emphasise this is report-only

    Returns a dict with three list-valued keys:
    - missing_files: entry paths for which ``os.path.exists`` is false
      (relative paths are therefore resolved against the current working
      directory)
    - truncated_evidence: ``{"path", "evidence_excerpt"}`` dicts whose
      evidence is longer than 1000 characters or ends with ``...``
    - potential_secrets: ``{"path", "evidence_excerpt"}`` dicts whose
      evidence contains ``PASSWORD``, ``SECRET`` or ``PRIVATE KEY``
      (case-insensitive)

    Raises FileNotFoundError when *manifest_path* does not exist.
    """
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(manifest_path)

    # Best effort by design: any failure on the PyYAML path (package not
    # installed, parse error, ...) falls back to the tiny built-in parser.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    files = manifest.get("files") if isinstance(manifest, dict) else None
    if isinstance(files, (str, dict)):
        # A lone scalar or mapping under ``files`` is a single entry, not a
        # sequence to iterate character by character or key by key.
        files = [files]
    elif not isinstance(files, (list, tuple)):
        files = []

    report: Dict[str, List[Any]] = {
        "missing_files": [],
        "truncated_evidence": [],
        "potential_secrets": [],
    }

    for raw in files:
        entry = _normalize_entry(raw)
        path = entry.get("path")
        if path and not isinstance(path, str):
            # os.path.exists() would treat an int as a file descriptor and
            # raise on other types; always check a string path.
            path = str(path)
        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""

        # missing files
        if path and not os.path.exists(path):
            report["missing_files"].append(path)

        # The remaining heuristics only make sense for textual evidence.
        if not isinstance(evidence, str):
            continue

        # truncated evidence heuristics
        if len(evidence) > 1000 or evidence.strip().endswith("..."):
            report["truncated_evidence"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

        # potential secrets heuristics; "PRIVATE KEY" also covers PEM
        # headers such as "BEGIN RSA PRIVATE KEY".
        up = evidence.upper()
        if "PASSWORD" in up or "SECRET" in up or "PRIVATE KEY" in up:
            report["potential_secrets"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

    return report
||||||
@ -0,0 +1,107 @@ |
|||||||
|
from __future__ import annotations |
||||||
|
|
||||||
|
from dataclasses import dataclass |
||||||
|
from typing import Any, Dict, List |
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvidencePointer:
    """Minimal placeholder for pointing at evidence.

    ``path`` is required; ``excerpt`` is optional and defaults to ``None``.
    """

    path: str
    excerpt: str | None = None
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Constraint:
    """Minimal placeholder for a constraint.

    ``key`` is required; ``value`` may be anything and defaults to ``None``.
    """

    key: str
    value: Any = None
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Manifest:
    """Parsed manifest: the list of file-entry mappings."""

    files: List[Dict[str, Any]]
||||||
|
|
||||||
|
|
||||||
|
def _load_yaml_native(path: str) -> dict: |
||||||
|
# Use PyYAML when available. If the loaded document is not a mapping, |
||||||
|
# return an empty mapping to keep the API stable for callers. |
||||||
|
import yaml # type: ignore |
||||||
|
|
||||||
|
with open(path, "r", encoding="utf-8") as f: |
||||||
|
loaded = yaml.safe_load(f) |
||||||
|
|
||||||
|
if not isinstance(loaded, dict): |
||||||
|
return {} |
||||||
|
|
||||||
|
return loaded |
||||||
|
|
||||||
|
|
||||||
|
def _load_yaml_fallback(path: str) -> dict: |
||||||
|
# very small fallback that recognises a top-level files: list and |
||||||
|
# simple key: value lines. It intentionally is tiny and forgiving. |
||||||
|
result: dict = {} |
||||||
|
files: List[Dict[str, Any]] = [] |
||||||
|
current: Dict[str, Any] | None = None |
||||||
|
|
||||||
|
with open(path, "r", encoding="utf-8") as f: |
||||||
|
for raw in f: |
||||||
|
line = raw.rstrip("\n") |
||||||
|
stripped = line.lstrip() |
||||||
|
if not stripped or stripped.startswith("#"): |
||||||
|
continue |
||||||
|
if stripped.startswith("files:") and line.startswith(stripped): |
||||||
|
continue |
||||||
|
if stripped.startswith("- "): |
||||||
|
if current is not None: |
||||||
|
files.append(current) |
||||||
|
current = {} |
||||||
|
rest = stripped[2:].strip() |
||||||
|
if rest and ":" in rest: |
||||||
|
k, v = rest.split(":", 1) |
||||||
|
current[k.strip()] = v.strip().strip("'\"") |
||||||
|
continue |
||||||
|
if ":" in stripped and current is not None: |
||||||
|
k, v = stripped.split(":", 1) |
||||||
|
current[k.strip()] = v.strip().strip("'\"") |
||||||
|
|
||||||
|
if current is not None: |
||||||
|
files.append(current) |
||||||
|
if files: |
||||||
|
result["files"] = files |
||||||
|
return result |
||||||
|
|
||||||
|
|
||||||
|
def parse_manifest(manifest_path: str) -> Manifest:
    """Parse a minimal manifest file and return a Manifest dataclass.

    The function first tries PyYAML (``yaml.safe_load``); on any failure it
    falls back to a tiny parser that recognises a top-level ``files:`` list
    and simple mappings.

    Normalisation applied to the ``files`` value:

    - a lone string or mapping is treated as a single entry,
    - any other non-list value is treated as no entries,
    - empty (``None``) list items are skipped,
    - scalar items become ``{"path": str(item)}``,
    - ``file_path`` is renamed to ``path`` (on a copy) when ``path`` is absent.
    """
    # Best effort by design: any failure on the PyYAML path falls back to
    # the built-in parser.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    # Be defensive: ensure we always operate on a mapping.
    if not isinstance(manifest, dict):
        manifest = {}

    files = manifest.get("files") or []
    if isinstance(files, (str, dict)):
        # Iterating these would yield characters / keys, not entries.
        files = [files]
    elif not isinstance(files, (list, tuple)):
        files = []

    normalized: List[Dict[str, Any]] = []
    for entry in files:
        if entry is None:
            # Empty list item; would otherwise become a path named "None".
            continue
        if not isinstance(entry, dict):
            # Coerce simple scalar entries.
            entry = {"path": str(entry)}
        elif "file_path" in entry and "path" not in entry:
            # Copy before renaming so the loaded document is not mutated.
            entry = dict(entry)
            entry["path"] = entry.pop("file_path")
        normalized.append(entry)

    return Manifest(files=normalized)
||||||
|
|
||||||
|
|
||||||
|
# Public API of this module.
__all__ = [
    "Manifest",
    "Constraint",
    "EvidencePointer",
    "parse_manifest",
]
||||||
@ -0,0 +1,45 @@ |
|||||||
|
import os |
||||||
|
import tempfile |
||||||
|
from pathlib import Path |
||||||
|
|
||||||
|
import pytest |
||||||
|
|
||||||
|
from src.validators.mindmodel_validator import validate_manifest |
||||||
|
|
||||||
|
|
||||||
|
def _write_temp_manifest(contents: str) -> str: |
||||||
|
fd, path = tempfile.mkstemp(prefix="manifest_", suffix=".yaml") |
||||||
|
os.close(fd) |
||||||
|
with open(path, "w", encoding="utf-8") as f: |
||||||
|
f.write(contents) |
||||||
|
return path |
||||||
|
|
||||||
|
|
||||||
|
def test_validator_reports_missing_file(tmp_path):
    """A manifest entry whose path does not exist is listed in ``missing_files``."""
    missing = str(tmp_path / "no_such_file.txt")

    # Write the manifest inside pytest's per-test directory: no file is
    # left in the global temp dir and no manual cleanup is needed.
    manifest_file = tmp_path / "manifest.yaml"
    manifest_file.write_text(f"files:\n  - path: {missing}\n", encoding="utf-8")

    report = validate_manifest(str(manifest_file))

    assert "missing_files" in report
    assert missing in report["missing_files"]
||||||
|
|
||||||
|
|
||||||
|
def test_validator_detects_potential_secret(tmp_path):
    """Evidence containing ``PASSWORD`` is reported under ``potential_secrets``."""
    evidence = "This shows a PASSWORD=hunter2 in the output"

    # Use an absolute path under tmp_path so the test does not depend on
    # the current working directory, and keep the manifest there as well
    # so pytest cleans it up.
    target = tmp_path / "some_file.txt"
    manifest_file = tmp_path / "manifest.yaml"
    manifest_file.write_text(
        f'files:\n  - path: {target}\n    evidence_excerpt: "{evidence}"\n',
        encoding="utf-8",
    )

    report = validate_manifest(str(manifest_file))

    assert "potential_secrets" in report
    items = report["potential_secrets"]
    assert any(evidence in (item.get("evidence_excerpt") or "") for item in items)
||||||
@ -0,0 +1,24 @@ |
|||||||
|
import os |
||||||
|
from pathlib import Path |
||||||
|
|
||||||
|
import pytest |
||||||
|
|
||||||
|
from src.validators.types import parse_manifest, Manifest |
||||||
|
|
||||||
|
|
||||||
|
def test_manifest_model_parses_sample(tmp_path: Path):
    """``parse_manifest`` returns a Manifest with ``file_path`` normalised to ``path``."""
    sample = (
        "files:\n"
        "  - path: data/file1.txt\n"
        '    evidence_excerpt: "some evidence"\n'
        "  - file_path: data/file2.txt\n"
        '    evidence_excerpt: "other evidence"\n'
    )
    p = tmp_path / "manifest.yaml"
    p.write_text(sample, encoding="utf-8")

    manifest = parse_manifest(str(p))

    assert isinstance(manifest, Manifest)
    assert len(manifest.files) == 2
    assert manifest.files[0]["path"] == "data/file1.txt"
    assert manifest.files[1]["path"] == "data/file2.txt"
    # Normalisation renames the key rather than duplicating it.
    assert "file_path" not in manifest.files[1]
    # Quoted scalars come back without their surrounding quotes.
    assert manifest.files[0]["evidence_excerpt"] == "some evidence"
    assert manifest.files[1]["evidence_excerpt"] == "other evidence"
||||||
Loading…
Reference in new issue