parent
dbd8cc801a
commit
f091846dc8
@ -0,0 +1,133 @@ |
||||
"""Conservative, report-only mindmodel/manifest validator. |
||||
|
||||
This module provides a small validator that reads a manifest (YAML if |
||||
PyYAML is available, otherwise a tiny fallback parser) and reports |
||||
potential issues without making changes. |
||||
|
||||
The returned report contains the keys: |
||||
- missing_files: list of file paths referenced in the manifest that don't exist |
||||
- truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated |
||||
- potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets |
||||
|
||||
The manifest is expected to contain a top-level `files` list with |
||||
entries that are mappings and have at least a `path` (or `file_path`) |
||||
and optionally `evidence_excerpt` (or `evidence`). |
||||
""" |
||||
|
||||
from __future__ import annotations |
||||
|
||||
import os |
||||
from typing import List, Dict, Any |
||||
|
||||
|
||||
def _load_yaml_native(path: str) -> Dict[str, Any]:
    """Load the manifest at *path* with PyYAML's ``safe_load``.

    PyYAML is imported lazily inside the function, so a missing package
    surfaces as ``ImportError`` from this call rather than at module import
    time. Any exception (import, I/O, YAML syntax) propagates unchanged to
    the caller.

    Returns the parsed top-level mapping. An empty document, or a document
    whose top level is not a mapping (for example a bare list or scalar),
    yields an empty dict so callers can always use ``.get``.
    """
    import yaml  # type: ignore

    with open(path, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)

    # safe_load returns None for empty input and may return a list or scalar;
    # only a mapping is a usable manifest.
    return loaded if isinstance(loaded, dict) else {}
||||
|
||||
|
||||
def _load_yaml_fallback(path: str) -> Dict[str, Any]:
    """Tiny YAML-ish fallback parser that understands a minimal manifest.

    It only supports a top-level ``files:`` key followed by ``-`` list items
    made of simple ``key: value`` pairs. Indentation is not interpreted, so
    nested structures are not supported. This is intentionally conservative
    and fragile; it is only meant for when PyYAML cannot be used.

    Differences from a naive line split, chosen to match what a real YAML
    loader would produce for the same input:

    - a value wrapped in one matching pair of quotes has the quotes removed;
    - a scalar list item (``- some/file.txt``) is recorded as
      ``{"path": "some/file.txt"}`` instead of an empty mapping.

    Returns ``{"files": [...]}`` when at least one item was found, otherwise
    an empty dict.
    """

    def unquote(value: str) -> str:
        # Strip whitespace, then drop exactly one pair of matching quotes.
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            return value[1:-1]
        return value

    files: List[Dict[str, Any]] = []
    current: Dict[str, Any] | None = None

    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\r\n")
            stripped = line.lstrip()
            if not stripped or stripped.startswith("#"):
                continue
            # Only an unindented `files:` is the top-level marker.
            if stripped.startswith("files:") and line.startswith(stripped):
                continue
            if stripped.startswith("- "):
                # A dash starts a new item; flush the previous one first.
                if current is not None:
                    files.append(current)
                current = {}
                rest = stripped[2:].strip()
                key, sep, value = rest.partition(":")
                if sep:
                    # Inline first key: `- path: something`.
                    current[key.strip()] = unquote(value)
                elif rest:
                    # Bare scalar item: treat it as the entry's path.
                    current["path"] = unquote(rest)
                continue
            # Remaining `key: value` lines belong to the open item, if any.
            if current is not None:
                key, sep, value = stripped.partition(":")
                if sep:
                    current[key.strip()] = unquote(value)

    if current is not None:
        files.append(current)

    result: Dict[str, Any] = {}
    if files:
        result["files"] = files
    return result
||||
|
||||
|
||||
def _normalize_entry(entry: Any) -> Dict[str, Any]:
    """Coerce one raw ``files`` item into a mapping with a ``path`` key.

    - ``None`` (an empty list item) becomes an empty mapping, so it is not
      mistaken for a file literally named ``"None"``.
    - Any other non-mapping value is stringified and used as the path.
    - A mapping with ``file_path`` but no usable ``path`` (missing or empty)
      is copied and its ``file_path`` moved to ``path``; the caller's
      mapping is never mutated.
    - Every other mapping is returned as-is (same object).
    """
    if entry is None:
        return {}
    if not isinstance(entry, dict):
        return {"path": str(entry)}
    # Prefer an explicit, non-empty `path`; otherwise fall back to `file_path`.
    if "file_path" in entry and not entry.get("path"):
        entry = dict(entry)
        entry["path"] = entry.pop("file_path")
    return entry
||||
|
||||
|
||||
def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
    """Validate a minimal mindmodel manifest and return a report.

    Parameters
    - manifest_path: path to the YAML manifest file
    - report_only: unused flag for now; kept to emphasise this is report-only

    Returns a dict with keys:
    - missing_files: paths from the manifest for which ``os.path.exists`` is
      false (relative paths are resolved against the current working
      directory)
    - truncated_evidence: ``{"path", "evidence_excerpt"}`` items whose
      evidence is longer than 1000 characters or ends with an ellipsis
    - potential_secrets: ``{"path", "evidence_excerpt"}`` items whose
      evidence mentions PASSWORD, SECRET or a PRIVATE KEY header

    Raises FileNotFoundError if the manifest itself does not exist.
    """
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(manifest_path)

    # Best effort by design: any failure of the PyYAML route (package not
    # installed, unreadable YAML, ...) falls through to the tiny parser.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    # A manifest whose top level is not a mapping, or whose `files` is not a
    # list, has nothing to validate; report nothing instead of crashing.
    files = manifest.get("files") if isinstance(manifest, dict) else None
    if not isinstance(files, list):
        files = []

    report: Dict[str, List[Any]] = {
        "missing_files": [],
        "truncated_evidence": [],
        "potential_secrets": [],
    }

    for raw in files:
        entry = _normalize_entry(raw)
        path = entry.get("path")
        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""

        # Missing files. str() matters: os.path.exists() treats an int as an
        # open file descriptor, so a numeric YAML path would look present.
        if path and not os.path.exists(str(path)):
            report["missing_files"].append(path)

        # The remaining heuristics only make sense for textual evidence.
        if not isinstance(evidence, str):
            continue

        # Truncated evidence: very long, or ending in "..." / "…".
        if len(evidence) > 1000 or evidence.strip().endswith(("...", "\u2026")):
            report["truncated_evidence"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

        # Potential secrets. "PRIVATE KEY" also covers PEM headers such as
        # "BEGIN RSA PRIVATE KEY" and "BEGIN OPENSSH PRIVATE KEY".
        up = evidence.upper()
        if "PASSWORD" in up or "SECRET" in up or "PRIVATE KEY" in up:
            report["potential_secrets"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

    return report
||||
@ -0,0 +1,107 @@ |
||||
from __future__ import annotations |
||||
|
||||
from dataclasses import dataclass |
||||
from typing import Any, Dict, List |
||||
|
||||
|
||||
@dataclass
class EvidencePointer:
    """Minimal placeholder for pointing at a piece of evidence."""

    # Required; presumably the file the evidence comes from (not enforced here).
    path: str
    # Optional excerpt text; defaults to None when not supplied.
    excerpt: str | None = None
||||
|
||||
|
||||
@dataclass
class Constraint:
    """Minimal placeholder for a constraint expressed as a key/value pair."""

    # Required identifier of the constraint.
    key: str
    # Arbitrary payload; defaults to None when not supplied.
    value: Any = None
||||
|
||||
|
||||
@dataclass
class Manifest:
    """Parsed manifest: a list of per-file entry mappings."""

    # One mapping per manifest entry; required, no default.
    files: List[Dict[str, Any]]
||||
|
||||
|
||||
def _load_yaml_native(path: str) -> dict:
    """Read *path* with PyYAML's ``safe_load`` and return its top-level mapping.

    PyYAML is imported inside the function, so its absence raises
    ``ImportError`` from this call. A document that does not load as a
    mapping yields an empty dict, keeping the return type stable.
    """
    import yaml  # type: ignore

    with open(path, "r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle)

    return document if isinstance(document, dict) else {}
||||
|
||||
|
||||
def _load_yaml_fallback(path: str) -> dict:
    """Very small fallback parser for a top-level ``files:`` list.

    Recognises ``-`` list items and simple ``key: value`` lines; indentation
    is not interpreted, so nested structures are not supported. It is
    intentionally tiny and forgiving.

    - A value wrapped in one matching pair of quotes has that pair removed;
      unbalanced or mismatched quote characters are left untouched.
    - A scalar list item (``- data/file.txt``) is recorded as
      ``{"path": "data/file.txt"}``, matching how ``parse_manifest`` coerces
      scalar entries produced by a real YAML loader.

    Returns ``{"files": [...]}`` when at least one item was found, otherwise
    an empty dict.
    """

    def unquote(value: str) -> str:
        # Strip whitespace, then drop exactly one pair of matching quotes.
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            return value[1:-1]
        return value

    files: List[Dict[str, Any]] = []
    current: Dict[str, Any] | None = None

    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\r\n")
            stripped = line.lstrip()
            if not stripped or stripped.startswith("#"):
                continue
            # Only an unindented `files:` is the top-level marker.
            if stripped.startswith("files:") and line.startswith(stripped):
                continue
            if stripped.startswith("- "):
                # A dash starts a new item; flush the previous one first.
                if current is not None:
                    files.append(current)
                current = {}
                rest = stripped[2:].strip()
                key, sep, value = rest.partition(":")
                if sep:
                    current[key.strip()] = unquote(value)
                elif rest:
                    # Bare scalar item: treat it as the entry's path.
                    current["path"] = unquote(rest)
                continue
            # Remaining `key: value` lines belong to the open item, if any.
            if current is not None:
                key, sep, value = stripped.partition(":")
                if sep:
                    current[key.strip()] = unquote(value)

    if current is not None:
        files.append(current)

    result: dict = {}
    if files:
        result["files"] = files
    return result
||||
|
||||
|
||||
def parse_manifest(manifest_path: str) -> Manifest:
    """Parse a minimal manifest file and return a Manifest dataclass.

    The function will attempt to use PyYAML (yaml.safe_load) when available;
    otherwise it falls back to a tiny parser that recognises a top-level
    `files:` list and simple mappings.

    Normalisation applied to each item of `files`:
    - empty items (``None``) are skipped;
    - scalar items are coerced to ``{"path": str(item)}``;
    - `file_path` is moved to `path` when `path` is missing or empty, on a
      copy so the loaded mapping is not mutated.

    A manifest that is not a mapping, or whose `files` is not a list, yields
    a Manifest with no files.
    """
    # Best effort by design: any failure of the PyYAML route (package not
    # installed, unreadable YAML, ...) falls through to the tiny parser.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    # Be defensive: ensure we always operate on a mapping
    if not isinstance(manifest, dict):
        manifest = {}

    # Iterating a string or mapping here would yield characters or keys,
    # not entries, so anything but a list counts as "no files".
    files = manifest.get("files")
    if not isinstance(files, list):
        files = []

    normalized: List[Dict[str, Any]] = []
    for entry in files:
        if entry is None:
            # An empty list item carries no information.
            continue
        if not isinstance(entry, dict):
            # coerce simple scalar entries
            entry = {"path": str(entry)}
        # prefer a non-empty path over file_path
        if "file_path" in entry and not entry.get("path"):
            entry = dict(entry)
            entry["path"] = entry.pop("file_path")
        normalized.append(entry)

    return Manifest(files=normalized)
||||
|
||||
|
||||
__all__ = ["Manifest", "Constraint", "EvidencePointer", "parse_manifest"] |
||||
@ -0,0 +1,45 @@ |
||||
import os |
||||
import tempfile |
||||
from pathlib import Path |
||||
|
||||
import pytest |
||||
|
||||
from src.validators.mindmodel_validator import validate_manifest |
||||
|
||||
|
||||
def _write_temp_manifest(contents: str) -> str:
    """Write *contents* to a fresh ``manifest_*.yaml`` temp file; return its path.

    The file is not removed automatically; the caller is expected to delete it.
    """
    handle, manifest_path = tempfile.mkstemp(prefix="manifest_", suffix=".yaml")
    # Wrap the descriptor mkstemp already opened instead of reopening by name.
    with os.fdopen(handle, "w", encoding="utf-8") as out:
        out.write(contents)
    return manifest_path
||||
|
||||
|
||||
def test_validator_reports_missing_file(tmp_path):
    """A manifest entry whose path does not exist is listed in missing_files."""
    # tmp_path is empty, so this file is guaranteed not to exist.
    missing = str(tmp_path / "no_such_file.txt")
    manifest_text = f"""
files:
  - path: {missing}
"""
    manifest_file = _write_temp_manifest(manifest_text)
    try:
        result = validate_manifest(manifest_file)
        assert "missing_files" in result
        assert missing in result["missing_files"]
    finally:
        # The helper does not clean up after itself.
        Path(manifest_file).unlink()
||||
|
||||
|
||||
def test_validator_detects_potential_secret(tmp_path):
    """Evidence containing PASSWORD is reported under potential_secrets."""
    evidence = "This shows a PASSWORD=hunter2 in the output"
    manifest_text = (
        "files:\n"
        "  - path: some_file.txt\n"
        f'    evidence_excerpt: "{evidence}"\n'
    )
    manifest_file = _write_temp_manifest(manifest_text)
    try:
        result = validate_manifest(manifest_file)
        assert "potential_secrets" in result
        flagged = result["potential_secrets"]
        # Substring match: the fallback parser may keep the surrounding quotes.
        assert any(
            evidence in (item.get("evidence_excerpt") or "") for item in flagged
        )
    finally:
        # The helper does not clean up after itself.
        Path(manifest_file).unlink()
||||
@ -0,0 +1,24 @@ |
||||
import os |
||||
from pathlib import Path |
||||
|
||||
import pytest |
||||
|
||||
from src.validators.types import parse_manifest, Manifest |
||||
|
||||
|
||||
def test_manifest_model_parses_sample(tmp_path: Path):
    """parse_manifest returns a Manifest and maps file_path onto path."""
    sample = """
files:
  - path: data/file1.txt
    evidence_excerpt: "some evidence"
  - file_path: data/file2.txt
    evidence_excerpt: "other evidence"
"""
    manifest_file = tmp_path / "manifest.yaml"
    manifest_file.write_text(sample, encoding="utf-8")

    parsed = parse_manifest(str(manifest_file))

    assert isinstance(parsed, Manifest)
    assert len(parsed.files) == 2
    first, second = parsed.files
    assert first["path"] == "data/file1.txt"
    # The second entry used `file_path`; it must surface as `path`.
    assert second["path"] == "data/file2.txt"
||||
Loading…
Reference in new issue