# motief/src/validators/mindmodel_validator.py

"""Conservative, report-only mindmodel/manifest validator.

This module provides a small validator that reads a manifest (YAML if
PyYAML is available, otherwise a tiny fallback parser) and reports
potential issues without making changes.

The returned report contains the keys:
- missing_files: list of file paths referenced in the manifest that don't exist
- truncated_evidence: list of items (dicts) where evidence_excerpt appears truncated
- potential_secrets: list of items (dicts) where evidence_excerpt looks like it may contain secrets

The manifest is expected to contain a top-level `files` list with
entries that are mappings and have at least a `path` (or `file_path`)
and optionally `evidence_excerpt`.
"""

from __future__ import annotations

import os
from typing import List, Dict, Any


def _load_yaml_native(path: str) -> Dict[str, Any]:
    """Load the manifest at *path* with PyYAML and return its top-level mapping.

    PyYAML is imported inside the function so that a missing installation
    surfaces as ImportError at call time rather than when this module is
    imported.

    Parameters
    - path: path to the YAML manifest file

    Returns the parsed mapping. An empty document, or a document whose top
    level is not a mapping (for example a bare list or a scalar), yields
    ``{}`` so the declared return type always holds.

    Raises ImportError when PyYAML is not installed. Errors from opening or
    decoding the file and PyYAML parse errors propagate unchanged.
    """
    import yaml  # type: ignore

    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load may return None, a list or a scalar; only a mapping is usable.
    return data if isinstance(data, dict) else {}

def _load_yaml_fallback(path: str) -> Dict[str, Any]:
    """Tiny YAML-ish fallback parser that understands a minimal manifest.

    It only supports a top-level `files:` key followed by a sequence whose
    items are either simple mappings (`- key: value` plus indented
    `key: value` lines) or plain scalars (`- some/path`). Nested structures,
    flow style (`{...}`, `[...]`), block scalars and inline comments are not
    understood. This is intentionally conservative and fragile; it's only
    meant for use when PyYAML is not available.

    Behaviour worth knowing:
    - Surrounding single or double quotes are removed from values, so a
      quoted scalar comes back as the same string PyYAML would produce.
    - An unindented `key:` line is a top-level key. It ends the item being
      built, and items are collected only while the most recent top-level
      key is `files:`. Items appearing before any top-level key are still
      collected, for leniency with header-less lists.
    - Scalar items are returned as plain strings in the list.

    Parameters
    - path: path to the manifest file (read as UTF-8)

    Returns ``{"files": [...]}`` when at least one item was collected,
    otherwise ``{}``.
    """

    def _unquote(value: str) -> str:
        # Trim whitespace and one pair of matching surrounding quotes.
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            return value[1:-1]
        return value

    result: Dict[str, Any] = {}
    files: List[Any] = []
    current: Dict[str, Any] | None = None
    # True while list items belong to `files`; starts True so a list with no
    # preceding top-level key is still picked up.
    in_files = True

    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            # Drop the newline and a possible UTF-8 byte-order mark.
            line = raw.rstrip("\n").lstrip("\ufeff")
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            if stripped in ("---", "..."):
                # YAML document markers carry no data.
                continue

            is_item = stripped == "-" or stripped.startswith("- ")

            if not line[0].isspace() and not is_item:
                if ":" in stripped:
                    # Top-level key: close the open item so this key is not
                    # merged into it, then note whether we are inside `files`.
                    if current is not None:
                        files.append(current)
                        current = None
                    in_files = stripped.startswith("files:")
                continue

            if not in_files:
                continue

            if is_item:
                # A new item always closes the previous one.
                if current is not None:
                    files.append(current)
                    current = None
                rest = stripped[1:].strip()
                if not rest:
                    # Bare dash: keys follow on the next indented lines.
                    current = {}
                elif ":" in rest:
                    # Inline first key: `- path: something`.
                    k, v = rest.split(":", 1)
                    current = {k.strip(): _unquote(v)}
                else:
                    # Scalar item: `- some/path`.
                    files.append(_unquote(rest))
                continue

            # Indented `key: value` line belonging to the open item.
            if ":" in stripped and current is not None:
                k, v = stripped.split(":", 1)
                current[k.strip()] = _unquote(v)

    if current is not None:
        files.append(current)
    if files:
        result["files"] = files
    return result


def _normalize_entry(entry: Any) -> Dict[str, Any]:
    """Return *entry* as a mapping that uses the `path` key.

    - A non-mapping entry is wrapped as ``{"path": str(entry)}``.
    - A mapping that has `file_path` but no `path` is shallow-copied and the
      `file_path` key is renamed to `path`; the input is left untouched.
    - Any other mapping is returned as-is (the same object).
    """
    if isinstance(entry, dict):
        needs_rename = "file_path" in entry and "path" not in entry
        if not needs_rename:
            return entry
        # Work on a copy so the caller's mapping keeps its `file_path` key.
        renamed = dict(entry)
        renamed["path"] = renamed.pop("file_path")
        return renamed
    return {"path": str(entry)}


def validate_manifest(manifest_path: str, report_only: bool = True) -> dict:
    """Validate a minimal mindmodel manifest and return a report.

    Nothing is modified; the function only reads the manifest and checks
    whether the referenced paths exist.

    Parameters
    - manifest_path: path to the YAML manifest file
    - report_only: unused flag for now; kept to emphasise this is report-only

    Returns a dict with keys: missing_files, truncated_evidence, potential_secrets
    - missing_files: paths from the manifest for which os.path.exists is
      false (relative paths are resolved against the current working
      directory)
    - truncated_evidence: {"path", "evidence_excerpt"} dicts whose evidence
      is longer than 1000 characters or ends with an ellipsis
    - potential_secrets: {"path", "evidence_excerpt"} dicts whose evidence
      mentions PASSWORD, SECRET or PRIVATE KEY (case-insensitive)

    A manifest whose top level is not a mapping, or whose `files` value is
    not a list, is treated as having no files.

    Raises FileNotFoundError when manifest_path does not exist.
    """
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(manifest_path)

    # Attempt to use PyYAML; on any failure (typically ImportError because
    # PyYAML is not installed) fall back to the minimal built-in parser.
    # The broad catch is deliberate best-effort for a report-only tool.
    try:
        manifest = _load_yaml_native(manifest_path)
    except Exception:
        manifest = _load_yaml_fallback(manifest_path)

    report = {"missing_files": [], "truncated_evidence": [], "potential_secrets": []}

    # Guard against manifests that parse to a list/scalar, and against a
    # `files` value that is not a sequence (iterating a string would yield
    # single characters as bogus paths).
    files = manifest.get("files") if isinstance(manifest, dict) else None
    if not isinstance(files, (list, tuple)):
        return report

    def _strip_surrounding_quotes(s: str) -> str:
        s = s.strip()
        if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
            return s[1:-1]
        return s

    for raw in files:
        entry = _normalize_entry(raw)

        path = entry.get("path")
        if isinstance(path, str):
            # The fallback parser may leave quotes around values; a quoted
            # path would otherwise always be reported as missing.
            path = _strip_surrounding_quotes(path)
        elif path:
            # os.path.exists treats an int as an open file descriptor, so a
            # numeric path must be checked as text.
            path = str(path)

        evidence = entry.get("evidence_excerpt") or entry.get("evidence") or ""
        # Remove surrounding quotes if the fallback YAML parser left them in place
        if isinstance(evidence, str):
            evidence = _strip_surrounding_quotes(evidence)

        # missing files
        if path and not os.path.exists(path):
            report["missing_files"].append(path)

        # Heuristics only apply to textual evidence.
        if not isinstance(evidence, str):
            continue

        # truncated evidence heuristics: very long, or ends with an ellipsis
        # (three dots or the single-character form)
        if len(evidence) > 1000 or evidence.strip().endswith(("...", "\u2026")):
            report["truncated_evidence"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

        # potential secrets heuristics; "PRIVATE KEY" also covers PEM headers
        # such as "BEGIN RSA PRIVATE KEY" and "BEGIN OPENSSH PRIVATE KEY"
        up = evidence.upper()
        if "PASSWORD" in up or "SECRET" in up or "PRIVATE KEY" in up:
            report["potential_secrets"].append(
                {"path": path, "evidence_excerpt": evidence}
            )

    return report