#!/usr/bin/env python3
"""Prompt Provenance Tracking — Post-Commit Hook

Records which backlog item / session led to which file changes and commit.
Appends a JSONL record to .claude/provenance/provenance.jsonl after each commit.

Usage:
  python3 scripts/docs/provenance_log.py              # Log latest commit
  python3 scripts/docs/provenance_log.py --commit abc1234
  python3 scripts/docs/provenance_log.py --query SR-2026-02-28-001
  python3 scripts/docs/provenance_log.py --stats
"""

import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

# Repository root: two levels above the directory containing this script
# (the usage examples place the script at scripts/docs/provenance_log.py).
ROOT = Path(__file__).resolve().parents[2]
# Directory and JSONL file that provenance records are appended to.
PROVENANCE_DIR = ROOT / ".claude" / "provenance"
PROVENANCE_FILE = PROVENANCE_DIR / "provenance.jsonl"
# Backlog document scanned for an SR id when the commit message carries none.
BACKLOG_FILE = ROOT / "BACKLOG.md"


def get_commit_info(commit_hash: str = "HEAD") -> dict | None:
    """Look up the SHA, subject, author name, and author date of one commit.

    Runs ``git log -1`` from the repository root.

    Args:
        commit_hash: Revision to inspect; defaults to ``HEAD``.

    Returns:
        A dict with the keys ``commit_sha``, ``message``, ``author`` and
        ``date``, or ``None`` when git exits non-zero or prints fewer than
        four lines.
    """
    field_names = ("commit_sha", "message", "author", "date")
    # One placeholder per output line: %H sha, %s subject, %an author, %ai date.
    proc = subprocess.run(
        ["git", "log", "-1", "--format=%H%n%s%n%an%n%ai", commit_hash],
        capture_output=True,
        text=True,
        cwd=ROOT,
    )
    if proc.returncode != 0:
        return None

    values = proc.stdout.strip().split("\n")
    if len(values) < len(field_names):
        return None

    # zip stops after the four expected fields, so extra lines are ignored.
    return dict(zip(field_names, values))


def get_commit_files(commit_hash: str = "HEAD") -> list[str]:
    """Return the paths changed by a commit, as printed by ``git diff-tree``.

    The command runs from the repository root. ``--root`` is passed so that a
    parentless (initial) commit is shown as a creation of all its files;
    without it diff-tree prints nothing for such a commit and the record
    would claim zero files changed.

    Args:
        commit_hash: Revision to inspect; defaults to ``HEAD``.

    Returns:
        One entry per changed path, in git's output order. Empty when the
        commit lists no files or when git exits with an error.
    """
    result = subprocess.run(
        [
            "git", "diff-tree", "--root", "--no-commit-id",
            "--name-only", "-r", commit_hash,
        ],
        capture_output=True, text=True, cwd=ROOT,
    )
    # A failed lookup (e.g. unknown revision) yields no file list at all.
    if result.returncode != 0:
        return []
    return [line for line in result.stdout.split("\n") if line.strip()]


def get_active_sr_id() -> str | None:
    if not BACKLOG_FILE.exists():
        return None
    content = BACKLOG_FILE.read_text(encoding="utf-8")
    match = re.search(r'\[in_progress\s*\].*?(SR-[\d-]+)', content)
    if match:
        return match.group(1)
    promoted = re.findall(r'\[promoted\s*\].*?(SR-[\d-]+)', content)
    return promoted[0] if promoted else None


# Path fragments that assign a changed file to a domain. Domains are checked
# in this order and the first one with a matching fragment wins.
_DOMAIN_MARKERS = (
    ("backend", ("backend/", "app/", "api/", "server/")),
    ("frontend", ("frontend/", "src/components/", "src/pages/")),
    ("docs", ("docs/",)),
    ("infra", ("scripts/",)),
    ("pipeline", (".claude/",)),
)

# An SR id is "SR-" followed by hyphen-separated digit groups; a trailing
# hyphen (as in "SR-2026-02-28-001-fix") is not part of the id.
_SR_ID_PATTERN = re.compile(r"SR-\d+(?:-\d+)*")


def _classify_domain(path: str) -> str | None:
    """Return the first domain whose marker occurs in *path*, else ``None``."""
    # NOTE(review): this is a plain substring test, so "webapp/x.py" matches
    # "app/" and "scripts/docs/x.py" is classified as "docs" - confirm that
    # this looseness is intended before tightening it.
    for domain, markers in _DOMAIN_MARKERS:
        if any(marker in path for marker in markers):
            return domain
    return None


def log_provenance(commit_hash: str = "HEAD") -> dict | None:
    """Append a provenance record for a commit to the JSONL log.

    The SR id is taken from the commit subject when it contains one;
    otherwise the result of ``get_active_sr_id()`` is used (which may be
    ``None``). Each changed file contributes at most one domain.

    Args:
        commit_hash: Revision to record; defaults to ``HEAD``.

    Returns:
        The record that was written, or ``None`` when the commit metadata
        could not be read (nothing is written in that case).
    """
    info = get_commit_info(commit_hash)
    if not info:
        return None

    files = get_commit_files(commit_hash)

    found = _SR_ID_PATTERN.search(info["message"])
    sr_id = found.group(0) if found else get_active_sr_id()

    domains = {
        domain
        for domain in map(_classify_domain, files)
        if domain is not None
    }

    record = {
        # Time of logging, not the commit's own date.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "sr_id": sr_id,
        "commit_sha": info["commit_sha"][:12],
        "message": info["message"][:120],
        "author": info["author"],
        "files_changed": files,
        "file_count": len(files),
        "domains": sorted(domains),
    }

    PROVENANCE_DIR.mkdir(parents=True, exist_ok=True)
    with open(PROVENANCE_FILE, "a", encoding="utf-8") as log:
        log.write(json.dumps(record, ensure_ascii=False) + "\n")

    return record


def query_by_sr_id(sr_id: str) -> list[dict]:
    if not PROVENANCE_FILE.exists():
        return []
    records = []
    for line in PROVENANCE_FILE.read_text(encoding="utf-8").strip().split("\n"):
        if not line.strip():
            continue
        try:
            r = json.loads(line)
            if r.get("sr_id") == sr_id:
                records.append(r)
        except json.JSONDecodeError:
            continue
    return records


def show_stats() -> None:
    if not PROVENANCE_FILE.exists():
        print("No provenance data yet.")
        return
    records = []
    for line in PROVENANCE_FILE.read_text(encoding="utf-8").strip().split("\n"):
        if not line.strip():
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            continue
    if not records:
        print("No provenance records found.")
        return
    sr_ids = set(r.get("sr_id") for r in records if r.get("sr_id"))
    total_files = sum(r.get("file_count", 0) for r in records)
    domains = set()
    for r in records:
        domains.update(r.get("domains", []))
    print(f"Provenance Stats:")
    print(f"  Total commits tracked: {len(records)}")
    print(f"  Unique backlog items: {len(sr_ids)}")
    print(f"  Total files changed: {total_files}")
    print(f"  Domains touched: {', '.join(sorted(domains))}")
    print(f"  First record: {records[0].get('timestamp', '?')[:10]}")
    print(f"  Last record: {records[-1].get('timestamp', '?')[:10]}")


def main() -> int:
    """Command-line entry point; dispatches on flags found in ``sys.argv``.

    Modes, checked in this order:
      * ``--query SR-ID``: print the records logged for that SR id.
      * ``--stats``: print summary statistics.
      * otherwise: log the commit given by ``--commit REV`` (default HEAD).

    Returns:
        1 when ``--query`` or ``--commit`` is given without a value,
        0 otherwise.
    """
    args = sys.argv

    if "--query" in args:
        idx = args.index("--query")
        if len(args) <= idx + 1:
            print("Usage: --query SR-ID", file=sys.stderr)
            return 1
        wanted = args[idx + 1]
        records = query_by_sr_id(wanted)
        if not records:
            print(f"No records for {wanted}")
            return 0
        for r in records:
            # Use .get() throughout so a partial record cannot abort the listing.
            day = str(r.get("timestamp") or "?")[:10]
            print(f"  {r.get('commit_sha', '?')} ({day}): {r.get('message', '')}")
            # Only the first five paths are shown to keep the output short.
            print(f"    Files: {', '.join((r.get('files_changed') or [])[:5])}")
        return 0

    if "--stats" in args:
        show_stats()
        return 0

    commit_hash = "HEAD"
    if "--commit" in args:
        idx = args.index("--commit")
        # A dangling --commit is a usage error, as for --query, rather than
        # a silent fallback to HEAD.
        if len(args) <= idx + 1:
            print("Usage: --commit COMMIT", file=sys.stderr)
            return 1
        commit_hash = args[idx + 1]

    record = log_provenance(commit_hash)
    if record is None:
        # Exit status stays 0 as before; only the silence is removed.
        print(
            f"Provenance not logged: could not read commit {commit_hash}",
            file=sys.stderr,
        )
        return 0

    # The "sr_id" key is always present, possibly as None, so a .get()
    # default would never apply; fall back explicitly.
    print(
        f"Provenance logged: {record['commit_sha']} "
        f"(SR: {record.get('sr_id') or 'none'}, "
        f"{record['file_count']} files, "
        f"domains: {', '.join(record['domains'])})",
        file=sys.stderr,
    )
    return 0


# Run the CLI only when executed as a script (not on import); the integer
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
