#!/usr/bin/env python3
"""
Build a complete, structured dataset of UK Competition and Markets Authority
(CMA) cases from the public GOV.UK Search API.

Demonstration extraction by UK Data Services. The source is public and the
method is reproducible: anyone can re-run this script and get the same dataset.
Completeness is proven by reconciling the number of records collected against
the total the source itself reports.

Usage:  python3 build_cma_dataset.py
Outputs (written next to this script):
  cma-cases.csv        the dataset, one row per case
  cma-cases.json       the same records as structured JSON
  provenance.json      source, capture time, and the completeness check
"""
import csv
import json
import time
import urllib.request
import urllib.parse
from datetime import datetime, timezone
from collections import Counter

# Search endpoint that fetch() sends every request to.
API = "https://www.gov.uk/api/search.json"
# Value sent as the filter_content_store_document_type query parameter.
DOCTYPE = "cma_case"
PAGE = 1000          # records per request (passed as the "count" parameter)
PAUSE = 1.0          # seconds between requests, to stay polite
# User-Agent header value sent with every request.
UA = ("UKDataServices-demo/1.0 (+https://ukdataservices.co.uk; "
      "reproducible CMA case dataset)")
# Result fields requested from the API; joined with commas into the "fields"
# query parameter.
FIELDS = ["title", "link", "description", "case_type", "case_state",
          "market_sector", "outcome_type", "opened_date", "closed_date",
          "public_timestamp"]


def fetch(start, count):
    """Request one page of search results and return the decoded JSON body.

    ``start`` is the offset of the first result wanted and ``count`` is the
    number of results asked for. Both are passed through as query parameters
    together with the document-type filter and the list of fields to return.
    Network and JSON decoding errors propagate to the caller.
    """
    query = urllib.parse.urlencode({
        "filter_content_store_document_type": DOCTYPE,
        "fields": ",".join(FIELDS),
        "count": count,
        "start": start,
    })
    request = urllib.request.Request(
        f"{API}?{query}",
        headers={"User-Agent": UA},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.load(response)
    return payload


def first(value):
    """Flatten a field value from the API into a single cell value.

    GOV.UK returns several facet fields as lists (usually with one item).
    A list is rendered as its items joined with ";", ``None`` becomes the
    empty string, and any other value is returned unchanged.
    """
    if value is None:
        return ""
    if not isinstance(value, list):
        return value
    return ";".join(map(str, value))


def main():
    """Download every CMA case and write the dataset plus its provenance.

    Steps:
      1. Ask the API for the total number of matching records.
      2. Page through the results, normalising each one into a flat record
         keyed by its link so repeated results are stored only once.
      3. Write ``cma-cases.csv`` and ``cma-cases.json`` (sorted by opened
         date, then case id) and ``provenance.json`` (source, capture time,
         completeness check and summary counts).

    All three files are written to the directory containing this script,
    regardless of the current working directory. Network, JSON and file
    errors are not caught and propagate to the caller.
    """
    from pathlib import Path

    # Resolve outputs against the script location so the files land "next to
    # this script" even when it is invoked from another directory.
    out_dir = Path(__file__).resolve().parent

    captured_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # A zero-count request returns only the metadata, including the total.
    head = fetch(0, 0)
    total_reported = head["total"]
    print(f"Source reports {total_reported} CMA cases")

    records = {}
    start = 0
    while start < total_reported:
        page = fetch(start, PAGE)
        got = page["results"]
        if not got:
            # The source ran out of results early; the reconciliation below
            # will report the shortfall.
            break
        for r in got:
            link = r.get("link", "")
            case_id = link.replace("/cma-cases/", "").strip("/")
            # Keyed by link: a result seen twice overwrites itself.
            records[link] = {
                "case_id": case_id,
                "title": (r.get("title") or "").strip(),
                "case_type": first(r.get("case_type")),
                "case_state": first(r.get("case_state")),
                "outcome_type": first(r.get("outcome_type")),
                "market_sector": first(r.get("market_sector")),
                "opened_date": r.get("opened_date") or "",
                "closed_date": r.get("closed_date") or "",
                "last_updated": r.get("public_timestamp") or "",
                "url": "https://www.gov.uk" + link,
                "description": (r.get("description") or "").replace("\n", " ").strip(),
            }
        print(f"  collected {len(records)} / {total_reported}")
        # Advance by what was actually returned, not by what was requested:
        # if the API serves a shorter page than PAGE, stepping by PAGE would
        # silently skip the records in between.
        start += len(got)
        time.sleep(PAUSE)

    rows = sorted(records.values(),
                  key=lambda x: (x["opened_date"] or "", x["case_id"]))
    total_collected = len(rows)
    # Completeness check: unique records collected vs. the source's own total.
    reconciled = total_collected == total_reported

    cols = ["case_id", "title", "case_type", "case_state", "outcome_type",
            "market_sector", "opened_date", "closed_date", "last_updated",
            "url", "description"]
    with open(out_dir / "cma-cases.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        w.writerows(rows)

    with open(out_dir / "cma-cases.json", "w", encoding="utf-8") as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)

    by_state = Counter(r["case_state"] for r in rows)
    by_type = Counter(r["case_type"] for r in rows)
    # Year is the first four characters of opened_date; undated rows are skipped.
    by_year = Counter((r["opened_date"] or "")[:4] for r in rows if r["opened_date"])

    data_quality = {
        "note": ("Field gaps below reflect the source, not the extraction. "
                 "Many pre-2014 records were inherited from the OFT and the "
                 "Competition Commission and never carried these fields."),
        "missing_opened_date": sum(1 for r in rows if not r["opened_date"]),
        "closed_without_outcome_type": sum(
            1 for r in rows if r["case_state"] == "closed" and not r["outcome_type"]),
        "with_outcome_type": sum(1 for r in rows if r["outcome_type"]),
        "distinct_outcome_types": len(
            {r["outcome_type"] for r in rows if r["outcome_type"]}),
    }

    provenance = {
        "dataset": "UK Competition and Markets Authority (CMA) cases",
        "source_name": "GOV.UK Search API",
        "source_url": "https://www.gov.uk/cma-cases",
        "api_endpoint": API,
        "filter": {"content_store_document_type": DOCTYPE},
        "captured_at_utc": captured_at,
        "total_reported_by_source": total_reported,
        "total_collected": total_collected,
        "completeness_reconciled": reconciled,
        "data_quality": data_quality,
        "breakdown_by_state": dict(by_state.most_common()),
        "breakdown_by_type": dict(by_type.most_common()),
        "opened_by_year": dict(sorted(by_year.items())),
    }
    with open(out_dir / "provenance.json", "w", encoding="utf-8") as f:
        json.dump(provenance, f, indent=2, ensure_ascii=False)

    print(f"\nCollected {total_collected} unique cases")
    print(f"Completeness check: "
          f"{'PASS' if reconciled else 'FAIL'} "
          f"(collected {total_collected} vs reported {total_reported})")
    print("Wrote cma-cases.csv, cma-cases.json, provenance.json")


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()