From 8c82be95578832313eee3144a840d0d38cb3cf19 Mon Sep 17 00:00:00 2001
From: Raymond Yee
Date: Fri, 17 Apr 2026 12:42:48 -0700
Subject: [PATCH] Mirror Eric's OC PQG to R2 with immutable cache + drift-check
 script

Eric Kansa's OpenContext PQG files (the ones with 48K populated
thumbnails, see #131) were only served from his GCS bucket. Mirrored to
R2 under oc_pqg/ with date-versioned filenames + per-file manifests + a
latest.json pointer so we:

1. Have a stable source-of-truth input for the PQG pipeline rebuild.
2. Can detect drift when Eric re-uploads.
3. Get free Cloudflare edge caching via the existing Worker.

Worker change: expand the immutable Cache-Control regex from a single
isamples_YYYYMM_* pattern to an array that also covers
oc_pqg/oc_isamples_pqg*_YYYYMMDD.parquet. Non-versioned files under
oc_pqg/ (manifests, latest.json) fall through to the 5-minute default.

scripts/check_oc_pqg_drift.py fetches latest.json + per-file manifests
from R2, HEADs GCS, and compares etags. Exit 0 = in sync, 1 = drift,
2 = probe failure. Run manually for now; later wire to GitHub Actions
cron.

Mirror contents (2026-04-17):
  oc_pqg/oc_isamples_pqg_20251107.parquet       (727 MB, narrow)
  oc_pqg/oc_isamples_pqg_wide_20251116.parquet  (289 MB, wide)
  oc_pqg/*.manifest.json                        (per-file provenance)
  oc_pqg/latest.json                            (flavor -> current version)

Verified live: cache-control on the parquets is public,
max-age=31536000, immutable. Drift check passes.

Co-Authored-By: Claude Opus 4.7
---
Review note: scripts/check_oc_pqg_drift.py was revised in review. Hunk
counts and the diffstat are updated; the blob id on that file's "index"
line still names the pre-review content.

 scripts/check_oc_pqg_drift.py          | 142 ++++++++++++++++++++++++++
 workers/data-isamples-org/src/index.js |  12 +++-
 2 files changed, 152 insertions(+), 2 deletions(-)
 create mode 100755 scripts/check_oc_pqg_drift.py

diff --git a/scripts/check_oc_pqg_drift.py b/scripts/check_oc_pqg_drift.py
new file mode 100755
index 0000000..627fdb8
--- /dev/null
+++ b/scripts/check_oc_pqg_drift.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""Check whether Eric Kansa's OC PQG files on GCS have drifted from our R2 mirror.
+
+Reads our latest.json + the per-file manifests from data.isamples.org/oc_pqg/,
+HEADs the GCS source, and reports whether upstream has a newer version.
+
+Exit codes:
+  0 — in sync, no drift
+  1 — drift detected (GCS has a different etag from what we've mirrored)
+  2 — probe failure (network error, malformed response, etc.)
+
+Run manually for now:
+    python scripts/check_oc_pqg_drift.py
+
+Later: wire to GitHub Actions cron.
+"""
+import json
+import sys
+import urllib.request
+
+R2_BASE = "https://data.isamples.org/"
+LATEST_URL = "https://data.isamples.org/oc_pqg/latest.json"
+GCS_BASE = "https://storage.googleapis.com/opencontext-parquet/"
+GCS_FILES = {
+    "narrow": "oc_isamples_pqg.parquet",
+    "wide": "oc_isamples_pqg_wide.parquet",
+}
+USER_AGENT = "isamples-oc-drift-check/1.0"
+
+
+def fetch_json(url, timeout=20):
+    """GET *url* and return its body parsed as JSON.
+
+    Network/HTTP errors from urllib and JSON decode errors propagate to the
+    caller, which maps them to the probe-failure exit code.
+    """
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        return json.loads(r.read())
+
+
+def head(url, timeout=20):
+    """Send a HEAD request to *url* and return the response headers as a dict.
+
+    Keys keep the case the server sent; use get_header() to look one up
+    without depending on that case.
+    """
+    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": USER_AGENT})
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        return dict(r.headers)
+
+
+def get_header(headers, name, default=""):
+    """Return header *name* from *headers*, matching the name case-insensitively."""
+    wanted = name.lower()
+    for key, value in headers.items():
+        if key.lower() == wanted:
+            return value
+    return default
+
+
+def normalize_etag(value):
+    """Reduce an ETag to its bare opaque tag so both sides compare alike.
+
+    Strips surrounding whitespace, a weak-validator ``W/`` prefix and the
+    enclosing double quotes. Anything that is not a string becomes "".
+    """
+    if not isinstance(value, str):
+        return ""
+    tag = value.strip()
+    if tag.startswith("W/"):
+        tag = tag[2:]
+    return tag.strip('"')
+
+
+def main() -> int:
+    """Compare every mirrored flavor against GCS and return the exit code.
+
+    Returns 0 when each GCS etag matches the mirrored one, 1 when at least
+    one differs, and 2 as soon as a probe fails or yields unusable data.
+    """
+    try:
+        latest = fetch_json(LATEST_URL)
+    except Exception as e:
+        print(f"ERROR: could not fetch {LATEST_URL}: {e}", file=sys.stderr)
+        return 2
+    if not isinstance(latest, dict):
+        print(f"ERROR: {LATEST_URL} is not a JSON object", file=sys.stderr)
+        return 2
+
+    drift_any = False
+    for flavor, gcs_name in GCS_FILES.items():
+        flavor_ptr = latest.get(flavor)
+        if not flavor_ptr:
+            print(f"ERROR: latest.json has no entry for {flavor!r}", file=sys.stderr)
+            return 2
+        manifest_key = flavor_ptr.get("manifest") if isinstance(flavor_ptr, dict) else None
+        if not manifest_key:
+            print(f"ERROR: latest.json entry for {flavor!r} names no manifest", file=sys.stderr)
+            return 2
+
+        try:
+            manifest = fetch_json(f"{R2_BASE}{manifest_key}")
+        except Exception as e:
+            print(f"ERROR: could not fetch manifest for {flavor}: {e}", file=sys.stderr)
+            return 2
+        if not isinstance(manifest, dict):
+            print(f"ERROR: manifest for {flavor} is not a JSON object", file=sys.stderr)
+            return 2
+
+        gcs_url = f"{GCS_BASE}{gcs_name}"
+        try:
+            gcs_headers = head(gcs_url)
+        except Exception as e:
+            print(f"ERROR: HEAD {gcs_url}: {e}", file=sys.stderr)
+            return 2
+
+        gcs_etag = normalize_etag(get_header(gcs_headers, "ETag"))
+        gcs_last_modified = get_header(gcs_headers, "Last-Modified")
+        our_etag = normalize_etag(manifest.get("source_etag", ""))
+        our_updated = manifest.get("source_updated", "")
+
+        # Two missing etags would compare equal and be reported as "in sync",
+        # so a missing etag on either side is a failed probe, not a verdict.
+        if not gcs_etag or not our_etag:
+            print(f"ERROR: missing etag for {flavor} "
+                  f"(gcs={gcs_etag!r}, mirrored={our_etag!r})", file=sys.stderr)
+            return 2
+
+        in_sync = gcs_etag == our_etag
+        state = "in sync" if in_sync else "DRIFT"
+        print(f"[{flavor}] {state}")
+        print(f" mirrored: etag={our_etag} updated={our_updated}")
+        print(f" gcs: etag={gcs_etag} last-modified={gcs_last_modified}")
+        if not in_sync:
+            drift_any = True
+
+    return 1 if drift_any else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/workers/data-isamples-org/src/index.js b/workers/data-isamples-org/src/index.js
index e1f5088..82e4e0d 100644
--- a/workers/data-isamples-org/src/index.js
+++ b/workers/data-isamples-org/src/index.js
@@ -18,7 +18,15 @@
  * working.
  */
 
-const IMMUTABLE_PATTERN = /^isamples_\d{6}_.*\.parquet$/;
+// Immutable-by-filename patterns. Match files whose path fully determines
+// their contents (filename includes a version / date stamp).
+// - isamples_YYYYMM_*.parquet (monthly iSamples snapshots)
+// - oc_pqg/oc_isamples_pqg*_YYYYMMDD.parquet (mirror of Eric Kansa's
+// OpenContext PQG files — versioned by the upstream GCS updated-date)
+const IMMUTABLE_PATTERNS = [
+  /^isamples_\d{6}_.*\.parquet$/,
+  /^oc_pqg\/oc_isamples_pqg.*_\d{8}\.parquet$/,
+];
 const IMMUTABLE_MAX_AGE = 60 * 60 * 24 * 365; // 1 year
 const FALLBACK_MAX_AGE = 300; // 5 minutes
 
@@ -72,7 +80,7 @@ export default {
     for (const [k, v] of Object.entries(CORS_HEADERS)) headers.set(k, v);
 
     // Cache-Control: this is the optimization.
-    if (IMMUTABLE_PATTERN.test(key)) {
+    if (IMMUTABLE_PATTERNS.some(p => p.test(key))) {
       headers.set('Cache-Control', `public, max-age=${IMMUTABLE_MAX_AGE}, immutable`);
     } else {
       headers.set('Cache-Control', `public, max-age=${FALLBACK_MAX_AGE}`);