From 2b57482b2511af3c2dc7e3869fe33aa4f0f10a00 Mon Sep 17 00:00:00 2001
From: Raymond Yee <raymond.yee@gmail.com>
Date: Fri, 17 Apr 2026 12:52:06 -0700
Subject: [PATCH] Add /current/ alias layer + enrichment script

Two additions for stable-URL access to rotating versioned parquets:

1. Worker alias route: GET /current/<flavor>.parquet reads
   current/manifest.json from R2 and 302-redirects to the dated file it
   points to. The redirect response carries a short (5-minute)
   Cache-Control so rotation propagates quickly; the target (versioned
   file) keeps its immutable 1-year cache. DuckDB-WASM and browsers
   follow 302s transparently (curl needs -L), so range requests hit the
   target directly.

2. scripts/enrich_wide_with_oc_thumbnails.py: DuckDB LEFT-JOIN script
   that takes the unified Zenodo wide parquet (thumbnail_url all NULL,
   see #131) and Eric Kansa's oc_isamples_pqg.parquet (48K thumbnails)
   and produces an enriched wide file with ~47.7K thumbnails populated
   for MaterialSampleRecord pids that overlap both.

Used today to build and ship isamples_202604_wide.parquet via
https://data.isamples.org/current/wide.parquet. The older
isamples_202601_wide.parquet stays in place, untouched, still
immutable. Verified via DuckDB query through the /current/ URL:
47,717 rows with thumbnail_url populated.

Closes the "soft-link" piece of #131.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 scripts/enrich_wide_with_oc_thumbnails.py | 83 +++++++++++++++++++++++
 workers/data-isamples-org/src/index.js    | 37 ++++++++++
 2 files changed, 120 insertions(+)
 create mode 100755 scripts/enrich_wide_with_oc_thumbnails.py
diff --git a/scripts/enrich_wide_with_oc_thumbnails.py b/scripts/enrich_wide_with_oc_thumbnails.py
new file mode 100755
index 0000000..cc84759
--- /dev/null
+++ b/scripts/enrich_wide_with_oc_thumbnails.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""Build an enriched unified-wide parquet by left-joining OC thumbnails.
+
+Takes the unified Zenodo wide parquet (which has thumbnail_url = NULL for all
+6.7M samples because the upstream iSamples export doesn't carry thumbnails —
+see issue #131) and fills in thumbnail_url for the ~47K OpenContext samples
+that appear in Eric Kansa's oc_isamples_pqg.parquet.
+
+Input:
+    --src          local path to source unified wide parquet
+                   (e.g. ~/Data/iSample/pqg_refining/zenodo_wide_*.parquet)
+    --oc           local path to Eric's oc_isamples_pqg.parquet (the narrow
+                   one — thumbnails live on MaterialSampleRecord rows)
+    --out          path to write the enriched output
+
+Usage:
+    python scripts/enrich_wide_with_oc_thumbnails.py \\
+        --src ~/Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet \\
+        --oc  /tmp/oc_isamples_pqg_20251107.parquet \\
+        --out /tmp/isamples_202604_wide.parquet
+
+Then upload to R2 under a date-stamped filename (e.g. isamples_202604_wide.parquet)
+and update current/manifest.json to point at it.
+"""
+import argparse
+import os
+import sys
+import time
+import duckdb
+
+
+def main():
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument('--src', required=True, help='source unified wide parquet')
+    p.add_argument('--oc',  required=True, help="Eric's OC narrow parquet (for thumbnails)")
+    p.add_argument('--out', required=True, help='output path for enriched parquet')
+    args = p.parse_args()
+
+    for f in (args.src, args.oc):
+        if not os.path.exists(f):
+            print(f'ERROR: missing {f}', file=sys.stderr)
+            return 2
+
+    con = duckdb.connect()
+
+    print(f'source: {args.src}')
+    print(f'oc:     {args.oc}')
+    print(f'out:    {args.out}')
+
+    t0 = time.time()
+    con.execute(f"""
+        CREATE TEMP TABLE oc_thumbs AS
+        SELECT DISTINCT pid, thumbnail_url
+        FROM read_parquet('{args.oc}')
+        WHERE thumbnail_url IS NOT NULL AND thumbnail_url <> ''
+    """)
+    n = con.sql('SELECT COUNT(*) FROM oc_thumbs').fetchone()[0]
+    print(f'[{time.time()-t0:.1f}s] oc_thumbs lookup: {n:,} (pid, thumbnail) pairs')
+
+    t0 = time.time()
+    con.execute(f"""
+        COPY (
+          SELECT p.* REPLACE (COALESCE(oc.thumbnail_url, p.thumbnail_url) AS thumbnail_url)
+          FROM read_parquet('{args.src}') p
+          LEFT JOIN oc_thumbs oc ON p.pid = oc.pid
+        )
+        TO '{args.out}' (FORMAT PARQUET, COMPRESSION ZSTD)
+    """)
+    print(f'[{time.time()-t0:.1f}s] wrote enriched parquet')
+
+    # Verify
+    r = con.sql(f"""
+        SELECT COUNT(*) AS rows,
+               COUNT(*) FILTER (WHERE thumbnail_url IS NOT NULL AND thumbnail_url <> '') AS with_thumb
+        FROM read_parquet('{args.out}')
+    """).df()
+    print(r.to_string(index=False))
+    print(f'output size: {os.path.getsize(args.out)/1024/1024:.1f} MB')
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/workers/data-isamples-org/src/index.js b/workers/data-isamples-org/src/index.js
index 82e4e0d..08bcaed 100644
--- a/workers/data-isamples-org/src/index.js
+++ b/workers/data-isamples-org/src/index.js
@@ -58,6 +58,43 @@ export default {
       });
     }
 
+    // === /current/ alias layer ===
+    // `/current/<flavor>.parquet` reads `current/manifest.json` from R2 and
+    // 302-redirects to the dated file it points to. Lets consumers pin to a
+    // stable URL while the underlying immutable file rotates out-of-band.
+    const currentAliasMatch = key.match(/^current\/([a-z0-9_-]+)\.parquet$/i);
+    if (currentAliasMatch) {
+      const flavor = currentAliasMatch[1];
+      const manifestObj = await env.BUCKET.get('current/manifest.json');
+      if (!manifestObj) {
+        return new Response('current/manifest.json not found', { status: 503, headers: CORS_HEADERS });
+      }
+      let manifest;
+      try {
+        manifest = JSON.parse(await manifestObj.text());
+      } catch (e) {
+        return new Response('current/manifest.json is invalid JSON', { status: 503, headers: CORS_HEADERS });
+      }
+      const entry = manifest[flavor];
+      if (!entry || !entry.public_url) {
+        return new Response(
+          `current/manifest.json has no entry for flavor '${flavor}'`,
+          { status: 404, headers: CORS_HEADERS }
+        );
+      }
+      // 302 Found is a temporary redirect, so clients keep using the alias URL
+      // and can re-issue range requests against the target URL directly.
+      return new Response(null, {
+        status: 302,
+        headers: {
+          'Location': entry.public_url,
+          // Short TTL so rotation propagates quickly without stale fanout.
+          'Cache-Control': `public, max-age=${FALLBACK_MAX_AGE}`,
+          ...CORS_HEADERS,
+        },
+      });
+    }
+
     // Parse Range header if present. R2's get() accepts { offset, length } or
     // { suffix }, mirroring HTTP Range semantics.
     const rangeHeader = request.headers.get('range');