From 2b57482b2511af3c2dc7e3869fe33aa4f0f10a00 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 17 Apr 2026 12:52:06 -0700 Subject: [PATCH] Add /current/ alias layer + enrichment script Two additions for stable-URL access to rotating versioned parquets: 1. Worker alias route: GET /current/<flavor>.parquet reads current/manifest.json from R2 and 302-redirects to the dated file it points to. Redirect response carries short 5-min Cache-Control so rotation propagates quickly; the target (versioned file) keeps its immutable 1-year cache. DuckDB-WASM / curl -L / browsers all follow 302s transparently, so range requests hit the target directly. 2. scripts/enrich_wide_with_oc_thumbnails.py: DuckDB LEFT-JOIN script that takes the unified Zenodo wide parquet (thumbnail_url all NULL, see #131) and Eric Kansa's oc_isamples_pqg.parquet (48K thumbnails) and produces an enriched wide file with ~47.7K thumbnails populated for MaterialSampleRecord pids that overlap both. Used today to build and ship isamples_202604_wide.parquet via https://data.isamples.org/current/wide.parquet. The older isamples_202601_wide.parquet stays in place, untouched, still immutable. Verified via DuckDB query through the /current/ URL: 47,717 rows with thumbnail_url populated. Closes the "soft-link" piece of #131. Co-Authored-By: Claude Opus 4.7 --- scripts/enrich_wide_with_oc_thumbnails.py | 83 +++++++++++++++++++++++ workers/data-isamples-org/src/index.js | 37 ++++++++++ 2 files changed, 120 insertions(+) create mode 100755 scripts/enrich_wide_with_oc_thumbnails.py diff --git a/scripts/enrich_wide_with_oc_thumbnails.py b/scripts/enrich_wide_with_oc_thumbnails.py new file mode 100755 index 0000000..cc84759 --- /dev/null +++ b/scripts/enrich_wide_with_oc_thumbnails.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +"""Build an enriched unified-wide parquet by left-joining OC thumbnails. 
+
+Takes the unified Zenodo wide parquet (which has thumbnail_url = NULL for all
+6.7M samples because the upstream iSamples export doesn't carry thumbnails —
+see issue #131) and fills in thumbnail_url for the ~47K OpenContext samples
+that appear in Eric Kansa's oc_isamples_pqg.parquet.
+
+Input:
+  --src   local path to source unified wide parquet
+          (e.g. ~/Data/iSample/pqg_refining/zenodo_wide_*.parquet)
+  --oc    local path to Eric's oc_isamples_pqg.parquet (the narrow
+          one — thumbnails live on MaterialSampleRecord rows)
+  --out   path to write the enriched output
+
+Usage:
+  python scripts/enrich_wide_with_oc_thumbnails.py \\
+    --src ~/Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet \\
+    --oc /tmp/oc_isamples_pqg_20251107.parquet \\
+    --out /tmp/isamples_202604_wide.parquet
+
+Then upload to R2 under a date-stamped filename (e.g. isamples_202604_wide.parquet)
+and update current/manifest.json to point at it.
+"""
+import argparse
+import os
+import sys
+import time
+
+import duckdb
+
+
+def _sql_literal(value):
+    """Return *value* as a single-quoted SQL string literal.
+
+    Embedded single quotes are doubled, so a path that contains an
+    apostrophe cannot terminate the literal early.
+    """
+    return "'" + str(value).replace("'", "''") + "'"
+
+
+def main():
+    """Left-join OC thumbnails onto the unified wide parquet and write --out.
+
+    Returns a process exit code: 0 on success; 2 when an input file is
+    missing or --out resolves to one of the inputs; 1 when the written file
+    does not have the same number of rows as the source.
+    """
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument('--src', required=True, help='source unified wide parquet')
+    p.add_argument('--oc', required=True, help="Eric's OC narrow parquet (for thumbnails)")
+    p.add_argument('--out', required=True, help='output path for enriched parquet')
+    args = p.parse_args()
+
+    # The shell leaves "~" alone inside quotes, so expand it here to make
+    # quoted and unquoted arguments behave the same.
+    src = os.path.expanduser(args.src)
+    oc = os.path.expanduser(args.oc)
+    out = os.path.expanduser(args.out)
+
+    for f in (src, oc):
+        if not os.path.exists(f):
+            print(f'ERROR: missing {f}', file=sys.stderr)
+            return 2
+
+    # COPY would start overwriting a file that is still being read.
+    if os.path.realpath(out) in (os.path.realpath(src), os.path.realpath(oc)):
+        print(f'ERROR: --out must not be one of the inputs: {out}', file=sys.stderr)
+        return 2
+
+    # Paths are embedded in SQL text below, so quote them once, up front.
+    src_sql, oc_sql, out_sql = (_sql_literal(x) for x in (src, oc, out))
+
+    con = duckdb.connect()
+
+    print(f'source: {src}')
+    print(f'oc: {oc}')
+    print(f'out: {out}')
+
+    t0 = time.time()
+    # Keep exactly one row per pid. SELECT DISTINCT pid, thumbnail_url would
+    # keep every distinct URL of a pid, and the LEFT JOIN below would then
+    # emit that sample once per URL. MIN() picks one deterministically.
+    con.execute(f"""
+        CREATE TEMP TABLE oc_thumbs AS
+        SELECT pid, MIN(thumbnail_url) AS thumbnail_url
+        FROM read_parquet({oc_sql})
+        WHERE thumbnail_url IS NOT NULL AND thumbnail_url <> ''
+        GROUP BY pid
+    """)
+    n = con.sql('SELECT COUNT(*) FROM oc_thumbs').fetchone()[0]
+    print(f'[{time.time()-t0:.1f}s] oc_thumbs lookup: {n:,} (pid, thumbnail) pairs')
+
+    t0 = time.time()
+    con.execute(f"""
+        COPY (
+            SELECT p.* REPLACE (COALESCE(oc.thumbnail_url, p.thumbnail_url) AS thumbnail_url)
+            FROM read_parquet({src_sql}) p
+            LEFT JOIN oc_thumbs oc ON p.pid = oc.pid
+        )
+        TO {out_sql} (FORMAT PARQUET, COMPRESSION ZSTD)
+    """)
+    print(f'[{time.time()-t0:.1f}s] wrote enriched parquet')
+
+    # Verify: the enrichment must neither drop nor duplicate samples.
+    # fetchone() is used instead of .df() so the script does not need pandas.
+    src_rows = con.sql(f'SELECT COUNT(*) FROM read_parquet({src_sql})').fetchone()[0]
+    out_rows, with_thumb = con.sql(f"""
+        SELECT COUNT(*) AS rows,
+               COUNT(*) FILTER (WHERE thumbnail_url IS NOT NULL AND thumbnail_url <> '') AS with_thumb
+        FROM read_parquet({out_sql})
+    """).fetchone()
+    print(f'rows: {out_rows:,}  with_thumb: {with_thumb:,}')
+    print(f'output size: {os.path.getsize(out)/1024/1024:.1f} MB')
+    if out_rows != src_rows:
+        print(f'ERROR: output has {out_rows:,} rows but source has {src_rows:,}', file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/workers/data-isamples-org/src/index.js b/workers/data-isamples-org/src/index.js
index 82e4e0d..08bcaed 100644
--- a/workers/data-isamples-org/src/index.js
+++ b/workers/data-isamples-org/src/index.js
@@ -58,6 +58,43 @@ export default {
       });
     }
 
+    // === /current/ alias layer ===
+    // `/current/<flavor>.parquet` reads `current/manifest.json` from R2 and
+    // 302-redirects to the dated file it points to. Lets consumers pin to a
+    // stable URL while the underlying immutable file rotates out-of-band.
+    const currentAliasMatch = key.match(/^current\/([a-z0-9_-]+)\.parquet$/i);
+    if (currentAliasMatch) {
+      const flavor = currentAliasMatch[1];
+      const manifestObj = await env.BUCKET.get('current/manifest.json');
+      if (!manifestObj) {
+        return new Response('current/manifest.json not found', { status: 503, headers: CORS_HEADERS });
+      }
+      let manifest;
+      try {
+        manifest = JSON.parse(await manifestObj.text());
+      } catch (e) {
+        return new Response('current/manifest.json is invalid JSON', { status: 503, headers: CORS_HEADERS });
+      }
+      // JSON.parse also accepts `null`, arrays and scalars. Indexing `null`
+      // would throw and surface as an unhandled 500, so treat any
+      // non-object manifest like the invalid-JSON case above.
+      if (manifest === null || typeof manifest !== 'object' || Array.isArray(manifest)) {
+        return new Response('current/manifest.json is not a JSON object', { status: 503, headers: CORS_HEADERS });
+      }
+      // Own properties only: the flavor pattern also matches names such as
+      // `constructor`, which must not resolve through the prototype chain.
+      const entry = Object.prototype.hasOwnProperty.call(manifest, flavor) ? manifest[flavor] : null;
+      if (!entry || typeof entry.public_url !== 'string' || entry.public_url === '') {
+        return new Response(
+          `current/manifest.json has no entry for flavor '${flavor}'`,
+          { status: 404, headers: CORS_HEADERS }
+        );
+      }
+      // 302 Found is a temporary redirect: clients keep using the stable
+      // alias URL and re-issue each request against the target URL named in
+      // Location. Unlike 307, 302 does not guarantee that the request method
+      // is preserved; that is acceptable for the reads this alias exists for.
+      return new Response(null, {
+        status: 302,
+        headers: {
+          'Location': entry.public_url,
+          // Short TTL so rotation propagates quickly without stale fanout.
+          'Cache-Control': `public, max-age=${FALLBACK_MAX_AGE}`,
+          ...CORS_HEADERS,
+        },
+      });
+    }
+
     // Parse Range header if present. R2's get() accepts { offset, length } or
     // { suffix }, mirroring HTTP Range semantics.
     const rangeHeader = request.headers.get('range');