From 9198e81d9ef5bf1bd1674f01c9317ebb9e2724ff Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 17 Apr 2026 12:57:59 -0700 Subject: [PATCH] Tutorials query /current/wide.parquet instead of dated file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interactive Explorer, Search Explorer, and Deep-Dive Analysis now query through the stable /current/ alias so they pick up the current enriched wide parquet (with OpenContext thumbnails) without needing per-tutorial URL updates on rebuild. The alias 302-redirects to the latest dated file; DuckDB-WASM follows redirects transparently and range requests after the redirect go to the target directly. - tutorials/progressive_globe.qmd: wide_url (lazy description fetch on sample click; v2 explorer description fetch) - tutorials/isamples_explorer.qmd: wide_url (v1 primary source) - tutorials/zenodo_isamples_analysis.qmd: wide-format fallback entry in parquet_urls (listed after the wide_h3 file) narrow_vs_wide_performance.qmd intentionally keeps dated URLs — benchmarks need reproducibility, not freshness. Data catalog updates: - how-to-use.qmd: document the /current/ alias pattern, explain the trade-off (stable alias for interactive work vs. dated URL for pinned reproducibility), preserve historical isamples_202601_wide.parquet pointer for anyone pinning. - tutorials/index.qmd: primary Wide format row points at the alias and notes the rotation convention. Closes the "#2" item from #131's status comment (migrate tutorials to /current/wide.parquet). 
Co-Authored-By: Claude Opus 4.7 --- how-to-use.qmd | 14 ++++++++++++-- tutorials/index.qmd | 2 +- tutorials/isamples_explorer.qmd | 6 ++++-- tutorials/progressive_globe.qmd | 4 +++- tutorials/zenodo_isamples_analysis.qmd | 2 +- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/how-to-use.qmd b/how-to-use.qmd index f47e61e..e9d530a 100644 --- a/how-to-use.qmd +++ b/how-to-use.qmd @@ -60,10 +60,20 @@ The two main files carrying the sample records themselves: | File | Size | Shape | Rows | Use when you need… | |---|---:|---|---:|---| -| [`isamples_202601_wide.parquet`](https://data.isamples.org/isamples_202601_wide.parquet) | 278 MB | Wide (one row per entity, nested relationships in `p__*` array columns) | 20 M | General entity queries, UI filtering, description text | +| [`current/wide.parquet`](https://data.isamples.org/current/wide.parquet) ∗ | 292 MB | Wide (one row per entity, nested relationships in `p__*` array columns) | 20 M | General entity queries, UI filtering, description text | | [`isamples_202601_wide_h3.parquet`](https://data.isamples.org/isamples_202601_wide_h3.parquet) | 292 MB | Wide + H3 BIGINT indices (`h3_res4`, `h3_res6`, `h3_res8`) | 20 M | Geospatial queries with H3 clustering at arbitrary zoom | | [`isamples_202512_narrow.parquet`](https://data.isamples.org/isamples_202512_narrow.parquet) | 820 MB | Narrow (graph: nodes + explicit `_edge_` rows, s/p/o/n fields) | 106 M | Graph traversals, relationship-centric analysis, PQG work | +∗ `/current/wide.parquet` is a stable alias that HTTP 302-redirects to the +latest dated file (currently +[`isamples_202604_wide.parquet`](https://data.isamples.org/isamples_202604_wide.parquet), +enriched with ~47 K OpenContext thumbnails). The dated filename is +immutable; the alias rotates atomically when we rebuild. Use the alias for +interactive work, the dated URL when you want a pinned, reproducible +reference. 
The original +[`isamples_202601_wide.parquet`](https://data.isamples.org/isamples_202601_wide.parquet) +(278 MB, no thumbnails) is kept available for historical pinning. + All three represent the same underlying data (SESAR + OpenContext + GEOME + Smithsonian) with identical semantics — they differ only in serialization strategy. See the @@ -123,7 +133,7 @@ import duckdb con = duckdb.connect() con.sql(""" SELECT source, COUNT(*) AS n - FROM read_parquet('https://data.isamples.org/isamples_202601_wide.parquet') + FROM read_parquet('https://data.isamples.org/current/wide.parquet') WHERE otype = 'MaterialSampleRecord' GROUP BY 1 ORDER BY 2 DESC """).df() diff --git a/tutorials/index.qmd b/tutorials/index.qmd index cc39228..a4e5bf8 100644 --- a/tutorials/index.qmd +++ b/tutorials/index.qmd @@ -28,7 +28,7 @@ All data is hosted on [`data.isamples.org`](https://data.isamples.org) with HTTP | File | Size | Description | |------|------|-------------| -| [Wide format](https://data.isamples.org/isamples_202601_wide.parquet) | 278 MB | One row per entity, all sources — primary file for tutorials | +| [Wide format](https://data.isamples.org/current/wide.parquet) | 292 MB | One row per entity, all sources — primary file for tutorials. Stable alias redirects to the current dated build (`isamples_YYYYMM_wide.parquet`). 
| | [Wide + H3](https://data.isamples.org/isamples_202601_wide_h3.parquet) | 292 MB | Wide format with H3 spatial indices for globe visualizations | | [Facet summaries](https://data.isamples.org/isamples_202601_facet_summaries.parquet) | 2 KB | Pre-computed filter counts — loads instantly | | [H3 clusters (res4)](https://data.isamples.org/isamples_202601_h3_summary_res4.parquet) | 0.6 MB | Zoomed-out globe view | diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd index ccd5818..a3d1a44 100644 --- a/tutorials/isamples_explorer.qmd +++ b/tutorials/isamples_explorer.qmd @@ -82,8 +82,10 @@ duckdbModule = import("https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.28.0/+ // description fetch on click, no ORDER BY RANDOM(), lazy Cesium mount). explorerVersion = new URLSearchParams(location.search).get('v') === '2' ? 'v2' : 'v1' -// Data source configuration -wide_url = "https://data.isamples.org/isamples_202601_wide.parquet" +// Data source configuration. +// wide_url uses the /current/ alias so we pick up the latest enriched build +// (with OpenContext thumbnails); the alias 302-redirects to the dated file. +wide_url = "https://data.isamples.org/current/wide.parquet" lite_url = "https://data.isamples.org/isamples_202601_samples_map_lite.parquet" parquet_url = explorerVersion === 'v2' ? lite_url : wide_url diff --git a/tutorials/progressive_globe.qmd b/tutorials/progressive_globe.qmd index d0b0e83..aec008e 100644 --- a/tutorials/progressive_globe.qmd +++ b/tutorials/progressive_globe.qmd @@ -202,7 +202,9 @@ h3_res4_url = `${R2_BASE}/isamples_202601_h3_summary_res4.parquet` h3_res6_url = `${R2_BASE}/isamples_202601_h3_summary_res6.parquet` h3_res8_url = `${R2_BASE}/isamples_202601_h3_summary_res8.parquet` lite_url = `${R2_BASE}/isamples_202601_samples_map_lite.parquet` -wide_url = `${R2_BASE}/isamples_202601_wide.parquet` +// Stable alias that 302-redirects to the current enriched wide parquet +// (isamples_YYYYMM_wide.parquet). 
This build includes populated OpenContext thumbnails. +wide_url = `${R2_BASE}/current/wide.parquet` facets_url = `${R2_BASE}/isamples_202601_sample_facets.parquet` facet_summaries_url = `${R2_BASE}/isamples_202601_facet_summaries.parquet` diff --git a/tutorials/zenodo_isamples_analysis.qmd b/tutorials/zenodo_isamples_analysis.qmd index 650630e..d3a499b 100644 --- a/tutorials/zenodo_isamples_analysis.qmd +++ b/tutorials/zenodo_isamples_analysis.qmd @@ -95,7 +95,7 @@ parquet_urls = [ 'https://data.isamples.org/isamples_202601_wide_h3.parquet', // Fallback: original wide format without H3 - 'https://data.isamples.org/isamples_202601_wide.parquet', + 'https://data.isamples.org/current/wide.parquet', // Fallback: older versions 'https://labs.dataunbound.com/docs/2025/07/isamples_export_2025_04_21_16_23_46_geo.parquet',