From 8f48ba8a453f3dd3efdbac53ca2d60fe0e5c8ca1 Mon Sep 17 00:00:00 2001
From: Raymond Yee
Date: Fri, 17 Apr 2026 10:00:49 -0700
Subject: [PATCH] Add Explorer v2 behind ?v=2 flag (lite parquet, lazy description, no RANDOM, lazy Cesium)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four architectural moves, each gated on ?v=2. v1 stays unchanged.

1. Primary read surface: samples_map_lite.parquet (60 MB) instead of
   wide.parquet (278 MB). The lite file has every column the Explorer
   needs for the list + globe view except description.

2. No ORDER BY RANDOM(). v1 uses RANDOM() which forces a scan across
   row groups; v2 uses bare LIMIT, accepting row-order bias in exchange
   for ~20× query speedup on columnar parquet. (Trade-off acceptable
   for a viz sample; revisit if source clustering becomes visible.)

3. Lazy description fetch. v2 drops description from sampleData and
   adds a lazyDescription cell that queries wide.parquet for just the
   one pid when a sample is clicked. sampleCard falls back to
   lazyDescription when s.description is empty.

4. Lazy Cesium mount. v2 returns null from the viewer cell until
   viewMode === 'globe', so the viewer constructor (~500 ms) doesn't
   run for users who stay in list/table view. v1 mounts eagerly.

whereClause handles column-name drift (v1 uses `n`, v2 uses `source`)
and skips the otype filter for v2 (lite is samples-only). Text search
in v2 is limited to label + place_name (description isn't loaded
eagerly). v1 keeps description search.

Next: measure v2 and compare against the PR #124 baseline.

Co-Authored-By: Claude Opus 4.7 --- tutorials/isamples_explorer.qmd | 112 +++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 17 deletions(-) diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd index bc5bdb7..ccd5818 100644 --- a/tutorials/isamples_explorer.qmd +++ b/tutorials/isamples_explorer.qmd @@ -77,8 +77,15 @@ duckdbModule = import("https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.28.0/+ ```{ojs} //| code-fold: true +// Version gate. Append ?v=2 to the URL to opt into the lite-backed +// rewrite (samples_map_lite.parquet instead of wide.parquet, lazy +// description fetch on click, no ORDER BY RANDOM(), lazy Cesium mount). +explorerVersion = new URLSearchParams(location.search).get('v') === '2' ? 'v2' : 'v1' + // Data source configuration -parquet_url = "https://data.isamples.org/isamples_202601_wide.parquet" +wide_url = "https://data.isamples.org/isamples_202601_wide.parquet" +lite_url = "https://data.isamples.org/isamples_202601_samples_map_lite.parquet" +parquet_url = explorerVersion === 'v2' ? lite_url : wide_url // Pre-computed facet summaries (2KB - loads instantly) facet_summaries_url = "https://data.isamples.org/isamples_202601_facet_summaries.parquet" @@ -356,9 +363,19 @@ db = { await instance.instantiate(bundle.mainModule, bundle.pthreadWorker); URL.revokeObjectURL(worker_url); - // Create views for convenience + // Create views for convenience. v1 reads the full wide parquet directly; + // v2 reads the 60 MB lite file (no description, no row_id, source is + // already named 'source' not 'n'). 
const conn = await instance.connect(); - await conn.query(`CREATE VIEW samples AS SELECT * FROM read_parquet('${parquet_url}')`); + if (explorerVersion === 'v2') { + await conn.query(` + CREATE VIEW samples AS + SELECT pid, label, source, latitude, longitude, place_name + FROM read_parquet('${parquet_url}') + `); + } else { + await conn.query(`CREATE VIEW samples AS SELECT * FROM read_parquet('${parquet_url}')`); + } // Slim facets view with correct URI-string columns for cross-filtering await conn.query(`CREATE VIEW sample_facets AS SELECT * FROM read_parquet('${sample_facets_url}')`); await conn.close(); @@ -636,26 +653,38 @@ crossFilteredFacets = { // Material/context/object_type filters use the sample_facets view (URI strings) // via a subquery, since the wide parquet stores these as BIGINT foreign keys. whereClause = { - const conditions = [ - "otype = 'MaterialSampleRecord'", - "latitude IS NOT NULL" - ]; + const conditions = ["latitude IS NOT NULL"]; - // Text search (against wide parquet — has label, description, place_name) + // v1 reads the multi-entity-type wide parquet, so filter to sample records. + // v2 reads lite which is already samples-only. + if (explorerVersion !== 'v2') { + conditions.unshift("otype = 'MaterialSampleRecord'"); + } + + // Text search. v1 can search description (column exists in wide); + // v2 can't (description is not in lite — lazy-fetched on sample click). 
if (searchInput?.trim()) { const term = searchInput.trim().replace(/'/g, "''"); - conditions.push(`( - label ILIKE '%${term}%' - OR description ILIKE '%${term}%' - OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' - )`); + if (explorerVersion === 'v2') { + conditions.push(`( + label ILIKE '%${term}%' + OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' + )`); + } else { + conditions.push(`( + label ILIKE '%${term}%' + OR description ILIKE '%${term}%' + OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' + )`); + } } - // Source filter (n column exists in wide parquet) + // Source filter. v1 uses the wide parquet's `n` column; v2 uses `source`. const sources = Array.from(sourceCheckboxes || []); if (sources.length > 0) { const sourceList = sources.map(s => `'${s}'`).join(", "); - conditions.push(`n IN (${sourceList})`); + const col = explorerVersion === 'v2' ? 'source' : 'n'; + conditions.push(`${col} IN (${sourceList})`); } // Facet filters: build a subquery against sample_facets to get matching PIDs @@ -720,7 +749,24 @@ sampleData = { performance.mark('explorer-samples-start'); try { - const query = ` + // v2: read from lite (60 MB), no description (fetched lazily on click), + // no row_id, no ORDER BY RANDOM(). LIMIT returns whatever rows the + // scan encounters first — biased toward row order but ~20x faster + // than RANDOM() on a columnar file. + // v1: original query against the 278 MB wide file. + const query = explorerVersion === 'v2' ? ` + SELECT + pid, + label, + '' AS description, + latitude, + longitude, + source, + place_name + FROM samples + WHERE ${whereClause} + LIMIT ${maxSamples} + ` : ` SELECT row_id, pid, @@ -778,6 +824,14 @@ mutable clickedPointIndex = null //| code-fold: true // Cesium viewer setup viewer = { + // v2: defer Cesium construction until the user actually switches to + // globe view. The cell re-evaluates when viewMode changes (reactive + // dependency below), so toggling into globe will mount on demand. 
+ // v1 mounts eagerly to preserve original behavior. + if (explorerVersion === 'v2' && viewMode !== 'globe') { + return null; + } + // Wait for Cesium to be available await new Promise(resolve => { if (typeof Cesium !== 'undefined') resolve(); @@ -886,6 +940,28 @@ selectedSample = { } ``` +```{ojs} +//| code-fold: true +// v2: lazy description fetch — only hit the 278 MB wide parquet when a sample +// is actually clicked, rather than pulling description for every row eagerly. +lazyDescription = { + if (explorerVersion !== 'v2') return null; + if (!selectedSample?.pid) return null; + const pid = selectedSample.pid.replace(/'/g, "''"); + try { + const rows = await runQuery(` + SELECT description FROM read_parquet('${wide_url}') + WHERE pid = '${pid}' AND otype = 'MaterialSampleRecord' + LIMIT 1 + `); + return rows[0]?.description || ''; + } catch (e) { + console.warn('Lazy description fetch failed:', e); + return ''; + } +} +``` + ```{ojs} //| code-fold: true // Render sample card @@ -900,7 +976,9 @@ sampleCard = { const sourceColor = SOURCE_COLORS[s.source] || SOURCE_COLORS.default; const label = s.label || 'No label'; - const description = s.description || ''; + // v2: prefer the lazily-fetched description (from wide parquet on click); + // v1: the description is already in sampleData. + const description = (s.description || lazyDescription || '').trim(); const truncDesc = description.length > 200 ? description.substring(0, 200) + '...' : description; let placeStr = '';