From 2afaa99730ca4dc4895f3cfee468e13fb3ef0695 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 9 Nov 2025 16:03:46 -0800 Subject: [PATCH] Hyperscope --- bin/cli.js | 12 +- bin/s3.js | 92 ++++++++ bin/scope.js | 305 +++++++++++++++++++++++++++ bin/serve.js | 2 +- bin/types.d.ts | 15 ++ package.json | 30 +-- src/lib/parquet/parquetDataSource.ts | 4 +- src/lib/parquet/parquetFilter.ts | 21 +- table.md | 34 --- 9 files changed, 453 insertions(+), 62 deletions(-) create mode 100644 bin/s3.js create mode 100644 bin/scope.js delete mode 100644 table.md diff --git a/bin/cli.js b/bin/cli.js index d2a29973..7fc1a646 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -3,15 +3,25 @@ import fs from 'fs/promises' import packageJson from '../package.json' with { type: 'json' } import { chat } from './chat.js' +import { scope } from './scope.js' import { serve } from './serve.js' const updateCheck = checkForUpdates() const arg = process.argv[2] -if (arg === '--help' || arg === '-H' || arg === '-h') { +if (arg === 'scope') { + const filePath = process.argv[3] + if (!filePath) { + console.error('Error: hyperparam scope requires a file argument') + process.exit(1) + } + scope(filePath) +} else if (arg === '--help' || arg === '-H' || arg === '-h') { console.log('Usage:') console.log(' hyperparam start chat client') console.log(' hyperparam [path] start hyperparam webapp. "path" is a directory or a URL.') + console.log(' defaults to the current directory.') + console.log(' hyperparam scope start hyperscope client') console.log(' ') console.log(' hyperparam -h, --help, give this help list') console.log(' hyperparam -v, --version print program version') diff --git a/bin/s3.js b/bin/s3.js new file mode 100644 index 00000000..fe447705 --- /dev/null +++ b/bin/s3.js @@ -0,0 +1,92 @@ +/** + * Parse an S3 URL into bucket and key components + * @param {string} url - S3 URL in format: + * - s3://bucket/key + * - https://bucket.s3.amazonaws.com/key + * - https://s3.amazonaws.com/bucket/key + * - https://bucket.s3.region.amazonaws.com/key + * @returns {{bucket: string, key: string}} Object with bucket and key + * @throws {Error} If URL format is invalid + */ +export function parseS3Url(url) { + if (!url || typeof url !== 'string') { + throw new Error('Invalid S3 URL: URL must be a non-empty string') + } + + // Handle s3:// protocol + if (url.startsWith('s3://')) { + const withoutProtocol = url.slice(5) // Remove 's3://' + const firstSlashIndex = withoutProtocol.indexOf('/') + + if (firstSlashIndex === -1) { + throw new Error('Invalid S3 URL: Missing key after bucket name') + } + + const bucket = withoutProtocol.slice(0, firstSlashIndex) + const key = withoutProtocol.slice(firstSlashIndex + 1) + + if (!bucket) { + throw new Error('Invalid S3 URL: Empty bucket name') + } + + if (!key) { + throw new Error('Invalid S3 URL: Empty key') + } + + return { bucket, key } + } + + // Handle https:// protocol + if (url.startsWith('https://') || url.startsWith('http://')) { + const urlObj = new URL(url) + const { hostname } = urlObj + const pathname = urlObj.pathname.startsWith('/') + ? urlObj.pathname.slice(1) + : urlObj.pathname + + if (!pathname) { + throw new Error('Invalid S3 URL: Missing key in path') + } + + // Virtual-hosted-style URL: https://bucket.s3.amazonaws.com/key + // or https://bucket.s3.region.amazonaws.com/key + if (hostname.includes('.s3.') || hostname.includes('.s3-')) { + const bucket = hostname.split('.')[0] + const key = pathname + + if (!bucket) { + throw new Error('Invalid S3 URL: Empty bucket name') + } + + return { bucket, key } + } + + // Path-style URL: https://s3.amazonaws.com/bucket/key + // or https://s3.region.amazonaws.com/bucket/key + if (hostname.startsWith('s3.') || hostname.startsWith('s3-') || hostname === 's3.amazonaws.com') { + const firstSlashIndex = pathname.indexOf('/') + + if (firstSlashIndex === -1) { + // pathname is just the bucket with no key + throw new Error('Invalid S3 URL: Missing key after bucket name') + } + + const bucket = pathname.slice(0, firstSlashIndex) + const key = pathname.slice(firstSlashIndex + 1) + + if (!bucket) { + throw new Error('Invalid S3 URL: Empty bucket name') + } + + if (!key) { + throw new Error('Invalid S3 URL: Empty key') + } + + return { bucket, key } + } + + throw new Error('Invalid S3 URL: Hostname does not match S3 URL patterns') + } + + throw new Error('Invalid S3 URL: Must start with s3:// or https://') +} diff --git a/bin/scope.js b/bin/scope.js new file mode 100644 index 00000000..e75ab27c --- /dev/null +++ b/bin/scope.js @@ -0,0 +1,305 @@ +import { randomUUID } from 'node:crypto' +import fs from 'node:fs/promises' +import path from 'node:path' +import { WebSocket } from 'ws' +import { GetObjectCommand, HeadObjectCommand, S3Client } from '@aws-sdk/client-s3' +import { parseS3Url } from './s3.js' +import { openUrl } from './serve.js' + +/** + * @import {WsRequestPayload, WsResponsePayload} from './types.js' + */ + +// Initialize S3 client +const s3Client = new S3Client({ + region: process.env.AWS_REGION || 'us-east-1', +}) + +/** + * Start a client that connects to the hyperscope server + * and handles incoming requests + * @param {string} filePath - S3/HTTPS URL, or a local file path (relative or absolute) + */ +export async function scope(filePath) { + /** @type {string | undefined} - set when serving from the local filesystem */ + let workingDir + if (!/^(s3:\/\/|https?:\/\/)/.exec(filePath)) { + // Local file: serve from cwd with the relative path as the key + let stat + try { + stat = await fs.stat(filePath) + } catch { + console.error(`Error: file ${filePath} does not exist`) + process.exit(1) + } + if (!stat.isFile()) { + console.error(`Error: ${filePath} is not a file`) + process.exit(1) + } + const absolutePath = await fs.realpath(filePath) + workingDir = process.cwd() + const relative = path.relative(workingDir, absolutePath) + if (relative.startsWith('..') || path.isAbsolute(relative)) { + console.error(`Error: ${filePath} must be inside the current directory`) + process.exit(1) + } + filePath = relative + } + + const connectionId = 'scope-' + randomUUID().replace(/-/g, '').slice(0, 12) + const scopeOrigin = 'wss://scope.hyperparam.app' + // const scopeOrigin = 'ws://localhost:4666' + const url = `${scopeOrigin}/connect?connection_id=${connectionId}` + + console.log('Connecting to', scopeOrigin, connectionId) + + const ws = new WebSocket(url) + + ws.on('open', () => { + const hyperparamBase = 'https://hyperparam.app/files?key=' + // Use provided S3 path or default to the hardcoded one + const key = encodeURIComponent(`https://scope.hyperparam.app/scope/${connectionId}?key=${encodeURIComponent(filePath ?? '')}`) + const url = `${hyperparamBase}${key}` + console.log(`Hyperscope connected ${connectionId}\n`) + openUrl(url) + }) + + ws.on('message', async (data) => { + let request + try { + request = JSON.parse(data.toString()) + } catch (err) { + console.error('Hyperscope failed to parse request:', err) + return + } + + // Handle the request and send a response + console.log('Request ', request.request_id, request.type, request.key) + try { + const response = await handleRequest(request, workingDir) + ws.send(JSON.stringify(response)) + console.log('Response', request.request_id, response.status, response.headers['Content-Length']) + } catch (err) { + console.error('Hyperscope error handling request:', err) + // Send error response + ws.send(JSON.stringify({ + request_id: request.request_id, + error: err instanceof Error ? err.message : String(err), + })) + } + }) + + ws.on('error', (err) => { + console.error('Hyperscope WebSocket error:', err.message) + }) + + ws.on('close', (code, reason) => { + console.log(`Hyperscope connection closed (code: ${code}, reason: ${reason || 'none'})`) + process.exit(0) + }) + + // Handle graceful shutdown + process.on('SIGINT', () => { + console.log('\nShutting down hyperscope connection...') + ws.close() + }) + + process.on('SIGTERM', () => { + console.log('\nShutting down hyperscope connection...') + ws.close() + }) +} + +/** + * Handle an incoming request from the server + * @param {WsRequestPayload} request - The request object + * @param {string} [workingDir] - Optional local directory to serve files from + * @returns {Promise} The response to send back + */ +async function handleRequest(request, workingDir) { + const { request_id, type, key } = request + + if (workingDir) { + return handleLocalRequest(request, workingDir) + } + + // Handle different request types + if (type === 'get') { + // Parse the S3 URL to extract bucket and key + const { bucket, key: s3Key } = parseS3Url(key) + + // Check for Range header to support partial content requests + let rangeHeader = request.headers?.['range'] || request.headers?.['Range'] + // Normalize to string if it's an array + if (Array.isArray(rangeHeader)) { + rangeHeader = rangeHeader[0] + } + + // Fetch the object from S3 + const command = new GetObjectCommand({ + Bucket: bucket, + Key: s3Key, + ...rangeHeader && { Range: rangeHeader }, + }) + + const response = await s3Client.send(command) + + // Convert the stream to a buffer + const chunks = [] + if (response.Body) { + // @ts-expect-error - Body is an async iterable stream + for await (const chunk of response.Body) { + chunks.push(chunk) + } + } + const buffer = Buffer.concat(chunks) + const body = buffer.toString('base64') + + // Determine status code (206 for partial content, 200 for full) + const status = rangeHeader ? 206 : 200 + + /** @type {Record} */ + const headers = {} + if (response.ContentType) { + headers['Content-Type'] = response.ContentType + } + if (response.ContentLength !== undefined) { + headers['Content-Length'] = String(response.ContentLength) + } + if (response.ETag) { + headers['ETag'] = response.ETag + } + if (response.LastModified) { + headers['Last-Modified'] = response.LastModified.toUTCString() + } + if (response.ContentRange) { + headers['Content-Range'] = response.ContentRange + } + if (response.AcceptRanges) { + headers['Accept-Ranges'] = response.AcceptRanges + } + + return { + request_id, + type: 'response', + status, + headers, + body, + } + } else if (type === 'head') { + // Parse the S3 URL to extract bucket and key + const { bucket, key: s3Key } = parseS3Url(key) + + // Get object metadata directly from S3 + const command = new HeadObjectCommand({ + Bucket: bucket, + Key: s3Key, + }) + + const metadata = await s3Client.send(command) + + /** @type {Record} */ + const headers = {} + if (metadata.ContentLength !== undefined) { + headers['Content-Length'] = String(metadata.ContentLength) + } + if (metadata.ContentType) { + headers['Content-Type'] = metadata.ContentType + } + if (metadata.ETag) { + headers['ETag'] = metadata.ETag + } + if (metadata.LastModified) { + headers['Last-Modified'] = metadata.LastModified.toUTCString() + } + + return { + request_id, + type: 'response', + status: 200, + headers, + body: '', + } + } else { + throw new Error(`Unknown request type: ${type}`) + } +} + +/** + * Handle a request by reading from the local filesystem + * @param {WsRequestPayload} request + * @param {string} workingDir + * @returns {Promise} + */ +async function handleLocalRequest(request, workingDir) { + const { request_id, type, key } = request + + // Resolve key safely within workingDir + const resolved = path.resolve(workingDir, key) + const rel = path.relative(workingDir, resolved) + if (rel.startsWith('..') || path.isAbsolute(rel)) { + throw new Error(`Access denied: ${key}`) + } + + const stat = await fs.stat(resolved) + const contentLength = stat.size + + /** @type {Record} */ + const headers = { + 'Content-Length': String(contentLength), + 'Last-Modified': stat.mtime.toUTCString(), + 'Accept-Ranges': 'bytes', + } + + if (type === 'head') { + return { request_id, type: 'response', status: 200, headers, body: '' } + } + + if (type !== 'get') { + throw new Error(`Unknown request type: ${type}`) + } + + let rangeHeader = request.headers?.['range'] || request.headers?.['Range'] + if (Array.isArray(rangeHeader)) rangeHeader = rangeHeader[0] + + /** @type {Buffer} */ + let buffer + let status = 200 + if (rangeHeader) { + const match = /^bytes=(\d*)-(\d*)$/.exec(rangeHeader) + if (!match) throw new Error(`Invalid Range header: ${rangeHeader}`) + const startStr = match[1] ?? '' + const endStr = match[2] ?? '' + let start = startStr === '' ? 0 : parseInt(startStr, 10) + let end = endStr === '' ? contentLength - 1 : parseInt(endStr, 10) + if (startStr === '' && endStr !== '') { + // suffix range: last N bytes + start = Math.max(0, contentLength - parseInt(endStr, 10)) + end = contentLength - 1 + } + if (start > end || start < 0 || end >= contentLength) { + throw new Error(`Range not satisfiable: ${rangeHeader}`) + } + const length = end - start + 1 + buffer = Buffer.alloc(length) + const handle = await fs.open(resolved, 'r') + try { + await handle.read(buffer, 0, length, start) + } finally { + await handle.close() + } + status = 206 + headers['Content-Length'] = String(length) + headers['Content-Range'] = `bytes ${start}-${end}/${contentLength}` + } else { + buffer = await fs.readFile(resolved) + } + + return { + request_id, + type: 'response', + status, + headers, + body: buffer.toString('base64'), + } +} diff --git a/bin/serve.js b/bin/serve.js index 55de0f63..4630a45e 100644 --- a/bin/serve.js +++ b/bin/serve.js @@ -333,7 +333,7 @@ function gzip(req, content) { * @param {string} url * @returns {void} */ -function openUrl(url) { +export function openUrl(url) { switch (process.platform) { case 'darwin': exec(`open ${url}`); return case 'win32': exec(`start ${url}`); return diff --git a/bin/types.d.ts b/bin/types.d.ts index 88c6b9b9..53d76f57 100644 --- a/bin/types.d.ts +++ b/bin/types.d.ts @@ -107,3 +107,18 @@ interface ArrayToolProperty extends BaseToolProperty { items: ToolProperty } export type ToolProperty = StringToolProperty | NumberToolProperty | ArrayToolProperty | BooleanToolProperty + +export interface WsRequestPayload { + request_id: string + type: 'get' | 'head' + key: string + headers: Record +} + +export interface WsResponsePayload { + request_id: string + type: 'response' + status: number + headers: Record + body: string // base64 encoded +} diff --git a/package.json b/package.json index ed68abd9..ee6e9bcb 100644 --- a/package.json +++ b/package.json @@ -56,36 +56,40 @@ "watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet" }, "dependencies": { + "@aws-sdk/client-s3": "3.1029.0", + "@aws-sdk/credential-provider-node": "3.927.0", "hightable": "0.26.4", - "hyparquet": "1.25.3", + "hyparquet": "1.25.6", "hyparquet-compressors": "1.1.1", "icebird": "0.3.1", - "squirreling": "0.11.2" + "squirreling": "0.12.1", + "ws": "8.20.0" }, "devDependencies": { - "@storybook/react-vite": "10.3.3", + "@storybook/react-vite": "10.3.5", "@testing-library/react": "16.3.2", - "@types/node": "25.5.0", + "@types/node": "25.6.0", "@types/react": "19.2.14", "@types/react-dom": "19.2.3", + "@types/ws": "8.18.1", "@vitejs/plugin-react": "6.0.1", - "@vitest/coverage-v8": "4.1.2", + "@vitest/coverage-v8": "4.1.4", "eslint": "9.39.2", "eslint-plugin-react": "7.37.5", "eslint-plugin-react-hooks": "7.0.1", "eslint-plugin-react-refresh": "0.5.2", - "eslint-plugin-storybook": "10.3.3", + "eslint-plugin-storybook": "10.3.5", "globals": "17.4.0", - "jsdom": "29.0.1", + "jsdom": "29.0.2", "nodemon": "3.1.14", "npm-run-all": "4.1.5", - "react": "19.2.4", - "react-dom": "19.2.4", - "storybook": "10.3.3", + "react": "19.2.5", + "react-dom": "19.2.5", + "storybook": "10.3.5", "typescript": "6.0.2", - "typescript-eslint": "8.58.0", - "vite": "8.0.3", - "vitest": "4.1.2" + "typescript-eslint": "8.58.1", + "vite": "8.0.8", + "vitest": "4.1.4" }, "peerDependencies": { "react": "18.3.1 || ^19", diff --git a/src/lib/parquet/parquetDataSource.ts b/src/lib/parquet/parquetDataSource.ts index c0ca7980..7a185d8a 100644 --- a/src/lib/parquet/parquetDataSource.ts +++ b/src/lib/parquet/parquetDataSource.ts @@ -34,7 +34,7 @@ export function parquetDataSource(file: AsyncBuffer, metadata: FileMetaData, com } return { - rows: (async function* () { + async *rows() { // Emit rows by row group let groupStart = 0 let remainingLimit = limit ?? Infinity @@ -84,7 +84,7 @@ export function parquetDataSource(file: AsyncBuffer, metadata: FileMetaData, com remainingLimit -= data.length groupStart += rowCount } - })(), + }, appliedWhere, appliedLimitOffset, } diff --git a/src/lib/parquet/parquetFilter.ts b/src/lib/parquet/parquetFilter.ts index b1b50eca..5b02646e 100644 --- a/src/lib/parquet/parquetFilter.ts +++ b/src/lib/parquet/parquetFilter.ts @@ -75,8 +75,10 @@ function mapOperator(op: BinaryOp, flipped: boolean, negate: boolean): string | if (negate) mappedOp = neg(mappedOp) if (flipped) mappedOp = flip(mappedOp) switch (mappedOp) { - case '=': return '$eq' - case '!=': case '<>': return '$ne' + case '=': + case '==': return '$eq' + case '!=': + case '<>': return '$ne' case '<': return '$lt' case '<=': return '$lte' case '>': return '$gt' @@ -91,21 +93,18 @@ function neg(op: ComparisonOp): ComparisonOp { case '>': return '<=' case '>=': return '<' case '=': return '!=' + case '==': return '!=' case '!=': return '=' case '<>': return '=' } } function flip(op: ComparisonOp): ComparisonOp { - switch (op) { - case '<': return '>' - case '<=': return '>=' - case '>': return '<' - case '>=': return '<=' - case '=': return '=' - case '!=': return '!=' - case '<>': return '=' - } + if (op === '<') return '>' + if (op === '<=') return '>=' + if (op === '>') return '<' + if (op === '>=') return '<=' + return op } function convertInValues(node: InValuesNode, negate: boolean): ParquetQueryFilter | undefined { diff --git a/table.md b/table.md deleted file mode 100644 index 58982a19..00000000 --- a/table.md +++ /dev/null @@ -1,34 +0,0 @@ - -# Test Tables with Unicode Dashes - -## Regular dash table -Header 1 | Header 2 --|- -Row 1 | Data 1 -Row 2 | Data 2 - -## En dash table (–) -Header 1 | Header 2 -–|– -Row 1 | Data 1 -Row 2 | Data 2 - -## Em dash table (—) -Header 1 | Header 2 -—|— -Row 1 | Data 1 -Row 2 | Data 2 - -## Mixed dashes table -Header 1 | Header 2 | Header 3 --|–|— -Row 1 | Data 1 | Data 1 -Row 2 | Data 2 | Data 2 - -## Original Table - -| Name | Age | Occupation | -|----------|-----|----------------| -| Alice | 30 | Engineer | -| Bob | 25 | Designer | -| Charlie | 35 | Product Manager|