diff --git a/.gitignore b/.gitignore
index 40f2d5c..e3b2633 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ models/generated/extensions/*.md
 
 # Large data files
 *.parquet
+*.duckdb
 
 # Node / Playwright
 node_modules/
diff --git a/tools/build_fts_index.py b/tools/build_fts_index.py
new file mode 100644
index 0000000..b0ddd09
--- /dev/null
+++ b/tools/build_fts_index.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Build a DuckDB full-text search index for the iSamples Explorer.
+
+Creates a .duckdb file containing the FTS index (BM25-scored) that can
+be ATTACHed in DuckDB-WASM for ranked text search over 6.7M samples.
+
+Usage:
+    python tools/build_fts_index.py
+
+Output:
+    tools/isamples_fts_index.duckdb (upload to data.isamples.org)
+
+Requirements:
+    pip install duckdb
+"""
+
+import duckdb
+import os
+import sys
+from pathlib import Path
+
+PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet"
+OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb"
+
+# Local fallback for faster builds
+LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet"
+
+
+def _remove_output_files():
+    """Delete the index database and its write-ahead log, if present."""
+    for path in (OUTPUT_DB, OUTPUT_DB.with_name(OUTPUT_DB.name + ".wal")):
+        path.unlink(missing_ok=True)
+
+
+def build_fts_index():
+    """Build the BM25 full-text search database at OUTPUT_DB.
+
+    Copies pid, label, description and place_name of every
+    MaterialSampleRecord row into a ``samples`` table, indexes the three
+    text columns with the DuckDB FTS extension, runs a smoke-test query
+    and prints the size of the resulting file.
+
+    Any exception raised while loading or indexing propagates to the
+    caller; the connection is closed and the partially written database
+    is deleted first, so a truncated index is never left behind.
+    """
+    # Use local file if available, otherwise remote
+    source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL
+    print(f"Source: {source}")
+
+    # Remove existing index file (and any write-ahead log left next to it
+    # by an interrupted build)
+    _remove_output_files()
+
+    # Double single quotes so the path cannot end the SQL string literal
+    source_sql = source.replace("'", "''")
+
+    con = duckdb.connect(str(OUTPUT_DB))
+    completed = False
+    try:
+        print("Creating samples table from parquet...")
+        con.execute(f"""
+            CREATE TABLE samples AS
+            SELECT
+                pid,
+                label,
+                COALESCE(description, '') AS description,
+                COALESCE(CAST(place_name AS VARCHAR), '') AS place_name
+            FROM read_parquet('{source_sql}')
+            WHERE otype = 'MaterialSampleRecord'
+        """)
+
+        row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0]
+        print(f"Loaded {row_count:,} rows")
+
+        print("Installing and loading FTS extension...")
+        con.execute("INSTALL fts")
+        con.execute("LOAD fts")
+
+        print("Building FTS index (this may take a few minutes)...")
+        con.execute("""
+            PRAGMA create_fts_index(
+                'samples', 'pid',
+                'label', 'description', 'place_name',
+                stemmer = 'porter',
+                stopwords = 'english',
+                overwrite = 1
+            )
+        """)
+
+        # Verify the index works
+        test_result = con.execute("""
+            SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score
+            FROM samples
+            WHERE score IS NOT NULL
+            ORDER BY score DESC
+            LIMIT 5
+        """).fetchall()
+        print(f"Test query 'pottery': {len(test_result)} results")
+        for pid, score in test_result:
+            print(f" {pid[:60]} score={score:.4f}")
+        if not test_result:
+            # Zero hits means the smoke test did not actually confirm
+            # anything; flag it instead of letting it scroll past.
+            print(
+                "WARNING: test query returned no rows; "
+                "check the index before uploading",
+                file=sys.stderr,
+            )
+
+        # Keep samples table — FTS macros reference it internally.
+        # The table has only pid + text columns (not the full schema),
+        # so it's much smaller than the full parquet.
+        completed = True
+    finally:
+        # Always release the database file; after a failure also delete
+        # the half-built output.
+        con.close()
+        if not completed:
+            _remove_output_files()
+
+    size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024)
+    print(f"\nIndex file: {OUTPUT_DB}")
+    print(f"Size: {size_mb:.1f} MB")
+    print("\nUpload to data.isamples.org and ATTACH in DuckDB-WASM:")
+    print(" ATTACH 'https://data.isamples.org/isamples_fts_index.duckdb' AS fts_db;")
+
+
+if __name__ == "__main__":
+    build_fts_index()
diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd
index 402b8a4..b7b5597 100644
--- a/tutorials/isamples_explorer.qmd
+++ b/tutorials/isamples_explorer.qmd
@@ -71,7 +71,7 @@ initialParams = {
 //| code-fold: false
 // Search input
 viewof searchInput = Inputs.text({
-  placeholder: "Search samples (e.g., pottery, basalt, Cyprus...)",
+  placeholder: "Search samples — multiple words narrow results (e.g., pottery Cyprus)",
   value: initialParams.q,
   submit: "Search"
 })
@@ -392,14 +392,17 @@ whereClause = {
     "latitude IS NOT NULL"
   ];
 
-  // Text search
+  // Multi-term text search: each word must match at least one text field
   if (searchInput?.trim()) {
-    const term = searchInput.trim().replace(/'/g, "''");
-    conditions.push(`(
-      label ILIKE '%${term}%'
-      OR description ILIKE '%${term}%'
-      OR CAST(place_name AS VARCHAR) ILIKE '%${term}%'
-    )`);
+    const terms = searchInput.trim().split(/\s+/).filter(t => t.length > 0);
+    for (const raw of terms) {
+      const term = raw.replace(/'/g, "''");
+      conditions.push(`(
+        label ILIKE '%${term}%'
+        OR description ILIKE '%${term}%'
+        OR CAST(place_name AS VARCHAR) ILIKE '%${term}%'
+      )`);
+    }
   }
 
   // Source filter
@@ -466,6 +469,30 @@ sampleData = {
   }
 
   try {
+    // When searching, rank results by relevance (fields matched);
+    // otherwise random sample for exploration
+    const hasSearch = searchInput?.trim()?.length > 0;
+    const terms = hasSearch
+      ? searchInput.trim().split(/\s+/).filter(t => t.length > 0)
+      : [];
+
+    // Build a relevance score: +3 for label match, +2 for place, +1 for description
+    // per term — higher scores float to top
+    let scoreExpr = "0";
+    if (terms.length > 0) {
+      const termScores = terms.map(raw => {
+        const t = raw.replace(/'/g, "''");
+        return `(CASE WHEN label ILIKE '%${t}%' THEN 3 ELSE 0 END
+          + CASE WHEN CAST(place_name AS VARCHAR) ILIKE '%${t}%' THEN 2 ELSE 0 END
+          + CASE WHEN description ILIKE '%${t}%' THEN 1 ELSE 0 END)`;
+      });
+      scoreExpr = termScores.join(" + ");
+    }
+
+    const orderClause = hasSearch
+      ? `ORDER BY (${scoreExpr}) DESC, label`
+      : "ORDER BY RANDOM()";
+
     const query = `
       SELECT
         row_id,
@@ -478,7 +505,7 @@ sampleData = {
         place_name
       FROM samples
       WHERE ${whereClause}
-      ORDER BY RANDOM()
+      ${orderClause}
       LIMIT ${maxSamples}
     `;
 