Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ models/generated/extensions/*.md

# Large data files
*.parquet
*.duckdb

# Node / Playwright
node_modules/
Expand Down
97 changes: 97 additions & 0 deletions tools/build_fts_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""
Build a DuckDB full-text search index for the iSamples Explorer.

Creates a .duckdb file containing the FTS index (BM25-scored) that can
be ATTACHed in DuckDB-WASM for ranked text search over 6.7M samples.

Usage:
python tools/build_fts_index.py

Output:
tools/isamples_fts_index.duckdb (upload to data.isamples.org)

Requirements:
pip install duckdb
"""

import duckdb
import os
import sys
from pathlib import Path

# Remote parquet source; read only when LOCAL_PARQUET does not exist on disk.
PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet"
# The index database is written into the same directory as this script.
OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb"

# Local copy of the parquet data; preferred over PARQUET_URL when present
# (faster builds).
LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet"


def build_fts_index():
    """Build the DuckDB full-text search index file at ``OUTPUT_DB``.

    Steps performed:

    1. Choose the parquet source: ``LOCAL_PARQUET`` if it exists on disk,
       otherwise ``PARQUET_URL``.
    2. Delete any existing ``OUTPUT_DB`` so the database is rebuilt from
       scratch.
    3. Create a ``samples`` table holding ``pid``, ``label``, ``description``
       and ``place_name`` for rows whose ``otype`` is
       ``'MaterialSampleRecord'`` (NULL description/place_name become ``''``).
    4. Install and load the DuckDB ``fts`` extension and create an FTS index
       over the three text columns, keyed by ``pid``.
    5. Run a test query for ``'pottery'`` and print up to five hits.
    6. Print the size of the resulting file and the ATTACH statement to use.

    Returns:
        None. All results are reported on stdout.

    Raises:
        Any exception raised by DuckDB or the filesystem propagates to the
        caller; the database connection is closed before it does.
    """
    # Use local file if available, otherwise remote
    source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL
    print(f"Source: {source}")

    # The source is embedded in a SQL string literal below, so double any
    # single quote to keep a path such as "/home/o'brien/..." from breaking
    # (or altering) the statement.
    source_literal = source.replace("'", "''")

    # Remove existing index file
    if OUTPUT_DB.exists():
        OUTPUT_DB.unlink()

    con = duckdb.connect(str(OUTPUT_DB))
    try:
        print("Creating samples table from parquet...")
        con.execute(f"""
            CREATE TABLE samples AS
            SELECT
                pid,
                label,
                COALESCE(description, '') AS description,
                COALESCE(CAST(place_name AS VARCHAR), '') AS place_name
            FROM read_parquet('{source_literal}')
            WHERE otype = 'MaterialSampleRecord'
        """)

        row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0]
        print(f"Loaded {row_count:,} rows")

        print("Installing and loading FTS extension...")
        con.execute("INSTALL fts")
        con.execute("LOAD fts")

        print("Building FTS index (this may take a few minutes)...")
        con.execute("""
            PRAGMA create_fts_index(
                'samples', 'pid',
                'label', 'description', 'place_name',
                stemmer = 'porter',
                stopwords = 'english',
                overwrite = 1
            )
        """)

        # Verify the index works
        test_result = con.execute("""
            SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score
            FROM samples
            WHERE score IS NOT NULL
            ORDER BY score DESC
            LIMIT 5
        """).fetchall()
        print(f"Test query 'pottery': {len(test_result)} results")
        for pid, score in test_result:
            print(f"  {pid[:60]}  score={score:.4f}")

        # Keep samples table — FTS macros reference it internally.
        # The table has only pid + text columns (not the full schema),
        # so it's much smaller than the full parquet.
    finally:
        # Always release the connection, even when a statement above fails,
        # so the database file is not left held open by this process.
        con.close()

    # Measured only after the connection is closed.
    size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024)
    print(f"\nIndex file: {OUTPUT_DB}")
    print(f"Size: {size_mb:.1f} MB")
    print("\nUpload to data.isamples.org and ATTACH in DuckDB-WASM:")
    print("  ATTACH 'https://data.isamples.org/isamples_fts_index.duckdb' AS fts_db;")

# Build the index only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build_fts_index()
45 changes: 36 additions & 9 deletions tutorials/isamples_explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ initialParams = {
//| code-fold: false
// Search input
viewof searchInput = Inputs.text({
placeholder: "Search samples (e.g., pottery, basalt, Cyprus...)",
placeholder: "Search samples — multiple words narrow results (e.g., pottery Cyprus)",
value: initialParams.q,
submit: "Search"
})
Expand Down Expand Up @@ -392,14 +392,17 @@ whereClause = {
"latitude IS NOT NULL"
];

// Text search
// Multi-term text search: each word must match at least one text field
if (searchInput?.trim()) {
const term = searchInput.trim().replace(/'/g, "''");
conditions.push(`(
label ILIKE '%${term}%'
OR description ILIKE '%${term}%'
OR CAST(place_name AS VARCHAR) ILIKE '%${term}%'
)`);
const terms = searchInput.trim().split(/\s+/).filter(t => t.length > 0);
for (const raw of terms) {
const term = raw.replace(/'/g, "''");
conditions.push(`(
label ILIKE '%${term}%'
OR description ILIKE '%${term}%'
OR CAST(place_name AS VARCHAR) ILIKE '%${term}%'
)`);
}
}

// Source filter
Expand Down Expand Up @@ -466,6 +469,30 @@ sampleData = {
}

try {
// When searching, rank results by relevance (fields matched);
// otherwise random sample for exploration
const hasSearch = searchInput?.trim()?.length > 0;
const terms = hasSearch
? searchInput.trim().split(/\s+/).filter(t => t.length > 0)
: [];

// Build a relevance score: +3 for label match, +2 for place, +1 for description
// per term — higher scores float to top
let scoreExpr = "0";
if (terms.length > 0) {
const termScores = terms.map(raw => {
const t = raw.replace(/'/g, "''");
return `(CASE WHEN label ILIKE '%${t}%' THEN 3 ELSE 0 END
+ CASE WHEN CAST(place_name AS VARCHAR) ILIKE '%${t}%' THEN 2 ELSE 0 END
+ CASE WHEN description ILIKE '%${t}%' THEN 1 ELSE 0 END)`;
});
scoreExpr = termScores.join(" + ");
}

const orderClause = hasSearch
? `ORDER BY (${scoreExpr}) DESC, label`
: "ORDER BY RANDOM()";

const query = `
SELECT
row_id,
Expand All @@ -478,7 +505,7 @@ sampleData = {
place_name
FROM samples
WHERE ${whereClause}
ORDER BY RANDOM()
${orderClause}
LIMIT ${maxSamples}
`;

Expand Down
Loading