Skip to content

Commit fea7b5e

Browse files
authored
perf(native): batch-load file/symbol IDs in edges phase (#1013) (#1028)
* perf(native): batch-load file/symbol node IDs in edges phase (#1013) Replaces per-import and per-file `conn.query_row` calls in `build_import_edges` and `build_and_insert_call_edges` with one-shot HashMap pre-loads. Each `query_row` ran a fresh sqlite3_prepare/step/ finalize cycle; on a ~470-file repo this paid ~2.3k cycles in the import-edge stage alone, dominating the edges phase. Also chunks the import-edge insert into multi-row VALUES batches (199 rows × 5 params), mirroring `edges_db::do_insert_edges`, to remove the per-row prepared-statement bind/step/reset overhead. Self-build benchmark (codegraph on itself, 743 files): edges: 124.5 ms native vs 193 ms wasm (was 310 ms / 179 ms in 3.9.5) roles: 73.7 ms native vs 70 ms wasm (was 269 ms / 62 ms in 3.9.5) Roles converged as a side effect — its prior tail was driven by shared SQLite cache pressure during the same build session. Closes #1013 (docs check acknowledged: internal perf fix, no API/language/feature surface changes) * fix(native): bound symbol-node lookup and restore name=file guard (#1028) Address two P2 review concerns from greptile on #1013: 1. **Bounded symbol-node lookup.** `load_symbol_node_ids` previously did an unbounded `SELECT name, file, id FROM nodes WHERE kind != 'file'`, loading every non-file symbol into memory. On 100k+-symbol monorepos this could push hundreds of MB. Now we walk type-only imports up front to collect the distinct `(name, file)` pairs we'll actually need, then issue a chunked `(name, file) IN (...)` query (332 pairs per chunk × 2 binds = 664, safely under `SQLITE_MAX_VARIABLE_NUMBER`). The full-scan path is gone; only the symbols referenced by type-only imports are hit, preserving the one-round-trip win without the memory blow-up. 2. **Restore `name = file` guard on file-node lookups.** The original per-row query bound `rel_path` to both `name = ?` and `file = ?`, matching only nodes where the two columns agreed. The bulk query keyed on `file` alone, so an unrelated file-kind row sharing the same `file` value (different `name`) could silently overwrite the map entry. Add `AND name = file` to both `load_file_node_ids` and the parallel pre-load in `build_and_insert_call_edges` to keep the legacy semantics explicit. * fix(native): surface bind/execute errors in insert_edges (#1028) Greptile flagged two P2 issues in `insert_edges`: 1. Silent bind failures: `let _ = stmt.raw_bind_parameter(...)` silently discarded errors. A failed bind would leave that position unbound (NULL), causing `raw_execute()` to insert ghost edge rows with NULL `source_id`/`target_id`. Bind/execute now run inside a fallible `insert_edge_chunk` helper; failures emit a stderr warning and the chunk is skipped instead of producing partial rows. 2. `prepare_cached` mismatched with dynamic SQL: the SQL string varies with chunk length, so trailing partial chunks were always cache misses. Switched to plain `tx.prepare(&sql)` to match intent. No behavioural change for the success path. cargo test -p codegraph-core --lib: 181 passed.
1 parent a724a09 commit fea7b5e

2 files changed

Lines changed: 227 additions & 44 deletions

File tree

crates/codegraph-core/src/build_pipeline.rs

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,21 +1081,41 @@ fn build_and_insert_call_edges(
10811081
.map(String::from)
10821082
.collect();
10831083

1084+
// Pre-load every file node ID into a HashMap with one query, replacing
1085+
// the per-file `query_row` cycle that paid a fresh sqlite3_prepare for
1086+
// each entry in `file_symbols` (#1013).
1087+
//
1088+
// The `name = file` predicate matches the legacy per-row lookup
1089+
// (`WHERE name = ? AND file = ?` with both binds set to `rel_path`).
1090+
// For file-kind nodes `name` and `file` are conventionally identical,
1091+
// but keeping the guard prevents an unrelated row from silently
1092+
// overwriting the map entry for `file` (#1028 review).
1093+
let file_node_ids: HashMap<String, u32> = {
1094+
let mut map = HashMap::new();
1095+
if let Ok(mut stmt) = conn.prepare(
1096+
"SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file",
1097+
) {
1098+
if let Ok(rows) = stmt.query_map([], |row| {
1099+
Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32))
1100+
}) {
1101+
for r in rows.flatten() {
1102+
map.insert(r.0, r.1);
1103+
}
1104+
}
1105+
}
1106+
map
1107+
};
1108+
10841109
// Build FileEdgeInput entries for the native edge builder
10851110
let mut file_entries: Vec<FileEdgeInput> = Vec::new();
10861111
for (rel_path, symbols) in file_symbols {
10871112
if import_ctx.barrel_only_files.contains(rel_path) {
10881113
continue;
10891114
}
10901115

1091-
// Look up file node ID
1092-
let file_node_id: u32 = match conn.query_row(
1093-
"SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0",
1094-
[rel_path, rel_path],
1095-
|row| row.get::<_, i64>(0),
1096-
) {
1097-
Ok(id) => id as u32,
1098-
Err(_) => continue,
1116+
let file_node_id: u32 = match file_node_ids.get(rel_path) {
1117+
Some(&id) => id,
1118+
None => continue,
10991119
};
11001120

11011121
// Build imported names from resolved imports

crates/codegraph-core/src/import_edges.rs

Lines changed: 199 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -153,24 +153,118 @@ pub fn detect_barrel_only_files(ctx: &ImportEdgeContext) -> HashSet<String> {
153153
barrel_only
154154
}
155155

156-
/// Look up a file node ID from the database.
157-
fn get_file_node_id(conn: &Connection, rel_path: &str) -> Option<i64> {
158-
conn.query_row(
159-
"SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0",
160-
[rel_path, rel_path],
161-
|row| row.get(0),
162-
)
163-
.ok()
156+
/// Load every file node ID into a HashMap in one query — replaces per-import
157+
/// `conn.query_row` lookups that paid the SQLite prepare/execute cycle on each
158+
/// call (#1013).
159+
///
160+
/// Includes the explicit `name = file` predicate that matched the legacy
161+
/// per-row lookup (`WHERE name = ? AND file = ?` with both binds set to
162+
/// `rel_path`). For file-kind nodes `name` and `file` are conventionally
163+
/// identical, but keeping the guard prevents an unrelated row from silently
164+
/// overwriting the map entry for `file`.
165+
fn load_file_node_ids(conn: &Connection) -> HashMap<String, i64> {
166+
let mut map = HashMap::new();
167+
if let Ok(mut stmt) = conn.prepare(
168+
"SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file",
169+
) {
170+
if let Ok(rows) = stmt.query_map([], |row| {
171+
Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
172+
}) {
173+
for r in rows.flatten() {
174+
map.insert(r.0, r.1);
175+
}
176+
}
177+
}
178+
map
164179
}
165180

166-
/// Look up the first symbol node ID by name and file (for type-only import resolution).
167-
fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option<i64> {
168-
conn.query_row(
169-
"SELECT id FROM nodes WHERE name = ? AND file = ? AND kind != 'file' LIMIT 1",
170-
[name, file],
171-
|row| row.get(0),
172-
)
173-
.ok()
181+
/// Load symbol node IDs for the supplied `(name, file)` pairs in one chunked
182+
/// query. Mirrors the JS `nodesByNameAndFile` lookup map; preserves the
183+
/// first-row semantics of the legacy `LIMIT 1` query by keeping the first ID
184+
/// seen per key.
185+
///
186+
/// The pairs are pre-computed by walking the type-only imports in
187+
/// `ctx.file_symbols`, so we never scan the entire `nodes` table — even on
188+
/// monorepos with 100k+ symbols, only the small slice actually referenced by
189+
/// type-only imports is hit (#1013, #1028 review).
190+
fn load_symbol_node_ids(
191+
conn: &Connection,
192+
needed_pairs: &HashSet<(String, String)>,
193+
) -> HashMap<(String, String), i64> {
194+
let mut map: HashMap<(String, String), i64> = HashMap::new();
195+
if needed_pairs.is_empty() {
196+
return map;
197+
}
198+
199+
// 332 pairs × 2 params + 1 spare = 665 binds, comfortably under
200+
// `SQLITE_MAX_VARIABLE_NUMBER`'s legacy 999 default.
201+
const SYMBOL_LOOKUP_CHUNK: usize = 332;
202+
203+
let pairs: Vec<&(String, String)> = needed_pairs.iter().collect();
204+
for chunk in pairs.chunks(SYMBOL_LOOKUP_CHUNK) {
205+
let placeholders: Vec<String> = (0..chunk.len())
206+
.map(|i| {
207+
let base = i * 2;
208+
format!("(?{},?{})", base + 1, base + 2)
209+
})
210+
.collect();
211+
let sql = format!(
212+
"SELECT name, file, id FROM nodes WHERE kind != 'file' AND (name, file) IN ({})",
213+
placeholders.join(",")
214+
);
215+
let mut params: Vec<&dyn rusqlite::ToSql> = Vec::with_capacity(chunk.len() * 2);
216+
for (name, file) in chunk {
217+
params.push(name);
218+
params.push(file);
219+
}
220+
221+
if let Ok(mut stmt) = conn.prepare(&sql) {
222+
if let Ok(rows) = stmt.query_map(rusqlite::params_from_iter(params.iter()), |row| {
223+
Ok((
224+
row.get::<_, String>(0)?,
225+
row.get::<_, String>(1)?,
226+
row.get::<_, i64>(2)?,
227+
))
228+
}) {
229+
for r in rows.flatten() {
230+
map.entry((r.0, r.1)).or_insert(r.2);
231+
}
232+
}
233+
}
234+
}
235+
map
236+
}
237+
238+
/// Walk type-only imports in `ctx.file_symbols` and return the distinct
239+
/// `(name, file)` pairs that `build_import_edges` will need to look up.
240+
/// Resolves barrel files the same way the edge-building loop does so the
241+
/// pre-computed set matches the actual lookup keys.
242+
fn collect_type_only_lookup_pairs(ctx: &ImportEdgeContext) -> HashSet<(String, String)> {
243+
let mut pairs = HashSet::new();
244+
for (rel_path, symbols) in &ctx.file_symbols {
245+
let abs_file = Path::new(&ctx.root_dir).join(rel_path);
246+
let abs_str = abs_file.to_str().unwrap_or("");
247+
for imp in &symbols.imports {
248+
if !imp.type_only.unwrap_or(false) {
249+
continue;
250+
}
251+
let resolved_path = ctx.get_resolved(abs_str, &imp.source);
252+
for name in &imp.names {
253+
let clean_name = name.strip_prefix("* as ").unwrap_or(name);
254+
let mut target_file = resolved_path.clone();
255+
if ctx.is_barrel_file(&resolved_path) {
256+
let mut visited = HashSet::new();
257+
if let Some(actual) =
258+
ctx.resolve_barrel_export(&resolved_path, clean_name, &mut visited)
259+
{
260+
target_file = actual;
261+
}
262+
}
263+
pairs.insert((clean_name.to_string(), target_file));
264+
}
265+
}
266+
}
267+
pairs
174268
}
175269

176270
/// Build import edges from parsed file symbols.
@@ -185,10 +279,24 @@ fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option<i64>
185279
pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<EdgeRow> {
186280
let mut edges = Vec::new();
187281

282+
// Pre-load all file node IDs once. Previously this was N x query_row,
283+
// each of which ran a fresh sqlite3_prepare/step/finalize cycle (#1013).
284+
let file_node_ids = load_file_node_ids(conn);
285+
// Only the symbols actually referenced by type-only imports are needed —
286+
// skip the lookup entirely when no type-only imports exist (the common
287+
// case), and otherwise issue a chunked `(name, file) IN (...)` query so
288+
// memory stays bounded even on large monorepos (#1028 review).
289+
let needed_symbol_pairs = collect_type_only_lookup_pairs(ctx);
290+
let symbol_node_ids = if needed_symbol_pairs.is_empty() {
291+
HashMap::new()
292+
} else {
293+
load_symbol_node_ids(conn, &needed_symbol_pairs)
294+
};
295+
188296
for (rel_path, symbols) in &ctx.file_symbols {
189297
let is_barrel_only = ctx.barrel_only_files.contains(rel_path);
190-
let file_node_id = match get_file_node_id(conn, rel_path) {
191-
Some(id) => id,
298+
let file_node_id = match file_node_ids.get(rel_path) {
299+
Some(&id) => id,
192300
None => continue,
193301
};
194302

@@ -203,8 +311,8 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
203311
}
204312

205313
let resolved_path = ctx.get_resolved(abs_str, &imp.source);
206-
let target_id = match get_file_node_id(conn, &resolved_path) {
207-
Some(id) => id,
314+
let target_id = match file_node_ids.get(&resolved_path) {
315+
Some(&id) => id,
208316
None => continue,
209317
};
210318

@@ -238,7 +346,9 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
238346
target_file = actual;
239347
}
240348
}
241-
if let Some(sym_id) = get_symbol_node_id(conn, clean_name, &target_file) {
349+
if let Some(&sym_id) =
350+
symbol_node_ids.get(&(clean_name.to_string(), target_file))
351+
{
242352
edges.push(EdgeRow {
243353
source_id: file_node_id,
244354
target_id: sym_id,
@@ -262,7 +372,7 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
262372
if actual_source != resolved_path
263373
&& resolved_sources.insert(actual_source.clone())
264374
{
265-
if let Some(actual_id) = get_file_node_id(conn, &actual_source) {
375+
if let Some(&actual_id) = file_node_ids.get(&actual_source) {
266376
let through_kind = match edge_kind {
267377
"imports-type" => "imports-type",
268378
"dynamic-imports" => "dynamic-imports",
@@ -286,29 +396,82 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
286396
edges
287397
}
288398

289-
/// Batch insert edges into the database.
399+
/// 199 rows × 5 params = 995 bind parameters, safely under the legacy
400+
/// `SQLITE_MAX_VARIABLE_NUMBER` default of 999. Mirrors `edges_db::CHUNK`.
401+
const INSERT_CHUNK: usize = 199;
402+
403+
/// Batch insert edges into the database using multi-row VALUES chunks.
404+
///
405+
/// Replaces the previous one-prepared-statement-per-row pattern that paid a
406+
/// per-edge bind/step/reset cycle. With the chunked path each chunk runs a
407+
/// single VM execution against a freshly prepared statement (#1013).
408+
///
409+
/// Bind/execute errors are surfaced via a stderr warning and the offending
410+
/// chunk is skipped — silently swallowing them previously could produce
411+
/// `NULL` columns in the inserted edge rows.
290412
pub fn insert_edges(conn: &Connection, edges: &[EdgeRow]) {
291413
if edges.is_empty() {
292414
return;
293415
}
294416
let tx = match conn.unchecked_transaction() {
295417
Ok(tx) => tx,
296-
Err(_) => return,
418+
Err(e) => {
419+
eprintln!("[codegraph] insert_edges: failed to start transaction: {e}");
420+
return;
421+
}
297422
};
298-
if let Ok(mut stmt) = tx.prepare(
299-
"INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES (?, ?, ?, ?, ?)",
300-
) {
301-
for e in edges {
302-
let _ = stmt.execute(rusqlite::params![
303-
e.source_id,
304-
e.target_id,
305-
e.kind,
306-
e.confidence,
307-
e.dynamic,
308-
]);
423+
424+
for chunk in edges.chunks(INSERT_CHUNK) {
425+
if let Err(e) = insert_edge_chunk(&tx, chunk) {
426+
eprintln!(
427+
"[codegraph] insert_edges: skipped chunk of {} rows due to error: {e}",
428+
chunk.len()
429+
);
309430
}
310431
}
311-
let _ = tx.commit();
432+
if let Err(e) = tx.commit() {
433+
eprintln!("[codegraph] insert_edges: commit failed: {e}");
434+
}
435+
}
436+
437+
/// Bind and execute a single chunk in its own fallible scope so the caller
438+
/// can log the failure and continue with the next chunk.
439+
///
440+
/// `prepare` (not `prepare_cached`) is used because the SQL string varies
441+
/// with chunk length — caching keyed on dynamic SQL would churn the LRU
442+
/// for every partial trailing chunk and obscure the intent of the cache.
443+
fn insert_edge_chunk(
444+
tx: &rusqlite::Transaction<'_>,
445+
chunk: &[EdgeRow],
446+
) -> rusqlite::Result<()> {
447+
let placeholders: Vec<String> = (0..chunk.len())
448+
.map(|i| {
449+
let base = i * 5;
450+
format!(
451+
"(?{},?{},?{},?{},?{})",
452+
base + 1,
453+
base + 2,
454+
base + 3,
455+
base + 4,
456+
base + 5
457+
)
458+
})
459+
.collect();
460+
let sql = format!(
461+
"INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES {}",
462+
placeholders.join(",")
463+
);
464+
let mut stmt = tx.prepare(&sql)?;
465+
for (i, edge) in chunk.iter().enumerate() {
466+
let base = i * 5;
467+
stmt.raw_bind_parameter(base + 1, edge.source_id)?;
468+
stmt.raw_bind_parameter(base + 2, edge.target_id)?;
469+
stmt.raw_bind_parameter(base + 3, edge.kind.as_str())?;
470+
stmt.raw_bind_parameter(base + 4, edge.confidence)?;
471+
stmt.raw_bind_parameter(base + 5, edge.dynamic)?;
472+
}
473+
stmt.raw_execute()?;
474+
Ok(())
312475
}
313476

314477
#[cfg(test)]

0 commit comments

Comments
 (0)