denfry · denfry · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · chatgpt-codex-connector
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,44 @@ All notable changes to this project are documented here. The format is based on
 
 ## [Unreleased]
 
+### Changed — retrieval ranking & fusion (requires a one-time reindex)
+- **RRF fusion rescaled and re-keyed.** Fused scores were ~`w/k` (≈0.017), an order
+  of magnitude below the reranker's bounded bonuses, so rerank silently became the
+  primary ranker. RRF is now scaled by `k` (a pure monotonic rescale — order is
+  unchanged) so fused scores and rerank bonuses share an O(1) scale. Fusion also
+  merges on a coarse `(path, line-bucket)` key instead of an exact `(path, start,
+  end)` one: different retrievers report different ranges for the same place, so the
+  exact key almost never coincided and cross-source agreement never fired.
+  `agreeing_sources` is now counted at file granularity.
+- **Confidence uses a scale-invariant relative gap** instead of absolute thresholds.
+- **Per-file diversification**: at most 3 hits per file stay on the page; the rest
+  are pushed to the tail (nothing is dropped). Combined with bucketing this removes
+  the "same small file returned six times at different line slivers" noise.
+- **FTS recall on natural-language queries**: stopwords (`how`, `does`, `the`, …)
+  are dropped before building the FTS `MATCH`, so a query like "how does auth work"
+  no longer AND-s in filler that code chunks never contain.
+- **Symbol names are FTS-indexed.** `chunks` gained a denormalized `symbol_names`
+  column (mirrored verbatim by the FTS sync triggers, so external-content delete/
+  update stays consistent) — a query matching a symbol's name now hits even when the
+  body text doesn't repeat it. **Bumps `SCHEMA_VERSION` 1 → 2.** Older indexes are
+  still readable; `index`/`update` detect the mismatch and rebuild from scratch.
+- **Centrality fallback for ambiguous names**: symbols whose name isn't globally
+  unique never get a resolved `in_degree`; they now receive a damped, half-capped
+  bonus from a name-reference count so common names (`run`, `handle`, …) aren't
+  flatly zeroed. Precise `in_degree`, where present, still takes precedence.
+- **Test-file demotion is word-boundary aware**: `contest/`, `latest.py`,
+  `testimonials.tsx` are no longer mistaken for test files.
+- **Language-aware import resolution**: `import './base'` from a `.ts` file resolves
+  to `base.ts` rather than a same-named `base.py` earlier in the fallback order.
+- **Freshness is content-aware**: a bare `touch` (mtime change, identical bytes) is
+  a no-op for `update`, so it no longer reports the index as stale — freshness now
+  mirrors the sha-based incremental decision.
+
+### Removed
+- Dead legacy lexical-search path in `retrieval/searchers.py` (`fts_response`,
+  `fts_search`, the second `Candidate` dataclass and `_confidence`/`_fallbacks`/
+  `_trim`) — the live path goes through `pipeline.search` → `fts_candidates`.
+
 ## [1.4.0] - 2026-06-14
 
 ### Added

diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md
@@ -39,7 +39,8 @@ CREATE TABLE chunks (
     kind          TEXT,                        -- 'symbol_body' | 'window' | 'doc'
     symbol_id     INTEGER REFERENCES symbols(id) ON DELETE SET NULL,
     content       TEXT NOT NULL,               -- raw text (secret-redacted before snippet output)
-    token_est     INTEGER NOT NULL             -- estimated tokens, for budgeting
+    token_est     INTEGER NOT NULL,            -- estimated tokens, for budgeting
+    symbol_names  TEXT NOT NULL DEFAULT ''     -- denormalized symbol name, FTS-indexed (mirrored by triggers)
 );
 CREATE INDEX idx_chunks_file ON chunks(file_id);
 

diff --git a/src/codebase_index/cli.py b/src/codebase_index/cli.py
@@ -54,6 +54,13 @@ def _ensure_index(ctx: "typer.Context") -> tuple[Path, Any]:
     return db_path, cfg
 
 
+def _remove_db_files(db_path: Path) -> None:
+    """Delete the SQLite db and its WAL/SHM sidecars (used to force a clean rebuild).
+
+    Files that are already absent are skipped silently; any other failure to
+    delete (e.g. a permission error) still propagates to the caller.
+    """
+    for p in (db_path, *(db_path.with_name(db_path.name + s) for s in ("-wal", "-shm"))):
+        # EAFP rather than exists()-then-unlink(): a sidecar can vanish between the
+        # two calls (SQLite normally removes -wal/-shm itself when the last
+        # connection closes), which would turn a harmless race into a crash.
+        try:
+            p.unlink()
+        except FileNotFoundError:
+            continue
+
+
 def _open_in_browser(path: Path) -> None:
     uri = path.resolve().as_uri()
     try:
@@ -278,13 +285,15 @@ def index(
 
     from .config import load
     from .indexer.pipeline import build_index
-    from .storage.db import Database
+    from .storage.db import SCHEMA_VERSION, Database, peek_schema_version
 
     root_opt = ctx.obj.get("root") if ctx.obj else None
     cfg = load(root_opt)
     db_path = Path(cfg.root) / ".claude" / "cache" / "codebase-index" / "index.sqlite"
-    if rebuild and db_path.exists():
-        db_path.unlink()
+    # A full build discards an outdated-schema index: schema.sql is applied with
+    # IF NOT EXISTS, so an upgrade can't add columns/triggers in place — recreate.
+    if rebuild or (db_path.exists() and peek_schema_version(db_path) < SCHEMA_VERSION):
+        _remove_db_files(db_path)
 
     with Database(db_path) as db:
         stats = build_index(cfg, db, root=Path(cfg.root))
@@ -321,8 +330,8 @@ def update(
     import json as _json
 
     from .config import load
-    from .indexer.pipeline import update_index
-    from .storage.db import Database
+    from .indexer.pipeline import build_index, update_index
+    from .storage.db import SCHEMA_VERSION, Database, peek_schema_version
 
     is_json = bool(ctx.obj and ctx.obj.get("json"))
     quiet = bool(ctx.obj and ctx.obj.get("quiet"))
@@ -336,8 +345,15 @@ def update(
             typer.echo("No index found. Run `codebase-index index` first.")
         raise typer.Exit(code=0)
 
-    with Database(db_path) as db:
-        stats = update_index(cfg, db, root=Path(cfg.root), since=since, all_files=all_files)
+    if peek_schema_version(db_path) < SCHEMA_VERSION:
+        # Schema changed under the index; an incremental write would target old
+        # tables. Upgrade by rebuilding from scratch (the index is a derived cache).
+        _remove_db_files(db_path)
+        with Database(db_path) as db:
+            stats = build_index(cfg, db, root=Path(cfg.root))
+    else:
+        with Database(db_path) as db:
+            stats = update_index(cfg, db, root=Path(cfg.root), since=since, all_files=all_files)
 
     if is_json:
         typer.echo(

diff --git a/src/codebase_index/discovery/classify.py b/src/codebase_index/discovery/classify.py
@@ -132,3 +132,20 @@ def is_generated(path: str) -> bool:
         or name.endswith(".min.js")
         or name.endswith(".min.css")
     )
+
+
+# Directory names that mark a test tree (filename patterns live in is_test_path).
+# Matched on whole path segments / filename stems — NOT a bare substring — so
+# `contest/`, `latest.py`, or `testimonials.ts` are never mistaken for tests.
+_TEST_DIRS = {"test", "tests", "__tests__", "__test__", "testing", "spec", "specs", "e2e"}
+
+
+def is_test_path(path: str) -> bool:
+    """Return True when `path` looks like a test file.
+
+    Backslashes are normalized to `/` first, and all comparisons are
+    case-insensitive. A path matches when any directory segment is listed in
+    `_TEST_DIRS`, when the filename stem (the text before the first dot) is
+    `test`, starts with `test_` or ends with `_test`, or when the filename
+    contains `.test.` or `.spec.`.
+    """
+    pure = PurePosixPath(path.replace("\\", "/"))
+    # Directory segments only: the final part is the filename, checked below.
+    if any(part.lower() in _TEST_DIRS for part in pure.parts[:-1]):
+        return True
+    name = pure.name.lower()
+    stem = name.split(".", 1)[0]
+    if stem == "test" or stem.startswith("test_") or stem.endswith("_test"):
+        return True
+    return ".test." in name or ".spec." in name
diff --git a/src/codebase_index/graph/builder.py b/src/codebase_index/graph/builder.py
@@ -42,7 +42,7 @@ def resolve_edges(conn: sqlite3.Connection) -> int:
     for edge in edges:
         name = edge["dst_name"]
         if edge["edge_type"] == "import":
-            file_id = _module_to_file_id(suffix_map, name)
+            file_id = _module_to_file_id(suffix_map, name, lang=edge["lang"])
             if file_id is not None:
                 resolutions.append(("file", file_id, edge["id"]))
         elif edge["edge_type"] in _SYMBOL_EDGE_TYPES:
@@ -70,13 +70,36 @@ def _path_suffix_map(rows: list[sqlite3.Row]) -> dict[str, Optional[int]]:
     return mapping
 
 
+def _lang_suffixes(lang: Optional[str], base: str, rust_base: str, go_pkg: str) -> list[str]:
+    """Import-path suffixes specific to one language, most-specific first.
+
+    `base`, `rust_base` and `go_pkg` are already-normalized forms of the import
+    path supplied by the caller; each language's entry picks the form it needs.
+    Returns an empty list when `lang` is None or has no entry in the table.
+    """
+    return {
+        "python": [f"{base}.py", f"{base}/__init__.py"],
+        "typescript": [f"{base}.ts", f"{base}.tsx", f"{base}/index.ts", f"{base}/index.tsx"],
+        "javascript": [f"{base}.js", f"{base}/index.js"],
+        "java": [f"{base}.java"],
+        "kotlin": [f"{base}.kt"],
+        "go": [f"{go_pkg}.go"],
+        "rust": [
+            f"{rust_base}.rs", f"{rust_base}/mod.rs",
+            f"src/{rust_base}.rs", f"src/{rust_base}/mod.rs",
+        ],
+        "csharp": [f"{base}.cs"],
+        "ruby": [f"{base}.rb"],
+        "php": [f"{base}.php"],
+        # None is mapped to "" (never a key), so a missing language yields [].
+    }.get(lang or "", [])
+
+
 def _module_to_file_id(
-    suffix_map: dict[str, Optional[int]], module: str
+    suffix_map: dict[str, Optional[int]], module: str, lang: Optional[str] = None
 ) -> Optional[int]:
     """Resolve a module/import path to a unique file id, or None.
 
     Handles Python, TypeScript/JavaScript, Java/Kotlin/Scala, Rust (:: separator),
-    Go (last path segment), C#, Ruby, and PHP import conventions.
+    Go (last path segment), C#, Ruby, and PHP import conventions. The importing
+    file's `lang` is tried first so that, in a polyglot repo, `import './base'` from
+    a .ts file resolves to base.ts rather than a same-named base.py earlier in the
+    fixed fallback order. The fallback order is unchanged, so single-language repos
+    and the lang-unknown path behave exactly as before.
     """
     base = module.lower().replace(".", "/").strip("/")
     rust_base = module.lower().replace("::", "/").strip("/")
@@ -85,7 +108,7 @@ def _module_to_file_id(
     # Last segment used for Go package-level resolution
     go_pkg = base.rsplit("/", 1)[-1] if "/" in base else base
 
-    for suffix in (
+    fallback = (
         # Python
         f"{base}.py",
         f"{base}/__init__.py",
@@ -113,7 +136,8 @@ def _module_to_file_id(
         f"{base}.rb",
         # PHP
         f"{base}.php",
-    ):
+    )
+    for suffix in (*_lang_suffixes(lang, base, rust_base, go_pkg), *fallback):
         file_id = suffix_map.get(suffix)
         if file_id is not None:
             return file_id

diff --git a/src/codebase_index/indexer/freshness.py b/src/codebase_index/indexer/freshness.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import hashlib
 import subprocess
 from pathlib import Path
 
@@ -47,25 +48,47 @@ def compute_freshness(conn, root: Path, config: Config) -> IndexFreshness:
 
 
 def _changed_count(conn, root: Path, config: Config) -> int:
-    """Added + removed + mtime-modified indexable files vs. the index."""
-    current: dict[str, int] = {}
+    """Added + removed + content-modified indexable files vs. the index.
+
+    Mirrors the incremental update's decision (indexer/pipeline.py): a file is
+    unchanged when (mtime, size) match, and even when they differ it is only
+    counted as changed if its sha256 differs. A bare `touch` that rewrites mtime
+    without changing bytes is a no-op for update_index, so it must not register as
+    stale here either.
+    """
+    indexed = repo.fingerprints(conn)  # path -> (mtime_ns, size_bytes, sha256)
+    seen: set[str] = set()
+    changed = 0
     for cand in walk(root, config):
         try:
-            current[cand.rel_path] = cand.path.stat().st_mtime_ns
+            st = cand.path.stat()
         except OSError:
+            # Cannot stat: skipped without being marked seen, so if the path is
+            # indexed it is counted by the "not in seen" sum at the end.
             continue
-    indexed = repo.path_mtimes(conn)
-
-    changed = 0
-    for path, mtime in current.items():
-        if path not in indexed or indexed[path] != mtime:
-            changed += 1
-    for path in indexed:
-        if path not in current:
+        seen.add(cand.rel_path)
+        prior = indexed.get(cand.rel_path)
+        if prior is None:
             changed += 1
+            continue
+        # Fast path: matching (mtime, size) means unchanged, no hashing needed.
+        if prior[0] == st.st_mtime_ns and prior[1] == cand.size_bytes:
+            continue
+        try:
+            if prior[2] == _sha256_file(cand.path):
+                continue
+        except OSError:
+            # Unreadable: fall through and count the file as changed.
+            pass
+        changed += 1
+    changed += sum(1 for path in indexed if path not in seen)
     return changed
 
 
+def _sha256_file(path: Path) -> str:
+    """Return the hex sha256 digest of the file at `path`.
+
+    The file is read in 65536-byte blocks rather than loaded whole. Errors from
+    opening or reading the file are not handled here and propagate to the caller.
+    """
+    h = hashlib.sha256()
+    with path.open("rb") as fh:
+        for block in iter(lambda: fh.read(65536), b""):
+            h.update(block)
+    return h.hexdigest()
+
+
 def _git_clean_at(root: Path, indexed_head: "str | None") -> bool:
     """True iff git is available, HEAD == indexed_head, and the tree has no changes."""
     if indexed_head is None or not (root / ".git").exists():

diff --git a/src/codebase_index/retrieval/fusion.py b/src/codebase_index/retrieval/fusion.py
@@ -1,8 +1,21 @@
 """Reciprocal Rank Fusion across per-source ranked candidate lists.
 
-RRF(d) = Σ_r  w_r / (k + rank_r(d))   — robust to incomparable raw scores.
-On merge, the candidate carrying the most signal (symbol > fts > path) is kept
-as the representative so downstream rerank/snippet logic has the richest fields.
+RRF(d) = Σ_r  w_r · k / (k + rank_r(d))   — robust to incomparable raw scores.
+
+Two deliberate departures from the textbook formula:
+
+* Scaled by k. Raw RRF tops out at w/k (≈0.017 for k=60), an order of magnitude
+  below the bounded bonuses the reranker layers on top, so rerank would silently
+  become the primary ranker and RRF a mere tiebreak. Multiplying by k is a pure
+  monotonic rescale (fusion order is identical) that lifts the top contribution to
+  ≈w, putting fused scores and rerank bonuses on the same O(1) scale.
+* Fused on a coarse (path, line-bucket) key, not (path, start, end). Different
+  retrievers report different line ranges for the same place; an exact key almost
+  never coincides across sources, so cross-source agreement — RRF's whole point —
+  would never fire. `agreeing_sources` is therefore counted at file granularity.
+
+On merge, the candidate carrying the most signal (symbol > fts > path) is kept as
+the representative so downstream rerank/snippet logic has the richest fields.
 """
 
 from __future__ import annotations
@@ -26,18 +39,24 @@ def fuse(
 ) -> list[Candidate]:
     accum: dict[tuple, float] = {}
     rep: dict[tuple, Candidate] = {}
-    agree: dict[tuple, set[str]] = {}
+    seen: set[tuple] = set()
+    file_sources: dict[str, set[str]] = {}
 
     for source, candidates in lists.items():
         w = weights.get(source, 0.0)
         if w <= 0.0:
             continue
         for rank, cand in enumerate(candidates):
-            key = cand.key()
-            accum[key] = accum.get(key, 0.0) + w / (k + rank)
-            agree.setdefault(key, set()).add(source)
+            file_sources.setdefault(cand.path, set()).add(source)
+            key = cand.fuse_key()
+            # One contribution per source per locator: a file matching three FTS
+            # chunks in the same bucket is one lexical signal, not three.
+            if (source, key) in seen:
+                continue
+            seen.add((source, key))
+            accum[key] = accum.get(key, 0.0) + w * k / (k + rank)
             rep[key] = _richer(rep[key], cand) if key in rep else cand
 
     fused = [_replace(rep[key], score=score) for key, score in accum.items()]
     fused.sort(key=lambda c: c.score, reverse=True)
-    return [_replace(c, agreeing_sources=len(agree[c.key()])) for c in fused]
+    return [_replace(c, agreeing_sources=len(file_sources[c.path])) for c in fused]