diff --git a/.changeset/index-optimization-partial-and-or.md b/.changeset/index-optimization-partial-and-or.md new file mode 100644 index 000000000..744f3110d --- /dev/null +++ b/.changeset/index-optimization-partial-and-or.md @@ -0,0 +1,16 @@ +--- +'@tanstack/db': patch +--- + +Fix incorrect results from index-optimized `where` clauses that combine indexed and non-indexed conditions. + +- `OR` expressions are now only served from indexes when every disjunct can use an index; otherwise the query falls back to a full scan. Previously, rows matched only by a non-indexed disjunct were missing from the result. +- `AND` expressions still use indexes for the conditions that have them, but the remaining conditions are now enforced by re-checking each candidate row against the full expression. Previously, non-indexed conditions were silently dropped, returning rows that did not match the query. +- Compound range conditions (e.g. `age > 5 AND age < 10`) combined with conditions on other fields no longer ignore those other conditions. +- Compound range conditions sharing the same boundary value (e.g. `age >= 5 AND age > 5`) now apply the strictest bound regardless of the order the conditions appear in, using the same value comparison semantics as the indexes (dates, locale strings, ...). +- Compound range conditions that only bound one side (e.g. `age > 5 AND age >= 8`) no longer return an empty result. +- Strict range comparisons (`gt`/`lt`) on BTree-indexed fields holding normalized values such as dates now correctly exclude the boundary value. +- Compound range conditions with a `null`/`undefined` bound (e.g. `gt(score, undefined)`) now re-filter against the full expression instead of returning index-ordered rows, matching the semantics of a full scan (a comparison against `null`/`undefined` is never true). +- Index-optimized `eq`, `IN`, and range queries on a field that has rows with `null`/`undefined` values no longer leak those rows into results. BTree indexes store and return such rows (they sort as the smallest key), but a comparison against `null`/`undefined` is never true, so these results are now re-filtered against the full expression to stay equivalent to a full scan. +- Index-optimized `eq` and `IN` against `NaN` no longer leak `NaN`-valued rows. `NaN` is never equal to itself, but indexes still return such rows, so these results are now re-filtered against the full expression. +- String range conditions (`gt`/`gte`/`lt`/`lte`) on a collection using locale string collation (the default) are no longer served by the index. The index orders strings with `localeCompare` while the `where` evaluator compares them with standard relational operators, so an index range lookup could omit matching rows; these conditions now fall back to a full scan. diff --git a/packages/db/src/collection/change-events.ts b/packages/db/src/collection/change-events.ts index 6afac412c..3f4977b7b 100644 --- a/packages/db/src/collection/change-events.ts +++ b/packages/db/src/collection/change-events.ts @@ -138,11 +138,16 @@ export function currentStateAsChanges< ) if (optimizationResult.canOptimize) { - // Use index optimization + // Use index optimization. When the index lookup is inexact, the keys + // are a superset of the true result (some conditions could not be + // served by an index), so re-check each row against the full expression. + const filterFn = optimizationResult.isExact + ? undefined + : createFilterFunctionFromExpression(expression) const result: Array, TKey>> = [] for (const key of optimizationResult.matchingKeys) { const value = collection.get(key) - if (value !== undefined) { + if (value !== undefined && (filterFn?.(value) ?? true)) { result.push({ type: `insert`, key, diff --git a/packages/db/src/indexes/btree-index.ts b/packages/db/src/indexes/btree-index.ts index 17608950a..b46c70710 100644 --- a/packages/db/src/indexes/btree-index.ts +++ b/packages/db/src/indexes/btree-index.ts @@ -247,7 +247,16 @@ export class BTreeIndex< toKey, toInclusive, (indexedValue, _) => { - if (!fromInclusive && this.compareFn(indexedValue, from) === 0) { + // Only exclude the boundary when an exclusive lower bound was + // actually provided. Without a `from` bound, `fromKey` defaults to + // the minimum key and must not be dropped. Compare against the + // normalized key since indexed values are stored normalized + // (e.g. dates as timestamps), so the raw `from` would never match. + if ( + hasFrom && + !fromInclusive && + this.compareFn(indexedValue, fromKey) === 0 + ) { // the B+ tree `forRange` method does not support exclusive lower bounds // so we need to exclude it manually return diff --git a/packages/db/src/utils/index-optimization.ts b/packages/db/src/utils/index-optimization.ts index 81b111af5..730f09b04 100644 --- a/packages/db/src/utils/index-optimization.ts +++ b/packages/db/src/utils/index-optimization.ts @@ -18,6 +18,7 @@ import { DEFAULT_COMPARE_OPTIONS } from '../utils.js' import { ReverseIndex } from '../indexes/reverse-index.js' import { hasVirtualPropPath } from '../virtual-props.js' +import { makeComparator } from './comparison.js' import type { CompareOptions } from '../query/builder/types.js' import type { IndexInterface, IndexOperation } from '../indexes/base-index.js' import type { BasicExpression } from '../query/ir.js' @@ -29,6 +30,13 @@ import type { CollectionLike } from '../types.js' export interface OptimizationResult { canOptimize: boolean matchingKeys: Set + /** + * Whether `matchingKeys` is exactly the set of keys matching the expression. + * When `false`, the keys are a superset of the true result (some conditions + * could not be served by an index) and each row must be re-checked against + * the full expression before being included in the result. + */ + isExact: boolean } /** @@ -94,6 +102,59 @@ export function unionSets(sets: Array>): Set { return result } +/** + * Whether a value can be matched exactly by an index lookup, i.e. the index + * result for it is not a superset that the caller must re-filter. + * + * The WHERE evaluator uses three-valued logic: a comparison against + * `null`/`undefined` yields UNKNOWN, and `NaN` is never equal to itself. BTree + * indexes, however, store and return rows with such values (nullish keys sort + * as the smallest key; `NaN` keys are found via SameValueZero map equality), so + * any result that could include them must be treated as inexact. + */ +function isExactComparisonValue(value: unknown): boolean { + if (value == null) return false + if (typeof value === `number` && Number.isNaN(value)) return false + return true +} + +/** + * Whether the collection orders strings using locale collation. + * + * Under `stringSort: 'locale'` a BTree string index orders values with + * `localeCompare`, but the WHERE evaluator compares strings with JS relational + * operators (code-point order). For range predicates these orders disagree + * (e.g. `'ö' > 'z'` is true in JS but `'ö'` sorts before `'z'` under locale + * `en`), so an index range lookup can omit matching rows. Such omissions cannot + * be recovered by re-filtering, so locale-backed string range predicates must + * not be index-optimized. + */ +function usesLocaleStringSort(collection: CollectionLike): boolean { + const opts = { ...DEFAULT_COMPARE_OPTIONS, ...collection.compareOptions } + return opts.stringSort === `locale` +} + +/** + * Whether a range predicate cannot be safely served by an index because the + * index orders strings with locale collation while the WHERE evaluator uses JS + * relational operators. See {@link usesLocaleStringSort}. + */ +function isLocaleStringRangeUnsafe( + operation: string, + value: unknown, + collection: CollectionLike, +): boolean { + if ( + operation !== `gt` && + operation !== `gte` && + operation !== `lt` && + operation !== `lte` + ) { + return false + } + return typeof value === `string` && usesLocaleStringSort(collection) +} + /** * Optimizes a query expression using available indexes to find matching keys */ @@ -134,7 +195,7 @@ function optimizeQueryRecursive( } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** @@ -167,6 +228,14 @@ export function canOptimizeExpression< return false } +/** + * Result of compound range optimization, including which AND arguments + * were covered by the range query so the caller can process the rest. + */ +interface CompoundRangeResult extends OptimizationResult { + coveredArgIndices: Set +} + /** * Optimizes compound range queries on the same field * Example: WHERE age > 5 AND age < 10 @@ -177,9 +246,14 @@ function optimizeCompoundRangeQuery< >( expression: BasicExpression, collection: CollectionLike, -): OptimizationResult { +): CompoundRangeResult { if (expression.type !== `func` || expression.args.length < 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { + canOptimize: false, + matchingKeys: new Set(), + isExact: false, + coveredArgIndices: new Set(), + } } // Group range operations by field @@ -188,11 +262,12 @@ function optimizeCompoundRangeQuery< Array<{ operation: `gt` | `gte` | `lt` | `lte` value: any + argIndex: number }> >() // Collect all range operations from AND arguments - for (const arg of expression.args) { + for (const [argIndex, arg] of expression.args.entries()) { if (arg.type === `func` && [`gt`, `gte`, `lt`, `lte`].includes(arg.name)) { const rangeOp = arg as any if (rangeOp.args.length === 2) { @@ -238,7 +313,7 @@ function optimizeCompoundRangeQuery< if (!fieldOperations.has(fieldKey)) { fieldOperations.set(fieldKey, []) } - fieldOperations.get(fieldKey)!.push({ operation, value }) + fieldOperations.get(fieldKey)!.push({ operation, value, argIndex }) } } } @@ -250,55 +325,113 @@ function optimizeCompoundRangeQuery< const fieldPath = fieldKey.split(`.`) const index = findIndexForField(collection, fieldPath) + // A locale-backed string range cannot be served by the index (it may + // omit matching rows that re-filtering cannot recover), so leave this + // field for a full scan instead of collapsing it into a range query. + if ( + operations.some((op) => + isLocaleStringRangeUnsafe(op.operation, op.value, collection), + ) + ) { + continue + } + if (index && index.supports(`gt`) && index.supports(`lt`)) { - // Build range query options + // Compare values with the same semantics the index uses (dates, + // locale strings, ...), in ascending order since bounds are about + // value order regardless of the index direction + const compare = makeComparator({ + ...DEFAULT_COMPARE_OPTIONS, + ...collection.compareOptions, + direction: `asc`, + }) + + // Build range query options, keeping the strictest bound on each + // side: a larger lower bound (or smaller upper bound) wins, and at + // equal values the exclusive operation wins over the inclusive one. + // `hasFromBound`/`hasToBound` track whether a bound was selected, + // separately from the bound value (which may legitimately be falsy). let from: any = undefined let to: any = undefined + let hasFromBound = false + let hasToBound = false let fromInclusive = true let toInclusive = true + // A comparison against null/undefined/NaN is never true, but in an + // index nullish values sort as the smallest key (and NaN is retained), + // so a range query cannot represent such a bound. Track it and force a + // re-filter instead of claiming the result is exact. + let hasNonComparableBound = false for (const { operation, value } of operations) { + if (!isExactComparisonValue(value)) { + hasNonComparableBound = true + continue + } switch (operation) { case `gt`: - if (from === undefined || value > from) { + case `gte`: { + const cmp = hasFromBound ? compare(value, from) : 1 + if (cmp > 0) { from = value + hasFromBound = true + fromInclusive = operation === `gte` + } else if (cmp === 0 && operation === `gt`) { fromInclusive = false } break - case `gte`: - if (from === undefined || value > from) { - from = value - fromInclusive = true - } - break + } case `lt`: - if (to === undefined || value < to) { + case `lte`: { + const cmp = hasToBound ? compare(value, to) : -1 + if (cmp < 0) { to = value + hasToBound = true + toInclusive = operation === `lte` + } else if (cmp === 0 && operation === `lt`) { toInclusive = false } break - case `lte`: - if (to === undefined || value < to) { - to = value - toInclusive = true - } - break + } } } - const matchingKeys = (index as any).rangeQuery({ - from, - to, - fromInclusive, - toInclusive, - }) - - return { canOptimize: true, matchingKeys } + // Only pass the bounds that were selected: rangeQuery distinguishes + // an absent bound (open-ended) from an explicitly provided one + const rangeOptions: Record = {} + if (hasFromBound) { + rangeOptions.from = from + rangeOptions.fromInclusive = fromInclusive + } + if (hasToBound) { + rangeOptions.to = to + rangeOptions.toInclusive = toInclusive + } + const matchingKeys = (index as any).rangeQuery(rangeOptions) + + return { + canOptimize: true, + matchingKeys, + // The range result is exact only when it cannot include rows with a + // nullish indexed value (which a comparison would reject but the + // index returns, as they sort as the smallest key). That requires a + // non-nullish lower bound to exclude them: without `hasFromBound` + // the range is open at the bottom and captures those rows, and a + // non-comparable bound value (`hasNonComparableBound`) can never + // bound them out. + isExact: hasFromBound && !hasNonComparableBound, + coveredArgIndices: new Set(operations.map((op) => op.argIndex)), + } } } } - return { canOptimize: false, matchingKeys: new Set() } + return { + canOptimize: false, + matchingKeys: new Set(), + isExact: false, + coveredArgIndices: new Set(), + } } /** @@ -312,7 +445,7 @@ function optimizeSimpleComparison< collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length !== 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const leftArg = expression.args[0]! @@ -362,15 +495,38 @@ function optimizeSimpleComparison< // Check if the index supports this operation if (!index.supports(indexOperation)) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } + } + + // A locale-backed string range cannot be served by the index: it may + // omit matching rows, which re-filtering cannot recover. Fall back to a + // full scan instead. + if (isLocaleStringRangeUnsafe(operation, queryValue, collection)) { + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const matchingKeys = index.lookup(indexOperation, queryValue) - return { canOptimize: true, matchingKeys } + + // A comparison against a nullish or NaN value is never true, but BTree + // indexes store and return rows with such values (nullish keys sort as + // the smallest key; NaN keys match via SameValueZero map equality). + // Determine whether the index result is exact or a superset that the + // caller must re-filter: + // - eq/gt/gte: such a query value matches nothing while the index still + // returns those rows -> inexact. A non-nullish lower bound (gt/gte) + // excludes the bottom-sorted nullish rows, so those stay exact. + // - lt/lte: the open lower bound always includes nullish-keyed rows, + // so the result is conservatively inexact. + const isExact = + operation === `lt` || operation === `lte` + ? false + : isExactComparisonValue(queryValue) + + return { canOptimize: true, matchingKeys, isExact } } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** @@ -412,22 +568,41 @@ function optimizeAndExpression( collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length < 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } // First, try to optimize compound range queries on the same field + // (e.g. age > 5 AND age < 10 becomes a single range query) const compoundRangeResult = optimizeCompoundRangeQuery(expression, collection) - if (compoundRangeResult.canOptimize) { - return compoundRangeResult - } + const coveredArgIndices = compoundRangeResult.canOptimize + ? compoundRangeResult.coveredArgIndices + : new Set() const results: Array> = [] + if (compoundRangeResult.canOptimize) { + results.push(compoundRangeResult) + } - // Try to optimize each part, keep the optimizable ones - for (const arg of expression.args) { + // Try to optimize the remaining conjuncts, keep the optimizable ones. + // Conjuncts that cannot use an index make the result inexact: the + // intersection is then a superset of the true result and must be + // re-filtered against the full expression by the caller. The compound + // range result may itself be inexact (e.g. a null/undefined bound). + let allConjunctsExact = !compoundRangeResult.canOptimize + ? true + : compoundRangeResult.isExact + for (const [argIndex, arg] of expression.args.entries()) { + if (coveredArgIndices.has(argIndex)) { + continue + } const result = optimizeQueryRecursive(arg, collection) if (result.canOptimize) { results.push(result) + if (!result.isExact) { + allConjunctsExact = false + } + } else { + allConjunctsExact = false } } @@ -435,10 +610,14 @@ function optimizeAndExpression( // Use intersectSets utility for AND logic const allMatchingSets = results.map((r) => r.matchingKeys) const intersectedKeys = intersectSets(allMatchingSets) - return { canOptimize: true, matchingKeys: intersectedKeys } + return { + canOptimize: true, + matchingKeys: intersectedKeys, + isExact: allConjunctsExact, + } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** @@ -464,27 +643,31 @@ function optimizeOrExpression( collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length < 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const results: Array> = [] - // Try to optimize each part, keep the optimizable ones + // Every disjunct must be optimizable: rows matched only by a disjunct + // that cannot use an index would be missing from the union, and no + // post-filtering can recover them. In that case fall back to a full scan. for (const arg of expression.args) { const result = optimizeQueryRecursive(arg, collection) - if (result.canOptimize) { - results.push(result) + if (!result.canOptimize) { + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } + results.push(result) } - if (results.length > 0) { - // Use unionSets utility for OR logic - const allMatchingSets = results.map((r) => r.matchingKeys) - const unionedKeys = unionSets(allMatchingSets) - return { canOptimize: true, matchingKeys: unionedKeys } + // Use unionSets utility for OR logic + const allMatchingSets = results.map((r) => r.matchingKeys) + const unionedKeys = unionSets(allMatchingSets) + return { + canOptimize: true, + matchingKeys: unionedKeys, + // An inexact (superset) disjunct makes the union a superset as well + isExact: results.every((r) => r.isExact), } - - return { canOptimize: false, matchingKeys: new Set() } } /** @@ -498,8 +681,9 @@ function canOptimizeOrExpression< return false } - // If any argument can be optimized, we can gain some speedup - return expression.args.some((arg) => canOptimizeExpression(arg, collection)) + // Every disjunct must be optimizable, otherwise the union would miss + // rows matched only by the non-optimizable disjuncts + return expression.args.every((arg) => canOptimizeExpression(arg, collection)) } /** @@ -513,7 +697,7 @@ function optimizeInArrayExpression< collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length !== 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const fieldArg = expression.args[0]! @@ -528,11 +712,17 @@ function optimizeInArrayExpression< const values = (arrayArg as any).value const index = findIndexForField(collection, fieldPath) + // A nullish or NaN member can never be matched by `IN` (a comparison + // against null/undefined/NaN is never true), but the index would still + // return rows with such an indexed value. When the list contains one of + // those the result is a superset that the caller must re-filter. + const isExact = values.every((value: any) => isExactComparisonValue(value)) + if (index) { // Check if the index supports IN operation if (index.supports(`in`)) { const matchingKeys = index.lookup(`in`, values) - return { canOptimize: true, matchingKeys } + return { canOptimize: true, matchingKeys, isExact } } else if (index.supports(`eq`)) { // Fallback to multiple equality lookups const matchingKeys = new Set() @@ -542,12 +732,12 @@ function optimizeInArrayExpression< matchingKeys.add(key) } } - return { canOptimize: true, matchingKeys } + return { canOptimize: true, matchingKeys, isExact } } } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** diff --git a/packages/db/tests/btree-index-undefined-values.test.ts b/packages/db/tests/btree-index-undefined-values.test.ts index 11e29690c..1510c02e3 100644 --- a/packages/db/tests/btree-index-undefined-values.test.ts +++ b/packages/db/tests/btree-index-undefined-values.test.ts @@ -248,6 +248,24 @@ describe(`BTreeIndex - undefined value handling`, () => { expect(withoutFrom.size).toBe(3) }) + it(`should not drop the minimum key when an upper-only range is exclusive on the (absent) lower bound`, () => { + // When no `from` bound is provided, `fromInclusive` must not cause the + // smallest key to be excluded: there is no lower bound to exclude + // against. Only an explicitly provided exclusive lower bound should + // drop its boundary value. + const index = createIndex(`value`) + index.add(`a`, { value: 1 }) + index.add(`b`, { value: 5 }) + index.add(`c`, { value: 10 }) + + const result = index.rangeQuery({ to: 10, fromInclusive: false }) + + expect(result.size).toBe(3) + expect(result).toContain(`a`) + expect(result).toContain(`b`) + expect(result).toContain(`c`) + }) + it(`should handle range query from undefined to undefined`, () => { const index = createIndex(`value`) index.add(`a`, { value: undefined })