diff --git a/Makefile b/Makefile index 12af6d0c6c..de00b1860f 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ REGRESS = security rum rum_validate rum_hash ruminv timestamp \ int2 int4 int8 float4 float8 money oid \ time timetz date interval \ macaddr inet cidr text varchar char bytea bit varbit \ - numeric rum_weight expr array + numeric rum_weight expr array rum_vacuum TAP_TESTS = 1 diff --git a/expected/rum_vacuum.out b/expected/rum_vacuum.out new file mode 100644 index 0000000000..a3d1293c96 --- /dev/null +++ b/expected/rum_vacuum.out @@ -0,0 +1,154 @@ +-- Test RUM index scan correctness after concurrent VACUUM removes all +-- posting tree entry items. +SET enable_seqscan TO off; +SET enable_indexscan TO off; +SET enable_bitmapscan TO on; +CREATE TABLE test_rum_vacuum (id int, body tsvector); +ALTER TABLE test_rum_vacuum SET (autovacuum_enabled = false); +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great ann') FROM generate_series(1, 6) i; +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great john') FROM generate_series(7, 10000) i; +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great james') FROM generate_series(10001, 10003) i; +CREATE INDEX ON test_rum_vacuum USING rum (body rum_tsvector_ops); +DELETE FROM test_rum_vacuum WHERE body @@ 'ann'::tsquery AND id <= 5; +DELETE FROM test_rum_vacuum WHERE body @@ 'john'::tsquery AND id <= 9999; +-- test normal result +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); + id | body +----+------------------- + 6 | 'ann':2 'great':1 +(1 row) + +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); + id | body +-------+-------------------- + 10000 | 'great':1 'john':2 +(1 row) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 6 | 'ann':2 'great':1 + 10000 | 'great':1 'john':2 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(5 rows) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 10000 | 'great':1 'john':2 + 6 | 'ann':2 'great':1 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(5 rows) + +VACUUM test_rum_vacuum; +-- this shouldn't cause a core dump +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); + id | body +----+------------------- + 6 | 'ann':2 'great':1 +(1 row) + +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); + id | body +-------+-------------------- + 10000 | 'great':1 'john':2 +(1 row) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 6 | 'ann':2 'great':1 + 10000 | 'great':1 'john':2 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(5 rows) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 10000 | 'great':1 'john':2 + 6 | 'ann':2 'great':1 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(5 rows) + +-- test that data can still be found after reinsertion +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great john') FROM generate_series(10004, 20000) i; +SELECT count(*) FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); + count +------- + 9998 +(1 row) + +DELETE FROM test_rum_vacuum WHERE body @@ 'john'::tsquery AND id <= 19999; +VACUUM test_rum_vacuum; +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); + id | body +----+------------------- + 6 | 'ann':2 'great':1 +(1 row) + +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); + id | body +-------+-------------------- + 20000 | 'great':1 'john':2 +(1 row) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 6 | 'ann':2 'great':1 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 + 20000 | 'great':1 'john':2 +(5 rows) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 20000 | 'great':1 'john':2 + 6 | 'ann':2 'great':1 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(5 rows) + +-- test if do while loop works when an entry has no non-empty posting tree pages +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great john') FROM generate_series(7, 10000) i; +DELETE FROM test_rum_vacuum WHERE body @@ 'john'::tsquery; +VACUUM test_rum_vacuum; +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); + id | body +----+------------------- + 6 | 'ann':2 'great':1 +(1 row) + +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); + id | body +----+------ +(0 rows) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 6 | 'ann':2 'great':1 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(4 rows) + +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + id | body +-------+-------------------- + 6 | 'ann':2 'great':1 + 10001 | 'great':1 'jame':2 + 10002 | 'great':1 'jame':2 + 10003 | 'great':1 'jame':2 +(4 rows) diff --git a/sql/rum_vacuum.sql b/sql/rum_vacuum.sql new file mode 100644 index 0000000000..cc9d087a01 --- /dev/null +++ b/sql/rum_vacuum.sql @@ -0,0 +1,54 @@ +-- Test RUM index scan correctness after concurrent VACUUM removes all +-- posting tree entry items. + +SET enable_seqscan TO off; +SET enable_indexscan TO off; +SET enable_bitmapscan TO on; + +CREATE TABLE test_rum_vacuum (id int, body tsvector); +ALTER TABLE test_rum_vacuum SET (autovacuum_enabled = false); + +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great ann') FROM generate_series(1, 6) i; +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great john') FROM generate_series(7, 10000) i; +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great james') FROM generate_series(10001, 10003) i; + +CREATE INDEX ON test_rum_vacuum USING rum (body rum_tsvector_ops); + +DELETE FROM test_rum_vacuum WHERE body @@ 'ann'::tsquery AND id <= 5; +DELETE FROM test_rum_vacuum WHERE body @@ 'john'::tsquery AND id <= 9999; + +-- test normal result +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + +VACUUM test_rum_vacuum; + +-- this shouldn't cause a core dump +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + +-- test that data can still be found after reinsertion +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great john') FROM generate_series(10004, 20000) i; +SELECT count(*) FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); +DELETE FROM test_rum_vacuum WHERE body @@ 'john'::tsquery AND id <= 19999; + +VACUUM test_rum_vacuum; + +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; + +-- test if do while loop works when an entry has no non-empty posting tree pages +INSERT INTO test_rum_vacuum SELECT i, to_tsvector('great john') FROM generate_series(7, 10000) i; +DELETE FROM test_rum_vacuum WHERE body @@ 'john'::tsquery; +VACUUM test_rum_vacuum; + +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('ann'); +SELECT * FROM test_rum_vacuum WHERE body @@ to_tsquery('john'); +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('ann') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('ann')) AS sub ORDER BY distance ASC, id ASC; +SELECT id, body FROM (SELECT id, body, body <=> to_tsquery('john') AS distance FROM test_rum_vacuum ORDER BY body <=> to_tsquery('john')) AS sub ORDER BY distance ASC, id ASC; diff --git a/src/rumget.c b/src/rumget.c index a36229f59f..bfd02e2907 100644 --- a/src/rumget.c +++ b/src/rumget.c @@ -684,9 +684,23 @@ startScanEntry(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) } LockBuffer(entry->buffer, RUM_UNLOCK); - entry->isFinished = setListPositionScanEntry(rumstate, entry); - if (!entry->isFinished) - entry->curItem = entry->list[entry->offset]; + + /* + * If the current page is empty (nlist == 0), we cannot assume the + * scan is complete, as subsequent pages may exist. Therefore, we + * set isFinished = false and leave entry->nlist = 0 and + * entry->offset = 0 to ensure that entryGetItem advances to the + * next page on the next call. Otherwise, initialize curItem to + * the first valid item. + */ + if (entry->nlist == 0) + entry->isFinished = false; + else + { + entry->isFinished = setListPositionScanEntry(rumstate, entry); + if (!entry->isFinished) + entry->curItem = entry->list[entry->offset]; + } } else if (RumGetNPosting(itup) > 0) { @@ -699,6 +713,16 @@ startScanEntry(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) if (!entry->isFinished) entry->curItem = entry->list[entry->offset]; } + /* + * Else, the posting list for this entry has been entirely vacuumed + * away (nlist == 0 after setListPositionScanEntry). We cannot assume + * the scan is complete, as subsequent pages may exist. Therefore, we + * set isFinished = false and leave entry->nlist = 0 and entry->offset + * = 0 to ensure that entryGetItem advances to the next page on the + * next call. + */ + else + entry->isFinished = false; if (entry->queryCategory == RUM_CAT_EMPTY_QUERY && entry->scanWithAddInfo) @@ -1011,6 +1035,13 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) LockBuffer(entry->buffer, RUM_UNLOCK); + /* + * No valid item if VACUUM removed all items concurrently. Go on + * next page. + */ + if (entry->nlist == 0) + break; + if (entry->offset < 0) { if (ScanDirectionIsForward(entry->scanDirection) && @@ -1044,6 +1075,7 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) RumItemSetMin(&entry->curItem); entry->offset = InvalidOffsetNumber; entry->list = NULL; + entry->nlist = 0; if (entry->gdi) { freeRumBtreeStack(entry->gdi->stack); @@ -1151,6 +1183,18 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) LockBuffer(entry->buffer, RUM_UNLOCK); entry->isFinished = false; + + /* + * Posting tree's first leaf page is empty due to concurrent VACUUM. + * Advance through empty pages until we find one with items or exhaust + * the tree. entryGetItem does not re-invoke entryGetNextItem after we + * return, so we must do it here to ensure curItem is valid on return. + */ + if (entry->nlist == 0) + { + entryGetNextItem(rumstate, entry, snapshot); + goto entry_done; + } } else if (RumGetNPosting(itup) > 0) { @@ -1161,12 +1205,21 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) rumReadTuple(rumstate, entry->attnum, itup, entry->list, true); entry->isFinished = setListPositionScanEntry(rumstate, entry); } + /* Posting list has been vacuumed. Go to the next entry. */ + else + { + ItemPointerSetInvalid(&entry->curItem.iptr); + entry->isFinished = true; + goto entry_done; + } Assert(entry->nlist > 0 && entry->list); entry->curItem = entry->list[entry->offset]; entry->offset += entry->scanDirection; +entry_done: + SCAN_ENTRY_GET_KEY(entry, rumstate, itup); /* @@ -1340,8 +1393,24 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList, Snaps } else if (entry->stack) { - entry->offset++; - if (entryGetNextItemList(rumstate, entry, snapshot) && nextEntryList) + /* + * We are responsible for ensuring that we keep advancing through + * ItemLists until we find one that contains at least one valid + * item. This is necessary because concurrent VACUUM may have + * removed all items from a page, leaving an empty ItemList. In + * such cases, we must continue to the next ItemList. + */ + bool success; + + Assert(!entry->isFinished); + + do + { + entry->isFinished = false; + success = entryGetNextItemList(rumstate, entry, snapshot); + } while (success && entry->nlist == 0); + + if (success && nextEntryList) *nextEntryList = true; } else @@ -1361,8 +1430,22 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList, Snaps dropItem(entry)); if (entry->stack && entry->isFinished) { - entry->isFinished = false; - if (entryGetNextItemList(rumstate, entry, snapshot) && nextEntryList) + /* + * We are responsible for ensuring that we keep advancing through + * ItemLists until we find one that contains at least one valid + * item. This is necessary because concurrent VACUUM may have + * removed all items from a page, leaving an empty ItemList. In + * such cases, we must continue to the next ItemList. + */ + bool success; + + do + { + entry->isFinished = false; + success = entryGetNextItemList(rumstate, entry, snapshot); + } while (success && entry->nlist == 0); + + if (success && nextEntryList) *nextEntryList = true; } }