diff --git a/rolling_hash/__init__.py b/rolling_hash/__init__.py
new file mode 100644
index 000000000000..9ad335324807
--- /dev/null
+++ b/rolling_hash/__init__.py
@@ -0,0 +1 @@
+"""Rolling hash algorithms for string matching and similarity."""
diff --git a/rolling_hash/rabin_karp.py b/rolling_hash/rabin_karp.py
new file mode 100644
index 000000000000..b2cae4dc33f5
--- /dev/null
+++ b/rolling_hash/rabin_karp.py
@@ -0,0 +1,91 @@
+"""Rabin-Karp rolling hash algorithm for substring search.
+
+Implements the classic Rabin-Karp algorithm using a rolling hash to find
+all occurrences of a pattern in a text in O(n + m) average time, where n
+is the text length and m is the pattern length.
+
+The algorithm uses a polynomial rolling hash reduced modulo a prime to keep
+hash values small. Every hash hit is confirmed by a direct comparison, so
+results are exact for arbitrary Unicode strings.
+
+References:
+- Karp, R. M., & Rabin, M. O. (1987). Efficient randomized pattern-matching
+  algorithms. IBM Journal of Research and Development, 31(2), 249-260.
+"""
+from typing import List
+
+
+def rabin_karp(
+    text: str,
+    pattern: str,
+    *,
+    base: int = 256,
+    prime: int = 1_000_000_007,
+) -> List[int]:
+    """Return starting indices of pattern in text using a rolling hash.
+
+    Args:
+        text: The text to search within.
+        pattern: The pattern to find.
+        base: Radix of the polynomial hash.
+        prime: Modulus of the polynomial hash; must be positive. A large
+            prime keeps spurious hash collisions rare.
+
+    Returns:
+        List of starting indices (0-based) where pattern occurs, in
+        increasing order. Overlapping occurrences are all reported.
+
+    Raises:
+        ValueError: If prime is not positive.
+
+    Example:
+        >>> rabin_karp("abracadabra", "abra")
+        [0, 7]
+    """
+    if prime < 1:
+        raise ValueError("prime must be a positive integer")
+
+    m, n = len(pattern), len(text)
+    if m == 0:
+        # By convention, the empty pattern matches at every position,
+        # including the position just past the end of the text.
+        return list(range(n + 1))
+    if m > n:
+        return []
+
+    # Weight of the window's leading character: base^(m-1) mod prime.
+    lead_weight = pow(base, m - 1, prime)
+
+    # Hash the pattern and the first window of the text.
+    pattern_hash = 0
+    window_hash = 0
+    for pattern_char, text_char in zip(pattern, text):
+        pattern_hash = (base * pattern_hash + ord(pattern_char)) % prime
+        window_hash = (base * window_hash + ord(text_char)) % prime
+
+    matches: List[int] = []
+    last_start = n - m
+    for i in range(last_start + 1):
+        # Equal hashes only suggest a match; compare to rule out collisions.
+        if pattern_hash == window_hash and text.startswith(pattern, i):
+            matches.append(i)
+        if i < last_start:
+            # Roll: drop text[i], append text[i + m]. Python's % already
+            # yields a non-negative result for a positive modulus.
+            window_hash = (
+                base * (window_hash - ord(text[i]) * lead_weight)
+                + ord(text[i + m])
+            ) % prime
+    return matches
+
+
+def demo() -> None:
+    """Run a simple demonstration."""
+    text = "abracadabra"
+    pattern = "abra"
+    indices = rabin_karp(text, pattern)
+    print(f"Pattern '{pattern}' found at positions: {indices}")
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/tests/test_rolling_hash.py b/tests/test_rolling_hash.py
new file mode 100644
index 000000000000..da97fedf355c
--- /dev/null
+++ b/tests/test_rolling_hash.py
@@ -0,0 +1,65 @@
+"""Tests for rolling hash Rabin-Karp implementation."""
+import pytest
+from rolling_hash.rabin_karp import rabin_karp
+
+
+def test_basic_matches():
+    assert rabin_karp("abracadabra", "abra") == [0, 7]
+    assert rabin_karp("aaaaa", "aa") == [0, 1, 2, 3]
+    assert rabin_karp("hello world", "world") == [6]
+
+
+def test_no_match():
+    assert rabin_karp("abcdef", "gh") == []
+    assert rabin_karp("abc", "abcd") == []
+
+
+def test_empty_pattern():
+    # Empty pattern matches at every position (including end)
+    assert rabin_karp("abc", "") == [0, 1, 2, 3]
+    assert rabin_karp("", "") == [0]
+
+
+def test_single_character():
+    assert rabin_karp("a", "a") == [0]
+    assert rabin_karp("ab", "a") == [0]
+    assert rabin_karp("ab", "b") == [1]
+
+
+def test_overlapping():
+    text = "aaa"
+    pattern = "aa"
+    assert rabin_karp(text, pattern) == [0, 1]
+
+
+def test_case_sensitive():
+    assert rabin_karp("ABCabc", "abc") == [3]
+    assert rabin_karp("ABCabc", "ABC") == [0]
+
+
+def test_unicode():
+    # Unicode characters
+    assert rabin_karp("你好世界你好", "你好") == [0, 4]
+
+
+def test_long_pattern():
+    text = "a" * 1000
+    pattern = "a" * 100
+    expected = list(range(0, 901))
+    assert rabin_karp(text, pattern) == expected
+
+
+def test_hash_collisions_are_verified():
+    # With prime=1 every window hashes to 0, so only the direct
+    # comparison can reject the non-matching windows.
+    assert rabin_karp("abcabc", "bc", prime=1) == [1, 4]
+    assert rabin_karp("abcabc", "cb", prime=1) == []
+
+
+def test_custom_parameters():
+    assert rabin_karp("abracadabra", "abra", base=31, prime=101) == [0, 7]
+
+
+def test_invalid_prime():
+    with pytest.raises(ValueError):
+        rabin_karp("abc", "a", prime=0)