From 151a820c72161aacde623b444ab1f137689a36f5 Mon Sep 17 00:00:00 2001 From: Haidong Ji Date: Sun, 17 Mar 2019 11:53:40 -0500 Subject: Rabin-Karp string search done! Not too bad to convert Java to Python. Converting the for loop and playing indexing is interesting, good exercise. --- sources/hash_substring.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ tests/hash_substringTest.py | 31 +++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 sources/hash_substring.py create mode 100644 tests/hash_substringTest.py diff --git a/sources/hash_substring.py b/sources/hash_substring.py new file mode 100644 index 0000000..260084a --- /dev/null +++ b/sources/hash_substring.py @@ -0,0 +1,56 @@ +# python3 +import random + + +def read_input(): + return (input().rstrip(), input().rstrip()) + + +def print_occurrences(output): + print(' '.join(map(str, output))) + + +def get_occurrences(pattern, text): + return [ + i + for i in range(len(text) - len(pattern) + 1) + if text[i:i + len(pattern)] == pattern + ] + + +def poly_hash(S, p, x): + ans = 0 + for c in reversed(S): + ans = (ans * x + ord(c)) % p + + return ans + + +def pre_compute_hashes(T, pattenLength, p, x): + H = [0] * (len(T) - pattenLength + 1) + S = T[len(T) - pattenLength:] + H[len(T) - pattenLength] = poly_hash(S, p, x) + y = 1 + for _ in range(pattenLength): + y = (y * x) % p + for i in reversed(range(len(T) - pattenLength)): + H[i] = (x * H[i + 1] + ord(T[i]) - y * ord(T[i + pattenLength])) % p + return H + + +def rabin_karp(P, T): + p = 1000000007 + x = random.randint(1, 1000000006) + positions = [] + pHash = poly_hash(P, p, x) + H = pre_compute_hashes(T, len(P), p, x) + for i in range(len(T) - len(P) + 1): + if pHash != H[i]: + continue + if T[i:i + len(P)] == P: + positions.append(i) + return positions + + +if __name__ == '__main__': + print_occurrences(rabin_karp(*read_input())) diff --git a/tests/hash_substringTest.py b/tests/hash_substringTest.py new file mode 100644 index 0000000..4c8fd5c --- /dev/null +++ b/tests/hash_substringTest.py @@ -0,0 +1,31 @@ +import unittest + +from sources.hash_substring import get_occurrences, rabin_karp + + +class MyTestCase(unittest.TestCase): + + def test1(self): + result = get_occurrences('aba', 'abacaba') + result = rabin_karp('aba', 'abacaba') + self.assertEqual(len(result), 2) + self.assertEqual(result[0], 0) + self.assertEqual(result[1], 4) + + def test2(self): + result = get_occurrences('Test', 'testTesttesT') + result = rabin_karp('Test', 'testTesttesT') + self.assertEqual(len(result), 1) + self.assertEqual(result[0], 4) + + def test3(self): + result = get_occurrences('aaaaa', 'baaaaaaa') + result = rabin_karp('aaaaa', 'baaaaaaa') + self.assertEqual(len(result), 3) + self.assertEqual(result[0], 1) + self.assertEqual(result[1], 2) + self.assertEqual(result[2], 3) + + +if __name__ == '__main__': + unittest.main() -- cgit v1.2.3