This is an implementation of Boyer-Moore string search algorithm index_of
for a pattern in text. I was challenged to implement it by my friend.
This Python function returns the starting index of the first occurrence of a pattern in text and returns None
if no index is found. For example, index_of('ababc', 'abc')
returns 2 and index_of('abc', 'z')
returns None
.
I have the following implementation that handles most of the cases and I think the worst case running time is \$O(m/n)\$.
def index_of(text, pattern):
assert isinstance(text, str), 'text is not a string: {}'.format(text)
assert isinstance(pattern, str), 'pattern is not a string: {}'.format(text)
# Check if empty; if so, return 0, earliest bit
if len(pattern) == 0:
return 0
pattern = list(pattern)
pattern_len = len(pattern)
text = list(text)
textext_len = len(text)
shift = 0
while(shift <= textext_len - pattern_len):
index = pattern_len - 1
# Keep going until it all meets
while index >= 0 and pattern[index] == text[shift+index]:
index -= 1
# If we get past index, then return where the shift is
if index < 0:
return shift
else:
if text[shift+pattern_len-1] in pattern:
shift += pattern_len - pattern.index(text[shift+pattern_len-1]) - 1
else:
shift += pattern_len
return None
I include my code here. I ran my code against 3 separate tests, and it passed all of the pytests. So it's pretty robust, and I think my implementation works fine.
import unittest
class Test(unittest.TestCase):
def test_index_of_with_matching_patterns(self):
assert index_of('abc', '') == 0 # all strings contain empty string
assert index_of('abc', 'a') == 0 # single letters are easy
assert index_of('abc', 'b') == 1
assert index_of('abc', 'c') == 2
assert index_of('abc', 'ab') == 0 # multiple letters are harder
assert index_of('abc', 'bc') == 1
assert index_of('abc', 'abc') == 0 # all strings contain themselves
assert index_of('aaa', 'a') == 0 # multiple occurrences
assert index_of('aaa', 'aa') == 0 # overlapping pattern
assert index_of('thisisabsolutelycrazy', 't') == 0
assert index_of('abcdef', 'abcdef') == 0 # all strings contain
assert index_of('abcdef', 'abcdef') == 0 # all strings contain
def test_index_of_with_non_matching_patterns(self):
# Negative test cases (counterexamples) with non-matching patterns
assert index_of('abc', 'z') is None # remember to test other letters
assert index_of('abc', 'ac') is None # important to test close cases
assert index_of('abc', 'az') is None # first letter, but not last
assert index_of('abc', 'abz') is None # first 2 letters, but not last
def test_index_of_with_complex_patterns(self):
# Difficult test cases (examples) with complex patterns
assert index_of('ababc', 'abc') == 2 # overlapping prefix
assert index_of('bananas', 'nas') == 4 # overlapping prefix
assert index_of('abcabcabc', 'abc') == 0 # multiple occurrences
assert index_of('abcabcab', 'abc') == 0 # multiple occurrences
assert index_of('abcabcdef', 'abcd') == 3 # overlapping prefix
assert index_of('abcabcdef', 'abcdef') == 3 # overlapping prefix
assert index_of('abcabcdabcde', 'abcde') == 7 # overlapping prefix
assert index_of('abcabcdabcde', 'abcd') == 3 # multiple occurrences, overlapping prefix
assert index_of('abra cadabra', 'abra') == 0 # multiple occurrences
assert index_of('abra cadabra', 'adab') == 6 # overlapping prefix
if __name__ == '__main__':
unittest.main()
If you find a mismatch, you have a configuration of the type
. I'm tempted to suggestif __name__ …: import sys main(sys.argv)
) Do "the twinassert()
s" call for procedural abstraction? I don't likeif condition: return… else:
. \$\endgroup\$worst case running time
? 2)main()
looks scrambled: shouldn't "the single shot call" be in "the== 2
-branch", the script controlled byelse
? 3) this doesn't look a true implementation of Boyer-Moore string-search (Neither precomputation nor (not quite as true) memoisation of delta₁) 4) the comment aboutshift rule
looks misplaced: put it after the setup oftextext_len
(what a name…) 5)return 0, earliest bit
-return 0 (1st character)
? \$\endgroup\$