-
Notifications
You must be signed in to change notification settings - Fork 19.7k
/
Copy pathHorspoolSearch.java
185 lines (168 loc) · 7.04 KB
/
HorspoolSearch.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
package com.thealgorithms.strings;
import java.util.HashMap;
/**
* This class is not thread safe<br>
* <br>
* (From wikipedia) In computer science, the Boyer–Moore–Horspool algorithm or
* Horspool's algorithm is an algorithm for finding substrings in strings. It
* was published by Nigel Horspool in 1980.
* <br>
* <a href=https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm>Wikipedia
* page</a><br>
* <br>
*
* <p>
* An explanation:<br>
*
* <p>
* The Horspool algorithm is a simplification of the Boyer-Moore algorithm in
* that it uses only one of the two heuristic methods for increasing the number
* of characters shifted when finding a bad match in the text. This method is
* usually called the "bad symbol" or "bad character" shift. The bad symbol
* shift method is classified as an input enhancement method in the theory of
* algorithms. Input enhancement is (from wikipedia) the principle that
* processing a given input to a problem and altering it in a specific way will
* increase runtime efficiency or space efficiency, or both. Both algorithms try
* to match the pattern and text comparing the pattern symbols to the text's
* from right to left.<br>
* <br>
*
* <p>
* In the bad symbol shift method, a table is created prior to the search,
* called the "bad symbol table". The bad symbol table contains the shift values
* for any symbol in the text and pattern. For these symbols, the value is the
* length of the pattern, if the symbol is not in the first (length - 1) of the
* pattern. Else it is the distance from its rightmost occurrence in the pattern
* to the last symbol of the pattern. In practice, we only calculate the values
* for the ones that exist in the first (length - 1) of the pattern.<br>
* <br>
*
* <p>
* For more details on the algorithm and the more advanced Boyer-Moore I
* recommend checking out the wikipedia page and professor Anany Levitin's book:
* Introduction To The Design And Analysis Of Algorithms.
*/
public final class HorspoolSearch {
private HorspoolSearch() {
}
private static HashMap<Character, Integer> shiftValues; // bad symbol table
private static Integer patternLength;
private static int comparisons = 0; // total comparisons in the current/last search
/**
* Case sensitive version version of the algorithm
*
* @param pattern the pattern to be searched for (needle)
* @param text the text being searched in (haystack)
* @return -1 if not found or first index of the pattern in the text
*/
public static int findFirst(String pattern, String text) {
return firstOccurrence(pattern, text, true);
}
/**
* Case insensitive version version of the algorithm
*
* @param pattern the pattern to be searched for (needle)
* @param text the text being searched in (haystack)
* @return -1 if not found or first index of the pattern in the text
*/
public static int findFirstInsensitive(String pattern, String text) {
return firstOccurrence(pattern, text, false);
}
/**
* Utility method that returns comparisons made by last run (mainly for
* tests)
*
* @return number of character comparisons of the last search
*/
public static Integer getLastComparisons() {
return HorspoolSearch.comparisons;
}
/**
* Fairly standard implementation of the Horspool algorithm. Only the index
* of the last character of the pattern on the text is saved and shifted by
* the appropriate amount when a mismatch is found. The algorithm stops at
* the first match or when the entire text has been exhausted.
*
* @param pattern String to be matched in the text
* @param text text String
* @return index of first occurrence of the pattern in the text
*/
private static int firstOccurrence(String pattern, String text, boolean caseSensitive) {
shiftValues = calcShiftValues(pattern); // build the bad symbol table
comparisons = 0; // reset comparisons
if (pattern.length() == 0) { // return failure, if pattern empty
return -1;
}
int textIndex = pattern.length() - 1; // align pattern with text start and get index of the last character
// while pattern is not out of text bounds
while (textIndex < text.length()) {
// try to match pattern with current part of the text starting from last character
int i = pattern.length() - 1;
while (i >= 0) {
comparisons++;
char patternChar = pattern.charAt(i);
char textChar = text.charAt((textIndex + i) - (pattern.length() - 1));
if (!charEquals(patternChar, textChar, caseSensitive)) { // bad character, shift pattern
textIndex += getShiftValue(text.charAt(textIndex));
break;
}
i--;
}
// check for full match
if (i == -1) {
return textIndex - pattern.length() + 1;
}
}
// text exhausted, return failure
return -1;
}
/**
* Compares the argument characters
*
* @param c1 first character
* @param c2 second character
* @param caseSensitive boolean determining case sensitivity of comparison
* @return truth value of the equality comparison
*/
private static boolean charEquals(char c1, char c2, boolean caseSensitive) {
if (caseSensitive) {
return c1 == c2;
}
return Character.toLowerCase(c1) == Character.toLowerCase(c2);
}
/**
* Builds the bad symbol table required to run the algorithm. The method
* starts from the second to last character of the pattern and moves to the
* left. When it meets a new character, it is by definition its rightmost
* occurrence and therefore puts the distance from the current index to the
* index of the last character into the table. If the character is already
* in the table, then it is not a rightmost occurrence, so it continues.
*
* @param pattern basis for the bad symbol table
* @return the bad symbol table
*/
private static HashMap<Character, Integer> calcShiftValues(String pattern) {
patternLength = pattern.length();
HashMap<Character, Integer> table = new HashMap<>();
for (int i = pattern.length() - 2; i >= 0; i--) { // length - 2 is the index of the second to last character
char c = pattern.charAt(i);
int finalI = i;
table.computeIfAbsent(c, k -> pattern.length() - 1 - finalI);
}
return table;
}
/**
* Helper function that uses the bad symbol shift table to return the
* appropriate shift value for a given character
*
* @param c character
* @return shift value that corresponds to the character argument
*/
private static Integer getShiftValue(char c) {
if (shiftValues.get(c) != null) {
return shiftValues.get(c);
} else {
return patternLength;
}
}
}