1 | /* Byte-wise substring search, using the Two-Way algorithm. |
---|
2 | * Copyright (C) 2008, 2010 Eric Blake |
---|
3 | * Permission to use, copy, modify, and distribute this software |
---|
4 | * is freely granted, provided that this notice is preserved. |
---|
5 | */ |
---|
6 | |
---|
7 | |
---|
8 | /* Before including this file, you need to include <string.h>, and define: |
---|
9 | RESULT_TYPE A macro that expands to the return type. |
---|
10 | AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are |
---|
11 | at least N_L bytes left starting at |
---|
12 | H[J]. H is 'unsigned char *', H_L, J, |
---|
13 | and N_L are 'size_t'; H_L is an |
---|
14 | lvalue. For NUL-terminated searches, |
---|
15 | H_L can be modified each iteration to |
---|
16 | avoid having to compute the end of H |
---|
17 | up front. |
---|
18 | |
---|
19 | For case-insensitivity, you may optionally define: |
---|
20 | CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L |
---|
21 | characters of P1 and P2 are equal. |
---|
22 | CANON_ELEMENT(c) A macro that canonicalizes an element |
---|
23 | right after it has been fetched from |
---|
24 | one of the two strings. The argument |
---|
25 | is an 'unsigned char'; the result must |
---|
26 | be an 'unsigned char' as well. |
---|
27 | |
---|
28 | This file undefines the macros documented above, and defines |
---|
29 | LONG_NEEDLE_THRESHOLD. |
---|
30 | */ |
---|
31 | |
---|
32 | #include <limits.h> |
---|
33 | #include <stdint.h> |
---|
34 | |
---|
35 | /* We use the Two-Way string matching algorithm, which guarantees |
---|
36 | linear complexity with constant space. Additionally, for long |
---|
37 | needles, we also use a bad character shift table similar to the |
---|
38 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) |
---|
39 | performance. |
---|
40 | |
---|
41 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 |
---|
42 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm |
---|
43 | */ |
---|
44 | |
---|
45 | /* Point at which computing a bad-byte shift table is likely to be |
---|
46 | worthwhile. Small needles should not compute a table, since it |
---|
47 | adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a |
---|
48 | speedup no greater than a factor of NEEDLE_LEN. The larger the |
---|
49 | needle, the better the potential performance gain. On the other |
---|
50 | hand, on non-POSIX systems with CHAR_BIT larger than eight, the |
---|
51 | memory required for the table is prohibitive. */ |
---|
52 | #if CHAR_BIT < 10 |
---|
53 | # define LONG_NEEDLE_THRESHOLD 32U |
---|
54 | #else |
---|
55 | # define LONG_NEEDLE_THRESHOLD SIZE_MAX |
---|
56 | #endif |
---|
57 | |
---|
58 | #define MAX(a, b) ((a < b) ? (b) : (a)) |
---|
59 | |
---|
60 | #ifndef CANON_ELEMENT |
---|
61 | # define CANON_ELEMENT(c) c |
---|
62 | #endif |
---|
63 | #ifndef CMP_FUNC |
---|
64 | # define CMP_FUNC memcmp |
---|
65 | #endif |
---|
66 | |
---|
67 | /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. |
---|
68 | Return the index of the first byte in the right half, and set |
---|
69 | *PERIOD to the global period of the right half. |
---|
70 | |
---|
71 | The global period of a string is the smallest index (possibly its |
---|
72 | length) at which all remaining bytes in the string are repetitions |
---|
73 | of the prefix (the last repetition may be a subset of the prefix). |
---|
74 | |
---|
75 | When NEEDLE is factored into two halves, a local period is the |
---|
76 | length of the smallest word that shares a suffix with the left half |
---|
77 | and shares a prefix with the right half. All factorizations of a |
---|
78 | non-empty NEEDLE have a local period of at least 1 and no greater |
---|
79 | than NEEDLE_LEN. |
---|
80 | |
---|
81 | A critical factorization has the property that the local period |
---|
82 | equals the global period. All strings have at least one critical |
---|
83 | factorization with the left half smaller than the global period. |
---|
84 | |
---|
85 | Given an ordered alphabet, a critical factorization can be computed |
---|
86 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
---|
87 | larger of two ordered maximal suffixes. The ordered maximal |
---|
88 | suffixes are determined by lexicographic comparison of |
---|
89 | periodicity. */ |
---|
90 | static size_t |
---|
91 | critical_factorization (const unsigned char *needle, size_t needle_len, |
---|
92 | size_t *period) |
---|
93 | { |
---|
94 | /* Index of last byte of left half, or SIZE_MAX. */ |
---|
95 | size_t max_suffix, max_suffix_rev; |
---|
96 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
---|
97 | size_t k; /* Offset into current period. */ |
---|
98 | size_t p; /* Intermediate period. */ |
---|
99 | unsigned char a, b; /* Current comparison bytes. */ |
---|
100 | |
---|
101 | /* Invariants: |
---|
102 | 0 <= j < NEEDLE_LEN - 1 |
---|
103 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
---|
104 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
---|
105 | 1 <= p <= global period of NEEDLE |
---|
106 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
---|
107 | 1 <= k <= p |
---|
108 | */ |
---|
109 | |
---|
110 | /* Perform lexicographic search. */ |
---|
111 | max_suffix = SIZE_MAX; |
---|
112 | j = 0; |
---|
113 | k = p = 1; |
---|
114 | while (j + k < needle_len) |
---|
115 | { |
---|
116 | a = CANON_ELEMENT (needle[j + k]); |
---|
117 | b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]); |
---|
118 | if (a < b) |
---|
119 | { |
---|
120 | /* Suffix is smaller, period is entire prefix so far. */ |
---|
121 | j += k; |
---|
122 | k = 1; |
---|
123 | p = j - max_suffix; |
---|
124 | } |
---|
125 | else if (a == b) |
---|
126 | { |
---|
127 | /* Advance through repetition of the current period. */ |
---|
128 | if (k != p) |
---|
129 | ++k; |
---|
130 | else |
---|
131 | { |
---|
132 | j += p; |
---|
133 | k = 1; |
---|
134 | } |
---|
135 | } |
---|
136 | else /* b < a */ |
---|
137 | { |
---|
138 | /* Suffix is larger, start over from current location. */ |
---|
139 | max_suffix = j++; |
---|
140 | k = p = 1; |
---|
141 | } |
---|
142 | } |
---|
143 | *period = p; |
---|
144 | |
---|
145 | /* Perform reverse lexicographic search. */ |
---|
146 | max_suffix_rev = SIZE_MAX; |
---|
147 | j = 0; |
---|
148 | k = p = 1; |
---|
149 | while (j + k < needle_len) |
---|
150 | { |
---|
151 | a = CANON_ELEMENT (needle[j + k]); |
---|
152 | b = CANON_ELEMENT (needle[max_suffix_rev + k]); |
---|
153 | if (b < a) |
---|
154 | { |
---|
155 | /* Suffix is smaller, period is entire prefix so far. */ |
---|
156 | j += k; |
---|
157 | k = 1; |
---|
158 | p = j - max_suffix_rev; |
---|
159 | } |
---|
160 | else if (a == b) |
---|
161 | { |
---|
162 | /* Advance through repetition of the current period. */ |
---|
163 | if (k != p) |
---|
164 | ++k; |
---|
165 | else |
---|
166 | { |
---|
167 | j += p; |
---|
168 | k = 1; |
---|
169 | } |
---|
170 | } |
---|
171 | else /* a < b */ |
---|
172 | { |
---|
173 | /* Suffix is larger, start over from current location. */ |
---|
174 | max_suffix_rev = j++; |
---|
175 | k = p = 1; |
---|
176 | } |
---|
177 | } |
---|
178 | |
---|
179 | /* Choose the longer suffix. Return the first byte of the right |
---|
180 | half, rather than the last byte of the left half. */ |
---|
181 | if (max_suffix_rev + 1 < max_suffix + 1) |
---|
182 | return max_suffix + 1; |
---|
183 | *period = p; |
---|
184 | return max_suffix_rev + 1; |
---|
185 | } |
---|
186 | |
---|
187 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
---|
188 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
---|
189 | method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. |
---|
190 | Performance is guaranteed to be linear, with an initialization cost |
---|
191 | of 2 * NEEDLE_LEN comparisons. |
---|
192 | |
---|
193 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
---|
194 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. |
---|
195 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
---|
196 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ |
---|
197 | static RETURN_TYPE |
---|
198 | two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
---|
199 | const unsigned char *needle, size_t needle_len) |
---|
200 | { |
---|
201 | size_t i; /* Index into current byte of NEEDLE. */ |
---|
202 | size_t j; /* Index into current window of HAYSTACK. */ |
---|
203 | size_t period; /* The period of the right half of needle. */ |
---|
204 | size_t suffix; /* The index of the right half of needle. */ |
---|
205 | |
---|
206 | /* Factor the needle into two halves, such that the left half is |
---|
207 | smaller than the global period, and the right half is |
---|
208 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
---|
209 | suffix = critical_factorization (needle, needle_len, &period); |
---|
210 | |
---|
211 | /* Perform the search. Each iteration compares the right half |
---|
212 | first. */ |
---|
213 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
---|
214 | { |
---|
215 | /* Entire needle is periodic; a mismatch can only advance by the |
---|
216 | period, so use memory to avoid rescanning known occurrences |
---|
217 | of the period. */ |
---|
218 | size_t memory = 0; |
---|
219 | j = 0; |
---|
220 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
---|
221 | { |
---|
222 | /* Scan for matches in right half. */ |
---|
223 | i = MAX (suffix, memory); |
---|
224 | while (i < needle_len && (CANON_ELEMENT (needle[i]) |
---|
225 | == CANON_ELEMENT (haystack[i + j]))) |
---|
226 | ++i; |
---|
227 | if (needle_len <= i) |
---|
228 | { |
---|
229 | /* Scan for matches in left half. */ |
---|
230 | i = suffix - 1; |
---|
231 | while (memory < i + 1 && (CANON_ELEMENT (needle[i]) |
---|
232 | == CANON_ELEMENT (haystack[i + j]))) |
---|
233 | --i; |
---|
234 | if (i + 1 < memory + 1) |
---|
235 | return (RETURN_TYPE) (haystack + j); |
---|
236 | /* No match, so remember how many repetitions of period |
---|
237 | on the right half were scanned. */ |
---|
238 | j += period; |
---|
239 | memory = needle_len - period; |
---|
240 | } |
---|
241 | else |
---|
242 | { |
---|
243 | j += i - suffix + 1; |
---|
244 | memory = 0; |
---|
245 | } |
---|
246 | } |
---|
247 | } |
---|
248 | else |
---|
249 | { |
---|
250 | /* The two halves of needle are distinct; no extra memory is |
---|
251 | required, and any mismatch results in a maximal shift. */ |
---|
252 | period = MAX (suffix, needle_len - suffix) + 1; |
---|
253 | j = 0; |
---|
254 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
---|
255 | { |
---|
256 | /* Scan for matches in right half. */ |
---|
257 | i = suffix; |
---|
258 | while (i < needle_len && (CANON_ELEMENT (needle[i]) |
---|
259 | == CANON_ELEMENT (haystack[i + j]))) |
---|
260 | ++i; |
---|
261 | if (needle_len <= i) |
---|
262 | { |
---|
263 | /* Scan for matches in left half. */ |
---|
264 | i = suffix - 1; |
---|
265 | while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) |
---|
266 | == CANON_ELEMENT (haystack[i + j]))) |
---|
267 | --i; |
---|
268 | if (i == SIZE_MAX) |
---|
269 | return (RETURN_TYPE) (haystack + j); |
---|
270 | j += period; |
---|
271 | } |
---|
272 | else |
---|
273 | j += i - suffix + 1; |
---|
274 | } |
---|
275 | } |
---|
276 | return NULL; |
---|
277 | } |
---|
278 | |
---|
279 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
---|
280 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
---|
281 | method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. |
---|
282 | Performance is guaranteed to be linear, with an initialization cost |
---|
283 | of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. |
---|
284 | |
---|
285 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
---|
286 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, |
---|
287 | and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. |
---|
288 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
---|
289 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and |
---|
290 | sublinear performance is not possible. */ |
---|
291 | static RETURN_TYPE |
---|
292 | two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
---|
293 | const unsigned char *needle, size_t needle_len) |
---|
294 | { |
---|
295 | size_t i; /* Index into current byte of NEEDLE. */ |
---|
296 | size_t j; /* Index into current window of HAYSTACK. */ |
---|
297 | size_t period; /* The period of the right half of needle. */ |
---|
298 | size_t suffix; /* The index of the right half of needle. */ |
---|
299 | size_t shift_table[1U << CHAR_BIT]; /* See below. */ |
---|
300 | |
---|
301 | /* Factor the needle into two halves, such that the left half is |
---|
302 | smaller than the global period, and the right half is |
---|
303 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
---|
304 | suffix = critical_factorization (needle, needle_len, &period); |
---|
305 | |
---|
306 | /* Populate shift_table. For each possible byte value c, |
---|
307 | shift_table[c] is the distance from the last occurrence of c to |
---|
308 | the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. |
---|
309 | shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ |
---|
310 | for (i = 0; i < 1U << CHAR_BIT; i++) |
---|
311 | shift_table[i] = needle_len; |
---|
312 | for (i = 0; i < needle_len; i++) |
---|
313 | shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; |
---|
314 | |
---|
315 | /* Perform the search. Each iteration compares the right half |
---|
316 | first. */ |
---|
317 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
---|
318 | { |
---|
319 | /* Entire needle is periodic; a mismatch can only advance by the |
---|
320 | period, so use memory to avoid rescanning known occurrences |
---|
321 | of the period. */ |
---|
322 | size_t memory = 0; |
---|
323 | size_t shift; |
---|
324 | j = 0; |
---|
325 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
---|
326 | { |
---|
327 | /* Check the last byte first; if it does not match, then |
---|
328 | shift to the next possible match location. */ |
---|
329 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
---|
330 | if (0 < shift) |
---|
331 | { |
---|
332 | if (memory && shift < period) |
---|
333 | { |
---|
334 | /* Since needle is periodic, but the last period has |
---|
335 | a byte out of place, there can be no match until |
---|
336 | after the mismatch. */ |
---|
337 | shift = needle_len - period; |
---|
338 | } |
---|
339 | memory = 0; |
---|
340 | j += shift; |
---|
341 | continue; |
---|
342 | } |
---|
343 | /* Scan for matches in right half. The last byte has |
---|
344 | already been matched, by virtue of the shift table. */ |
---|
345 | i = MAX (suffix, memory); |
---|
346 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) |
---|
347 | == CANON_ELEMENT (haystack[i + j]))) |
---|
348 | ++i; |
---|
349 | if (needle_len - 1 <= i) |
---|
350 | { |
---|
351 | /* Scan for matches in left half. */ |
---|
352 | i = suffix - 1; |
---|
353 | while (memory < i + 1 && (CANON_ELEMENT (needle[i]) |
---|
354 | == CANON_ELEMENT (haystack[i + j]))) |
---|
355 | --i; |
---|
356 | if (i + 1 < memory + 1) |
---|
357 | return (RETURN_TYPE) (haystack + j); |
---|
358 | /* No match, so remember how many repetitions of period |
---|
359 | on the right half were scanned. */ |
---|
360 | j += period; |
---|
361 | memory = needle_len - period; |
---|
362 | } |
---|
363 | else |
---|
364 | { |
---|
365 | j += i - suffix + 1; |
---|
366 | memory = 0; |
---|
367 | } |
---|
368 | } |
---|
369 | } |
---|
370 | else |
---|
371 | { |
---|
372 | /* The two halves of needle are distinct; no extra memory is |
---|
373 | required, and any mismatch results in a maximal shift. */ |
---|
374 | size_t shift; |
---|
375 | period = MAX (suffix, needle_len - suffix) + 1; |
---|
376 | j = 0; |
---|
377 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
---|
378 | { |
---|
379 | /* Check the last byte first; if it does not match, then |
---|
380 | shift to the next possible match location. */ |
---|
381 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
---|
382 | if (0 < shift) |
---|
383 | { |
---|
384 | j += shift; |
---|
385 | continue; |
---|
386 | } |
---|
387 | /* Scan for matches in right half. The last byte has |
---|
388 | already been matched, by virtue of the shift table. */ |
---|
389 | i = suffix; |
---|
390 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) |
---|
391 | == CANON_ELEMENT (haystack[i + j]))) |
---|
392 | ++i; |
---|
393 | if (needle_len - 1 <= i) |
---|
394 | { |
---|
395 | /* Scan for matches in left half. */ |
---|
396 | i = suffix - 1; |
---|
397 | while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) |
---|
398 | == CANON_ELEMENT (haystack[i + j]))) |
---|
399 | --i; |
---|
400 | if (i == SIZE_MAX) |
---|
401 | return (RETURN_TYPE) (haystack + j); |
---|
402 | j += period; |
---|
403 | } |
---|
404 | else |
---|
405 | j += i - suffix + 1; |
---|
406 | } |
---|
407 | } |
---|
408 | return NULL; |
---|
409 | } |
---|
410 | |
---|
411 | #undef AVAILABLE |
---|
412 | #undef CANON_ELEMENT |
---|
413 | #undef CMP_FUNC |
---|
414 | #undef MAX |
---|
415 | #undef RETURN_TYPE |
---|