source: trunk/libs/newlib/src/newlib/libc/string/str-two-way.h @ 444

Last change on this file since 444 was 444, checked in by satin@…, 6 years ago

add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc

File size: 13.5 KB
Line 
1/* Byte-wise substring search, using the Two-Way algorithm.
2 * Copyright (C) 2008, 2010 Eric Blake
3 * Permission to use, copy, modify, and distribute this software
4 * is freely granted, provided that this notice is preserved.
5 */
6
7
8/* Before including this file, you need to include <string.h>, and define:
9     RESULT_TYPE                A macro that expands to the return type.
10     AVAILABLE(h, h_l, j, n_l)  A macro that returns nonzero if there are
11                                at least N_L bytes left starting at
12                                H[J].  H is 'unsigned char *', H_L, J,
13                                and N_L are 'size_t'; H_L is an
14                                lvalue.  For NUL-terminated searches,
15                                H_L can be modified each iteration to
16                                avoid having to compute the end of H
17                                up front.
18
19  For case-insensitivity, you may optionally define:
20     CMP_FUNC(p1, p2, l)        A macro that returns 0 iff the first L
21                                characters of P1 and P2 are equal.
22     CANON_ELEMENT(c)           A macro that canonicalizes an element
23                                right after it has been fetched from
24                                one of the two strings.  The argument
25                                is an 'unsigned char'; the result must
26                                be an 'unsigned char' as well.
27
28  This file undefines the macros documented above, and defines
29  LONG_NEEDLE_THRESHOLD.
30*/
31
32#include <limits.h>
33#include <stdint.h>
34
35/* We use the Two-Way string matching algorithm, which guarantees
36   linear complexity with constant space.  Additionally, for long
37   needles, we also use a bad character shift table similar to the
38   Boyer-Moore algorithm to achieve improved (potentially sub-linear)
39   performance.
40
41   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
42   and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
43*/
44
45/* Point at which computing a bad-byte shift table is likely to be
46   worthwhile.  Small needles should not compute a table, since it
47   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
48   speedup no greater than a factor of NEEDLE_LEN.  The larger the
49   needle, the better the potential performance gain.  On the other
50   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
51   memory required for the table is prohibitive.  */
52#if CHAR_BIT < 10
53# define LONG_NEEDLE_THRESHOLD 32U
54#else
55# define LONG_NEEDLE_THRESHOLD SIZE_MAX
56#endif
57
58#define MAX(a, b) ((a < b) ? (b) : (a))
59
60#ifndef CANON_ELEMENT
61# define CANON_ELEMENT(c) c
62#endif
63#ifndef CMP_FUNC
64# define CMP_FUNC memcmp
65#endif
66
67/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
68   Return the index of the first byte in the right half, and set
69   *PERIOD to the global period of the right half.
70
71   The global period of a string is the smallest index (possibly its
72   length) at which all remaining bytes in the string are repetitions
73   of the prefix (the last repetition may be a subset of the prefix).
74
75   When NEEDLE is factored into two halves, a local period is the
76   length of the smallest word that shares a suffix with the left half
77   and shares a prefix with the right half.  All factorizations of a
78   non-empty NEEDLE have a local period of at least 1 and no greater
79   than NEEDLE_LEN.
80
81   A critical factorization has the property that the local period
82   equals the global period.  All strings have at least one critical
83   factorization with the left half smaller than the global period.
84
85   Given an ordered alphabet, a critical factorization can be computed
86   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
87   larger of two ordered maximal suffixes.  The ordered maximal
88   suffixes are determined by lexicographic comparison of
89   periodicity.  */
90static size_t
91critical_factorization (const unsigned char *needle, size_t needle_len,
92                        size_t *period)
93{
94  /* Index of last byte of left half, or SIZE_MAX.  */
95  size_t max_suffix, max_suffix_rev;
96  size_t j; /* Index into NEEDLE for current candidate suffix.  */
97  size_t k; /* Offset into current period.  */
98  size_t p; /* Intermediate period.  */
99  unsigned char a, b; /* Current comparison bytes.  */
100
101  /* Invariants:
102     0 <= j < NEEDLE_LEN - 1
103     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
104     min(max_suffix, max_suffix_rev) < global period of NEEDLE
105     1 <= p <= global period of NEEDLE
106     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
107     1 <= k <= p
108  */
109
110  /* Perform lexicographic search.  */
111  max_suffix = SIZE_MAX;
112  j = 0;
113  k = p = 1;
114  while (j + k < needle_len)
115    {
116      a = CANON_ELEMENT (needle[j + k]);
117      b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
118      if (a < b)
119        {
120          /* Suffix is smaller, period is entire prefix so far.  */
121          j += k;
122          k = 1;
123          p = j - max_suffix;
124        }
125      else if (a == b)
126        {
127          /* Advance through repetition of the current period.  */
128          if (k != p)
129            ++k;
130          else
131            {
132              j += p;
133              k = 1;
134            }
135        }
136      else /* b < a */
137        {
138          /* Suffix is larger, start over from current location.  */
139          max_suffix = j++;
140          k = p = 1;
141        }
142    }
143  *period = p;
144
145  /* Perform reverse lexicographic search.  */
146  max_suffix_rev = SIZE_MAX;
147  j = 0;
148  k = p = 1;
149  while (j + k < needle_len)
150    {
151      a = CANON_ELEMENT (needle[j + k]);
152      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
153      if (b < a)
154        {
155          /* Suffix is smaller, period is entire prefix so far.  */
156          j += k;
157          k = 1;
158          p = j - max_suffix_rev;
159        }
160      else if (a == b)
161        {
162          /* Advance through repetition of the current period.  */
163          if (k != p)
164            ++k;
165          else
166            {
167              j += p;
168              k = 1;
169            }
170        }
171      else /* a < b */
172        {
173          /* Suffix is larger, start over from current location.  */
174          max_suffix_rev = j++;
175          k = p = 1;
176        }
177    }
178
179  /* Choose the longer suffix.  Return the first byte of the right
180     half, rather than the last byte of the left half.  */
181  if (max_suffix_rev + 1 < max_suffix + 1)
182    return max_suffix + 1;
183  *period = p;
184  return max_suffix_rev + 1;
185}
186
187/* Return the first location of non-empty NEEDLE within HAYSTACK, or
188   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
189   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
190   Performance is guaranteed to be linear, with an initialization cost
191   of 2 * NEEDLE_LEN comparisons.
192
193   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
194   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
195   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
196   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
197static RETURN_TYPE
198two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
199                      const unsigned char *needle, size_t needle_len)
200{
201  size_t i; /* Index into current byte of NEEDLE.  */
202  size_t j; /* Index into current window of HAYSTACK.  */
203  size_t period; /* The period of the right half of needle.  */
204  size_t suffix; /* The index of the right half of needle.  */
205
206  /* Factor the needle into two halves, such that the left half is
207     smaller than the global period, and the right half is
208     periodic (with a period as large as NEEDLE_LEN - suffix).  */
209  suffix = critical_factorization (needle, needle_len, &period);
210
211  /* Perform the search.  Each iteration compares the right half
212     first.  */
213  if (CMP_FUNC (needle, needle + period, suffix) == 0)
214    {
215      /* Entire needle is periodic; a mismatch can only advance by the
216         period, so use memory to avoid rescanning known occurrences
217         of the period.  */
218      size_t memory = 0;
219      j = 0;
220      while (AVAILABLE (haystack, haystack_len, j, needle_len))
221        {
222          /* Scan for matches in right half.  */
223          i = MAX (suffix, memory);
224          while (i < needle_len && (CANON_ELEMENT (needle[i])
225                                    == CANON_ELEMENT (haystack[i + j])))
226            ++i;
227          if (needle_len <= i)
228            {
229              /* Scan for matches in left half.  */
230              i = suffix - 1;
231              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
232                                        == CANON_ELEMENT (haystack[i + j])))
233                --i;
234              if (i + 1 < memory + 1)
235                return (RETURN_TYPE) (haystack + j);
236              /* No match, so remember how many repetitions of period
237                 on the right half were scanned.  */
238              j += period;
239              memory = needle_len - period;
240            }
241          else
242            {
243              j += i - suffix + 1;
244              memory = 0;
245            }
246        }
247    }
248  else
249    {
250      /* The two halves of needle are distinct; no extra memory is
251         required, and any mismatch results in a maximal shift.  */
252      period = MAX (suffix, needle_len - suffix) + 1;
253      j = 0;
254      while (AVAILABLE (haystack, haystack_len, j, needle_len))
255        {
256          /* Scan for matches in right half.  */
257          i = suffix;
258          while (i < needle_len && (CANON_ELEMENT (needle[i])
259                                    == CANON_ELEMENT (haystack[i + j])))
260            ++i;
261          if (needle_len <= i)
262            {
263              /* Scan for matches in left half.  */
264              i = suffix - 1;
265              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
266                                       == CANON_ELEMENT (haystack[i + j])))
267                --i;
268              if (i == SIZE_MAX)
269                return (RETURN_TYPE) (haystack + j);
270              j += period;
271            }
272          else
273            j += i - suffix + 1;
274        }
275    }
276  return NULL;
277}
278
279/* Return the first location of non-empty NEEDLE within HAYSTACK, or
280   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
281   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
282   Performance is guaranteed to be linear, with an initialization cost
283   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
284
285   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
286   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
287   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
288   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
289   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
290   sublinear performance is not possible.  */
291static RETURN_TYPE
292two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
293                     const unsigned char *needle, size_t needle_len)
294{
295  size_t i; /* Index into current byte of NEEDLE.  */
296  size_t j; /* Index into current window of HAYSTACK.  */
297  size_t period; /* The period of the right half of needle.  */
298  size_t suffix; /* The index of the right half of needle.  */
299  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
300
301  /* Factor the needle into two halves, such that the left half is
302     smaller than the global period, and the right half is
303     periodic (with a period as large as NEEDLE_LEN - suffix).  */
304  suffix = critical_factorization (needle, needle_len, &period);
305
306  /* Populate shift_table.  For each possible byte value c,
307     shift_table[c] is the distance from the last occurrence of c to
308     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
309     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
310  for (i = 0; i < 1U << CHAR_BIT; i++)
311    shift_table[i] = needle_len;
312  for (i = 0; i < needle_len; i++)
313    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
314
315  /* Perform the search.  Each iteration compares the right half
316     first.  */
317  if (CMP_FUNC (needle, needle + period, suffix) == 0)
318    {
319      /* Entire needle is periodic; a mismatch can only advance by the
320         period, so use memory to avoid rescanning known occurrences
321         of the period.  */
322      size_t memory = 0;
323      size_t shift;
324      j = 0;
325      while (AVAILABLE (haystack, haystack_len, j, needle_len))
326        {
327          /* Check the last byte first; if it does not match, then
328             shift to the next possible match location.  */
329          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
330          if (0 < shift)
331            {
332              if (memory && shift < period)
333                {
334                  /* Since needle is periodic, but the last period has
335                     a byte out of place, there can be no match until
336                     after the mismatch.  */
337                  shift = needle_len - period;
338                }
339              memory = 0;
340              j += shift;
341              continue;
342            }
343          /* Scan for matches in right half.  The last byte has
344             already been matched, by virtue of the shift table.  */
345          i = MAX (suffix, memory);
346          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
347                                        == CANON_ELEMENT (haystack[i + j])))
348            ++i;
349          if (needle_len - 1 <= i)
350            {
351              /* Scan for matches in left half.  */
352              i = suffix - 1;
353              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
354                                        == CANON_ELEMENT (haystack[i + j])))
355                --i;
356              if (i + 1 < memory + 1)
357                return (RETURN_TYPE) (haystack + j);
358              /* No match, so remember how many repetitions of period
359                 on the right half were scanned.  */
360              j += period;
361              memory = needle_len - period;
362            }
363          else
364            {
365              j += i - suffix + 1;
366              memory = 0;
367            }
368        }
369    }
370  else
371    {
372      /* The two halves of needle are distinct; no extra memory is
373         required, and any mismatch results in a maximal shift.  */
374      size_t shift;
375      period = MAX (suffix, needle_len - suffix) + 1;
376      j = 0;
377      while (AVAILABLE (haystack, haystack_len, j, needle_len))
378        {
379          /* Check the last byte first; if it does not match, then
380             shift to the next possible match location.  */
381          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
382          if (0 < shift)
383            {
384              j += shift;
385              continue;
386            }
387          /* Scan for matches in right half.  The last byte has
388             already been matched, by virtue of the shift table.  */
389          i = suffix;
390          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
391                                        == CANON_ELEMENT (haystack[i + j])))
392            ++i;
393          if (needle_len - 1 <= i)
394            {
395              /* Scan for matches in left half.  */
396              i = suffix - 1;
397              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
398                                       == CANON_ELEMENT (haystack[i + j])))
399                --i;
400              if (i == SIZE_MAX)
401                return (RETURN_TYPE) (haystack + j);
402              j += period;
403            }
404          else
405            j += i - suffix + 1;
406        }
407    }
408  return NULL;
409}
410
411#undef AVAILABLE
412#undef CANON_ELEMENT
413#undef CMP_FUNC
414#undef MAX
415#undef RETURN_TYPE
Note: See TracBrowser for help on using the repository browser.