Node.js  v8.x
Node.js is a JavaScript runtime built on Chrome's V8 JavaScript engine
string_search.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef SRC_STRING_SEARCH_H_
6 #define SRC_STRING_SEARCH_H_
7 
8 #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
9 
10 #include "node.h"
11 #include <string.h>
12 
13 namespace node {
14 namespace stringsearch {
15 
16 
17 // Returns the maximum of the two parameters.
18 template <typename T>
19 T Max(T a, T b) {
20  return a < b ? b : a;
21 }
22 
23 
24 static const uint32_t kMaxOneByteCharCodeU = 0xff;
25 
26 template <typename T>
27 class Vector {
28  public:
29  Vector(T* data, size_t length, bool isForward)
30  : start_(data), length_(length), is_forward_(isForward) {
31  CHECK(length > 0 && data != nullptr);
32  }
33 
34  // Returns the start of the memory range.
35  // For vector v this is NOT necessarily &v[0], see forward().
36  const T* start() const { return start_; }
37 
38  // Returns the length of the vector, in characters.
39  size_t length() const { return length_; }
40 
41  // Returns true if the Vector is front-to-back, false if back-to-front.
42  // In the latter case, v[0] corresponds to the *end* of the memory range.
43  size_t forward() const { return is_forward_; }
44 
45  // Access individual vector elements - checks bounds in debug mode.
46  T& operator[](size_t index) const {
47  CHECK(index < length_);
48  return start_[is_forward_ ? index : (length_ - index - 1)];
49  }
50 
51  private:
52  T* start_;
53  size_t length_;
54  bool is_forward_;
55 };
56 
57 
58 //---------------------------------------------------------------------
59 // String Search object.
60 //---------------------------------------------------------------------
61 
62 // Class holding constants and methods that apply to all string search variants,
63 // independently of subject and pattern char size.
64 class StringSearchBase {
65  protected:
66  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
67  // limit, we can fix the size of tables. For a needle longer than this limit,
68  // search will not be optimal, since we only build tables for a suffix
69  // of the string, but it is a safe approximation.
70  static const int kBMMaxShift = 250;
71 
72  // Reduce alphabet to this size.
73  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
74  // proportional to the input alphabet. We reduce the alphabet size by
75  // equating input characters modulo a smaller alphabet size. This gives
76  // a potentially less efficient searching, but is a safe approximation.
77  // For needles using only characters in the same Unicode 256-code point page,
78  // there is no search speed degradation.
79  static const int kLatin1AlphabetSize = 256;
80  static const int kUC16AlphabetSize = 256;
81 
82  // Bad-char shift table stored in the state. It's length is the alphabet size.
83  // For patterns below this length, the skip length of Boyer-Moore is too short
84  // to compensate for the algorithmic overhead compared to simple brute force.
85  static const int kBMMinPatternLength = 8;
86 
87  // Store for the BoyerMoore(Horspool) bad char shift table.
88  static int kBadCharShiftTable[kUC16AlphabetSize];
89  // Store for the BoyerMoore good suffix shift table.
90  static int kGoodSuffixShiftTable[kBMMaxShift + 1];
91  // Table used temporarily while building the BoyerMoore good suffix
92  // shift table.
93  static int kSuffixTable[kBMMaxShift + 1];
94 };
95 
96 template <typename Char>
97 class StringSearch : private StringSearchBase {
98  public:
99  explicit StringSearch(Vector<const Char> pattern)
100  : pattern_(pattern), start_(0) {
101  if (pattern.length() >= kBMMaxShift) {
102  start_ = pattern.length() - kBMMaxShift;
103  }
104 
105  size_t pattern_length = pattern_.length();
106  CHECK_GT(pattern_length, 0);
107  if (pattern_length < kBMMinPatternLength) {
108  if (pattern_length == 1) {
109  strategy_ = &SingleCharSearch;
110  return;
111  }
112  strategy_ = &LinearSearch;
113  return;
114  }
115  strategy_ = &InitialSearch;
116  }
117 
118  size_t Search(Vector<const Char> subject, size_t index) {
119  return strategy_(this, subject, index);
120  }
121 
122  static inline int AlphabetSize() {
123  if (sizeof(Char) == 1) {
124  // Latin1 needle.
125  return kLatin1AlphabetSize;
126  } else {
127  // UC16 needle.
128  return kUC16AlphabetSize;
129  }
130 
131  static_assert(sizeof(Char) == sizeof(uint8_t) ||
132  sizeof(Char) == sizeof(uint16_t),
133  "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");
134  }
135 
136  private:
137  typedef size_t (*SearchFunction)(
138  StringSearch<Char>*,
139  Vector<const Char>,
140  size_t);
141 
142  static size_t SingleCharSearch(StringSearch<Char>* search,
143  Vector<const Char> subject,
144  size_t start_index);
145 
146  static size_t LinearSearch(StringSearch<Char>* search,
147  Vector<const Char> subject,
148  size_t start_index);
149 
150  static size_t InitialSearch(StringSearch<Char>* search,
151  Vector<const Char> subject,
152  size_t start_index);
153 
154  static size_t BoyerMooreHorspoolSearch(
155  StringSearch<Char>* search,
156  Vector<const Char> subject,
157  size_t start_index);
158 
159  static size_t BoyerMooreSearch(StringSearch<Char>* search,
160  Vector<const Char> subject,
161  size_t start_index);
162 
163  void PopulateBoyerMooreHorspoolTable();
164 
165  void PopulateBoyerMooreTable();
166 
167  static inline int CharOccurrence(int* bad_char_occurrence,
168  Char char_code) {
169  if (sizeof(Char) == 1) {
170  return bad_char_occurrence[static_cast<int>(char_code)];
171  }
172  // Both pattern and subject are UC16. Reduce character to equivalence class.
173  int equiv_class = char_code % kUC16AlphabetSize;
174  return bad_char_occurrence[equiv_class];
175  }
176 
177  // Store for the BoyerMoore(Horspool) bad char shift table.
178  // Return a table covering the last kBMMaxShift+1 positions of
179  // pattern.
180  int* bad_char_table() { return kBadCharShiftTable; }
181 
182  // Store for the BoyerMoore good suffix shift table.
183  int* good_suffix_shift_table() {
184  // Return biased pointer that maps the range [start_..pattern_.length()
185  // to the kGoodSuffixShiftTable array.
186  return kGoodSuffixShiftTable - start_;
187  }
188 
189  // Table used temporarily while building the BoyerMoore good suffix
190  // shift table.
191  int* suffix_table() {
192  // Return biased pointer that maps the range [start_..pattern_.length()
193  // to the kSuffixTable array.
194  return kSuffixTable - start_;
195  }
196 
197  // The pattern to search for.
198  Vector<const Char> pattern_;
199  // Pointer to implementation of the search.
200  SearchFunction strategy_;
201  // Cache value of Max(0, pattern_length() - kBMMaxShift)
202  size_t start_;
203 };
204 
205 
206 template <typename T, typename U>
207 inline T AlignDown(T value, U alignment) {
208  return reinterpret_cast<T>(
209  (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
210 }
211 
212 
213 inline uint8_t GetHighestValueByte(uint16_t character) {
214  return Max(static_cast<uint8_t>(character & 0xFF),
215  static_cast<uint8_t>(character >> 8));
216 }
217 
218 
219 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
220 
221 
222 // Searches for a byte value in a memory buffer, back to front.
223 // Uses memrchr(3) on systems which support it, for speed.
224 // Falls back to a vanilla for loop on non-GNU systems such as Windows.
225 inline const void* MemrchrFill(const void* haystack, uint8_t needle,
226  size_t haystack_len) {
227 #ifdef _GNU_SOURCE
228  return memrchr(haystack, needle, haystack_len);
229 #else
230  const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack);
231  for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) {
232  if (haystack8[i] == needle) {
233  return haystack8 + i;
234  }
235  }
236  return nullptr;
237 #endif
238 }
239 
240 
241 // Finds the first occurrence of *two-byte* character pattern[0] in the string
242 // `subject`. Does not check that the whole pattern matches.
243 template <typename Char>
244 inline size_t FindFirstCharacter(Vector<const Char> pattern,
245  Vector<const Char> subject, size_t index) {
246  const Char pattern_first_char = pattern[0];
247  const size_t max_n = (subject.length() - pattern.length() + 1);
248 
249  // For speed, search for the more `rare` of the two bytes in pattern[0]
250  // using memchr / memrchr (which are much faster than a simple for loop).
251  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
252  size_t pos = index;
253  do {
254  const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
255  const void* void_pos;
256  if (subject.forward()) {
257  // Assert that bytes_to_search won't overflow
258  CHECK_LE(pos, max_n);
259  CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
260  void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search);
261  } else {
262  CHECK_LE(pos, subject.length());
263  CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
264  void_pos = MemrchrFill(subject.start() + pattern.length() - 1,
265  search_byte,
266  bytes_to_search);
267  }
268  const Char* char_pos = static_cast<const Char*>(void_pos);
269  if (char_pos == nullptr)
270  return subject.length();
271 
272  // Then, for each match, verify that the full two bytes match pattern[0].
273  char_pos = AlignDown(char_pos, sizeof(Char));
274  size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
275  pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1);
276  if (subject[pos] == pattern_first_char) {
277  // Match found, hooray.
278  return pos;
279  }
280  // Search byte matched, but the other byte of pattern[0] didn't. Keep going.
281  } while (++pos < max_n);
282 
283  return subject.length();
284 }
285 
286 
287 // Finds the first occurrence of the byte pattern[0] in string `subject`.
288 // Does not verify that the whole pattern matches.
289 template <>
290 inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
291  Vector<const uint8_t> subject,
292  size_t index) {
293  const uint8_t pattern_first_char = pattern[0];
294  const size_t subj_len = subject.length();
295  const size_t max_n = (subject.length() - pattern.length() + 1);
296 
297  const void* pos;
298  if (subject.forward()) {
299  pos = memchr(subject.start() + index, pattern_first_char, max_n - index);
300  } else {
301  pos = MemrchrFill(subject.start() + pattern.length() - 1,
302  pattern_first_char,
303  max_n - index);
304  }
305  const uint8_t* char_pos = static_cast<const uint8_t*>(pos);
306  if (char_pos == nullptr) {
307  return subj_len;
308  }
309 
310  size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
311  return subject.forward() ? raw_pos : (subj_len - raw_pos - 1);
312 }
313 
314 //---------------------------------------------------------------------
315 // Single Character Pattern Search Strategy
316 //---------------------------------------------------------------------
317 
318 template <typename Char>
319 size_t StringSearch<Char>::SingleCharSearch(
320  StringSearch<Char>* search,
321  Vector<const Char> subject,
322  size_t index) {
323  CHECK_EQ(1, search->pattern_.length());
324  return FindFirstCharacter(search->pattern_, subject, index);
325 }
326 
327 //---------------------------------------------------------------------
328 // Linear Search Strategy
329 //---------------------------------------------------------------------
330 
331 // Simple linear search for short patterns. Never bails out.
332 template <typename Char>
333 size_t StringSearch<Char>::LinearSearch(
334  StringSearch<Char>* search,
335  Vector<const Char> subject,
336  size_t index) {
337  Vector<const Char> pattern = search->pattern_;
338  CHECK_GT(pattern.length(), 1);
339  const size_t pattern_length = pattern.length();
340  const size_t n = subject.length() - pattern_length;
341  for (size_t i = index; i <= n; i++) {
342  i = FindFirstCharacter(pattern, subject, i);
343  if (i == subject.length())
344  return subject.length();
345  CHECK_LE(i, n);
346 
347  bool matches = true;
348  for (size_t j = 1; j < pattern_length; j++) {
349  if (pattern[j] != subject[i + j]) {
350  matches = false;
351  break;
352  }
353  }
354  if (matches) {
355  return i;
356  }
357  }
358  return subject.length();
359 }
360 
361 //---------------------------------------------------------------------
362 // Boyer-Moore string search
363 //---------------------------------------------------------------------
364 
365 template <typename Char>
366 size_t StringSearch<Char>::BoyerMooreSearch(
367  StringSearch<Char>* search,
368  Vector<const Char> subject,
369  size_t start_index) {
370  Vector<const Char> pattern = search->pattern_;
371  const size_t subject_length = subject.length();
372  const size_t pattern_length = pattern.length();
373  // Only preprocess at most kBMMaxShift last characters of pattern.
374  size_t start = search->start_;
375 
376  int* bad_char_occurrence = search->bad_char_table();
377  int* good_suffix_shift = search->good_suffix_shift_table();
378 
379  Char last_char = pattern[pattern_length - 1];
380  size_t index = start_index;
381  // Continue search from i.
382  while (index <= subject_length - pattern_length) {
383  size_t j = pattern_length - 1;
384  int c;
385  while (last_char != (c = subject[index + j])) {
386  int shift = j - CharOccurrence(bad_char_occurrence, c);
387  index += shift;
388  if (index > subject_length - pattern_length) {
389  return subject.length();
390  }
391  }
392  while (pattern[j] == (c = subject[index + j])) {
393  if (j == 0) {
394  return index;
395  }
396  j--;
397  }
398  if (j < start) {
399  // we have matched more than our tables allow us to be smart about.
400  // Fall back on BMH shift.
401  index += pattern_length - 1 -
402  CharOccurrence(bad_char_occurrence,
403  static_cast<Char>(last_char));
404  } else {
405  int gs_shift = good_suffix_shift[j + 1];
406  int bc_occ = CharOccurrence(bad_char_occurrence, c);
407  int shift = j - bc_occ;
408  if (gs_shift > shift) {
409  shift = gs_shift;
410  }
411  index += shift;
412  }
413  }
414 
415  return subject.length();
416 }
417 
418 template <typename Char>
419 void StringSearch<Char>::PopulateBoyerMooreTable() {
420  const size_t pattern_length = pattern_.length();
421  Vector<const Char> pattern = pattern_;
422  // Only look at the last kBMMaxShift characters of pattern (from start_
423  // to pattern_length).
424  const size_t start = start_;
425  const size_t length = pattern_length - start;
426 
427  // Biased tables so that we can use pattern indices as table indices,
428  // even if we only cover the part of the pattern from offset start.
429  int* shift_table = good_suffix_shift_table();
430  int* suffix_table = this->suffix_table();
431 
432  // Initialize table.
433  for (size_t i = start; i < pattern_length; i++) {
434  shift_table[i] = length;
435  }
436  shift_table[pattern_length] = 1;
437  suffix_table[pattern_length] = pattern_length + 1;
438 
439  if (pattern_length <= start) {
440  return;
441  }
442 
443  // Find suffixes.
444  Char last_char = pattern_[pattern_length - 1];
445  size_t suffix = pattern_length + 1;
446  {
447  size_t i = pattern_length;
448  while (i > start) {
449  Char c = pattern[i - 1];
450  while (suffix <= pattern_length && c != pattern[suffix - 1]) {
451  if (static_cast<size_t>(shift_table[suffix]) == length) {
452  shift_table[suffix] = suffix - i;
453  }
454  suffix = suffix_table[suffix];
455  }
456  suffix_table[--i] = --suffix;
457  if (suffix == pattern_length) {
458  // No suffix to extend, so we check against last_char only.
459  while ((i > start) && (pattern[i - 1] != last_char)) {
460  if (static_cast<size_t>(shift_table[pattern_length]) == length) {
461  shift_table[pattern_length] = pattern_length - i;
462  }
463  suffix_table[--i] = pattern_length;
464  }
465  if (i > start) {
466  suffix_table[--i] = --suffix;
467  }
468  }
469  }
470  }
471  // Build shift table using suffixes.
472  if (suffix < pattern_length) {
473  for (size_t i = start; i <= pattern_length; i++) {
474  if (static_cast<size_t>(shift_table[i]) == length) {
475  shift_table[i] = suffix - start;
476  }
477  if (i == suffix) {
478  suffix = suffix_table[suffix];
479  }
480  }
481  }
482 }
483 
484 //---------------------------------------------------------------------
485 // Boyer-Moore-Horspool string search.
486 //---------------------------------------------------------------------
487 
488 template <typename Char>
489 size_t StringSearch<Char>::BoyerMooreHorspoolSearch(
490  StringSearch<Char>* search,
491  Vector<const Char> subject,
492  size_t start_index) {
493  Vector<const Char> pattern = search->pattern_;
494  const size_t subject_length = subject.length();
495  const size_t pattern_length = pattern.length();
496  int* char_occurrences = search->bad_char_table();
497  int64_t badness = -pattern_length;
498 
499  // How bad we are doing without a good-suffix table.
500  Char last_char = pattern[pattern_length - 1];
501  int last_char_shift =
502  pattern_length - 1 -
503  CharOccurrence(char_occurrences, static_cast<Char>(last_char));
504 
505  // Perform search
506  size_t index = start_index; // No matches found prior to this index.
507  while (index <= subject_length - pattern_length) {
508  size_t j = pattern_length - 1;
509  int subject_char;
510  while (last_char != (subject_char = subject[index + j])) {
511  int bc_occ = CharOccurrence(char_occurrences, subject_char);
512  int shift = j - bc_occ;
513  index += shift;
514  badness += 1 - shift; // at most zero, so badness cannot increase.
515  if (index > subject_length - pattern_length) {
516  return subject_length;
517  }
518  }
519  j--;
520  while (pattern[j] == (subject[index + j])) {
521  if (j == 0) {
522  return index;
523  }
524  j--;
525  }
526  index += last_char_shift;
527  // Badness increases by the number of characters we have
528  // checked, and decreases by the number of characters we
529  // can skip by shifting. It's a measure of how we are doing
530  // compared to reading each character exactly once.
531  badness += (pattern_length - j) - last_char_shift;
532  if (badness > 0) {
533  search->PopulateBoyerMooreTable();
534  search->strategy_ = &BoyerMooreSearch;
535  return BoyerMooreSearch(search, subject, index);
536  }
537  }
538  return subject.length();
539 }
540 
541 template <typename Char>
542 void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() {
543  const size_t pattern_length = pattern_.length();
544 
545  int* bad_char_occurrence = bad_char_table();
546 
547  // Only preprocess at most kBMMaxShift last characters of pattern.
548  const size_t start = start_;
549  // Run forwards to populate bad_char_table, so that *last* instance
550  // of character equivalence class is the one registered.
551  // Notice: Doesn't include the last character.
552  const size_t table_size = AlphabetSize();
553  if (start == 0) {
554  // All patterns less than kBMMaxShift in length.
555  memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
556  } else {
557  for (size_t i = 0; i < table_size; i++) {
558  bad_char_occurrence[i] = start - 1;
559  }
560  }
561  for (size_t i = start; i < pattern_length - 1; i++) {
562  Char c = pattern_[i];
563  int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize();
564  bad_char_occurrence[bucket] = i;
565  }
566 }
567 
568 //---------------------------------------------------------------------
569 // Linear string search with bailout to BMH.
570 //---------------------------------------------------------------------
571 
572 // Simple linear search for short patterns, which bails out if the string
573 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
574 template <typename Char>
575 size_t StringSearch<Char>::InitialSearch(
576  StringSearch<Char>* search,
577  Vector<const Char> subject,
578  size_t index) {
579  Vector<const Char> pattern = search->pattern_;
580  const size_t pattern_length = pattern.length();
581  // Badness is a count of how much work we have done. When we have
582  // done enough work we decide it's probably worth switching to a better
583  // algorithm.
584  int64_t badness = -10 - (pattern_length << 2);
585 
586  // We know our pattern is at least 2 characters, we cache the first so
587  // the common case of the first character not matching is faster.
588  for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
589  badness++;
590  if (badness <= 0) {
591  i = FindFirstCharacter(pattern, subject, i);
592  if (i == subject.length())
593  return subject.length();
594  CHECK_LE(i, n);
595  size_t j = 1;
596  do {
597  if (pattern[j] != subject[i + j]) {
598  break;
599  }
600  j++;
601  } while (j < pattern_length);
602  if (j == pattern_length) {
603  return i;
604  }
605  badness += j;
606  } else {
607  search->PopulateBoyerMooreHorspoolTable();
608  search->strategy_ = &BoyerMooreHorspoolSearch;
609  return BoyerMooreHorspoolSearch(search, subject, i);
610  }
611  }
612  return subject.length();
613 }
614 
615 // Perform a a single stand-alone search.
616 // If searching multiple times for the same pattern, a search
617 // object should be constructed once and the Search function then called
618 // for each search.
619 template <typename Char>
620 size_t SearchString(Vector<const Char> subject,
621  Vector<const Char> pattern,
622  size_t start_index) {
623  StringSearch<Char> search(pattern);
624  return search.Search(subject, start_index);
625 }
626 } // namespace stringsearch
627 } // namespace node
628 
629 namespace node {
630 using node::stringsearch::Vector;
631 
632 template <typename Char>
633 size_t SearchString(const Char* haystack,
634  size_t haystack_length,
635  const Char* needle,
636  size_t needle_length,
637  size_t start_index,
638  bool is_forward) {
639  // To do a reverse search (lastIndexOf instead of indexOf) without redundant
640  // code, create two vectors that are reversed views into the input strings.
641  // For example, v_needle[0] would return the *last* character of the needle.
642  // So we're searching for the first instance of rev(needle) in rev(haystack)
643  Vector<const Char> v_needle = Vector<const Char>(
644  needle, needle_length, is_forward);
645  Vector<const Char> v_haystack = Vector<const Char>(
646  haystack, haystack_length, is_forward);
647  CHECK(haystack_length >= needle_length);
648  size_t diff = haystack_length - needle_length;
649  size_t relative_start_index;
650  if (is_forward) {
651  relative_start_index = start_index;
652  } else if (diff < start_index) {
653  relative_start_index = 0;
654  } else {
655  relative_start_index = diff - start_index;
656  }
657  size_t pos = node::stringsearch::SearchString(
658  v_haystack, v_needle, relative_start_index);
659  if (pos == haystack_length) {
660  // not found
661  return pos;
662  }
663  return is_forward ? pos : (haystack_length - needle_length - pos);
664 }
665 } // namespace node
666 
667 #endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
668 
669 #endif // SRC_STRING_SEARCH_H_
union node::cares_wrap::@8::CaresAsyncData::@0 data
dtrace a
Definition: v8ustack.d:531
dtrace n
Definition: v8ustack.d:531