v8  3.25.30(node0.11.13)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
string-search.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_STRING_SEARCH_H_
29 #define V8_STRING_SEARCH_H_
30 
31 namespace v8 {
32 namespace internal {
33 
34 
35 //---------------------------------------------------------------------
36 // String Search object.
37 //---------------------------------------------------------------------
38 
39 // Class holding constants and methods that apply to all string search variants,
40 // independently of subject and pattern char size.
42  protected:
43  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44  // limit, we can fix the size of tables. For a needle longer than this limit,
45  // search will not be optimal, since we only build tables for a suffix
46  // of the string, but it is a safe approximation.
47  static const int kBMMaxShift = Isolate::kBMMaxShift;
48 
49  // Reduce alphabet to this size.
50  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51  // proportional to the input alphabet. We reduce the alphabet size by
52  // equating input characters modulo a smaller alphabet size. This gives
53  // a potentially less efficient searching, but is a safe approximation.
54  // For needles using only characters in the same Unicode 256-code point page,
55  // there is no search speed degradation.
56  static const int kAsciiAlphabetSize = 256;
58 
59  // Bad-char shift table stored in the state. It's length is the alphabet size.
60  // For patterns below this length, the skip length of Boyer-Moore is too short
61  // to compensate for the algorithmic overhead compared to simple brute force.
62  static const int kBMMinPatternLength = 7;
63 
64  static inline bool IsOneByteString(Vector<const uint8_t> string) {
65  return true;
66  }
67 
68  static inline bool IsOneByteString(Vector<const uc16> string) {
69  return String::IsOneByte(string.start(), string.length());
70  }
71 
72  friend class Isolate;
73 };
74 
75 
76 template <typename PatternChar, typename SubjectChar>
77 class StringSearch : private StringSearchBase {
78  public:
80  : isolate_(isolate),
81  pattern_(pattern),
82  start_(Max(0, pattern.length() - kBMMaxShift)) {
83  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
84  if (!IsOneByteString(pattern_)) {
85  strategy_ = &FailSearch;
86  return;
87  }
88  }
89  int pattern_length = pattern_.length();
90  if (pattern_length < kBMMinPatternLength) {
91  if (pattern_length == 1) {
92  strategy_ = &SingleCharSearch;
93  return;
94  }
95  strategy_ = &LinearSearch;
96  return;
97  }
98  strategy_ = &InitialSearch;
99  }
100 
101  int Search(Vector<const SubjectChar> subject, int index) {
102  return strategy_(this, subject, index);
103  }
104 
105  static inline int AlphabetSize() {
106  if (sizeof(PatternChar) == 1) {
107  // ASCII needle.
108  return kAsciiAlphabetSize;
109  } else {
110  ASSERT(sizeof(PatternChar) == 2);
111  // UC16 needle.
112  return kUC16AlphabetSize;
113  }
114  }
115 
116  private:
117  typedef int (*SearchFunction)( // NOLINT - it's not a cast!
120  int);
121 
122  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
124  int) {
125  return -1;
126  }
127 
128  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
129  Vector<const SubjectChar> subject,
130  int start_index);
131 
132  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
133  Vector<const SubjectChar> subject,
134  int start_index);
135 
136  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
137  Vector<const SubjectChar> subject,
138  int start_index);
139 
140  static int BoyerMooreHorspoolSearch(
141  StringSearch<PatternChar, SubjectChar>* search,
142  Vector<const SubjectChar> subject,
143  int start_index);
144 
145  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
146  Vector<const SubjectChar> subject,
147  int start_index);
148 
149  void PopulateBoyerMooreHorspoolTable();
150 
151  void PopulateBoyerMooreTable();
152 
153  static inline bool exceedsOneByte(uint8_t c) {
154  return false;
155  }
156 
157  static inline bool exceedsOneByte(uint16_t c) {
158  return c > String::kMaxOneByteCharCodeU;
159  }
160 
161  static inline int CharOccurrence(int* bad_char_occurrence,
162  SubjectChar char_code) {
163  if (sizeof(SubjectChar) == 1) {
164  return bad_char_occurrence[static_cast<int>(char_code)];
165  }
166  if (sizeof(PatternChar) == 1) {
167  if (exceedsOneByte(char_code)) {
168  return -1;
169  }
170  return bad_char_occurrence[static_cast<unsigned int>(char_code)];
171  }
172  // Both pattern and subject are UC16. Reduce character to equivalence class.
173  int equiv_class = char_code % kUC16AlphabetSize;
174  return bad_char_occurrence[equiv_class];
175  }
176 
177  // The following tables are shared by all searches.
178  // TODO(lrn): Introduce a way for a pattern to keep its tables
179  // between searches (e.g., for an Atom RegExp).
180 
181  // Store for the BoyerMoore(Horspool) bad char shift table.
182  // Return a table covering the last kBMMaxShift+1 positions of
183  // pattern.
184  int* bad_char_table() {
185  return isolate_->bad_char_shift_table();
186  }
187 
188  // Store for the BoyerMoore good suffix shift table.
189  int* good_suffix_shift_table() {
190  // Return biased pointer that maps the range [start_..pattern_.length()
191  // to the kGoodSuffixShiftTable array.
192  return isolate_->good_suffix_shift_table() - start_;
193  }
194 
195  // Table used temporarily while building the BoyerMoore good suffix
196  // shift table.
197  int* suffix_table() {
198  // Return biased pointer that maps the range [start_..pattern_.length()
199  // to the kSuffixTable array.
200  return isolate_->suffix_table() - start_;
201  }
202 
203  Isolate* isolate_;
204  // The pattern to search for.
205  Vector<const PatternChar> pattern_;
206  // Pointer to implementation of the search.
207  SearchFunction strategy_;
208  // Cache value of Max(0, pattern_length() - kBMMaxShift)
209  int start_;
210 };
211 
212 
213 //---------------------------------------------------------------------
214 // Single Character Pattern Search Strategy
215 //---------------------------------------------------------------------
216 
217 template <typename PatternChar, typename SubjectChar>
218 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
219  StringSearch<PatternChar, SubjectChar>* search,
220  Vector<const SubjectChar> subject,
221  int index) {
222  ASSERT_EQ(1, search->pattern_.length());
223  PatternChar pattern_first_char = search->pattern_[0];
224  int i = index;
225  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
226  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
227  memchr(subject.start() + i,
228  pattern_first_char,
229  subject.length() - i));
230  if (pos == NULL) return -1;
231  return static_cast<int>(pos - subject.start());
232  } else {
233  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
234  if (exceedsOneByte(pattern_first_char)) {
235  return -1;
236  }
237  }
238  SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
239  int n = subject.length();
240  while (i < n) {
241  if (subject[i++] == search_char) return i - 1;
242  }
243  return -1;
244  }
245 }
246 
247 //---------------------------------------------------------------------
248 // Linear Search Strategy
249 //---------------------------------------------------------------------
250 
251 
252 template <typename PatternChar, typename SubjectChar>
253 inline bool CharCompare(const PatternChar* pattern,
254  const SubjectChar* subject,
255  int length) {
256  ASSERT(length > 0);
257  int pos = 0;
258  do {
259  if (pattern[pos] != subject[pos]) {
260  return false;
261  }
262  pos++;
263  } while (pos < length);
264  return true;
265 }
266 
267 
268 // Simple linear search for short patterns. Never bails out.
269 template <typename PatternChar, typename SubjectChar>
271  StringSearch<PatternChar, SubjectChar>* search,
272  Vector<const SubjectChar> subject,
273  int index) {
274  Vector<const PatternChar> pattern = search->pattern_;
275  ASSERT(pattern.length() > 1);
276  int pattern_length = pattern.length();
277  PatternChar pattern_first_char = pattern[0];
278  int i = index;
279  int n = subject.length() - pattern_length;
280  while (i <= n) {
281  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
282  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
283  memchr(subject.start() + i,
284  pattern_first_char,
285  n - i + 1));
286  if (pos == NULL) return -1;
287  i = static_cast<int>(pos - subject.start()) + 1;
288  } else {
289  if (subject[i++] != pattern_first_char) continue;
290  }
291  // Loop extracted to separate function to allow using return to do
292  // a deeper break.
293  if (CharCompare(pattern.start() + 1,
294  subject.start() + i,
295  pattern_length - 1)) {
296  return i - 1;
297  }
298  }
299  return -1;
300 }
301 
302 //---------------------------------------------------------------------
303 // Boyer-Moore string search
304 //---------------------------------------------------------------------
305 
306 template <typename PatternChar, typename SubjectChar>
307 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
308  StringSearch<PatternChar, SubjectChar>* search,
309  Vector<const SubjectChar> subject,
310  int start_index) {
311  Vector<const PatternChar> pattern = search->pattern_;
312  int subject_length = subject.length();
313  int pattern_length = pattern.length();
314  // Only preprocess at most kBMMaxShift last characters of pattern.
315  int start = search->start_;
316 
317  int* bad_char_occurence = search->bad_char_table();
318  int* good_suffix_shift = search->good_suffix_shift_table();
319 
320  PatternChar last_char = pattern[pattern_length - 1];
321  int index = start_index;
322  // Continue search from i.
323  while (index <= subject_length - pattern_length) {
324  int j = pattern_length - 1;
325  int c;
326  while (last_char != (c = subject[index + j])) {
327  int shift =
328  j - CharOccurrence(bad_char_occurence, c);
329  index += shift;
330  if (index > subject_length - pattern_length) {
331  return -1;
332  }
333  }
334  while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
335  if (j < 0) {
336  return index;
337  } else if (j < start) {
338  // we have matched more than our tables allow us to be smart about.
339  // Fall back on BMH shift.
340  index += pattern_length - 1
341  - CharOccurrence(bad_char_occurence,
342  static_cast<SubjectChar>(last_char));
343  } else {
344  int gs_shift = good_suffix_shift[j + 1];
345  int bc_occ =
346  CharOccurrence(bad_char_occurence, c);
347  int shift = j - bc_occ;
348  if (gs_shift > shift) {
349  shift = gs_shift;
350  }
351  index += shift;
352  }
353  }
354 
355  return -1;
356 }
357 
358 
359 template <typename PatternChar, typename SubjectChar>
360 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
361  int pattern_length = pattern_.length();
362  const PatternChar* pattern = pattern_.start();
363  // Only look at the last kBMMaxShift characters of pattern (from start_
364  // to pattern_length).
365  int start = start_;
366  int length = pattern_length - start;
367 
368  // Biased tables so that we can use pattern indices as table indices,
369  // even if we only cover the part of the pattern from offset start.
370  int* shift_table = good_suffix_shift_table();
371  int* suffix_table = this->suffix_table();
372 
373  // Initialize table.
374  for (int i = start; i < pattern_length; i++) {
375  shift_table[i] = length;
376  }
377  shift_table[pattern_length] = 1;
378  suffix_table[pattern_length] = pattern_length + 1;
379 
380  if (pattern_length <= start) {
381  return;
382  }
383 
384  // Find suffixes.
385  PatternChar last_char = pattern[pattern_length - 1];
386  int suffix = pattern_length + 1;
387  {
388  int i = pattern_length;
389  while (i > start) {
390  PatternChar c = pattern[i - 1];
391  while (suffix <= pattern_length && c != pattern[suffix - 1]) {
392  if (shift_table[suffix] == length) {
393  shift_table[suffix] = suffix - i;
394  }
395  suffix = suffix_table[suffix];
396  }
397  suffix_table[--i] = --suffix;
398  if (suffix == pattern_length) {
399  // No suffix to extend, so we check against last_char only.
400  while ((i > start) && (pattern[i - 1] != last_char)) {
401  if (shift_table[pattern_length] == length) {
402  shift_table[pattern_length] = pattern_length - i;
403  }
404  suffix_table[--i] = pattern_length;
405  }
406  if (i > start) {
407  suffix_table[--i] = --suffix;
408  }
409  }
410  }
411  }
412  // Build shift table using suffixes.
413  if (suffix < pattern_length) {
414  for (int i = start; i <= pattern_length; i++) {
415  if (shift_table[i] == length) {
416  shift_table[i] = suffix - start;
417  }
418  if (i == suffix) {
419  suffix = suffix_table[suffix];
420  }
421  }
422  }
423 }
424 
425 //---------------------------------------------------------------------
426 // Boyer-Moore-Horspool string search.
427 //---------------------------------------------------------------------
428 
429 template <typename PatternChar, typename SubjectChar>
430 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
431  StringSearch<PatternChar, SubjectChar>* search,
432  Vector<const SubjectChar> subject,
433  int start_index) {
434  Vector<const PatternChar> pattern = search->pattern_;
435  int subject_length = subject.length();
436  int pattern_length = pattern.length();
437  int* char_occurrences = search->bad_char_table();
438  int badness = -pattern_length;
439 
440  // How bad we are doing without a good-suffix table.
441  PatternChar last_char = pattern[pattern_length - 1];
442  int last_char_shift = pattern_length - 1 -
443  CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
444  // Perform search
445  int index = start_index; // No matches found prior to this index.
446  while (index <= subject_length - pattern_length) {
447  int j = pattern_length - 1;
448  int subject_char;
449  while (last_char != (subject_char = subject[index + j])) {
450  int bc_occ = CharOccurrence(char_occurrences, subject_char);
451  int shift = j - bc_occ;
452  index += shift;
453  badness += 1 - shift; // at most zero, so badness cannot increase.
454  if (index > subject_length - pattern_length) {
455  return -1;
456  }
457  }
458  j--;
459  while (j >= 0 && pattern[j] == (subject[index + j])) j--;
460  if (j < 0) {
461  return index;
462  } else {
463  index += last_char_shift;
464  // Badness increases by the number of characters we have
465  // checked, and decreases by the number of characters we
466  // can skip by shifting. It's a measure of how we are doing
467  // compared to reading each character exactly once.
468  badness += (pattern_length - j) - last_char_shift;
469  if (badness > 0) {
470  search->PopulateBoyerMooreTable();
471  search->strategy_ = &BoyerMooreSearch;
472  return BoyerMooreSearch(search, subject, index);
473  }
474  }
475  }
476  return -1;
477 }
478 
479 
480 template <typename PatternChar, typename SubjectChar>
481 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
482  int pattern_length = pattern_.length();
483 
484  int* bad_char_occurrence = bad_char_table();
485 
486  // Only preprocess at most kBMMaxShift last characters of pattern.
487  int start = start_;
488  // Run forwards to populate bad_char_table, so that *last* instance
489  // of character equivalence class is the one registered.
490  // Notice: Doesn't include the last character.
491  int table_size = AlphabetSize();
492  if (start == 0) { // All patterns less than kBMMaxShift in length.
493  memset(bad_char_occurrence,
494  -1,
495  table_size * sizeof(*bad_char_occurrence));
496  } else {
497  for (int i = 0; i < table_size; i++) {
498  bad_char_occurrence[i] = start - 1;
499  }
500  }
501  for (int i = start; i < pattern_length - 1; i++) {
502  PatternChar c = pattern_[i];
503  int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
504  bad_char_occurrence[bucket] = i;
505  }
506 }
507 
508 //---------------------------------------------------------------------
509 // Linear string search with bailout to BMH.
510 //---------------------------------------------------------------------
511 
512 // Simple linear search for short patterns, which bails out if the string
513 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
514 template <typename PatternChar, typename SubjectChar>
515 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
516  StringSearch<PatternChar, SubjectChar>* search,
517  Vector<const SubjectChar> subject,
518  int index) {
519  Vector<const PatternChar> pattern = search->pattern_;
520  int pattern_length = pattern.length();
521  // Badness is a count of how much work we have done. When we have
522  // done enough work we decide it's probably worth switching to a better
523  // algorithm.
524  int badness = -10 - (pattern_length << 2);
525 
526  // We know our pattern is at least 2 characters, we cache the first so
527  // the common case of the first character not matching is faster.
528  PatternChar pattern_first_char = pattern[0];
529  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
530  badness++;
531  if (badness <= 0) {
532  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
533  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
534  memchr(subject.start() + i,
535  pattern_first_char,
536  n - i + 1));
537  if (pos == NULL) {
538  return -1;
539  }
540  i = static_cast<int>(pos - subject.start());
541  } else {
542  if (subject[i] != pattern_first_char) continue;
543  }
544  int j = 1;
545  do {
546  if (pattern[j] != subject[i + j]) {
547  break;
548  }
549  j++;
550  } while (j < pattern_length);
551  if (j == pattern_length) {
552  return i;
553  }
554  badness += j;
555  } else {
556  search->PopulateBoyerMooreHorspoolTable();
557  search->strategy_ = &BoyerMooreHorspoolSearch;
558  return BoyerMooreHorspoolSearch(search, subject, i);
559  }
560  }
561  return -1;
562 }
563 
564 
565 // Perform a a single stand-alone search.
566 // If searching multiple times for the same pattern, a search
567 // object should be constructed once and the Search function then called
568 // for each search.
569 template <typename SubjectChar, typename PatternChar>
570 int SearchString(Isolate* isolate,
573  int start_index) {
574  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
575  return search.Search(subject, start_index);
576 }
577 
578 }} // namespace v8::internal
579 
580 #endif // V8_STRING_SEARCH_H_
enable upcoming ES6 features enable harmony block scoping enable harmony enable harmony proxies enable harmony generators enable harmony numeric enable harmony string enable harmony math functions harmony_scoping harmony_symbols harmony_collections harmony_iteration harmony_strings harmony_scoping harmony_maths tracks arrays with only smi values Optimize object Array DOM strings and string pretenure call new trace pretenuring decisions of HAllocate instructions track fields with only smi values track fields with heap values track_fields track_fields Enables optimizations which favor memory size over execution speed use string slices optimization filter maximum number of GVN fix point iterations use function inlining use allocation folding eliminate write barriers targeting allocations in optimized code maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining crankshaft harvests type feedback from stub cache trace check elimination phase hydrogen tracing filter NULL
Definition: flags.cc:269
bool CharCompare(const PatternChar *pattern, const SubjectChar *subject, int length)
T Max(T a, T b)
Definition: utils.h:227
static bool IsOneByteString(Vector< const uint8_t > string)
Definition: string-search.h:64
int Search(Vector< const SubjectChar > subject, int index)
#define ASSERT(condition)
Definition: checks.h:329
unsigned short uint16_t
Definition: unicode.cc:46
static const uint32_t kMaxOneByteCharCodeU
Definition: objects.h:8915
static bool IsOneByte(const uc16 *chars, int length)
Definition: objects.h:8985
static const int kAsciiAlphabetSize
Definition: string-search.h:56
enable upcoming ES6 features enable harmony block scoping enable harmony enable harmony proxies enable harmony generators enable harmony numeric enable harmony string enable harmony math functions harmony_scoping harmony_symbols harmony_collections harmony_iteration harmony_strings harmony_scoping harmony_maths tracks arrays with only smi values Optimize object Array shift
Definition: flags.cc:211
int length() const
Definition: utils.h:420
static const int kUC16AlphabetSize
Definition: isolate.h:824
StringSearch(Isolate *isolate, Vector< const PatternChar > pattern)
Definition: string-search.h:79
static bool IsOneByteString(Vector< const uc16 > string)
Definition: string-search.h:68
int SearchString(Isolate *isolate, Vector< const SubjectChar > subject, Vector< const PatternChar > pattern, int start_index)
#define ASSERT_EQ(v1, v2)
Definition: checks.h:330
static const int kBMMaxShift
Definition: isolate.h:825
static const int kUC16AlphabetSize
Definition: string-search.h:57
static const int kBMMinPatternLength
Definition: string-search.h:62
int LinearSearch(T *array, Name *name, int len, int valid_entries)
Definition: objects-inl.h:2487