v8  3.14.5(node0.10.28)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
string-search.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_STRING_SEARCH_H_
29 #define V8_STRING_SEARCH_H_
30 
31 namespace v8 {
32 namespace internal {
33 
34 
35 //---------------------------------------------------------------------
36 // String Search object.
37 //---------------------------------------------------------------------
38 
39 // Class holding constants and methods that apply to all string search variants,
40 // independently of subject and pattern char size.
42  protected:
43  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44  // limit, we can fix the size of tables. For a needle longer than this limit,
45  // search will not be optimal, since we only build tables for a suffix
46  // of the string, but it is a safe approximation.
47  static const int kBMMaxShift = Isolate::kBMMaxShift;
48 
49  // Reduce alphabet to this size.
50  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51  // proportional to the input alphabet. We reduce the alphabet size by
52  // equating input characters modulo a smaller alphabet size. This gives
53  // a potentially less efficient searching, but is a safe approximation.
54  // For needles using only characters in the same Unicode 256-code point page,
55  // there is no search speed degradation.
56  static const int kAsciiAlphabetSize = 128;
58 
59  // Bad-char shift table stored in the state. It's length is the alphabet size.
60  // For patterns below this length, the skip length of Boyer-Moore is too short
61  // to compensate for the algorithmic overhead compared to simple brute force.
62  static const int kBMMinPatternLength = 7;
63 
64  static inline bool IsAsciiString(Vector<const char>) {
65  return true;
66  }
67 
68  static inline bool IsAsciiString(Vector<const uc16> string) {
69  return String::IsAscii(string.start(), string.length());
70  }
71 
72  friend class Isolate;
73 };
74 
75 
76 template <typename PatternChar, typename SubjectChar>
77 class StringSearch : private StringSearchBase {
78  public:
80  : isolate_(isolate),
81  pattern_(pattern),
82  start_(Max(0, pattern.length() - kBMMaxShift)) {
83  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
84  if (!IsAsciiString(pattern_)) {
85  strategy_ = &FailSearch;
86  return;
87  }
88  }
89  int pattern_length = pattern_.length();
90  if (pattern_length < kBMMinPatternLength) {
91  if (pattern_length == 1) {
92  strategy_ = &SingleCharSearch;
93  return;
94  }
95  strategy_ = &LinearSearch;
96  return;
97  }
98  strategy_ = &InitialSearch;
99  }
100 
101  int Search(Vector<const SubjectChar> subject, int index) {
102  return strategy_(this, subject, index);
103  }
104 
105  static inline int AlphabetSize() {
106  if (sizeof(PatternChar) == 1) {
107  // ASCII needle.
108  return kAsciiAlphabetSize;
109  } else {
110  ASSERT(sizeof(PatternChar) == 2);
111  // UC16 needle.
112  return kUC16AlphabetSize;
113  }
114  }
115 
116  private:
117  typedef int (*SearchFunction)( // NOLINT - it's not a cast!
120  int);
121 
122  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
124  int) {
125  return -1;
126  }
127 
128  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
129  Vector<const SubjectChar> subject,
130  int start_index);
131 
132  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
133  Vector<const SubjectChar> subject,
134  int start_index);
135 
136  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
137  Vector<const SubjectChar> subject,
138  int start_index);
139 
140  static int BoyerMooreHorspoolSearch(
141  StringSearch<PatternChar, SubjectChar>* search,
142  Vector<const SubjectChar> subject,
143  int start_index);
144 
145  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
146  Vector<const SubjectChar> subject,
147  int start_index);
148 
149  void PopulateBoyerMooreHorspoolTable();
150 
151  void PopulateBoyerMooreTable();
152 
153  static inline int CharOccurrence(int* bad_char_occurrence,
154  SubjectChar char_code) {
155  if (sizeof(SubjectChar) == 1) {
156  return bad_char_occurrence[static_cast<int>(char_code)];
157  }
158  if (sizeof(PatternChar) == 1) {
159  if (static_cast<unsigned int>(char_code) > String::kMaxAsciiCharCodeU) {
160  return -1;
161  }
162  return bad_char_occurrence[static_cast<unsigned int>(char_code)];
163  }
164  // Both pattern and subject are UC16. Reduce character to equivalence class.
165  int equiv_class = char_code % kUC16AlphabetSize;
166  return bad_char_occurrence[equiv_class];
167  }
168 
169  // The following tables are shared by all searches.
170  // TODO(lrn): Introduce a way for a pattern to keep its tables
171  // between searches (e.g., for an Atom RegExp).
172 
173  // Store for the BoyerMoore(Horspool) bad char shift table.
174  // Return a table covering the last kBMMaxShift+1 positions of
175  // pattern.
176  int* bad_char_table() {
177  return isolate_->bad_char_shift_table();
178  }
179 
180  // Store for the BoyerMoore good suffix shift table.
181  int* good_suffix_shift_table() {
182  // Return biased pointer that maps the range [start_..pattern_.length()
183  // to the kGoodSuffixShiftTable array.
184  return isolate_->good_suffix_shift_table() - start_;
185  }
186 
187  // Table used temporarily while building the BoyerMoore good suffix
188  // shift table.
189  int* suffix_table() {
190  // Return biased pointer that maps the range [start_..pattern_.length()
191  // to the kSuffixTable array.
192  return isolate_->suffix_table() - start_;
193  }
194 
195  Isolate* isolate_;
196  // The pattern to search for.
197  Vector<const PatternChar> pattern_;
198  // Pointer to implementation of the search.
199  SearchFunction strategy_;
200  // Cache value of Max(0, pattern_length() - kBMMaxShift)
201  int start_;
202 };
203 
204 
205 //---------------------------------------------------------------------
206 // Single Character Pattern Search Strategy
207 //---------------------------------------------------------------------
208 
209 template <typename PatternChar, typename SubjectChar>
210 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
211  StringSearch<PatternChar, SubjectChar>* search,
212  Vector<const SubjectChar> subject,
213  int index) {
214  ASSERT_EQ(1, search->pattern_.length());
215  PatternChar pattern_first_char = search->pattern_[0];
216  int i = index;
217  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
218  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
219  memchr(subject.start() + i,
220  pattern_first_char,
221  subject.length() - i));
222  if (pos == NULL) return -1;
223  return static_cast<int>(pos - subject.start());
224  } else {
225  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
226  if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
227  return -1;
228  }
229  }
230  SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
231  int n = subject.length();
232  while (i < n) {
233  if (subject[i++] == search_char) return i - 1;
234  }
235  return -1;
236  }
237 }
238 
239 //---------------------------------------------------------------------
240 // Linear Search Strategy
241 //---------------------------------------------------------------------
242 
243 
244 template <typename PatternChar, typename SubjectChar>
245 inline bool CharCompare(const PatternChar* pattern,
246  const SubjectChar* subject,
247  int length) {
248  ASSERT(length > 0);
249  int pos = 0;
250  do {
251  if (pattern[pos] != subject[pos]) {
252  return false;
253  }
254  pos++;
255  } while (pos < length);
256  return true;
257 }
258 
259 
260 // Simple linear search for short patterns. Never bails out.
261 template <typename PatternChar, typename SubjectChar>
263  StringSearch<PatternChar, SubjectChar>* search,
264  Vector<const SubjectChar> subject,
265  int index) {
266  Vector<const PatternChar> pattern = search->pattern_;
267  ASSERT(pattern.length() > 1);
268  int pattern_length = pattern.length();
269  PatternChar pattern_first_char = pattern[0];
270  int i = index;
271  int n = subject.length() - pattern_length;
272  while (i <= n) {
273  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
274  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
275  memchr(subject.start() + i,
276  pattern_first_char,
277  n - i + 1));
278  if (pos == NULL) return -1;
279  i = static_cast<int>(pos - subject.start()) + 1;
280  } else {
281  if (subject[i++] != pattern_first_char) continue;
282  }
283  // Loop extracted to separate function to allow using return to do
284  // a deeper break.
285  if (CharCompare(pattern.start() + 1,
286  subject.start() + i,
287  pattern_length - 1)) {
288  return i - 1;
289  }
290  }
291  return -1;
292 }
293 
294 //---------------------------------------------------------------------
295 // Boyer-Moore string search
296 //---------------------------------------------------------------------
297 
298 template <typename PatternChar, typename SubjectChar>
299 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
300  StringSearch<PatternChar, SubjectChar>* search,
301  Vector<const SubjectChar> subject,
302  int start_index) {
303  Vector<const PatternChar> pattern = search->pattern_;
304  int subject_length = subject.length();
305  int pattern_length = pattern.length();
306  // Only preprocess at most kBMMaxShift last characters of pattern.
307  int start = search->start_;
308 
309  int* bad_char_occurence = search->bad_char_table();
310  int* good_suffix_shift = search->good_suffix_shift_table();
311 
312  PatternChar last_char = pattern[pattern_length - 1];
313  int index = start_index;
314  // Continue search from i.
315  while (index <= subject_length - pattern_length) {
316  int j = pattern_length - 1;
317  int c;
318  while (last_char != (c = subject[index + j])) {
319  int shift =
320  j - CharOccurrence(bad_char_occurence, c);
321  index += shift;
322  if (index > subject_length - pattern_length) {
323  return -1;
324  }
325  }
326  while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
327  if (j < 0) {
328  return index;
329  } else if (j < start) {
330  // we have matched more than our tables allow us to be smart about.
331  // Fall back on BMH shift.
332  index += pattern_length - 1
333  - CharOccurrence(bad_char_occurence,
334  static_cast<SubjectChar>(last_char));
335  } else {
336  int gs_shift = good_suffix_shift[j + 1];
337  int bc_occ =
338  CharOccurrence(bad_char_occurence, c);
339  int shift = j - bc_occ;
340  if (gs_shift > shift) {
341  shift = gs_shift;
342  }
343  index += shift;
344  }
345  }
346 
347  return -1;
348 }
349 
350 
351 template <typename PatternChar, typename SubjectChar>
352 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
353  int pattern_length = pattern_.length();
354  const PatternChar* pattern = pattern_.start();
355  // Only look at the last kBMMaxShift characters of pattern (from start_
356  // to pattern_length).
357  int start = start_;
358  int length = pattern_length - start;
359 
360  // Biased tables so that we can use pattern indices as table indices,
361  // even if we only cover the part of the pattern from offset start.
362  int* shift_table = good_suffix_shift_table();
363  int* suffix_table = this->suffix_table();
364 
365  // Initialize table.
366  for (int i = start; i < pattern_length; i++) {
367  shift_table[i] = length;
368  }
369  shift_table[pattern_length] = 1;
370  suffix_table[pattern_length] = pattern_length + 1;
371 
372  if (pattern_length <= start) {
373  return;
374  }
375 
376  // Find suffixes.
377  PatternChar last_char = pattern[pattern_length - 1];
378  int suffix = pattern_length + 1;
379  {
380  int i = pattern_length;
381  while (i > start) {
382  PatternChar c = pattern[i - 1];
383  while (suffix <= pattern_length && c != pattern[suffix - 1]) {
384  if (shift_table[suffix] == length) {
385  shift_table[suffix] = suffix - i;
386  }
387  suffix = suffix_table[suffix];
388  }
389  suffix_table[--i] = --suffix;
390  if (suffix == pattern_length) {
391  // No suffix to extend, so we check against last_char only.
392  while ((i > start) && (pattern[i - 1] != last_char)) {
393  if (shift_table[pattern_length] == length) {
394  shift_table[pattern_length] = pattern_length - i;
395  }
396  suffix_table[--i] = pattern_length;
397  }
398  if (i > start) {
399  suffix_table[--i] = --suffix;
400  }
401  }
402  }
403  }
404  // Build shift table using suffixes.
405  if (suffix < pattern_length) {
406  for (int i = start; i <= pattern_length; i++) {
407  if (shift_table[i] == length) {
408  shift_table[i] = suffix - start;
409  }
410  if (i == suffix) {
411  suffix = suffix_table[suffix];
412  }
413  }
414  }
415 }
416 
417 //---------------------------------------------------------------------
418 // Boyer-Moore-Horspool string search.
419 //---------------------------------------------------------------------
420 
421 template <typename PatternChar, typename SubjectChar>
422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
423  StringSearch<PatternChar, SubjectChar>* search,
424  Vector<const SubjectChar> subject,
425  int start_index) {
426  Vector<const PatternChar> pattern = search->pattern_;
427  int subject_length = subject.length();
428  int pattern_length = pattern.length();
429  int* char_occurrences = search->bad_char_table();
430  int badness = -pattern_length;
431 
432  // How bad we are doing without a good-suffix table.
433  PatternChar last_char = pattern[pattern_length - 1];
434  int last_char_shift = pattern_length - 1 -
435  CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
436  // Perform search
437  int index = start_index; // No matches found prior to this index.
438  while (index <= subject_length - pattern_length) {
439  int j = pattern_length - 1;
440  int subject_char;
441  while (last_char != (subject_char = subject[index + j])) {
442  int bc_occ = CharOccurrence(char_occurrences, subject_char);
443  int shift = j - bc_occ;
444  index += shift;
445  badness += 1 - shift; // at most zero, so badness cannot increase.
446  if (index > subject_length - pattern_length) {
447  return -1;
448  }
449  }
450  j--;
451  while (j >= 0 && pattern[j] == (subject[index + j])) j--;
452  if (j < 0) {
453  return index;
454  } else {
455  index += last_char_shift;
456  // Badness increases by the number of characters we have
457  // checked, and decreases by the number of characters we
458  // can skip by shifting. It's a measure of how we are doing
459  // compared to reading each character exactly once.
460  badness += (pattern_length - j) - last_char_shift;
461  if (badness > 0) {
462  search->PopulateBoyerMooreTable();
463  search->strategy_ = &BoyerMooreSearch;
464  return BoyerMooreSearch(search, subject, index);
465  }
466  }
467  }
468  return -1;
469 }
470 
471 
472 template <typename PatternChar, typename SubjectChar>
473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
474  int pattern_length = pattern_.length();
475 
476  int* bad_char_occurrence = bad_char_table();
477 
478  // Only preprocess at most kBMMaxShift last characters of pattern.
479  int start = start_;
480  // Run forwards to populate bad_char_table, so that *last* instance
481  // of character equivalence class is the one registered.
482  // Notice: Doesn't include the last character.
483  int table_size = AlphabetSize();
484  if (start == 0) { // All patterns less than kBMMaxShift in length.
485  memset(bad_char_occurrence,
486  -1,
487  table_size * sizeof(*bad_char_occurrence));
488  } else {
489  for (int i = 0; i < table_size; i++) {
490  bad_char_occurrence[i] = start - 1;
491  }
492  }
493  for (int i = start; i < pattern_length - 1; i++) {
494  PatternChar c = pattern_[i];
495  int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
496  bad_char_occurrence[bucket] = i;
497  }
498 }
499 
500 //---------------------------------------------------------------------
501 // Linear string search with bailout to BMH.
502 //---------------------------------------------------------------------
503 
504 // Simple linear search for short patterns, which bails out if the string
505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
506 template <typename PatternChar, typename SubjectChar>
507 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
508  StringSearch<PatternChar, SubjectChar>* search,
509  Vector<const SubjectChar> subject,
510  int index) {
511  Vector<const PatternChar> pattern = search->pattern_;
512  int pattern_length = pattern.length();
513  // Badness is a count of how much work we have done. When we have
514  // done enough work we decide it's probably worth switching to a better
515  // algorithm.
516  int badness = -10 - (pattern_length << 2);
517 
518  // We know our pattern is at least 2 characters, we cache the first so
519  // the common case of the first character not matching is faster.
520  PatternChar pattern_first_char = pattern[0];
521  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
522  badness++;
523  if (badness <= 0) {
524  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
525  const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
526  memchr(subject.start() + i,
527  pattern_first_char,
528  n - i + 1));
529  if (pos == NULL) {
530  return -1;
531  }
532  i = static_cast<int>(pos - subject.start());
533  } else {
534  if (subject[i] != pattern_first_char) continue;
535  }
536  int j = 1;
537  do {
538  if (pattern[j] != subject[i + j]) {
539  break;
540  }
541  j++;
542  } while (j < pattern_length);
543  if (j == pattern_length) {
544  return i;
545  }
546  badness += j;
547  } else {
548  search->PopulateBoyerMooreHorspoolTable();
549  search->strategy_ = &BoyerMooreHorspoolSearch;
550  return BoyerMooreHorspoolSearch(search, subject, i);
551  }
552  }
553  return -1;
554 }
555 
556 
557 // Perform a a single stand-alone search.
558 // If searching multiple times for the same pattern, a search
559 // object should be constructed once and the Search function then called
560 // for each search.
561 template <typename SubjectChar, typename PatternChar>
562 int SearchString(Isolate* isolate,
565  int start_index) {
566  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
567  return search.Search(subject, start_index);
568 }
569 
570 }} // namespace v8::internal
571 
572 #endif // V8_STRING_SEARCH_H_
bool CharCompare(const PatternChar *pattern, const SubjectChar *subject, int length)
T Max(T a, T b)
Definition: utils.h:222
int Search(Vector< const SubjectChar > subject, int index)
#define ASSERT(condition)
Definition: checks.h:270
static bool IsAscii(const char *chars, int length)
Definition: objects.h:7443
static const int kAsciiAlphabetSize
Definition: string-search.h:56
int LinearSearch(T *array, String *name, int len, int valid_entries)
Definition: objects-inl.h:1993
int length() const
Definition: utils.h:384
static const int kUC16AlphabetSize
Definition: isolate.h:787
StringSearch(Isolate *isolate, Vector< const PatternChar > pattern)
Definition: string-search.h:79
int SearchString(Isolate *isolate, Vector< const SubjectChar > subject, Vector< const PatternChar > pattern, int start_index)
#define ASSERT_EQ(v1, v2)
Definition: checks.h:271
static const int kBMMaxShift
Definition: isolate.h:788
static bool IsAsciiString(Vector< const char >)
Definition: string-search.h:64
static const unsigned kMaxAsciiCharCodeU
Definition: objects.h:7328
static const int kUC16AlphabetSize
Definition: string-search.h:57
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination use dead code elimination trace on stack replacement optimize closures cache optimized code for closures functions with arguments object loop weight for representation inference allow uint32 values on optimize frames if they are used only in safe operations track parallel recompilation enable all profiler experiments number of stack frames inspected by the profiler call recompile stub directly when self optimizing trigger profiler ticks based on counting instead of timing weight back edges by jump distance for interrupt triggering percentage of ICs that must have type info to allow optimization watch_ic_patching retry_self_opt interrupt_at_exit extra verbose compilation tracing generate extra emit comments in code disassembly enable use of SSE3 instructions if available enable use of CMOV instruction if available enable use of SAHF instruction if enable use of VFP3 instructions if available this implies enabling ARMv7 and VFP2 enable use of VFP2 instructions if available enable use of SDIV and UDIV instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of MIPS FPU instructions if NULL
Definition: flags.cc:301
static bool IsAsciiString(Vector< const uc16 > string)
Definition: string-search.h:68
static const int kBMMinPatternLength
Definition: string-search.h:62