v8  3.14.5(node0.10.28)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
jsregexp.cc
Go to the documentation of this file.
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #include "v8.h"
29 
30 #include "ast.h"
31 #include "compiler.h"
32 #include "execution.h"
33 #include "factory.h"
34 #include "jsregexp.h"
35 #include "platform.h"
36 #include "string-search.h"
37 #include "runtime.h"
38 #include "compilation-cache.h"
39 #include "string-stream.h"
40 #include "parser.h"
41 #include "regexp-macro-assembler.h"
44 #include "regexp-stack.h"
45 
46 #ifndef V8_INTERPRETED_REGEXP
47 #if V8_TARGET_ARCH_IA32
49 #elif V8_TARGET_ARCH_X64
51 #elif V8_TARGET_ARCH_ARM
53 #elif V8_TARGET_ARCH_MIPS
55 #else
56 #error Unsupported target architecture.
57 #endif
58 #endif
59 
60 #include "interpreter-irregexp.h"
61 
62 
63 namespace v8 {
64 namespace internal {
65 
67  Handle<String> pattern,
69  bool* has_pending_exception) {
70  // Call the construct code with 2 arguments.
71  Handle<Object> argv[] = { pattern, flags };
72  return Execution::New(constructor, ARRAY_SIZE(argv), argv,
73  has_pending_exception);
74 }
75 
76 
77 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
78  int flags = JSRegExp::NONE;
79  for (int i = 0; i < str->length(); i++) {
80  switch (str->Get(i)) {
81  case 'i':
82  flags |= JSRegExp::IGNORE_CASE;
83  break;
84  case 'g':
85  flags |= JSRegExp::GLOBAL;
86  break;
87  case 'm':
88  flags |= JSRegExp::MULTILINE;
89  break;
90  }
91  }
92  return JSRegExp::Flags(flags);
93 }
94 
95 
96 static inline void ThrowRegExpException(Handle<JSRegExp> re,
97  Handle<String> pattern,
98  Handle<String> error_text,
99  const char* message) {
100  Isolate* isolate = re->GetIsolate();
101  Factory* factory = isolate->factory();
102  Handle<FixedArray> elements = factory->NewFixedArray(2);
103  elements->set(0, *pattern);
104  elements->set(1, *error_text);
105  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
106  Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
107  isolate->Throw(*regexp_err);
108 }
109 
110 
112  const int* ranges,
113  int ranges_length,
114  Interval new_range) {
115  ASSERT((ranges_length & 1) == 1);
116  ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
117  if (containment == kLatticeUnknown) return containment;
118  bool inside = false;
119  int last = 0;
120  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
121  // Consider the range from last to ranges[i].
122  // We haven't got to the new range yet.
123  if (ranges[i] <= new_range.from()) continue;
124  // New range is wholly inside last-ranges[i]. Note that new_range.to() is
125  // inclusive, but the values in ranges are not.
126  if (last <= new_range.from() && new_range.to() < ranges[i]) {
127  return Combine(containment, inside ? kLatticeIn : kLatticeOut);
128  }
129  return kLatticeUnknown;
130  }
131  return containment;
132 }
133 
134 
135 // More makes code generation slower, less makes V8 benchmark score lower.
137 // In a 3-character pattern you can maximally step forwards 3 characters
138 // at a time, which is not always enough to pay for the extra logic.
140 
141 
142 // Identifies the sort of regexps where the regexp engine is faster
143 // than the code used for atom matches.
144 static bool HasFewDifferentCharacters(Handle<String> pattern) {
145  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
146  if (length <= kPatternTooShortForBoyerMoore) return false;
147  const int kMod = 128;
148  bool character_found[kMod];
149  int different = 0;
150  memset(&character_found[0], 0, sizeof(character_found));
151  for (int i = 0; i < length; i++) {
152  int ch = (pattern->Get(i) & (kMod - 1));
153  if (!character_found[ch]) {
154  character_found[ch] = true;
155  different++;
156  // We declare a regexp low-alphabet if it has at least 3 times as many
157  // characters as it has different characters.
158  if (different * 3 > length) return false;
159  }
160  }
161  return true;
162 }
163 
164 
165 // Generic RegExp methods. Dispatches to implementation specific methods.
166 
167 
169  Handle<String> pattern,
170  Handle<String> flag_str,
171  Zone* zone) {
172  ZoneScope zone_scope(zone, DELETE_ON_EXIT);
173  Isolate* isolate = re->GetIsolate();
174  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
175  CompilationCache* compilation_cache = isolate->compilation_cache();
176  Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
177  bool in_cache = !cached.is_null();
178  LOG(isolate, RegExpCompileEvent(re, in_cache));
179 
180  Handle<Object> result;
181  if (in_cache) {
182  re->set_data(*cached);
183  return re;
184  }
185  pattern = FlattenGetString(pattern);
186  PostponeInterruptsScope postpone(isolate);
187  RegExpCompileData parse_result;
188  FlatStringReader reader(isolate, pattern);
189  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
190  &parse_result, zone)) {
191  // Throw an exception if we fail to parse the pattern.
192  ThrowRegExpException(re,
193  pattern,
194  parse_result.error,
195  "malformed_regexp");
196  return Handle<Object>::null();
197  }
198 
199  bool has_been_compiled = false;
200 
201  if (parse_result.simple &&
202  !flags.is_ignore_case() &&
203  !HasFewDifferentCharacters(pattern)) {
204  // Parse-tree is a single atom that is equal to the pattern.
205  AtomCompile(re, pattern, flags, pattern);
206  has_been_compiled = true;
207  } else if (parse_result.tree->IsAtom() &&
208  !flags.is_ignore_case() &&
209  parse_result.capture_count == 0) {
210  RegExpAtom* atom = parse_result.tree->AsAtom();
211  Vector<const uc16> atom_pattern = atom->data();
212  Handle<String> atom_string =
213  isolate->factory()->NewStringFromTwoByte(atom_pattern);
214  if (!HasFewDifferentCharacters(atom_string)) {
215  AtomCompile(re, pattern, flags, atom_string);
216  has_been_compiled = true;
217  }
218  }
219  if (!has_been_compiled) {
220  IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
221  }
222  ASSERT(re->data()->IsFixedArray());
223  // Compilation succeeded so the data is set on the regexp
224  // and we can store it in the cache.
225  Handle<FixedArray> data(FixedArray::cast(re->data()));
226  compilation_cache->PutRegExp(pattern, flags, data);
227 
228  return re;
229 }
230 
231 
233  Handle<String> subject,
234  int index,
235  Handle<JSArray> last_match_info) {
236  switch (regexp->TypeTag()) {
237  case JSRegExp::ATOM:
238  return AtomExec(regexp, subject, index, last_match_info);
239  case JSRegExp::IRREGEXP: {
240  Handle<Object> result =
241  IrregexpExec(regexp, subject, index, last_match_info);
242  ASSERT(!result.is_null() ||
243  regexp->GetIsolate()->has_pending_exception());
244  return result;
245  }
246  default:
247  UNREACHABLE();
248  return Handle<Object>::null();
249  }
250 }
251 
252 
253 // RegExp Atom implementation: Simple string search using indexOf.
254 
255 
257  Handle<String> pattern,
258  JSRegExp::Flags flags,
259  Handle<String> match_pattern) {
260  re->GetIsolate()->factory()->SetRegExpAtomData(re,
262  pattern,
263  flags,
264  match_pattern);
265 }
266 
267 
268 static void SetAtomLastCapture(FixedArray* array,
269  String* subject,
270  int from,
271  int to) {
272  NoHandleAllocation no_handles;
274  RegExpImpl::SetLastSubject(array, subject);
275  RegExpImpl::SetLastInput(array, subject);
276  RegExpImpl::SetCapture(array, 0, from);
277  RegExpImpl::SetCapture(array, 1, to);
278 }
279 
280 
282  Handle<String> subject,
283  int index,
284  int32_t* output,
285  int output_size) {
286  Isolate* isolate = regexp->GetIsolate();
287 
288  ASSERT(0 <= index);
289  ASSERT(index <= subject->length());
290 
291  if (!subject->IsFlat()) FlattenString(subject);
292  AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
293 
294  String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
295  int needle_len = needle->length();
296  ASSERT(needle->IsFlat());
297  ASSERT_LT(0, needle_len);
298 
299  if (index + needle_len > subject->length()) {
300  return RegExpImpl::RE_FAILURE;
301  }
302 
303  for (int i = 0; i < output_size; i += 2) {
304  String::FlatContent needle_content = needle->GetFlatContent();
305  String::FlatContent subject_content = subject->GetFlatContent();
306  ASSERT(needle_content.IsFlat());
307  ASSERT(subject_content.IsFlat());
308  // dispatch on type of strings
309  index = (needle_content.IsAscii()
310  ? (subject_content.IsAscii()
311  ? SearchString(isolate,
312  subject_content.ToAsciiVector(),
313  needle_content.ToAsciiVector(),
314  index)
315  : SearchString(isolate,
316  subject_content.ToUC16Vector(),
317  needle_content.ToAsciiVector(),
318  index))
319  : (subject_content.IsAscii()
320  ? SearchString(isolate,
321  subject_content.ToAsciiVector(),
322  needle_content.ToUC16Vector(),
323  index)
324  : SearchString(isolate,
325  subject_content.ToUC16Vector(),
326  needle_content.ToUC16Vector(),
327  index)));
328  if (index == -1) {
329  return i / 2; // Return number of matches.
330  } else {
331  output[i] = index;
332  output[i+1] = index + needle_len;
333  index += needle_len;
334  }
335  }
336  return output_size / 2;
337 }
338 
339 
341  Handle<String> subject,
342  int index,
343  Handle<JSArray> last_match_info) {
344  Isolate* isolate = re->GetIsolate();
345 
346  static const int kNumRegisters = 2;
348  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
349 
350  int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
351 
352  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
353 
355  NoHandleAllocation no_handles;
356  FixedArray* array = FixedArray::cast(last_match_info->elements());
357  SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
358  return last_match_info;
359 }
360 
361 
362 // Irregexp implementation.
363 
364 // Ensures that the regexp object contains a compiled version of the
365 // source for either ASCII or non-ASCII strings.
366 // If the compiled version doesn't already exist, it is compiled
367 // from the source pattern.
368 // If compilation fails, an exception is thrown and this function
369 // returns false.
370 bool RegExpImpl::EnsureCompiledIrregexp(
371  Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
372  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
373 #ifdef V8_INTERPRETED_REGEXP
374  if (compiled_code->IsByteArray()) return true;
375 #else // V8_INTERPRETED_REGEXP (RegExp native code)
376  if (compiled_code->IsCode()) return true;
377 #endif
378  // We could potentially have marked this as flushable, but have kept
379  // a saved version if we did not flush it yet.
380  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
381  if (saved_code->IsCode()) {
382  // Reinstate the code in the original place.
383  re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
384  ASSERT(compiled_code->IsSmi());
385  return true;
386  }
387  return CompileIrregexp(re, sample_subject, is_ascii);
388 }
389 
390 
391 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
392  bool is_ascii,
393  Handle<String> error_message,
394  Isolate* isolate) {
395  Factory* factory = isolate->factory();
396  Handle<FixedArray> elements = factory->NewFixedArray(2);
397  elements->set(0, re->Pattern());
398  elements->set(1, *error_message);
399  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
400  Handle<Object> regexp_err =
401  factory->NewSyntaxError("malformed_regexp", array);
402  isolate->Throw(*regexp_err);
403  return false;
404 }
405 
406 
407 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
408  Handle<String> sample_subject,
409  bool is_ascii) {
410  // Compile the RegExp.
411  Isolate* isolate = re->GetIsolate();
412  ZoneScope zone_scope(isolate->runtime_zone(), DELETE_ON_EXIT);
413  PostponeInterruptsScope postpone(isolate);
414  // If we had a compilation error the last time this is saved at the
415  // saved code index.
416  Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
417  // When arriving here entry can only be a smi, either representing an
418  // uncompiled regexp, a previous compilation error, or code that has
419  // been flushed.
420  ASSERT(entry->IsSmi());
421  int entry_value = Smi::cast(entry)->value();
422  ASSERT(entry_value == JSRegExp::kUninitializedValue ||
423  entry_value == JSRegExp::kCompilationErrorValue ||
424  (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
425 
426  if (entry_value == JSRegExp::kCompilationErrorValue) {
427  // A previous compilation failed and threw an error which we store in
428  // the saved code index (we store the error message, not the actual
429  // error). Recreate the error object and throw it.
430  Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
431  ASSERT(error_string->IsString());
432  Handle<String> error_message(String::cast(error_string));
433  CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
434  return false;
435  }
436 
437  JSRegExp::Flags flags = re->GetFlags();
438 
439  Handle<String> pattern(re->Pattern());
440  if (!pattern->IsFlat()) FlattenString(pattern);
441  RegExpCompileData compile_data;
442  FlatStringReader reader(isolate, pattern);
443  Zone* zone = isolate->runtime_zone();
444  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
445  &compile_data,
446  zone)) {
447  // Throw an exception if we fail to parse the pattern.
448  // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
449  ThrowRegExpException(re,
450  pattern,
451  compile_data.error,
452  "malformed_regexp");
453  return false;
454  }
455  RegExpEngine::CompilationResult result =
456  RegExpEngine::Compile(&compile_data,
457  flags.is_ignore_case(),
458  flags.is_global(),
459  flags.is_multiline(),
460  pattern,
461  sample_subject,
462  is_ascii,
463  zone);
464  if (result.error_message != NULL) {
465  // Unable to compile regexp.
466  Handle<String> error_message =
467  isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
468  CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
469  return false;
470  }
471 
472  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
473  data->set(JSRegExp::code_index(is_ascii), result.code);
474  int register_max = IrregexpMaxRegisterCount(*data);
475  if (result.num_registers > register_max) {
476  SetIrregexpMaxRegisterCount(*data, result.num_registers);
477  }
478 
479  return true;
480 }
481 
482 
484  return Smi::cast(
486 }
487 
488 
491 }
492 
493 
495  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
496 }
497 
498 
501 }
502 
503 
505  return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
506 }
507 
508 
510  return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
511 }
512 
513 
515  Handle<String> pattern,
516  JSRegExp::Flags flags,
517  int capture_count) {
518  // Initialize compiled code entries to null.
519  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
521  pattern,
522  flags,
523  capture_count);
524 }
525 
526 
528  Handle<String> subject) {
529  if (!subject->IsFlat()) FlattenString(subject);
530 
531  // Check the asciiness of the underlying storage.
532  bool is_ascii = subject->IsAsciiRepresentationUnderneath();
533  if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
534 
535 #ifdef V8_INTERPRETED_REGEXP
536  // Byte-code regexp needs space allocated for all its registers.
537  // The result captures are copied to the start of the registers array
538  // if the match succeeds. This way those registers are not clobbered
539  // when we set the last match info from last successful match.
540  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
541  (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
542 #else // V8_INTERPRETED_REGEXP
543  // Native regexp only needs room to output captures. Registers are handled
544  // internally.
545  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
546 #endif // V8_INTERPRETED_REGEXP
547 }
548 
549 
551  Handle<String> subject,
552  int index,
553  int32_t* output,
554  int output_size) {
555  Isolate* isolate = regexp->GetIsolate();
556 
557  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
558 
559  ASSERT(index >= 0);
560  ASSERT(index <= subject->length());
561  ASSERT(subject->IsFlat());
562 
563  bool is_ascii = subject->IsAsciiRepresentationUnderneath();
564 
565 #ifndef V8_INTERPRETED_REGEXP
566  ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
567  do {
568  EnsureCompiledIrregexp(regexp, subject, is_ascii);
569  Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
570  // The stack is used to allocate registers for the compiled regexp code.
571  // This means that in case of failure, the output registers array is left
572  // untouched and contains the capture results from the previous successful
573  // match. We can use that to set the last match info lazily.
576  subject,
577  output,
578  output_size,
579  index,
580  isolate);
583  isolate->has_pending_exception());
585  static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
587  static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
589  == RE_EXCEPTION);
590  return static_cast<IrregexpResult>(res);
591  }
592  // If result is RETRY, the string has changed representation, and we
593  // must restart from scratch.
594  // In this case, it means we must make sure we are prepared to handle
595  // the, potentially, different subject (the string can switch between
596  // being internal and external, and even between being ASCII and UC16,
597  // but the characters are always the same).
598  IrregexpPrepare(regexp, subject);
599  is_ascii = subject->IsAsciiRepresentationUnderneath();
600  } while (true);
601  UNREACHABLE();
602  return RE_EXCEPTION;
603 #else // V8_INTERPRETED_REGEXP
604 
605  ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp));
606  // We must have done EnsureCompiledIrregexp, so we can get the number of
607  // registers.
608  int number_of_capture_registers =
609  (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
610  int32_t* raw_output = &output[number_of_capture_registers];
611  // We do not touch the actual capture result registers until we know there
612  // has been a match so that we can use those capture results to set the
613  // last match info.
614  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
615  raw_output[i] = -1;
616  }
617  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
618 
620  byte_codes,
621  subject,
622  raw_output,
623  index);
624  if (result == RE_SUCCESS) {
625  // Copy capture results to the start of the registers array.
626  memcpy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
627  }
628  if (result == RE_EXCEPTION) {
629  ASSERT(!isolate->has_pending_exception());
630  isolate->StackOverflow();
631  }
632  return result;
633 #endif // V8_INTERPRETED_REGEXP
634 }
635 
636 
638  Handle<String> subject,
639  int previous_index,
640  Handle<JSArray> last_match_info) {
641  Isolate* isolate = regexp->GetIsolate();
642  ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
643 
644  // Prepare space for the return values.
645 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
646  if (FLAG_trace_regexp_bytecodes) {
647  String* pattern = regexp->Pattern();
648  PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
649  PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
650  }
651 #endif
652  int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
653  if (required_registers < 0) {
654  // Compiling failed with an exception.
655  ASSERT(isolate->has_pending_exception());
656  return Handle<Object>::null();
657  }
658 
659  int32_t* output_registers = NULL;
660  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
661  output_registers = NewArray<int32_t>(required_registers);
662  }
663  SmartArrayPointer<int32_t> auto_release(output_registers);
664  if (output_registers == NULL) {
665  output_registers = isolate->jsregexp_static_offsets_vector();
666  }
667 
668  int res = RegExpImpl::IrregexpExecRaw(
669  regexp, subject, previous_index, output_registers, required_registers);
670  if (res == RE_SUCCESS) {
671  int capture_count =
673  return SetLastMatchInfo(
674  last_match_info, subject, capture_count, output_registers);
675  }
676  if (res == RE_EXCEPTION) {
677  ASSERT(isolate->has_pending_exception());
678  return Handle<Object>::null();
679  }
680  ASSERT(res == RE_FAILURE);
681  return isolate->factory()->null_value();
682 }
683 
684 
686  Handle<String> subject,
687  int capture_count,
688  int32_t* match) {
689  int capture_register_count = (capture_count + 1) * 2;
690  last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
691  AssertNoAllocation no_gc;
692  FixedArray* array = FixedArray::cast(last_match_info->elements());
693  if (match != NULL) {
694  for (int i = 0; i < capture_register_count; i += 2) {
695  SetCapture(array, i, match[i]);
696  SetCapture(array, i + 1, match[i + 1]);
697  }
698  }
699  SetLastCaptureCount(array, capture_register_count);
700  SetLastSubject(array, *subject);
701  SetLastInput(array, *subject);
702  return last_match_info;
703 }
704 
705 
707  Handle<String> subject,
708  bool is_global,
709  Isolate* isolate)
710  : register_array_(NULL),
711  register_array_size_(0),
712  regexp_(regexp),
713  subject_(subject) {
714 #ifdef V8_INTERPRETED_REGEXP
715  bool interpreted = true;
716 #else
717  bool interpreted = false;
718 #endif // V8_INTERPRETED_REGEXP
719 
720  if (regexp_->TypeTag() == JSRegExp::ATOM) {
721  static const int kAtomRegistersPerMatch = 2;
722  registers_per_match_ = kAtomRegistersPerMatch;
723  // There is no distinction between interpreted and native for atom regexps.
724  interpreted = false;
725  } else {
726  registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
727  if (registers_per_match_ < 0) {
728  num_matches_ = -1; // Signal exception.
729  return;
730  }
731  }
732 
733  if (is_global && !interpreted) {
734  register_array_size_ =
735  Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
736  max_matches_ = register_array_size_ / registers_per_match_;
737  } else {
738  // Global loop in interpreted regexp is not implemented. We choose
739  // the size of the offsets vector so that it can only store one match.
740  register_array_size_ = registers_per_match_;
741  max_matches_ = 1;
742  }
743 
744  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
745  register_array_ = NewArray<int32_t>(register_array_size_);
746  } else {
747  register_array_ = isolate->jsregexp_static_offsets_vector();
748  }
749 
750  // Set state so that fetching the results the first time triggers a call
751  // to the compiled regexp.
752  current_match_index_ = max_matches_ - 1;
753  num_matches_ = max_matches_;
754  ASSERT(registers_per_match_ >= 2); // Each match has at least one capture.
755  ASSERT_GE(register_array_size_, registers_per_match_);
756  int32_t* last_match =
757  &register_array_[current_match_index_ * registers_per_match_];
758  last_match[0] = -1;
759  last_match[1] = 0;
760 }
761 
762 
764  // Deallocate the register array if we allocated it in the constructor
765  // (as opposed to using the existing jsregexp_static_offsets_vector).
766  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
767  DeleteArray(register_array_);
768  }
769 }
770 
771 
773  current_match_index_++;
774  if (current_match_index_ >= num_matches_) {
775  // Current batch of results exhausted.
776  // Fail if last batch was not even fully filled.
777  if (num_matches_ < max_matches_) {
778  num_matches_ = 0; // Signal failed match.
779  return NULL;
780  }
781 
782  int32_t* last_match =
783  &register_array_[(current_match_index_ - 1) * registers_per_match_];
784  int last_end_index = last_match[1];
785 
786  if (regexp_->TypeTag() == JSRegExp::ATOM) {
787  num_matches_ = RegExpImpl::AtomExecRaw(regexp_,
788  subject_,
789  last_end_index,
790  register_array_,
791  register_array_size_);
792  } else {
793  int last_start_index = last_match[0];
794  if (last_start_index == last_end_index) last_end_index++;
795  if (last_end_index > subject_->length()) {
796  num_matches_ = 0; // Signal failed match.
797  return NULL;
798  }
799  num_matches_ = RegExpImpl::IrregexpExecRaw(regexp_,
800  subject_,
801  last_end_index,
802  register_array_,
803  register_array_size_);
804  }
805 
806  if (num_matches_ <= 0) return NULL;
807  current_match_index_ = 0;
808  return register_array_;
809  } else {
810  return &register_array_[current_match_index_ * registers_per_match_];
811  }
812 }
813 
814 
816  int index = current_match_index_ * registers_per_match_;
817  if (num_matches_ == 0) {
818  // After a failed match we shift back by one result.
819  index -= registers_per_match_;
820  }
821  return &register_array_[index];
822 }
823 
824 
825 // -------------------------------------------------------------------
826 // Implementation of the Irregexp regular expression engine.
827 //
828 // The Irregexp regular expression engine is intended to be a complete
829 // implementation of ECMAScript regular expressions. It generates either
830 // bytecodes or native code.
831 
832 // The Irregexp regexp engine is structured in three steps.
833 // 1) The parser generates an abstract syntax tree. See ast.cc.
834 // 2) From the AST a node network is created. The nodes are all
835 // subclasses of RegExpNode. The nodes represent states when
836 // executing a regular expression. Several optimizations are
837 // performed on the node network.
838 // 3) From the nodes we generate either byte codes or native code
839 // that can actually execute the regular expression (perform
840 // the search). The code generation step is described in more
841 // detail below.
842 
843 // Code generation.
844 //
845 // The nodes are divided into four main categories.
846 // * Choice nodes
847 // These represent places where the regular expression can
848 // match in more than one way. For example on entry to an
849 // alternation (foo|bar) or a repetition (*, +, ? or {}).
850 // * Action nodes
851 // These represent places where some action should be
852 // performed. Examples include recording the current position
853 // in the input string to a register (in order to implement
854 // captures) or other actions on register for example in order
855 // to implement the counters needed for {} repetitions.
856 // * Matching nodes
857 // These attempt to match some element part of the input string.
858 // Examples of elements include character classes, plain strings
859 // or back references.
860 // * End nodes
861 // These are used to implement the actions required on finding
862 // a successful match or failing to find a match.
863 //
864 // The code generated (whether as byte codes or native code) maintains
865 // some state as it runs. This consists of the following elements:
866 //
867 // * The capture registers. Used for string captures.
868 // * Other registers. Used for counters etc.
869 // * The current position.
870 // * The stack of backtracking information. Used when a matching node
871 // fails to find a match and needs to try an alternative.
872 //
873 // Conceptual regular expression execution model:
874 //
875 // There is a simple conceptual model of regular expression execution
876 // which will be presented first. The actual code generated is a more
877 // efficient simulation of the simple conceptual model:
878 //
879 // * Choice nodes are implemented as follows:
880 // For each choice except the last {
881 // push current position
882 // push backtrack code location
883 // <generate code to test for choice>
884 // backtrack code location:
885 // pop current position
886 // }
887 // <generate code to test for last choice>
888 //
889 // * Actions nodes are generated as follows
890 // <push affected registers on backtrack stack>
891 // <generate code to perform action>
892 // push backtrack code location
893 // <generate code to test for following nodes>
894 // backtrack code location:
895 // <pop affected registers to restore their state>
896 // <pop backtrack location from stack and go to it>
897 //
898 // * Matching nodes are generated as follows:
899 // if input string matches at current position
900 // update current position
901 // <generate code to test for following nodes>
902 // else
903 // <pop backtrack location from stack and go to it>
904 //
905 // Thus it can be seen that the current position is saved and restored
906 // by the choice nodes, whereas the registers are saved and restored by
907 // by the action nodes that manipulate them.
908 //
909 // The other interesting aspect of this model is that nodes are generated
910 // at the point where they are needed by a recursive call to Emit(). If
911 // the node has already been code generated then the Emit() call will
912 // generate a jump to the previously generated code instead. In order to
913 // limit recursion it is possible for the Emit() function to put the node
914 // on a work list for later generation and instead generate a jump. The
915 // destination of the jump is resolved later when the code is generated.
916 //
917 // Actual regular expression code generation.
918 //
919 // Code generation is actually more complicated than the above. In order
920 // to improve the efficiency of the generated code some optimizations are
921 // performed
922 //
923 // * Choice nodes have 1-character lookahead.
924 // A choice node looks at the following character and eliminates some of
925 // the choices immediately based on that character. This is not yet
926 // implemented.
927 // * Simple greedy loops store reduced backtracking information.
928 // A quantifier like /.*foo/m will greedily match the whole input. It will
929 // then need to backtrack to a point where it can match "foo". The naive
930 // implementation of this would push each character position onto the
931 // backtracking stack, then pop them off one by one. This would use space
932 // proportional to the length of the input string. However since the "."
933 // can only match in one way and always has a constant length (in this case
934 // of 1) it suffices to store the current position on the top of the stack
935 // once. Matching now becomes merely incrementing the current position and
936 // backtracking becomes decrementing the current position and checking the
937 // result against the stored current position. This is faster and saves
938 // space.
939 // * The current state is virtualized.
940 // This is used to defer expensive operations until it is clear that they
941 // are needed and to generate code for a node more than once, allowing
942 // specialized an efficient versions of the code to be created. This is
943 // explained in the section below.
944 //
945 // Execution state virtualization.
946 //
947 // Instead of emitting code, nodes that manipulate the state can record their
948 // manipulation in an object called the Trace. The Trace object can record a
949 // current position offset, an optional backtrack code location on the top of
950 // the virtualized backtrack stack and some register changes. When a node is
951 // to be emitted it can flush the Trace or update it. Flushing the Trace
952 // will emit code to bring the actual state into line with the virtual state.
953 // Avoiding flushing the state can postpone some work (e.g. updates of capture
954 // registers). Postponing work can save time when executing the regular
955 // expression since it may be found that the work never has to be done as a
956 // failure to match can occur. In addition it is much faster to jump to a
957 // known backtrack code location than it is to pop an unknown backtrack
958 // location from the stack and jump there.
959 //
960 // The virtual state found in the Trace affects code generation. For example
961 // the virtual state contains the difference between the actual current
962 // position and the virtual current position, and matching code needs to use
963 // this offset to attempt a match in the correct location of the input
964 // string. Therefore code generated for a non-trivial trace is specialized
965 // to that trace. The code generator therefore has the ability to generate
966 // code for each node several times. In order to limit the size of the
967 // generated code there is an arbitrary limit on how many specialized sets of
968 // code may be generated for a given node. If the limit is reached, the
969 // trace is flushed and a generic version of the code for a node is emitted.
970 // This is subsequently used for that node. The code emitted for non-generic
971 // trace is not recorded in the node and so it cannot currently be reused in
972 // the event that code generation is requested for an identical trace.
973 
974 
976  UNREACHABLE();
977 }
978 
979 
981  text->AddElement(TextElement::Atom(this), zone);
982 }
983 
984 
986  text->AddElement(TextElement::CharClass(this), zone);
987 }
988 
989 
991  for (int i = 0; i < elements()->length(); i++)
992  text->AddElement(elements()->at(i), zone);
993 }
994 
995 
997  TextElement result = TextElement(ATOM);
998  result.data.u_atom = atom;
999  return result;
1000 }
1001 
1002 
1004  RegExpCharacterClass* char_class) {
1005  TextElement result = TextElement(CHAR_CLASS);
1006  result.data.u_char_class = char_class;
1007  return result;
1008 }
1009 
1010 
1012  if (type == ATOM) {
1013  return data.u_atom->length();
1014  } else {
1015  ASSERT(type == CHAR_CLASS);
1016  return 1;
1017  }
1018 }
1019 
1020 
1022  if (table_ == NULL) {
1023  table_ = new(zone()) DispatchTable(zone());
1024  DispatchTableConstructor cons(table_, ignore_case, zone());
1025  cons.BuildTable(this);
1026  }
1027  return table_;
1028 }
1029 
1030 
1032  public:
1033  FrequencyCollator() : total_samples_(0) {
1034  for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
1035  frequencies_[i] = CharacterFrequency(i);
1036  }
1037  }
1038 
1039  void CountCharacter(int character) {
1040  int index = (character & RegExpMacroAssembler::kTableMask);
1041  frequencies_[index].Increment();
1042  total_samples_++;
1043  }
1044 
1045  // Does not measure in percent, but rather per-128 (the table size from the
1046  // regexp macro assembler).
1047  int Frequency(int in_character) {
1048  ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
1049  if (total_samples_ < 1) return 1; // Division by zero.
1050  int freq_in_per128 =
1051  (frequencies_[in_character].counter() * 128) / total_samples_;
1052  return freq_in_per128;
1053  }
1054 
1055  private:
1056  class CharacterFrequency {
1057  public:
1058  CharacterFrequency() : counter_(0), character_(-1) { }
1059  explicit CharacterFrequency(int character)
1060  : counter_(0), character_(character) { }
1061 
1062  void Increment() { counter_++; }
1063  int counter() { return counter_; }
1064  int character() { return character_; }
1065 
1066  private:
1067  int counter_;
1068  int character_;
1069  };
1070 
1071 
1072  private:
1073  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
1074  int total_samples_;
1075 };
1076 
1077 
1079  public:
1080  RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
1081  Zone* zone);
1082 
1084  if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1085  reg_exp_too_big_ = true;
1086  return next_register_;
1087  }
1088  return next_register_++;
1089  }
1090 
1092  RegExpNode* start,
1093  int capture_count,
1094  Handle<String> pattern);
1095 
1096  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1097 
1098  static const int kImplementationOffset = 0;
1099  static const int kNumberOfRegistersOffset = 0;
1100  static const int kCodeOffset = 1;
1101 
1102  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1103  EndNode* accept() { return accept_; }
1104 
1105  static const int kMaxRecursion = 100;
1106  inline int recursion_depth() { return recursion_depth_; }
1107  inline void IncrementRecursionDepth() { recursion_depth_++; }
1108  inline void DecrementRecursionDepth() { recursion_depth_--; }
1109 
1110  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1111 
1112  inline bool ignore_case() { return ignore_case_; }
1113  inline bool ascii() { return ascii_; }
1114  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1115 
1116  int current_expansion_factor() { return current_expansion_factor_; }
1118  current_expansion_factor_ = value;
1119  }
1120 
1121  Zone* zone() const { return zone_; }
1122 
1123  static const int kNoRegister = -1;
1124 
1125  private:
1126  EndNode* accept_;
1127  int next_register_;
1128  List<RegExpNode*>* work_list_;
1129  int recursion_depth_;
1130  RegExpMacroAssembler* macro_assembler_;
1131  bool ignore_case_;
1132  bool ascii_;
1133  bool reg_exp_too_big_;
1134  int current_expansion_factor_;
1135  FrequencyCollator frequency_collator_;
1136  Zone* zone_;
1137 };
1138 
1139 
1141  public:
1142  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1143  compiler->IncrementRecursionDepth();
1144  }
1145  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1146  private:
1147  RegExpCompiler* compiler_;
1148 };
1149 
1150 
1151 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
1152  return RegExpEngine::CompilationResult("RegExp too big");
1153 }
1154 
1155 
1156 // Attempts to compile the regexp using an Irregexp code generator. Returns
1157 // a fixed array or a null handle depending on whether it succeeded.
1158 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
1159  Zone* zone)
1160  : next_register_(2 * (capture_count + 1)),
1161  work_list_(NULL),
1162  recursion_depth_(0),
1163  ignore_case_(ignore_case),
1164  ascii_(ascii),
1165  reg_exp_too_big_(false),
1166  current_expansion_factor_(1),
1167  frequency_collator_(),
1168  zone_(zone) {
1169  accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1170  ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1171 }
1172 
1173 
1175  RegExpMacroAssembler* macro_assembler,
1176  RegExpNode* start,
1177  int capture_count,
1178  Handle<String> pattern) {
1179  Heap* heap = pattern->GetHeap();
1180 
1181  bool use_slow_safe_regexp_compiler = false;
1182  if (heap->total_regexp_code_generated() >
1184  heap->isolate()->memory_allocator()->SizeExecutable() >
1186  use_slow_safe_regexp_compiler = true;
1187  }
1188 
1189  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1190 
1191 #ifdef DEBUG
1192  if (FLAG_trace_regexp_assembler)
1193  macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1194  else
1195 #endif
1196  macro_assembler_ = macro_assembler;
1197 
1198  List <RegExpNode*> work_list(0);
1199  work_list_ = &work_list;
1200  Label fail;
1201  macro_assembler_->PushBacktrack(&fail);
1202  Trace new_trace;
1203  start->Emit(this, &new_trace);
1204  macro_assembler_->Bind(&fail);
1205  macro_assembler_->Fail();
1206  while (!work_list.is_empty()) {
1207  work_list.RemoveLast()->Emit(this, &new_trace);
1208  }
1209  if (reg_exp_too_big_) return IrregexpRegExpTooBig();
1210 
1211  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1212  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1213  work_list_ = NULL;
1214 #ifdef DEBUG
1215  if (FLAG_print_code) {
1216  Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
1217  }
1218  if (FLAG_trace_regexp_assembler) {
1219  delete macro_assembler_;
1220  }
1221 #endif
1222  return RegExpEngine::CompilationResult(*code, next_register_);
1223 }
1224 
1225 
1227  if (type() == ActionNode::CLEAR_CAPTURES) {
1228  Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1229  return range.Contains(that);
1230  } else {
1231  return reg() == that;
1232  }
1233 }
1234 
1235 
1236 bool Trace::mentions_reg(int reg) {
1237  for (DeferredAction* action = actions_;
1238  action != NULL;
1239  action = action->next()) {
1240  if (action->Mentions(reg))
1241  return true;
1242  }
1243  return false;
1244 }
1245 
1246 
1248  ASSERT_EQ(0, *cp_offset);
1249  for (DeferredAction* action = actions_;
1250  action != NULL;
1251  action = action->next()) {
1252  if (action->Mentions(reg)) {
1253  if (action->type() == ActionNode::STORE_POSITION) {
1254  *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1255  return true;
1256  } else {
1257  return false;
1258  }
1259  }
1260  }
1261  return false;
1262 }
1263 
1264 
1265 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1266  Zone* zone) {
1267  int max_register = RegExpCompiler::kNoRegister;
1268  for (DeferredAction* action = actions_;
1269  action != NULL;
1270  action = action->next()) {
1271  if (action->type() == ActionNode::CLEAR_CAPTURES) {
1272  Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1273  for (int i = range.from(); i <= range.to(); i++)
1274  affected_registers->Set(i, zone);
1275  if (range.to() > max_register) max_register = range.to();
1276  } else {
1277  affected_registers->Set(action->reg(), zone);
1278  if (action->reg() > max_register) max_register = action->reg();
1279  }
1280  }
1281  return max_register;
1282 }
1283 
1284 
1285 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1286  int max_register,
1287  OutSet& registers_to_pop,
1288  OutSet& registers_to_clear) {
1289  for (int reg = max_register; reg >= 0; reg--) {
1290  if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1291  else if (registers_to_clear.Get(reg)) {
1292  int clear_to = reg;
1293  while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1294  reg--;
1295  }
1296  assembler->ClearRegisters(reg, clear_to);
1297  }
1298  }
1299 }
1300 
1301 
1302 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1303  int max_register,
1304  OutSet& affected_registers,
1305  OutSet* registers_to_pop,
1306  OutSet* registers_to_clear,
1307  Zone* zone) {
1308  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1309  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1310 
1311  // Count pushes performed to force a stack limit check occasionally.
1312  int pushes = 0;
1313 
1314  for (int reg = 0; reg <= max_register; reg++) {
1315  if (!affected_registers.Get(reg)) {
1316  continue;
1317  }
1318 
1319  // The chronologically first deferred action in the trace
1320  // is used to infer the action needed to restore a register
1321  // to its previous state (or not, if it's safe to ignore it).
1322  enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1323  DeferredActionUndoType undo_action = IGNORE;
1324 
1325  int value = 0;
1326  bool absolute = false;
1327  bool clear = false;
1328  int store_position = -1;
1329  // This is a little tricky because we are scanning the actions in reverse
1330  // historical order (newest first).
1331  for (DeferredAction* action = actions_;
1332  action != NULL;
1333  action = action->next()) {
1334  if (action->Mentions(reg)) {
1335  switch (action->type()) {
1336  case ActionNode::SET_REGISTER: {
1337  Trace::DeferredSetRegister* psr =
1338  static_cast<Trace::DeferredSetRegister*>(action);
1339  if (!absolute) {
1340  value += psr->value();
1341  absolute = true;
1342  }
1343  // SET_REGISTER is currently only used for newly introduced loop
1344  // counters. They can have a significant previous value if they
1345  // occour in a loop. TODO(lrn): Propagate this information, so
1346  // we can set undo_action to IGNORE if we know there is no value to
1347  // restore.
1348  undo_action = RESTORE;
1349  ASSERT_EQ(store_position, -1);
1350  ASSERT(!clear);
1351  break;
1352  }
1354  if (!absolute) {
1355  value++;
1356  }
1357  ASSERT_EQ(store_position, -1);
1358  ASSERT(!clear);
1359  undo_action = RESTORE;
1360  break;
1362  Trace::DeferredCapture* pc =
1363  static_cast<Trace::DeferredCapture*>(action);
1364  if (!clear && store_position == -1) {
1365  store_position = pc->cp_offset();
1366  }
1367 
1368  // For captures we know that stores and clears alternate.
1369  // Other register, are never cleared, and if the occur
1370  // inside a loop, they might be assigned more than once.
1371  if (reg <= 1) {
1372  // Registers zero and one, aka "capture zero", is
1373  // always set correctly if we succeed. There is no
1374  // need to undo a setting on backtrack, because we
1375  // will set it again or fail.
1376  undo_action = IGNORE;
1377  } else {
1378  undo_action = pc->is_capture() ? CLEAR : RESTORE;
1379  }
1380  ASSERT(!absolute);
1381  ASSERT_EQ(value, 0);
1382  break;
1383  }
1385  // Since we're scanning in reverse order, if we've already
1386  // set the position we have to ignore historically earlier
1387  // clearing operations.
1388  if (store_position == -1) {
1389  clear = true;
1390  }
1391  undo_action = RESTORE;
1392  ASSERT(!absolute);
1393  ASSERT_EQ(value, 0);
1394  break;
1395  }
1396  default:
1397  UNREACHABLE();
1398  break;
1399  }
1400  }
1401  }
1402  // Prepare for the undo-action (e.g., push if it's going to be popped).
1403  if (undo_action == RESTORE) {
1404  pushes++;
1407  if (pushes == push_limit) {
1409  pushes = 0;
1410  }
1411 
1412  assembler->PushRegister(reg, stack_check);
1413  registers_to_pop->Set(reg, zone);
1414  } else if (undo_action == CLEAR) {
1415  registers_to_clear->Set(reg, zone);
1416  }
1417  // Perform the chronologically last action (or accumulated increment)
1418  // for the register.
1419  if (store_position != -1) {
1420  assembler->WriteCurrentPositionToRegister(reg, store_position);
1421  } else if (clear) {
1422  assembler->ClearRegisters(reg, reg);
1423  } else if (absolute) {
1424  assembler->SetRegister(reg, value);
1425  } else if (value != 0) {
1426  assembler->AdvanceRegister(reg, value);
1427  }
1428  }
1429 }
1430 
1431 
1432 // This is called as we come into a loop choice node and some other tricky
1433 // nodes. It normalizes the state of the code generator to ensure we can
1434 // generate generic code.
1435 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1436  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1437 
1438  ASSERT(!is_trivial());
1439 
1440  if (actions_ == NULL && backtrack() == NULL) {
1441  // Here we just have some deferred cp advances to fix and we are back to
1442  // a normal situation. We may also have to forget some information gained
1443  // through a quick check that was already performed.
1444  if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1445  // Create a new trivial state and generate the node with that.
1446  Trace new_state;
1447  successor->Emit(compiler, &new_state);
1448  return;
1449  }
1450 
1451  // Generate deferred actions here along with code to undo them again.
1452  OutSet affected_registers;
1453 
1454  if (backtrack() != NULL) {
1455  // Here we have a concrete backtrack location. These are set up by choice
1456  // nodes and so they indicate that we have a deferred save of the current
1457  // position which we may need to emit here.
1458  assembler->PushCurrentPosition();
1459  }
1460 
1461  int max_register = FindAffectedRegisters(&affected_registers,
1462  compiler->zone());
1463  OutSet registers_to_pop;
1464  OutSet registers_to_clear;
1465  PerformDeferredActions(assembler,
1466  max_register,
1467  affected_registers,
1468  &registers_to_pop,
1469  &registers_to_clear,
1470  compiler->zone());
1471  if (cp_offset_ != 0) {
1472  assembler->AdvanceCurrentPosition(cp_offset_);
1473  }
1474 
1475  // Create a new trivial state and generate the node with that.
1476  Label undo;
1477  assembler->PushBacktrack(&undo);
1478  Trace new_state;
1479  successor->Emit(compiler, &new_state);
1480 
1481  // On backtrack we need to restore state.
1482  assembler->Bind(&undo);
1483  RestoreAffectedRegisters(assembler,
1484  max_register,
1485  registers_to_pop,
1486  registers_to_clear);
1487  if (backtrack() == NULL) {
1488  assembler->Backtrack();
1489  } else {
1490  assembler->PopCurrentPosition();
1491  assembler->GoTo(backtrack());
1492  }
1493 }
1494 
1495 
1497  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1498 
1499  // Omit flushing the trace. We discard the entire stack frame anyway.
1500 
1501  if (!label()->is_bound()) {
1502  // We are completely independent of the trace, since we ignore it,
1503  // so this code can be used as the generic version.
1504  assembler->Bind(label());
1505  }
1506 
1507  // Throw away everything on the backtrack stack since the start
1508  // of the negative submatch and restore the character position.
1509  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1510  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1511  if (clear_capture_count_ > 0) {
1512  // Clear any captures that might have been performed during the success
1513  // of the body of the negative look-ahead.
1514  int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1515  assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1516  }
1517  // Now that we have unwound the stack we find at the top of the stack the
1518  // backtrack that the BeginSubmatch node got.
1519  assembler->Backtrack();
1520 }
1521 
1522 
1523 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1524  if (!trace->is_trivial()) {
1525  trace->Flush(compiler, this);
1526  return;
1527  }
1528  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1529  if (!label()->is_bound()) {
1530  assembler->Bind(label());
1531  }
1532  switch (action_) {
1533  case ACCEPT:
1534  assembler->Succeed();
1535  return;
1536  case BACKTRACK:
1537  assembler->GoTo(trace->backtrack());
1538  return;
1539  case NEGATIVE_SUBMATCH_SUCCESS:
1540  // This case is handled in a different virtual method.
1541  UNREACHABLE();
1542  }
1543  UNIMPLEMENTED();
1544 }
1545 
1546 
1548  if (guards_ == NULL)
1549  guards_ = new(zone) ZoneList<Guard*>(1, zone);
1550  guards_->Add(guard, zone);
1551 }
1552 
1553 
1555  int val,
1556  RegExpNode* on_success) {
1557  ActionNode* result =
1558  new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1559  result->data_.u_store_register.reg = reg;
1560  result->data_.u_store_register.value = val;
1561  return result;
1562 }
1563 
1564 
1566  ActionNode* result =
1567  new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1568  result->data_.u_increment_register.reg = reg;
1569  return result;
1570 }
1571 
1572 
1574  bool is_capture,
1575  RegExpNode* on_success) {
1576  ActionNode* result =
1577  new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1578  result->data_.u_position_register.reg = reg;
1579  result->data_.u_position_register.is_capture = is_capture;
1580  return result;
1581 }
1582 
1583 
1585  RegExpNode* on_success) {
1586  ActionNode* result =
1587  new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1588  result->data_.u_clear_captures.range_from = range.from();
1589  result->data_.u_clear_captures.range_to = range.to();
1590  return result;
1591 }
1592 
1593 
1595  int position_reg,
1596  RegExpNode* on_success) {
1597  ActionNode* result =
1598  new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1599  result->data_.u_submatch.stack_pointer_register = stack_reg;
1600  result->data_.u_submatch.current_position_register = position_reg;
1601  return result;
1602 }
1603 
1604 
1606  int position_reg,
1607  int clear_register_count,
1608  int clear_register_from,
1609  RegExpNode* on_success) {
1610  ActionNode* result =
1611  new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1612  result->data_.u_submatch.stack_pointer_register = stack_reg;
1613  result->data_.u_submatch.current_position_register = position_reg;
1614  result->data_.u_submatch.clear_register_count = clear_register_count;
1615  result->data_.u_submatch.clear_register_from = clear_register_from;
1616  return result;
1617 }
1618 
1619 
1621  int repetition_register,
1622  int repetition_limit,
1623  RegExpNode* on_success) {
1624  ActionNode* result =
1625  new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1626  result->data_.u_empty_match_check.start_register = start_register;
1627  result->data_.u_empty_match_check.repetition_register = repetition_register;
1628  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1629  return result;
1630 }
1631 
1632 
1633 #define DEFINE_ACCEPT(Type) \
1634  void Type##Node::Accept(NodeVisitor* visitor) { \
1635  visitor->Visit##Type(this); \
1636  }
1638 #undef DEFINE_ACCEPT
1639 
1640 
1642  visitor->VisitLoopChoice(this);
1643 }
1644 
1645 
1646 // -------------------------------------------------------------------
1647 // Emit code.
1648 
1649 
1650 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1651  Guard* guard,
1652  Trace* trace) {
1653  switch (guard->op()) {
1654  case Guard::LT:
1655  ASSERT(!trace->mentions_reg(guard->reg()));
1656  macro_assembler->IfRegisterGE(guard->reg(),
1657  guard->value(),
1658  trace->backtrack());
1659  break;
1660  case Guard::GEQ:
1661  ASSERT(!trace->mentions_reg(guard->reg()));
1662  macro_assembler->IfRegisterLT(guard->reg(),
1663  guard->value(),
1664  trace->backtrack());
1665  break;
1666  }
1667 }
1668 
1669 
1670 // Returns the number of characters in the equivalence class, omitting those
1671 // that cannot occur in the source string because it is ASCII.
1672 static int GetCaseIndependentLetters(Isolate* isolate,
1673  uc16 character,
1674  bool ascii_subject,
1675  unibrow::uchar* letters) {
1676  int length =
1677  isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1678  // Unibrow returns 0 or 1 for characters where case independence is
1679  // trivial.
1680  if (length == 0) {
1681  letters[0] = character;
1682  length = 1;
1683  }
1684  if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1685  return length;
1686  }
1687  // The standard requires that non-ASCII characters cannot have ASCII
1688  // character codes in their equivalence class.
1689  return 0;
1690 }
1691 
1692 
1693 static inline bool EmitSimpleCharacter(Isolate* isolate,
1694  RegExpCompiler* compiler,
1695  uc16 c,
1696  Label* on_failure,
1697  int cp_offset,
1698  bool check,
1699  bool preloaded) {
1700  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1701  bool bound_checked = false;
1702  if (!preloaded) {
1703  assembler->LoadCurrentCharacter(
1704  cp_offset,
1705  on_failure,
1706  check);
1707  bound_checked = true;
1708  }
1709  assembler->CheckNotCharacter(c, on_failure);
1710  return bound_checked;
1711 }
1712 
1713 
1714 // Only emits non-letters (things that don't have case). Only used for case
1715 // independent matches.
1716 static inline bool EmitAtomNonLetter(Isolate* isolate,
1717  RegExpCompiler* compiler,
1718  uc16 c,
1719  Label* on_failure,
1720  int cp_offset,
1721  bool check,
1722  bool preloaded) {
1723  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1724  bool ascii = compiler->ascii();
1726  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1727  if (length < 1) {
1728  // This can't match. Must be an ASCII subject and a non-ASCII character.
1729  // We do not need to do anything since the ASCII pass already handled this.
1730  return false; // Bounds not checked.
1731  }
1732  bool checked = false;
1733  // We handle the length > 1 case in a later pass.
1734  if (length == 1) {
1735  if (ascii && c > String::kMaxAsciiCharCodeU) {
1736  // Can't match - see above.
1737  return false; // Bounds not checked.
1738  }
1739  if (!preloaded) {
1740  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1741  checked = check;
1742  }
1743  macro_assembler->CheckNotCharacter(c, on_failure);
1744  }
1745  return checked;
1746 }
1747 
1748 
1749 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1750  bool ascii,
1751  uc16 c1,
1752  uc16 c2,
1753  Label* on_failure) {
1754  uc16 char_mask;
1755  if (ascii) {
1756  char_mask = String::kMaxAsciiCharCode;
1757  } else {
1758  char_mask = String::kMaxUtf16CodeUnit;
1759  }
1760  uc16 exor = c1 ^ c2;
1761  // Check whether exor has only one bit set.
1762  if (((exor - 1) & exor) == 0) {
1763  // If c1 and c2 differ only by one bit.
1764  // Ecma262UnCanonicalize always gives the highest number last.
1765  ASSERT(c2 > c1);
1766  uc16 mask = char_mask ^ exor;
1767  macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1768  return true;
1769  }
1770  ASSERT(c2 > c1);
1771  uc16 diff = c2 - c1;
1772  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1773  // If the characters differ by 2^n but don't differ by one bit then
1774  // subtract the difference from the found character, then do the or
1775  // trick. We avoid the theoretical case where negative numbers are
1776  // involved in order to simplify code generation.
1777  uc16 mask = char_mask ^ diff;
1778  macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1779  diff,
1780  mask,
1781  on_failure);
1782  return true;
1783  }
1784  return false;
1785 }
1786 
1787 
1788 typedef bool EmitCharacterFunction(Isolate* isolate,
1789  RegExpCompiler* compiler,
1790  uc16 c,
1791  Label* on_failure,
1792  int cp_offset,
1793  bool check,
1794  bool preloaded);
1795 
1796 // Only emits letters (things that have case). Only used for case independent
1797 // matches.
1798 static inline bool EmitAtomLetter(Isolate* isolate,
1799  RegExpCompiler* compiler,
1800  uc16 c,
1801  Label* on_failure,
1802  int cp_offset,
1803  bool check,
1804  bool preloaded) {
1805  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1806  bool ascii = compiler->ascii();
1808  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1809  if (length <= 1) return false;
1810  // We may not need to check against the end of the input string
1811  // if this character lies before a character that matched.
1812  if (!preloaded) {
1813  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1814  }
1815  Label ok;
1817  switch (length) {
1818  case 2: {
1819  if (ShortCutEmitCharacterPair(macro_assembler,
1820  ascii,
1821  chars[0],
1822  chars[1],
1823  on_failure)) {
1824  } else {
1825  macro_assembler->CheckCharacter(chars[0], &ok);
1826  macro_assembler->CheckNotCharacter(chars[1], on_failure);
1827  macro_assembler->Bind(&ok);
1828  }
1829  break;
1830  }
1831  case 4:
1832  macro_assembler->CheckCharacter(chars[3], &ok);
1833  // Fall through!
1834  case 3:
1835  macro_assembler->CheckCharacter(chars[0], &ok);
1836  macro_assembler->CheckCharacter(chars[1], &ok);
1837  macro_assembler->CheckNotCharacter(chars[2], on_failure);
1838  macro_assembler->Bind(&ok);
1839  break;
1840  default:
1841  UNREACHABLE();
1842  break;
1843  }
1844  return true;
1845 }
1846 
1847 
1848 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1849  int border,
1850  Label* fall_through,
1851  Label* above_or_equal,
1852  Label* below) {
1853  if (below != fall_through) {
1854  masm->CheckCharacterLT(border, below);
1855  if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1856  } else {
1857  masm->CheckCharacterGT(border - 1, above_or_equal);
1858  }
1859 }
1860 
1861 
1862 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1863  int first,
1864  int last,
1865  Label* fall_through,
1866  Label* in_range,
1867  Label* out_of_range) {
1868  if (in_range == fall_through) {
1869  if (first == last) {
1870  masm->CheckNotCharacter(first, out_of_range);
1871  } else {
1872  masm->CheckCharacterNotInRange(first, last, out_of_range);
1873  }
1874  } else {
1875  if (first == last) {
1876  masm->CheckCharacter(first, in_range);
1877  } else {
1878  masm->CheckCharacterInRange(first, last, in_range);
1879  }
1880  if (out_of_range != fall_through) masm->GoTo(out_of_range);
1881  }
1882 }
1883 
1884 
1885 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1886 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1887 static void EmitUseLookupTable(
1888  RegExpMacroAssembler* masm,
1889  ZoneList<int>* ranges,
1890  int start_index,
1891  int end_index,
1892  int min_char,
1893  Label* fall_through,
1894  Label* even_label,
1895  Label* odd_label) {
1896  static const int kSize = RegExpMacroAssembler::kTableSize;
1897  static const int kMask = RegExpMacroAssembler::kTableMask;
1898 
1899  int base = (min_char & ~kMask);
1900  USE(base);
1901 
1902  // Assert that everything is on one kTableSize page.
1903  for (int i = start_index; i <= end_index; i++) {
1904  ASSERT_EQ(ranges->at(i) & ~kMask, base);
1905  }
1906  ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1907 
1908  char templ[kSize];
1909  Label* on_bit_set;
1910  Label* on_bit_clear;
1911  int bit;
1912  if (even_label == fall_through) {
1913  on_bit_set = odd_label;
1914  on_bit_clear = even_label;
1915  bit = 1;
1916  } else {
1917  on_bit_set = even_label;
1918  on_bit_clear = odd_label;
1919  bit = 0;
1920  }
1921  for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1922  templ[i] = bit;
1923  }
1924  int j = 0;
1925  bit ^= 1;
1926  for (int i = start_index; i < end_index; i++) {
1927  for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1928  templ[j] = bit;
1929  }
1930  bit ^= 1;
1931  }
1932  for (int i = j; i < kSize; i++) {
1933  templ[i] = bit;
1934  }
1935  // TODO(erikcorry): Cache these.
1936  Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED);
1937  for (int i = 0; i < kSize; i++) {
1938  ba->set(i, templ[i]);
1939  }
1940  masm->CheckBitInTable(ba, on_bit_set);
1941  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1942 }
1943 
1944 
1945 static void CutOutRange(RegExpMacroAssembler* masm,
1946  ZoneList<int>* ranges,
1947  int start_index,
1948  int end_index,
1949  int cut_index,
1950  Label* even_label,
1951  Label* odd_label) {
1952  bool odd = (((cut_index - start_index) & 1) == 1);
1953  Label* in_range_label = odd ? odd_label : even_label;
1954  Label dummy;
1955  EmitDoubleBoundaryTest(masm,
1956  ranges->at(cut_index),
1957  ranges->at(cut_index + 1) - 1,
1958  &dummy,
1959  in_range_label,
1960  &dummy);
1961  ASSERT(!dummy.is_linked());
1962  // Cut out the single range by rewriting the array. This creates a new
1963  // range that is a merger of the two ranges on either side of the one we
1964  // are cutting out. The oddity of the labels is preserved.
1965  for (int j = cut_index; j > start_index; j--) {
1966  ranges->at(j) = ranges->at(j - 1);
1967  }
1968  for (int j = cut_index + 1; j < end_index; j++) {
1969  ranges->at(j) = ranges->at(j + 1);
1970  }
1971 }
1972 
1973 
1974 // Unicode case. Split the search space into kSize spaces that are handled
1975 // with recursion.
1976 static void SplitSearchSpace(ZoneList<int>* ranges,
1977  int start_index,
1978  int end_index,
1979  int* new_start_index,
1980  int* new_end_index,
1981  int* border) {
1982  static const int kSize = RegExpMacroAssembler::kTableSize;
1983  static const int kMask = RegExpMacroAssembler::kTableMask;
1984 
1985  int first = ranges->at(start_index);
1986  int last = ranges->at(end_index) - 1;
1987 
1988  *new_start_index = start_index;
1989  *border = (ranges->at(start_index) & ~kMask) + kSize;
1990  while (*new_start_index < end_index) {
1991  if (ranges->at(*new_start_index) > *border) break;
1992  (*new_start_index)++;
1993  }
1994  // new_start_index is the index of the first edge that is beyond the
1995  // current kSize space.
1996 
1997  // For very large search spaces we do a binary chop search of the non-ASCII
1998  // space instead of just going to the end of the current kSize space. The
1999  // heuristics are complicated a little by the fact that any 128-character
2000  // encoding space can be quickly tested with a table lookup, so we don't
2001  // wish to do binary chop search at a smaller granularity than that. A
2002  // 128-character space can take up a lot of space in the ranges array if,
2003  // for example, we only want to match every second character (eg. the lower
2004  // case characters on some Unicode pages).
2005  int binary_chop_index = (end_index + start_index) / 2;
2006  // The first test ensures that we get to the code that handles the ASCII
2007  // range with a single not-taken branch, speeding up this important
2008  // character range (even non-ASCII charset-based text has spaces and
2009  // punctuation).
2010  if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.
2011  end_index - start_index > (*new_start_index - start_index) * 2 &&
2012  last - first > kSize * 2 &&
2013  binary_chop_index > *new_start_index &&
2014  ranges->at(binary_chop_index) >= first + 2 * kSize) {
2015  int scan_forward_for_section_border = binary_chop_index;;
2016  int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
2017 
2018  while (scan_forward_for_section_border < end_index) {
2019  if (ranges->at(scan_forward_for_section_border) > new_border) {
2020  *new_start_index = scan_forward_for_section_border;
2021  *border = new_border;
2022  break;
2023  }
2024  scan_forward_for_section_border++;
2025  }
2026  }
2027 
2028  ASSERT(*new_start_index > start_index);
2029  *new_end_index = *new_start_index - 1;
2030  if (ranges->at(*new_end_index) == *border) {
2031  (*new_end_index)--;
2032  }
2033  if (*border >= ranges->at(end_index)) {
2034  *border = ranges->at(end_index);
2035  *new_start_index = end_index; // Won't be used.
2036  *new_end_index = end_index - 1;
2037  }
2038 }
2039 
2040 
2041 // Gets a series of segment boundaries representing a character class. If the
2042 // character is in the range between an even and an odd boundary (counting from
2043 // start_index) then go to even_label, otherwise go to odd_label. We already
2044 // know that the character is in the range of min_char to max_char inclusive.
2045 // Either label can be NULL indicating backtracking. Either label can also be
2046 // equal to the fall_through label.
2047 static void GenerateBranches(RegExpMacroAssembler* masm,
2048  ZoneList<int>* ranges,
2049  int start_index,
2050  int end_index,
2051  uc16 min_char,
2052  uc16 max_char,
2053  Label* fall_through,
2054  Label* even_label,
2055  Label* odd_label) {
2056  int first = ranges->at(start_index);
2057  int last = ranges->at(end_index) - 1;
2058 
2059  ASSERT_LT(min_char, first);
2060 
2061  // Just need to test if the character is before or on-or-after
2062  // a particular character.
2063  if (start_index == end_index) {
2064  EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2065  return;
2066  }
2067 
2068  // Another almost trivial case: There is one interval in the middle that is
2069  // different from the end intervals.
2070  if (start_index + 1 == end_index) {
2071  EmitDoubleBoundaryTest(
2072  masm, first, last, fall_through, even_label, odd_label);
2073  return;
2074  }
2075 
2076  // It's not worth using table lookup if there are very few intervals in the
2077  // character class.
2078  if (end_index - start_index <= 6) {
2079  // It is faster to test for individual characters, so we look for those
2080  // first, then try arbitrary ranges in the second round.
2081  static int kNoCutIndex = -1;
2082  int cut = kNoCutIndex;
2083  for (int i = start_index; i < end_index; i++) {
2084  if (ranges->at(i) == ranges->at(i + 1) - 1) {
2085  cut = i;
2086  break;
2087  }
2088  }
2089  if (cut == kNoCutIndex) cut = start_index;
2090  CutOutRange(
2091  masm, ranges, start_index, end_index, cut, even_label, odd_label);
2092  ASSERT_GE(end_index - start_index, 2);
2093  GenerateBranches(masm,
2094  ranges,
2095  start_index + 1,
2096  end_index - 1,
2097  min_char,
2098  max_char,
2099  fall_through,
2100  even_label,
2101  odd_label);
2102  return;
2103  }
2104 
2105  // If there are a lot of intervals in the regexp, then we will use tables to
2106  // determine whether the character is inside or outside the character class.
2107  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2108 
2109  if ((max_char >> kBits) == (min_char >> kBits)) {
2110  EmitUseLookupTable(masm,
2111  ranges,
2112  start_index,
2113  end_index,
2114  min_char,
2115  fall_through,
2116  even_label,
2117  odd_label);
2118  return;
2119  }
2120 
2121  if ((min_char >> kBits) != (first >> kBits)) {
2122  masm->CheckCharacterLT(first, odd_label);
2123  GenerateBranches(masm,
2124  ranges,
2125  start_index + 1,
2126  end_index,
2127  first,
2128  max_char,
2129  fall_through,
2130  odd_label,
2131  even_label);
2132  return;
2133  }
2134 
2135  int new_start_index = 0;
2136  int new_end_index = 0;
2137  int border = 0;
2138 
2139  SplitSearchSpace(ranges,
2140  start_index,
2141  end_index,
2142  &new_start_index,
2143  &new_end_index,
2144  &border);
2145 
2146  Label handle_rest;
2147  Label* above = &handle_rest;
2148  if (border == last + 1) {
2149  // We didn't find any section that started after the limit, so everything
2150  // above the border is one of the terminal labels.
2151  above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2152  ASSERT(new_end_index == end_index - 1);
2153  }
2154 
2155  ASSERT_LE(start_index, new_end_index);
2156  ASSERT_LE(new_start_index, end_index);
2157  ASSERT_LT(start_index, new_start_index);
2158  ASSERT_LT(new_end_index, end_index);
2159  ASSERT(new_end_index + 1 == new_start_index ||
2160  (new_end_index + 2 == new_start_index &&
2161  border == ranges->at(new_end_index + 1)));
2162  ASSERT_LT(min_char, border - 1);
2163  ASSERT_LT(border, max_char);
2164  ASSERT_LT(ranges->at(new_end_index), border);
2165  ASSERT(border < ranges->at(new_start_index) ||
2166  (border == ranges->at(new_start_index) &&
2167  new_start_index == end_index &&
2168  new_end_index == end_index - 1 &&
2169  border == last + 1));
2170  ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2171 
2172  masm->CheckCharacterGT(border - 1, above);
2173  Label dummy;
2174  GenerateBranches(masm,
2175  ranges,
2176  start_index,
2177  new_end_index,
2178  min_char,
2179  border - 1,
2180  &dummy,
2181  even_label,
2182  odd_label);
2183  if (handle_rest.is_linked()) {
2184  masm->Bind(&handle_rest);
2185  bool flip = (new_start_index & 1) != (start_index & 1);
2186  GenerateBranches(masm,
2187  ranges,
2188  new_start_index,
2189  end_index,
2190  border,
2191  max_char,
2192  &dummy,
2193  flip ? odd_label : even_label,
2194  flip ? even_label : odd_label);
2195  }
2196 }
2197 
2198 
2199 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2200  RegExpCharacterClass* cc,
2201  bool ascii,
2202  Label* on_failure,
2203  int cp_offset,
2204  bool check_offset,
2205  bool preloaded,
2206  Zone* zone) {
2207  ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2208  if (!CharacterRange::IsCanonical(ranges)) {
2210  }
2211 
2212  int max_char;
2213  if (ascii) {
2214  max_char = String::kMaxAsciiCharCode;
2215  } else {
2216  max_char = String::kMaxUtf16CodeUnit;
2217  }
2218 
2219  int range_count = ranges->length();
2220 
2221  int last_valid_range = range_count - 1;
2222  while (last_valid_range >= 0) {
2223  CharacterRange& range = ranges->at(last_valid_range);
2224  if (range.from() <= max_char) {
2225  break;
2226  }
2227  last_valid_range--;
2228  }
2229 
2230  if (last_valid_range < 0) {
2231  if (!cc->is_negated()) {
2232  macro_assembler->GoTo(on_failure);
2233  }
2234  if (check_offset) {
2235  macro_assembler->CheckPosition(cp_offset, on_failure);
2236  }
2237  return;
2238  }
2239 
2240  if (last_valid_range == 0 &&
2241  ranges->at(0).IsEverything(max_char)) {
2242  if (cc->is_negated()) {
2243  macro_assembler->GoTo(on_failure);
2244  } else {
2245  // This is a common case hit by non-anchored expressions.
2246  if (check_offset) {
2247  macro_assembler->CheckPosition(cp_offset, on_failure);
2248  }
2249  }
2250  return;
2251  }
2252  if (last_valid_range == 0 &&
2253  !cc->is_negated() &&
2254  ranges->at(0).IsEverything(max_char)) {
2255  // This is a common case hit by non-anchored expressions.
2256  if (check_offset) {
2257  macro_assembler->CheckPosition(cp_offset, on_failure);
2258  }
2259  return;
2260  }
2261 
2262  if (!preloaded) {
2263  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2264  }
2265 
2266  if (cc->is_standard(zone) &&
2267  macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2268  on_failure)) {
2269  return;
2270  }
2271 
2272 
2273  // A new list with ascending entries. Each entry is a code unit
2274  // where there is a boundary between code units that are part of
2275  // the class and code units that are not. Normally we insert an
2276  // entry at zero which goes to the failure label, but if there
2277  // was already one there we fall through for success on that entry.
2278  // Subsequent entries have alternating meaning (success/failure).
2279  ZoneList<int>* range_boundaries =
2280  new(zone) ZoneList<int>(last_valid_range, zone);
2281 
2282  bool zeroth_entry_is_failure = !cc->is_negated();
2283 
2284  for (int i = 0; i <= last_valid_range; i++) {
2285  CharacterRange& range = ranges->at(i);
2286  if (range.from() == 0) {
2287  ASSERT_EQ(i, 0);
2288  zeroth_entry_is_failure = !zeroth_entry_is_failure;
2289  } else {
2290  range_boundaries->Add(range.from(), zone);
2291  }
2292  range_boundaries->Add(range.to() + 1, zone);
2293  }
2294  int end_index = range_boundaries->length() - 1;
2295  if (range_boundaries->at(end_index) > max_char) {
2296  end_index--;
2297  }
2298 
2299  Label fall_through;
2300  GenerateBranches(macro_assembler,
2301  range_boundaries,
2302  0, // start_index.
2303  end_index,
2304  0, // min_char.
2305  max_char,
2306  &fall_through,
2307  zeroth_entry_is_failure ? &fall_through : on_failure,
2308  zeroth_entry_is_failure ? on_failure : &fall_through);
2309  macro_assembler->Bind(&fall_through);
2310 }
2311 
2312 
2314 }
2315 
2316 
2318  Trace* trace) {
2319  // If we are generating a greedy loop then don't stop and don't reuse code.
2320  if (trace->stop_node() != NULL) {
2321  return CONTINUE;
2322  }
2323 
2324  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2325  if (trace->is_trivial()) {
2326  if (label_.is_bound()) {
2327  // We are being asked to generate a generic version, but that's already
2328  // been done so just go to it.
2329  macro_assembler->GoTo(&label_);
2330  return DONE;
2331  }
2332  if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2333  // To avoid too deep recursion we push the node to the work queue and just
2334  // generate a goto here.
2335  compiler->AddWork(this);
2336  macro_assembler->GoTo(&label_);
2337  return DONE;
2338  }
2339  // Generate generic version of the node and bind the label for later use.
2340  macro_assembler->Bind(&label_);
2341  return CONTINUE;
2342  }
2343 
2344  // We are being asked to make a non-generic version. Keep track of how many
2345  // non-generic versions we generate so as not to overdo it.
2346  trace_count_++;
2347  if (FLAG_regexp_optimization &&
2348  trace_count_ < kMaxCopiesCodeGenerated &&
2350  return CONTINUE;
2351  }
2352 
2353  // If we get here code has been generated for this node too many times or
2354  // recursion is too deep. Time to switch to a generic version. The code for
2355  // generic versions above can handle deep recursion properly.
2356  trace->Flush(compiler, this);
2357  return DONE;
2358 }
2359 
2360 
2361 int ActionNode::EatsAtLeast(int still_to_find,
2362  int recursion_depth,
2363  bool not_at_start) {
2364  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2365  if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2366  return on_success()->EatsAtLeast(still_to_find,
2367  recursion_depth + 1,
2368  not_at_start);
2369 }
2370 
2371 
2373  int recursion_depth,
2374  int budget,
2375  BoyerMooreLookahead* bm,
2376  bool not_at_start) {
2377  if (type_ == BEGIN_SUBMATCH) {
2378  bm->SetRest(offset);
2379  } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
2380  on_success()->FillInBMInfo(
2381  offset, recursion_depth + 1, budget - 1, bm, not_at_start);
2382  }
2383  SaveBMInfo(bm, not_at_start, offset);
2384 }
2385 
2386 
2387 int AssertionNode::EatsAtLeast(int still_to_find,
2388  int recursion_depth,
2389  bool not_at_start) {
2390  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2391  // If we know we are not at the start and we are asked "how many characters
2392  // will you match if you succeed?" then we can answer anything since false
2393  // implies false. So lets just return the max answer (still_to_find) since
2394  // that won't prevent us from preloading a lot of characters for the other
2395  // branches in the node graph.
2396  if (type() == AT_START && not_at_start) return still_to_find;
2397  return on_success()->EatsAtLeast(still_to_find,
2398  recursion_depth + 1,
2399  not_at_start);
2400 }
2401 
2402 
2404  int recursion_depth,
2405  int budget,
2406  BoyerMooreLookahead* bm,
2407  bool not_at_start) {
2408  // Match the behaviour of EatsAtLeast on this node.
2409  if (type() == AT_START && not_at_start) return;
2410  on_success()->FillInBMInfo(
2411  offset, recursion_depth + 1, budget - 1, bm, not_at_start);
2412  SaveBMInfo(bm, not_at_start, offset);
2413 }
2414 
2415 
2416 int BackReferenceNode::EatsAtLeast(int still_to_find,
2417  int recursion_depth,
2418  bool not_at_start) {
2419  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2420  return on_success()->EatsAtLeast(still_to_find,
2421  recursion_depth + 1,
2422  not_at_start);
2423 }
2424 
2425 
2426 int TextNode::EatsAtLeast(int still_to_find,
2427  int recursion_depth,
2428  bool not_at_start) {
2429  int answer = Length();
2430  if (answer >= still_to_find) return answer;
2431  if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
2432  // We are not at start after this node so we set the last argument to 'true'.
2433  return answer + on_success()->EatsAtLeast(still_to_find - answer,
2434  recursion_depth + 1,
2435  true);
2436 }
2437 
2438 
2440  int recursion_depth,
2441  bool not_at_start) {
2442  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2443  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2444  // afterwards.
2445  RegExpNode* node = alternatives_->at(1).node();
2446  return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
2447 }
2448 
2449 
2451  QuickCheckDetails* details,
2452  RegExpCompiler* compiler,
2453  int filled_in,
2454  bool not_at_start) {
2455  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2456  // afterwards.
2457  RegExpNode* node = alternatives_->at(1).node();
2458  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2459 }
2460 
2461 
2462 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2463  int recursion_depth,
2464  RegExpNode* ignore_this_node,
2465  bool not_at_start) {
2466  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2467  int min = 100;
2468  int choice_count = alternatives_->length();
2469  for (int i = 0; i < choice_count; i++) {
2470  RegExpNode* node = alternatives_->at(i).node();
2471  if (node == ignore_this_node) continue;
2472  int node_eats_at_least = node->EatsAtLeast(still_to_find,
2473  recursion_depth + 1,
2474  not_at_start);
2475  if (node_eats_at_least < min) min = node_eats_at_least;
2476  }
2477  return min;
2478 }
2479 
2480 
2481 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2482  int recursion_depth,
2483  bool not_at_start) {
2484  return EatsAtLeastHelper(still_to_find,
2485  recursion_depth,
2486  loop_node_,
2487  not_at_start);
2488 }
2489 
2490 
2491 int ChoiceNode::EatsAtLeast(int still_to_find,
2492  int recursion_depth,
2493  bool not_at_start) {
2494  return EatsAtLeastHelper(still_to_find,
2495  recursion_depth,
2496  NULL,
2497  not_at_start);
2498 }
2499 
2500 
2501 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
2502 static inline uint32_t SmearBitsRight(uint32_t v) {
2503  v |= v >> 1;
2504  v |= v >> 2;
2505  v |= v >> 4;
2506  v |= v >> 8;
2507  v |= v >> 16;
2508  return v;
2509 }
2510 
2511 
2513  bool found_useful_op = false;
2514  uint32_t char_mask;
2515  if (asc) {
2516  char_mask = String::kMaxAsciiCharCode;
2517  } else {
2518  char_mask = String::kMaxUtf16CodeUnit;
2519  }
2520  mask_ = 0;
2521  value_ = 0;
2522  int char_shift = 0;
2523  for (int i = 0; i < characters_; i++) {
2524  Position* pos = &positions_[i];
2525  if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
2526  found_useful_op = true;
2527  }
2528  mask_ |= (pos->mask & char_mask) << char_shift;
2529  value_ |= (pos->value & char_mask) << char_shift;
2530  char_shift += asc ? 8 : 16;
2531  }
2532  return found_useful_op;
2533 }
2534 
2535 
2537  Trace* trace,
2538  bool preload_has_checked_bounds,
2539  Label* on_possible_success,
2540  QuickCheckDetails* details,
2541  bool fall_through_on_failure) {
2542  if (details->characters() == 0) return false;
2543  GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
2544  if (details->cannot_match()) return false;
2545  if (!details->Rationalize(compiler->ascii())) return false;
2546  ASSERT(details->characters() == 1 ||
2547  compiler->macro_assembler()->CanReadUnaligned());
2548  uint32_t mask = details->mask();
2549  uint32_t value = details->value();
2550 
2551  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2552 
2553  if (trace->characters_preloaded() != details->characters()) {
2554  assembler->LoadCurrentCharacter(trace->cp_offset(),
2555  trace->backtrack(),
2556  !preload_has_checked_bounds,
2557  details->characters());
2558  }
2559 
2560 
2561  bool need_mask = true;
2562 
2563  if (details->characters() == 1) {
2564  // If number of characters preloaded is 1 then we used a byte or 16 bit
2565  // load so the value is already masked down.
2566  uint32_t char_mask;
2567  if (compiler->ascii()) {
2568  char_mask = String::kMaxAsciiCharCode;
2569  } else {
2570  char_mask = String::kMaxUtf16CodeUnit;
2571  }
2572  if ((mask & char_mask) == char_mask) need_mask = false;
2573  mask &= char_mask;
2574  } else {
2575  // For 2-character preloads in ASCII mode or 1-character preloads in
2576  // TWO_BYTE mode we also use a 16 bit load with zero extend.
2577  if (details->characters() == 2 && compiler->ascii()) {
2578  if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
2579  } else if (details->characters() == 1 && !compiler->ascii()) {
2580  if ((mask & 0xffff) == 0xffff) need_mask = false;
2581  } else {
2582  if (mask == 0xffffffff) need_mask = false;
2583  }
2584  }
2585 
2586  if (fall_through_on_failure) {
2587  if (need_mask) {
2588  assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2589  } else {
2590  assembler->CheckCharacter(value, on_possible_success);
2591  }
2592  } else {
2593  if (need_mask) {
2594  assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2595  } else {
2596  assembler->CheckNotCharacter(value, trace->backtrack());
2597  }
2598  }
2599  return true;
2600 }
2601 
2602 
2603 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2604 // super-class in the .h file).
2605 //
2606 // We iterate along the text object, building up for each character a
2607 // mask and value that can be used to test for a quick failure to match.
2608 // The masks and values for the positions will be combined into a single
2609 // machine word for the current character width in order to be used in
2610 // generating a quick check.
2612  RegExpCompiler* compiler,
2613  int characters_filled_in,
2614  bool not_at_start) {
2615  Isolate* isolate = Isolate::Current();
2616  ASSERT(characters_filled_in < details->characters());
2617  int characters = details->characters();
2618  int char_mask;
2619  if (compiler->ascii()) {
2620  char_mask = String::kMaxAsciiCharCode;
2621  } else {
2622  char_mask = String::kMaxUtf16CodeUnit;
2623  }
2624  for (int k = 0; k < elms_->length(); k++) {
2625  TextElement elm = elms_->at(k);
2626  if (elm.type == TextElement::ATOM) {
2627  Vector<const uc16> quarks = elm.data.u_atom->data();
2628  for (int i = 0; i < characters && i < quarks.length(); i++) {
2630  details->positions(characters_filled_in);
2631  uc16 c = quarks[i];
2632  if (c > char_mask) {
2633  // If we expect a non-ASCII character from an ASCII string,
2634  // there is no way we can match. Not even case independent
2635  // matching can turn an ASCII character into non-ASCII or
2636  // vice versa.
2637  details->set_cannot_match();
2638  pos->determines_perfectly = false;
2639  return;
2640  }
2641  if (compiler->ignore_case()) {
2643  int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
2644  chars);
2645  ASSERT(length != 0); // Can only happen if c > char_mask (see above).
2646  if (length == 1) {
2647  // This letter has no case equivalents, so it's nice and simple
2648  // and the mask-compare will determine definitely whether we have
2649  // a match at this character position.
2650  pos->mask = char_mask;
2651  pos->value = c;
2652  pos->determines_perfectly = true;
2653  } else {
2654  uint32_t common_bits = char_mask;
2655  uint32_t bits = chars[0];
2656  for (int j = 1; j < length; j++) {
2657  uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2658  common_bits ^= differing_bits;
2659  bits &= common_bits;
2660  }
2661  // If length is 2 and common bits has only one zero in it then
2662  // our mask and compare instruction will determine definitely
2663  // whether we have a match at this character position. Otherwise
2664  // it can only be an approximate check.
2665  uint32_t one_zero = (common_bits | ~char_mask);
2666  if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2667  pos->determines_perfectly = true;
2668  }
2669  pos->mask = common_bits;
2670  pos->value = bits;
2671  }
2672  } else {
2673  // Don't ignore case. Nice simple case where the mask-compare will
2674  // determine definitely whether we have a match at this character
2675  // position.
2676  pos->mask = char_mask;
2677  pos->value = c;
2678  pos->determines_perfectly = true;
2679  }
2680  characters_filled_in++;
2681  ASSERT(characters_filled_in <= details->characters());
2682  if (characters_filled_in == details->characters()) {
2683  return;
2684  }
2685  }
2686  } else {
2688  details->positions(characters_filled_in);
2690  ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2691  if (tree->is_negated()) {
2692  // A quick check uses multi-character mask and compare. There is no
2693  // useful way to incorporate a negative char class into this scheme
2694  // so we just conservatively create a mask and value that will always
2695  // succeed.
2696  pos->mask = 0;
2697  pos->value = 0;
2698  } else {
2699  int first_range = 0;
2700  while (ranges->at(first_range).from() > char_mask) {
2701  first_range++;
2702  if (first_range == ranges->length()) {
2703  details->set_cannot_match();
2704  pos->determines_perfectly = false;
2705  return;
2706  }
2707  }
2708  CharacterRange range = ranges->at(first_range);
2709  uc16 from = range.from();
2710  uc16 to = range.to();
2711  if (to > char_mask) {
2712  to = char_mask;
2713  }
2714  uint32_t differing_bits = (from ^ to);
2715  // A mask and compare is only perfect if the differing bits form a
2716  // number like 00011111 with one single block of trailing 1s.
2717  if ((differing_bits & (differing_bits + 1)) == 0 &&
2718  from + differing_bits == to) {
2719  pos->determines_perfectly = true;
2720  }
2721  uint32_t common_bits = ~SmearBitsRight(differing_bits);
2722  uint32_t bits = (from & common_bits);
2723  for (int i = first_range + 1; i < ranges->length(); i++) {
2724  CharacterRange range = ranges->at(i);
2725  uc16 from = range.from();
2726  uc16 to = range.to();
2727  if (from > char_mask) continue;
2728  if (to > char_mask) to = char_mask;
2729  // Here we are combining more ranges into the mask and compare
2730  // value. With each new range the mask becomes more sparse and
2731  // so the chances of a false positive rise. A character class
2732  // with multiple ranges is assumed never to be equivalent to a
2733  // mask and compare operation.
2734  pos->determines_perfectly = false;
2735  uint32_t new_common_bits = (from ^ to);
2736  new_common_bits = ~SmearBitsRight(new_common_bits);
2737  common_bits &= new_common_bits;
2738  bits &= new_common_bits;
2739  uint32_t differing_bits = (from & common_bits) ^ bits;
2740  common_bits ^= differing_bits;
2741  bits &= common_bits;
2742  }
2743  pos->mask = common_bits;
2744  pos->value = bits;
2745  }
2746  characters_filled_in++;
2747  ASSERT(characters_filled_in <= details->characters());
2748  if (characters_filled_in == details->characters()) {
2749  return;
2750  }
2751  }
2752  }
2753  ASSERT(characters_filled_in != details->characters());
2754  if (!details->cannot_match()) {
2755  on_success()-> GetQuickCheckDetails(details,
2756  compiler,
2757  characters_filled_in,
2758  true);
2759  }
2760 }
2761 
2762 
2764  for (int i = 0; i < characters_; i++) {
2765  positions_[i].mask = 0;
2766  positions_[i].value = 0;
2767  positions_[i].determines_perfectly = false;
2768  }
2769  characters_ = 0;
2770 }
2771 
2772 
2773 void QuickCheckDetails::Advance(int by, bool ascii) {
2774  ASSERT(by >= 0);
2775  if (by >= characters_) {
2776  Clear();
2777  return;
2778  }
2779  for (int i = 0; i < characters_ - by; i++) {
2780  positions_[i] = positions_[by + i];
2781  }
2782  for (int i = characters_ - by; i < characters_; i++) {
2783  positions_[i].mask = 0;
2784  positions_[i].value = 0;
2785  positions_[i].determines_perfectly = false;
2786  }
2787  characters_ -= by;
2788  // We could change mask_ and value_ here but we would never advance unless
2789  // they had already been used in a check and they won't be used again because
2790  // it would gain us nothing. So there's no point.
2791 }
2792 
2793 
2794 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2795  ASSERT(characters_ == other->characters_);
2796  if (other->cannot_match_) {
2797  return;
2798  }
2799  if (cannot_match_) {
2800  *this = *other;
2801  return;
2802  }
2803  for (int i = from_index; i < characters_; i++) {
2804  QuickCheckDetails::Position* pos = positions(i);
2805  QuickCheckDetails::Position* other_pos = other->positions(i);
2806  if (pos->mask != other_pos->mask ||
2807  pos->value != other_pos->value ||
2808  !other_pos->determines_perfectly) {
2809  // Our mask-compare operation will be approximate unless we have the
2810  // exact same operation on both sides of the alternation.
2811  pos->determines_perfectly = false;
2812  }
2813  pos->mask &= other_pos->mask;
2814  pos->value &= pos->mask;
2815  other_pos->value &= pos->mask;
2816  uc16 differing_bits = (pos->value ^ other_pos->value);
2817  pos->mask &= ~differing_bits;
2818  pos->value &= pos->mask;
2819  }
2820 }
2821 
2822 
2824  public:
2825  explicit VisitMarker(NodeInfo* info) : info_(info) {
2826  ASSERT(!info->visited);
2827  info->visited = true;
2828  }
2830  info_->visited = false;
2831  }
2832  private:
2833  NodeInfo* info_;
2834 };
2835 
2836 
2838  if (info()->replacement_calculated) return replacement();
2839  if (depth < 0) return this;
2840  ASSERT(!info()->visited);
2841  VisitMarker marker(info());
2842  return FilterSuccessor(depth - 1);
2843 }
2844 
2845 
2847  RegExpNode* next = on_success_->FilterASCII(depth - 1);
2848  if (next == NULL) return set_replacement(NULL);
2849  on_success_ = next;
2850  return set_replacement(this);
2851 }
2852 
2853 
2855  if (info()->replacement_calculated) return replacement();
2856  if (depth < 0) return this;
2857  ASSERT(!info()->visited);
2858  VisitMarker marker(info());
2859  int element_count = elms_->length();
2860  for (int i = 0; i < element_count; i++) {
2861  TextElement elm = elms_->at(i);
2862  if (elm.type == TextElement::ATOM) {
2863  Vector<const uc16> quarks = elm.data.u_atom->data();
2864  for (int j = 0; j < quarks.length(); j++) {
2865  // We don't need special handling for case independence
2866  // because of the rule that case independence cannot make
2867  // a non-ASCII character match an ASCII character.
2868  if (quarks[j] > String::kMaxAsciiCharCode) {
2869  return set_replacement(NULL);
2870  }
2871  }
2872  } else {
2875  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2876  if (!CharacterRange::IsCanonical(ranges)) {
2878  }
2879  // Now they are in order so we only need to look at the first.
2880  int range_count = ranges->length();
2881  if (cc->is_negated()) {
2882  if (range_count != 0 &&
2883  ranges->at(0).from() == 0 &&
2884  ranges->at(0).to() >= String::kMaxAsciiCharCode) {
2885  return set_replacement(NULL);
2886  }
2887  } else {
2888  if (range_count == 0 ||
2889  ranges->at(0).from() > String::kMaxAsciiCharCode) {
2890  return set_replacement(NULL);
2891  }
2892  }
2893  }
2894  }
2895  return FilterSuccessor(depth - 1);
2896 }
2897 
2898 
2900  if (info()->replacement_calculated) return replacement();
2901  if (depth < 0) return this;
2902  if (info()->visited) return this;
2903  {
2904  VisitMarker marker(info());
2905 
2906  RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
2907  // If we can't continue after the loop then there is no sense in doing the
2908  // loop.
2909  if (continue_replacement == NULL) return set_replacement(NULL);
2910  }
2911 
2912  return ChoiceNode::FilterASCII(depth - 1);
2913 }
2914 
2915 
2917  if (info()->replacement_calculated) return replacement();
2918  if (depth < 0) return this;
2919  if (info()->visited) return this;
2920  VisitMarker marker(info());
2921  int choice_count = alternatives_->length();
2922 
2923  for (int i = 0; i < choice_count; i++) {
2924  GuardedAlternative alternative = alternatives_->at(i);
2925  if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2926  set_replacement(this);
2927  return this;
2928  }
2929  }
2930 
2931  int surviving = 0;
2932  RegExpNode* survivor = NULL;
2933  for (int i = 0; i < choice_count; i++) {
2934  GuardedAlternative alternative = alternatives_->at(i);
2935  RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
2936  ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2937  if (replacement != NULL) {
2938  alternatives_->at(i).set_node(replacement);
2939  surviving++;
2940  survivor = replacement;
2941  }
2942  }
2943  if (surviving < 2) return set_replacement(survivor);
2944 
2945  set_replacement(this);
2946  if (surviving == choice_count) {
2947  return this;
2948  }
2949  // Only some of the nodes survived the filtering. We need to rebuild the
2950  // alternatives list.
2951  ZoneList<GuardedAlternative>* new_alternatives =
2952  new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2953  for (int i = 0; i < choice_count; i++) {
2954  RegExpNode* replacement =
2955  alternatives_->at(i).node()->FilterASCII(depth - 1);
2956  if (replacement != NULL) {
2957  alternatives_->at(i).set_node(replacement);
2958  new_alternatives->Add(alternatives_->at(i), zone());
2959  }
2960  }
2961  alternatives_ = new_alternatives;
2962  return this;
2963 }
2964 
2965 
2967  if (info()->replacement_calculated) return replacement();
2968  if (depth < 0) return this;
2969  if (info()->visited) return this;
2970  VisitMarker marker(info());
2971  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2972  // afterwards.
2973  RegExpNode* node = alternatives_->at(1).node();
2974  RegExpNode* replacement = node->FilterASCII(depth - 1);
2975  if (replacement == NULL) return set_replacement(NULL);
2976  alternatives_->at(1).set_node(replacement);
2977 
2978  RegExpNode* neg_node = alternatives_->at(0).node();
2979  RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
2980  // If the negative lookahead is always going to fail then
2981  // we don't need to check it.
2982  if (neg_replacement == NULL) return set_replacement(replacement);
2983  alternatives_->at(0).set_node(neg_replacement);
2984  return set_replacement(this);
2985 }
2986 
2987 
2989  RegExpCompiler* compiler,
2990  int characters_filled_in,
2991  bool not_at_start) {
2992  if (body_can_be_zero_length_ || info()->visited) return;
2993  VisitMarker marker(info());
2994  return ChoiceNode::GetQuickCheckDetails(details,
2995  compiler,
2996  characters_filled_in,
2997  not_at_start);
2998 }
2999 
3000 
3002  int recursion_depth,
3003  int budget,
3004  BoyerMooreLookahead* bm,
3005  bool not_at_start) {
3006  if (body_can_be_zero_length_ ||
3007  recursion_depth > RegExpCompiler::kMaxRecursion ||
3008  budget <= 0) {
3009  bm->SetRest(offset);
3010  SaveBMInfo(bm, not_at_start, offset);
3011  return;
3012  }
3014  offset, recursion_depth + 1, budget - 1, bm, not_at_start);
3015  SaveBMInfo(bm, not_at_start, offset);
3016 }
3017 
3018 
3020  RegExpCompiler* compiler,
3021  int characters_filled_in,
3022  bool not_at_start) {
3023  not_at_start = (not_at_start || not_at_start_);
3024  int choice_count = alternatives_->length();
3025  ASSERT(choice_count > 0);
3026  alternatives_->at(0).node()->GetQuickCheckDetails(details,
3027  compiler,
3028  characters_filled_in,
3029  not_at_start);
3030  for (int i = 1; i < choice_count; i++) {
3031  QuickCheckDetails new_details(details->characters());
3032  RegExpNode* node = alternatives_->at(i).node();
3033  node->GetQuickCheckDetails(&new_details, compiler,
3034  characters_filled_in,
3035  not_at_start);
3036  // Here we merge the quick match details of the two branches.
3037  details->Merge(&new_details, characters_filled_in);
3038  }
3039 }
3040 
3041 
3042 // Check for [0-9A-Z_a-z].
3043 static void EmitWordCheck(RegExpMacroAssembler* assembler,
3044  Label* word,
3045  Label* non_word,
3046  bool fall_through_on_word) {
3047  if (assembler->CheckSpecialCharacterClass(
3048  fall_through_on_word ? 'w' : 'W',
3049  fall_through_on_word ? non_word : word)) {
3050  // Optimized implementation available.
3051  return;
3052  }
3053  assembler->CheckCharacterGT('z', non_word);
3054  assembler->CheckCharacterLT('0', non_word);
3055  assembler->CheckCharacterGT('a' - 1, word);
3056  assembler->CheckCharacterLT('9' + 1, word);
3057  assembler->CheckCharacterLT('A', non_word);
3058  assembler->CheckCharacterLT('Z' + 1, word);
3059  if (fall_through_on_word) {
3060  assembler->CheckNotCharacter('_', non_word);
3061  } else {
3062  assembler->CheckCharacter('_', word);
3063  }
3064 }
3065 
3066 
3067 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3068 // that matches newline or the start of input).
3069 static void EmitHat(RegExpCompiler* compiler,
3070  RegExpNode* on_success,
3071  Trace* trace) {
3072  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3073  // We will be loading the previous character into the current character
3074  // register.
3075  Trace new_trace(*trace);
3076  new_trace.InvalidateCurrentCharacter();
3077 
3078  Label ok;
3079  if (new_trace.cp_offset() == 0) {
3080  // The start of input counts as a newline in this context, so skip to
3081  // ok if we are at the start.
3082  assembler->CheckAtStart(&ok);
3083  }
3084  // We already checked that we are not at the start of input so it must be
3085  // OK to load the previous character.
3086  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3087  new_trace.backtrack(),
3088  false);
3089  if (!assembler->CheckSpecialCharacterClass('n',
3090  new_trace.backtrack())) {
3091  // Newline means \n, \r, 0x2028 or 0x2029.
3092  if (!compiler->ascii()) {
3093  assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3094  }
3095  assembler->CheckCharacter('\n', &ok);
3096  assembler->CheckNotCharacter('\r', new_trace.backtrack());
3097  }
3098  assembler->Bind(&ok);
3099  on_success->Emit(compiler, &new_trace);
3100 }
3101 
3102 
3103 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3104 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3105  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3106  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3107  bool not_at_start = (trace->at_start() == Trace::FALSE);
3108  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3109  if (lookahead == NULL) {
3110  int eats_at_least =
3112  EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
3113  if (eats_at_least >= 1) {
3114  BoyerMooreLookahead* bm =
3115  new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3116  FillInBMInfo(0, 0, kFillInBMBudget, bm, not_at_start);
3117  if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
3118  if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE;
3119  }
3120  } else {
3121  if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
3122  if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE;
3123  }
3124  bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY);
3125  if (next_is_word_character == Trace::UNKNOWN) {
3126  Label before_non_word;
3127  Label before_word;
3128  if (trace->characters_preloaded() != 1) {
3129  assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3130  }
3131  // Fall through on non-word.
3132  EmitWordCheck(assembler, &before_word, &before_non_word, false);
3133  // Next character is not a word character.
3134  assembler->Bind(&before_non_word);
3135  Label ok;
3136  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3137  assembler->GoTo(&ok);
3138 
3139  assembler->Bind(&before_word);
3140  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3141  assembler->Bind(&ok);
3142  } else if (next_is_word_character == Trace::TRUE) {
3143  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3144  } else {
3145  ASSERT(next_is_word_character == Trace::FALSE);
3146  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3147  }
3148 }
3149 
3150 
3151 void AssertionNode::BacktrackIfPrevious(
3152  RegExpCompiler* compiler,
3153  Trace* trace,
3154  AssertionNode::IfPrevious backtrack_if_previous) {
3155  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3156  Trace new_trace(*trace);
3157  new_trace.InvalidateCurrentCharacter();
3158 
3159  Label fall_through, dummy;
3160 
3161  Label* non_word = backtrack_if_previous == kIsNonWord ?
3162  new_trace.backtrack() :
3163  &fall_through;
3164  Label* word = backtrack_if_previous == kIsNonWord ?
3165  &fall_through :
3166  new_trace.backtrack();
3167 
3168  if (new_trace.cp_offset() == 0) {
3169  // The start of input counts as a non-word character, so the question is
3170  // decided if we are at the start.
3171  assembler->CheckAtStart(non_word);
3172  }
3173  // We already checked that we are not at the start of input so it must be
3174  // OK to load the previous character.
3175  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3176  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3177 
3178  assembler->Bind(&fall_through);
3179  on_success()->Emit(compiler, &new_trace);
3180 }
3181 
3182 
3184  RegExpCompiler* compiler,
3185  int filled_in,
3186  bool not_at_start) {
3187  if (type_ == AT_START && not_at_start) {
3188  details->set_cannot_match();
3189  return;
3190  }
3191  return on_success()->GetQuickCheckDetails(details,
3192  compiler,
3193  filled_in,
3194  not_at_start);
3195 }
3196 
3197 
3198 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3199  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3200  switch (type_) {
3201  case AT_END: {
3202  Label ok;
3203  assembler->CheckPosition(trace->cp_offset(), &ok);
3204  assembler->GoTo(trace->backtrack());
3205  assembler->Bind(&ok);
3206  break;
3207  }
3208  case AT_START: {
3209  if (trace->at_start() == Trace::FALSE) {
3210  assembler->GoTo(trace->backtrack());
3211  return;
3212  }
3213  if (trace->at_start() == Trace::UNKNOWN) {
3214  assembler->CheckNotAtStart(trace->backtrack());
3215  Trace at_start_trace = *trace;
3216  at_start_trace.set_at_start(true);
3217  on_success()->Emit(compiler, &at_start_trace);
3218  return;
3219  }
3220  }
3221  break;
3222  case AFTER_NEWLINE:
3223  EmitHat(compiler, on_success(), trace);
3224  return;
3225  case AT_BOUNDARY:
3226  case AT_NON_BOUNDARY: {
3227  EmitBoundaryCheck(compiler, trace);
3228  return;
3229  }
3230  }
3231  on_success()->Emit(compiler, trace);
3232 }
3233 
3234 
3235 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3236  if (quick_check == NULL) return false;
3237  if (offset >= quick_check->characters()) return false;
3238  return quick_check->positions(offset)->determines_perfectly;
3239 }
3240 
3241 
3242 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3243  if (index > *checked_up_to) {
3244  *checked_up_to = index;
3245  }
3246 }
3247 
3248 
3249 // We call this repeatedly to generate code for each pass over the text node.
3250 // The passes are in increasing order of difficulty because we hope one
3251 // of the first passes will fail in which case we are saved the work of the
3252 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3253 // we will check the '%' in the first pass, the case independent 'a' in the
3254 // second pass and the character class in the last pass.
3255 //
3256 // The passes are done from right to left, so for example to test for /bar/
3257 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3258 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3259 // bounds check most of the time. In the example we only need to check for
3260 // end-of-input when loading the putative 'r'.
3261 //
3262 // A slight complication involves the fact that the first character may already
3263 // be fetched into a register by the previous node. In this case we want to
3264 // do the test for that character first. We do this in separate passes. The
3265 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3266 // pass has been performed then subsequent passes will have true in
3267 // first_element_checked to indicate that that character does not need to be
3268 // checked again.
3269 //
3270 // In addition to all this we are passed a Trace, which can
3271 // contain an AlternativeGeneration object. In this AlternativeGeneration
3272 // object we can see details of any quick check that was already passed in
3273 // order to get to the code we are now generating. The quick check can involve
3274 // loading characters, which means we do not need to recheck the bounds
3275 // up to the limit the quick check already checked. In addition the quick
3276 // check can have involved a mask and compare operation which may simplify
3277 // or obviate the need for further checks at some character positions.
3278 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3279  TextEmitPassType pass,
3280  bool preloaded,
3281  Trace* trace,
3282  bool first_element_checked,
3283  int* checked_up_to) {
3284  Isolate* isolate = Isolate::Current();
3285  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3286  bool ascii = compiler->ascii();
3287  Label* backtrack = trace->backtrack();
3288  QuickCheckDetails* quick_check = trace->quick_check_performed();
3289  int element_count = elms_->length();
3290  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3291  TextElement elm = elms_->at(i);
3292  int cp_offset = trace->cp_offset() + elm.cp_offset;
3293  if (elm.type == TextElement::ATOM) {
3294  Vector<const uc16> quarks = elm.data.u_atom->data();
3295  for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3296  if (first_element_checked && i == 0 && j == 0) continue;
3297  if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
3298  EmitCharacterFunction* emit_function = NULL;
3299  switch (pass) {
3300  case NON_ASCII_MATCH:
3301  ASSERT(ascii);
3302  if (quarks[j] > String::kMaxAsciiCharCode) {
3303  assembler->GoTo(backtrack);
3304  return;
3305  }
3306  break;
3307  case NON_LETTER_CHARACTER_MATCH:
3308  emit_function = &EmitAtomNonLetter;
3309  break;
3310  case SIMPLE_CHARACTER_MATCH:
3311  emit_function = &EmitSimpleCharacter;
3312  break;
3313  case CASE_CHARACTER_MATCH:
3314  emit_function = &EmitAtomLetter;
3315  break;
3316  default:
3317  break;
3318  }
3319  if (emit_function != NULL) {
3320  bool bound_checked = emit_function(isolate,
3321  compiler,
3322  quarks[j],
3323  backtrack,
3324  cp_offset + j,
3325  *checked_up_to < cp_offset + j,
3326  preloaded);
3327  if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3328  }
3329  }
3330  } else {
3332  if (pass == CHARACTER_CLASS_MATCH) {
3333  if (first_element_checked && i == 0) continue;
3334  if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
3335  RegExpCharacterClass* cc = elm.data.u_char_class;
3336  EmitCharClass(assembler,
3337  cc,
3338  ascii,
3339  backtrack,
3340  cp_offset,
3341  *checked_up_to < cp_offset,
3342  preloaded,
3343  zone());
3344  UpdateBoundsCheck(cp_offset, checked_up_to);
3345  }
3346  }
3347  }
3348 }
3349 
3350 
3351 int TextNode::Length() {
3352  TextElement elm = elms_->last();
3353  ASSERT(elm.cp_offset >= 0);
3354  if (elm.type == TextElement::ATOM) {
3355  return elm.cp_offset + elm.data.u_atom->data().length();
3356  } else {
3357  return elm.cp_offset + 1;
3358  }
3359 }
3360 
3361 
3362 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3363  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3364  if (ignore_case) {
3365  return pass == SIMPLE_CHARACTER_MATCH;
3366  } else {
3367  return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3368  }
3369 }
3370 
3371 
3372 // This generates the code to match a text node. A text node can contain
3373 // straight character sequences (possibly to be matched in a case-independent
3374 // way) and character classes. For efficiency we do not do this in a single
3375 // pass from left to right. Instead we pass over the text node several times,
3376 // emitting code for some character positions every time. See the comment on
3377 // TextEmitPass for details.
3378 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3379  LimitResult limit_result = LimitVersions(compiler, trace);
3380  if (limit_result == DONE) return;
3381  ASSERT(limit_result == CONTINUE);
3382 
3383  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3384  compiler->SetRegExpTooBig();
3385  return;
3386  }
3387 
3388  if (compiler->ascii()) {
3389  int dummy = 0;
3390  TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
3391  }
3392 
3393  bool first_elt_done = false;
3394  int bound_checked_to = trace->cp_offset() - 1;
3395  bound_checked_to += trace->bound_checked_up_to();
3396 
3397  // If a character is preloaded into the current character register then
3398  // check that now.
3399  if (trace->characters_preloaded() == 1) {
3400  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3401  if (!SkipPass(pass, compiler->ignore_case())) {
3402  TextEmitPass(compiler,
3403  static_cast<TextEmitPassType>(pass),
3404  true,
3405  trace,
3406  false,
3407  &bound_checked_to);
3408  }
3409  }
3410  first_elt_done = true;
3411  }
3412 
3413  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3414  if (!SkipPass(pass, compiler->ignore_case())) {
3415  TextEmitPass(compiler,
3416  static_cast<TextEmitPassType>(pass),
3417  false,
3418  trace,
3419  first_elt_done,
3420  &bound_checked_to);
3421  }
3422  }
3423 
3424  Trace successor_trace(*trace);
3425  successor_trace.set_at_start(false);
3426  successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3427  RecursionCheck rc(compiler);
3428  on_success()->Emit(compiler, &successor_trace);
3429 }
3430 
3431 
3433  characters_preloaded_ = 0;
3434 }
3435 
3436 
3438  ASSERT(by > 0);
3439  // We don't have an instruction for shifting the current character register
3440  // down or for using a shifted value for anything so lets just forget that
3441  // we preloaded any characters into it.
3442  characters_preloaded_ = 0;
3443  // Adjust the offsets of the quick check performed information. This
3444  // information is used to find out what we already determined about the
3445  // characters by means of mask and compare.
3446  quick_check_performed_.Advance(by, compiler->ascii());
3447  cp_offset_ += by;
3448  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3449  compiler->SetRegExpTooBig();
3450  cp_offset_ = 0;
3451  }
3452  bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3453 }
3454 
3455 
3456 void TextNode::MakeCaseIndependent(bool is_ascii) {
3457  int element_count = elms_->length();
3458  for (int i = 0; i < element_count; i++) {
3459  TextElement elm = elms_->at(i);
3460  if (elm.type == TextElement::CHAR_CLASS) {
3462  // None of the standard character classes is different in the case
3463  // independent case and it slows us down if we don't know that.
3464  if (cc->is_standard(zone())) continue;
3465  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3466  int range_count = ranges->length();
3467  for (int j = 0; j < range_count; j++) {
3468  ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone());
3469  }
3470  }
3471  }
3472 }
3473 
3474 
3476  TextElement elm = elms_->at(elms_->length() - 1);
3477  if (elm.type == TextElement::CHAR_CLASS) {
3478  return elm.cp_offset + 1;
3479  } else {
3480  return elm.cp_offset + elm.data.u_atom->data().length();
3481  }
3482 }
3483 
3484 
3486  RegExpCompiler* compiler) {
3487  if (elms_->length() != 1) return NULL;
3488  TextElement elm = elms_->at(0);
3489  if (elm.type != TextElement::CHAR_CLASS) return NULL;
3491  ZoneList<CharacterRange>* ranges = node->ranges(zone());
3492  if (!CharacterRange::IsCanonical(ranges)) {
3494  }
3495  if (node->is_negated()) {
3496  return ranges->length() == 0 ? on_success() : NULL;
3497  }
3498  if (ranges->length() != 1) return NULL;
3499  uint32_t max_char;
3500  if (compiler->ascii()) {
3501  max_char = String::kMaxAsciiCharCode;
3502  } else {
3503  max_char = String::kMaxUtf16CodeUnit;
3504  }
3505  return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3506 }
3507 
3508 
3509 // Finds the fixed match length of a sequence of nodes that goes from
3510 // this alternative and back to this choice node. If there are variable
3511 // length nodes or other complications in the way then return a sentinel
3512 // value indicating that a greedy loop cannot be constructed.
3514  GuardedAlternative* alternative) {
3515  int length = 0;
3516  RegExpNode* node = alternative->node();
3517  // Later we will generate code for all these text nodes using recursion
3518  // so we have to limit the max number.
3519  int recursion_depth = 0;
3520  while (node != this) {
3521  if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3522  return kNodeIsTooComplexForGreedyLoops;
3523  }
3524  int node_length = node->GreedyLoopTextLength();
3525  if (node_length == kNodeIsTooComplexForGreedyLoops) {
3526  return kNodeIsTooComplexForGreedyLoops;
3527  }
3528  length += node_length;
3529  SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3530  node = seq_node->on_success();
3531  }
3532  return length;
3533 }
3534 
3535 
3537  ASSERT_EQ(loop_node_, NULL);
3538  AddAlternative(alt);
3539  loop_node_ = alt.node();
3540 }
3541 
3542 
3544  ASSERT_EQ(continue_node_, NULL);
3545  AddAlternative(alt);
3546  continue_node_ = alt.node();
3547 }
3548 
3549 
3550 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3551  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3552  if (trace->stop_node() == this) {
3553  int text_length =
3554  GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3555  ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
3556  // Update the counter-based backtracking info on the stack. This is an
3557  // optimization for greedy loops (see below).
3558  ASSERT(trace->cp_offset() == text_length);
3559  macro_assembler->AdvanceCurrentPosition(text_length);
3560  macro_assembler->GoTo(trace->loop_label());
3561  return;
3562  }
3563  ASSERT(trace->stop_node() == NULL);
3564  if (!trace->is_trivial()) {
3565  trace->Flush(compiler, this);
3566  return;
3567  }
3568  ChoiceNode::Emit(compiler, trace);
3569 }
3570 
3571 
3572 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3573  int eats_at_least) {
3574  int preload_characters = Min(4, eats_at_least);
3575  if (compiler->macro_assembler()->CanReadUnaligned()) {
3576  bool ascii = compiler->ascii();
3577  if (ascii) {
3578  if (preload_characters > 4) preload_characters = 4;
3579  // We can't preload 3 characters because there is no machine instruction
3580  // to do that. We can't just load 4 because we could be reading
3581  // beyond the end of the string, which could cause a memory fault.
3582  if (preload_characters == 3) preload_characters = 2;
3583  } else {
3584  if (preload_characters > 2) preload_characters = 2;
3585  }
3586  } else {
3587  if (preload_characters > 1) preload_characters = 1;
3588  }
3589  return preload_characters;
3590 }
3591 
3592 
3593 // This class is used when generating the alternatives in a choice node. It
3594 // records the way the alternative is being code generated.
3596  public:
3598  : possible_success(),
3599  expects_preload(false),
3600  after(),
3601  quick_check_details() { }
3604  Label after;
3606 };
3607 
3608 
3609 // Creates a list of AlternativeGenerations. If the list has a reasonable
3610 // size then it is on the stack, otherwise the excess is on the heap.
3612  public:
3614  : alt_gens_(count, zone) {
3615  for (int i = 0; i < count && i < kAFew; i++) {
3616  alt_gens_.Add(a_few_alt_gens_ + i, zone);
3617  }
3618  for (int i = kAFew; i < count; i++) {
3619  alt_gens_.Add(new AlternativeGeneration(), zone);
3620  }
3621  }
3623  for (int i = kAFew; i < alt_gens_.length(); i++) {
3624  delete alt_gens_[i];
3625  alt_gens_[i] = NULL;
3626  }
3627  }
3628 
3630  return alt_gens_[i];
3631  }
3632 
3633  private:
3634  static const int kAFew = 10;
3636  AlternativeGeneration a_few_alt_gens_[kAFew];
3637 };
3638 
3639 
3640 // The '2' variant is has inclusive from and exclusive to.
3641 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
3642  0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
3643  0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
3644 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3645 
3646 static const int kWordRanges[] = {
3647  '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3648 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3649 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3650 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3651 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3652 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
3653 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3654  0x2028, 0x202A, 0x10000 };
3655 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3656 
3657 
3658 void BoyerMoorePositionInfo::Set(int character) {
3659  SetInterval(Interval(character, character));
3660 }
3661 
3662 
3664  s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3665  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3666  d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3667  surrogate_ =
3668  AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3669  if (interval.to() - interval.from() >= kMapSize - 1) {
3670  if (map_count_ != kMapSize) {
3671  map_count_ = kMapSize;
3672  for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3673  }
3674  return;
3675  }
3676  for (int i = interval.from(); i <= interval.to(); i++) {
3677  int mod_character = (i & kMask);
3678  if (!map_->at(mod_character)) {
3679  map_count_++;
3680  map_->at(mod_character) = true;
3681  }
3682  if (map_count_ == kMapSize) return;
3683  }
3684 }
3685 
3686 
3688  s_ = w_ = d_ = kLatticeUnknown;
3689  if (map_count_ != kMapSize) {
3690  map_count_ = kMapSize;
3691  for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3692  }
3693 }
3694 
3695 
3697  int length, RegExpCompiler* compiler, Zone* zone)
3698  : length_(length),
3699  compiler_(compiler) {
3700  if (compiler->ascii()) {
3701  max_char_ = String::kMaxAsciiCharCode;
3702  } else {
3703  max_char_ = String::kMaxUtf16CodeUnit;
3704  }
3705  bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3706  for (int i = 0; i < length; i++) {
3707  bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3708  }
3709 }
3710 
3711 
3712 // Find the longest range of lookahead that has the fewest number of different
3713 // characters that can occur at a given position. Since we are optimizing two
3714 // different parameters at once this is a tradeoff.
3715 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3716  int biggest_points = 0;
3717  // If more than 32 characters out of 128 can occur it is unlikely that we can
3718  // be lucky enough to step forwards much of the time.
3719  const int kMaxMax = 32;
3720  for (int max_number_of_chars = 4;
3721  max_number_of_chars < kMaxMax;
3722  max_number_of_chars *= 2) {
3723  biggest_points =
3724  FindBestInterval(max_number_of_chars, biggest_points, from, to);
3725  }
3726  if (biggest_points == 0) return false;
3727  return true;
3728 }
3729 
3730 
3731 // Find the highest-points range between 0 and length_ where the character
3732 // information is not too vague. 'Too vague' means that there are more than
3733 // max_number_of_chars that can occur at this position. Calculates the number
3734 // of points as the product of width-of-the-range and
3735 // probability-of-finding-one-of-the-characters, where the probability is
3736 // calculated using the frequency distribution of the sample subject string.
3737 int BoyerMooreLookahead::FindBestInterval(
3738  int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3739  int biggest_points = old_biggest_points;
3740  static const int kSize = RegExpMacroAssembler::kTableSize;
3741  for (int i = 0; i < length_; ) {
3742  while (i < length_ && Count(i) > max_number_of_chars) i++;
3743  if (i == length_) break;
3744  int remembered_from = i;
3745  bool union_map[kSize];
3746  for (int j = 0; j < kSize; j++) union_map[j] = false;
3747  while (i < length_ && Count(i) <= max_number_of_chars) {
3748  BoyerMoorePositionInfo* map = bitmaps_->at(i);
3749  for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3750  i++;
3751  }
3752  int frequency = 0;
3753  for (int j = 0; j < kSize; j++) {
3754  if (union_map[j]) {
3755  // Add 1 to the frequency to give a small per-character boost for
3756  // the cases where our sampling is not good enough and many
3757  // characters have a frequency of zero. This means the frequency
3758  // can theoretically be up to 2*kSize though we treat it mostly as
3759  // a fraction of kSize.
3760  frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3761  }
3762  }
3763  // We use the probability of skipping times the distance we are skipping to
3764  // judge the effectiveness of this. Actually we have a cut-off: By
3765  // dividing by 2 we switch off the skipping if the probability of skipping
3766  // is less than 50%. This is because the multibyte mask-and-compare
3767  // skipping in quickcheck is more likely to do well on this case.
3768  bool in_quickcheck_range = ((i - remembered_from < 4) ||
3769  (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
3770  // Called 'probability' but it is only a rough estimate and can actually
3771  // be outside the 0-kSize range.
3772  int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3773  int points = (i - remembered_from) * probability;
3774  if (points > biggest_points) {
3775  *from = remembered_from;
3776  *to = i - 1;
3777  biggest_points = points;
3778  }
3779  }
3780  return biggest_points;
3781 }
3782 
3783 
3784 // Take all the characters that will not prevent a successful match if they
3785 // occur in the subject string in the range between min_lookahead and
3786 // max_lookahead (inclusive) measured from the current position. If the
3787 // character at max_lookahead offset is not one of these characters, then we
3788 // can safely skip forwards by the number of characters in the range.
3789 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3790  int max_lookahead,
3791  Handle<ByteArray> boolean_skip_table) {
3792  const int kSize = RegExpMacroAssembler::kTableSize;
3793 
3794  const int kSkipArrayEntry = 0;
3795  const int kDontSkipArrayEntry = 1;
3796 
3797  for (int i = 0; i < kSize; i++) {
3798  boolean_skip_table->set(i, kSkipArrayEntry);
3799  }
3800  int skip = max_lookahead + 1 - min_lookahead;
3801 
3802  for (int i = max_lookahead; i >= min_lookahead; i--) {
3803  BoyerMoorePositionInfo* map = bitmaps_->at(i);
3804  for (int j = 0; j < kSize; j++) {
3805  if (map->at(j)) {
3806  boolean_skip_table->set(j, kDontSkipArrayEntry);
3807  }
3808  }
3809  }
3810 
3811  return skip;
3812 }
3813 
3814 
3815 // See comment above on the implementation of GetSkipTable.
3817  const int kSize = RegExpMacroAssembler::kTableSize;
3818 
3819  int min_lookahead = 0;
3820  int max_lookahead = 0;
3821 
3822  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3823 
3824  bool found_single_character = false;
3825  int single_character = 0;
3826  for (int i = max_lookahead; i >= min_lookahead; i--) {
3827  BoyerMoorePositionInfo* map = bitmaps_->at(i);
3828  if (map->map_count() > 1 ||
3829  (found_single_character && map->map_count() != 0)) {
3830  found_single_character = false;
3831  break;
3832  }
3833  for (int j = 0; j < kSize; j++) {
3834  if (map->at(j)) {
3835  found_single_character = true;
3836  single_character = j;
3837  break;
3838  }
3839  }
3840  }
3841 
3842  int lookahead_width = max_lookahead + 1 - min_lookahead;
3843 
3844  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3845  // The mask-compare can probably handle this better.
3846  return false;
3847  }
3848 
3849  if (found_single_character) {
3850  Label cont, again;
3851  masm->Bind(&again);
3852  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3853  if (max_char_ > kSize) {
3854  masm->CheckCharacterAfterAnd(single_character,
3856  &cont);
3857  } else {
3858  masm->CheckCharacter(single_character, &cont);
3859  }
3860  masm->AdvanceCurrentPosition(lookahead_width);
3861  masm->GoTo(&again);
3862  masm->Bind(&cont);
3863  return true;
3864  }
3865 
3866  Handle<ByteArray> boolean_skip_table =
3867  FACTORY->NewByteArray(kSize, TENURED);
3868  int skip_distance = GetSkipTable(
3869  min_lookahead, max_lookahead, boolean_skip_table);
3870  ASSERT(skip_distance != 0);
3871 
3872  Label cont, again;
3873  masm->Bind(&again);
3874  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3875  masm->CheckBitInTable(boolean_skip_table, &cont);
3876  masm->AdvanceCurrentPosition(skip_distance);
3877  masm->GoTo(&again);
3878  masm->Bind(&cont);
3879 
3880  return true;
3881 }
3882 
3883 
3884 /* Code generation for choice nodes.
3885  *
3886  * We generate quick checks that do a mask and compare to eliminate a
3887  * choice. If the quick check succeeds then it jumps to the continuation to
3888  * do slow checks and check subsequent nodes. If it fails (the common case)
3889  * it falls through to the next choice.
3890  *
3891  * Here is the desired flow graph. Nodes directly below each other imply
3892  * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3893  * 3 doesn't have a quick check so we have to call the slow check.
3894  * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3895  * regexp continuation is generated directly after the Sn node, up to the
3896  * next GoTo if we decide to reuse some already generated code. Some
3897  * nodes expect preload_characters to be preloaded into the current
3898  * character register. R nodes do this preloading. Vertices are marked
3899  * F for failures and S for success (possible success in the case of quick
3900  * nodes). L, V, < and > are used as arrow heads.
3901  *
3902  * ----------> R
3903  * |
3904  * V
3905  * Q1 -----> S1
3906  * | S /
3907  * F| /
3908  * | F/
3909  * | /
3910  * | R
3911  * | /
3912  * V L
3913  * Q2 -----> S2
3914  * | S /
3915  * F| /
3916  * | F/
3917  * | /
3918  * | R
3919  * | /
3920  * V L
3921  * S3
3922  * |
3923  * F|
3924  * |
3925  * R
3926  * |
3927  * backtrack V
3928  * <----------Q4
3929  * \ F |
3930  * \ |S
3931  * \ F V
3932  * \-----S4
3933  *
3934  * For greedy loops we reverse our expectation and expect to match rather
3935  * than fail. Therefore we want the loop code to look like this (U is the
3936  * unwind code that steps back in the greedy loop). The following alternatives
3937  * look the same as above.
3938  * _____
3939  * / \
3940  * V |
3941  * ----------> S1 |
3942  * /| |
3943  * / |S |
3944  * F/ \_____/
3945  * /
3946  * |<-----------
3947  * | \
3948  * V \
3949  * Q2 ---> S2 \
3950  * | S / |
3951  * F| / |
3952  * | F/ |
3953  * | / |
3954  * | R |
3955  * | / |
3956  * F VL |
3957  * <------U |
3958  * back |S |
3959  * \______________/
3960  */
3961 
3962 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3963  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3964  int choice_count = alternatives_->length();
3965 #ifdef DEBUG
3966  for (int i = 0; i < choice_count - 1; i++) {
3967  GuardedAlternative alternative = alternatives_->at(i);
3968  ZoneList<Guard*>* guards = alternative.guards();
3969  int guard_count = (guards == NULL) ? 0 : guards->length();
3970  for (int j = 0; j < guard_count; j++) {
3971  ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
3972  }
3973  }
3974 #endif
3975 
3976  LimitResult limit_result = LimitVersions(compiler, trace);
3977  if (limit_result == DONE) return;
3978  ASSERT(limit_result == CONTINUE);
3979 
3980  int new_flush_budget = trace->flush_budget() / choice_count;
3981  if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3982  trace->Flush(compiler, this);
3983  return;
3984  }
3985 
3986  RecursionCheck rc(compiler);
3987 
3988  Trace* current_trace = trace;
3989 
3990  int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3991  bool greedy_loop = false;
3992  Label greedy_loop_label;
3993  Trace counter_backtrack_trace;
3994  counter_backtrack_trace.set_backtrack(&greedy_loop_label);
3995  if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3996 
3997  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3998  // Here we have special handling for greedy loops containing only text nodes
3999  // and other simple nodes. These are handled by pushing the current
4000  // position on the stack and then incrementing the current position each
4001  // time around the switch. On backtrack we decrement the current position
4002  // and check it against the pushed value. This avoids pushing backtrack
4003  // information for each iteration of the loop, which could take up a lot of
4004  // space.
4005  greedy_loop = true;
4006  ASSERT(trace->stop_node() == NULL);
4007  macro_assembler->PushCurrentPosition();
4008  current_trace = &counter_backtrack_trace;
4009  Label greedy_match_failed;
4010  Trace greedy_match_trace;
4011  if (not_at_start()) greedy_match_trace.set_at_start(false);
4012  greedy_match_trace.set_backtrack(&greedy_match_failed);
4013  Label loop_label;
4014  macro_assembler->Bind(&loop_label);
4015  greedy_match_trace.set_stop_node(this);
4016  greedy_match_trace.set_loop_label(&loop_label);
4017  alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4018  macro_assembler->Bind(&greedy_match_failed);
4019  }
4020 
4021  Label second_choice; // For use in greedy matches.
4022  macro_assembler->Bind(&second_choice);
4023 
4024  int first_normal_choice = greedy_loop ? 1 : 0;
4025 
4026  bool not_at_start = current_trace->at_start() == Trace::FALSE;
4027  const int kEatsAtLeastNotYetInitialized = -1;
4028  int eats_at_least = kEatsAtLeastNotYetInitialized;
4029 
4030  bool skip_was_emitted = false;
4031 
4032  if (!greedy_loop && choice_count == 2) {
4033  GuardedAlternative alt1 = alternatives_->at(1);
4034  if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
4035  RegExpNode* eats_anything_node = alt1.node();
4036  if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
4037  this) {
4038  // At this point we know that we are at a non-greedy loop that will eat
4039  // any character one at a time. Any non-anchored regexp has such a
4040  // loop prepended to it in order to find where it starts. We look for
4041  // a pattern of the form ...abc... where we can look 6 characters ahead
4042  // and step forwards 3 if the character is not one of abc. Abc need
4043  // not be atoms, they can be any reasonably limited character class or
4044  // small alternation.
4045  ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
4046  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
4047  if (lookahead == NULL) {
4048  eats_at_least =
4050  EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
4051  if (eats_at_least >= 1) {
4052  BoyerMooreLookahead* bm =
4053  new(zone()) BoyerMooreLookahead(eats_at_least,
4054  compiler,
4055  zone());
4056  GuardedAlternative alt0 = alternatives_->at(0);
4057  alt0.node()->FillInBMInfo(0, 0, kFillInBMBudget, bm, not_at_start);
4058  skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4059  }
4060  } else {
4061  skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
4062  }
4063  }
4064  }
4065  }
4066 
4067  if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4068  // Save some time by looking at most one machine word ahead.
4069  eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);
4070  }
4071  int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4072 
4073  bool preload_is_current = !skip_was_emitted &&
4074  (current_trace->characters_preloaded() == preload_characters);
4075  bool preload_has_checked_bounds = preload_is_current;
4076 
4077  AlternativeGenerationList alt_gens(choice_count, zone());
4078 
4079  // For now we just call all choices one after the other. The idea ultimately
4080  // is to use the Dispatch table to try only the relevant ones.
4081  for (int i = first_normal_choice; i < choice_count; i++) {
4082  GuardedAlternative alternative = alternatives_->at(i);
4083  AlternativeGeneration* alt_gen = alt_gens.at(i);
4084  alt_gen->quick_check_details.set_characters(preload_characters);
4085  ZoneList<Guard*>* guards = alternative.guards();
4086  int guard_count = (guards == NULL) ? 0 : guards->length();
4087  Trace new_trace(*current_trace);
4088  new_trace.set_characters_preloaded(preload_is_current ?
4089  preload_characters :
4090  0);
4091  if (preload_has_checked_bounds) {
4092  new_trace.set_bound_checked_up_to(preload_characters);
4093  }
4094  new_trace.quick_check_performed()->Clear();
4095  if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
4096  alt_gen->expects_preload = preload_is_current;
4097  bool generate_full_check_inline = false;
4098  if (FLAG_regexp_optimization &&
4100  alternative.node()->EmitQuickCheck(compiler,
4101  &new_trace,
4102  preload_has_checked_bounds,
4103  &alt_gen->possible_success,
4104  &alt_gen->quick_check_details,
4105  i < choice_count - 1)) {
4106  // Quick check was generated for this choice.
4107  preload_is_current = true;
4108  preload_has_checked_bounds = true;
4109  // On the last choice in the ChoiceNode we generated the quick
4110  // check to fall through on possible success. So now we need to
4111  // generate the full check inline.
4112  if (i == choice_count - 1) {
4113  macro_assembler->Bind(&alt_gen->possible_success);
4114  new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4115  new_trace.set_characters_preloaded(preload_characters);
4116  new_trace.set_bound_checked_up_to(preload_characters);
4117  generate_full_check_inline = true;
4118  }
4119  } else if (alt_gen->quick_check_details.cannot_match()) {
4120  if (i == choice_count - 1 && !greedy_loop) {
4121  macro_assembler->GoTo(trace->backtrack());
4122  }
4123  continue;
4124  } else {
4125  // No quick check was generated. Put the full code here.
4126  // If this is not the first choice then there could be slow checks from
4127  // previous cases that go here when they fail. There's no reason to
4128  // insist that they preload characters since the slow check we are about
4129  // to generate probably can't use it.
4130  if (i != first_normal_choice) {
4131  alt_gen->expects_preload = false;
4132  new_trace.InvalidateCurrentCharacter();
4133  }
4134  if (i < choice_count - 1) {
4135  new_trace.set_backtrack(&alt_gen->after);
4136  }
4137  generate_full_check_inline = true;
4138  }
4139  if (generate_full_check_inline) {
4140  if (new_trace.actions() != NULL) {
4141  new_trace.set_flush_budget(new_flush_budget);
4142  }
4143  for (int j = 0; j < guard_count; j++) {
4144  GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4145  }
4146  alternative.node()->Emit(compiler, &new_trace);
4147  preload_is_current = false;
4148  }
4149  macro_assembler->Bind(&alt_gen->after);
4150  }
4151  if (greedy_loop) {
4152  macro_assembler->Bind(&greedy_loop_label);
4153  // If we have unwound to the bottom then backtrack.
4154  macro_assembler->CheckGreedyLoop(trace->backtrack());
4155  // Otherwise try the second priority at an earlier position.
4156  macro_assembler->AdvanceCurrentPosition(-text_length);
4157  macro_assembler->GoTo(&second_choice);
4158  }
4159 
4160  // At this point we need to generate slow checks for the alternatives where
4161  // the quick check was inlined. We can recognize these because the associated
4162  // label was bound.
4163  for (int i = first_normal_choice; i < choice_count - 1; i++) {
4164  AlternativeGeneration* alt_gen = alt_gens.at(i);
4165  Trace new_trace(*current_trace);
4166  // If there are actions to be flushed we have to limit how many times
4167  // they are flushed. Take the budget of the parent trace and distribute
4168  // it fairly amongst the children.
4169  if (new_trace.actions() != NULL) {
4170  new_trace.set_flush_budget(new_flush_budget);
4171  }
4172  EmitOutOfLineContinuation(compiler,
4173  &new_trace,
4174  alternatives_->at(i),
4175  alt_gen,
4176  preload_characters,
4177  alt_gens.at(i + 1)->expects_preload);
4178  }
4179 }
4180 
4181 
4182 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4183  Trace* trace,
4184  GuardedAlternative alternative,
4185  AlternativeGeneration* alt_gen,
4186  int preload_characters,
4187  bool next_expects_preload) {
4188  if (!alt_gen->possible_success.is_linked()) return;
4189 
4190  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4191  macro_assembler->Bind(&alt_gen->possible_success);
4192  Trace out_of_line_trace(*trace);
4193  out_of_line_trace.set_characters_preloaded(preload_characters);
4194  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4195  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
4196  ZoneList<Guard*>* guards = alternative.guards();
4197  int guard_count = (guards == NULL) ? 0 : guards->length();
4198  if (next_expects_preload) {
4199  Label reload_current_char;
4200  out_of_line_trace.set_backtrack(&reload_current_char);
4201  for (int j = 0; j < guard_count; j++) {
4202  GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4203  }
4204  alternative.node()->Emit(compiler, &out_of_line_trace);
4205  macro_assembler->Bind(&reload_current_char);
4206  // Reload the current character, since the next quick check expects that.
4207  // We don't need to check bounds here because we only get into this
4208  // code through a quick check which already did the checked load.
4209  macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4210  NULL,
4211  false,
4212  preload_characters);
4213  macro_assembler->GoTo(&(alt_gen->after));
4214  } else {
4215  out_of_line_trace.set_backtrack(&(alt_gen->after));
4216  for (int j = 0; j < guard_count; j++) {
4217  GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4218  }
4219  alternative.node()->Emit(compiler, &out_of_line_trace);
4220  }
4221 }
4222 
4223 
4224 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4225  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4226  LimitResult limit_result = LimitVersions(compiler, trace);
4227  if (limit_result == DONE) return;
4228  ASSERT(limit_result == CONTINUE);
4229 
4230  RecursionCheck rc(compiler);
4231 
4232  switch (type_) {
4233  case STORE_POSITION: {
4235  new_capture(data_.u_position_register.reg,
4236  data_.u_position_register.is_capture,
4237  trace);
4238  Trace new_trace = *trace;
4239  new_trace.add_action(&new_capture);
4240  on_success()->Emit(compiler, &new_trace);
4241  break;
4242  }
4243  case INCREMENT_REGISTER: {
4245  new_increment(data_.u_increment_register.reg);
4246  Trace new_trace = *trace;
4247  new_trace.add_action(&new_increment);
4248  on_success()->Emit(compiler, &new_trace);
4249  break;
4250  }
4251  case SET_REGISTER: {
4253  new_set(data_.u_store_register.reg, data_.u_store_register.value);
4254  Trace new_trace = *trace;
4255  new_trace.add_action(&new_set);
4256  on_success()->Emit(compiler, &new_trace);
4257  break;
4258  }
4259  case CLEAR_CAPTURES: {
4261  new_capture(Interval(data_.u_clear_captures.range_from,
4262  data_.u_clear_captures.range_to));
4263  Trace new_trace = *trace;
4264  new_trace.add_action(&new_capture);
4265  on_success()->Emit(compiler, &new_trace);
4266  break;
4267  }
4268  case BEGIN_SUBMATCH:
4269  if (!trace->is_trivial()) {
4270  trace->Flush(compiler, this);
4271  } else {
4272  assembler->WriteCurrentPositionToRegister(
4273  data_.u_submatch.current_position_register, 0);
4274  assembler->WriteStackPointerToRegister(
4275  data_.u_submatch.stack_pointer_register);
4276  on_success()->Emit(compiler, trace);
4277  }
4278  break;
4279  case EMPTY_MATCH_CHECK: {
4280  int start_pos_reg = data_.u_empty_match_check.start_register;
4281  int stored_pos = 0;
4282  int rep_reg = data_.u_empty_match_check.repetition_register;
4283  bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4284  bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4285  if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4286  // If we know we haven't advanced and there is no minimum we
4287  // can just backtrack immediately.
4288  assembler->GoTo(trace->backtrack());
4289  } else if (know_dist && stored_pos < trace->cp_offset()) {
4290  // If we know we've advanced we can generate the continuation
4291  // immediately.
4292  on_success()->Emit(compiler, trace);
4293  } else if (!trace->is_trivial()) {
4294  trace->Flush(compiler, this);
4295  } else {
4296  Label skip_empty_check;
4297  // If we have a minimum number of repetitions we check the current
4298  // number first and skip the empty check if it's not enough.
4299  if (has_minimum) {
4300  int limit = data_.u_empty_match_check.repetition_limit;
4301  assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4302  }
4303  // If the match is empty we bail out, otherwise we fall through
4304  // to the on-success continuation.
4305  assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4306  trace->backtrack());
4307  assembler->Bind(&skip_empty_check);
4308  on_success()->Emit(compiler, trace);
4309  }
4310  break;
4311  }
4313  if (!trace->is_trivial()) {
4314  trace->Flush(compiler, this);
4315  return;
4316  }
4318  data_.u_submatch.current_position_register);
4319  assembler->ReadStackPointerFromRegister(
4320  data_.u_submatch.stack_pointer_register);
4321  int clear_register_count = data_.u_submatch.clear_register_count;
4322  if (clear_register_count == 0) {
4323  on_success()->Emit(compiler, trace);
4324  return;
4325  }
4326  int clear_registers_from = data_.u_submatch.clear_register_from;
4327  Label clear_registers_backtrack;
4328  Trace new_trace = *trace;
4329  new_trace.set_backtrack(&clear_registers_backtrack);
4330  on_success()->Emit(compiler, &new_trace);
4331 
4332  assembler->Bind(&clear_registers_backtrack);
4333  int clear_registers_to = clear_registers_from + clear_register_count - 1;
4334  assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4335 
4336  ASSERT(trace->backtrack() == NULL);
4337  assembler->Backtrack();
4338  return;
4339  }
4340  default:
4341  UNREACHABLE();
4342  }
4343 }
4344 
4345 
4347  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4348  if (!trace->is_trivial()) {
4349  trace->Flush(compiler, this);
4350  return;
4351  }
4352 
4353  LimitResult limit_result = LimitVersions(compiler, trace);
4354  if (limit_result == DONE) return;
4355  ASSERT(limit_result == CONTINUE);
4356 
4357  RecursionCheck rc(compiler);
4358 
4359  ASSERT_EQ(start_reg_ + 1, end_reg_);
4360  if (compiler->ignore_case()) {
4361  assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4362  trace->backtrack());
4363  } else {
4364  assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4365  }
4366  on_success()->Emit(compiler, trace);
4367 }
4368 
4369 
4370 // -------------------------------------------------------------------
4371 // Dot/dotty output
4372 
4373 
4374 #ifdef DEBUG
4375 
4376 
4377 class DotPrinter: public NodeVisitor {
4378  public:
4379  explicit DotPrinter(bool ignore_case)
4380  : ignore_case_(ignore_case),
4381  stream_(&alloc_) { }
4382  void PrintNode(const char* label, RegExpNode* node);
4383  void Visit(RegExpNode* node);
4384  void PrintAttributes(RegExpNode* from);
4385  StringStream* stream() { return &stream_; }
4386  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4387 #define DECLARE_VISIT(Type) \
4388  virtual void Visit##Type(Type##Node* that);
4390 #undef DECLARE_VISIT
4391  private:
4392  bool ignore_case_;
4393  HeapStringAllocator alloc_;
4394  StringStream stream_;
4395 };
4396 
4397 
4398 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4399  stream()->Add("digraph G {\n graph [label=\"");
4400  for (int i = 0; label[i]; i++) {
4401  switch (label[i]) {
4402  case '\\':
4403  stream()->Add("\\\\");
4404  break;
4405  case '"':
4406  stream()->Add("\"");
4407  break;
4408  default:
4409  stream()->Put(label[i]);
4410  break;
4411  }
4412  }
4413  stream()->Add("\"];\n");
4414  Visit(node);
4415  stream()->Add("}\n");
4416  printf("%s", *(stream()->ToCString()));
4417 }
4418 
4419 
4420 void DotPrinter::Visit(RegExpNode* node) {
4421  if (node->info()->visited) return;
4422  node->info()->visited = true;
4423  node->Accept(this);
4424 }
4425 
4426 
4427 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4428  stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
4429  Visit(on_failure);
4430 }
4431 
4432 
4433 class TableEntryBodyPrinter {
4434  public:
4435  TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
4436  : stream_(stream), choice_(choice) { }
4437  void Call(uc16 from, DispatchTable::Entry entry) {
4438  OutSet* out_set = entry.out_set();
4439  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4440  if (out_set->Get(i)) {
4441  stream()->Add(" n%p:s%io%i -> n%p;\n",
4442  choice(),
4443  from,
4444  i,
4445  choice()->alternatives()->at(i).node());
4446  }
4447  }
4448  }
4449  private:
4450  StringStream* stream() { return stream_; }
4451  ChoiceNode* choice() { return choice_; }
4452  StringStream* stream_;
4453  ChoiceNode* choice_;
4454 };
4455 
4456 
4457 class TableEntryHeaderPrinter {
4458  public:
4459  explicit TableEntryHeaderPrinter(StringStream* stream)
4460  : first_(true), stream_(stream) { }
4461  void Call(uc16 from, DispatchTable::Entry entry) {
4462  if (first_) {
4463  first_ = false;
4464  } else {
4465  stream()->Add("|");
4466  }
4467  stream()->Add("{\\%k-\\%k|{", from, entry.to());
4468  OutSet* out_set = entry.out_set();
4469  int priority = 0;
4470  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4471  if (out_set->Get(i)) {
4472  if (priority > 0) stream()->Add("|");
4473  stream()->Add("<s%io%i> %i", from, i, priority);
4474  priority++;
4475  }
4476  }
4477  stream()->Add("}}");
4478  }
4479 
4480  private:
4481  bool first_;
4482  StringStream* stream() { return stream_; }
4483  StringStream* stream_;
4484 };
4485 
4486 
4487 class AttributePrinter {
4488  public:
4489  explicit AttributePrinter(DotPrinter* out)
4490  : out_(out), first_(true) { }
4491  void PrintSeparator() {
4492  if (first_) {
4493  first_ = false;
4494  } else {
4495  out_->stream()->Add("|");
4496  }
4497  }
4498  void PrintBit(const char* name, bool value) {
4499  if (!value) return;
4500  PrintSeparator();
4501  out_->stream()->Add("{%s}", name);
4502  }
4503  void PrintPositive(const char* name, int value) {
4504  if (value < 0) return;
4505  PrintSeparator();
4506  out_->stream()->Add("{%s|%x}", name, value);
4507  }
4508  private:
4509  DotPrinter* out_;
4510  bool first_;
4511 };
4512 
4513 
4514 void DotPrinter::PrintAttributes(RegExpNode* that) {
4515  stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
4516  "margin=0.1, fontsize=10, label=\"{",
4517  that);
4518  AttributePrinter printer(this);
4519  NodeInfo* info = that->info();
4520  printer.PrintBit("NI", info->follows_newline_interest);
4521  printer.PrintBit("WI", info->follows_word_interest);
4522  printer.PrintBit("SI", info->follows_start_interest);
4523  Label* label = that->label();
4524  if (label->is_bound())
4525  printer.PrintPositive("@", label->pos());
4526  stream()->Add("}\"];\n");
4527  stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
4528  "arrowhead=none];\n", that, that);
4529 }
4530 
4531 
4532 static const bool kPrintDispatchTable = false;
4533 void DotPrinter::VisitChoice(ChoiceNode* that) {
4534  if (kPrintDispatchTable) {
4535  stream()->Add(" n%p [shape=Mrecord, label=\"", that);
4536  TableEntryHeaderPrinter header_printer(stream());
4537  that->GetTable(ignore_case_)->ForEach(&header_printer);
4538  stream()->Add("\"]\n", that);
4539  PrintAttributes(that);
4540  TableEntryBodyPrinter body_printer(stream(), that);
4541  that->GetTable(ignore_case_)->ForEach(&body_printer);
4542  } else {
4543  stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
4544  for (int i = 0; i < that->alternatives()->length(); i++) {
4545  GuardedAlternative alt = that->alternatives()->at(i);
4546  stream()->Add(" n%p -> n%p;\n", that, alt.node());
4547  }
4548  }
4549  for (int i = 0; i < that->alternatives()->length(); i++) {
4550  GuardedAlternative alt = that->alternatives()->at(i);
4551  alt.node()->Accept(this);
4552  }
4553 }
4554 
4555 
4556 void DotPrinter::VisitText(TextNode* that) {
4557  Zone* zone = that->zone();
4558  stream()->Add(" n%p [label=\"", that);
4559  for (int i = 0; i < that->elements()->length(); i++) {
4560  if (i > 0) stream()->Add(" ");
4561  TextElement elm = that->elements()->at(i);
4562  switch (elm.type) {
4563  case TextElement::ATOM: {
4564  stream()->Add("'%w'", elm.data.u_atom->data());
4565  break;
4566  }
4567  case TextElement::CHAR_CLASS: {
4568  RegExpCharacterClass* node = elm.data.u_char_class;
4569  stream()->Add("[");
4570  if (node->is_negated())
4571  stream()->Add("^");
4572  for (int j = 0; j < node->ranges(zone)->length(); j++) {
4573  CharacterRange range = node->ranges(zone)->at(j);
4574  stream()->Add("%k-%k", range.from(), range.to());
4575  }
4576  stream()->Add("]");
4577  break;
4578  }
4579  default:
4580  UNREACHABLE();
4581  }
4582  }
4583  stream()->Add("\", shape=box, peripheries=2];\n");
4584  PrintAttributes(that);
4585  stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4586  Visit(that->on_success());
4587 }
4588 
4589 
4590 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4591  stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
4592  that,
4593  that->start_register(),
4594  that->end_register());
4595  PrintAttributes(that);
4596  stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4597  Visit(that->on_success());
4598 }
4599 
4600 
4601 void DotPrinter::VisitEnd(EndNode* that) {
4602  stream()->Add(" n%p [style=bold, shape=point];\n", that);
4603  PrintAttributes(that);
4604 }
4605 
4606 
4607 void DotPrinter::VisitAssertion(AssertionNode* that) {
4608  stream()->Add(" n%p [", that);
4609  switch (that->type()) {
4610  case AssertionNode::AT_END:
4611  stream()->Add("label=\"$\", shape=septagon");
4612  break;
4614  stream()->Add("label=\"^\", shape=septagon");
4615  break;
4617  stream()->Add("label=\"\\b\", shape=septagon");
4618  break;
4620  stream()->Add("label=\"\\B\", shape=septagon");
4621  break;
4623  stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4624  break;
4625  }
4626  stream()->Add("];\n");
4627  PrintAttributes(that);
4628  RegExpNode* successor = that->on_success();
4629  stream()->Add(" n%p -> n%p;\n", that, successor);
4630  Visit(successor);
4631 }
4632 
4633 
4634 void DotPrinter::VisitAction(ActionNode* that) {
4635  stream()->Add(" n%p [", that);
4636  switch (that->type_) {
4638  stream()->Add("label=\"$%i:=%i\", shape=octagon",
4639  that->data_.u_store_register.reg,
4640  that->data_.u_store_register.value);
4641  break;
4643  stream()->Add("label=\"$%i++\", shape=octagon",
4644  that->data_.u_increment_register.reg);
4645  break;
4647  stream()->Add("label=\"$%i:=$pos\", shape=octagon",
4648  that->data_.u_position_register.reg);
4649  break;
4651  stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
4652  that->data_.u_submatch.current_position_register);
4653  break;
4655  stream()->Add("label=\"escape\", shape=septagon");
4656  break;
4658  stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
4659  that->data_.u_empty_match_check.start_register,
4660  that->data_.u_empty_match_check.repetition_register,
4661  that->data_.u_empty_match_check.repetition_limit);
4662  break;
4664  stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
4665  that->data_.u_clear_captures.range_from,
4666  that->data_.u_clear_captures.range_to);
4667  break;
4668  }
4669  }
4670  stream()->Add("];\n");
4671  PrintAttributes(that);
4672  RegExpNode* successor = that->on_success();
4673  stream()->Add(" n%p -> n%p;\n", that, successor);
4674  Visit(successor);
4675 }
4676 
4677 
4678 class DispatchTableDumper {
4679  public:
4680  explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
4681  void Call(uc16 key, DispatchTable::Entry entry);
4682  StringStream* stream() { return stream_; }
4683  private:
4684  StringStream* stream_;
4685 };
4686 
4687 
4688 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4689  stream()->Add("[%k-%k]: {", key, entry.to());
4690  OutSet* set = entry.out_set();
4691  bool first = true;
4692  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4693  if (set->Get(i)) {
4694  if (first) {
4695  first = false;
4696  } else {
4697  stream()->Add(", ");
4698  }
4699  stream()->Add("%i", i);
4700  }
4701  }
4702  stream()->Add("}\n");
4703 }
4704 
4705 
4706 void DispatchTable::Dump() {
4707  HeapStringAllocator alloc;
4708  StringStream stream(&alloc);
4709  DispatchTableDumper dumper(&stream);
4710  tree()->ForEach(&dumper);
4711  OS::PrintError("%s", *stream.ToCString());
4712 }
4713 
4714 
4715 void RegExpEngine::DotPrint(const char* label,
4716  RegExpNode* node,
4717  bool ignore_case) {
4718  DotPrinter printer(ignore_case);
4719  printer.PrintNode(label, node);
4720 }
4721 
4722 
4723 #endif // DEBUG
4724 
4725 
4726 // -------------------------------------------------------------------
4727 // Tree to graph conversion
4728 
4730  RegExpNode* on_success) {
4731  ZoneList<TextElement>* elms =
4732  new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4733  elms->Add(TextElement::Atom(this), compiler->zone());
4734  return new(compiler->zone()) TextNode(elms, on_success);
4735 }
4736 
4737 
4739  RegExpNode* on_success) {
4740  return new(compiler->zone()) TextNode(elements(), on_success);
4741 }
4742 
4743 
4744 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4745  const int* special_class,
4746  int length) {
4747  length--; // Remove final 0x10000.
4748  ASSERT(special_class[length] == 0x10000);
4749  ASSERT(ranges->length() != 0);
4750  ASSERT(length != 0);
4751  ASSERT(special_class[0] != 0);
4752  if (ranges->length() != (length >> 1) + 1) {
4753  return false;
4754  }
4755  CharacterRange range = ranges->at(0);
4756  if (range.from() != 0) {
4757  return false;
4758  }
4759  for (int i = 0; i < length; i += 2) {
4760  if (special_class[i] != (range.to() + 1)) {
4761  return false;
4762  }
4763  range = ranges->at((i >> 1) + 1);
4764  if (special_class[i+1] != range.from()) {
4765  return false;
4766  }
4767  }
4768  if (range.to() != 0xffff) {
4769  return false;
4770  }
4771  return true;
4772 }
4773 
4774 
4775 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4776  const int* special_class,
4777  int length) {
4778  length--; // Remove final 0x10000.
4779  ASSERT(special_class[length] == 0x10000);
4780  if (ranges->length() * 2 != length) {
4781  return false;
4782  }
4783  for (int i = 0; i < length; i += 2) {
4784  CharacterRange range = ranges->at(i >> 1);
4785  if (range.from() != special_class[i] ||
4786  range.to() != special_class[i + 1] - 1) {
4787  return false;
4788  }
4789  }
4790  return true;
4791 }
4792 
4793 
4795  // TODO(lrn): Remove need for this function, by not throwing away information
4796  // along the way.
4797  if (is_negated_) {
4798  return false;
4799  }
4800  if (set_.is_standard()) {
4801  return true;
4802  }
4803  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4804  set_.set_standard_set_type('s');
4805  return true;
4806  }
4807  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4808  set_.set_standard_set_type('S');
4809  return true;
4810  }
4811  if (CompareInverseRanges(set_.ranges(zone),
4812  kLineTerminatorRanges,
4813  kLineTerminatorRangeCount)) {
4814  set_.set_standard_set_type('.');
4815  return true;
4816  }
4817  if (CompareRanges(set_.ranges(zone),
4818  kLineTerminatorRanges,
4819  kLineTerminatorRangeCount)) {
4820  set_.set_standard_set_type('n');
4821  return true;
4822  }
4823  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4824  set_.set_standard_set_type('w');
4825  return true;
4826  }
4827  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4828  set_.set_standard_set_type('W');
4829  return true;
4830  }
4831  return false;
4832 }
4833 
4834 
4836  RegExpNode* on_success) {
4837  return new(compiler->zone()) TextNode(this, on_success);
4838 }
4839 
4840 
4842  RegExpNode* on_success) {
4844  int length = alternatives->length();
4845  ChoiceNode* result =
4846  new(compiler->zone()) ChoiceNode(length, compiler->zone());
4847  for (int i = 0; i < length; i++) {
4848  GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4849  on_success));
4850  result->AddAlternative(alternative);
4851  }
4852  return result;
4853 }
4854 
4855 
4857  RegExpNode* on_success) {
4858  return ToNode(min(),
4859  max(),
4860  is_greedy(),
4861  body(),
4862  compiler,
4863  on_success);
4864 }
4865 
4866 
4867 // Scoped object to keep track of how much we unroll quantifier loops in the
4868 // regexp graph generator.
4870  public:
4871  static const int kMaxExpansionFactor = 6;
4873  : compiler_(compiler),
4874  saved_expansion_factor_(compiler->current_expansion_factor()),
4875  ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4876  ASSERT(factor > 0);
4877  if (ok_to_expand_) {
4878  if (factor > kMaxExpansionFactor) {
4879  // Avoid integer overflow of the current expansion factor.
4880  ok_to_expand_ = false;
4882  } else {
4883  int new_factor = saved_expansion_factor_ * factor;
4884  ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4885  compiler->set_current_expansion_factor(new_factor);
4886  }
4887  }
4888  }
4889 
4891  compiler_->set_current_expansion_factor(saved_expansion_factor_);
4892  }
4893 
4894  bool ok_to_expand() { return ok_to_expand_; }
4895 
4896  private:
4897  RegExpCompiler* compiler_;
4898  int saved_expansion_factor_;
4899  bool ok_to_expand_;
4900 
4901  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4902 };
4903 
4904 
4906  int max,
4907  bool is_greedy,
4908  RegExpTree* body,
4909  RegExpCompiler* compiler,
4910  RegExpNode* on_success,
4911  bool not_at_start) {
4912  // x{f, t} becomes this:
4913  //
4914  // (r++)<-.
4915  // | `
4916  // | (x)
4917  // v ^
4918  // (r=0)-->(?)---/ [if r < t]
4919  // |
4920  // [if r >= f] \----> ...
4921  //
4922 
4923  // 15.10.2.5 RepeatMatcher algorithm.
4924  // The parser has already eliminated the case where max is 0. In the case
4925  // where max_match is zero the parser has removed the quantifier if min was
4926  // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4927 
4928  // If we know that we cannot match zero length then things are a little
4929  // simpler since we don't need to make the special zero length match check
4930  // from step 2.1. If the min and max are small we can unroll a little in
4931  // this case.
4932  static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4933  static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4934  if (max == 0) return on_success; // This can happen due to recursion.
4935  bool body_can_be_empty = (body->min_match() == 0);
4936  int body_start_reg = RegExpCompiler::kNoRegister;
4937  Interval capture_registers = body->CaptureRegisters();
4938  bool needs_capture_clearing = !capture_registers.is_empty();
4939  Zone* zone = compiler->zone();
4940 
4941  if (body_can_be_empty) {
4942  body_start_reg = compiler->AllocateRegister();
4943  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4944  // Only unroll if there are no captures and the body can't be
4945  // empty.
4946  {
4947  RegExpExpansionLimiter limiter(
4948  compiler, min + ((max != min) ? 1 : 0));
4949  if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4950  int new_max = (max == kInfinity) ? max : max - min;
4951  // Recurse once to get the loop or optional matches after the fixed
4952  // ones.
4953  RegExpNode* answer = ToNode(
4954  0, new_max, is_greedy, body, compiler, on_success, true);
4955  // Unroll the forced matches from 0 to min. This can cause chains of
4956  // TextNodes (which the parser does not generate). These should be
4957  // combined if it turns out they hinder good code generation.
4958  for (int i = 0; i < min; i++) {
4959  answer = body->ToNode(compiler, answer);
4960  }
4961  return answer;
4962  }
4963  }
4964  if (max <= kMaxUnrolledMaxMatches && min == 0) {
4965  ASSERT(max > 0); // Due to the 'if' above.
4966  RegExpExpansionLimiter limiter(compiler, max);
4967  if (limiter.ok_to_expand()) {
4968  // Unroll the optional matches up to max.
4969  RegExpNode* answer = on_success;
4970  for (int i = 0; i < max; i++) {
4971  ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
4972  if (is_greedy) {
4973  alternation->AddAlternative(
4974  GuardedAlternative(body->ToNode(compiler, answer)));
4975  alternation->AddAlternative(GuardedAlternative(on_success));
4976  } else {
4977  alternation->AddAlternative(GuardedAlternative(on_success));
4978  alternation->AddAlternative(
4979  GuardedAlternative(body->ToNode(compiler, answer)));
4980  }
4981  answer = alternation;
4982  if (not_at_start) alternation->set_not_at_start();
4983  }
4984  return answer;
4985  }
4986  }
4987  }
4988  bool has_min = min > 0;
4989  bool has_max = max < RegExpTree::kInfinity;
4990  bool needs_counter = has_min || has_max;
4991  int reg_ctr = needs_counter
4992  ? compiler->AllocateRegister()
4994  LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4995  zone);
4996  if (not_at_start) center->set_not_at_start();
4997  RegExpNode* loop_return = needs_counter
4998  ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4999  : static_cast<RegExpNode*>(center);
5000  if (body_can_be_empty) {
5001  // If the body can be empty we need to check if it was and then
5002  // backtrack.
5003  loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5004  reg_ctr,
5005  min,
5006  loop_return);
5007  }
5008  RegExpNode* body_node = body->ToNode(compiler, loop_return);
5009  if (body_can_be_empty) {
5010  // If the body can be empty we need to store the start position
5011  // so we can bail out if it was empty.
5012  body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5013  }
5014  if (needs_capture_clearing) {
5015  // Before entering the body of this loop we need to clear captures.
5016  body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5017  }
5018  GuardedAlternative body_alt(body_node);
5019  if (has_max) {
5020  Guard* body_guard =
5021  new(zone) Guard(reg_ctr, Guard::LT, max);
5022  body_alt.AddGuard(body_guard, zone);
5023  }
5024  GuardedAlternative rest_alt(on_success);
5025  if (has_min) {
5026  Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5027  rest_alt.AddGuard(rest_guard, zone);
5028  }
5029  if (is_greedy) {
5030  center->AddLoopAlternative(body_alt);
5031  center->AddContinueAlternative(rest_alt);
5032  } else {
5033  center->AddContinueAlternative(rest_alt);
5034  center->AddLoopAlternative(body_alt);
5035  }
5036  if (needs_counter) {
5037  return ActionNode::SetRegister(reg_ctr, 0, center);
5038  } else {
5039  return center;
5040  }
5041 }
5042 
5043 
5045  RegExpNode* on_success) {
5046  NodeInfo info;
5047  Zone* zone = compiler->zone();
5048 
5049  switch (type()) {
5050  case START_OF_LINE:
5051  return AssertionNode::AfterNewline(on_success);
5052  case START_OF_INPUT:
5053  return AssertionNode::AtStart(on_success);
5054  case BOUNDARY:
5055  return AssertionNode::AtBoundary(on_success);
5056  case NON_BOUNDARY:
5057  return AssertionNode::AtNonBoundary(on_success);
5058  case END_OF_INPUT:
5059  return AssertionNode::AtEnd(on_success);
5060  case END_OF_LINE: {
5061  // Compile $ in multiline regexps as an alternation with a positive
5062  // lookahead in one side and an end-of-input on the other side.
5063  // We need two registers for the lookahead.
5064  int stack_pointer_register = compiler->AllocateRegister();
5065  int position_register = compiler->AllocateRegister();
5066  // The ChoiceNode to distinguish between a newline and end-of-input.
5067  ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5068  // Create a newline atom.
5069  ZoneList<CharacterRange>* newline_ranges =
5070  new(zone) ZoneList<CharacterRange>(3, zone);
5071  CharacterRange::AddClassEscape('n', newline_ranges, zone);
5072  RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5073  TextNode* newline_matcher = new(zone) TextNode(
5074  newline_atom,
5075  ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5076  position_register,
5077  0, // No captures inside.
5078  -1, // Ignored if no captures.
5079  on_success));
5080  // Create an end-of-input matcher.
5081  RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5082  stack_pointer_register,
5083  position_register,
5084  newline_matcher);
5085  // Add the two alternatives to the ChoiceNode.
5086  GuardedAlternative eol_alternative(end_of_line);
5087  result->AddAlternative(eol_alternative);
5088  GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5089  result->AddAlternative(end_alternative);
5090  return result;
5091  }
5092  default:
5093  UNREACHABLE();
5094  }
5095  return on_success;
5096 }
5097 
5098 
5100  RegExpNode* on_success) {
5101  return new(compiler->zone())
5104  on_success);
5105 }
5106 
5107 
5109  RegExpNode* on_success) {
5110  return on_success;
5111 }
5112 
5113 
5115  RegExpNode* on_success) {
5116  int stack_pointer_register = compiler->AllocateRegister();
5117  int position_register = compiler->AllocateRegister();
5118 
5119  const int registers_per_capture = 2;
5120  const int register_of_first_capture = 2;
5121  int register_count = capture_count_ * registers_per_capture;
5122  int register_start =
5123  register_of_first_capture + capture_from_ * registers_per_capture;
5124 
5125  RegExpNode* success;
5126  if (is_positive()) {
5128  stack_pointer_register,
5129  position_register,
5130  body()->ToNode(
5131  compiler,
5132  ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5133  position_register,
5134  register_count,
5135  register_start,
5136  on_success)));
5137  return node;
5138  } else {
5139  // We use a ChoiceNode for a negative lookahead because it has most of
5140  // the characteristics we need. It has the body of the lookahead as its
5141  // first alternative and the expression after the lookahead of the second
5142  // alternative. If the first alternative succeeds then the
5143  // NegativeSubmatchSuccess will unwind the stack including everything the
5144  // choice node set up and backtrack. If the first alternative fails then
5145  // the second alternative is tried, which is exactly the desired result
5146  // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5147  // ChoiceNode that knows to ignore the first exit when calculating quick
5148  // checks.
5149  Zone* zone = compiler->zone();
5150 
5151  GuardedAlternative body_alt(
5152  body()->ToNode(
5153  compiler,
5154  success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5155  position_register,
5156  register_count,
5157  register_start,
5158  zone)));
5159  ChoiceNode* choice_node =
5160  new(zone) NegativeLookaheadChoiceNode(body_alt,
5161  GuardedAlternative(on_success),
5162  zone);
5163  return ActionNode::BeginSubmatch(stack_pointer_register,
5164  position_register,
5165  choice_node);
5166  }
5167 }
5168 
5169 
5171  RegExpNode* on_success) {
5172  return ToNode(body(), index(), compiler, on_success);
5173 }
5174 
5175 
5177  int index,
5178  RegExpCompiler* compiler,
5179  RegExpNode* on_success) {
5180  int start_reg = RegExpCapture::StartRegister(index);
5181  int end_reg = RegExpCapture::EndRegister(index);
5182  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5183  RegExpNode* body_node = body->ToNode(compiler, store_end);
5184  return ActionNode::StorePosition(start_reg, true, body_node);
5185 }
5186 
5187 
5189  RegExpNode* on_success) {
5190  ZoneList<RegExpTree*>* children = nodes();
5191  RegExpNode* current = on_success;
5192  for (int i = children->length() - 1; i >= 0; i--) {
5193  current = children->at(i)->ToNode(compiler, current);
5194  }
5195  return current;
5196 }
5197 
5198 
5199 static void AddClass(const int* elmv,
5200  int elmc,
5201  ZoneList<CharacterRange>* ranges,
5202  Zone* zone) {
5203  elmc--;
5204  ASSERT(elmv[elmc] == 0x10000);
5205  for (int i = 0; i < elmc; i += 2) {
5206  ASSERT(elmv[i] < elmv[i + 1]);
5207  ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5208  }
5209 }
5210 
5211 
5212 static void AddClassNegated(const int *elmv,
5213  int elmc,
5214  ZoneList<CharacterRange>* ranges,
5215  Zone* zone) {
5216  elmc--;
5217  ASSERT(elmv[elmc] == 0x10000);
5218  ASSERT(elmv[0] != 0x0000);
5219  ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5220  uc16 last = 0x0000;
5221  for (int i = 0; i < elmc; i += 2) {
5222  ASSERT(last <= elmv[i] - 1);
5223  ASSERT(elmv[i] < elmv[i + 1]);
5224  ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5225  last = elmv[i + 1];
5226  }
5227  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5228 }
5229 
5230 
5232  ZoneList<CharacterRange>* ranges,
5233  Zone* zone) {
5234  switch (type) {
5235  case 's':
5236  AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5237  break;
5238  case 'S':
5239  AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5240  break;
5241  case 'w':
5242  AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5243  break;
5244  case 'W':
5245  AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5246  break;
5247  case 'd':
5248  AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5249  break;
5250  case 'D':
5251  AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5252  break;
5253  case '.':
5254  AddClassNegated(kLineTerminatorRanges,
5255  kLineTerminatorRangeCount,
5256  ranges,
5257  zone);
5258  break;
5259  // This is not a character range as defined by the spec but a
5260  // convenient shorthand for a character class that matches any
5261  // character.
5262  case '*':
5263  ranges->Add(CharacterRange::Everything(), zone);
5264  break;
5265  // This is the set of characters matched by the $ and ^ symbols
5266  // in multiline mode.
5267  case 'n':
5268  AddClass(kLineTerminatorRanges,
5269  kLineTerminatorRangeCount,
5270  ranges,
5271  zone);
5272  break;
5273  default:
5274  UNREACHABLE();
5275  }
5276 }
5277 
5278 
5280  return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5281 }
5282 
5283 
5285  public:
5287  ZoneList<CharacterRange>** excluded,
5288  Zone* zone)
5289  : included_(included),
5290  excluded_(excluded),
5291  zone_(zone) { }
5292  void Call(uc16 from, DispatchTable::Entry entry);
5293 
5294  static const int kInBase = 0;
5295  static const int kInOverlay = 1;
5296 
5297  private:
5298  ZoneList<CharacterRange>** included_;
5299  ZoneList<CharacterRange>** excluded_;
5300  Zone* zone_;
5301 };
5302 
5303 
5305  if (!entry.out_set()->Get(kInBase)) return;
5306  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5307  ? included_
5308  : excluded_;
5309  if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5310  (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5311 }
5312 
5313 
5315  Vector<const int> overlay,
5316  ZoneList<CharacterRange>** included,
5317  ZoneList<CharacterRange>** excluded,
5318  Zone* zone) {
5319  ASSERT_EQ(NULL, *included);
5320  ASSERT_EQ(NULL, *excluded);
5321  DispatchTable table(zone);
5322  for (int i = 0; i < base->length(); i++)
5323  table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5324  for (int i = 0; i < overlay.length(); i += 2) {
5325  table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5327  }
5328  CharacterRangeSplitter callback(included, excluded, zone);
5329  table.ForEach(&callback);
5330 }
5331 
5332 
5334  bool is_ascii,
5335  Zone* zone) {
5336  Isolate* isolate = Isolate::Current();
5337  uc16 bottom = from();
5338  uc16 top = to();
5339  if (is_ascii) {
5340  if (bottom > String::kMaxAsciiCharCode) return;
5342  }
5344  if (top == bottom) {
5345  // If this is a singleton we just expand the one character.
5346  int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5347  for (int i = 0; i < length; i++) {
5348  uc32 chr = chars[i];
5349  if (chr != bottom) {
5350  ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5351  }
5352  }
5353  } else {
5354  // If this is a range we expand the characters block by block,
5355  // expanding contiguous subranges (blocks) one at a time.
5356  // The approach is as follows. For a given start character we
5357  // look up the remainder of the block that contains it (represented
5358  // by the end point), for instance we find 'z' if the character
5359  // is 'c'. A block is characterized by the property
5360  // that all characters uncanonicalize in the same way, except that
5361  // each entry in the result is incremented by the distance from the first
5362  // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5363  // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5364  // Once we've found the end point we look up its uncanonicalization
5365  // and produce a range for each element. For instance for [c-f]
5366  // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
5367  // add a range if it is not already contained in the input, so [c-f]
5368  // will be skipped but [C-F] will be added. If this range is not
5369  // completely contained in a block we do this for all the blocks
5370  // covered by the range (handling characters that is not in a block
5371  // as a "singleton block").
5373  int pos = bottom;
5374  while (pos <= top) {
5375  int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5376  uc16 block_end;
5377  if (length == 0) {
5378  block_end = pos;
5379  } else {
5380  ASSERT_EQ(1, length);
5381  block_end = range[0];
5382  }
5383  int end = (block_end > top) ? top : block_end;
5384  length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5385  for (int i = 0; i < length; i++) {
5386  uc32 c = range[i];
5387  uc16 range_from = c - (block_end - pos);
5388  uc16 range_to = c - (block_end - end);
5389  if (!(bottom <= range_from && range_to <= top)) {
5390  ranges->Add(CharacterRange(range_from, range_to), zone);
5391  }
5392  }
5393  pos = end + 1;
5394  }
5395  }
5396 }
5397 
5398 
5400  ASSERT_NOT_NULL(ranges);
5401  int n = ranges->length();
5402  if (n <= 1) return true;
5403  int max = ranges->at(0).to();
5404  for (int i = 1; i < n; i++) {
5405  CharacterRange next_range = ranges->at(i);
5406  if (next_range.from() <= max + 1) return false;
5407  max = next_range.to();
5408  }
5409  return true;
5410 }
5411 
5412 
5413 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5414  if (ranges_ == NULL) {
5415  ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5416  CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5417  }
5418  return ranges_;
5419 }
5420 
5421 
5422 // Move a number of elements in a zonelist to another position
5423 // in the same list. Handles overlapping source and target areas.
5424 static void MoveRanges(ZoneList<CharacterRange>* list,
5425  int from,
5426  int to,
5427  int count) {
5428  // Ranges are potentially overlapping.
5429  if (from < to) {
5430  for (int i = count - 1; i >= 0; i--) {
5431  list->at(to + i) = list->at(from + i);
5432  }
5433  } else {
5434  for (int i = 0; i < count; i++) {
5435  list->at(to + i) = list->at(from + i);
5436  }
5437  }
5438 }
5439 
5440 
5441 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5442  int count,
5443  CharacterRange insert) {
5444  // Inserts a range into list[0..count[, which must be sorted
5445  // by from value and non-overlapping and non-adjacent, using at most
5446  // list[0..count] for the result. Returns the number of resulting
5447  // canonicalized ranges. Inserting a range may collapse existing ranges into
5448  // fewer ranges, so the return value can be anything in the range 1..count+1.
5449  uc16 from = insert.from();
5450  uc16 to = insert.to();
5451  int start_pos = 0;
5452  int end_pos = count;
5453  for (int i = count - 1; i >= 0; i--) {
5454  CharacterRange current = list->at(i);
5455  if (current.from() > to + 1) {
5456  end_pos = i;
5457  } else if (current.to() + 1 < from) {
5458  start_pos = i + 1;
5459  break;
5460  }
5461  }
5462 
5463  // Inserted range overlaps, or is adjacent to, ranges at positions
5464  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5465  // not affected by the insertion.
5466  // If start_pos == end_pos, the range must be inserted before start_pos.
5467  // if start_pos < end_pos, the entire range from start_pos to end_pos
5468  // must be merged with the insert range.
5469 
5470  if (start_pos == end_pos) {
5471  // Insert between existing ranges at position start_pos.
5472  if (start_pos < count) {
5473  MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5474  }
5475  list->at(start_pos) = insert;
5476  return count + 1;
5477  }
5478  if (start_pos + 1 == end_pos) {
5479  // Replace single existing range at position start_pos.
5480  CharacterRange to_replace = list->at(start_pos);
5481  int new_from = Min(to_replace.from(), from);
5482  int new_to = Max(to_replace.to(), to);
5483  list->at(start_pos) = CharacterRange(new_from, new_to);
5484  return count;
5485  }
5486  // Replace a number of existing ranges from start_pos to end_pos - 1.
5487  // Move the remaining ranges down.
5488 
5489  int new_from = Min(list->at(start_pos).from(), from);
5490  int new_to = Max(list->at(end_pos - 1).to(), to);
5491  if (end_pos < count) {
5492  MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5493  }
5494  list->at(start_pos) = CharacterRange(new_from, new_to);
5495  return count - (end_pos - start_pos) + 1;
5496 }
5497 
5498 
5500  // Special/default classes are always considered canonical. The result
5501  // of calling ranges() will be sorted.
5502  if (ranges_ == NULL) return;
5504 }
5505 
5506 
5508  if (character_ranges->length() <= 1) return;
5509  // Check whether ranges are already canonical (increasing, non-overlapping,
5510  // non-adjacent).
5511  int n = character_ranges->length();
5512  int max = character_ranges->at(0).to();
5513  int i = 1;
5514  while (i < n) {
5515  CharacterRange current = character_ranges->at(i);
5516  if (current.from() <= max + 1) {
5517  break;
5518  }
5519  max = current.to();
5520  i++;
5521  }
5522  // Canonical until the i'th range. If that's all of them, we are done.
5523  if (i == n) return;
5524 
5525  // The ranges at index i and forward are not canonicalized. Make them so by
5526  // doing the equivalent of insertion sort (inserting each into the previous
5527  // list, in order).
5528  // Notice that inserting a range can reduce the number of ranges in the
5529  // result due to combining of adjacent and overlapping ranges.
5530  int read = i; // Range to insert.
5531  int num_canonical = i; // Length of canonicalized part of list.
5532  do {
5533  num_canonical = InsertRangeInCanonicalList(character_ranges,
5534  num_canonical,
5535  character_ranges->at(read));
5536  read++;
5537  } while (read < n);
5538  character_ranges->Rewind(num_canonical);
5539 
5540  ASSERT(CharacterRange::IsCanonical(character_ranges));
5541 }
5542 
5543 
5545  ZoneList<CharacterRange>* negated_ranges,
5546  Zone* zone) {
5548  ASSERT_EQ(0, negated_ranges->length());
5549  int range_count = ranges->length();
5550  uc16 from = 0;
5551  int i = 0;
5552  if (range_count > 0 && ranges->at(0).from() == 0) {
5553  from = ranges->at(0).to();
5554  i = 1;
5555  }
5556  while (i < range_count) {
5557  CharacterRange range = ranges->at(i);
5558  negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5559  from = range.to();
5560  i++;
5561  }
5562  if (from < String::kMaxUtf16CodeUnit) {
5563  negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5564  zone);
5565  }
5566 }
5567 
5568 
5569 // -------------------------------------------------------------------
5570 // Splay tree
5571 
5572 
5573 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5574  if (Get(value))
5575  return this;
5576  if (successors(zone) != NULL) {
5577  for (int i = 0; i < successors(zone)->length(); i++) {
5578  OutSet* successor = successors(zone)->at(i);
5579  if (successor->Get(value))
5580  return successor;
5581  }
5582  } else {
5583  successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5584  }
5585  OutSet* result = new(zone) OutSet(first_, remaining_);
5586  result->Set(value, zone);
5587  successors(zone)->Add(result, zone);
5588  return result;
5589 }
5590 
5591 
5592 void OutSet::Set(unsigned value, Zone *zone) {
5593  if (value < kFirstLimit) {
5594  first_ |= (1 << value);
5595  } else {
5596  if (remaining_ == NULL)
5597  remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5598  if (remaining_->is_empty() || !remaining_->Contains(value))
5599  remaining_->Add(value, zone);
5600  }
5601 }
5602 
5603 
5604 bool OutSet::Get(unsigned value) {
5605  if (value < kFirstLimit) {
5606  return (first_ & (1 << value)) != 0;
5607  } else if (remaining_ == NULL) {
5608  return false;
5609  } else {
5610  return remaining_->Contains(value);
5611  }
5612 }
5613 
5614 
5616 
5617 
5618 void DispatchTable::AddRange(CharacterRange full_range, int value,
5619  Zone* zone) {
5620  CharacterRange current = full_range;
5621  if (tree()->is_empty()) {
5622  // If this is the first range we just insert into the table.
5624  ASSERT_RESULT(tree()->Insert(current.from(), &loc));
5625  loc.set_value(Entry(current.from(), current.to(),
5626  empty()->Extend(value, zone)));
5627  return;
5628  }
5629  // First see if there is a range to the left of this one that
5630  // overlaps.
5632  if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5633  Entry* entry = &loc.value();
5634  // If we've found a range that overlaps with this one, and it
5635  // starts strictly to the left of this one, we have to fix it
5636  // because the following code only handles ranges that start on
5637  // or after the start point of the range we're adding.
5638  if (entry->from() < current.from() && entry->to() >= current.from()) {
5639  // Snap the overlapping range in half around the start point of
5640  // the range we're adding.
5641  CharacterRange left(entry->from(), current.from() - 1);
5642  CharacterRange right(current.from(), entry->to());
5643  // The left part of the overlapping range doesn't overlap.
5644  // Truncate the whole entry to be just the left part.
5645  entry->set_to(left.to());
5646  // The right part is the one that overlaps. We add this part
5647  // to the map and let the next step deal with merging it with
5648  // the range we're adding.
5650  ASSERT_RESULT(tree()->Insert(right.from(), &loc));
5651  loc.set_value(Entry(right.from(),
5652  right.to(),
5653  entry->out_set()));
5654  }
5655  }
5656  while (current.is_valid()) {
5657  if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5658  (loc.value().from() <= current.to()) &&
5659  (loc.value().to() >= current.from())) {
5660  Entry* entry = &loc.value();
5661  // We have overlap. If there is space between the start point of
5662  // the range we're adding and where the overlapping range starts
5663  // then we have to add a range covering just that space.
5664  if (current.from() < entry->from()) {
5666  ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5667  ins.set_value(Entry(current.from(),
5668  entry->from() - 1,
5669  empty()->Extend(value, zone)));
5670  current.set_from(entry->from());
5671  }
5672  ASSERT_EQ(current.from(), entry->from());
5673  // If the overlapping range extends beyond the one we want to add
5674  // we have to snap the right part off and add it separately.
5675  if (entry->to() > current.to()) {
5677  ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
5678  ins.set_value(Entry(current.to() + 1,
5679  entry->to(),
5680  entry->out_set()));
5681  entry->set_to(current.to());
5682  }
5683  ASSERT(entry->to() <= current.to());
5684  // The overlapping range is now completely contained by the range
5685  // we're adding so we can just update it and move the start point
5686  // of the range we're adding just past it.
5687  entry->AddValue(value, zone);
5688  // Bail out if the last interval ended at 0xFFFF since otherwise
5689  // adding 1 will wrap around to 0.
5690  if (entry->to() == String::kMaxUtf16CodeUnit)
5691  break;
5692  ASSERT(entry->to() + 1 > current.from());
5693  current.set_from(entry->to() + 1);
5694  } else {
5695  // There is no overlap so we can just add the range
5697  ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5698  ins.set_value(Entry(current.from(),
5699  current.to(),
5700  empty()->Extend(value, zone)));
5701  break;
5702  }
5703  }
5704 }
5705 
5706 
5709  if (!tree()->FindGreatestLessThan(value, &loc))
5710  return empty();
5711  Entry* entry = &loc.value();
5712  if (value <= entry->to())
5713  return entry->out_set();
5714  else
5715  return empty();
5716 }
5717 
5718 
5719 // -------------------------------------------------------------------
5720 // Analysis
5721 
5722 
5724  StackLimitCheck check(Isolate::Current());
5725  if (check.HasOverflowed()) {
5726  fail("Stack overflow");
5727  return;
5728  }
5729  if (that->info()->been_analyzed || that->info()->being_analyzed)
5730  return;
5731  that->info()->being_analyzed = true;
5732  that->Accept(this);
5733  that->info()->being_analyzed = false;
5734  that->info()->been_analyzed = true;
5735 }
5736 
5737 
5738 void Analysis::VisitEnd(EndNode* that) {
5739  // nothing to do
5740 }
5741 
5742 
5744  int element_count = elements()->length();
5745  // Set up the offsets of the elements relative to the start. This is a fixed
5746  // quantity since a TextNode can only contain fixed-width things.
5747  int cp_offset = 0;
5748  for (int i = 0; i < element_count; i++) {
5749  TextElement& elm = elements()->at(i);
5750  elm.cp_offset = cp_offset;
5751  if (elm.type == TextElement::ATOM) {
5752  cp_offset += elm.data.u_atom->data().length();
5753  } else {
5754  cp_offset++;
5755  }
5756  }
5757 }
5758 
5759 
5760 void Analysis::VisitText(TextNode* that) {
5761  if (ignore_case_) {
5762  that->MakeCaseIndependent(is_ascii_);
5763  }
5764  EnsureAnalyzed(that->on_success());
5765  if (!has_failed()) {
5766  that->CalculateOffsets();
5767  }
5768 }
5769 
5770 
5771 void Analysis::VisitAction(ActionNode* that) {
5772  RegExpNode* target = that->on_success();
5773  EnsureAnalyzed(target);
5774  if (!has_failed()) {
5775  // If the next node is interested in what it follows then this node
5776  // has to be interested too so it can pass the information on.
5777  that->info()->AddFromFollowing(target->info());
5778  }
5779 }
5780 
5781 
5782 void Analysis::VisitChoice(ChoiceNode* that) {
5783  NodeInfo* info = that->info();
5784  for (int i = 0; i < that->alternatives()->length(); i++) {
5785  RegExpNode* node = that->alternatives()->at(i).node();
5786  EnsureAnalyzed(node);
5787  if (has_failed()) return;
5788  // Anything the following nodes need to know has to be known by
5789  // this node also, so it can pass it on.
5790  info->AddFromFollowing(node->info());
5791  }
5792 }
5793 
5794 
5796  NodeInfo* info = that->info();
5797  for (int i = 0; i < that->alternatives()->length(); i++) {
5798  RegExpNode* node = that->alternatives()->at(i).node();
5799  if (node != that->loop_node()) {
5800  EnsureAnalyzed(node);
5801  if (has_failed()) return;
5802  info->AddFromFollowing(node->info());
5803  }
5804  }
5805  // Check the loop last since it may need the value of this node
5806  // to get a correct result.
5807  EnsureAnalyzed(that->loop_node());
5808  if (!has_failed()) {
5809  info->AddFromFollowing(that->loop_node()->info());
5810  }
5811 }
5812 
5813 
5814 void Analysis::VisitBackReference(BackReferenceNode* that) {
5815  EnsureAnalyzed(that->on_success());
5816 }
5817 
5818 
5819 void Analysis::VisitAssertion(AssertionNode* that) {
5820  EnsureAnalyzed(that->on_success());
5821 }
5822 
5823 
5825  int recursion_depth,
5826  int budget,
5827  BoyerMooreLookahead* bm,
5828  bool not_at_start) {
5829  // Working out the set of characters that a backreference can match is too
5830  // hard, so we just say that any character can match.
5831  bm->SetRest(offset);
5832  SaveBMInfo(bm, not_at_start, offset);
5833 }
5834 
5835 
5838 
5839 
5841  int recursion_depth,
5842  int budget,
5843  BoyerMooreLookahead* bm,
5844  bool not_at_start) {
5846  budget = (budget - 1) / alts->length();
5847  for (int i = 0; i < alts->length(); i++) {
5848  GuardedAlternative& alt = alts->at(i);
5849  if (alt.guards() != NULL && alt.guards()->length() != 0) {
5850  bm->SetRest(offset); // Give up trying to fill in info.
5851  SaveBMInfo(bm, not_at_start, offset);
5852  return;
5853  }
5854  alt.node()->FillInBMInfo(
5855  offset, recursion_depth + 1, budget, bm, not_at_start);
5856  }
5857  SaveBMInfo(bm, not_at_start, offset);
5858 }
5859 
5860 
5861 void TextNode::FillInBMInfo(int initial_offset,
5862  int recursion_depth,
5863  int budget,
5864  BoyerMooreLookahead* bm,
5865  bool not_at_start) {
5866  if (initial_offset >= bm->length()) return;
5867  int offset = initial_offset;
5868  int max_char = bm->max_char();
5869  for (int i = 0; i < elements()->length(); i++) {
5870  if (offset >= bm->length()) {
5871  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5872  return;
5873  }
5874  TextElement text = elements()->at(i);
5875  if (text.type == TextElement::ATOM) {
5876  RegExpAtom* atom = text.data.u_atom;
5877  for (int j = 0; j < atom->length(); j++, offset++) {
5878  if (offset >= bm->length()) {
5879  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5880  return;
5881  }
5882  uc16 character = atom->data()[j];
5883  if (bm->compiler()->ignore_case()) {
5885  int length = GetCaseIndependentLetters(
5886  ISOLATE,
5887  character,
5889  chars);
5890  for (int j = 0; j < length; j++) {
5891  bm->Set(offset, chars[j]);
5892  }
5893  } else {
5894  if (character <= max_char) bm->Set(offset, character);
5895  }
5896  }
5897  } else {
5899  RegExpCharacterClass* char_class = text.data.u_char_class;
5900  ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
5901  if (char_class->is_negated()) {
5902  bm->SetAll(offset);
5903  } else {
5904  for (int k = 0; k < ranges->length(); k++) {
5905  CharacterRange& range = ranges->at(k);
5906  if (range.from() > max_char) continue;
5907  int to = Min(max_char, static_cast<int>(range.to()));
5908  bm->SetInterval(offset, Interval(range.from(), to));
5909  }
5910  }
5911  offset++;
5912  }
5913  }
5914  if (offset >= bm->length()) {
5915  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5916  return;
5917  }
5918  on_success()->FillInBMInfo(offset,
5919  recursion_depth + 1,
5920  budget - 1,
5921  bm,
5922  true); // Not at start after a text node.
5923  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5924 }
5925 
5926 
5927 // -------------------------------------------------------------------
5928 // Dispatch table construction
5929 
5930 
5931 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5933 }
5934 
5935 
5937  node->set_being_calculated(true);
5938  ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5939  for (int i = 0; i < alternatives->length(); i++) {
5940  set_choice_index(i);
5941  alternatives->at(i).node()->Accept(this);
5942  }
5943  node->set_being_calculated(false);
5944 }
5945 
5946 
5948  public:
5950  : constructor_(constructor) { }
5951  void Call(uc32 from, DispatchTable::Entry entry);
5952  private:
5953  DispatchTableConstructor* constructor_;
5954 };
5955 
5956 
5958  CharacterRange range(from, entry.to());
5959  constructor_->AddRange(range);
5960 }
5961 
5962 
5963 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5964  if (node->being_calculated())
5965  return;
5967  AddDispatchRange adder(this);
5968  table->ForEach(&adder);
5969 }
5970 
5971 
5972 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5973  // TODO(160): Find the node that we refer back to and propagate its start
5974  // set back to here. For now we just accept anything.
5976 }
5977 
5978 
5979 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5980  RegExpNode* target = that->on_success();
5981  target->Accept(this);
5982 }
5983 
5984 
5985 static int CompareRangeByFrom(const CharacterRange* a,
5986  const CharacterRange* b) {
5987  return Compare<uc16>(a->from(), b->from());
5988 }
5989 
5990 
5992  ranges->Sort(CompareRangeByFrom);
5993  uc16 last = 0;
5994  for (int i = 0; i < ranges->length(); i++) {
5995  CharacterRange range = ranges->at(i);
5996  if (last < range.from())
5997  AddRange(CharacterRange(last, range.from() - 1));
5998  if (range.to() >= last) {
5999  if (range.to() == String::kMaxUtf16CodeUnit) {
6000  return;
6001  } else {
6002  last = range.to() + 1;
6003  }
6004  }
6005  }
6007 }
6008 
6009 
6010 void DispatchTableConstructor::VisitText(TextNode* that) {
6011  TextElement elm = that->elements()->at(0);
6012  switch (elm.type) {
6013  case TextElement::ATOM: {
6014  uc16 c = elm.data.u_atom->data()[0];
6015  AddRange(CharacterRange(c, c));
6016  break;
6017  }
6018  case TextElement::CHAR_CLASS: {
6019  RegExpCharacterClass* tree = elm.data.u_char_class;
6020  ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6021  if (tree->is_negated()) {
6022  AddInverse(ranges);
6023  } else {
6024  for (int i = 0; i < ranges->length(); i++)
6025  AddRange(ranges->at(i));
6026  }
6027  break;
6028  }
6029  default: {
6030  UNIMPLEMENTED();
6031  }
6032  }
6033 }
6034 
6035 
6036 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6037  RegExpNode* target = that->on_success();
6038  target->Accept(this);
6039 }
6040 
6041 
6043  RegExpCompileData* data,
6044  bool ignore_case,
6045  bool is_global,
6046  bool is_multiline,
6047  Handle<String> pattern,
6048  Handle<String> sample_subject,
6049  bool is_ascii,
6050  Zone* zone) {
6051  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6052  return IrregexpRegExpTooBig();
6053  }
6054  RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone);
6055 
6056  // Sample some characters from the middle of the string.
6057  static const int kSampleSize = 128;
6058 
6059  FlattenString(sample_subject);
6060  int chars_sampled = 0;
6061  int half_way = (sample_subject->length() - kSampleSize) / 2;
6062  for (int i = Max(0, half_way);
6063  i < sample_subject->length() && chars_sampled < kSampleSize;
6064  i++, chars_sampled++) {
6065  compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6066  }
6067 
6068  // Wrap the body of the regexp in capture #0.
6069  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6070  0,
6071  &compiler,
6072  compiler.accept());
6073  RegExpNode* node = captured_body;
6074  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6075  bool is_start_anchored = data->tree->IsAnchoredAtStart();
6076  int max_length = data->tree->max_match();
6077  if (!is_start_anchored) {
6078  // Add a .*? at the beginning, outside the body capture, unless
6079  // this expression is anchored at the beginning.
6080  RegExpNode* loop_node =
6083  false,
6084  new(zone) RegExpCharacterClass('*'),
6085  &compiler,
6086  captured_body,
6087  data->contains_anchor);
6088 
6089  if (data->contains_anchor) {
6090  // Unroll loop once, to take care of the case that might start
6091  // at the start of input.
6092  ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6093  first_step_node->AddAlternative(GuardedAlternative(captured_body));
6094  first_step_node->AddAlternative(GuardedAlternative(
6095  new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6096  node = first_step_node;
6097  } else {
6098  node = loop_node;
6099  }
6100  }
6101  if (is_ascii) {
6103  // Do it again to propagate the new nodes to places where they were not
6104  // put because they had not been calculated yet.
6105  if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
6106  }
6107 
6108  if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6109  data->node = node;
6110  Analysis analysis(ignore_case, is_ascii);
6111  analysis.EnsureAnalyzed(node);
6112  if (analysis.has_failed()) {
6113  const char* error_message = analysis.error_message();
6114  return CompilationResult(error_message);
6115  }
6116 
6117  // Create the correct assembler for the architecture.
6118 #ifndef V8_INTERPRETED_REGEXP
6119  // Native regexp implementation.
6120 
6124 
6125 #if V8_TARGET_ARCH_IA32
6126  RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6127  zone);
6128 #elif V8_TARGET_ARCH_X64
6129  RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6130  zone);
6131 #elif V8_TARGET_ARCH_ARM
6132  RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6133  zone);
6134 #elif V8_TARGET_ARCH_MIPS
6135  RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6136  zone);
6137 #endif
6138 
6139 #else // V8_INTERPRETED_REGEXP
6140  // Interpreted regexp implementation.
6142  RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
6143 #endif // V8_INTERPRETED_REGEXP
6144 
6145  // Inserted here, instead of in Assembler, because it depends on information
6146  // in the AST that isn't replicated in the Node structure.
6147  static const int kMaxBacksearchLimit = 1024;
6148  if (is_end_anchored &&
6149  !is_start_anchored &&
6150  max_length < kMaxBacksearchLimit) {
6151  macro_assembler.SetCurrentPositionFromEnd(max_length);
6152  }
6153 
6154  if (is_global) {
6155  macro_assembler.set_global_mode(
6156  (data->tree->min_match() > 0)
6159  }
6160 
6161  return compiler.Assemble(&macro_assembler,
6162  node,
6163  data->capture_count,
6164  pattern);
6165 }
6166 
6167 
6168 }} // namespace v8::internal
void SetInterval(const Interval &interval)
Definition: jsregexp.cc:3663
ZoneList< TextElement > * elements()
Definition: ast.h:2316
AlternativeGenerationList(int count, Zone *zone)
Definition: jsregexp.cc:3613
virtual void WriteStackPointerToRegister(int reg)=0
OutSet * Get(uc16 value)
Definition: jsregexp.cc:5707
static Handle< Object > New(Handle< JSFunction > func, int argc, Handle< Object > argv[], bool *pending_exception)
Definition: execution.cc:183
bool EmitCharacterFunction(Isolate *isolate, RegExpCompiler *compiler, uc16 c, Label *on_failure, int cp_offset, bool check, bool preloaded)
Definition: jsregexp.cc:1788
Failure * StackOverflow()
Definition: isolate.cc:924
static bool ParseRegExp(FlatStringReader *input, bool multiline, RegExpCompileData *result, Zone *zone)
Definition: parser.cc:5921
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:5824
void FlattenString(Handle< String > string)
Definition: handles.cc:211
RegExpNode * stop_node()
Definition: jsregexp.h:1459
int GreedyLoopTextLengthForAlternative(GuardedAlternative *alternative)
Definition: jsregexp.cc:3513
static const int kIrregexpMaxRegisterCountIndex
Definition: objects.h:6657
double total_regexp_code_generated()
Definition: heap.h:1527
static const int kMaxAsciiCharCode
Definition: objects.h:7327
bool Contains(const T &elm) const
Definition: list-inl.h:178
#define DECLARE_VISIT(type)
Definition: full-codegen.h:594
static Vector< const int > GetWordBounds()
Definition: jsregexp.cc:5279
#define ASSERT_RESULT(expr)
Definition: checks.h:269
virtual void GoTo(Label *label)=0
static AssertionNode * AtStart(RegExpNode *on_success)
Definition: jsregexp.h:900
const char * ToCString(const v8::String::Utf8Value &value)
void SetInterval(int map_number, const Interval &interval)
Definition: jsregexp.h:1305
static ActionNode * SetRegister(int reg, int val, RegExpNode *on_success)
Definition: jsregexp.cc:1554
RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler *assembler, RegExpNode *start, int capture_count, Handle< String > pattern)
Definition: jsregexp.cc:1174
static Code * IrregexpNativeCode(FixedArray *re, bool is_ascii)
Definition: jsregexp.cc:509
void set(int index, Object *value)
Definition: objects-inl.h:1757
CompilationCache * compilation_cache()
Definition: isolate.h:827
CharacterRangeSplitter(ZoneList< CharacterRange > **included, ZoneList< CharacterRange > **excluded, Zone *zone)
Definition: jsregexp.cc:5286
void PrintF(const char *format,...)
Definition: v8utils.cc:40
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:1496
static String * cast(Object *obj)
static ActionNode * BeginSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *on_success)
Definition: jsregexp.cc:1594
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2416
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3378
static const int kMaxUtf16CodeUnit
Definition: objects.h:7329
void set_characters(int characters)
Definition: jsregexp.h:542
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2426
unibrow::Mapping< unibrow::Ecma262UnCanonicalize > * jsregexp_uncanonicalize()
Definition: isolate.h:892
VisitMarker(NodeInfo *info)
Definition: jsregexp.cc:2825
const int kNumRegisters
Definition: constants-arm.h:92
Isolate * isolate()
Definition: heap-inl.h:503
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:4738
virtual void AppendToText(RegExpText *text, Zone *zone)
Definition: jsregexp.cc:990
virtual RegExpNode * GetSuccessorOfOmnivorousTextNode(RegExpCompiler *compiler)
Definition: jsregexp.h:609
static Smi * FromInt(int value)
Definition: objects-inl.h:981
struct v8::internal::ActionNode::@10::@12 u_increment_register
#define LOG(isolate, Call)
Definition: log.h:81
bool Contains(int value)
Definition: jsregexp.h:706
int to() const
Definition: jsregexp.h:711
ZoneList< CharacterRange > * ranges(Zone *zone)
Definition: ast.h:2272
void set_at_start(bool at_start)
Definition: jsregexp.h:1456
GlobalCache(Handle< JSRegExp > regexp, Handle< String > subject, bool is_global, Isolate *isolate)
Definition: jsregexp.cc:706
virtual void CheckNotBackReference(int start_reg, Label *on_no_match)=0
#define ASSERT_NOT_NULL(p)
Definition: checks.h:285
bool EmitQuickCheck(RegExpCompiler *compiler, Trace *trace, bool preload_has_checked_bounds, Label *on_possible_success, QuickCheckDetails *details_return, bool fall_through_on_failure)
Definition: jsregexp.cc:2536
void set_loop_label(Label *label)
Definition: jsregexp.h:1478
static Handle< T > cast(Handle< S > that)
Definition: handles.h:81
static const int kJSRegexpStaticOffsetsVectorSize
Definition: isolate.h:994
static ActionNode * EmptyMatchCheck(int start_register, int repetition_register, int repetition_limit, RegExpNode *on_success)
Definition: jsregexp.cc:1620
RegExpTree * body()
Definition: ast.h:2419
T Max(T a, T b)
Definition: utils.h:222
virtual void ClearRegisters(int reg_from, int reg_to)=0
Position * positions(int index)
Definition: jsregexp.h:543
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:4729
virtual RegExpNode * GetSuccessorOfOmnivorousTextNode(RegExpCompiler *compiler)
Definition: jsregexp.cc:3485
int32_t uc32
Definition: globals.h:260
LimitResult LimitVersions(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:2317
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:1523
static Handle< Object > CreateRegExpLiteral(Handle< JSFunction > constructor, Handle< String > pattern, Handle< String > flags, bool *has_pending_exception)
Definition: jsregexp.cc:66
Vector< const char > ToAsciiVector()
Definition: objects.h:7139
static TextElement CharClass(RegExpCharacterClass *char_class)
Definition: jsregexp.cc:1003
virtual RegExpNode * FilterASCII(int depth)
Definition: jsregexp.cc:2854
void AddFromFollowing(NodeInfo *that)
Definition: jsregexp.h:487
void AddRange(CharacterRange range, int value, Zone *zone)
Definition: jsregexp.cc:5618
static ActionNode * IncrementRegister(int reg, RegExpNode *on_success)
Definition: jsregexp.cc:1565
uc16 to()
Definition: jsregexp.h:355
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0
static ByteArray * cast(Object *obj)
static int IrregexpNumberOfCaptures(FixedArray *re)
Definition: jsregexp.cc:494
void set_to(uc16 value)
Definition: jsregexp.h:356
int int32_t
Definition: unicode.cc:47
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:5114
virtual void LoadCurrentCharacter(int cp_offset, Label *on_end_of_input, bool check_bounds=true, int characters=1)=0
static void Canonicalize(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:5507
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2988
Handle< FixedArray > LookupRegExp(Handle< String > source, JSRegExp::Flags flags)
void EnsureAnalyzed(RegExpNode *node)
Definition: jsregexp.cc:5723
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2481
void set_flush_budget(int to)
Definition: jsregexp.h:1481
static CharacterRange Everything()
Definition: jsregexp.h:275
virtual bool IsAnchoredAtEnd()
Definition: ast.h:2132
RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, Zone *zone)
Definition: jsregexp.cc:1158
virtual bool try_to_emit_quick_check_for_alternative(int i)
Definition: jsregexp.h:1099
void AddValue(int value, Zone *zone)
Definition: jsregexp.h:357
bool Get(unsigned value)
Definition: jsregexp.cc:5604
unibrow::Mapping< unibrow::Ecma262Canonicalize > Canonicalize
#define ASSERT(condition)
Definition: checks.h:270
static ActionNode * StorePosition(int reg, bool is_capture, RegExpNode *on_success)
Definition: jsregexp.cc:1573
#define ASSERT_GE(v1, v2)
Definition: checks.h:273
void set_characters_preloaded(int count)
Definition: jsregexp.h:1479
const int kPatternTooShortForBoyerMoore
Definition: jsregexp.cc:139
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2361
void AdvanceCurrentPositionInTrace(int by, RegExpCompiler *compiler)
Definition: jsregexp.cc:3437
const char * error_message()
Definition: jsregexp.h:1586
RegExpCompiler * compiler()
Definition: jsregexp.h:1291
void BuildTable(ChoiceNode *node)
Definition: jsregexp.cc:5936
virtual void ReadCurrentPositionFromRegister(int reg)=0
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2491
bool EmitSkipInstructions(RegExpMacroAssembler *masm)
Definition: jsregexp.cc:3816
#define DEFINE_ACCEPT(Type)
Definition: jsregexp.cc:1633
ZoneList< TextElement > * elements()
Definition: jsregexp.h:853
Factory * factory()
Definition: isolate.h:992
void SetAll(int map_number)
Definition: jsregexp.h:1315
static Code * cast(Object *obj)
static void SetLastSubject(FixedArray *array, String *to)
Definition: jsregexp.h:207
static void Negate(ZoneList< CharacterRange > *src, ZoneList< CharacterRange > *dst, Zone *zone)
Definition: jsregexp.cc:5544
static Handle< Object > AtomExec(Handle< JSRegExp > regexp, Handle< String > subject, int index, Handle< JSArray > lastMatchInfo)
Definition: jsregexp.cc:340
static Smi * cast(Object *object)
Handle< String > FlattenGetString(Handle< String > string)
Definition: handles.cc:216
static const int kRegExpExecutableMemoryLimit
Definition: jsregexp.h:236
virtual void Emit(RegExpCompiler *compiler, Trace *trace)=0
static ActionNode * PositiveSubmatchSuccess(int stack_pointer_reg, int restore_reg, int clear_capture_count, int clear_capture_from, RegExpNode *on_success)
Definition: jsregexp.cc:1605
static const int kNoRegister
Definition: jsregexp.cc:1123
virtual void AdvanceCurrentPosition(int by)=0
int get(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:49
RegExpMacroAssembler * macro_assembler()
Definition: jsregexp.cc:1102
static void Split(ZoneList< CharacterRange > *base, Vector< const int > overlay, ZoneList< CharacterRange > **included, ZoneList< CharacterRange > **excluded, Zone *zone)
Definition: jsregexp.cc:5314
struct v8::internal::ActionNode::@10::@14 u_submatch
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:5188
RegExpCharacterClass * u_char_class
Definition: jsregexp.h:441
SmartArrayPointer< char > ToCString(AllowNullsFlag allow_nulls, RobustnessFlag robustness_flag, int offset, int length, int *length_output=0)
Definition: objects.cc:6233
RegExpNode * FilterSuccessor(int depth)
Definition: jsregexp.cc:2846
static void SetLastInput(FixedArray *array, String *to)
Definition: jsregexp.h:211
union v8::internal::TextElement::@9 data
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:5861
virtual Interval CaptureRegisters()
Definition: ast.h:2137
static const int kRegWxpCompiledLimit
Definition: jsregexp.h:237
void Advance(int by, bool ascii)
Definition: jsregexp.cc:2773
void Merge(QuickCheckDetails *other, int from_index)
Definition: jsregexp.cc:2794
Label * loop_label()
Definition: jsregexp.h:1458
#define UNREACHABLE()
Definition: checks.h:50
RegExpExpansionLimiter(RegExpCompiler *compiler, int factor)
Definition: jsregexp.cc:4872
static int IrregexpPrepare(Handle< JSRegExp > regexp, Handle< String > subject)
Definition: jsregexp.cc:527
void IncreaseTotalRegexpCodeGenerated(int size)
Definition: heap.h:1528
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2387
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:4346
STATIC_ASSERT((FixedDoubleArray::kHeaderSize &kDoubleAlignmentMask)==0)
static const unsigned kFirstLimit
Definition: jsregexp.h:321
void AddAlternative(GuardedAlternative node)
Definition: jsregexp.h:1072
static void SetLastCaptureCount(FixedArray *array, int to)
Definition: jsregexp.h:203
void fail(const char *error_message)
Definition: jsregexp.h:1590
AlternativeGeneration * at(int i)
Definition: jsregexp.cc:3629
virtual void CheckCharacterGT(uc16 limit, Label *on_greater)=0
void Call(uc32 from, DispatchTable::Entry entry)
Definition: jsregexp.cc:5957
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:5840
void set_bound_checked_up_to(int to)
Definition: jsregexp.h:1480
void set_backtrack(Label *backtrack)
Definition: jsregexp.h:1476
static const int kInfinity
Definition: ast.h:2125
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:3019
virtual void Accept(NodeVisitor *visitor)
Definition: jsregexp.cc:1641
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:5099
v8::Handle< v8::Object > bottom
Definition: test-api.cc:1688
MemoryAllocator * memory_allocator()
Definition: isolate.h:845
virtual RegExpNode * FilterASCII(int depth)
Definition: jsregexp.cc:2916
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2450
void AddWork(RegExpNode *node)
Definition: jsregexp.cc:1096
static bool IsCanonical(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:5399
QuickCheckDetails * quick_check_performed()
Definition: jsregexp.h:1463
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)=0
static Handle< JSArray > SetLastMatchInfo(Handle< JSArray > last_match_info, Handle< String > subject, int capture_count, int32_t *match)
Definition: jsregexp.cc:685
void AddGuard(Guard *guard, Zone *zone)
Definition: jsregexp.cc:1547
RegExpTree * body()
Definition: ast.h:2388
static int StartRegister(int index)
Definition: ast.h:2390
void set_bm_info(bool not_at_start, BoyerMooreLookahead *bm)
Definition: jsregexp.h:673
void AddInverse(ZoneList< CharacterRange > *ranges)
Definition: jsregexp.cc:5991
struct v8::internal::ActionNode::@10::@13 u_position_register
virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start)
Definition: jsregexp.cc:2439
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition: jsregexp.h:1520
ZoneList< RegExpTree * > * alternatives()
Definition: ast.h:2161
virtual int min_match()=0
void AddCaseEquivalents(ZoneList< CharacterRange > *ranges, bool is_ascii, Zone *zone)
Definition: jsregexp.cc:5333
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:4224
virtual void ReadStackPointerFromRegister(int reg)=0
static void IrregexpInitialize(Handle< JSRegExp > re, Handle< String > pattern, JSRegExp::Flags flags, int capture_register_count)
Definition: jsregexp.cc:514
virtual void IfRegisterLT(int reg, int comparand, Label *if_lt)=0
static Handle< Object > Compile(Handle< JSRegExp > re, Handle< String > pattern, Handle< String > flags, Zone *zone)
Definition: jsregexp.cc:168
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, Label *on_no_match)=0
struct v8::internal::ActionNode::@10::@15 u_empty_match_check
TriBool at_start()
Definition: jsregexp.h:1455
Label * backtrack()
Definition: jsregexp.h:1457
static RegExpImpl::IrregexpResult Match(Isolate *isolate, Handle< ByteArray > code, Handle< String > subject, int *captures, int start_position)
Handle< String > NewStringFromTwoByte(Vector< const uc16 > str, PretenureFlag pretenure=NOT_TENURED)
Definition: factory.cc:216
const Register pc
QuickCheckDetails quick_check_details
Definition: jsregexp.cc:3605
static Handle< Object > Exec(Handle< JSRegExp > regexp, Handle< String > subject, int index, Handle< JSArray > lastMatchInfo)
Definition: jsregexp.cc:232
Vector< const uc16 > ToUC16Vector()
Definition: objects.h:7145
virtual RegExpNode * FilterASCII(int depth)
Definition: jsregexp.h:631
static int IrregexpExecRaw(Handle< JSRegExp > regexp, Handle< String > subject, int index, int32_t *output, int output_size)
Definition: jsregexp.cc:550
const int kMaxLookaheadForBoyerMoore
Definition: jsregexp.cc:136
Zone * zone() const
Definition: jsregexp.h:665
ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b)
Definition: jsregexp.h:1237
int length() const
Definition: utils.h:384
#define ASSERT_LE(v1, v2)
Definition: checks.h:275
virtual void CheckNotAtStart(Label *on_not_at_start)=0
void add_action(DeferredAction *new_action)
Definition: jsregexp.h:1471
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3550
bool has_pending_exception()
Definition: isolate.h:561
BoyerMooreLookahead(int length, RegExpCompiler *compiler, Zone *zone)
Definition: jsregexp.cc:3696
virtual void CheckCharacter(unsigned c, Label *on_equal)=0
void SaveBMInfo(BoyerMooreLookahead *bm, bool not_at_start, int offset)
Definition: jsregexp.h:647
static const int kIrregexpCaptureCountIndex
Definition: objects.h:6659
void AddRange(CharacterRange range)
Definition: jsregexp.h:1537
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.h:620
void set_being_calculated(bool b)
Definition: jsregexp.h:1098
static void AtomCompile(Handle< JSRegExp > re, Handle< String > pattern, JSRegExp::Flags flags, Handle< String > match_pattern)
Definition: jsregexp.cc:256
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:5044
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:4835
int characters_preloaded()
Definition: jsregexp.h:1460
DeferredAction * actions()
Definition: jsregexp.h:1435
activate correct semantics for inheriting readonliness false
Definition: flags.cc:141
static const int kNodeIsTooComplexForGreedyLoops
Definition: jsregexp.h:605
uc16 from()
Definition: jsregexp.h:354
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:2372
void PutRegExp(Handle< String > source, JSRegExp::Flags flags, Handle< FixedArray > data)
Vector< const char > CStrVector(const char *data)
Definition: utils.h:526
DispatchTable * GetTable(bool ignore_case)
Definition: jsregexp.cc:1021
static int IrregexpMaxRegisterCount(FixedArray *re)
Definition: jsregexp.cc:483
void CountCharacter(int character)
Definition: jsregexp.cc:1039
RegExpNode * on_success()
Definition: jsregexp.h:724
static void SetIrregexpMaxRegisterCount(FixedArray *re, int value)
Definition: jsregexp.cc:489
BoyerMooreLookahead * bm_info(bool not_at_start)
Definition: jsregexp.h:661
static int AtomExecRaw(Handle< JSRegExp > regexp, Handle< String > subject, int index, int32_t *output, int output_size)
Definition: jsregexp.cc:281
#define ASSERT_LT(v1, v2)
Definition: checks.h:274
static const int kLastMatchOverhead
Definition: jsregexp.h:186
void set_from(uc16 value)
Definition: jsregexp.h:280
FlatContent GetFlatContent()
Definition: objects.cc:6191
virtual void AppendToText(RegExpText *text, Zone *zone)
Definition: jsregexp.cc:985
#define ISOLATE
Definition: isolate.h:1435
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:5108
int bound_checked_up_to()
Definition: jsregexp.h:1461
virtual int GreedyLoopTextLength()
Definition: jsregexp.h:606
virtual bool CheckSpecialCharacterClass(uc16 type, Label *on_no_match)
static const int kMaxRecursion
Definition: jsregexp.cc:1105
void AddContinueAlternative(GuardedAlternative alt)
Definition: jsregexp.cc:3543
bool GetStoredPosition(int reg, int *cp_offset)
Definition: jsregexp.cc:1247
static AssertionNode * AfterNewline(RegExpNode *on_success)
Definition: jsregexp.h:909
int from() const
Definition: jsregexp.h:710
FrequencyCollator * frequency_collator()
Definition: jsregexp.cc:1114
void Sort(int(*cmp)(const T *x, const T *y))
static AssertionNode * AtNonBoundary(RegExpNode *on_success)
Definition: jsregexp.h:906
bool is_null() const
Definition: handles.h:87
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset)=0
static const int kMaxWidth
Definition: unicode.h:311
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition: jsregexp.cc:5795
static const int kAtomPatternIndex
Definition: objects.h:6635
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)
Definition: jsregexp.cc:2611
virtual void Bind(Label *label)=0
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int characters_filled_in, bool not_at_start)=0
int Count(int map_number)
Definition: jsregexp.h:1293
#define UNIMPLEMENTED()
Definition: checks.h:48
uint16_t uc16
Definition: globals.h:259
void MakeCaseIndependent(bool is_ascii)
Definition: jsregexp.cc:3456
virtual void CheckCharacterLT(uc16 limit, Label *on_less)=0
ZoneList< GuardedAlternative > * alternatives()
Definition: jsregexp.h:1075
virtual bool IsAnchoredAtStart()
Definition: ast.h:2131
static int code_index(bool is_ascii)
Definition: objects.h:6603
static void PrintError(const char *format,...)
static Handle< T > null()
Definition: handles.h:86
int SearchString(Isolate *isolate, Vector< const SubjectChar > subject, Vector< const PatternChar > pattern, int start_index)
virtual RegExpNode * FilterASCII(int depth)
Definition: jsregexp.cc:2837
void SetRest(int from_map)
Definition: jsregexp.h:1319
Vector< const uc16 > data()
Definition: ast.h:2293
static Result Match(Handle< Code > regexp, Handle< String > subject, int *offsets_vector, int offsets_vector_length, int previous_index, Isolate *isolate)
#define ASSERT_EQ(v1, v2)
Definition: checks.h:271
void ForEach(Callback *callback)
Definition: jsregexp.h:388
virtual void AppendToText(RegExpText *text, Zone *zone)
Definition: jsregexp.cc:975
OutSet * Extend(unsigned value, Zone *zone)
Definition: jsregexp.cc:5573
static void DotPrint(const char *label, RegExpNode *node, bool ignore_case)
static const uchar kBadChar
Definition: unicode.h:162
void USE(T)
Definition: globals.h:289
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination use dead code elimination trace on stack replacement optimize closures cache optimized code for closures functions with arguments object loop weight for representation inference allow uint32 values on optimize frames if they are used only in safe operations track parallel recompilation enable all profiler experiments number of stack frames inspected by the profiler call recompile stub directly when self optimizing trigger profiler ticks based on counting instead of timing weight back edges by jump distance for interrupt triggering percentage of ICs that must have type info to allow optimization watch_ic_patching retry_self_opt interrupt_at_exit extra verbose compilation tracing generate extra emit comments in code disassembly enable use of SSE3 instructions if available enable use of CMOV instruction if available enable use of SAHF instruction if enable use of VFP3 instructions if available this implies enabling ARMv7 and VFP2 enable use of VFP2 instructions if available enable use of SDIV and UDIV instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of MIPS FPU instructions if expose natives in global object expose gc extension number of stack frames to capture disable builtin natives files print a stack trace if an assertion failure occurs use random jit cookie to mask large constants trace lazy optimization use adaptive optimizations prepare for turning on always opt minimum length for automatic enable preparsing maximum number of optimization attempts before giving up cache prototype transitions automatically set the debug break flag when debugger commands are in the queue always cause a debug break before aborting maximum length of function source code printed in a stack trace max size of the new max size of the old max size of executable always perform global GCs print one trace line following each garbage collection do not print trace line after scavenger collection print more details following each garbage collection print amount of external allocated memory after each time it is adjusted flush code that we expect not to use again before full gc do incremental marking steps track object counts and memory usage use caching Perform compaction on every full GC Never perform compaction on full GC testing only Compact code space on full incremental collections Default seed for initializing random allows verbose printing trace parsing and preparsing Check icache flushes in ARM and MIPS simulator Stack alingment in bytes in print stack trace when throwing exceptions randomize hashes to avoid predictable hash Fixed seed to use to hash property activate a timer that switches between V8 threads testing_bool_flag float flag Seed used for threading test randomness A filename with extra code to be included in the Print usage message
Definition: flags.cc:495
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:5170
ZoneList< GuardedAlternative > * alternatives_
Definition: jsregexp.h:1104
static const unsigned kMaxAsciiCharCodeU
Definition: objects.h:7328
virtual void CheckCharacterAfterAnd(unsigned c, unsigned and_with, Label *on_equal)=0
ContainedInLattice AddRange(ContainedInLattice containment, const int *ranges, int ranges_length, Interval new_range)
Definition: jsregexp.cc:111
virtual void IfRegisterEqPos(int reg, Label *if_eq)=0
virtual int GreedyLoopTextLength()
Definition: jsregexp.cc:3475
static ByteArray * IrregexpByteCode(FixedArray *re, bool is_ascii)
Definition: jsregexp.cc:504
static FixedArray * cast(Object *obj)
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3198
void set_current_expansion_factor(int value)
Definition: jsregexp.cc:1117
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
Definition: jsregexp.cc:1584
void Set(int map_number, int character)
Definition: jsregexp.h:1299
static int IrregexpNumberOfRegisters(FixedArray *re)
Definition: jsregexp.cc:499
virtual void CheckBitInTable(Handle< ByteArray > table, Label *on_bit_set)=0
virtual RegExpNode * FilterASCII(int depth)
Definition: jsregexp.cc:2966
unibrow::Mapping< unibrow::CanonicalizationRange > * jsregexp_canonrange()
Definition: isolate.h:896
struct v8::internal::ActionNode::@10::@16 u_clear_captures
static const int kCompilationErrorValue
Definition: objects.h:6686
void Add(const T &element, AllocationPolicy allocator=AllocationPolicy())
Definition: list-inl.h:38
virtual void AppendToText(RegExpText *text, Zone *zone)
Definition: jsregexp.cc:980
#define FACTORY
Definition: isolate.h:1434
Definition: jsregexp.h:349
bool Rationalize(bool ascii)
Definition: jsregexp.cc:2512
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination use dead code elimination trace on stack replacement optimize closures cache optimized code for closures functions with arguments object loop weight for representation inference allow uint32 values on optimize frames if they are used only in safe operations track parallel recompilation enable all profiler experiments number of stack frames inspected by the profiler call recompile stub directly when self optimizing trigger profiler ticks based on counting instead of timing weight back edges by jump distance for interrupt triggering percentage of ICs that must have type info to allow optimization watch_ic_patching retry_self_opt interrupt_at_exit extra verbose compilation tracing generate extra emit comments in code disassembly enable use of SSE3 instructions if available enable use of CMOV instruction if available enable use of SAHF instruction if enable use of VFP3 instructions if available this implies enabling ARMv7 and VFP2 enable use of VFP2 instructions if available enable use of SDIV and UDIV instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of MIPS FPU instructions if NULL
Definition: flags.cc:301
Object * get(int index)
Definition: objects-inl.h:1737
virtual void CheckGreedyLoop(Label *on_tos_equals_current_position)=0
void AddLoopAlternative(GuardedAlternative alt)
Definition: jsregexp.cc:3536
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination use dead code elimination trace on stack replacement optimize closures cache optimized code for closures functions with arguments object loop weight for representation inference allow uint32 values on optimize frames if they are used only in safe operations track parallel recompilation enable all profiler experiments number of stack frames inspected by the profiler call recompile stub directly when self optimizing trigger profiler ticks based on counting instead of timing weight back edges by jump distance for interrupt triggering percentage of ICs that must have type info to allow optimization watch_ic_patching retry_self_opt interrupt_at_exit extra verbose compilation tracing generate extra code(assertions) for debugging") DEFINE_bool(code_comments
virtual void Accept(NodeVisitor *visitor)=0
RegExpNode * loop_node()
Definition: jsregexp.h:1184
int EatsAtLeastHelper(int still_to_find, int recursion_depth, RegExpNode *ignore_this_node, bool not_at_start)
Definition: jsregexp.cc:2462
static TextElement Atom(RegExpAtom *atom)
Definition: jsregexp.cc:996
ZoneList< Guard * > * guards()
Definition: jsregexp.h:1051
static const int kFillInBMBudget
Definition: jsregexp.h:619
static int EndRegister(int index)
Definition: ast.h:2391
struct v8::internal::ActionNode::@10::@11 u_store_register
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: jsregexp.cc:3962
static Handle< Object > IrregexpExec(Handle< JSRegExp > regexp, Handle< String > subject, int index, Handle< JSArray > lastMatchInfo)
Definition: jsregexp.cc:637
#define FOR_EACH_NODE_TYPE(VISIT)
Definition: jsregexp.h:402
void DeleteArray(T *array)
Definition: allocation.h:91
T Min(T a, T b)
Definition: utils.h:229
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination use dead code elimination trace on stack replacement optimize closures cache optimized code for closures functions with arguments object loop weight for representation inference allow uint32 values on optimize frames if they are used only in safe operations track parallel recompilation enable all profiler experiments number of stack frames inspected by the profiler call recompile stub directly when self optimizing trigger profiler ticks based on counting instead of timing weight back edges by jump distance for interrupt triggering percentage of ICs that must have type info to allow optimization watch_ic_patching retry_self_opt interrupt_at_exit extra verbose compilation tracing generate extra emit comments in code disassembly enable use of SSE3 instructions if available enable use of CMOV instruction if available enable use of SAHF instruction if enable use of VFP3 instructions if available this implies enabling ARMv7 and VFP2 enable use of VFP2 instructions if available enable use of SDIV and UDIV instructions if enable loading bit constant by means of movw movt instruction enable unaligned accesses for enable use of MIPS FPU instructions if expose natives in global object expose gc extension number of stack frames to capture disable builtin natives files print a stack trace if an assertion failure occurs use random jit cookie to mask large constants trace lazy optimization use adaptive optimizations prepare for turning on always opt minimum length for automatic enable preparsing maximum number of optimization attempts before giving up cache prototype transitions automatically set the debug break flag when debugger commands are in the queue always cause a debug break before aborting maximum length of function source code printed in a stack trace max size of the new max size of the old max size of executable always perform global GCs print one trace line following each garbage collection do not print trace line after scavenger collection print more details following each garbage collection print amount of external allocated memory after each time it is adjusted flush code that we expect not to use again before full gc do incremental marking steps track object counts and memory usage use caching Perform compaction on every full GC Never perform compaction on full GC testing only Compact code space on full incremental collections Default seed for initializing random allows verbose printing trace parsing and preparsing Check icache flushes in ARM and MIPS simulator Stack alingment in bytes in print stack trace when throwing exceptions randomize hashes to avoid predictable hash Fixed seed to use to hash property activate a timer that switches between V8 threads testing_bool_flag float flag Seed used for threading test randomness A filename with extra code to be included in the Print usage including flags
Definition: flags.cc:495
void Call(uc16 from, DispatchTable::Entry entry)
Definition: jsregexp.cc:5304
virtual int max_match()=0
void Flush(RegExpCompiler *compiler, RegExpNode *successor)
Definition: jsregexp.cc:1435
static const int kUninitializedValue
Definition: objects.h:6682
ZoneList< RegExpTree * > * nodes()
Definition: ast.h:2182
void set_quick_check_performed(QuickCheckDetails *d)
Definition: jsregexp.h:1482
void check(i::Vector< const char > string)
const int kSampleSize
Definition: process.cc:570
void InvalidateCurrentCharacter()
Definition: jsregexp.cc:3432
#define ARRAY_SIZE(a)
Definition: globals.h:281
virtual void CheckNotCharacter(unsigned c, Label *on_not_equal)=0
RegExpTree * body()
Definition: ast.h:2358
static void AddClassEscape(uc16 type, ZoneList< CharacterRange > *ranges, Zone *zone)
Definition: jsregexp.cc:5231
OutSet * out_set()
Definition: jsregexp.h:360
static int saved_code_index(bool is_ascii)
Definition: objects.h:6611
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:4856
int Frequency(int in_character)
Definition: jsregexp.cc:1047
virtual void CheckPosition(int cp_offset, Label *on_outside_input)
RecursionCheck(RegExpCompiler *compiler)
Definition: jsregexp.cc:1142
static CharacterRange Singleton(uc16 value)
Definition: jsregexp.h:268
unsigned int uchar
Definition: unicode.h:40
static AssertionNode * AtEnd(RegExpNode *on_success)
Definition: jsregexp.h:897
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: jsregexp.cc:4841
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:2403
void AddElement(TextElement elm, Zone *zone)
Definition: ast.h:2312
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, Label *on_not_equal)=0
virtual Handle< HeapObject > GetCode(Handle< String > source)=0
AddDispatchRange(DispatchTableConstructor *constructor)
Definition: jsregexp.cc:5949
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, int filled_in, bool not_at_start)
Definition: jsregexp.cc:3183
virtual void IfRegisterGE(int reg, int comparand, Label *if_ge)=0
void set_stop_node(RegExpNode *node)
Definition: jsregexp.h:1477
virtual void PushBacktrack(Label *label)=0
static CompilationResult Compile(RegExpCompileData *input, bool ignore_case, bool global, bool multiline, Handle< String > pattern, Handle< String > sample_subject, bool is_ascii, Zone *zone)
Definition: jsregexp.cc:6042
virtual RegExpNode * FilterASCII(int depth)
Definition: jsregexp.cc:2899
virtual void FillInBMInfo(int offset, int recursion_depth, int budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: jsregexp.cc:3001
static void SetCapture(FixedArray *array, int index, int to)
Definition: jsregexp.h:215
static AssertionNode * AtBoundary(RegExpNode *on_success)
Definition: jsregexp.h:903
bool mentions_reg(int reg)
Definition: jsregexp.cc:1236
const int kNoRegister