v8  3.11.10(node0.8.26)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scanner.cc
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 // Features shared by parsing and pre-parsing scanners.
29 
30 #include "scanner.h"
31 
32 #include "../include/v8stdint.h"
33 #include "char-predicates-inl.h"
34 
35 namespace v8 {
36 namespace internal {
37 
38 // ----------------------------------------------------------------------------
39 // Scanner
40 
42  : unicode_cache_(unicode_cache),
43  octal_pos_(Location::invalid()),
44  harmony_scoping_(false),
45  harmony_modules_(false) { }
46 
47 
49  source_ = source;
50  // Need to capture identifiers in order to recognize "get" and "set"
51  // in object literals.
52  Init();
53  // Skip initial whitespace allowing HTML comment ends just like
54  // after a newline and scan first token.
55  has_line_terminator_before_next_ = true;
56  SkipWhiteSpace();
57  Scan();
58 }
59 
60 
61 uc32 Scanner::ScanHexNumber(int expected_length) {
62  ASSERT(expected_length <= 4); // prevent overflow
63 
64  uc32 digits[4] = { 0, 0, 0, 0 };
65  uc32 x = 0;
66  for (int i = 0; i < expected_length; i++) {
67  digits[i] = c0_;
68  int d = HexValue(c0_);
69  if (d < 0) {
70  // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
71  // should be illegal, but other JS VMs just return the
72  // non-escaped version of the original character.
73 
74  // Push back digits that we have advanced past.
75  for (int j = i-1; j >= 0; j--) {
76  PushBack(digits[j]);
77  }
78  return -1;
79  }
80  x = x * 16 + d;
81  Advance();
82  }
83 
84  return x;
85 }
86 
87 
88 // Ensure that tokens can be stored in a byte.
90 
91 // Table of one-character tokens, by character (0x00..0x7f only).
92 static const byte one_char_tokens[] = {
93  Token::ILLEGAL,
94  Token::ILLEGAL,
95  Token::ILLEGAL,
96  Token::ILLEGAL,
97  Token::ILLEGAL,
98  Token::ILLEGAL,
99  Token::ILLEGAL,
100  Token::ILLEGAL,
101  Token::ILLEGAL,
102  Token::ILLEGAL,
103  Token::ILLEGAL,
104  Token::ILLEGAL,
105  Token::ILLEGAL,
106  Token::ILLEGAL,
107  Token::ILLEGAL,
108  Token::ILLEGAL,
109  Token::ILLEGAL,
110  Token::ILLEGAL,
111  Token::ILLEGAL,
112  Token::ILLEGAL,
113  Token::ILLEGAL,
114  Token::ILLEGAL,
115  Token::ILLEGAL,
116  Token::ILLEGAL,
117  Token::ILLEGAL,
118  Token::ILLEGAL,
119  Token::ILLEGAL,
120  Token::ILLEGAL,
121  Token::ILLEGAL,
122  Token::ILLEGAL,
123  Token::ILLEGAL,
124  Token::ILLEGAL,
125  Token::ILLEGAL,
126  Token::ILLEGAL,
127  Token::ILLEGAL,
128  Token::ILLEGAL,
129  Token::ILLEGAL,
130  Token::ILLEGAL,
131  Token::ILLEGAL,
132  Token::ILLEGAL,
133  Token::LPAREN, // 0x28
134  Token::RPAREN, // 0x29
135  Token::ILLEGAL,
136  Token::ILLEGAL,
137  Token::COMMA, // 0x2c
138  Token::ILLEGAL,
139  Token::ILLEGAL,
140  Token::ILLEGAL,
141  Token::ILLEGAL,
142  Token::ILLEGAL,
143  Token::ILLEGAL,
144  Token::ILLEGAL,
145  Token::ILLEGAL,
146  Token::ILLEGAL,
147  Token::ILLEGAL,
148  Token::ILLEGAL,
149  Token::ILLEGAL,
150  Token::ILLEGAL,
151  Token::COLON, // 0x3a
152  Token::SEMICOLON, // 0x3b
153  Token::ILLEGAL,
154  Token::ILLEGAL,
155  Token::ILLEGAL,
156  Token::CONDITIONAL, // 0x3f
157  Token::ILLEGAL,
158  Token::ILLEGAL,
159  Token::ILLEGAL,
160  Token::ILLEGAL,
161  Token::ILLEGAL,
162  Token::ILLEGAL,
163  Token::ILLEGAL,
164  Token::ILLEGAL,
165  Token::ILLEGAL,
166  Token::ILLEGAL,
167  Token::ILLEGAL,
168  Token::ILLEGAL,
169  Token::ILLEGAL,
170  Token::ILLEGAL,
171  Token::ILLEGAL,
172  Token::ILLEGAL,
173  Token::ILLEGAL,
174  Token::ILLEGAL,
175  Token::ILLEGAL,
176  Token::ILLEGAL,
177  Token::ILLEGAL,
178  Token::ILLEGAL,
179  Token::ILLEGAL,
180  Token::ILLEGAL,
181  Token::ILLEGAL,
182  Token::ILLEGAL,
183  Token::ILLEGAL,
184  Token::LBRACK, // 0x5b
185  Token::ILLEGAL,
186  Token::RBRACK, // 0x5d
187  Token::ILLEGAL,
188  Token::ILLEGAL,
189  Token::ILLEGAL,
190  Token::ILLEGAL,
191  Token::ILLEGAL,
192  Token::ILLEGAL,
193  Token::ILLEGAL,
194  Token::ILLEGAL,
195  Token::ILLEGAL,
196  Token::ILLEGAL,
197  Token::ILLEGAL,
198  Token::ILLEGAL,
199  Token::ILLEGAL,
200  Token::ILLEGAL,
201  Token::ILLEGAL,
202  Token::ILLEGAL,
203  Token::ILLEGAL,
204  Token::ILLEGAL,
205  Token::ILLEGAL,
206  Token::ILLEGAL,
207  Token::ILLEGAL,
208  Token::ILLEGAL,
209  Token::ILLEGAL,
210  Token::ILLEGAL,
211  Token::ILLEGAL,
212  Token::ILLEGAL,
213  Token::ILLEGAL,
214  Token::ILLEGAL,
215  Token::ILLEGAL,
216  Token::LBRACE, // 0x7b
217  Token::ILLEGAL,
218  Token::RBRACE, // 0x7d
219  Token::BIT_NOT, // 0x7e
220  Token::ILLEGAL
221 };
222 
223 
225  current_ = next_;
226  has_line_terminator_before_next_ = false;
227  has_multiline_comment_before_next_ = false;
228  if (static_cast<unsigned>(c0_) <= 0x7f) {
229  Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
230  if (token != Token::ILLEGAL) {
231  int pos = source_pos();
232  next_.token = token;
233  next_.location.beg_pos = pos;
234  next_.location.end_pos = pos + 1;
235  Advance();
236  return current_.token;
237  }
238  }
239  Scan();
240  return current_.token;
241 }
242 
243 
244 static inline bool IsByteOrderMark(uc32 c) {
245  // The Unicode value U+FFFE is guaranteed never to be assigned as a
246  // Unicode character; this implies that in a Unicode context the
247  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
248  // character expressed in little-endian byte order (since it could
249  // not be a U+FFFE character expressed in big-endian byte
250  // order). Nevertheless, we check for it to be compatible with
251  // Spidermonkey.
252  return c == 0xFEFF || c == 0xFFFE;
253 }
254 
255 
256 bool Scanner::SkipWhiteSpace() {
257  int start_position = source_pos();
258 
259  while (true) {
260  // We treat byte-order marks (BOMs) as whitespace for better
261  // compatibility with Spidermonkey and other JavaScript engines.
262  while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
263  // IsWhiteSpace() includes line terminators!
264  if (unicode_cache_->IsLineTerminator(c0_)) {
265  // Ignore line terminators, but remember them. This is necessary
266  // for automatic semicolon insertion.
267  has_line_terminator_before_next_ = true;
268  }
269  Advance();
270  }
271 
272  // If there is an HTML comment end '-->' at the beginning of a
273  // line (with only whitespace in front of it), we treat the rest
274  // of the line as a comment. This is in line with the way
275  // SpiderMonkey handles it.
276  if (c0_ == '-' && has_line_terminator_before_next_) {
277  Advance();
278  if (c0_ == '-') {
279  Advance();
280  if (c0_ == '>') {
281  // Treat the rest of the line as a comment.
282  SkipSingleLineComment();
283  // Continue skipping white space after the comment.
284  continue;
285  }
286  PushBack('-'); // undo Advance()
287  }
288  PushBack('-'); // undo Advance()
289  }
290  // Return whether or not we skipped any characters.
291  return source_pos() != start_position;
292  }
293 }
294 
295 
296 Token::Value Scanner::SkipSingleLineComment() {
297  Advance();
298 
299  // The line terminator at the end of the line is not considered
300  // to be part of the single-line comment; it is recognized
301  // separately by the lexical grammar and becomes part of the
302  // stream of input elements for the syntactic grammar (see
303  // ECMA-262, section 7.4).
304  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
305  Advance();
306  }
307 
308  return Token::WHITESPACE;
309 }
310 
311 
312 Token::Value Scanner::SkipMultiLineComment() {
313  ASSERT(c0_ == '*');
314  Advance();
315 
316  while (c0_ >= 0) {
317  uc32 ch = c0_;
318  Advance();
319  if (unicode_cache_->IsLineTerminator(ch)) {
320  // Following ECMA-262, section 7.4, a comment containing
321  // a newline will make the comment count as a line-terminator.
322  has_multiline_comment_before_next_ = true;
323  }
324  // If we have reached the end of the multi-line comment, we
325  // consume the '/' and insert a whitespace. This way all
326  // multi-line comments are treated as whitespace.
327  if (ch == '*' && c0_ == '/') {
328  c0_ = ' ';
329  return Token::WHITESPACE;
330  }
331  }
332 
333  // Unterminated multi-line comment.
334  return Token::ILLEGAL;
335 }
336 
337 
338 Token::Value Scanner::ScanHtmlComment() {
339  // Check for <!-- comments.
340  ASSERT(c0_ == '!');
341  Advance();
342  if (c0_ == '-') {
343  Advance();
344  if (c0_ == '-') return SkipSingleLineComment();
345  PushBack('-'); // undo Advance()
346  }
347  PushBack('!'); // undo Advance()
348  ASSERT(c0_ == '!');
349  return Token::LT;
350 }
351 
352 
353 void Scanner::Scan() {
354  next_.literal_chars = NULL;
355  Token::Value token;
356  do {
357  // Remember the position of the next token
358  next_.location.beg_pos = source_pos();
359 
360  switch (c0_) {
361  case ' ':
362  case '\t':
363  Advance();
364  token = Token::WHITESPACE;
365  break;
366 
367  case '\n':
368  Advance();
369  has_line_terminator_before_next_ = true;
370  token = Token::WHITESPACE;
371  break;
372 
373  case '"': case '\'':
374  token = ScanString();
375  break;
376 
377  case '<':
378  // < <= << <<= <!--
379  Advance();
380  if (c0_ == '=') {
381  token = Select(Token::LTE);
382  } else if (c0_ == '<') {
383  token = Select('=', Token::ASSIGN_SHL, Token::SHL);
384  } else if (c0_ == '!') {
385  token = ScanHtmlComment();
386  } else {
387  token = Token::LT;
388  }
389  break;
390 
391  case '>':
392  // > >= >> >>= >>> >>>=
393  Advance();
394  if (c0_ == '=') {
395  token = Select(Token::GTE);
396  } else if (c0_ == '>') {
397  // >> >>= >>> >>>=
398  Advance();
399  if (c0_ == '=') {
400  token = Select(Token::ASSIGN_SAR);
401  } else if (c0_ == '>') {
402  token = Select('=', Token::ASSIGN_SHR, Token::SHR);
403  } else {
404  token = Token::SAR;
405  }
406  } else {
407  token = Token::GT;
408  }
409  break;
410 
411  case '=':
412  // = == ===
413  Advance();
414  if (c0_ == '=') {
415  token = Select('=', Token::EQ_STRICT, Token::EQ);
416  } else {
417  token = Token::ASSIGN;
418  }
419  break;
420 
421  case '!':
422  // ! != !==
423  Advance();
424  if (c0_ == '=') {
425  token = Select('=', Token::NE_STRICT, Token::NE);
426  } else {
427  token = Token::NOT;
428  }
429  break;
430 
431  case '+':
432  // + ++ +=
433  Advance();
434  if (c0_ == '+') {
435  token = Select(Token::INC);
436  } else if (c0_ == '=') {
437  token = Select(Token::ASSIGN_ADD);
438  } else {
439  token = Token::ADD;
440  }
441  break;
442 
443  case '-':
444  // - -- --> -=
445  Advance();
446  if (c0_ == '-') {
447  Advance();
448  if (c0_ == '>' && has_line_terminator_before_next_) {
449  // For compatibility with SpiderMonkey, we skip lines that
450  // start with an HTML comment end '-->'.
451  token = SkipSingleLineComment();
452  } else {
453  token = Token::DEC;
454  }
455  } else if (c0_ == '=') {
456  token = Select(Token::ASSIGN_SUB);
457  } else {
458  token = Token::SUB;
459  }
460  break;
461 
462  case '*':
463  // * *=
464  token = Select('=', Token::ASSIGN_MUL, Token::MUL);
465  break;
466 
467  case '%':
468  // % %=
469  token = Select('=', Token::ASSIGN_MOD, Token::MOD);
470  break;
471 
472  case '/':
473  // / // /* /=
474  Advance();
475  if (c0_ == '/') {
476  token = SkipSingleLineComment();
477  } else if (c0_ == '*') {
478  token = SkipMultiLineComment();
479  } else if (c0_ == '=') {
480  token = Select(Token::ASSIGN_DIV);
481  } else {
482  token = Token::DIV;
483  }
484  break;
485 
486  case '&':
487  // & && &=
488  Advance();
489  if (c0_ == '&') {
490  token = Select(Token::AND);
491  } else if (c0_ == '=') {
492  token = Select(Token::ASSIGN_BIT_AND);
493  } else {
494  token = Token::BIT_AND;
495  }
496  break;
497 
498  case '|':
499  // | || |=
500  Advance();
501  if (c0_ == '|') {
502  token = Select(Token::OR);
503  } else if (c0_ == '=') {
504  token = Select(Token::ASSIGN_BIT_OR);
505  } else {
506  token = Token::BIT_OR;
507  }
508  break;
509 
510  case '^':
511  // ^ ^=
512  token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
513  break;
514 
515  case '.':
516  // . Number
517  Advance();
518  if (IsDecimalDigit(c0_)) {
519  token = ScanNumber(true);
520  } else {
521  token = Token::PERIOD;
522  }
523  break;
524 
525  case ':':
526  token = Select(Token::COLON);
527  break;
528 
529  case ';':
530  token = Select(Token::SEMICOLON);
531  break;
532 
533  case ',':
534  token = Select(Token::COMMA);
535  break;
536 
537  case '(':
538  token = Select(Token::LPAREN);
539  break;
540 
541  case ')':
542  token = Select(Token::RPAREN);
543  break;
544 
545  case '[':
546  token = Select(Token::LBRACK);
547  break;
548 
549  case ']':
550  token = Select(Token::RBRACK);
551  break;
552 
553  case '{':
554  token = Select(Token::LBRACE);
555  break;
556 
557  case '}':
558  token = Select(Token::RBRACE);
559  break;
560 
561  case '?':
562  token = Select(Token::CONDITIONAL);
563  break;
564 
565  case '~':
566  token = Select(Token::BIT_NOT);
567  break;
568 
569  default:
570  if (unicode_cache_->IsIdentifierStart(c0_)) {
571  token = ScanIdentifierOrKeyword();
572  } else if (IsDecimalDigit(c0_)) {
573  token = ScanNumber(false);
574  } else if (SkipWhiteSpace()) {
575  token = Token::WHITESPACE;
576  } else if (c0_ < 0) {
577  token = Token::EOS;
578  } else {
579  token = Select(Token::ILLEGAL);
580  }
581  break;
582  }
583 
584  // Continue scanning for tokens as long as we're just skipping
585  // whitespace.
586  } while (token == Token::WHITESPACE);
587 
588  next_.location.end_pos = source_pos();
589  next_.token = token;
590 }
591 
592 
593 void Scanner::SeekForward(int pos) {
594  // After this call, we will have the token at the given position as
595  // the "next" token. The "current" token will be invalid.
596  if (pos == next_.location.beg_pos) return;
597  int current_pos = source_pos();
598  ASSERT_EQ(next_.location.end_pos, current_pos);
599  // Positions inside the lookahead token aren't supported.
600  ASSERT(pos >= current_pos);
601  if (pos != current_pos) {
602  source_->SeekForward(pos - source_->pos());
603  Advance();
604  // This function is only called to seek to the location
605  // of the end of a function (at the "}" token). It doesn't matter
606  // whether there was a line terminator in the part we skip.
607  has_line_terminator_before_next_ = false;
608  has_multiline_comment_before_next_ = false;
609  }
610  Scan();
611 }
612 
613 
614 bool Scanner::ScanEscape() {
615  uc32 c = c0_;
616  Advance();
617 
618  // Skip escaped newlines.
619  if (unicode_cache_->IsLineTerminator(c)) {
620  // Allow CR+LF newlines in multiline string literals.
621  if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
622  // Allow LF+CR newlines in multiline string literals.
623  if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
624  return true;
625  }
626 
627  switch (c) {
628  case '\'': // fall through
629  case '"' : // fall through
630  case '\\': break;
631  case 'b' : c = '\b'; break;
632  case 'f' : c = '\f'; break;
633  case 'n' : c = '\n'; break;
634  case 'r' : c = '\r'; break;
635  case 't' : c = '\t'; break;
636  case 'u' : {
637  c = ScanHexNumber(4);
638  if (c < 0) return false;
639  break;
640  }
641  case 'v' : c = '\v'; break;
642  case 'x' : {
643  c = ScanHexNumber(2);
644  if (c < 0) return false;
645  break;
646  }
647  case '0' : // fall through
648  case '1' : // fall through
649  case '2' : // fall through
650  case '3' : // fall through
651  case '4' : // fall through
652  case '5' : // fall through
653  case '6' : // fall through
654  case '7' : c = ScanOctalEscape(c, 2); break;
655  }
656 
657  // According to ECMA-262, section 7.8.4, characters not covered by the
658  // above cases should be illegal, but they are commonly handled as
659  // non-escaped characters by JS VMs.
660  AddLiteralChar(c);
661  return true;
662 }
663 
664 
665 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
666 // ECMA-262. Other JS VMs support them.
668  uc32 x = c - '0';
669  int i = 0;
670  for (; i < length; i++) {
671  int d = c0_ - '0';
672  if (d < 0 || d > 7) break;
673  int nx = x * 8 + d;
674  if (nx >= 256) break;
675  x = nx;
676  Advance();
677  }
678  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
679  // Remember the position of octal escape sequences so that an error
680  // can be reported later (in strict mode).
681  // We don't report the error immediately, because the octal escape can
682  // occur before the "use strict" directive.
683  if (c != '0' || i > 0) {
684  octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
685  }
686  return x;
687 }
688 
689 
690 Token::Value Scanner::ScanString() {
691  uc32 quote = c0_;
692  Advance(); // consume quote
693 
694  LiteralScope literal(this);
695  while (c0_ != quote && c0_ >= 0
696  && !unicode_cache_->IsLineTerminator(c0_)) {
697  uc32 c = c0_;
698  Advance();
699  if (c == '\\') {
700  if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL;
701  } else {
702  AddLiteralChar(c);
703  }
704  }
705  if (c0_ != quote) return Token::ILLEGAL;
706  literal.Complete();
707 
708  Advance(); // consume quote
709  return Token::STRING;
710 }
711 
712 
713 void Scanner::ScanDecimalDigits() {
714  while (IsDecimalDigit(c0_))
715  AddLiteralCharAdvance();
716 }
717 
718 
719 Token::Value Scanner::ScanNumber(bool seen_period) {
720  ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
721 
722  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
723 
724  LiteralScope literal(this);
725  if (seen_period) {
726  // we have already seen a decimal point of the float
727  AddLiteralChar('.');
728  ScanDecimalDigits(); // we know we have at least one digit
729 
730  } else {
731  // if the first character is '0' we must check for octals and hex
732  if (c0_ == '0') {
733  int start_pos = source_pos(); // For reporting octal positions.
734  AddLiteralCharAdvance();
735 
736  // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
737  if (c0_ == 'x' || c0_ == 'X') {
738  // hex number
739  kind = HEX;
740  AddLiteralCharAdvance();
741  if (!IsHexDigit(c0_)) {
742  // we must have at least one hex digit after 'x'/'X'
743  return Token::ILLEGAL;
744  }
745  while (IsHexDigit(c0_)) {
746  AddLiteralCharAdvance();
747  }
748  } else if ('0' <= c0_ && c0_ <= '7') {
749  // (possible) octal number
750  kind = OCTAL;
751  while (true) {
752  if (c0_ == '8' || c0_ == '9') {
753  kind = DECIMAL;
754  break;
755  }
756  if (c0_ < '0' || '7' < c0_) {
757  // Octal literal finished.
758  octal_pos_ = Location(start_pos, source_pos());
759  break;
760  }
761  AddLiteralCharAdvance();
762  }
763  }
764  }
765 
766  // Parse decimal digits and allow trailing fractional part.
767  if (kind == DECIMAL) {
768  ScanDecimalDigits(); // optional
769  if (c0_ == '.') {
770  AddLiteralCharAdvance();
771  ScanDecimalDigits(); // optional
772  }
773  }
774  }
775 
776  // scan exponent, if any
777  if (c0_ == 'e' || c0_ == 'E') {
778  ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
779  if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
780  // scan exponent
781  AddLiteralCharAdvance();
782  if (c0_ == '+' || c0_ == '-')
783  AddLiteralCharAdvance();
784  if (!IsDecimalDigit(c0_)) {
785  // we must have at least one decimal digit after 'e'/'E'
786  return Token::ILLEGAL;
787  }
788  ScanDecimalDigits();
789  }
790 
791  // The source character immediately following a numeric literal must
792  // not be an identifier start or a decimal digit; see ECMA-262
793  // section 7.8.3, page 17 (note that we read only one decimal digit
794  // if the value is 0).
795  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
796  return Token::ILLEGAL;
797 
798  literal.Complete();
799 
800  return Token::NUMBER;
801 }
802 
803 
804 uc32 Scanner::ScanIdentifierUnicodeEscape() {
805  Advance();
806  if (c0_ != 'u') return -1;
807  Advance();
808  uc32 result = ScanHexNumber(4);
809  if (result < 0) PushBack('u');
810  return result;
811 }
812 
813 
814 // ----------------------------------------------------------------------------
815 // Keyword Matcher
816 
// List of all keywords and reserved words, grouped by first character.
// KEYWORD_GROUP(ch) introduces the group of words starting with |ch| and
// KEYWORD(text, token) maps one word to its token. A few entries pick their
// token from harmony_modules / harmony_scoping, which must be in scope where
// the list is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                                \
  KEYWORD_GROUP('b')                                                    \
  KEYWORD("break", Token::BREAK)                                        \
  KEYWORD_GROUP('c')                                                    \
  KEYWORD("case", Token::CASE)                                          \
  KEYWORD("catch", Token::CATCH)                                        \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                         \
  KEYWORD("const", Token::CONST)                                        \
  KEYWORD("continue", Token::CONTINUE)                                  \
  KEYWORD_GROUP('d')                                                    \
  KEYWORD("debugger", Token::DEBUGGER)                                  \
  KEYWORD("default", Token::DEFAULT)                                    \
  KEYWORD("delete", Token::DELETE)                                      \
  KEYWORD("do", Token::DO)                                              \
  KEYWORD_GROUP('e')                                                    \
  KEYWORD("else", Token::ELSE)                                          \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                          \
  KEYWORD("export", harmony_modules                                     \
                    ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)      \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                       \
  KEYWORD_GROUP('f')                                                    \
  KEYWORD("false", Token::FALSE_LITERAL)                                \
  KEYWORD("finally", Token::FINALLY)                                    \
  KEYWORD("for", Token::FOR)                                            \
  KEYWORD("function", Token::FUNCTION)                                  \
  KEYWORD_GROUP('i')                                                    \
  KEYWORD("if", Token::IF)                                              \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD("import", harmony_modules                                     \
                    ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)      \
  KEYWORD("in", Token::IN)                                              \
  KEYWORD("instanceof", Token::INSTANCEOF)                              \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)              \
  KEYWORD_GROUP('l')                                                    \
  KEYWORD("let", harmony_scoping                                        \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('n')                                                    \
  KEYWORD("new", Token::NEW)                                            \
  KEYWORD("null", Token::NULL_LITERAL)                                  \
  KEYWORD_GROUP('p')                                                    \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)                \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)                \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)              \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)                 \
  KEYWORD_GROUP('r')                                                    \
  KEYWORD("return", Token::RETURN)                                      \
  KEYWORD_GROUP('s')                                                    \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)                 \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                         \
  KEYWORD("switch", Token::SWITCH)                                      \
  KEYWORD_GROUP('t')                                                    \
  KEYWORD("this", Token::THIS)                                          \
  KEYWORD("throw", Token::THROW)                                        \
  KEYWORD("true", Token::TRUE_LITERAL)                                  \
  KEYWORD("try", Token::TRY)                                            \
  KEYWORD("typeof", Token::TYPEOF)                                      \
  KEYWORD_GROUP('v')                                                    \
  KEYWORD("var", Token::VAR)                                            \
  KEYWORD("void", Token::VOID)                                          \
  KEYWORD_GROUP('w')                                                    \
  KEYWORD("while", Token::WHILE)                                        \
  KEYWORD("with", Token::WITH)                                          \
  KEYWORD_GROUP('y')                                                    \
  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
881 
882 
883 static Token::Value KeywordOrIdentifierToken(const char* input,
884  int input_length,
885  bool harmony_scoping,
886  bool harmony_modules) {
887  ASSERT(input_length >= 1);
888  const int kMinLength = 2;
889  const int kMaxLength = 10;
890  if (input_length < kMinLength || input_length > kMaxLength) {
891  return Token::IDENTIFIER;
892  }
893  switch (input[0]) {
894  default:
895 #define KEYWORD_GROUP_CASE(ch) \
896  break; \
897  case ch:
898 #define KEYWORD(keyword, token) \
899  { \
900  /* 'keyword' is a char array, so sizeof(keyword) is */ \
901  /* strlen(keyword) plus 1 for the NUL char. */ \
902  const int keyword_length = sizeof(keyword) - 1; \
903  STATIC_ASSERT(keyword_length >= kMinLength); \
904  STATIC_ASSERT(keyword_length <= kMaxLength); \
905  if (input_length == keyword_length && \
906  input[1] == keyword[1] && \
907  (keyword_length <= 2 || input[2] == keyword[2]) && \
908  (keyword_length <= 3 || input[3] == keyword[3]) && \
909  (keyword_length <= 4 || input[4] == keyword[4]) && \
910  (keyword_length <= 5 || input[5] == keyword[5]) && \
911  (keyword_length <= 6 || input[6] == keyword[6]) && \
912  (keyword_length <= 7 || input[7] == keyword[7]) && \
913  (keyword_length <= 8 || input[8] == keyword[8]) && \
914  (keyword_length <= 9 || input[9] == keyword[9])) { \
915  return token; \
916  } \
917  }
919  }
920  return Token::IDENTIFIER;
921 }
922 
923 
924 Token::Value Scanner::ScanIdentifierOrKeyword() {
925  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
926  LiteralScope literal(this);
927  // Scan identifier start character.
928  if (c0_ == '\\') {
929  uc32 c = ScanIdentifierUnicodeEscape();
930  // Only allow legal identifier start characters.
931  if (c < 0 ||
932  c == '\\' || // No recursive escapes.
933  !unicode_cache_->IsIdentifierStart(c)) {
934  return Token::ILLEGAL;
935  }
936  AddLiteralChar(c);
937  return ScanIdentifierSuffix(&literal);
938  }
939 
940  uc32 first_char = c0_;
941  Advance();
942  AddLiteralChar(first_char);
943 
944  // Scan the rest of the identifier characters.
945  while (unicode_cache_->IsIdentifierPart(c0_)) {
946  if (c0_ != '\\') {
947  uc32 next_char = c0_;
948  Advance();
949  AddLiteralChar(next_char);
950  continue;
951  }
952  // Fallthrough if no longer able to complete keyword.
953  return ScanIdentifierSuffix(&literal);
954  }
955 
956  literal.Complete();
957 
958  if (next_.literal_chars->is_ascii()) {
959  Vector<const char> chars = next_.literal_chars->ascii_literal();
960  return KeywordOrIdentifierToken(chars.start(),
961  chars.length(),
962  harmony_scoping_,
963  harmony_modules_);
964  }
965 
966  return Token::IDENTIFIER;
967 }
968 
969 
970 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
971  // Scan the rest of the identifier characters.
972  while (unicode_cache_->IsIdentifierPart(c0_)) {
973  if (c0_ == '\\') {
974  uc32 c = ScanIdentifierUnicodeEscape();
975  // Only allow legal identifier part characters.
976  if (c < 0 ||
977  c == '\\' ||
978  !unicode_cache_->IsIdentifierPart(c)) {
979  return Token::ILLEGAL;
980  }
981  AddLiteralChar(c);
982  } else {
983  AddLiteralChar(c0_);
984  Advance();
985  }
986  }
987  literal->Complete();
988 
989  return Token::IDENTIFIER;
990 }
991 
992 
993 bool Scanner::ScanRegExpPattern(bool seen_equal) {
994  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
995  bool in_character_class = false;
996 
997  // Previous token is either '/' or '/=', in the second case, the
998  // pattern starts at =.
999  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1000  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1001 
1002  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1003  // the scanner should pass uninterpreted bodies to the RegExp
1004  // constructor.
1005  LiteralScope literal(this);
1006  if (seen_equal) {
1007  AddLiteralChar('=');
1008  }
1009 
1010  while (c0_ != '/' || in_character_class) {
1011  if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1012  if (c0_ == '\\') { // Escape sequence.
1013  AddLiteralCharAdvance();
1014  if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1015  AddLiteralCharAdvance();
1016  // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1017  // only "safe" characters are allowed (letters, digits, underscore),
1018  // otherwise the escape isn't valid and the invalid character has
1019  // its normal meaning. I.e., we can just continue scanning without
1020  // worrying whether the following characters are part of the escape
1021  // or not, since any '/', '\\' or '[' is guaranteed to not be part
1022  // of the escape sequence.
1023 
1024  // TODO(896): At some point, parse RegExps more throughly to capture
1025  // octal esacpes in strict mode.
1026  } else { // Unescaped character.
1027  if (c0_ == '[') in_character_class = true;
1028  if (c0_ == ']') in_character_class = false;
1029  AddLiteralCharAdvance();
1030  }
1031  }
1032  Advance(); // consume '/'
1033 
1034  literal.Complete();
1035 
1036  return true;
1037 }
1038 
1039 
1040 bool Scanner::ScanLiteralUnicodeEscape() {
1041  ASSERT(c0_ == '\\');
1042  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1043  Advance();
1044  int i = 1;
1045  if (c0_ == 'u') {
1046  i++;
1047  while (i < 6) {
1048  Advance();
1049  if (!IsHexDigit(c0_)) break;
1050  chars_read[i] = c0_;
1051  i++;
1052  }
1053  }
1054  if (i < 6) {
1055  // Incomplete escape. Undo all advances and return false.
1056  while (i > 0) {
1057  i--;
1058  PushBack(chars_read[i]);
1059  }
1060  return false;
1061  }
1062  // Complete escape. Add all chars to current literal buffer.
1063  for (int i = 0; i < 6; i++) {
1064  AddLiteralChar(chars_read[i]);
1065  }
1066  return true;
1067 }
1068 
1069 
1071  // Scan regular expression flags.
1072  LiteralScope literal(this);
1073  while (unicode_cache_->IsIdentifierPart(c0_)) {
1074  if (c0_ != '\\') {
1075  AddLiteralCharAdvance();
1076  } else {
1077  if (!ScanLiteralUnicodeEscape()) {
1078  break;
1079  }
1080  }
1081  }
1082  literal.Complete();
1083 
1084  next_.location.end_pos = source_pos() - 1;
1085  return true;
1086 }
1087 
1088 } } // namespace v8::internal
bool IsIdentifierPart(unibrow::uchar c)
Definition: scanner.h:155
uc32 ScanOctalEscape(uc32 c, int length)
Definition: scanner.cc:667
int32_t uc32
Definition: globals.h:274
bool IsWhiteSpace(unibrow::uchar c)
Definition: scanner.h:157
Scanner(UnicodeCache *scanner_contants)
Definition: scanner.cc:41
#define ASSERT(condition)
Definition: checks.h:270
#define KEYWORD(keyword, token)
uint8_t byte
Definition: globals.h:171
STATIC_ASSERT((FixedDoubleArray::kHeaderSize &kDoubleAlignmentMask)==0)
void SeekForward(int pos)
Definition: scanner.cc:593
#define KEYWORD_GROUP_CASE(ch)
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)
Definition: scanner.cc:817
int HexValue(uc32 c)
Definition: scanner.h:66
void Initialize(Utf16CharacterStream *source)
Definition: scanner.cc:48
bool IsLineTerminator(unibrow::uchar c)
Definition: scanner.h:156
bool IsCarriageReturn(uc32 c)
#define ASSERT_EQ(v1, v2)
Definition: checks.h:271
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination trace on stack replacement optimize closures functions with arguments object optimize functions containing for in loops profiler considers IC stability primitive functions trigger their own optimization re try self optimization if it failed insert an interrupt check at function exit execution budget before interrupt is triggered call count before self optimization self_optimization count_based_interrupts weighted_back_edges trace_opt emit comments in code disassembly enable use of SSE3 instructions if available enable use of CMOV instruction if available enable use of SAHF instruction if enable use of VFP3 instructions if available this implies enabling ARMv7 enable use of ARMv7 instructions if enable use of MIPS FPU instructions if NULL
Definition: flags.cc:274
Token::Value Next()
Definition: scanner.cc:224
bool IsHexDigit(uc32 c)
bool IsIdentifierStart(unibrow::uchar c)
Definition: scanner.h:154
unsigned SeekForward(unsigned code_unit_count)
Definition: scanner.h:110
bool ScanRegExpPattern(bool seen_equal)
Definition: scanner.cc:993
bool IsDecimalDigit(uc32 c)
bool IsLineFeed(uc32 c)