v8  3.11.10(node0.8.26)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scanner-character-streams.cc
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #include "v8.h"
29 
31 
32 #include "handles.h"
33 #include "unicode-inl.h"
34 
35 namespace v8 {
36 namespace internal {
37 
38 // ----------------------------------------------------------------------------
39 // BufferedUtf16CharacterStreams
40 
43  pushback_limit_(NULL) {
44  // Initialize buffer as being empty. First read will fill the buffer.
47 }
48 
50 
52  if (character == kEndOfInput) {
53  pos_--;
54  return;
55  }
57  // buffer_ is writable, buffer_cursor_ is const pointer.
58  buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
59  pos_--;
60  return;
61  }
62  SlowPushBack(static_cast<uc16>(character));
63 }
64 
65 
67  // In pushback mode, the end of the buffer contains pushback,
68  // and the start of the buffer (from buffer start to pushback_limit_)
69  // contains valid data that comes just after the pushback.
70  // We NULL the pushback_limit_ if pushing all the way back to the
71  // start of the buffer.
72 
73  if (pushback_limit_ == NULL) {
74  // Enter pushback mode.
78  }
79  // Ensure that there is room for at least one pushback.
81  ASSERT(pos_ > 0);
82  buffer_[--buffer_cursor_ - buffer_] = character;
83  if (buffer_cursor_ == buffer_) {
85  } else if (buffer_cursor_ < pushback_limit_) {
87  }
88  pos_--;
89 }
90 
91 
94  if (pushback_limit_ != NULL) {
95  // Leave pushback mode.
98  // If there were any valid characters left at the
99  // start of the buffer, use those.
100  if (buffer_cursor_ < buffer_end_) return true;
101  // Otherwise read a new block.
102  }
103  unsigned length = FillBuffer(pos_, kBufferSize);
104  buffer_end_ = buffer_ + length;
105  return length > 0;
106 }
107 
108 
110  // Leave pushback mode (i.e., ignore that there might be valid data
111  // in the buffer before the pushback_limit_ point).
113  return BufferSeekForward(delta);
114 }
115 
116 // ----------------------------------------------------------------------------
117 // GenericStringUtf16CharacterStream
118 
119 
121  Handle<String> data,
122  unsigned start_position,
123  unsigned end_position)
124  : string_(data),
125  length_(end_position) {
126  ASSERT(end_position >= start_position);
129  pos_ = start_position;
130 }
131 
132 
134 
135 
137  unsigned old_pos = pos_;
138  pos_ = Min(pos_ + delta, length_);
139  ReadBlock();
140  return pos_ - old_pos;
141 }
142 
143 
145  unsigned length) {
146  if (from_pos >= length_) return 0;
147  if (from_pos + length > length_) {
148  length = length_ - from_pos;
149  }
150  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
151  return length;
152 }
153 
154 
155 // ----------------------------------------------------------------------------
156 // Utf8ToUtf16CharacterStream
158  unsigned length)
160  raw_data_(data),
161  raw_data_length_(length),
162  raw_data_pos_(0),
163  raw_character_position_(0) {
164  ReadBlock();
165 }
166 
167 
169 
170 
172  unsigned old_pos = pos_;
173  unsigned target_pos = pos_ + delta;
174  SetRawPosition(target_pos);
176  ReadBlock();
177  return pos_ - old_pos;
178 }
179 
180 
181 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
182  unsigned length) {
183  static const unibrow::uchar kMaxUtf16Character = 0xffff;
184  SetRawPosition(char_position);
185  if (raw_character_position_ != char_position) {
186  // char_position was not a valid position in the stream (hit the end
187  // while spooling to it).
188  return 0u;
189  }
190  unsigned i = 0;
191  while (i < length - 1) {
192  if (raw_data_pos_ == raw_data_length_) break;
195  raw_data_pos_++;
196  } else {
199  &raw_data_pos_);
200  }
201  if (c > kMaxUtf16Character) {
204  } else {
205  buffer_[i++] = static_cast<uc16>(c);
206  }
207  }
208  raw_character_position_ = char_position + i;
209  return i;
210 }
211 
212 
213 static const byte kUtf8MultiByteMask = 0xC0;
214 static const byte kUtf8MultiByteCharStart = 0xC0;
215 static const byte kUtf8MultiByteCharFollower = 0x80;
216 
217 
218 #ifdef DEBUG
219 static bool IsUtf8MultiCharacterStart(byte first_byte) {
220  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
221 }
222 #endif
223 
224 
225 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
226  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
227 }
228 
229 
230 // Move the cursor back to point at the preceding UTF-8 character start
231 // in the buffer.
232 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
233  byte character = buffer[--*cursor];
234  if (character > unibrow::Utf8::kMaxOneByteChar) {
235  ASSERT(IsUtf8MultiCharacterFollower(character));
236  // Last byte of a multi-byte character encoding. Step backwards until
237  // pointing to the first byte of the encoding, recognized by having the
238  // top two bits set.
239  while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
240  ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
241  }
242 }
243 
244 
245 // Move the cursor forward to point at the next following UTF-8 character start
246 // in the buffer.
247 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
248  byte character = buffer[(*cursor)++];
249  if (character > unibrow::Utf8::kMaxOneByteChar) {
250  // First character of a multi-byte character encoding.
251  // The number of most-significant one-bits determines the length of the
252  // encoding:
253  // 110..... - (0xCx, 0xDx) one additional byte (minimum).
254  // 1110.... - (0xEx) two additional bytes.
255  // 11110... - (0xFx) three additional bytes (maximum).
256  ASSERT(IsUtf8MultiCharacterStart(character));
257  // Additional bytes is:
258  // 1 if value in range 0xC0 .. 0xDF.
259  // 2 if value in range 0xE0 .. 0xEF.
260  // 3 if value in range 0xF0 .. 0xF7.
261  // Encode that in a single value.
262  unsigned additional_bytes =
263  ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
264  *cursor += additional_bytes;
265  ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
266  }
267 }
268 
269 
270 // This can't set a raw position between two surrogate pairs, since there
271 // is no position in the UTF8 stream that corresponds to that. This assumes
272 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If
273 // it is illegally coded as two 3 byte sequences then there is no problem here.
274 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
275  if (raw_character_position_ > target_position) {
276  // Spool backwards in utf8 buffer.
277  do {
278  int old_pos = raw_data_pos_;
279  Utf8CharacterBack(raw_data_, &raw_data_pos_);
281  ASSERT(old_pos - raw_data_pos_ <= 4);
282  // Step back over both code units for surrogate pairs.
283  if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
284  } while (raw_character_position_ > target_position);
285  // No surrogate pair splitting.
286  ASSERT(raw_character_position_ == target_position);
287  return;
288  }
289  // Spool forwards in the utf8 buffer.
290  while (raw_character_position_ < target_position) {
291  if (raw_data_pos_ == raw_data_length_) return;
292  int old_pos = raw_data_pos_;
293  Utf8CharacterForward(raw_data_, &raw_data_pos_);
295  ASSERT(raw_data_pos_ - old_pos <= 4);
296  if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
297  }
298  // No surrogate pair splitting.
299  ASSERT(raw_character_position_ == target_position);
300 }
301 
302 
303 // ----------------------------------------------------------------------------
304 // ExternalTwoByteStringUtf16CharacterStream
305 
308 
309 
313  int start_position,
314  int end_position)
316  source_(data),
317  raw_data_(data->GetTwoByteData(start_position)) {
318  buffer_cursor_ = raw_data_,
319  buffer_end_ = raw_data_ + (end_position - start_position);
320  pos_ = start_position;
321 }
322 
323 } } // namespace v8::internal
static uchar TrailSurrogate(int char_code)
Definition: unicode.h:146
static uchar LeadSurrogate(int char_code)
Definition: unicode.h:143
virtual unsigned FillBuffer(unsigned position, unsigned length)=0
static const uc32 kEndOfInput
Definition: scanner.h:128
int32_t uc32
Definition: globals.h:274
#define ASSERT(condition)
Definition: checks.h:270
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:210
uint8_t byte
Definition: globals.h:171
virtual unsigned BufferSeekForward(unsigned delta)=0
Utf8ToUtf16CharacterStream(const byte *data, unsigned length)
GenericStringUtf16CharacterStream(Handle< String > data, unsigned start_position, unsigned end_position)
uint16_t uc16
Definition: globals.h:273
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination trace on stack replacement optimize closures functions with arguments object optimize functions containing for in loops profiler considers IC stability primitive functions trigger their own optimization re try self optimization if it failed insert an interrupt check at function exit execution budget before interrupt is triggered call count before self optimization self_optimization count_based_interrupts weighted_back_edges trace_opt emit comments in code disassembly enable use of SSE3 instructions if available enable use of CMOV instruction if available enable use of SAHF instruction if enable use of VFP3 instructions if available this implies enabling ARMv7 enable use of ARMv7 instructions if enable use of MIPS FPU instructions if NULL
Definition: flags.cc:274
static const unsigned kMaxOneByteChar
Definition: unicode.h:164
ExternalTwoByteStringUtf16CharacterStream(Handle< ExternalTwoByteString > data, int start_position, int end_position)
virtual unsigned BufferSeekForward(unsigned delta)
virtual unsigned FillBuffer(unsigned char_position, unsigned length)
T Min(T a, T b)
Definition: utils.h:229
virtual unsigned FillBuffer(unsigned position, unsigned length)
unsigned int uchar
Definition: unicode.h:40