v8  3.25.30(node0.11.13)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scanner-character-streams.cc
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #include "v8.h"
29 
31 
32 #include "handles.h"
33 #include "unicode-inl.h"
34 
35 namespace v8 {
36 namespace internal {
37 
38 // ----------------------------------------------------------------------------
39 // BufferedUtf16CharacterStreams
40 
43  pushback_limit_(NULL) {
44  // Initialize buffer as being empty. First read will fill the buffer.
47 }
48 
49 
51 
53  if (character == kEndOfInput) {
54  pos_--;
55  return;
56  }
58  // buffer_ is writable, buffer_cursor_ is const pointer.
59  buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
60  pos_--;
61  return;
62  }
63  SlowPushBack(static_cast<uc16>(character));
64 }
65 
66 
68  // In pushback mode, the end of the buffer contains pushback,
69  // and the start of the buffer (from buffer start to pushback_limit_)
70  // contains valid data that comes just after the pushback.
71  // We NULL the pushback_limit_ if pushing all the way back to the
72  // start of the buffer.
73 
74  if (pushback_limit_ == NULL) {
75  // Enter pushback mode.
79  }
80  // Ensure that there is room for at least one pushback.
82  ASSERT(pos_ > 0);
83  buffer_[--buffer_cursor_ - buffer_] = character;
84  if (buffer_cursor_ == buffer_) {
86  } else if (buffer_cursor_ < pushback_limit_) {
88  }
89  pos_--;
90 }
91 
92 
95  if (pushback_limit_ != NULL) {
96  // Leave pushback mode.
99  // If there were any valid characters left at the
100  // start of the buffer, use those.
101  if (buffer_cursor_ < buffer_end_) return true;
102  // Otherwise read a new block.
103  }
104  unsigned length = FillBuffer(pos_, kBufferSize);
105  buffer_end_ = buffer_ + length;
106  return length > 0;
107 }
108 
109 
111  // Leave pushback mode (i.e., ignore that there might be valid data
112  // in the buffer before the pushback_limit_ point).
114  return BufferSeekForward(delta);
115 }
116 
117 
118 // ----------------------------------------------------------------------------
119 // GenericStringUtf16CharacterStream
120 
121 
123  Handle<String> data,
124  unsigned start_position,
125  unsigned end_position)
126  : string_(data),
127  length_(end_position) {
128  ASSERT(end_position >= start_position);
131  pos_ = start_position;
132 }
133 
134 
136 
137 
139  unsigned old_pos = pos_;
140  pos_ = Min(pos_ + delta, length_);
141  ReadBlock();
142  return pos_ - old_pos;
143 }
144 
145 
147  unsigned length) {
148  if (from_pos >= length_) return 0;
149  if (from_pos + length > length_) {
150  length = length_ - from_pos;
151  }
152  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
153  return length;
154 }
155 
156 
157 // ----------------------------------------------------------------------------
158 // Utf8ToUtf16CharacterStream
160  unsigned length)
162  raw_data_(data),
163  raw_data_length_(length),
164  raw_data_pos_(0),
165  raw_character_position_(0) {
166  ReadBlock();
167 }
168 
169 
171 
172 
174  unsigned old_pos = pos_;
175  unsigned target_pos = pos_ + delta;
176  SetRawPosition(target_pos);
178  ReadBlock();
179  return pos_ - old_pos;
180 }
181 
182 
183 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
184  unsigned length) {
185  static const unibrow::uchar kMaxUtf16Character = 0xffff;
186  SetRawPosition(char_position);
187  if (raw_character_position_ != char_position) {
188  // char_position was not a valid position in the stream (hit the end
189  // while spooling to it).
190  return 0u;
191  }
192  unsigned i = 0;
193  while (i < length - 1) {
194  if (raw_data_pos_ == raw_data_length_) break;
197  raw_data_pos_++;
198  } else {
201  &raw_data_pos_);
202  }
203  if (c > kMaxUtf16Character) {
206  } else {
207  buffer_[i++] = static_cast<uc16>(c);
208  }
209  }
210  raw_character_position_ = char_position + i;
211  return i;
212 }
213 
214 
// Mask selecting the two most-significant bits of a UTF-8 byte; the
// IsUtf8MultiCharacter* predicates below apply it before comparing.
215 static const byte kUtf8MultiByteMask = 0xC0;
// Masked value (bit pattern 10xxxxxx) that IsUtf8MultiCharacterFollower
// treats as a non-leading byte of a multi-byte sequence.
216 static const byte kUtf8MultiByteCharFollower = 0x80;
217 
218 
#ifdef DEBUG
// Masked value (bit pattern 11xxxxxx) carried by the first byte of a
// multi-byte UTF-8 sequence.
static const byte kUtf8MultiByteCharStart = 0xC0;

// Debug-only predicate, used from ASSERTs: true when both of the top two
// bits of first_byte are set.
static bool IsUtf8MultiCharacterStart(byte first_byte) {
  const int top_two_bits = first_byte & kUtf8MultiByteMask;
  return top_two_bits == kUtf8MultiByteCharStart;
}
#endif
225 
226 
227 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
228  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
229 }
230 
231 
232 // Move the cursor back to point at the preceding UTF-8 character start
233 // in the buffer.
234 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
235  byte character = buffer[--*cursor];
236  if (character > unibrow::Utf8::kMaxOneByteChar) {
237  ASSERT(IsUtf8MultiCharacterFollower(character));
238  // Last byte of a multi-byte character encoding. Step backwards until
239  // pointing to the first byte of the encoding, recognized by having the
240  // top two bits set.
241  while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
242  ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
243  }
244 }
245 
246 
247 // Move the cursor forward to point at the next following UTF-8 character start
248 // in the buffer.
249 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
250  byte character = buffer[(*cursor)++];
251  if (character > unibrow::Utf8::kMaxOneByteChar) {
252  // First character of a multi-byte character encoding.
253  // The number of most-significant one-bits determines the length of the
254  // encoding:
255  // 110..... - (0xCx, 0xDx) one additional byte (minimum).
256  // 1110.... - (0xEx) two additional bytes.
257  // 11110... - (0xFx) three additional bytes (maximum).
258  ASSERT(IsUtf8MultiCharacterStart(character));
259  // Additional bytes is:
260  // 1 if value in range 0xC0 .. 0xDF.
261  // 2 if value in range 0xE0 .. 0xEF.
262  // 3 if value in range 0xF0 .. 0xF7.
263  // Encode that in a single value.
264  unsigned additional_bytes =
265  ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
266  *cursor += additional_bytes;
267  ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
268  }
269 }
270 
271 
272 // This can't set a raw position between two surrogate pairs, since there
273 // is no position in the UTF8 stream that corresponds to that. This assumes
274 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If
275 // it is illegally coded as two 3 byte sequences then there is no problem here.
276 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
277  if (raw_character_position_ > target_position) {
278  // Spool backwards in utf8 buffer.
279  do {
280  int old_pos = raw_data_pos_;
281  Utf8CharacterBack(raw_data_, &raw_data_pos_);
283  ASSERT(old_pos - raw_data_pos_ <= 4);
284  // Step back over both code units for surrogate pairs.
285  if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
286  } while (raw_character_position_ > target_position);
287  // No surrogate pair splitting.
288  ASSERT(raw_character_position_ == target_position);
289  return;
290  }
291  // Spool forwards in the utf8 buffer.
292  while (raw_character_position_ < target_position) {
293  if (raw_data_pos_ == raw_data_length_) return;
294  int old_pos = raw_data_pos_;
295  Utf8CharacterForward(raw_data_, &raw_data_pos_);
297  ASSERT(raw_data_pos_ - old_pos <= 4);
298  if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
299  }
300  // No surrogate pair splitting.
301  ASSERT(raw_character_position_ == target_position);
302 }
303 
304 
305 // ----------------------------------------------------------------------------
306 // ExternalTwoByteStringUtf16CharacterStream
307 
310 
311 
315  int start_position,
316  int end_position)
318  source_(data),
319  raw_data_(data->GetTwoByteData(start_position)) {
320  buffer_cursor_ = raw_data_,
321  buffer_end_ = raw_data_ + (end_position - start_position);
322  pos_ = start_position;
323 }
324 
325 } } // namespace v8::internal
enable upcoming ES6 features enable harmony block scoping enable harmony enable harmony proxies enable harmony generators enable harmony numeric enable harmony string enable harmony math functions harmony_scoping harmony_symbols harmony_collections harmony_iteration harmony_strings harmony_scoping harmony_maths tracks arrays with only smi values Optimize object Array DOM strings and string pretenure call new trace pretenuring decisions of HAllocate instructions track fields with only smi values track fields with heap values track_fields track_fields Enables optimizations which favor memory size over execution speed use string slices optimization filter maximum number of GVN fix point iterations use function inlining use allocation folding eliminate write barriers targeting allocations in optimized code maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining crankshaft harvests type feedback from stub cache trace check elimination phase hydrogen tracing filter NULL
Definition: flags.cc:269
const uint16_t * buffer_cursor_
Definition: scanner.h:123
virtual unsigned FillBuffer(unsigned position, unsigned length)=0
static const uc32 kEndOfInput
Definition: scanner.h:114
int32_t uc32
Definition: globals.h:310
static uint16_t TrailSurrogate(uint32_t char_code)
Definition: unicode.h:134
const uint16_t * buffer_end_
Definition: scanner.h:124
#define ASSERT(condition)
Definition: checks.h:329
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:214
uint8_t byte
Definition: globals.h:185
static uint16_t LeadSurrogate(uint32_t char_code)
Definition: unicode.h:131
virtual unsigned BufferSeekForward(unsigned delta)=0
Utf8ToUtf16CharacterStream(const byte *data, unsigned length)
GenericStringUtf16CharacterStream(Handle< String > data, unsigned start_position, unsigned end_position)
uint16_t uc16
Definition: globals.h:309
static const unsigned kMaxOneByteChar
Definition: unicode.h:164
ExternalTwoByteStringUtf16CharacterStream(Handle< ExternalTwoByteString > data, int start_position, int end_position)
virtual unsigned BufferSeekForward(unsigned delta)
virtual unsigned FillBuffer(unsigned char_position, unsigned length)
T Min(T a, T b)
Definition: utils.h:234
virtual unsigned FillBuffer(unsigned position, unsigned length)
unsigned int uchar
Definition: unicode.h:40