v8  3.25.30(node0.11.13)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unicode-inl.h
Go to the documentation of this file.
1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_UNICODE_INL_H_
29 #define V8_UNICODE_INL_H_
30 
31 #include "unicode.h"
32 #include "checks.h"
33 #include "platform.h"
34 
35 namespace unibrow {
36 
37 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
38  CacheEntry entry = entries_[code_point & kMask];
39  if (entry.code_point_ == code_point) return entry.value_;
40  return CalculateValue(code_point);
41 }
42 
43 template <class T, int s> bool Predicate<T, s>::CalculateValue(
44  uchar code_point) {
45  bool result = T::Is(code_point);
46  entries_[code_point & kMask] = CacheEntry(code_point, result);
47  return result;
48 }
49 
50 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
51  uchar* result) {
52  CacheEntry entry = entries_[c & kMask];
53  if (entry.code_point_ == c) {
54  if (entry.offset_ == 0) {
55  return 0;
56  } else {
57  result[0] = c + entry.offset_;
58  return 1;
59  }
60  } else {
61  return CalculateValue(c, n, result);
62  }
63 }
64 
65 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
66  uchar* result) {
67  bool allow_caching = true;
68  int length = T::Convert(c, n, result, &allow_caching);
69  if (allow_caching) {
70  if (length == 1) {
71  entries_[c & kMask] = CacheEntry(c, result[0] - c);
72  return 1;
73  } else {
74  entries_[c & kMask] = CacheEntry(c, 0);
75  return 0;
76  }
77  } else {
78  return length;
79  }
80 }
81 
82 
85  switch (c) {
86  // These are equivalent characters in Unicode.
87  case 0x39c:
88  case 0x3bc:
89  return 0xb5;
90  // This is an uppercase of a Latin-1 character
91  // outside of Latin-1.
92  case 0x178:
93  return 0xff;
94  }
95  return 0;
96 }
97 
98 
99 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
100  static const int kMask = ~(1 << 6);
101  if (c <= kMaxOneByteChar) {
102  str[0] = c;
103  return 1;
104  }
105  str[0] = 0xC0 | (c >> 6);
106  str[1] = 0x80 | (c & kMask);
107  return 2;
108 }
109 
110 // Encode encodes the UTF-16 code units c and previous into the given str
111 // buffer, and combines surrogate code units into single code points. If
112 // replace_invalid is set to true, orphan surrogate code units will be replaced
113 // with kBadChar.
114 unsigned Utf8::Encode(char* str,
115  uchar c,
116  int previous,
117  bool replace_invalid) {
118  static const int kMask = ~(1 << 6);
119  if (c <= kMaxOneByteChar) {
120  str[0] = c;
121  return 1;
122  } else if (c <= kMaxTwoByteChar) {
123  str[0] = 0xC0 | (c >> 6);
124  str[1] = 0x80 | (c & kMask);
125  return 2;
126  } else if (c <= kMaxThreeByteChar) {
127  if (Utf16::IsSurrogatePair(previous, c)) {
128  const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
129  return Encode(str - kUnmatchedSize,
130  Utf16::CombineSurrogatePair(previous, c),
132  replace_invalid) - kUnmatchedSize;
133  } else if (replace_invalid &&
136  c = kBadChar;
137  }
138  str[0] = 0xE0 | (c >> 12);
139  str[1] = 0x80 | ((c >> 6) & kMask);
140  str[2] = 0x80 | (c & kMask);
141  return 3;
142  } else {
143  str[0] = 0xF0 | (c >> 18);
144  str[1] = 0x80 | ((c >> 12) & kMask);
145  str[2] = 0x80 | ((c >> 6) & kMask);
146  str[3] = 0x80 | (c & kMask);
147  return 4;
148  }
149 }
150 
151 
152 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
153  if (length <= 0) return kBadChar;
154  byte first = bytes[0];
155  // Characters between 0000 and 0007F are encoded as a single character
156  if (first <= kMaxOneByteChar) {
157  *cursor += 1;
158  return first;
159  }
160  return CalculateValue(bytes, length, cursor);
161 }
162 
163 unsigned Utf8::Length(uchar c, int previous) {
164  if (c <= kMaxOneByteChar) {
165  return 1;
166  } else if (c <= kMaxTwoByteChar) {
167  return 2;
168  } else if (c <= kMaxThreeByteChar) {
169  if (Utf16::IsTrailSurrogate(c) &&
170  Utf16::IsLeadSurrogate(previous)) {
172  }
173  return 3;
174  } else {
175  return 4;
176  }
177 }
178 
180  : unbuffered_start_(NULL),
181  utf16_length_(0),
182  last_byte_of_buffer_unused_(false) {}
183 
185  unsigned buffer_length,
186  const uint8_t* stream,
187  unsigned stream_length) {
188  Reset(buffer, buffer_length, stream, stream_length);
189 }
190 
191 template<unsigned kBufferSize>
192 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
193  : Utf8DecoderBase(buffer_,
194  kBufferSize,
195  reinterpret_cast<const uint8_t*>(stream),
196  length) {
197 }
198 
199 template<unsigned kBufferSize>
200 void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
201  Utf8DecoderBase::Reset(buffer_,
202  kBufferSize,
203  reinterpret_cast<const uint8_t*>(stream),
204  length);
205 }
206 
207 template <unsigned kBufferSize>
209  unsigned length) const {
210  ASSERT(length > 0);
211  if (length > utf16_length_) length = utf16_length_;
212  // memcpy everything in buffer.
213  unsigned buffer_length =
214  last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
215  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
216  v8::internal::OS::MemCopy(data, buffer_, memcpy_length*sizeof(uint16_t));
217  if (length <= buffer_length) return length;
218  ASSERT(unbuffered_start_ != NULL);
219  // Copy the rest the slow way.
220  WriteUtf16Slow(unbuffered_start_,
221  data + buffer_length,
222  length - buffer_length);
223  return length;
224 }
225 
226 } // namespace unibrow
227 
228 #endif // V8_UNICODE_INL_H_
enable upcoming ES6 features enable harmony block scoping enable harmony enable harmony proxies enable harmony generators enable harmony numeric enable harmony string enable harmony math functions harmony_scoping harmony_symbols harmony_collections harmony_iteration harmony_strings harmony_scoping harmony_maths tracks arrays with only smi values Optimize object Array DOM strings and string pretenure call new trace pretenuring decisions of HAllocate instructions track fields with only smi values track fields with heap values track_fields track_fields Enables optimizations which favor memory size over execution speed use string slices optimization filter maximum number of GVN fix point iterations use function inlining use allocation folding eliminate write barriers targeting allocations in optimized code maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining crankshaft harvests type feedback from stub cache trace check elimination phase hydrogen tracing filter NULL
bool get(uchar c)
Definition: unicode-inl.h:37
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:117
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:172
static const unsigned kMaxChar
Definition: unicode.h:141
static const unsigned kMaxTwoByteChar
Definition: unicode.h:165
static uint16_t ConvertNonLatin1ToLatin1(uint16_t)
Definition: unicode-inl.h:83
#define ASSERT(condition)
Definition: checks.h:329
unsigned short uint16_t
Definition: unicode.cc:46
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:214
void Reset(const char *stream, unsigned length)
Definition: unicode-inl.h:200
int get(uchar c, uchar n, uchar *result)
Definition: unicode-inl.h:50
static uchar Length(uchar chr, int previous)
Definition: unicode-inl.h:163
static void MemCopy(void *dest, const void *src, size_t size)
Definition: platform.h:399
static unsigned EncodeOneByte(char *out, uint8_t c)
Definition: unicode-inl.h:99
bool Is(Object *obj)
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
Definition: unicode-inl.h:114
static uchar ValueOf(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode-inl.h:152
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:171
static const uchar kBadChar
Definition: unicode.h:162
static bool IsSurrogatePair(int lead, int trail)
Definition: unicode.h:105
static const unsigned kMaxThreeByteChar
Definition: unicode.h:166
static const unsigned kMaxOneByteChar
Definition: unicode.h:164
void Reset(uint16_t *buffer, unsigned buffer_length, const uint8_t *stream, unsigned stream_length)
Definition: unicode.cc:284
static bool IsLeadSurrogate(int code)
Definition: unicode.h:108
static bool IsTrailSurrogate(int code)
Definition: unicode.h:112
unsigned char byte
Definition: unicode.h:41
unsigned WriteUtf16(uint16_t *data, unsigned length) const
Definition: unicode-inl.h:208
unsigned int uchar
Definition: unicode.h:40
static const int kNoPreviousCharacter
Definition: unicode.h:120