v8  3.25.30(node0.11.13)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unicode.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_UNICODE_H_
29 #define V8_UNICODE_H_
30 
31 #include <sys/types.h>
32 #include "globals.h"
38 namespace unibrow {
39 
// Code-point type used throughout this header (e.g. parameters such as
// `uchar c` and `uchar code_point`).
40 typedef unsigned int uchar;
// Raw byte type; Utf8::CalculateValue and Utf8::ValueOf take `const byte*`.
41 typedef unsigned char byte;
42 
// NOTE(review): source lines 43-46 (presumably the original comment for this
// constant) are missing from this listing. The largest kMaxWidth declared in
// this header is also 4 — confirm the intended relationship in unicode.cc.
47 const int kMaxMappingSize = 4;
48 
// Cache template holding `size` CacheEntry slots (256 by default). Each slot
// pairs a 21-bit code point with a 1-bit boolean value. get() and
// CalculateValue() are only declared here.
// NOTE(review): kMask = kSize - 1 presumably masks a code point into an
// entries_ index, which would require `size` to be a power of two — confirm
// against the definitions (not part of this listing).
49 template <class T, int size = 256>
50 class Predicate {
51  public:
52  inline Predicate() { }
53  inline bool get(uchar c);
54  private:
55  friend class Test;
56  bool CalculateValue(uchar c);
57  struct CacheEntry {
58  inline CacheEntry() : code_point_(0), value_(0) { }  // Default slot: code point 0, value false.
59  inline CacheEntry(uchar code_point, bool value)
60  : code_point_(code_point),
61  value_(value) { }
62  uchar code_point_ : 21;  // 21-bit field: holds values up to 0x1fffff.
63  bool value_ : 1;
64  };
65  static const int kSize = size;
66  static const int kMask = kSize - 1;
67  CacheEntry entries_[kSize];
68 };
69 
70 // A cache used in case conversion. It caches the value for characters
71 // that either have no mapping or map to a single character independent
72 // of context. Characters that map to more than one character or that
73 // map differently depending on context are always looked up.
// Storage is a fixed array of `size` CacheEntry slots (256 by default), each
// holding a code point and a signed offset. get() and CalculateValue() are
// only declared here.
// NOTE(review): `n` is presumably the character following `c`, and the int
// result presumably the number of uchars written to `result` — confirm
// against the definitions (not part of this listing).
74 template <class T, int size = 256>
75 class Mapping {
76  public:
77  inline Mapping() { }
78  inline int get(uchar c, uchar n, uchar* result);
79  private:
80  friend class Test;
81  int CalculateValue(uchar c, uchar n, uchar* result);
82  struct CacheEntry {
83  inline CacheEntry() : code_point_(kNoChar), offset_(0) { }  // Default slot: kNoChar with offset 0.
84  inline CacheEntry(uchar code_point, signed offset)
85  : code_point_(code_point),
86  offset_(offset) { }
87  uchar code_point_;
88  signed offset_;
89  static const int kNoChar = (1 << 21) - 1;  // 0x1fffff; marks a default-constructed slot.
90  };
91  static const int kSize = size;
92  static const int kMask = kSize - 1;
93  CacheEntry entries_[kSize];
94 };
95 
// Has no public members: GetByteCount() and kMaxCodePoint are private and
// therefore reachable only through the befriended Test class. Both are only
// declared here; their definitions are not part of this listing.
96 class UnicodeData {
97  private:
98  friend class Test;
99  static int GetByteCount();
100  static const uchar kMaxCodePoint;
101 };
102 
// Static helpers and constants for working with UTF-16 surrogate code units.
103 class Utf16 {
104  public:
     // True when `lead` passes IsLeadSurrogate and `trail` passes
     // IsTrailSurrogate.
105  static inline bool IsSurrogatePair(int lead, int trail) {
106  return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
107  }
     // Rejects kNoPreviousCharacter, then tests bits 10-15 against 0xd800.
     // NOTE(review): the 0xfc00 mask ignores bits above 15, so a value such as
     // 0x1d800 also matches — callers presumably pass 16-bit code units only;
     // confirm at the call sites.
108  static inline bool IsLeadSurrogate(int code) {
109  if (code == kNoPreviousCharacter) return false;
110  return (code & 0xfc00) == 0xd800;
111  }
     // Same shape as IsLeadSurrogate, testing bits 10-15 against 0xdc00; the
     // same caveat about bits above 15 applies.
112  static inline bool IsTrailSurrogate(int code) {
113  if (code == kNoPreviousCharacter) return false;
114  return (code & 0xfc00) == 0xdc00;
115  }
116 
     // Builds 0x10000 + (low 10 bits of lead << 10) + (low 10 bits of trail).
     // The arguments are not checked for actually being surrogates.
117  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
118  return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
119  }
     // Sentinel that IsLeadSurrogate and IsTrailSurrogate both reject.
120  static const int kNoPreviousCharacter = -1;
121  static const uchar kMaxNonSurrogateCharCode = 0xffff;
122  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
123  // of UTF-8 data. The special case where the unit is a surrogate
124  // trail produces 1 byte net, because the encoding of the pair is
125  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
126  // can be reclaimed.
     // NOTE(review): the constant this comment documents (source line 127) is
     // missing from this listing; the index below lists
     // kMaxExtraUtf8BytesForOneUtf16CodeUnit at unicode.h:127.
128  // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
129  // The illegality stems from the surrogate not being part of a pair.
130  static const int kUtf8BytesToCodeASurrogate = 3;
     // 0xd800 plus bits 10-19 of (char_code - 0x10000).
131  static inline uint16_t LeadSurrogate(uint32_t char_code) {
132  return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
133  }
     // 0xdc00 plus the low 10 bits of char_code.
134  static inline uint16_t TrailSurrogate(uint32_t char_code) {
135  return 0xdc00 + (char_code & 0x3ff);
136  }
137 };
138 
// Latin-1 constants and helpers.
139 class Latin1 {
140  public:
141  static const unsigned kMaxChar = 0xff;
142  // Returns 0 if the character does not convert to a single Latin-1 character
143  // or if the character does not convert back to Latin-1 via the inverse
144  // operation (upper to lower, etc).
     // NOTE(review): the declaration this comment describes (source line 145)
     // is missing from this listing — presumably ConvertNonLatin1ToLatin1,
     // which the index below places in unicode-inl.h; confirm against the
     // original header.
146 };
147 
// Static UTF-8 helpers and size constants. Every function is only declared
// here; per the index below, Length, EncodeOneByte, Encode and ValueOf are
// defined in unicode-inl.h and CalculateValue in unicode.cc.
148 class Utf8 {
149  public:
150  static inline uchar Length(uchar chr, int previous);
151  static inline unsigned EncodeOneByte(char* out, uint8_t c);
     // NOTE(review): `previous` is presumably the preceding UTF-16 code unit
     // (or Utf16::kNoPreviousCharacter) so that surrogate pairs can be
     // combined — confirm in unicode-inl.h.
152  static inline unsigned Encode(char* out,
153  uchar c,
154  int previous,
155  bool replace_invalid = false);
156  static uchar CalculateValue(const byte* str,
157  unsigned length,
158  unsigned* cursor);
159 
160  // The unicode replacement character, used to signal invalid unicode
161  // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
162  static const uchar kBadChar = 0xFFFD;
163  static const unsigned kMaxEncodedSize = 4;
     // Upper bounds by encoded length: 7, 11, 16 and 21 bits respectively.
164  static const unsigned kMaxOneByteChar = 0x7f;
165  static const unsigned kMaxTwoByteChar = 0x7ff;
166  static const unsigned kMaxThreeByteChar = 0xffff;
167  static const unsigned kMaxFourByteChar = 0x1fffff;
168 
169  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170  // that match are coded as a 4 byte UTF-8 sequence.
171  static const unsigned kBytesSavedByCombiningSurrogates = 2;  // 2 * 3 - 4.
172  static const unsigned kSizeOfUnmatchedSurrogate = 3;
173  // The maximum size a single UTF-16 code unit may take up when encoded as
174  // UTF-8.
175  static const unsigned kMax16BitCodeUnitSize = 3;
176  static inline uchar ValueOf(const byte* str,
177  unsigned length,
178  unsigned* cursor);
179 };
180 
181 
183  public:
184  // Initialization done in subclass.
185  inline Utf8DecoderBase();
186  inline Utf8DecoderBase(uint16_t* buffer,
187  unsigned buffer_length,
188  const uint8_t* stream,
189  unsigned stream_length);
190  inline unsigned Utf16Length() const { return utf16_length_; }
191  protected:
192  // This reads all characters and sets the utf16_length_.
193  // The first buffer_length utf16 chars are cached in the buffer.
194  void Reset(uint16_t* buffer,
195  unsigned buffer_length,
196  const uint8_t* stream,
197  unsigned stream_length);
198  static void WriteUtf16Slow(const uint8_t* stream,
199  uint16_t* data,
200  unsigned length);
201  const uint8_t* unbuffered_start_;
202  unsigned utf16_length_;
204  private:
205  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
206 };
207 
// Utf8DecoderBase subclass that owns a uint16_t buffer of kBufferSize
// elements. The two-argument constructor, Reset and WriteUtf16 are only
// declared here; the index below places Reset and WriteUtf16 in
// unicode-inl.h.
// NOTE(review): Reset(const char*, unsigned) presumably forwards buffer_ and
// kBufferSize to the base class's protected four-argument Reset — confirm in
// unicode-inl.h.
208 template <unsigned kBufferSize>
209 class Utf8Decoder : public Utf8DecoderBase {
210  public:
211  inline Utf8Decoder() {}
212  inline Utf8Decoder(const char* stream, unsigned length);
213  inline void Reset(const char* stream, unsigned length);
214  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
215  private:
216  uint16_t buffer_[kBufferSize];
217 };
218 
219 
// Declares a static predicate Is(c); the definition is not in this header.
// NOTE(review): presumably reports whether c is an uppercase character —
// confirm in unicode.cc.
220 struct Uppercase {
221  static bool Is(uchar c);
222 };
// Declares a static predicate Is(c); the definition is not in this header.
// NOTE(review): presumably reports whether c is a lowercase character —
// confirm in unicode.cc.
223 struct Lowercase {
224  static bool Is(uchar c);
225 };
// Declares a static predicate Is(c); the definition is not in this header.
// NOTE(review): presumably reports whether c is a letter — confirm in
// unicode.cc.
226 struct Letter {
227  static bool Is(uchar c);
228 };
// Declares a static predicate Is(c); the definition is not in this header.
// NOTE(review): presumably reports whether c is a numeric character —
// confirm in unicode.cc.
229 struct Number {
230  static bool Is(uchar c);
231 };
// Declares a static predicate Is(c); the definition is not in this header.
// NOTE(review): presumably reports whether c is a whitespace character —
// confirm in unicode.cc.
232 struct WhiteSpace {
233  static bool Is(uchar c);
234 };
236  static bool Is(uchar c);
237 };
239  static bool Is(uchar c);
240 };
242  static bool Is(uchar c);
243 };
// Conversion traits with kMaxWidth = 3 and kIsToLower = true. Convert is only
// declared here.
// NOTE(review): Convert presumably writes up to kMaxWidth uchars to `result`,
// returns how many were written, takes `n` as the following character, and
// reports through `allow_caching_ptr` whether the result may be cached —
// confirm in unicode.cc.
244 struct ToLowercase {
245  static const int kMaxWidth = 3;
246  static const bool kIsToLower = true;
247  static int Convert(uchar c,
248  uchar n,
249  uchar* result,
250  bool* allow_caching_ptr);
251 };
// Conversion traits with kMaxWidth = 3 and kIsToLower = false. Convert is
// only declared here.
// NOTE(review): Convert presumably writes up to kMaxWidth uchars to `result`,
// returns how many were written, takes `n` as the following character, and
// reports through `allow_caching_ptr` whether the result may be cached —
// confirm in unicode.cc.
252 struct ToUppercase {
253  static const int kMaxWidth = 3;
254  static const bool kIsToLower = false;
255  static int Convert(uchar c,
256  uchar n,
257  uchar* result,
258  bool* allow_caching_ptr);
259 };
261  static const int kMaxWidth = 1;
262  static int Convert(uchar c,
263  uchar n,
264  uchar* result,
265  bool* allow_caching_ptr);
266 };
268  static const int kMaxWidth = 4;
269  static int Convert(uchar c,
270  uchar n,
271  uchar* result,
272  bool* allow_caching_ptr);
273 };
275  static const int kMaxWidth = 1;
276  static int Convert(uchar c,
277  uchar n,
278  uchar* result,
279  bool* allow_caching_ptr);
280 };
281 
282 } // namespace unibrow
283 
284 #endif // V8_UNICODE_H_
friend class Test
Definition: unicode.h:80
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:117
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:172
static bool Is(uchar c)
Definition: unicode.cc:875
static const unsigned kMaxChar
Definition: unicode.h:141
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1763
static const unsigned kMaxTwoByteChar
Definition: unicode.h:165
static uint16_t TrailSurrogate(uint32_t char_code)
Definition: unicode.h:134
friend class Test
Definition: unicode.h:55
bool last_byte_of_buffer_unused_
Definition: unicode.h:203
const int kMaxMappingSize
Definition: unicode.h:47
static bool Is(uchar c)
Definition: unicode.cc:681
static const unsigned kMax16BitCodeUnitSize
Definition: unicode.h:175
static bool Is(uchar c)
Definition: unicode.cc:731
static uint16_t ConvertNonLatin1ToLatin1(uint16_t)
Definition: unicode-inl.h:83
static const int kMaxWidth
Definition: unicode.h:275
unsigned short uint16_t
Definition: unicode.cc:46
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1698
static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit
Definition: unicode.h:127
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:214
void Reset(const char *stream, unsigned length)
Definition: unicode-inl.h:200
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1166
static const unsigned kMaxEncodedSize
Definition: unicode.h:163
enable upcoming ES6 features enable harmony block scoping enable harmony enable harmony proxies enable harmony generators enable harmony numeric enable harmony string enable harmony math functions harmony_scoping harmony_symbols harmony_collections harmony_iteration harmony_strings harmony_scoping harmony_maths tracks arrays with only smi values Optimize object size
static bool Is(uchar c)
Definition: unicode.cc:756
static const bool kIsToLower
Definition: unicode.h:246
static uchar Length(uchar chr, int previous)
Definition: unicode-inl.h:163
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1307
static const uchar kMaxNonSurrogateCharCode
Definition: unicode.h:121
static const int kMaxWidth
Definition: unicode.h:253
static uint16_t LeadSurrogate(uint32_t char_code)
Definition: unicode.h:131
friend class Test
Definition: unicode.h:98
static bool Is(uchar c)
Definition: unicode.cc:778
static const bool kIsToLower
Definition: unicode.h:254
static unsigned EncodeOneByte(char *out, uint8_t c)
Definition: unicode-inl.h:99
static const int kUtf8BytesToCodeASurrogate
Definition: unicode.h:130
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
Definition: unicode-inl.h:114
static uchar ValueOf(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode-inl.h:152
static const int kMaxWidth
Definition: unicode.h:245
static const unsigned kMaxFourByteChar
Definition: unicode.h:167
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:995
static void WriteUtf16Slow(const uint8_t *stream, uint16_t *data, unsigned length)
Definition: unicode.cc:331
static const int kMaxWidth
Definition: unicode.h:268
enable upcoming ES6 features enable harmony block scoping enable harmony enable harmony proxies enable harmony generators enable harmony numeric enable harmony string enable harmony math functions harmony_scoping harmony_symbols harmony_collections harmony_iteration harmony_strings harmony_scoping harmony_maths tracks arrays with only smi values Optimize object Array DOM strings and string pretenure call new trace pretenuring decisions of HAllocate instructions track fields with only smi values track fields with heap values track_fields track_fields Enables optimizations which favor memory size over execution speed use string slices optimization filter maximum number of GVN fix point iterations use function inlining use allocation folding eliminate write barriers targeting allocations in optimized code maximum source size in bytes considered for a single inlining maximum cumulative number of AST nodes considered for inlining crankshaft harvests type feedback from stub cache trace check elimination phase hydrogen tracing filter trace hydrogen to given file name trace inlining decisions trace store elimination trace all use positions trace global value numbering trace hydrogen escape analysis trace the tracking of allocation sites trace map generalization environment for every instruction deoptimize every n garbage collections put a break point before deoptimizing deoptimize uncommon cases use on stack replacement trace array bounds check elimination perform array index dehoisting use load elimination use store elimination use constant folding eliminate unreachable code number of stress runs when picking a function to watch for shared function not JSFunction itself flushes the cache of optimized code for closures on every GC functions with arguments object maximum number of escape analysis fix point iterations allow uint32 values on optimize frames if they are used only in safe operations track concurrent recompilation artificial compilation delay in ms concurrent on 
stack replacement do not emit check maps for constant values that have a leaf deoptimize the optimized code if the layout of the maps changes number of stack frames inspected by the profiler percentage of ICs that must have type info to allow optimization extra verbose compilation tracing generate extra code(assertions) for debugging") DEFINE_bool(code_comments
static bool Is(uchar c)
Definition: unicode.cc:844
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:171
static bool Is(uchar c)
Definition: unicode.cc:556
static const uchar kBadChar
Definition: unicode.h:162
static bool IsSurrogatePair(int lead, int trail)
Definition: unicode.h:105
static const unsigned kMaxThreeByteChar
Definition: unicode.h:166
static const unsigned kMaxOneByteChar
Definition: unicode.h:164
const uint8_t * unbuffered_start_
Definition: unicode.h:201
void Reset(uint16_t *buffer, unsigned buffer_length, const uint8_t *stream, unsigned stream_length)
Definition: unicode.cc:284
static bool IsLeadSurrogate(int code)
Definition: unicode.h:108
static bool IsTrailSurrogate(int code)
Definition: unicode.h:112
unsigned char byte
Definition: unicode.h:41
static const int kMaxWidth
Definition: unicode.h:261
unsigned Utf16Length() const
Definition: unicode.h:190
unsigned WriteUtf16(uint16_t *data, unsigned length) const
Definition: unicode-inl.h:208
static bool Is(uchar c)
Definition: unicode.cc:444
unsigned int uchar
Definition: unicode.h:40
static const int kNoPreviousCharacter
Definition: unicode.h:120