v8  3.14.5(node0.10.28)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unicode.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_UNICODE_H_
29 #define V8_UNICODE_H_
30 
31 #include <sys/types.h>
32 
38 namespace unibrow {
39 
// Type used for character values (code points and UTF-16 code units)
// throughout this header.
typedef unsigned int uchar;
// Raw 8-bit storage unit used for encoded data and buffers.
typedef unsigned char byte;

// NOTE(review): presumably the upper bound on the number of uchars that a
// single-character mapping may produce -- the widest kMaxWidth declared by
// the converters in this header is 4.  Confirm against the callers.
const int kMaxMappingSize = 4;
48 
49 template <class T, int size = 256>
50 class Predicate {
51  public:
52  inline Predicate() { }
53  inline bool get(uchar c);
54  private:
55  friend class Test;
56  bool CalculateValue(uchar c);
57  struct CacheEntry {
58  inline CacheEntry() : code_point_(0), value_(0) { }
59  inline CacheEntry(uchar code_point, bool value)
60  : code_point_(code_point),
61  value_(value) { }
62  uchar code_point_ : 21;
63  bool value_ : 1;
64  };
65  static const int kSize = size;
66  static const int kMask = kSize - 1;
67  CacheEntry entries_[kSize];
68 };
69 
70 // A cache used in case conversion. It caches the value for characters
71 // that either have no mapping or map to a single character independent
72 // of context. Characters that map to more than one character or that
73 // map differently depending on context are always looked up.
74 template <class T, int size = 256>
75 class Mapping {
76  public:
77  inline Mapping() { }
78  inline int get(uchar c, uchar n, uchar* result);
79  private:
80  friend class Test;
81  int CalculateValue(uchar c, uchar n, uchar* result);
82  struct CacheEntry {
83  inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
84  inline CacheEntry(uchar code_point, signed offset)
85  : code_point_(code_point),
86  offset_(offset) { }
87  uchar code_point_;
88  signed offset_;
89  static const int kNoChar = (1 << 21) - 1;
90  };
91  static const int kSize = size;
92  static const int kMask = kSize - 1;
93  CacheEntry entries_[kSize];
94 };
95 
96 class UnicodeData {
97  private:
98  friend class Test;
99  static int GetByteCount();
100  static const uchar kMaxCodePoint;
101 };
102 
103 // --- U t f 8 a n d 16 ---
104 
// Pairs a data handle with a length.  The Data value is stored by value and
// handed back unchanged; this class never dereferences or releases it.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  // The empty buffer: data is initialised from 0 and the length is 0.
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const so that a const Buffer can be inspected.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};
116 
117 
118 class Utf16 {
119  public:
120  static inline bool IsLeadSurrogate(int code) {
121  if (code == kNoPreviousCharacter) return false;
122  return (code & 0xfc00) == 0xd800;
123  }
124  static inline bool IsTrailSurrogate(int code) {
125  if (code == kNoPreviousCharacter) return false;
126  return (code & 0xfc00) == 0xdc00;
127  }
128 
129  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
130  return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
131  }
132  static const int kNoPreviousCharacter = -1;
133  static const uchar kMaxNonSurrogateCharCode = 0xffff;
134  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135  // of UTF-8 data. The special case where the unit is a surrogate
136  // trail produces 1 byte net, because the encoding of the pair is
137  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
138  // can be reclaimed.
140  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
141  // The illegality stems from the surrogate not being part of a pair.
142  static const int kUtf8BytesToCodeASurrogate = 3;
143  static inline uchar LeadSurrogate(int char_code) {
144  return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
145  }
146  static inline uchar TrailSurrogate(int char_code) {
147  return 0xdc00 + (char_code & 0x3ff);
148  }
149 };
150 
151 
152 class Utf8 {
153  public:
154  static inline uchar Length(uchar chr, int previous);
155  static inline unsigned Encode(
156  char* out, uchar c, int previous);
157  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158  unsigned capacity, unsigned* chars_read, unsigned* offset);
159  static uchar CalculateValue(const byte* str,
160  unsigned length,
161  unsigned* cursor);
162  static const uchar kBadChar = 0xFFFD;
163  static const unsigned kMaxEncodedSize = 4;
164  static const unsigned kMaxOneByteChar = 0x7f;
165  static const unsigned kMaxTwoByteChar = 0x7ff;
166  static const unsigned kMaxThreeByteChar = 0xffff;
167  static const unsigned kMaxFourByteChar = 0x1fffff;
168 
169  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170  // that match are coded as a 4 byte UTF-8 sequence.
171  static const unsigned kBytesSavedByCombiningSurrogates = 2;
172  static const unsigned kSizeOfUnmatchedSurrogate = 3;
173 
174  private:
175  template <unsigned s> friend class Utf8InputBuffer;
176  friend class Test;
177  static inline uchar ValueOf(const byte* str,
178  unsigned length,
179  unsigned* cursor);
180 };
181 
182 // --- C h a r a c t e r S t r e a m ---
183 
185  public:
186  inline uchar GetNext();
187  inline bool has_more() { return remaining_ != 0; }
188  // Note that default implementation is not efficient.
189  virtual void Seek(unsigned);
190  unsigned Length();
191  unsigned Utf16Length();
192  virtual ~CharacterStream() { }
193  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
194  unsigned& offset);
195  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
196  unsigned capacity, unsigned& offset);
197  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
198  unsigned capacity, unsigned& offset);
199  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
200  virtual void Rewind() = 0;
201 
202  protected:
203  virtual void FillBuffer() = 0;
204  virtual bool BoundsCheck(unsigned offset) = 0;
205  // The number of characters left in the current buffer
206  unsigned remaining_;
207  // The current offset within the buffer
208  unsigned cursor_;
209  // The buffer containing the decoded characters.
210  const byte* buffer_;
211 };
212 
213 // --- I n p u t B u f f e r ---
214 
221 template <class Reader, class Input = Reader*, unsigned kSize = 256>
222 class InputBuffer : public CharacterStream {
223  public:
224  virtual void Rewind();
225  inline void Reset(Input input);
226  void Seek(unsigned position);
227  inline void Reset(unsigned position, Input input);
228  protected:
230  explicit InputBuffer(Input input) { Reset(input); }
231  virtual void FillBuffer();
232  virtual bool BoundsCheck(unsigned offset) {
233  return (buffer_ != util_buffer_) || (offset < kSize);
234  }
235 
236  // A custom offset that can be used by the string implementation to
237  // mark progress within the encoded string.
238  unsigned offset_;
239  // The input string
240  Input input_;
241  // To avoid heap allocation, we keep an internal buffer to which
242  // the encoded string can write its characters. The string
243  // implementation is free to decide whether it wants to use this
244  // buffer or not.
246 };
247 
248 // --- U t f 8 I n p u t B u f f e r ---
249 
250 template <unsigned s = 256>
251 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
252  public:
253  inline Utf8InputBuffer() { }
254  inline Utf8InputBuffer(const char* data, unsigned length);
255  inline void Reset(const char* data, unsigned length) {
257  Buffer<const char*>(data, length));
258  }
259 };
260 
261 
262 struct Uppercase {
263  static bool Is(uchar c);
264 };
265 struct Lowercase {
266  static bool Is(uchar c);
267 };
268 struct Letter {
269  static bool Is(uchar c);
270 };
271 struct Space {
272  static bool Is(uchar c);
273 };
274 struct Number {
275  static bool Is(uchar c);
276 };
277 struct WhiteSpace {
278  static bool Is(uchar c);
279 };
281  static bool Is(uchar c);
282 };
284  static bool Is(uchar c);
285 };
287  static bool Is(uchar c);
288 };
289 struct ToLowercase {
290  static const int kMaxWidth = 3;
291  static int Convert(uchar c,
292  uchar n,
293  uchar* result,
294  bool* allow_caching_ptr);
295 };
296 struct ToUppercase {
297  static const int kMaxWidth = 3;
298  static int Convert(uchar c,
299  uchar n,
300  uchar* result,
301  bool* allow_caching_ptr);
302 };
304  static const int kMaxWidth = 1;
305  static int Convert(uchar c,
306  uchar n,
307  uchar* result,
308  bool* allow_caching_ptr);
309 };
311  static const int kMaxWidth = 4;
312  static int Convert(uchar c,
313  uchar n,
314  uchar* result,
315  bool* allow_caching_ptr);
316 };
318  static const int kMaxWidth = 1;
319  static int Convert(uchar c,
320  uchar n,
321  uchar* result,
322  bool* allow_caching_ptr);
323 };
324 
325 } // namespace unibrow
326 
327 #endif // V8_UNICODE_H_
friend class Test
Definition: unicode.h:80
static uchar TrailSurrogate(int char_code)
Definition: unicode.h:146
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:129
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:172
virtual bool BoundsCheck(unsigned offset)
Definition: unicode.h:232
static bool Is(uchar c)
Definition: unicode.cc:895
static uchar LeadSurrogate(int char_code)
Definition: unicode.h:143
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1783
unsigned Utf16Length()
Definition: unicode.cc:342
static const unsigned kMaxTwoByteChar
Definition: unicode.h:165
static bool EncodeNonAsciiCharacter(uchar c, byte *buffer, unsigned capacity, unsigned &offset)
Definition: unicode-inl.h:170
friend class Test
Definition: unicode.h:55
static uchar DecodeCharacter(const byte *buffer, unsigned *offset)
Definition: unicode-inl.h:196
const int kMaxMappingSize
Definition: unicode.h:47
static bool Is(uchar c)
Definition: unicode.cc:685
virtual bool BoundsCheck(unsigned offset)=0
void Reset(const char *data, unsigned length)
Definition: unicode.h:255
static bool Is(uchar c)
Definition: unicode.cc:755
unsigned offset_
Definition: unicode.h:238
static const int kMaxWidth
Definition: unicode.h:318
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1718
static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit
Definition: unicode.h:139
byte util_buffer_[kSize]
Definition: unicode.h:245
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:210
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1186
static const unsigned kMaxEncodedSize
Definition: unicode.h:163
static bool Is(uchar c)
Definition: unicode.cc:779
static uchar Length(uchar chr, int previous)
Definition: unicode-inl.h:124
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1327
virtual void Seek(unsigned)
Definition: unicode.cc:352
static const uchar kMaxNonSurrogateCharCode
Definition: unicode.h:133
static const int kMaxWidth
Definition: unicode.h:297
Buffer(Data data, unsigned length)
Definition: unicode.h:108
void Seek(unsigned position)
Definition: unicode-inl.h:238
friend class Test
Definition: unicode.h:98
static bool Is(uchar c)
Definition: unicode.cc:800
static const int kUtf8BytesToCodeASurrogate
Definition: unicode.h:142
activate correct semantics for inheriting readonliness enable harmony semantics for typeof enable harmony enable harmony proxies enable all harmony harmony_scoping harmony_proxies harmony_scoping tracks arrays with only smi values automatically unbox arrays of doubles use crankshaft use hydrogen range analysis use hydrogen global value numbering use function inlining maximum number of AST nodes considered for a single inlining loop invariant code motion print statistics for hydrogen trace generated IR for specified phases trace register allocator trace range analysis trace representation types environment for every instruction put a break point before deoptimizing polymorphic inlining perform array bounds checks elimination use dead code elimination trace on stack replacement optimize closures cache optimized code for closures functions with arguments object loop weight for representation inference allow uint32 values on optimize frames if they are used only in safe operations track parallel recompilation enable all profiler experiments number of stack frames inspected by the profiler call recompile stub directly when self optimizing trigger profiler ticks based on counting instead of timing weight back edges by jump distance for interrupt triggering percentage of ICs that must have type info to allow optimization watch_ic_patching retry_self_opt interrupt_at_exit extra verbose compilation tracing generate extra code(assertions) for debugging") DEFINE_bool(code_comments
Data data()
Definition: unicode.h:110
static unsigned Encode(char *out, uchar c, int previous)
Definition: unicode-inl.h:82
static const int kMaxWidth
Definition: unicode.h:290
static const unsigned kMaxFourByteChar
Definition: unicode.h:167
virtual void Rewind()
Definition: unicode-inl.h:219
virtual void FillBuffer()=0
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1015
static const int kMaxWidth
Definition: unicode.h:311
static bool Is(uchar c)
Definition: unicode.cc:865
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:171
static bool Is(uchar c)
Definition: unicode.cc:561
static const uchar kBadChar
Definition: unicode.h:162
static const byte * ReadBlock(Buffer< const char * > str, byte *buffer, unsigned capacity, unsigned *chars_read, unsigned *offset)
Definition: unicode.cc:280
static const unsigned kMaxThreeByteChar
Definition: unicode.h:166
InputBuffer(Input input)
Definition: unicode.h:230
virtual void Rewind()=0
static const unsigned kMaxOneByteChar
Definition: unicode.h:164
static bool Is(uchar c)
Definition: unicode.cc:724
friend class Test
Definition: unicode.h:176
void Reset(Input input)
Definition: unicode-inl.h:233
static bool IsLeadSurrogate(int code)
Definition: unicode.h:120
static bool IsTrailSurrogate(int code)
Definition: unicode.h:124
unsigned length()
Definition: unicode.h:111
static bool EncodeAsciiCharacter(uchar c, byte *buffer, unsigned capacity, unsigned &offset)
Definition: unicode-inl.h:162
unsigned char byte
Definition: unicode.h:41
static const int kMaxWidth
Definition: unicode.h:304
virtual ~CharacterStream()
Definition: unicode.h:192
const byte * buffer_
Definition: unicode.h:210
static bool Is(uchar c)
Definition: unicode.cc:450
virtual void FillBuffer()
Definition: unicode-inl.h:214
static bool EncodeCharacter(uchar c, byte *buffer, unsigned capacity, unsigned &offset)
Definition: unicode-inl.h:187
unsigned int uchar
Definition: unicode.h:40
static const int kNoPreviousCharacter
Definition: unicode.h:132