v8  3.11.10(node0.8.26)
V8 is Google's open source JavaScript engine
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unicode.h
Go to the documentation of this file.
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_UNICODE_H_
29 #define V8_UNICODE_H_
30 
31 #include <sys/types.h>
32 
namespace unibrow {

typedef unsigned int uchar;
typedef unsigned char byte;

// Upper bound on the kMaxWidth constants declared by the conversion structs
// at the end of this header (the largest of them is 4).
// NOTE(review): presumably the capacity callers must give the |result| array
// passed to Convert() -- confirm against the call sites.
const int kMaxMappingSize = 4;

// A fixed-size table of (code point, bool) entries backing get().
// get() and CalculateValue() are only declared here; they are defined
// outside this header.
// NOTE(review): presumably T is one of the structs below that expose a static
// Is(uchar), and entries are selected with (c & kMask), which would require
// |size| to be a power of two -- confirm against the definitions.
template <class T, int size = 256>
class Predicate {
 public:
  inline Predicate() { }
  inline bool get(uchar c);

 private:
  friend class Test;
  bool CalculateValue(uchar c);
  // One table slot.  Both members are bit-fields: 21 bits for the code point
  // (values up to 0x1fffff) and a single bit for the stored result.
  struct CacheEntry {
    // A default-constructed slot holds code point 0 with the value false.
    inline CacheEntry() : code_point_(0), value_(false) { }
    inline CacheEntry(uchar code_point, bool value)
        : code_point_(code_point),
          value_(value) { }
    uchar code_point_ : 21;
    bool value_ : 1;
  };
  static const int kSize = size;
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// A cache used in case conversion.  It caches the value for characters
// that either have no mapping or map to a single character independent
// of context.  Characters that map to more than one character or that
// map differently depending on context are always looked up.
template <class T, int size = 256>
class Mapping {
 public:
  inline Mapping() { }
  // Declared here, defined outside this header.
  // NOTE(review): presumably returns the number of uchars written to
  // |result|, mirroring T::Convert -- confirm against the definition.
  inline int get(uchar c, uchar n, uchar* result);

 private:
  friend class Test;
  int CalculateValue(uchar c, uchar n, uchar* result);
  // One table slot: a code point and a signed offset stored for it.
  struct CacheEntry {
    // A default-constructed slot is tagged with kNoChar and a zero offset.
    inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
    inline CacheEntry(uchar code_point, signed offset)
        : code_point_(code_point),
          offset_(offset) { }
    uchar code_point_;
    signed offset_;
    // All 21 low bits set (0x1fffff).
    static const int kNoChar = (1 << 21) - 1;
  };
  static const int kSize = size;
  static const int kMask = kSize - 1;
  CacheEntry entries_[kSize];
};

// Holder for two private statics that only the friend class Test can reach.
class UnicodeData {
 private:
  friend class Test;
  static int GetByteCount();
  static const uchar kMaxCodePoint;
};

// --- U t f 8   a n d   1 6 ---

// A (data, length) pair.  Data is stored by value; the default constructor
// initializes it from 0, so Data must be constructible from 0 (e.g. a
// pointer type).
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  inline Buffer() : data_(0), length_(0) { }
  // The accessors do not modify the buffer, so they are const and usable on
  // const Buffer objects as well.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};


// Helpers and constants for UTF-16 surrogate handling.
class Utf16 {
 public:
  // True if (code & 0xfc00) == 0xd800, i.e. for the code units
  // 0xd800..0xdbff.  kNoPreviousCharacter is never a lead surrogate.
  static inline bool IsLeadSurrogate(int code) {
    if (code == kNoPreviousCharacter) return false;
    return (code & 0xfc00) == 0xd800;
  }
  // True if (code & 0xfc00) == 0xdc00, i.e. for the code units
  // 0xdc00..0xdfff.  kNoPreviousCharacter is never a trail surrogate.
  static inline bool IsTrailSurrogate(int code) {
    if (code == kNoPreviousCharacter) return false;
    return (code & 0xfc00) == 0xdc00;
  }

  // Builds the code point from the low 10 bits of |lead| and |trail|.
  // The arguments are not checked to be surrogates.
  static inline int CombineSurrogatePair(uchar lead, uchar trail) {
    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
  }
  // Sentinel for "there is no previous character"; it is what the
  // |previous| parameters of Utf8::Length and Utf8::Encode are compared
  // against by the surrogate tests above.
  static const int kNoPreviousCharacter = -1;
  static const uchar kMaxNonSurrogateCharCode = 0xffff;
  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
  // of UTF-8 data.  The special case where the unit is a surrogate
  // trail produces 1 byte net, because the encoding of the pair is
  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
  // can be reclaimed.
  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
  // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
  // The illegality stems from the surrogate not being part of a pair.
  static const int kUtf8BytesToCodeASurrogate = 3;
  // Lead (high) surrogate for |char_code|.  The result is only meaningful
  // for code points at or above 0x10000; the argument is not range-checked.
  static inline uchar LeadSurrogate(int char_code) {
    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
  }
  // Trail (low) surrogate for |char_code|: 0xdc00 plus its low 10 bits.
  static inline uchar TrailSurrogate(int char_code) {
    return 0xdc00 + (char_code & 0x3ff);
  }
};


// UTF-8 encoding/decoding entry points and size constants.  All functions
// are declared here and defined outside this header.
class Utf8 {
 public:
  static inline uchar Length(uchar chr, int previous);
  static inline unsigned Encode(
      char* out, uchar c, int previous);
  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
      unsigned capacity, unsigned* chars_read, unsigned* offset);
  static uchar CalculateValue(const byte* str,
                              unsigned length,
                              unsigned* cursor);
  // U+FFFD, the Unicode replacement character.
  static const uchar kBadChar = 0xFFFD;
  static const unsigned kMaxEncodedSize   = 4;
  // Largest code point representable in 1, 2, 3 and 4 UTF-8 bytes.
  static const unsigned kMaxOneByteChar   = 0x7f;
  static const unsigned kMaxTwoByteChar   = 0x7ff;
  static const unsigned kMaxThreeByteChar = 0xffff;
  static const unsigned kMaxFourByteChar  = 0x1fffff;

  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
  // that match are coded as a 4 byte UTF-8 sequence.
  static const unsigned kBytesSavedByCombiningSurrogates = 2;
  static const unsigned kSizeOfUnmatchedSurrogate = 3;

 private:
  template <unsigned s> friend class Utf8InputBuffer;
  friend class Test;
  static inline uchar ValueOf(const byte* str,
                              unsigned length,
                              unsigned* cursor);
};

// --- C h a r a c t e r   S t r e a m ---

// Abstract stream of characters read out of a byte buffer.  Subclasses
// implement Rewind() and FillBuffer().
class CharacterStream {
 public:
  inline uchar GetNext();
  inline bool has_more() { return remaining_ != 0; }
  // Note that default implementation is not efficient.
  virtual void Seek(unsigned);
  unsigned Length();
  unsigned Utf16Length();
  virtual ~CharacterStream() { }
  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
      unsigned& offset);
  static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
      unsigned capacity, unsigned& offset);
  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
  virtual void Rewind() = 0;

 protected:
  virtual void FillBuffer() = 0;
  // The number of characters left in the current buffer
  unsigned remaining_;
  // The current offset within the buffer
  unsigned cursor_;
  // The buffer containing the decoded characters.
  const byte* buffer_;
};

// --- I n p u t   B u f f e r ---

// A CharacterStream that carries an Input value plus an internal scratch
// buffer of kSize bytes.
// NOTE(review): presumably Reader supplies the block-reading routine used by
// FillBuffer() (Utf8 is used as Reader below) -- confirm in the definitions,
// which live outside this header.
template <class Reader, class Input = Reader*, unsigned kSize = 256>
class InputBuffer : public CharacterStream {
 public:
  virtual void Rewind();
  inline void Reset(Input input);
  void Seek(unsigned position);
  inline void Reset(unsigned position, Input input);

 protected:
  // NOTE(review): this default constructor was missing from the listing this
  // header was recovered from; Utf8InputBuffer's default constructor
  // requires it.  Confirm the exact form against upstream.
  InputBuffer() { }
  explicit InputBuffer(Input input) { Reset(input); }
  virtual void FillBuffer();

  // A custom offset that can be used by the string implementation to
  // mark progress within the encoded string.
  unsigned offset_;
  // The input string
  Input input_;
  // To avoid heap allocation, we keep an internal buffer to which
  // the encoded string can write its characters.  The string
  // implementation is free to decide whether it wants to use this
  // buffer or not.
  byte util_buffer_[kSize];
};

// --- U t f 8   I n p u t   B u f f e r ---

// InputBuffer specialised for UTF-8 data held in a Buffer<const char*>.
template <unsigned s = 256>
class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
 public:
  inline Utf8InputBuffer() { }
  inline Utf8InputBuffer(const char* data, unsigned length);
  // Wraps (data, length) in a Buffer and forwards to the base-class Reset.
  inline void Reset(const char* data, unsigned length) {
    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
        Buffer<const char*>(data, length));
  }
};


// Character-class predicates.  Each struct exposes a single static
// Is(uchar), defined outside this header.
struct Uppercase {
  static bool Is(uchar c);
};
struct Lowercase {
  static bool Is(uchar c);
};
struct Letter {
  static bool Is(uchar c);
};
struct Space {
  static bool Is(uchar c);
};
struct Number {
  static bool Is(uchar c);
};
struct WhiteSpace {
  static bool Is(uchar c);
};
// NOTE(review): the header lines of the next three structs were missing from
// the listing this header was recovered from; the names are taken from
// upstream V8 -- confirm.
struct LineTerminator {
  static bool Is(uchar c);
};
struct CombiningMark {
  static bool Is(uchar c);
};
struct ConnectorPunctuation {
  static bool Is(uchar c);
};

// Character conversions.  Each struct exposes a static Convert(), defined
// outside this header, and a kMaxWidth constant.
// NOTE(review): presumably Convert() writes at most kMaxWidth uchars to
// |result|, returns how many it wrote, and reports through
// |allow_caching_ptr| whether a Mapping may cache the answer -- confirm
// against the definitions.
struct ToLowercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct ToUppercase {
  static const int kMaxWidth = 3;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
// NOTE(review): the header lines of the next three structs were missing from
// the listing this header was recovered from; the names are taken from
// upstream V8 -- confirm.
struct Ecma262Canonicalize {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct Ecma262UnCanonicalize {
  static const int kMaxWidth = 4;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};
struct CanonicalizationRange {
  static const int kMaxWidth = 1;
  static int Convert(uchar c,
                     uchar n,
                     uchar* result,
                     bool* allow_caching_ptr);
};

}  // namespace unibrow
322 
323 #endif // V8_UNICODE_H_
friend class Test
Definition: unicode.h:80
static uchar TrailSurrogate(int char_code)
Definition: unicode.h:146
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition: unicode.h:129
static const unsigned kSizeOfUnmatchedSurrogate
Definition: unicode.h:172
static bool Is(uchar c)
Definition: unicode.cc:895
static uchar LeadSurrogate(int char_code)
Definition: unicode.h:143
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1783
unsigned Utf16Length()
Definition: unicode.cc:342
static const unsigned kMaxTwoByteChar
Definition: unicode.h:165
static bool EncodeNonAsciiCharacter(uchar c, byte *buffer, unsigned capacity, unsigned &offset)
Definition: unicode-inl.h:168
friend class Test
Definition: unicode.h:55
static uchar DecodeCharacter(const byte *buffer, unsigned *offset)
Definition: unicode-inl.h:194
const int kMaxMappingSize
Definition: unicode.h:47
static bool Is(uchar c)
Definition: unicode.cc:685
void Reset(const char *data, unsigned length)
Definition: unicode.h:251
static bool Is(uchar c)
Definition: unicode.cc:755
unsigned offset_
Definition: unicode.h:234
static const int kMaxWidth
Definition: unicode.h:314
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1718
static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit
Definition: unicode.h:139
byte util_buffer_[kSize]
Definition: unicode.h:241
static uchar CalculateValue(const byte *str, unsigned length, unsigned *cursor)
Definition: unicode.cc:210
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1186
static const unsigned kMaxEncodedSize
Definition: unicode.h:163
static bool Is(uchar c)
Definition: unicode.cc:779
static uchar Length(uchar chr, int previous)
Definition: unicode-inl.h:123
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1327
virtual void Seek(unsigned)
Definition: unicode.cc:352
static const uchar kMaxNonSurrogateCharCode
Definition: unicode.h:133
static const int kMaxWidth
Definition: unicode.h:293
Buffer(Data data, unsigned length)
Definition: unicode.h:108
void Seek(unsigned position)
Definition: unicode-inl.h:236
friend class Test
Definition: unicode.h:98
static bool Is(uchar c)
Definition: unicode.cc:800
static const int kUtf8BytesToCodeASurrogate
Definition: unicode.h:142
Data data()
Definition: unicode.h:110
static unsigned Encode(char *out, uchar c, int previous)
Definition: unicode-inl.h:81
static const int kMaxWidth
Definition: unicode.h:286
static const unsigned kMaxFourByteChar
Definition: unicode.h:167
virtual void Rewind()
Definition: unicode-inl.h:217
virtual void FillBuffer()=0
static int Convert(uchar c, uchar n, uchar *result, bool *allow_caching_ptr)
Definition: unicode.cc:1015
static const int kMaxWidth
Definition: unicode.h:307
static bool Is(uchar c)
Definition: unicode.cc:865
static const unsigned kBytesSavedByCombiningSurrogates
Definition: unicode.h:171
static bool Is(uchar c)
Definition: unicode.cc:561
static const uchar kBadChar
Definition: unicode.h:162
static const byte * ReadBlock(Buffer< const char * > str, byte *buffer, unsigned capacity, unsigned *chars_read, unsigned *offset)
Definition: unicode.cc:280
static const unsigned kMaxThreeByteChar
Definition: unicode.h:166
InputBuffer(Input input)
Definition: unicode.h:229
virtual void Rewind()=0
static const unsigned kMaxOneByteChar
Definition: unicode.h:164
static bool Is(uchar c)
Definition: unicode.cc:724
friend class Test
Definition: unicode.h:176
void Reset(Input input)
Definition: unicode-inl.h:231
static bool IsLeadSurrogate(int code)
Definition: unicode.h:120
static bool IsTrailSurrogate(int code)
Definition: unicode.h:124
unsigned length()
Definition: unicode.h:111
static bool EncodeAsciiCharacter(uchar c, byte *buffer, unsigned capacity, unsigned &offset)
Definition: unicode-inl.h:160
unsigned char byte
Definition: unicode.h:41
static const int kMaxWidth
Definition: unicode.h:300
virtual ~CharacterStream()
Definition: unicode.h:192
const byte * buffer_
Definition: unicode.h:209
static bool Is(uchar c)
Definition: unicode.cc:450
virtual void FillBuffer()
Definition: unicode-inl.h:212
static bool EncodeCharacter(uchar c, byte *buffer, unsigned capacity, unsigned &offset)
Definition: unicode-inl.h:185
unsigned int uchar
Definition: unicode.h:40
static const int kNoPreviousCharacter
Definition: unicode.h:132