Node.js  v8.x
Node.js is a JavaScript runtime built on Chrome's V8 JavaScript engine
node_i18n.cc
Go to the documentation of this file.
1 // Copyright Joyent, Inc. and other Node contributors.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a
4 // copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to permit
8 // persons to whom the Software is furnished to do so, subject to the
9 // following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included
12 // in all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
17 // NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
18 // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 // USE OR OTHER DEALINGS IN THE SOFTWARE.
21 
22 /*
23  * notes: by srl295
24  * - When in NODE_HAVE_SMALL_ICU mode, ICU is linked against "stub" (null) data
25  * ( stubdata/libicudata.a ) containing nothing, no data, and it's also
26  * linked against a "small" data file which the SMALL_ICUDATA_ENTRY_POINT
27  * macro names. That's the "english+root" data.
28  *
29  * If icu_data_path is non-null, the user has provided a path and we assume
30  * it goes somewhere useful. We set that path in ICU, and exit.
31  * If icu_data_path is null, they haven't set a path and we want the
32  * "english+root" data. We call
33  * udata_setCommonData(SMALL_ICUDATA_ENTRY_POINT,...)
34  * to load up the english+root data.
35  *
36  * - when NOT in NODE_HAVE_SMALL_ICU mode, ICU is linked directly with its full
37  * data. All of the variables and command line options for changing data at
38  * runtime are disabled, as they wouldn't fully override the internal data.
39  * See: http://bugs.icu-project.org/trac/ticket/10924
40  */
41 
42 
43 #include "node_i18n.h"
44 
45 #if defined(NODE_HAVE_I18N_SUPPORT)
46 
47 #include "node.h"
48 #include "node_buffer.h"
49 #include "env.h"
50 #include "env-inl.h"
51 #include "util.h"
52 #include "util-inl.h"
53 #include "base-object.h"
54 #include "base-object-inl.h"
55 #include "v8.h"
56 
57 #include <unicode/utypes.h>
58 #include <unicode/putil.h>
59 #include <unicode/uchar.h>
60 #include <unicode/uclean.h>
61 #include <unicode/udata.h>
62 #include <unicode/uidna.h>
63 #include <unicode/ucnv.h>
64 #include <unicode/utf8.h>
65 #include <unicode/utf16.h>
66 #include <unicode/timezone.h>
67 #include <unicode/ulocdata.h>
68 #include <unicode/uvernum.h>
69 #include <unicode/uversion.h>
70 #include <unicode/ustring.h>
71 
72 #ifdef NODE_HAVE_SMALL_ICU
73 /* if this is defined, we have a 'secondary' entry point.
74  compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
75 #define SMALL_ICUDATA_ENTRY_POINT \
76  SMALL_DEF2(U_ICU_VERSION_MAJOR_NUM, U_LIB_SUFFIX_C_NAME)
77 #define SMALL_DEF2(major, suff) SMALL_DEF(major, suff)
78 #ifndef U_LIB_SUFFIX_C_NAME
79 #define SMALL_DEF(major, suff) icusmdt##major##_dat
80 #else
81 #define SMALL_DEF(major, suff) icusmdt##suff##major##_dat
82 #endif
83 
84 extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
85 #endif
86 
87 namespace node {
88 
89 using v8::Context;
90 using v8::FunctionCallbackInfo;
91 using v8::HandleScope;
92 using v8::Isolate;
93 using v8::Local;
94 using v8::MaybeLocal;
95 using v8::Object;
96 using v8::ObjectTemplate;
97 using v8::String;
98 using v8::Value;
99 
100 namespace i18n {
101 namespace {
102 
103 template <typename T>
104 MaybeLocal<Object> ToBufferEndian(Environment* env, MaybeStackBuffer<T>* buf) {
105  MaybeLocal<Object> ret = Buffer::New(env, buf);
106  if (ret.IsEmpty())
107  return ret;
108 
109  static_assert(sizeof(T) == 1 || sizeof(T) == 2,
110  "Currently only one- or two-byte buffers are supported");
111  if (sizeof(T) > 1 && IsBigEndian()) {
112  SPREAD_BUFFER_ARG(ret.ToLocalChecked(), retbuf);
113  SwapBytes16(retbuf_data, retbuf_length);
114  }
115 
116  return ret;
117 }
118 
119 struct Converter {
120  explicit Converter(const char* name, const char* sub = NULL)
121  : conv(nullptr) {
122  UErrorCode status = U_ZERO_ERROR;
123  conv = ucnv_open(name, &status);
124  CHECK(U_SUCCESS(status));
125  if (sub != NULL) {
126  ucnv_setSubstChars(conv, sub, strlen(sub), &status);
127  }
128  }
129 
130  explicit Converter(UConverter* converter,
131  const char* sub = NULL) : conv(converter) {
132  CHECK_NE(conv, nullptr);
133  UErrorCode status = U_ZERO_ERROR;
134  if (sub != NULL) {
135  ucnv_setSubstChars(conv, sub, strlen(sub), &status);
136  }
137  }
138 
139  ~Converter() {
140  ucnv_close(conv);
141  }
142 
143  UConverter* conv;
144 };
145 
146 class ConverterObject : public BaseObject, Converter {
147  public:
148  enum ConverterFlags {
149  CONVERTER_FLAGS_FLUSH = 0x1,
150  CONVERTER_FLAGS_FATAL = 0x2,
151  CONVERTER_FLAGS_IGNORE_BOM = 0x4
152  };
153 
154  ~ConverterObject() override {}
155 
156  static void Has(const FunctionCallbackInfo<Value>& args) {
157  Environment* env = Environment::GetCurrent(args);
158  HandleScope scope(env->isolate());
159 
160  CHECK_GE(args.Length(), 1);
161  Utf8Value label(env->isolate(), args[0]);
162 
163  UErrorCode status = U_ZERO_ERROR;
164  UConverter* conv = ucnv_open(*label, &status);
165  args.GetReturnValue().Set(!!U_SUCCESS(status));
166  ucnv_close(conv);
167  }
168 
169  static void Create(const FunctionCallbackInfo<Value>& args) {
170  Environment* env = Environment::GetCurrent(args);
171  HandleScope scope(env->isolate());
172 
173  CHECK_GE(args.Length(), 2);
174  Utf8Value label(env->isolate(), args[0]);
175  int flags = args[1]->Uint32Value(env->context()).ToChecked();
176  bool fatal =
177  (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
178  bool ignoreBOM =
179  (flags & CONVERTER_FLAGS_IGNORE_BOM) == CONVERTER_FLAGS_IGNORE_BOM;
180 
181  UErrorCode status = U_ZERO_ERROR;
182  UConverter* conv = ucnv_open(*label, &status);
183  if (U_FAILURE(status))
184  return;
185 
186  if (fatal) {
187  status = U_ZERO_ERROR;
188  ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
189  nullptr, nullptr, nullptr, &status);
190  }
191 
192  Local<ObjectTemplate> t = ObjectTemplate::New(env->isolate());
193  t->SetInternalFieldCount(1);
194  Local<Object> obj = t->NewInstance(env->context()).ToLocalChecked();
195  new ConverterObject(env, obj, conv, ignoreBOM);
196  args.GetReturnValue().Set(obj);
197  }
198 
199  static void Decode(const FunctionCallbackInfo<Value>& args) {
200  Environment* env = Environment::GetCurrent(args);
201 
202  CHECK_GE(args.Length(), 3); // Converter, Buffer, Flags
203 
204  Converter utf8("utf8");
205  ConverterObject* converter;
206  ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
207  SPREAD_BUFFER_ARG(args[1], input_obj);
208  int flags = args[2]->Uint32Value(env->context()).ToChecked();
209 
210  UErrorCode status = U_ZERO_ERROR;
211  MaybeStackBuffer<UChar> result;
212  MaybeLocal<Object> ret;
213  size_t limit = ucnv_getMinCharSize(converter->conv) *
214  input_obj_length;
215  if (limit > 0)
216  result.AllocateSufficientStorage(limit);
217 
218  UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
219 
220  const char* source = input_obj_data;
221  size_t source_length = input_obj_length;
222 
223  if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) {
224  int32_t bomOffset = 0;
225  ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status);
226  source += bomOffset;
227  source_length -= bomOffset;
228  converter->bomSeen_ = true;
229  }
230 
231  UChar* target = *result;
232  ucnv_toUnicode(converter->conv,
233  &target, target + (limit * sizeof(UChar)),
234  &source, source + source_length,
235  NULL, flush, &status);
236 
237  if (U_SUCCESS(status)) {
238  if (limit > 0)
239  result.SetLength(target - &result[0]);
240  ret = ToBufferEndian(env, &result);
241  args.GetReturnValue().Set(ret.ToLocalChecked());
242  goto reset;
243  }
244 
245  args.GetReturnValue().Set(status);
246 
247  reset:
248  if (flush) {
249  // Reset the converter state
250  converter->bomSeen_ = false;
251  ucnv_reset(converter->conv);
252  }
253  }
254 
255  protected:
256  ConverterObject(Environment* env,
257  v8::Local<v8::Object> wrap,
258  UConverter* converter,
259  bool ignoreBOM,
260  const char* sub = NULL) :
261  BaseObject(env, wrap),
262  Converter(converter, sub),
263  ignoreBOM_(ignoreBOM) {
264  MakeWeak<ConverterObject>(this);
265 
266  switch (ucnv_getType(converter)) {
267  case UCNV_UTF8:
268  case UCNV_UTF16_BigEndian:
269  case UCNV_UTF16_LittleEndian:
270  unicode_ = true;
271  break;
272  default:
273  unicode_ = false;
274  }
275  }
276 
277  private:
278  bool unicode_ = false; // True if this is a Unicode converter
279  bool ignoreBOM_ = false; // True if the BOM should be ignored on Unicode
280  bool bomSeen_ = false; // True if the BOM has been seen
281 };
282 
283 // One-Shot Converters
284 
285 void CopySourceBuffer(MaybeStackBuffer<UChar>* dest,
286  const char* data,
287  const size_t length,
288  const size_t length_in_chars) {
289  dest->AllocateSufficientStorage(length_in_chars);
290  char* dst = reinterpret_cast<char*>(**dest);
291  memcpy(dst, data, length);
292  if (IsBigEndian()) {
293  SwapBytes16(dst, length);
294  }
295 }
296 
297 typedef MaybeLocal<Object> (*TranscodeFunc)(Environment* env,
298  const char* fromEncoding,
299  const char* toEncoding,
300  const char* source,
301  const size_t source_length,
302  UErrorCode* status);
303 
304 MaybeLocal<Object> Transcode(Environment* env,
305  const char* fromEncoding,
306  const char* toEncoding,
307  const char* source,
308  const size_t source_length,
309  UErrorCode* status) {
310  *status = U_ZERO_ERROR;
311  MaybeLocal<Object> ret;
312  MaybeStackBuffer<char> result;
313  Converter to(toEncoding, "?");
314  Converter from(fromEncoding);
315  const uint32_t limit = source_length * ucnv_getMaxCharSize(to.conv);
316  result.AllocateSufficientStorage(limit);
317  char* target = *result;
318  ucnv_convertEx(to.conv, from.conv, &target, target + limit,
319  &source, source + source_length, nullptr, nullptr,
320  nullptr, nullptr, true, true, status);
321  if (U_SUCCESS(*status)) {
322  result.SetLength(target - &result[0]);
323  ret = ToBufferEndian(env, &result);
324  }
325  return ret;
326 }
327 
328 MaybeLocal<Object> TranscodeToUcs2(Environment* env,
329  const char* fromEncoding,
330  const char* toEncoding,
331  const char* source,
332  const size_t source_length,
333  UErrorCode* status) {
334  *status = U_ZERO_ERROR;
335  MaybeLocal<Object> ret;
336  MaybeStackBuffer<UChar> destbuf(source_length);
337  Converter from(fromEncoding);
338  const size_t length_in_chars = source_length * sizeof(UChar);
339  ucnv_toUChars(from.conv, *destbuf, length_in_chars,
340  source, source_length, status);
341  if (U_SUCCESS(*status))
342  ret = ToBufferEndian(env, &destbuf);
343  return ret;
344 }
345 
346 MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
347  const char* fromEncoding,
348  const char* toEncoding,
349  const char* source,
350  const size_t source_length,
351  UErrorCode* status) {
352  *status = U_ZERO_ERROR;
353  MaybeStackBuffer<UChar> sourcebuf;
354  MaybeLocal<Object> ret;
355  Converter to(toEncoding, "?");
356  const size_t length_in_chars = source_length / sizeof(UChar);
357  CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
358  MaybeStackBuffer<char> destbuf(length_in_chars);
359  const uint32_t len = ucnv_fromUChars(to.conv, *destbuf, length_in_chars,
360  *sourcebuf, length_in_chars, status);
361  if (U_SUCCESS(*status)) {
362  destbuf.SetLength(len);
363  ret = ToBufferEndian(env, &destbuf);
364  }
365  return ret;
366 }
367 
368 MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
369  const char* fromEncoding,
370  const char* toEncoding,
371  const char* source,
372  const size_t source_length,
373  UErrorCode* status) {
374  *status = U_ZERO_ERROR;
375  MaybeStackBuffer<UChar> destbuf;
376  int32_t result_length;
377  u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
378  source, source_length, status);
379  MaybeLocal<Object> ret;
380  if (U_SUCCESS(*status)) {
381  destbuf.SetLength(result_length);
382  ret = ToBufferEndian(env, &destbuf);
383  } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
384  *status = U_ZERO_ERROR;
385  destbuf.AllocateSufficientStorage(result_length);
386  u_strFromUTF8(*destbuf, result_length, &result_length,
387  source, source_length, status);
388  if (U_SUCCESS(*status)) {
389  destbuf.SetLength(result_length);
390  ret = ToBufferEndian(env, &destbuf);
391  }
392  }
393  return ret;
394 }
395 
396 MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
397  const char* fromEncoding,
398  const char* toEncoding,
399  const char* source,
400  const size_t source_length,
401  UErrorCode* status) {
402  *status = U_ZERO_ERROR;
403  MaybeLocal<Object> ret;
404  const size_t length_in_chars = source_length / sizeof(UChar);
405  int32_t result_length;
406  MaybeStackBuffer<UChar> sourcebuf;
407  MaybeStackBuffer<char> destbuf;
408  CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
409  u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
410  *sourcebuf, length_in_chars, status);
411  if (U_SUCCESS(*status)) {
412  destbuf.SetLength(result_length);
413  ret = ToBufferEndian(env, &destbuf);
414  } else if (*status == U_BUFFER_OVERFLOW_ERROR) {
415  *status = U_ZERO_ERROR;
416  destbuf.AllocateSufficientStorage(result_length);
417  u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
418  length_in_chars, status);
419  if (U_SUCCESS(*status)) {
420  destbuf.SetLength(result_length);
421  ret = ToBufferEndian(env, &destbuf);
422  }
423  }
424  return ret;
425 }
426 
427 const char* EncodingName(const enum encoding encoding) {
428  switch (encoding) {
429  case ASCII: return "us-ascii";
430  case LATIN1: return "iso8859-1";
431  case UCS2: return "utf16le";
432  case UTF8: return "utf-8";
433  default: return NULL;
434  }
435 }
436 
437 bool SupportedEncoding(const enum encoding encoding) {
438  switch (encoding) {
439  case ASCII:
440  case LATIN1:
441  case UCS2:
442  case UTF8: return true;
443  default: return false;
444  }
445 }
446 
447 void Transcode(const FunctionCallbackInfo<Value>&args) {
448  Environment* env = Environment::GetCurrent(args);
449  Isolate* isolate = env->isolate();
450  UErrorCode status = U_ZERO_ERROR;
451  MaybeLocal<Object> result;
452 
453  THROW_AND_RETURN_UNLESS_BUFFER(env, args[0]);
454  SPREAD_BUFFER_ARG(args[0], ts_obj);
455  const enum encoding fromEncoding = ParseEncoding(isolate, args[1], BUFFER);
456  const enum encoding toEncoding = ParseEncoding(isolate, args[2], BUFFER);
457 
458  if (SupportedEncoding(fromEncoding) && SupportedEncoding(toEncoding)) {
459  TranscodeFunc tfn = &Transcode;
460  switch (fromEncoding) {
461  case ASCII:
462  case LATIN1:
463  if (toEncoding == UCS2)
464  tfn = &TranscodeToUcs2;
465  break;
466  case UTF8:
467  if (toEncoding == UCS2)
468  tfn = &TranscodeUcs2FromUtf8;
469  break;
470  case UCS2:
471  switch (toEncoding) {
472  case UCS2:
473  tfn = &Transcode;
474  break;
475  case UTF8:
476  tfn = &TranscodeUtf8FromUcs2;
477  break;
478  default:
479  tfn = &TranscodeFromUcs2;
480  }
481  break;
482  default:
483  // This should not happen because of the SupportedEncoding checks
484  ABORT();
485  }
486 
487  result = tfn(env, EncodingName(fromEncoding), EncodingName(toEncoding),
488  ts_obj_data, ts_obj_length, &status);
489  } else {
490  status = U_ILLEGAL_ARGUMENT_ERROR;
491  }
492 
493  if (result.IsEmpty())
494  return args.GetReturnValue().Set(status);
495 
496  return args.GetReturnValue().Set(result.ToLocalChecked());
497 }
498 
499 void ICUErrorName(const FunctionCallbackInfo<Value>& args) {
500  Environment* env = Environment::GetCurrent(args);
501  UErrorCode status = static_cast<UErrorCode>(args[0]->Int32Value());
502  args.GetReturnValue().Set(
503  String::NewFromUtf8(env->isolate(),
504  u_errorName(status),
505  v8::NewStringType::kNormal).ToLocalChecked());
506 }
507 
508 #define TYPE_ICU "icu"
509 #define TYPE_UNICODE "unicode"
510 #define TYPE_CLDR "cldr"
511 #define TYPE_TZ "tz"
512 
521 const char* GetVersion(const char* type,
522  char buf[U_MAX_VERSION_STRING_LENGTH],
523  UErrorCode* status) {
524  if (!strcmp(type, TYPE_ICU)) {
525  return U_ICU_VERSION;
526  } else if (!strcmp(type, TYPE_UNICODE)) {
527  return U_UNICODE_VERSION;
528  } else if (!strcmp(type, TYPE_TZ)) {
529  return TimeZone::getTZDataVersion(*status);
530  } else if (!strcmp(type, TYPE_CLDR)) {
531  UVersionInfo versionArray;
532  ulocdata_getCLDRVersion(versionArray, status);
533  if (U_SUCCESS(*status)) {
534  u_versionToString(versionArray, buf);
535  return buf;
536  }
537  }
538  // Fall through - unknown type or error case
539  return nullptr;
540 }
541 
542 void GetVersion(const FunctionCallbackInfo<Value>& args) {
543  Environment* env = Environment::GetCurrent(args);
544  if ( args.Length() == 0 ) {
545  // With no args - return a comma-separated list of allowed values
546  args.GetReturnValue().Set(
547  String::NewFromUtf8(env->isolate(),
548  TYPE_ICU ","
549  TYPE_UNICODE ","
550  TYPE_CLDR ","
551  TYPE_TZ));
552  } else {
553  CHECK_GE(args.Length(), 1);
554  CHECK(args[0]->IsString());
555  Utf8Value val(env->isolate(), args[0]);
556  UErrorCode status = U_ZERO_ERROR;
557  char buf[U_MAX_VERSION_STRING_LENGTH] = ""; // Possible output buffer.
558  const char* versionString = GetVersion(*val, buf, &status);
559 
560  if (U_SUCCESS(status) && versionString) {
561  // Success.
562  args.GetReturnValue().Set(
563  String::NewFromUtf8(env->isolate(),
564  versionString));
565  }
566  }
567 }
568 
569 } // anonymous namespace
570 
571 bool InitializeICUDirectory(const std::string& path) {
572  UErrorCode status = U_ZERO_ERROR;
573  if (path.empty()) {
574 #ifdef NODE_HAVE_SMALL_ICU
575  // install the 'small' data.
576  udata_setCommonData(&SMALL_ICUDATA_ENTRY_POINT, &status);
577 #else // !NODE_HAVE_SMALL_ICU
578  // no small data, so nothing to do.
579 #endif // !NODE_HAVE_SMALL_ICU
580  } else {
581  u_setDataDirectory(path.c_str());
582  u_init(&status);
583  }
584  return status == U_ZERO_ERROR;
585 }
586 
587 int32_t ToUnicode(MaybeStackBuffer<char>* buf,
588  const char* input,
589  size_t length) {
590  UErrorCode status = U_ZERO_ERROR;
591  uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
592  UIDNA* uidna = uidna_openUTS46(options, &status);
593  if (U_FAILURE(status))
594  return -1;
595  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
596 
597  int32_t len = uidna_nameToUnicodeUTF8(uidna,
598  input, length,
599  **buf, buf->capacity(),
600  &info,
601  &status);
602 
603  // Do not check info.errors like we do with ToASCII since ToUnicode always
604  // returns a string, despite any possible errors that may have occurred.
605 
606  if (status == U_BUFFER_OVERFLOW_ERROR) {
607  status = U_ZERO_ERROR;
608  buf->AllocateSufficientStorage(len);
609  len = uidna_nameToUnicodeUTF8(uidna,
610  input, length,
611  **buf, buf->capacity(),
612  &info,
613  &status);
614  }
615 
616  // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
617  // string, regardless of whether an error occurred.
618 
619  if (U_FAILURE(status)) {
620  len = -1;
621  buf->SetLength(0);
622  } else {
623  buf->SetLength(len);
624  }
625 
626  uidna_close(uidna);
627  return len;
628 }
629 
630 int32_t ToASCII(MaybeStackBuffer<char>* buf,
631  const char* input,
632  size_t length,
633  enum idna_mode mode) {
634  UErrorCode status = U_ZERO_ERROR;
635  uint32_t options = // CheckHyphens = false; handled later
636  UIDNA_CHECK_BIDI | // CheckBidi = true
637  UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true
638  UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing
639  if (mode == IDNA_STRICT) {
640  options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict
641  // VerifyDnsLength = beStrict;
642  // handled later
643  }
644 
645  UIDNA* uidna = uidna_openUTS46(options, &status);
646  if (U_FAILURE(status))
647  return -1;
648  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
649 
650  int32_t len = uidna_nameToASCII_UTF8(uidna,
651  input, length,
652  **buf, buf->capacity(),
653  &info,
654  &status);
655 
656  if (status == U_BUFFER_OVERFLOW_ERROR) {
657  status = U_ZERO_ERROR;
658  buf->AllocateSufficientStorage(len);
659  len = uidna_nameToASCII_UTF8(uidna,
660  input, length,
661  **buf, buf->capacity(),
662  &info,
663  &status);
664  }
665 
666  // In UTS #46 which specifies ToASCII, certain error conditions are
667  // configurable through options, and the WHATWG URL Standard promptly elects
668  // to disable some of them to accommodate for real-world use cases.
669  // Unfortunately, ICU4C's IDNA module does not support disabling some of
670  // these options through `options` above, and thus continues throwing
671  // unnecessary errors. To counter this situation, we just filter out the
672  // errors that may have happened afterwards, before deciding whether to
673  // return an error from this function.
674 
675  // CheckHyphens = false
676  // (Specified in the current UTS #46 draft rev. 18.)
677  // Refs:
678  // - https://github.com/whatwg/url/issues/53
679  // - https://github.com/whatwg/url/pull/309
680  // - http://www.unicode.org/review/pri317/
681  // - http://www.unicode.org/reports/tr46/tr46-18.html
682  // - https://www.icann.org/news/announcement-2000-01-07-en
683  info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
684  info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
685  info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
686 
687  if (mode != IDNA_STRICT) {
688  // VerifyDnsLength = beStrict
689  info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
690  info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
691  info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
692  }
693 
694  if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) {
695  len = -1;
696  buf->SetLength(0);
697  } else {
698  buf->SetLength(len);
699  }
700 
701  uidna_close(uidna);
702  return len;
703 }
704 
705 static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
706  Environment* env = Environment::GetCurrent(args);
707  CHECK_GE(args.Length(), 1);
708  CHECK(args[0]->IsString());
709  Utf8Value val(env->isolate(), args[0]);
710 
711  MaybeStackBuffer<char> buf;
712  int32_t len = ToUnicode(&buf, *val, val.length());
713 
714  if (len < 0) {
715  return env->ThrowError("Cannot convert name to Unicode");
716  }
717 
718  args.GetReturnValue().Set(
719  String::NewFromUtf8(env->isolate(),
720  *buf,
721  v8::NewStringType::kNormal,
722  len).ToLocalChecked());
723 }
724 
725 static void ToASCII(const FunctionCallbackInfo<Value>& args) {
726  Environment* env = Environment::GetCurrent(args);
727  CHECK_GE(args.Length(), 1);
728  CHECK(args[0]->IsString());
729  Utf8Value val(env->isolate(), args[0]);
730  // optional arg
731  bool lenient = args[1]->BooleanValue(env->context()).FromJust();
732  enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT;
733 
734  MaybeStackBuffer<char> buf;
735  int32_t len = ToASCII(&buf, *val, val.length(), mode);
736 
737  if (len < 0) {
738  return env->ThrowError("Cannot convert name to ASCII");
739  }
740 
741  args.GetReturnValue().Set(
742  String::NewFromUtf8(env->isolate(),
743  *buf,
744  v8::NewStringType::kNormal,
745  len).ToLocalChecked());
746 }
747 
748 // This is similar to wcwidth except that it takes the current unicode
749 // character properties database into consideration, allowing it to
750 // correctly calculate the column widths of things like emoji and
751 // newer wide characters. wcwidth, on the other hand, uses a fixed
752 // algorithm that does not take things like emoji into proper
753 // consideration.
754 //
755 // TODO(TimothyGu): Investigate Cc (C0/C1 control codes). Both VTE (used by
756 // GNOME Terminal) and Konsole don't consider them to be zero-width (see refs
757 // below), and when printed in VTE it is Narrow. However GNOME Terminal doesn't
758 // allow it to be input. Linux's PTY terminal prints control characters as
759 // Narrow rhombi.
760 //
761 // TODO(TimothyGu): Investigate Hangul jamo characters. Medial vowels and final
762 // consonants are 0-width when combined with initial consonants; otherwise they
763 // are technically Wide. But many terminals (including Konsole and
764 // VTE/GLib-based) implement all medials and finals as 0-width.
765 //
766 // Refs: https://eev.ee/blog/2015/09/12/dark-corners-of-unicode/#combining-characters-and-character-width
767 // Refs: https://github.com/GNOME/glib/blob/79e4d4c6be/glib/guniprop.c#L388-L420
768 // Refs: https://github.com/KDE/konsole/blob/8c6a5d13c0/src/konsole_wcwidth.cpp#L101-L223
769 static int GetColumnWidth(UChar32 codepoint,
770  bool ambiguous_as_full_width = false) {
771  const auto zero_width_mask = U_GC_CC_MASK | // C0/C1 control code
772  U_GC_CF_MASK | // Format control character
773  U_GC_ME_MASK | // Enclosing mark
774  U_GC_MN_MASK; // Nonspacing mark
775  if (codepoint != 0x00AD && // SOFT HYPHEN is Cf but not zero-width
776  ((U_MASK(u_charType(codepoint)) & zero_width_mask) ||
777  u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER))) {
778  return 0;
779  }
780 
781  // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a
782  // codepoint as being full width, wide, ambiguous, neutral, narrow,
783  // or halfwidth.
784  const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH);
785  switch (eaw) {
786  case U_EA_FULLWIDTH:
787  case U_EA_WIDE:
788  return 2;
789  case U_EA_AMBIGUOUS:
790  // See: http://www.unicode.org/reports/tr11/#Ambiguous for details
791  if (ambiguous_as_full_width) {
792  return 2;
793  }
794  // Fall through if ambiguous_as_full_width if false.
795  case U_EA_NEUTRAL:
796  if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
797  return 2;
798  }
799  // Fall through
800  case U_EA_HALFWIDTH:
801  case U_EA_NARROW:
802  default:
803  return 1;
804  }
805 }
806 
807 // Returns the column width for the given String.
808 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
809  Environment* env = Environment::GetCurrent(args);
810  if (args.Length() < 1)
811  return;
812 
813  bool ambiguous_as_full_width = args[1]->BooleanValue();
814  bool expand_emoji_sequence = args[2]->BooleanValue();
815 
816  if (args[0]->IsNumber()) {
817  args.GetReturnValue().Set(
818  GetColumnWidth(args[0]->Uint32Value(),
819  ambiguous_as_full_width));
820  return;
821  }
822 
823  TwoByteValue value(env->isolate(), args[0]);
824  // reinterpret_cast is required by windows to compile
825  UChar* str = reinterpret_cast<UChar*>(*value);
826  static_assert(sizeof(*str) == sizeof(**value),
827  "sizeof(*str) == sizeof(**value)");
828  UChar32 c = 0;
829  UChar32 p;
830  size_t n = 0;
831  uint32_t width = 0;
832 
833  while (n < value.length()) {
834  p = c;
835  U16_NEXT(str, n, value.length(), c);
836  // Don't count individual emoji codepoints that occur within an
837  // emoji sequence. This is not necessarily foolproof. Some
838  // environments display emoji sequences in the appropriate
839  // condensed form (as a single emoji glyph), other environments
840  // may not understand an emoji sequence and will display each
841  // individual emoji separately. When this happens, the width
842  // calculated will be off, and there's no reliable way of knowing
843  // in advance if a particular sequence is going to be supported.
844  // The expand_emoji_sequence option allows the caller to skip this
845  // check and count each code within an emoji sequence separately.
846  if (!expand_emoji_sequence &&
847  n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner)
848  (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
849  u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) {
850  continue;
851  }
852  width += GetColumnWidth(c, ambiguous_as_full_width);
853  }
854  args.GetReturnValue().Set(width);
855 }
856 
857 void Init(Local<Object> target,
858  Local<Value> unused,
859  Local<Context> context,
860  void* priv) {
861  Environment* env = Environment::GetCurrent(context);
862  env->SetMethod(target, "toUnicode", ToUnicode);
863  env->SetMethod(target, "toASCII", ToASCII);
864  env->SetMethod(target, "getStringWidth", GetStringWidth);
865  env->SetMethod(target, "getVersion", GetVersion);
866 
867  // One-shot converters
868  env->SetMethod(target, "icuErrName", ICUErrorName);
869  env->SetMethod(target, "transcode", Transcode);
870 
871  // ConverterObject
872  env->SetMethod(target, "getConverter", ConverterObject::Create);
873  env->SetMethod(target, "decode", ConverterObject::Decode);
874  env->SetMethod(target, "hasConverter", ConverterObject::Has);
875 }
876 
877 } // namespace i18n
878 } // namespace node
879 
881 
882 #endif // NODE_HAVE_I18N_SUPPORT
unsigned char * buf
Definition: cares_wrap.cc:483
NODE_MODULE_CONTEXT_AWARE_BUILTIN(inspector, node::inspector::Agent::InitInspector)
int len
Definition: cares_wrap.cc:485
std::string source
Definition: module_wrap.cc:306
QueryWrap * wrap
Definition: cares_wrap.cc:478
int status
Definition: cares_wrap.cc:479
union node::cares_wrap::@8::CaresAsyncData::@0 data
encoding
Definition: node.h:322
dtrace p
Definition: v8ustack.d:615
dtrace t
Definition: v8ustack.d:582
enum encoding ParseEncoding(const char *encoding, enum encoding default_encoding)
Definition: node.cc:1485
MaybeLocal< Object > New(Isolate *isolate, Local< String > string, enum encoding enc)
Definition: node_buffer.cc:241
dtrace n
Definition: v8ustack.d:531
void Init(int *argc, const char **argv, int *exec_argc, const char ***exec_argv)
Definition: node.cc:4351