LLVM  16.0.0git
ConvertUTF.cpp
Go to the documentation of this file.
1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===------------------------------------------------------------------------=*/
8 /*
9  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
10  * Distributed under the Terms of Use in
11  * http://www.unicode.org/copyright.html.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining
14  * a copy of the Unicode data files and any associated documentation
15  * (the "Data Files") or Unicode software and any associated documentation
16  * (the "Software") to deal in the Data Files or Software
17  * without restriction, including without limitation the rights to use,
18  * copy, modify, merge, publish, distribute, and/or sell copies of
19  * the Data Files or Software, and to permit persons to whom the Data Files
20  * or Software are furnished to do so, provided that
21  * (a) this copyright and permission notice appear with all copies
22  * of the Data Files or Software,
23  * (b) this copyright and permission notice appear in associated
24  * documentation, and
25  * (c) there is clear notice in each modified Data File or in the Software
26  * as well as in the documentation associated with the Data File(s) or
27  * Software that the data or software has been modified.
28  *
29  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
30  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
31  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
33  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
34  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
35  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
36  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
37  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
38  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
39  *
40  * Except as contained in this notice, the name of a copyright holder
41  * shall not be used in advertising or otherwise to promote the sale,
42  * use or other dealings in these Data Files or Software without prior
43  * written authorization of the copyright holder.
44  */
45 
46 /* ---------------------------------------------------------------------
47 
48  Conversions between UTF32, UTF-16, and UTF-8. Source code file.
49  Author: Mark E. Davis, 1994.
50  Rev History: Rick McGowan, fixes & updates May 2001.
51  Sept 2001: fixed const & error conditions per
52  mods suggested by S. Parent & A. Lillich.
53  June 2002: Tim Dodd added detection and handling of incomplete
54  source sequences, enhanced error detection, added casts
55  to eliminate compiler warnings.
56  July 2003: slight mods to back out aggressive FFFE detection.
57  Jan 2004: updated switches in from-UTF8 conversions.
58  Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
59 
60  See the header file "ConvertUTF.h" for complete documentation.
61 
62 ------------------------------------------------------------------------ */
63 
65 #ifdef CVTUTF_DEBUG
66 #include <stdio.h>
67 #endif
68 #include <assert.h>
69 
70 /*
71  * This code extensively uses fall-through switches.
72  * Keep the compiler from warning about that.
73  */
74 #if defined(__clang__) && defined(__has_warning)
75 # if __has_warning("-Wimplicit-fallthrough")
76 # define ConvertUTF_DISABLE_WARNINGS \
77  _Pragma("clang diagnostic push") \
78  _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
79 # define ConvertUTF_RESTORE_WARNINGS \
80  _Pragma("clang diagnostic pop")
81 # endif
82 #elif defined(__GNUC__) && __GNUC__ > 6
83 # define ConvertUTF_DISABLE_WARNINGS \
84  _Pragma("GCC diagnostic push") \
85  _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
86 # define ConvertUTF_RESTORE_WARNINGS \
87  _Pragma("GCC diagnostic pop")
88 #endif
89 #ifndef ConvertUTF_DISABLE_WARNINGS
90 # define ConvertUTF_DISABLE_WARNINGS
91 #endif
92 #ifndef ConvertUTF_RESTORE_WARNINGS
93 # define ConvertUTF_RESTORE_WARNINGS
94 #endif
95 
97 
98 namespace llvm {
99 
100 static const int halfShift = 10; /* used for shifting by 10 bits */
101 
102 static const UTF32 halfBase = 0x0010000UL;
103 static const UTF32 halfMask = 0x3FFUL;
104 
105 #define UNI_SUR_HIGH_START (UTF32)0xD800
106 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
107 #define UNI_SUR_LOW_START (UTF32)0xDC00
108 #define UNI_SUR_LOW_END (UTF32)0xDFFF
109 
110 /* --------------------------------------------------------------------- */
111 
112 /*
113  * Index into the table below with the first byte of a UTF-8 sequence to
114  * get the number of trailing bytes that are supposed to follow it.
115  * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
116  * left as-is for anyone who may want to do such conversion, which was
117  * allowed in earlier algorithms.
118  */
119 static const char trailingBytesForUTF8[256] = {
120  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
121  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
122  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
123  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
124  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
125  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
126  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
127  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
128 };
129 
130 /*
131  * Magic values subtracted from a buffer value during UTF8 conversion.
132  * This table contains as many values as there might be trailing bytes
133  * in a UTF-8 sequence.
134  */
135 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
136  0x03C82080UL, 0xFA082080UL, 0x82082080UL };
137 
138 /*
139  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
140  * into the first byte, depending on how many bytes follow. There are
141  * as many entries in this table as there are UTF-8 sequence types.
142  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
143  * for *legal* UTF-8 will be 4 or fewer bytes total.
144  */
145 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
146 
147 /* --------------------------------------------------------------------- */
148 
149 /* The interface converts a whole buffer to avoid function-call overhead.
150  * Constants have been gathered. Loops & conditionals have been removed as
151  * much as possible for efficiency, in favor of drop-through switches.
152  * (See "Note A" at the bottom of the file for equivalent code.)
153  * If your compiler supports it, the "isLegalUTF8" call can be turned
154  * into an inline function.
155  */
156 
157 
158 /* --------------------------------------------------------------------- */
159 
161  const UTF32** sourceStart, const UTF32* sourceEnd,
162  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
164  const UTF32* source = *sourceStart;
165  UTF16* target = *targetStart;
166  while (source < sourceEnd) {
167  UTF32 ch;
168  if (target >= targetEnd) {
169  result = targetExhausted; break;
170  }
171  ch = *source++;
172  if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
173  /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
174  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
175  if (flags == strictConversion) {
176  --source; /* return to the illegal value itself */
178  break;
179  } else {
180  *target++ = UNI_REPLACEMENT_CHAR;
181  }
182  } else {
183  *target++ = (UTF16)ch; /* normal case */
184  }
185  } else if (ch > UNI_MAX_LEGAL_UTF32) {
186  if (flags == strictConversion) {
188  } else {
189  *target++ = UNI_REPLACEMENT_CHAR;
190  }
191  } else {
192  /* target is a character in range 0xFFFF - 0x10FFFF. */
193  if (target + 1 >= targetEnd) {
194  --source; /* Back up source pointer! */
195  result = targetExhausted; break;
196  }
197  ch -= halfBase;
198  *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
199  *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
200  }
201  }
202  *sourceStart = source;
203  *targetStart = target;
204  return result;
205 }
206 
207 /* --------------------------------------------------------------------- */
208 
210  const UTF16** sourceStart, const UTF16* sourceEnd,
211  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
213  const UTF16* source = *sourceStart;
214  UTF32* target = *targetStart;
215  UTF32 ch, ch2;
216  while (source < sourceEnd) {
217  const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
218  ch = *source++;
219  /* If we have a surrogate pair, convert to UTF32 first. */
220  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
221  /* If the 16 bits following the high surrogate are in the source buffer... */
222  if (source < sourceEnd) {
223  ch2 = *source;
224  /* If it's a low surrogate, convert to UTF32. */
225  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
226  ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
227  + (ch2 - UNI_SUR_LOW_START) + halfBase;
228  ++source;
229  } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
230  --source; /* return to the illegal value itself */
232  break;
233  }
234  } else { /* We don't have the 16 bits following the high surrogate. */
235  --source; /* return to the high surrogate */
237  break;
238  }
239  } else if (flags == strictConversion) {
240  /* UTF-16 surrogate values are illegal in UTF-32 */
241  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
242  --source; /* return to the illegal value itself */
244  break;
245  }
246  }
247  if (target >= targetEnd) {
248  source = oldSource; /* Back up source pointer! */
249  result = targetExhausted; break;
250  }
251  *target++ = ch;
252  }
253  *sourceStart = source;
254  *targetStart = target;
255 #ifdef CVTUTF_DEBUG
256 if (result == sourceIllegal) {
257  fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
258  fflush(stderr);
259 }
260 #endif
261  return result;
262 }
264  const UTF16** sourceStart, const UTF16* sourceEnd,
265  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
267  const UTF16* source = *sourceStart;
268  UTF8* target = *targetStart;
269  while (source < sourceEnd) {
270  UTF32 ch;
271  unsigned short bytesToWrite = 0;
272  const UTF32 byteMask = 0xBF;
273  const UTF32 byteMark = 0x80;
274  const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
275  ch = *source++;
276  /* If we have a surrogate pair, convert to UTF32 first. */
277  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
278  /* If the 16 bits following the high surrogate are in the source buffer... */
279  if (source < sourceEnd) {
280  UTF32 ch2 = *source;
281  /* If it's a low surrogate, convert to UTF32. */
282  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
283  ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
284  + (ch2 - UNI_SUR_LOW_START) + halfBase;
285  ++source;
286  } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
287  --source; /* return to the illegal value itself */
289  break;
290  }
291  } else { /* We don't have the 16 bits following the high surrogate. */
292  --source; /* return to the high surrogate */
294  break;
295  }
296  } else if (flags == strictConversion) {
297  /* UTF-16 surrogate values are illegal in UTF-32 */
298  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
299  --source; /* return to the illegal value itself */
301  break;
302  }
303  }
304  /* Figure out how many bytes the result will require */
305  if (ch < (UTF32)0x80) { bytesToWrite = 1;
306  } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
307  } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
308  } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
309  } else { bytesToWrite = 3;
311  }
312 
313  target += bytesToWrite;
314  if (target > targetEnd) {
315  source = oldSource; /* Back up source pointer! */
316  target -= bytesToWrite; result = targetExhausted; break;
317  }
318  switch (bytesToWrite) { /* note: everything falls through. */
319  case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
320  case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
321  case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
322  case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
323  }
324  target += bytesToWrite;
325  }
326  *sourceStart = source;
327  *targetStart = target;
328  return result;
329 }
330 
331 /* --------------------------------------------------------------------- */
332 
334  const UTF32** sourceStart, const UTF32* sourceEnd,
335  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
337  const UTF32* source = *sourceStart;
338  UTF8* target = *targetStart;
339  while (source < sourceEnd) {
340  UTF32 ch;
341  unsigned short bytesToWrite = 0;
342  const UTF32 byteMask = 0xBF;
343  const UTF32 byteMark = 0x80;
344  ch = *source++;
345  if (flags == strictConversion ) {
346  /* UTF-16 surrogate values are illegal in UTF-32 */
347  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
348  --source; /* return to the illegal value itself */
350  break;
351  }
352  }
353  /*
354  * Figure out how many bytes the result will require. Turn any
355  * illegally large UTF32 things (> Plane 17) into replacement chars.
356  */
357  if (ch < (UTF32)0x80) { bytesToWrite = 1;
358  } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
359  } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
360  } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
361  } else { bytesToWrite = 3;
364  }
365 
366  target += bytesToWrite;
367  if (target > targetEnd) {
368  --source; /* Back up source pointer! */
369  target -= bytesToWrite; result = targetExhausted; break;
370  }
371  switch (bytesToWrite) { /* note: everything falls through. */
372  case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
373  case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
374  case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
375  case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
376  }
377  target += bytesToWrite;
378  }
379  *sourceStart = source;
380  *targetStart = target;
381  return result;
382 }
383 
384 /* --------------------------------------------------------------------- */
385 
386 /*
387  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
388  * This must be called with the length pre-determined by the first byte.
389  * If not calling this from ConvertUTF8to*, then the length can be set by:
390  * length = trailingBytesForUTF8[*source]+1;
391  * and the sequence is illegal right away if there aren't that many bytes
392  * available.
393  * If presented with a length > 4, this returns false. The Unicode
394  * definition of UTF-8 goes up to 4-byte sequences.
395  */
396 
397 static Boolean isLegalUTF8(const UTF8 *source, int length) {
398  UTF8 a;
399  const UTF8 *srcptr = source+length;
400  switch (length) {
401  default: return false;
402  /* Everything else falls through when "true"... */
403  case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
404  case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
405  case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
406 
407  switch (*source) {
408  /* no fall-through in this inner switch */
409  case 0xE0: if (a < 0xA0) return false; break;
410  case 0xED: if (a > 0x9F) return false; break;
411  case 0xF0: if (a < 0x90) return false; break;
412  case 0xF4: if (a > 0x8F) return false; break;
413  default: if (a < 0x80) return false;
414  }
415 
416  case 1: if (*source >= 0x80 && *source < 0xC2) return false;
417  }
418  if (*source > 0xF4) return false;
419  return true;
420 }
421 
422 /* --------------------------------------------------------------------- */
423 
424 /*
425  * Exported function to return whether a UTF-8 sequence is legal or not.
426  * Within this file it is only used by an assertion; otherwise it's just exported.
427  */
428 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
429  int length = trailingBytesForUTF8[*source]+1;
430  if (length > sourceEnd - source) {
431  return false;
432  }
433  return isLegalUTF8(source, length);
434 }
435 
436 /*
437  * Exported function to return the size of the first utf-8 code unit sequence,
438  * Or 0 if the sequence is not valid;
439  */
440 unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
441  int length = trailingBytesForUTF8[*source] + 1;
442  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443  : 0;
444 }
445 
446 /* --------------------------------------------------------------------- */
447 
448 static unsigned
450  const UTF8 *sourceEnd) {
451  UTF8 b1, b2, b3;
452 
453  assert(!isLegalUTF8Sequence(source, sourceEnd));
454 
455  /*
456  * Unicode 6.3.0, D93b:
457  *
458  * Maximal subpart of an ill-formed subsequence: The longest code unit
459  * subsequence starting at an unconvertible offset that is either:
460  * a. the initial subsequence of a well-formed code unit sequence, or
461  * b. a subsequence of length one.
462  */
463 
464  if (source == sourceEnd)
465  return 0;
466 
467  /*
468  * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
469  * Byte Sequences.
470  */
471 
472  b1 = *source;
473  ++source;
474  if (b1 >= 0xC2 && b1 <= 0xDF) {
475  /*
476  * First byte is valid, but we know that this code unit sequence is
477  * invalid, so the maximal subpart has to end after the first byte.
478  */
479  return 1;
480  }
481 
482  if (source == sourceEnd)
483  return 1;
484 
485  b2 = *source;
486  ++source;
487 
488  if (b1 == 0xE0) {
489  return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
490  }
491  if (b1 >= 0xE1 && b1 <= 0xEC) {
492  return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
493  }
494  if (b1 == 0xED) {
495  return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
496  }
497  if (b1 >= 0xEE && b1 <= 0xEF) {
498  return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
499  }
500  if (b1 == 0xF0) {
501  if (b2 >= 0x90 && b2 <= 0xBF) {
502  if (source == sourceEnd)
503  return 2;
504 
505  b3 = *source;
506  return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
507  }
508  return 1;
509  }
510  if (b1 >= 0xF1 && b1 <= 0xF3) {
511  if (b2 >= 0x80 && b2 <= 0xBF) {
512  if (source == sourceEnd)
513  return 2;
514 
515  b3 = *source;
516  return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
517  }
518  return 1;
519  }
520  if (b1 == 0xF4) {
521  if (b2 >= 0x80 && b2 <= 0x8F) {
522  if (source == sourceEnd)
523  return 2;
524 
525  b3 = *source;
526  return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
527  }
528  return 1;
529  }
530 
531  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
532  /*
533  * There are no valid sequences that start with these bytes. Maximal subpart
534  * is defined to have length 1 in these cases.
535  */
536  return 1;
537 }
538 
539 /* --------------------------------------------------------------------- */
540 
541 /*
542  * Exported function to return the total number of bytes in a codepoint
543  * represented in UTF-8, given the value of the first byte.
544  */
545 unsigned getNumBytesForUTF8(UTF8 first) {
546  return trailingBytesForUTF8[first] + 1;
547 }
548 
549 /* --------------------------------------------------------------------- */
550 
551 /*
552  * Exported function to return whether a UTF-8 string is legal or not.
553  * This is not used here; it's just exported.
554  */
555 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
556  while (*source != sourceEnd) {
557  int length = trailingBytesForUTF8[**source] + 1;
558  if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
559  return false;
560  *source += length;
561  }
562  return true;
563 }
564 
565 /* --------------------------------------------------------------------- */
566 
568  const UTF8** sourceStart, const UTF8* sourceEnd,
569  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
571  const UTF8* source = *sourceStart;
572  UTF16* target = *targetStart;
573  while (source < sourceEnd) {
574  UTF32 ch = 0;
575  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
576  if (extraBytesToRead >= sourceEnd - source) {
577  result = sourceExhausted; break;
578  }
579  /* Do this check whether lenient or strict */
580  if (!isLegalUTF8(source, extraBytesToRead+1)) {
582  break;
583  }
584  /*
585  * The cases all fall through. See "Note A" below.
586  */
587  switch (extraBytesToRead) {
588  case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
589  case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
590  case 3: ch += *source++; ch <<= 6;
591  case 2: ch += *source++; ch <<= 6;
592  case 1: ch += *source++; ch <<= 6;
593  case 0: ch += *source++;
594  }
595  ch -= offsetsFromUTF8[extraBytesToRead];
596 
597  if (target >= targetEnd) {
598  source -= (extraBytesToRead+1); /* Back up source pointer! */
599  result = targetExhausted; break;
600  }
601  if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
602  /* UTF-16 surrogate values are illegal in UTF-32 */
603  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
604  if (flags == strictConversion) {
605  source -= (extraBytesToRead+1); /* return to the illegal value itself */
607  break;
608  } else {
609  *target++ = UNI_REPLACEMENT_CHAR;
610  }
611  } else {
612  *target++ = (UTF16)ch; /* normal case */
613  }
614  } else if (ch > UNI_MAX_UTF16) {
615  if (flags == strictConversion) {
617  source -= (extraBytesToRead+1); /* return to the start */
618  break; /* Bail out; shouldn't continue */
619  } else {
620  *target++ = UNI_REPLACEMENT_CHAR;
621  }
622  } else {
623  /* target is a character in range 0xFFFF - 0x10FFFF. */
624  if (target + 1 >= targetEnd) {
625  source -= (extraBytesToRead+1); /* Back up source pointer! */
626  result = targetExhausted; break;
627  }
628  ch -= halfBase;
629  *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
630  *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
631  }
632  }
633  *sourceStart = source;
634  *targetStart = target;
635  return result;
636 }
637 
638 /* --------------------------------------------------------------------- */
639 
641  const UTF8** sourceStart, const UTF8* sourceEnd,
642  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
643  Boolean InputIsPartial) {
645  const UTF8* source = *sourceStart;
646  UTF32* target = *targetStart;
647  while (source < sourceEnd) {
648  UTF32 ch = 0;
649  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
650  if (extraBytesToRead >= sourceEnd - source) {
651  if (flags == strictConversion || InputIsPartial) {
653  break;
654  } else {
656 
657  /*
658  * Replace the maximal subpart of ill-formed sequence with
659  * replacement character.
660  */
662  sourceEnd);
663  *target++ = UNI_REPLACEMENT_CHAR;
664  continue;
665  }
666  }
667  if (target >= targetEnd) {
668  result = targetExhausted; break;
669  }
670 
671  /* Do this check whether lenient or strict */
672  if (!isLegalUTF8(source, extraBytesToRead+1)) {
674  if (flags == strictConversion) {
675  /* Abort conversion. */
676  break;
677  } else {
678  /*
679  * Replace the maximal subpart of ill-formed sequence with
680  * replacement character.
681  */
683  sourceEnd);
684  *target++ = UNI_REPLACEMENT_CHAR;
685  continue;
686  }
687  }
688  /*
689  * The cases all fall through. See "Note A" below.
690  */
691  switch (extraBytesToRead) {
692  case 5: ch += *source++; ch <<= 6;
693  case 4: ch += *source++; ch <<= 6;
694  case 3: ch += *source++; ch <<= 6;
695  case 2: ch += *source++; ch <<= 6;
696  case 1: ch += *source++; ch <<= 6;
697  case 0: ch += *source++;
698  }
699  ch -= offsetsFromUTF8[extraBytesToRead];
700 
701  if (ch <= UNI_MAX_LEGAL_UTF32) {
702  /*
703  * UTF-16 surrogate values are illegal in UTF-32, and anything
704  * over Plane 17 (> 0x10FFFF) is illegal.
705  */
706  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
707  if (flags == strictConversion) {
708  source -= (extraBytesToRead+1); /* return to the illegal value itself */
710  break;
711  } else {
712  *target++ = UNI_REPLACEMENT_CHAR;
713  }
714  } else {
715  *target++ = ch;
716  }
717  } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
719  *target++ = UNI_REPLACEMENT_CHAR;
720  }
721  }
722  *sourceStart = source;
723  *targetStart = target;
724  return result;
725 }
726 
728  const UTF8 *sourceEnd,
729  UTF32 **targetStart,
730  UTF32 *targetEnd,
731  ConversionFlags flags) {
732  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
733  flags, /*InputIsPartial=*/true);
734 }
735 
737  const UTF8 *sourceEnd, UTF32 **targetStart,
738  UTF32 *targetEnd, ConversionFlags flags) {
739  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
740  flags, /*InputIsPartial=*/false);
741 }
742 
743 /* ---------------------------------------------------------------------
744 
745  Note A.
746  The fall-through switches in UTF-8 reading code save a
747  temp variable, some decrements & conditionals. The switches
748  are equivalent to the following loop:
749  {
750  int tmpBytesToRead = extraBytesToRead+1;
751  do {
752  ch += *source++;
753  --tmpBytesToRead;
754  if (tmpBytesToRead) ch <<= 6;
755  } while (tmpBytesToRead > 0);
756  }
757  In UTF-8 writing code, the switches on "bytesToWrite" are
758  similarly unrolled loops.
759 
760  --------------------------------------------------------------------- */
761 
762 } // namespace llvm
763 
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::findMaximalSubpartOfIllFormedUTF8Sequence
static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:449
llvm::ConvertUTF8toUTF32Partial
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:727
llvm::conversionOK
@ conversionOK
Definition: ConvertUTF.h:149
UNI_MAX_BMP
#define UNI_MAX_BMP
Definition: ConvertUTF.h:135
ConvertUTF_RESTORE_WARNINGS
#define ConvertUTF_RESTORE_WARNINGS
Definition: ConvertUTF.cpp:93
llvm::Boolean
unsigned char Boolean
Definition: ConvertUTF.h:131
a
=0.0 ? 0.0 :(a > 0.0 ? 1.0 :-1.0) a
Definition: README.txt:489
result
It looks like we only need to define PPCfmarto for these because according to these instructions perform RTO on fma s result
Definition: README_P9.txt:256
llvm::sourceIllegal
@ sourceIllegal
Definition: ConvertUTF.h:152
llvm::ConvertUTF32toUTF8
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:333
llvm::offsetsFromUTF8
static const UTF32 offsetsFromUTF8[6]
Definition: ConvertUTF.cpp:135
UNI_SUR_LOW_START
#define UNI_SUR_LOW_START
Definition: ConvertUTF.cpp:107
llvm::sourceExhausted
@ sourceExhausted
Definition: ConvertUTF.h:150
UNI_SUR_LOW_END
#define UNI_SUR_LOW_END
Definition: ConvertUTF.cpp:108
llvm::halfBase
static const UTF32 halfBase
Definition: ConvertUTF.cpp:102
llvm::ConversionFlags
ConversionFlags
Definition: ConvertUTF.h:155
llvm::targetExhausted
@ targetExhausted
Definition: ConvertUTF.h:151
b2
int b2
Definition: README.txt:84
llvm::firstByteMark
static const UTF8 firstByteMark[7]
Definition: ConvertUTF.cpp:145
llvm::ConversionResult
ConversionResult
Definition: ConvertUTF.h:148
llvm::ConvertUTF8toUTF32Impl
static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)
Definition: ConvertUTF.cpp:640
UNI_SUR_HIGH_START
#define UNI_SUR_HIGH_START
Definition: ConvertUTF.cpp:105
llvm::isLegalUTF8
static Boolean isLegalUTF8(const UTF8 *source, int length)
Definition: ConvertUTF.cpp:397
b3
int b3
Definition: README.txt:84
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::isLegalUTF8Sequence
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:428
b1
int b1
Definition: README.txt:84
llvm::ConvertUTF32toUTF16
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:160
llvm::ConvertUTF8toUTF32
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:736
llvm::halfMask
static const UTF32 halfMask
Definition: ConvertUTF.cpp:103
llvm::strictConversion
@ strictConversion
Definition: ConvertUTF.h:156
llvm::getNumBytesForUTF8
unsigned getNumBytesForUTF8(UTF8 firstByte)
Definition: ConvertUTF.cpp:545
ConvertUTF.h
llvm::ConvertUTF8toUTF16
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:567
llvm::getUTF8SequenceSize
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:440
UNI_SUR_HIGH_END
#define UNI_SUR_HIGH_END
Definition: ConvertUTF.cpp:106
ConvertUTF_DISABLE_WARNINGS
#define ConvertUTF_DISABLE_WARNINGS
Definition: ConvertUTF.cpp:90
llvm::ConvertUTF16toUTF8
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:263
llvm::isLegalUTF8String
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:555
llvm::trailingBytesForUTF8
static const char trailingBytesForUTF8[256]
Definition: ConvertUTF.cpp:119
llvm::UTF32
unsigned int UTF32
Definition: ConvertUTF.h:128
UNI_MAX_UTF16
#define UNI_MAX_UTF16
Definition: ConvertUTF.h:136
UNI_REPLACEMENT_CHAR
#define UNI_REPLACEMENT_CHAR
Definition: ConvertUTF.h:134
llvm::halfShift
static const int halfShift
Definition: ConvertUTF.cpp:100
llvm::UTF16
unsigned short UTF16
Definition: ConvertUTF.h:129
llvm::UTF8
unsigned char UTF8
Definition: ConvertUTF.h:130
llvm::ConvertUTF16toUTF32
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:209
UNI_MAX_LEGAL_UTF32
#define UNI_MAX_LEGAL_UTF32
Definition: ConvertUTF.h:138