Line data Source code
1 : /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2 : *
3 : * The LLVM Compiler Infrastructure
4 : *
5 : * This file is distributed under the University of Illinois Open Source
6 : * License. See LICENSE.TXT for details.
7 : *
8 : *===------------------------------------------------------------------------=*/
9 : /*
10 : * Copyright 2001-2004 Unicode, Inc.
11 : *
12 : * Disclaimer
13 : *
14 : * This source code is provided as is by Unicode, Inc. No claims are
15 : * made as to fitness for any particular purpose. No warranties of any
16 : * kind are expressed or implied. The recipient agrees to determine
17 : * applicability of information provided. If this file has been
18 : * purchased on magnetic or optical media from Unicode, Inc., the
19 : * sole remedy for any claim will be exchange of defective media
20 : * within 90 days of receipt.
21 : *
22 : * Limitations on Rights to Redistribute This Code
23 : *
24 : * Unicode, Inc. hereby grants the right to freely use the information
25 : * supplied in this file in the creation of products supporting the
26 : * Unicode Standard, and to make copies of this file in any form
27 : * for internal or external distribution as long as this notice
28 : * remains attached.
29 : */
30 :
31 : /* ---------------------------------------------------------------------
32 :
33 : Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34 : Author: Mark E. Davis, 1994.
35 : Rev History: Rick McGowan, fixes & updates May 2001.
36 : Sept 2001: fixed const & error conditions per
37 : mods suggested by S. Parent & A. Lillich.
38 : June 2002: Tim Dodd added detection and handling of incomplete
39 : source sequences, enhanced error detection, added casts
40 : to eliminate compiler warnings.
41 : July 2003: slight mods to back out aggressive FFFE detection.
42 : Jan 2004: updated switches in from-UTF8 conversions.
43 : Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44 :
45 : See the header file "ConvertUTF.h" for complete documentation.
46 :
47 : ------------------------------------------------------------------------ */
48 :
49 : #include "llvm/Support/ConvertUTF.h"
50 : #ifdef CVTUTF_DEBUG
51 : #include <stdio.h>
52 : #endif
53 : #include <assert.h>
54 :
/*
 * The conversion routines below rely on switch statements whose cases
 * intentionally fall through.  ConvertUTF_DISABLE_WARNINGS silences the
 * implicit-fallthrough diagnostic on compilers that have it and
 * ConvertUTF_RESTORE_WARNINGS restores the previous diagnostic state.
 * Both macros expand to nothing when no suitable pragma is available.
 */
#if defined(__clang__) && defined(__has_warning)
#  if __has_warning("-Wimplicit-fallthrough")
#    define ConvertUTF_DISABLE_WARNINGS \
       _Pragma("clang diagnostic push") \
       _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#    define ConvertUTF_RESTORE_WARNINGS \
       _Pragma("clang diagnostic pop")
#  endif
#elif defined(__GNUC__) && __GNUC__ >= 7
#  define ConvertUTF_DISABLE_WARNINGS \
     _Pragma("GCC diagnostic push") \
     _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
     _Pragma("GCC diagnostic pop")
#endif

/* Fallbacks: make sure both macros exist even when nothing was selected. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
#  define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
#  define ConvertUTF_RESTORE_WARNINGS
#endif
80 :
81 : ConvertUTF_DISABLE_WARNINGS
82 :
83 : namespace llvm {
84 :
85 : static const int halfShift = 10; /* used for shifting by 10 bits */
86 :
87 : static const UTF32 halfBase = 0x0010000UL;
88 : static const UTF32 halfMask = 0x3FFUL;
89 :
90 : #define UNI_SUR_HIGH_START (UTF32)0xD800
91 : #define UNI_SUR_HIGH_END (UTF32)0xDBFF
92 : #define UNI_SUR_LOW_START (UTF32)0xDC00
93 : #define UNI_SUR_LOW_END (UTF32)0xDFFF
94 :
95 : /* --------------------------------------------------------------------- */
96 :
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
 * the number of trailing bytes that are supposed to follow that byte.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes.  Those entries
 * are left in place for anyone who wants to convert such sequences, which
 * earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0xBF: no trailing bytes */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: one trailing byte */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF: two trailing bytes */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
114 :
115 : /*
116 : * Magic values subtracted from a buffer value during UTF8 conversion.
117 : * This table contains as many values as there might be trailing bytes
118 : * in a UTF-8 sequence.
119 : */
120 : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
121 : 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
122 :
123 : /*
124 : * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
125 : * into the first byte, depending on how many bytes follow. There are
126 : * as many entries in this table as there are UTF-8 sequence types.
127 : * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
128 : * for *legal* UTF-8 will be 4 or fewer bytes total.
129 : */
130 : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
131 :
132 : /* --------------------------------------------------------------------- */
133 :
134 : /* The interface converts a whole buffer to avoid function-call overhead.
135 : * Constants have been gathered. Loops & conditionals have been removed as
136 : * much as possible for efficiency, in favor of drop-through switches.
137 : * (See "Note A" at the bottom of the file for equivalent code.)
138 : * If your compiler supports it, the "isLegalUTF8" call can be turned
139 : * into an inline function.
140 : */
141 :
142 :
143 : /* --------------------------------------------------------------------- */
144 :
145 0 : ConversionResult ConvertUTF32toUTF16 (
146 : const UTF32** sourceStart, const UTF32* sourceEnd,
147 : UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
148 : ConversionResult result = conversionOK;
149 0 : const UTF32* source = *sourceStart;
150 0 : UTF16* target = *targetStart;
151 0 : while (source < sourceEnd) {
152 : UTF32 ch;
153 0 : if (target >= targetEnd) {
154 : result = targetExhausted; break;
155 : }
156 0 : ch = *source++;
157 0 : if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
158 : /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
159 0 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
160 0 : if (flags == strictConversion) {
161 : --source; /* return to the illegal value itself */
162 : result = sourceIllegal;
163 : break;
164 : } else {
165 0 : *target++ = UNI_REPLACEMENT_CHAR;
166 : }
167 : } else {
168 0 : *target++ = (UTF16)ch; /* normal case */
169 : }
170 0 : } else if (ch > UNI_MAX_LEGAL_UTF32) {
171 0 : if (flags == strictConversion) {
172 : result = sourceIllegal;
173 : } else {
174 0 : *target++ = UNI_REPLACEMENT_CHAR;
175 : }
176 : } else {
177 : /* target is a character in range 0xFFFF - 0x10FFFF. */
178 0 : if (target + 1 >= targetEnd) {
179 : --source; /* Back up source pointer! */
180 : result = targetExhausted; break;
181 : }
182 0 : ch -= halfBase;
183 0 : *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
184 0 : *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
185 : }
186 : }
187 0 : *sourceStart = source;
188 0 : *targetStart = target;
189 0 : return result;
190 : }
191 :
192 : /* --------------------------------------------------------------------- */
193 :
194 0 : ConversionResult ConvertUTF16toUTF32 (
195 : const UTF16** sourceStart, const UTF16* sourceEnd,
196 : UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
197 : ConversionResult result = conversionOK;
198 0 : const UTF16* source = *sourceStart;
199 0 : UTF32* target = *targetStart;
200 : UTF32 ch, ch2;
201 0 : while (source < sourceEnd) {
202 : const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
203 0 : ch = *source++;
204 : /* If we have a surrogate pair, convert to UTF32 first. */
205 0 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
206 : /* If the 16 bits following the high surrogate are in the source buffer... */
207 0 : if (source < sourceEnd) {
208 0 : ch2 = *source;
209 : /* If it's a low surrogate, convert to UTF32. */
210 0 : if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
211 0 : ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
212 0 : + (ch2 - UNI_SUR_LOW_START) + halfBase;
213 0 : ++source;
214 0 : } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
215 : --source; /* return to the illegal value itself */
216 : result = sourceIllegal;
217 : break;
218 : }
219 : } else { /* We don't have the 16 bits following the high surrogate. */
220 : --source; /* return to the high surrogate */
221 : result = sourceExhausted;
222 : break;
223 : }
224 0 : } else if (flags == strictConversion) {
225 : /* UTF-16 surrogate values are illegal in UTF-32 */
226 0 : if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
227 : --source; /* return to the illegal value itself */
228 : result = sourceIllegal;
229 : break;
230 : }
231 : }
232 0 : if (target >= targetEnd) {
233 : source = oldSource; /* Back up source pointer! */
234 : result = targetExhausted; break;
235 : }
236 0 : *target++ = ch;
237 : }
238 0 : *sourceStart = source;
239 0 : *targetStart = target;
240 : #ifdef CVTUTF_DEBUG
241 : if (result == sourceIllegal) {
242 : fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
243 : fflush(stderr);
244 : }
245 : #endif
246 0 : return result;
247 : }
248 126 : ConversionResult ConvertUTF16toUTF8 (
249 : const UTF16** sourceStart, const UTF16* sourceEnd,
250 : UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
251 : ConversionResult result = conversionOK;
252 126 : const UTF16* source = *sourceStart;
253 126 : UTF8* target = *targetStart;
254 1419 : while (source < sourceEnd) {
255 : UTF32 ch;
256 : unsigned short bytesToWrite = 0;
257 : const UTF32 byteMask = 0xBF;
258 : const UTF32 byteMark = 0x80;
259 : const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
260 1293 : ch = *source++;
261 : /* If we have a surrogate pair, convert to UTF32 first. */
262 1293 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
263 : /* If the 16 bits following the high surrogate are in the source buffer... */
264 0 : if (source < sourceEnd) {
265 0 : UTF32 ch2 = *source;
266 : /* If it's a low surrogate, convert to UTF32. */
267 0 : if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
268 0 : ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
269 0 : + (ch2 - UNI_SUR_LOW_START) + halfBase;
270 0 : ++source;
271 0 : } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
272 : --source; /* return to the illegal value itself */
273 : result = sourceIllegal;
274 : break;
275 : }
276 : } else { /* We don't have the 16 bits following the high surrogate. */
277 : --source; /* return to the high surrogate */
278 : result = sourceExhausted;
279 : break;
280 : }
281 1293 : } else if (flags == strictConversion) {
282 : /* UTF-16 surrogate values are illegal in UTF-32 */
283 1293 : if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
284 : --source; /* return to the illegal value itself */
285 : result = sourceIllegal;
286 : break;
287 : }
288 : }
289 : /* Figure out how many bytes the result will require */
290 1293 : if (ch < (UTF32)0x80) { bytesToWrite = 1;
291 10 : } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
292 10 : } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
293 : } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
294 : } else { bytesToWrite = 3;
295 : ch = UNI_REPLACEMENT_CHAR;
296 : }
297 :
298 1293 : target += bytesToWrite;
299 1293 : if (target > targetEnd) {
300 : source = oldSource; /* Back up source pointer! */
301 0 : target -= bytesToWrite; result = targetExhausted; break;
302 : }
303 1293 : switch (bytesToWrite) { /* note: everything falls through. */
304 0 : case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
305 10 : case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
306 10 : case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
307 1293 : case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
308 : }
309 1293 : target += bytesToWrite;
310 : }
311 126 : *sourceStart = source;
312 126 : *targetStart = target;
313 126 : return result;
314 : }
315 :
316 : /* --------------------------------------------------------------------- */
317 :
318 381 : ConversionResult ConvertUTF32toUTF8 (
319 : const UTF32** sourceStart, const UTF32* sourceEnd,
320 : UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
321 : ConversionResult result = conversionOK;
322 381 : const UTF32* source = *sourceStart;
323 381 : UTF8* target = *targetStart;
324 958 : while (source < sourceEnd) {
325 : UTF32 ch;
326 : unsigned short bytesToWrite = 0;
327 : const UTF32 byteMask = 0xBF;
328 : const UTF32 byteMark = 0x80;
329 577 : ch = *source++;
330 577 : if (flags == strictConversion ) {
331 : /* UTF-16 surrogate values are illegal in UTF-32 */
332 577 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
333 : --source; /* return to the illegal value itself */
334 : result = sourceIllegal;
335 : break;
336 : }
337 : }
338 : /*
339 : * Figure out how many bytes the result will require. Turn any
340 : * illegally large UTF32 things (> Plane 17) into replacement chars.
341 : */
342 577 : if (ch < (UTF32)0x80) { bytesToWrite = 1;
343 360 : } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
344 133 : } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
345 14 : } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
346 : } else { bytesToWrite = 3;
347 : ch = UNI_REPLACEMENT_CHAR;
348 : result = sourceIllegal;
349 : }
350 :
351 577 : target += bytesToWrite;
352 577 : if (target > targetEnd) {
353 : --source; /* Back up source pointer! */
354 0 : target -= bytesToWrite; result = targetExhausted; break;
355 : }
356 577 : switch (bytesToWrite) { /* note: everything falls through. */
357 14 : case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358 133 : case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
359 360 : case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
360 577 : case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
361 : }
362 577 : target += bytesToWrite;
363 : }
364 381 : *sourceStart = source;
365 381 : *targetStart = target;
366 381 : return result;
367 : }
368 :
369 : /* --------------------------------------------------------------------- */
370 :
371 : /*
372 : * Utility routine to tell whether a sequence of bytes is legal UTF-8.
373 : * This must be called with the length pre-determined by the first byte.
374 : * If not calling this from ConvertUTF8to*, then the length can be set by:
375 : * length = trailingBytesForUTF8[*source]+1;
376 : * and the sequence is illegal right away if there aren't that many bytes
377 : * available.
378 : * If presented with a length > 4, this returns false. The Unicode
379 : * definition of UTF-8 goes up to 4-byte sequences.
380 : */
381 :
382 173283849 : static Boolean isLegalUTF8(const UTF8 *source, int length) {
383 : UTF8 a;
384 173283849 : const UTF8 *srcptr = source+length;
385 173283849 : switch (length) {
386 : default: return false;
387 : /* Everything else falls through when "true"... */
388 396 : case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
389 4495 : case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
390 9360 : case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
391 :
392 9252 : switch (*source) {
393 : /* no fall-through in this inner switch */
394 291 : case 0xE0: if (a < 0xA0) return false; break;
395 30 : case 0xED: if (a > 0x9F) return false; break;
396 342 : case 0xF0: if (a < 0x90) return false; break;
397 5 : case 0xF4: if (a > 0x8F) return false; break;
398 : default: if (a < 0x80) return false;
399 : }
400 :
401 173283493 : case 1: if (*source >= 0x80 && *source < 0xC2) return false;
402 : }
403 173283069 : if (*source > 0xF4) return false;
404 : return true;
405 : }
406 :
407 : /* --------------------------------------------------------------------- */
408 :
409 : /*
410 : * Exported function to return whether a UTF-8 sequence is legal or not.
411 : * This is not used here; it's just exported.
412 : */
413 4078495 : Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
414 4078495 : int length = trailingBytesForUTF8[*source]+1;
415 4078495 : if (length > sourceEnd - source) {
416 : return false;
417 : }
418 4078483 : return isLegalUTF8(source, length);
419 : }
420 :
421 : /* --------------------------------------------------------------------- */
422 :
423 : static unsigned
424 715 : findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
425 : const UTF8 *sourceEnd) {
426 : UTF8 b1, b2, b3;
427 :
428 : assert(!isLegalUTF8Sequence(source, sourceEnd));
429 :
430 : /*
431 : * Unicode 6.3.0, D93b:
432 : *
433 : * Maximal subpart of an ill-formed subsequence: The longest code unit
434 : * subsequence starting at an unconvertible offset that is either:
435 : * a. the initial subsequence of a well-formed code unit sequence, or
436 : * b. a subsequence of length one.
437 : */
438 :
439 715 : if (source == sourceEnd)
440 : return 0;
441 :
442 : /*
443 : * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
444 : * Byte Sequences.
445 : */
446 :
447 715 : b1 = *source;
448 715 : ++source;
449 715 : if (b1 >= 0xC2 && b1 <= 0xDF) {
450 : /*
451 : * First byte is valid, but we know that this code unit sequence is
452 : * invalid, so the maximal subpart has to end after the first byte.
453 : */
454 : return 1;
455 : }
456 :
457 650 : if (source == sourceEnd)
458 : return 1;
459 :
460 496 : b2 = *source;
461 496 : ++source;
462 :
463 496 : if (b1 == 0xE0) {
464 20 : return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
465 : }
466 485 : if (b1 >= 0xE1 && b1 <= 0xEC) {
467 50 : return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
468 : }
469 459 : if (b1 == 0xED) {
470 72 : return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
471 : }
472 422 : if (b1 >= 0xEE && b1 <= 0xEF) {
473 9 : return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
474 : }
475 416 : if (b1 == 0xF0) {
476 16 : if (b2 >= 0x90 && b2 <= 0xBF) {
477 4 : if (source == sourceEnd)
478 : return 2;
479 :
480 2 : b3 = *source;
481 2 : return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
482 : }
483 : return 1;
484 : }
485 400 : if (b1 >= 0xF1 && b1 <= 0xF3) {
486 10 : if (b2 >= 0x80 && b2 <= 0xBF) {
487 4 : if (source == sourceEnd)
488 : return 2;
489 :
490 2 : b3 = *source;
491 2 : return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
492 : }
493 : return 1;
494 : }
495 390 : if (b1 == 0xF4) {
496 12 : if (b2 >= 0x80 && b2 <= 0x8F) {
497 4 : if (source == sourceEnd)
498 : return 2;
499 :
500 2 : b3 = *source;
501 2 : return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
502 : }
503 : return 1;
504 : }
505 :
506 : assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
507 : /*
508 : * There are no valid sequences that start with these bytes. Maximal subpart
509 : * is defined to have length 1 in these cases.
510 : */
511 : return 1;
512 : }
513 :
514 : /* --------------------------------------------------------------------- */
515 :
516 : /*
517 : * Exported function to return the total number of bytes in a codepoint
518 : * represented in UTF-8, given the value of the first byte.
519 : */
520 23744622 : unsigned getNumBytesForUTF8(UTF8 first) {
521 23744622 : return trailingBytesForUTF8[first] + 1;
522 : }
523 :
524 : /* --------------------------------------------------------------------- */
525 :
526 : /*
527 : * Exported function to return whether a UTF-8 string is legal or not.
528 : * This is not used here; it's just exported.
529 : */
530 14737432 : Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
531 159956274 : while (*source != sourceEnd) {
532 145218905 : int length = trailingBytesForUTF8[**source] + 1;
533 145218905 : if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
534 : return false;
535 145218842 : *source += length;
536 : }
537 : return true;
538 : }
539 :
540 : /* --------------------------------------------------------------------- */
541 :
542 5309 : ConversionResult ConvertUTF8toUTF16 (
543 : const UTF8** sourceStart, const UTF8* sourceEnd,
544 : UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
545 : ConversionResult result = conversionOK;
546 5309 : const UTF8* source = *sourceStart;
547 5309 : UTF16* target = *targetStart;
548 38975 : while (source < sourceEnd) {
549 : UTF32 ch = 0;
550 33688 : unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
551 33688 : if (extraBytesToRead >= sourceEnd - source) {
552 : result = sourceExhausted; break;
553 : }
554 : /* Do this check whether lenient or strict */
555 33680 : if (!isLegalUTF8(source, extraBytesToRead+1)) {
556 : result = sourceIllegal;
557 : break;
558 : }
559 : /*
560 : * The cases all fall through. See "Note A" below.
561 : */
562 33666 : switch (extraBytesToRead) {
563 0 : case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
564 0 : case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
565 3 : case 3: ch += *source++; ch <<= 6;
566 101 : case 2: ch += *source++; ch <<= 6;
567 149 : case 1: ch += *source++; ch <<= 6;
568 33666 : case 0: ch += *source++;
569 : }
570 33666 : ch -= offsetsFromUTF8[extraBytesToRead];
571 :
572 33666 : if (target >= targetEnd) {
573 0 : source -= (extraBytesToRead+1); /* Back up source pointer! */
574 0 : result = targetExhausted; break;
575 : }
576 33666 : if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
577 : /* UTF-16 surrogate values are illegal in UTF-32 */
578 33663 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
579 0 : if (flags == strictConversion) {
580 0 : source -= (extraBytesToRead+1); /* return to the illegal value itself */
581 : result = sourceIllegal;
582 0 : break;
583 : } else {
584 0 : *target++ = UNI_REPLACEMENT_CHAR;
585 : }
586 : } else {
587 33663 : *target++ = (UTF16)ch; /* normal case */
588 : }
589 3 : } else if (ch > UNI_MAX_UTF16) {
590 0 : if (flags == strictConversion) {
591 : result = sourceIllegal;
592 0 : source -= (extraBytesToRead+1); /* return to the start */
593 0 : break; /* Bail out; shouldn't continue */
594 : } else {
595 0 : *target++ = UNI_REPLACEMENT_CHAR;
596 : }
597 : } else {
598 : /* target is a character in range 0xFFFF - 0x10FFFF. */
599 3 : if (target + 1 >= targetEnd) {
600 0 : source -= (extraBytesToRead+1); /* Back up source pointer! */
601 0 : result = targetExhausted; break;
602 : }
603 3 : ch -= halfBase;
604 3 : *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
605 3 : *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
606 : }
607 : }
608 5309 : *sourceStart = source;
609 5309 : *targetStart = target;
610 5309 : return result;
611 : }
612 :
613 : /* --------------------------------------------------------------------- */
614 :
615 23889036 : static ConversionResult ConvertUTF8toUTF32Impl(
616 : const UTF8** sourceStart, const UTF8* sourceEnd,
617 : UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
618 : Boolean InputIsPartial) {
619 : ConversionResult result = conversionOK;
620 23889036 : const UTF8* source = *sourceStart;
621 23889036 : UTF32* target = *targetStart;
622 47841876 : while (source < sourceEnd) {
623 : UTF32 ch = 0;
624 23952999 : unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
625 23952999 : if (extraBytesToRead >= sourceEnd - source) {
626 179 : if (flags == strictConversion || InputIsPartial) {
627 : result = sourceExhausted;
628 : break;
629 : } else {
630 : result = sourceIllegal;
631 :
632 : /*
633 : * Replace the maximal subpart of ill-formed sequence with
634 : * replacement character.
635 : */
636 140 : source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
637 : sourceEnd);
638 140 : *target++ = UNI_REPLACEMENT_CHAR;
639 140 : continue;
640 : }
641 : }
642 23952820 : if (target >= targetEnd) {
643 : result = targetExhausted; break;
644 : }
645 :
646 : /* Do this check whether lenient or strict */
647 23952802 : if (!isLegalUTF8(source, extraBytesToRead+1)) {
648 : result = sourceIllegal;
649 677 : if (flags == strictConversion) {
650 : /* Abort conversion. */
651 : break;
652 : } else {
653 : /*
654 : * Replace the maximal subpart of ill-formed sequence with
655 : * replacement character.
656 : */
657 575 : source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
658 : sourceEnd);
659 575 : *target++ = UNI_REPLACEMENT_CHAR;
660 575 : continue;
661 : }
662 : }
663 : /*
664 : * The cases all fall through. See "Note A" below.
665 : */
666 23952125 : switch (extraBytesToRead) {
667 0 : case 5: ch += *source++; ch <<= 6;
668 0 : case 4: ch += *source++; ch <<= 6;
669 297 : case 3: ch += *source++; ch <<= 6;
670 2947 : case 2: ch += *source++; ch <<= 6;
671 6423 : case 1: ch += *source++; ch <<= 6;
672 23952125 : case 0: ch += *source++;
673 : }
674 23952125 : ch -= offsetsFromUTF8[extraBytesToRead];
675 :
676 23952125 : if (ch <= UNI_MAX_LEGAL_UTF32) {
677 : /*
678 : * UTF-16 surrogate values are illegal in UTF-32, and anything
679 : * over Plane 17 (> 0x10FFFF) is illegal.
680 : */
681 23952125 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
682 0 : if (flags == strictConversion) {
683 0 : source -= (extraBytesToRead+1); /* return to the illegal value itself */
684 : result = sourceIllegal;
685 0 : break;
686 : } else {
687 0 : *target++ = UNI_REPLACEMENT_CHAR;
688 : }
689 : } else {
690 23952125 : *target++ = ch;
691 : }
692 : } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
693 : result = sourceIllegal;
694 0 : *target++ = UNI_REPLACEMENT_CHAR;
695 : }
696 : }
697 23889036 : *sourceStart = source;
698 23889036 : *targetStart = target;
699 23889036 : return result;
700 : }
701 :
702 18 : ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
703 : const UTF8 *sourceEnd,
704 : UTF32 **targetStart,
705 : UTF32 *targetEnd,
706 : ConversionFlags flags) {
707 18 : return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
708 18 : flags, /*InputIsPartial=*/true);
709 : }
710 :
711 23889018 : ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
712 : const UTF8 *sourceEnd, UTF32 **targetStart,
713 : UTF32 *targetEnd, ConversionFlags flags) {
714 23889018 : return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
715 23889018 : flags, /*InputIsPartial=*/false);
716 : }
717 :
718 : /* ---------------------------------------------------------------------
719 :
720 : Note A.
721 : The fall-through switches in UTF-8 reading code save a
722 : temp variable, some decrements & conditionals. The switches
723 : are equivalent to the following loop:
724 : {
725 : int tmpBytesToRead = extraBytesToRead+1;
726 : do {
727 : ch += *source++;
728 : --tmpBytesToRead;
729 : if (tmpBytesToRead) ch <<= 6;
730 : } while (tmpBytesToRead > 0);
731 : }
732 : In UTF-8 writing code, the switches on "bytesToWrite" are
733 : similarly unrolled loops.
734 :
735 : --------------------------------------------------------------------- */
736 :
737 : } // namespace llvm
738 :
739 : ConvertUTF_RESTORE_WARNINGS
|