/*
 * Boundaries of the two surrogate ranges, each expressed as a UTF32 value:
 *   high surrogates: 0xD800 .. 0xDBFF (inclusive)
 *   low surrogates:  0xDC00 .. 0xDFFF (inclusive)
 * Each macro expands to a bare cast expression with no enclosing parentheses.
 */
61 #define UNI_SUR_HIGH_START (UTF32)0xD800
62 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
63 #define UNI_SUR_LOW_START (UTF32)0xDC00
64 #define UNI_SUR_LOW_END (UTF32)0xDFFF
78 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
79 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
80 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
81 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
82 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
83 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
84 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
85 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
94 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
119 const UTF32** sourceStart,
const UTF32* sourceEnd,
122 const UTF32* source = *sourceStart;
123 UTF16* target = *targetStart;
124 while (source < sourceEnd) {
126 if (target >= targetEnd) {
141 *target++ = (
UTF16)ch;
151 if (target + 1 >= targetEnd) {
160 *sourceStart = source;
161 *targetStart = target;
168 const UTF16** sourceStart,
const UTF16* sourceEnd,
171 const UTF16* source = *sourceStart;
172 UTF32* target = *targetStart;
174 while (source < sourceEnd) {
175 const UTF16* oldSource = source;
180 if (source < sourceEnd) {
205 if (target >= targetEnd) {
211 *sourceStart = source;
212 *targetStart = target;
215 fprintf(stderr,
"ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
222 const UTF16** sourceStart,
const UTF16* sourceEnd,
225 const UTF16* source = *sourceStart;
226 UTF8* target = *targetStart;
227 while (source < sourceEnd) {
229 unsigned short bytesToWrite = 0;
230 const UTF32 byteMask = 0xBF;
231 const UTF32 byteMark = 0x80;
232 const UTF16* oldSource = source;
237 if (source < sourceEnd) {
263 if (ch < (
UTF32)0x80) { bytesToWrite = 1;
264 }
else if (ch < (
UTF32)0x800) { bytesToWrite = 2;
265 }
else if (ch < (
UTF32)0x10000) { bytesToWrite = 3;
266 }
else if (ch < (
UTF32)0x110000) { bytesToWrite = 4;
267 }
else { bytesToWrite = 3;
271 target += bytesToWrite;
272 if (target > targetEnd) {
276 switch (bytesToWrite) {
277 case 4: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
278 case 3: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
279 case 2: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
282 target += bytesToWrite;
284 *sourceStart = source;
285 *targetStart = target;
292 const UTF32** sourceStart,
const UTF32* sourceEnd,
295 const UTF32* source = *sourceStart;
296 UTF8* target = *targetStart;
297 while (source < sourceEnd) {
299 unsigned short bytesToWrite = 0;
300 const UTF32 byteMask = 0xBF;
301 const UTF32 byteMark = 0x80;
315 if (ch < (
UTF32)0x80) { bytesToWrite = 1;
316 }
else if (ch < (
UTF32)0x800) { bytesToWrite = 2;
317 }
else if (ch < (
UTF32)0x10000) { bytesToWrite = 3;
319 }
else { bytesToWrite = 3;
324 target += bytesToWrite;
325 if (target > targetEnd) {
329 switch (bytesToWrite) {
330 case 4: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
331 case 3: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
332 case 2: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
335 target += bytesToWrite;
337 *sourceStart = source;
338 *targetStart = target;
357 const UTF8 *srcptr = source+length;
359 default:
return false;
361 case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
362 case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
363 case 2:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
367 case 0xE0:
if (a < 0xA0)
return false;
break;
368 case 0xED:
if (a > 0x9F)
return false;
break;
369 case 0xF0:
if (a < 0x90)
return false;
break;
370 case 0xF4:
if (a > 0x8F)
return false;
break;
371 default:
if (a < 0x80)
return false;
374 case 1:
if (*source >= 0x80 && *source < 0xC2)
return false;
376 if (*source > 0xF4)
return false;
388 if (length > sourceEnd - source) {
398 const UTF8 *sourceEnd) {
412 if (source == sourceEnd)
422 if (b1 >= 0xC2 && b1 <= 0xDF) {
430 if (source == sourceEnd)
437 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
439 if (b1 >= 0xE1 && b1 <= 0xEC) {
440 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
443 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
445 if (b1 >= 0xEE && b1 <= 0xEF) {
446 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
449 if (b2 >= 0x90 && b2 <= 0xBF) {
450 if (source == sourceEnd)
454 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
458 if (b1 >= 0xF1 && b1 <= 0xF3) {
459 if (b2 >= 0x80 && b2 <= 0xBF) {
460 if (source == sourceEnd)
464 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
469 if (b2 >= 0x80 && b2 <= 0x8F) {
470 if (source == sourceEnd)
474 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
479 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
504 while (*source != sourceEnd) {
506 if (length > sourceEnd - *source || !
isLegalUTF8(*source, length))
516 const UTF8** sourceStart,
const UTF8* sourceEnd,
519 const UTF8* source = *sourceStart;
520 UTF16* target = *targetStart;
521 while (source < sourceEnd) {
524 if (extraBytesToRead >= sourceEnd - source) {
535 switch (extraBytesToRead) {
536 case 5: ch += *source++; ch <<= 6;
537 case 4: ch += *source++; ch <<= 6;
538 case 3: ch += *source++; ch <<= 6;
539 case 2: ch += *source++; ch <<= 6;
540 case 1: ch += *source++; ch <<= 6;
541 case 0: ch += *source++;
545 if (target >= targetEnd) {
546 source -= (extraBytesToRead+1);
553 source -= (extraBytesToRead+1);
560 *target++ = (
UTF16)ch;
565 source -= (extraBytesToRead+1);
572 if (target + 1 >= targetEnd) {
573 source -= (extraBytesToRead+1);
581 *sourceStart = source;
582 *targetStart = target;
589 const UTF8** sourceStart,
const UTF8* sourceEnd,
593 const UTF8* source = *sourceStart;
594 UTF32* target = *targetStart;
595 while (source < sourceEnd) {
598 if (extraBytesToRead >= sourceEnd - source) {
615 if (target >= targetEnd) {
639 switch (extraBytesToRead) {
640 case 5: ch += *source++; ch <<= 6;
641 case 4: ch += *source++; ch <<= 6;
642 case 3: ch += *source++; ch <<= 6;
643 case 2: ch += *source++; ch <<= 6;
644 case 1: ch += *source++; ch <<= 6;
645 case 0: ch += *source++;
656 source -= (extraBytesToRead+1);
670 *sourceStart = source;
671 *targetStart = target;
676 const UTF8 *sourceEnd,
685 const UTF8 *sourceEnd,
UTF32 **targetStart,
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
static const UTF8 firstByteMark[7]
#define UNI_MAX_LEGAL_UTF32
#define UNI_REPLACEMENT_CHAR
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)
#define UNI_SUR_LOW_START
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
unsigned getNumBytesForUTF8(UTF8 first)
#define UNI_SUR_HIGH_START
static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
static const int halfShift
static const UTF32 offsetsFromUTF8[6]
static const char trailingBytesForUTF8[256]
static const UTF32 halfMask
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
static const UTF32 halfBase
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
static Boolean isLegalUTF8(const UTF8 *source, int length)