LCOV - llvm-toolchain.info - lib/Support/ConvertUTF.cpp

LCOV - code coverage report

Current view:	top level - lib/Support - ConvertUTF.cpp (source / functions)		Hit	Total	Coverage
Test:	llvm-toolchain.info	Lines:	157	229	68.6 %
Date:	2018-10-20 13:21:21	Functions:	11	13	84.6 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
       2             :  *
       3             :  *                     The LLVM Compiler Infrastructure
       4             :  *
       5             :  * This file is distributed under the University of Illinois Open Source
       6             :  * License. See LICENSE.TXT for details.
       7             :  *
       8             :  *===------------------------------------------------------------------------=*/
       9             : /*
      10             :  * Copyright 2001-2004 Unicode, Inc.
      11             :  *
      12             :  * Disclaimer
      13             :  *
      14             :  * This source code is provided as is by Unicode, Inc. No claims are
      15             :  * made as to fitness for any particular purpose. No warranties of any
      16             :  * kind are expressed or implied. The recipient agrees to determine
      17             :  * applicability of information provided. If this file has been
      18             :  * purchased on magnetic or optical media from Unicode, Inc., the
      19             :  * sole remedy for any claim will be exchange of defective media
      20             :  * within 90 days of receipt.
      21             :  *
      22             :  * Limitations on Rights to Redistribute This Code
      23             :  *
      24             :  * Unicode, Inc. hereby grants the right to freely use the information
      25             :  * supplied in this file in the creation of products supporting the
      26             :  * Unicode Standard, and to make copies of this file in any form
      27             :  * for internal or external distribution as long as this notice
      28             :  * remains attached.
      29             :  */
      30             : 
      31             : /* ---------------------------------------------------------------------
      32             : 
      33             :     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
      34             :     Author: Mark E. Davis, 1994.
      35             :     Rev History: Rick McGowan, fixes & updates May 2001.
      36             :     Sept 2001: fixed const & error conditions per
      37             :         mods suggested by S. Parent & A. Lillich.
      38             :     June 2002: Tim Dodd added detection and handling of incomplete
      39             :         source sequences, enhanced error detection, added casts
      40             :         to eliminate compiler warnings.
      41             :     July 2003: slight mods to back out aggressive FFFE detection.
      42             :     Jan 2004: updated switches in from-UTF8 conversions.
      43             :     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
      44             : 
      45             :     See the header file "ConvertUTF.h" for complete documentation.
      46             : 
      47             : ------------------------------------------------------------------------ */
      48             : 
      49             : #include "llvm/Support/ConvertUTF.h"
      50             : #ifdef CVTUTF_DEBUG
      51             : #include <stdio.h>
      52             : #endif
      53             : #include <assert.h>
      54             : 
      55             : /*
      56             :  * This code extensively uses fall-through switches.
      57             :  * Keep the compiler from warning about that.
      58             :  */
      59             : #if defined(__clang__) && defined(__has_warning) /* Clang, and it can be asked whether it knows a warning flag */
      60             : # if __has_warning("-Wimplicit-fallthrough") /* only define the pragmas when the flag is recognised */
      61             : #  define ConvertUTF_DISABLE_WARNINGS \
      62             :     _Pragma("clang diagnostic push")  \
      63             :     _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
      64             : #  define ConvertUTF_RESTORE_WARNINGS \
      65             :     _Pragma("clang diagnostic pop")
      66             : # endif /* __has_warning("-Wimplicit-fallthrough") */
      67             : #elif defined(__GNUC__) && __GNUC__ > 6 /* non-Clang compilers reporting __GNUC__ of 7 or higher */
      68             : # define ConvertUTF_DISABLE_WARNINGS \
      69             :    _Pragma("GCC diagnostic push")    \
      70             :    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
      71             : # define ConvertUTF_RESTORE_WARNINGS \
      72             :    _Pragma("GCC diagnostic pop")
      73             : #endif /* compiler selection */
      74             : #ifndef ConvertUTF_DISABLE_WARNINGS /* nothing matched above: make the macro expand to nothing */
      75             : # define ConvertUTF_DISABLE_WARNINGS
      76             : #endif /* !ConvertUTF_DISABLE_WARNINGS */
      77             : #ifndef ConvertUTF_RESTORE_WARNINGS /* same empty fallback for the restore macro */
      78             : # define ConvertUTF_RESTORE_WARNINGS
      79             : #endif /* !ConvertUTF_RESTORE_WARNINGS */
      80             : 
      81             : ConvertUTF_DISABLE_WARNINGS /* fall-through warnings are off from here on; presumably paired with ConvertUTF_RESTORE_WARNINGS later in the file - TODO confirm */
      82             : 
      83             : namespace llvm {
      84             : 
      85             : static const int halfShift  = 10; /* used for shifting by 10 bits */
      86             : 
      87             : static const UTF32 halfBase = 0x0010000UL; /* subtracted before splitting a code point into a surrogate pair, added back when joining one */
      88             : static const UTF32 halfMask = 0x3FFUL; /* selects the low 10 bits, i.e. the payload carried by the low surrogate */
      89             : 
      90             : #define UNI_SUR_HIGH_START  (UTF32)0xD800 /* first high (leading) surrogate value */
      91             : #define UNI_SUR_HIGH_END    (UTF32)0xDBFF /* last high (leading) surrogate value */
      92             : #define UNI_SUR_LOW_START   (UTF32)0xDC00 /* first low (trailing) surrogate value */
      93             : #define UNI_SUR_LOW_END     (UTF32)0xDFFF /* last low (trailing) surrogate value */
      94             : 
      95             : /* --------------------------------------------------------------------- */
      96             : 
      97             : /*
      98             :  * Index into the table below with the first byte of a UTF-8 sequence to
      99             :  * get the number of trailing bytes that are supposed to follow it.
     100             :  * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
     101             :  * left as-is for anyone who may want to do such conversion, which was
     102             :  * allowed in earlier algorithms.
     103             :  */
     104             : static const char trailingBytesForUTF8[256] = { /* one row per 32 possible first-byte values */
     105             :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
     106             :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
     107             :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
     108             :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
     109             :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F: entry is 0 although no valid sequence starts here */
     110             :     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF: same as the row above */
     111             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF: one trailing byte */
     112             :     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 /* 0xE0-0xEF: 2, 0xF0-0xF7: 3, 0xF8-0xFB: 4, 0xFC-0xFF: 5 */
     113             : };
     114             : 
     115             : /*
     116             :  * Magic values subtracted from a buffer value during UTF8 conversion.
     117             :  * This table contains as many values as there might be trailing bytes
     118             :  * in a UTF-8 sequence.
     119             :  */
     120             : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, /* entry [n] is for n trailing bytes: the lead-byte marker and each 0x80 continuation marker, shifted to their 6-bit positions and summed, e.g. [1] = (0xC0 << 6) + 0x80 */
     121             :                      0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* the last entry is that same sum truncated to 32 bits */
     122             : 
     123             : /*
     124             :  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
     125             :  * into the first byte, depending on how many bytes follow.  There are
     126             :  * as many entries in this table as there are UTF-8 sequence types.
     127             :  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
     128             :  * for *legal* UTF-8 will be 4 or fewer bytes total.
     129             :  */
     130             : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; /* indexed by the total number of bytes in the sequence; the to-UTF-8 encoders below index it with bytesToWrite (1..4) */
     131             : 
     132             : /* --------------------------------------------------------------------- */
     133             : 
     134             : /* The interface converts a whole buffer to avoid function-call overhead.
     135             :  * Constants have been gathered. Loops & conditionals have been removed as
     136             :  * much as possible for efficiency, in favor of drop-through switches.
     137             :  * (See "Note A" at the bottom of the file for equivalent code.)
     138             :  * If your compiler supports it, the "isLegalUTF8" call can be turned
     139             :  * into an inline function.
     140             :  */
     141             : 
     142             : 
     143             : /* --------------------------------------------------------------------- */
     144             : 
     145           0 : ConversionResult ConvertUTF32toUTF16 ( /* Re-encodes UTF-32 input from [*sourceStart, sourceEnd) as UTF-16 into [*targetStart, targetEnd). */
     146             :         const UTF32** sourceStart, const UTF32* sourceEnd, /* in/out: on return *sourceStart is the first input unit not consumed */
     147             :         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { /* in/out: on return *targetStart is one past the last unit written; flags is only compared against strictConversion */
     148             :     ConversionResult result = conversionOK; /* return value; changed only when a problem is recorded below */
     149           0 :     const UTF32* source = *sourceStart;
     150           0 :     UTF16* target = *targetStart;
     151           0 :     while (source < sourceEnd) {
     152             :         UTF32 ch;
     153           0 :         if (target >= targetEnd) { /* no room left for even a single UTF-16 unit */
     154             :             result = targetExhausted; break;
     155             :         }
     156           0 :         ch = *source++;
     157           0 :         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     158             :             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
     159           0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     160           0 :                 if (flags == strictConversion) {
     161             :                     --source; /* return to the illegal value itself */
     162             :                     result = sourceIllegal;
     163             :                     break;
     164             :                 } else {
     165           0 :                     *target++ = UNI_REPLACEMENT_CHAR; /* any other flags value: substitute and keep converting */
     166             :                 }
     167             :             } else {
     168           0 :                 *target++ = (UTF16)ch; /* normal case */
     169             :             }
     170           0 :         } else if (ch > UNI_MAX_LEGAL_UTF32) { /* value above the legal UTF-32 maximum */
     171           0 :             if (flags == strictConversion) {
     172             :                 result = sourceIllegal; /* NOTE(review): unlike the surrogate case above there is no break and source is not backed up, so the loop keeps going and a later targetExhausted can overwrite this result - confirm intended */
     173             :             } else {
     174           0 :                 *target++ = UNI_REPLACEMENT_CHAR; /* substitute and keep converting */
     175             :             }
     176             :         } else {
     177             :             /* target is a character in range 0xFFFF - 0x10FFFF. */
     178           0 :             if (target + 1 >= targetEnd) { /* a surrogate pair needs two output units */
     179             :                 --source; /* Back up source pointer! */
     180             :                 result = targetExhausted; break;
     181             :             }
     182           0 :             ch -= halfBase; /* what remains is split 10 bits / 10 bits between the two surrogates */
     183           0 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); /* upper bits go into the high surrogate */
     184           0 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); /* low 10 bits go into the low surrogate */
     185             :         }
     186             :     }
     187           0 :     *sourceStart = source; /* report how far the input was consumed ... */
     188           0 :     *targetStart = target; /* ... and how far the output was filled, on success and on failure alike */
     189           0 :     return result;
     190             : }
     191             : 
     192             : /* --------------------------------------------------------------------- */
     193             : 
     194           0 : ConversionResult ConvertUTF16toUTF32 ( /* Decodes UTF-16 input from [*sourceStart, sourceEnd) into UTF-32 values stored in [*targetStart, targetEnd). */
     195             :         const UTF16** sourceStart, const UTF16* sourceEnd, /* in/out: on return *sourceStart is the first input unit not consumed */
     196             :         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { /* in/out: on return *targetStart is one past the last value written; flags is only compared against strictConversion */
     197             :     ConversionResult result = conversionOK; /* return value; changed only when a problem is recorded below */
     198           0 :     const UTF16* source = *sourceStart;
     199           0 :     UTF32* target = *targetStart;
     200             :     UTF32 ch, ch2; /* NOTE(review): ch2 is assigned only when a high surrogate is followed by more input, yet the CVTUTF_DEBUG block below prints it for every sourceIllegal - confirm */
     201           0 :     while (source < sourceEnd) {
     202             :         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
     203           0 :         ch = *source++;
     204             :         /* If we have a surrogate pair, convert to UTF32 first. */
     205           0 :         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     206             :             /* If the 16 bits following the high surrogate are in the source buffer... */
     207           0 :             if (source < sourceEnd) {
     208           0 :                 ch2 = *source; /* peek only; consumed below if it completes the pair */
     209             :                 /* If it's a low surrogate, convert to UTF32. */
     210           0 :                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     211           0 :                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     212           0 :                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
     213           0 :                     ++source; /* the low surrogate has now been consumed as well */
     214           0 :                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     215             :                     --source; /* return to the illegal value itself */
     216             :                     result = sourceIllegal;
     217             :                     break;
     218             :                 } /* otherwise (non-strict): fall through and store the unpaired high surrogate value unchanged */
     219             :             } else { /* We don't have the 16 bits following the high surrogate. */
     220             :                 --source; /* return to the high surrogate */
     221             :                 result = sourceExhausted;
     222             :                 break;
     223             :             }
     224           0 :         } else if (flags == strictConversion) {
     225             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     226           0 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { /* a low surrogate with no high surrogate before it */
     227             :                 --source; /* return to the illegal value itself */
     228             :                 result = sourceIllegal;
     229             :                 break;
     230             :             }
     231             :         }
     232           0 :         if (target >= targetEnd) { /* checked only after decoding, hence the rewind to oldSource */
     233             :             source = oldSource; /* Back up source pointer! */
     234             :             result = targetExhausted; break;
     235             :         }
     236           0 :         *target++ = ch;
     237             :     }
     238           0 :     *sourceStart = source; /* report how far the input was consumed ... */
     239           0 :     *targetStart = target; /* ... and how far the output was filled, on success and on failure alike */
     240             : #ifdef CVTUTF_DEBUG
     241             : if (result == sourceIllegal) {
     242             :     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
     243             :     fflush(stderr);
     244             : }
     245             : #endif
     246           0 :     return result;
     247             : }
     248         126 : ConversionResult ConvertUTF16toUTF8 ( /* Re-encodes UTF-16 input from [*sourceStart, sourceEnd) as UTF-8 bytes into [*targetStart, targetEnd). */
     249             :         const UTF16** sourceStart, const UTF16* sourceEnd, /* in/out: on return *sourceStart is the first input unit not consumed */
     250             :         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { /* in/out: on return *targetStart is one past the last byte written; flags is only compared against strictConversion */
     251             :     ConversionResult result = conversionOK; /* return value; changed only when a problem is recorded below */
     252         126 :     const UTF16* source = *sourceStart;
     253         126 :     UTF8* target = *targetStart;
     254        1419 :     while (source < sourceEnd) {
     255             :         UTF32 ch;
     256             :         unsigned short bytesToWrite = 0;
     257             :         const UTF32 byteMask = 0xBF; /* AND-ed in to clear bit 6 of a trailing byte */
     258             :         const UTF32 byteMark = 0x80; /* OR-ed in to set bit 7; together with byteMask this yields 10xxxxxx */
     259             :         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
     260        1293 :         ch = *source++;
     261             :         /* If we have a surrogate pair, convert to UTF32 first. */
     262        1293 :         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     263             :             /* If the 16 bits following the high surrogate are in the source buffer... */
     264           0 :             if (source < sourceEnd) {
     265           0 :                 UTF32 ch2 = *source; /* peek only; consumed below if it completes the pair */
     266             :                 /* If it's a low surrogate, convert to UTF32. */
     267           0 :                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     268           0 :                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     269           0 :                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
     270           0 :                     ++source; /* the low surrogate has now been consumed as well */
     271           0 :                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     272             :                     --source; /* return to the illegal value itself */
     273             :                     result = sourceIllegal;
     274             :                     break;
     275             :                 } /* otherwise (non-strict): fall through and encode the unpaired surrogate value as it is */
     276             :             } else { /* We don't have the 16 bits following the high surrogate. */
     277             :                 --source; /* return to the high surrogate */
     278             :                 result = sourceExhausted;
     279             :                 break;
     280             :             }
     281        1293 :         } else if (flags == strictConversion) {
     282             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     283        1293 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { /* a low surrogate with no high surrogate before it */
     284             :                 --source; /* return to the illegal value itself */
     285             :                 result = sourceIllegal;
     286             :                 break;
     287             :             }
     288             :         }
     289             :         /* Figure out how many bytes the result will require */
     290        1293 :         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
     291          10 :         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     292          10 :         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     293             :         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
     294             :         } else {                            bytesToWrite = 3;
     295             :                                             ch = UNI_REPLACEMENT_CHAR; /* values of 0x110000 and up; a decoded surrogate pair is at most 0x10FFFF, so (assuming UTF16 is a 16-bit type) this branch looks unreachable - TODO confirm */
     296             :         }
     297             : 
     298        1293 :         target += bytesToWrite; /* jump to the end of the sequence: the bytes are stored back to front */
     299        1293 :         if (target > targetEnd) {
     300             :             source = oldSource; /* Back up source pointer! */
     301           0 :             target -= bytesToWrite; result = targetExhausted; break; /* undo the advance so that no partial sequence is reported */
     302             :         }
     303        1293 :         switch (bytesToWrite) { /* note: everything falls through. */
     304           0 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* each trailing byte carries the next 6 low bits */
     305          10 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     306          10 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     307        1293 :             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]); /* lead byte: remaining bits plus the length marker */
     308             :         }
     309        1293 :         target += bytesToWrite; /* target is back at the sequence start; step past what was written */
     310             :     }
     311         126 :     *sourceStart = source; /* report how far the input was consumed ... */
     312         126 :     *targetStart = target; /* ... and how far the output was filled, on success and on failure alike */
     313         126 :     return result;
     314             : }
     315             : 
     316             : /* --------------------------------------------------------------------- */
     317             : 
     318         381 : ConversionResult ConvertUTF32toUTF8 ( /* Re-encodes UTF-32 input from [*sourceStart, sourceEnd) as UTF-8 bytes into [*targetStart, targetEnd). */
     319             :         const UTF32** sourceStart, const UTF32* sourceEnd, /* in/out: on return *sourceStart is the first input unit not consumed */
     320             :         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { /* in/out: on return *targetStart is one past the last byte written; flags is only compared against strictConversion */
     321             :     ConversionResult result = conversionOK; /* return value; changed only when a problem is recorded below */
     322         381 :     const UTF32* source = *sourceStart;
     323         381 :     UTF8* target = *targetStart;
     324         958 :     while (source < sourceEnd) {
     325             :         UTF32 ch;
     326             :         unsigned short bytesToWrite = 0;
     327             :         const UTF32 byteMask = 0xBF; /* AND-ed in to clear bit 6 of a trailing byte */
     328             :         const UTF32 byteMark = 0x80; /* OR-ed in to set bit 7; together with byteMask this yields 10xxxxxx */
     329         577 :         ch = *source++;
     330         577 :         if (flags == strictConversion ) {
     331             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     332         577 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     333             :                 --source; /* return to the illegal value itself */
     334             :                 result = sourceIllegal;
     335             :                 break;
     336             :             }
     337             :         } /* with any other flags value a surrogate value is simply encoded as a 3-byte sequence below */
     338             :         /*
     339             :          * Figure out how many bytes the result will require. Turn any
     340             :          * illegally large UTF32 things (> Plane 17) into replacement chars.
     341             :          */
     342         577 :         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
     343         360 :         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     344         133 :         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     345          14 :         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
     346             :         } else {                            bytesToWrite = 3;
     347             :                                             ch = UNI_REPLACEMENT_CHAR;
     348             :                                             result = sourceIllegal; /* recorded regardless of flags; there is no break, so the replacement is still written and conversion continues */
     349             :         }
     350             : 
     351         577 :         target += bytesToWrite; /* jump to the end of the sequence: the bytes are stored back to front */
     352         577 :         if (target > targetEnd) {
     353             :             --source; /* Back up source pointer! */
     354           0 :             target -= bytesToWrite; result = targetExhausted; break; /* undo the advance so that no partial sequence is reported */
     355             :         }
     356         577 :         switch (bytesToWrite) { /* note: everything falls through. */
     357          14 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* each trailing byte carries the next 6 low bits */
     358         133 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     359         360 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     360         577 :             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); /* lead byte: remaining bits plus the length marker */
     361             :         }
     362         577 :         target += bytesToWrite; /* target is back at the sequence start; step past what was written */
     363             :     }
     364         381 :     *sourceStart = source; /* report how far the input was consumed ... */
     365         381 :     *targetStart = target; /* ... and how far the output was filled, on success and on failure alike */
     366         381 :     return result;
     367             : }
     368             : 
     369             : /* --------------------------------------------------------------------- */
     370             : 
     371             : /*
     372             :  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
     373             :  * This must be called with the length pre-determined by the first byte.
     374             :  * If not calling this from ConvertUTF8to*, then the length can be set by:
     375             :  *  length = trailingBytesForUTF8[*source]+1;
     376             :  * and the sequence is illegal right away if there aren't that many bytes
     377             :  * available.
     378             :  * If presented with a length > 4, this returns false.  The Unicode
     379             :  * definition of UTF-8 goes up to 4-byte sequences.
     380             :  */
     381             : 
     382   173283849 : static Boolean isLegalUTF8(const UTF8 *source, int length) { /* true if the `length` bytes at source pass the checks below; reads source[0 .. length-1] for lengths 1..4 without any bounds check of its own */
     383             :     UTF8 a; /* most recently examined trailing byte; after case 2 it holds source[1] */
     384   173283849 :     const UTF8 *srcptr = source+length; /* one past the last byte; trailing bytes are checked from last to first */
     385   173283849 :     switch (length) {
     386             :     default: return false; /* any length other than 1, 2, 3 or 4 */
     387             :         /* Everything else falls through when "true"... */
     388         396 :     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; /* trailing bytes must lie in 0x80..0xBF */
     389        4495 :     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     390        9360 :     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     391             : 
     392        9252 :         switch (*source) { /* some lead bytes narrow the allowed range of the second byte */
     393             :             /* no fall-through in this inner switch */
     394         291 :             case 0xE0: if (a < 0xA0) return false; break; /* E0 80..9F would be an overlong 3-byte form */
     395          30 :             case 0xED: if (a > 0x9F) return false; break; /* ED A0..BF would encode the surrogate range */
     396         342 :             case 0xF0: if (a < 0x90) return false; break; /* F0 80..8F would be an overlong 4-byte form */
     397           5 :             case 0xF4: if (a > 0x8F) return false; break; /* F4 90..BF would exceed 0x10FFFF */
     398             :             default:   if (a < 0x80) return false; /* already guaranteed by the case 2 check above */
     399             :         }
     400             : 
     401   173283493 :     case 1: if (*source >= 0x80 && *source < 0xC2) return false; /* 0x80..0xC1 can never start a well-formed sequence */
     402             :     }
     403   173283069 :     if (*source > 0xF4) return false; /* lead bytes above 0xF4 are rejected as well */
     404             :     return true;
     405             : }
     406             : 
     407             : /* --------------------------------------------------------------------- */
     408             : 
     409             : /*
     410             :  * Exported function to return whether a UTF-8 sequence is legal or not.
     411             :  * Apart from an assert() in findMaximalSubpartOfIllFormedUTF8Sequence, this is not used here; it's just exported.
     412             :  */
     413     4078495 : Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { /* true if a complete, legal UTF-8 sequence starts at source and ends at or before sourceEnd */
     414     4078495 :     int length = trailingBytesForUTF8[*source]+1; /* total length implied by the lead byte; *source is read unconditionally, so the caller must ensure source < sourceEnd */
     415     4078495 :     if (length > sourceEnd - source) { /* the sequence would run past the end of the buffer */
     416             :         return false;
     417             :     }
     418     4078483 :     return isLegalUTF8(source, length);
     419             : }
     420             : 
     421             : /* --------------------------------------------------------------------- */
     422             : 
     423             : static unsigned
     424         715 : findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, /* returns the byte length (0..3) of the maximal subpart of the ill-formed sequence starting at source */
     425             :                                           const UTF8 *sourceEnd) { /* sourceEnd: one past the last readable byte */
     426             :   UTF8 b1, b2, b3; /* first, second and third byte of the candidate sequence */
     427             : 
     428             :   assert(!isLegalUTF8Sequence(source, sourceEnd)); /* NOTE(review): isLegalUTF8Sequence reads *source, and this runs before the source == sourceEnd guard below - confirm callers never pass an empty range when asserts are enabled */
     429             : 
     430             :   /*
     431             :    * Unicode 6.3.0, D93b:
     432             :    *
     433             :    *   Maximal subpart of an ill-formed subsequence: The longest code unit
     434             :    *   subsequence starting at an unconvertible offset that is either:
     435             :    *   a. the initial subsequence of a well-formed code unit sequence, or
     436             :    *   b. a subsequence of length one.
     437             :    */
     438             : 
     439         715 :   if (source == sourceEnd) /* empty input: nothing to skip */
     440             :     return 0;
     441             : 
     442             :   /*
     443             :    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
     444             :    * Byte Sequences.
     445             :    */
     446             : 
     447         715 :   b1 = *source;
     448         715 :   ++source; /* source now points at the second byte, if there is one */
     449         715 :   if (b1 >= 0xC2 && b1 <= 0xDF) { /* lead byte of a 2-byte sequence */
     450             :     /*
     451             :      * First byte is valid, but we know that this code unit sequence is
     452             :      * invalid, so the maximal subpart has to end after the first byte.
     453             :      */
     454             :     return 1;
     455             :   }
     456             : 
     457         650 :   if (source == sourceEnd) /* only one byte available */
     458             :     return 1;
     459             : 
     460         496 :   b2 = *source;
     461         496 :   ++source; /* source now points at the third byte, if there is one */
     462             : 
     463         496 :   if (b1 == 0xE0) { /* 3-byte lead whose second byte is limited to A0..BF */
     464          20 :     return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
     465             :   }
     466         485 :   if (b1 >= 0xE1 && b1 <= 0xEC) { /* 3-byte leads accepting any continuation byte */
     467          50 :     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
     468             :   }
     469         459 :   if (b1 == 0xED) { /* 3-byte lead whose second byte is limited to 80..9F */
     470          72 :     return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
     471             :   }
     472         422 :   if (b1 >= 0xEE && b1 <= 0xEF) { /* remaining 3-byte leads, any continuation byte */
     473           9 :     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
     474             :   }
     475         416 :   if (b1 == 0xF0) { /* 4-byte lead whose second byte is limited to 90..BF */
     476          16 :     if (b2 >= 0x90 && b2 <= 0xBF) {
     477           4 :       if (source == sourceEnd) /* no third byte to inspect */
     478             :         return 2;
     479             : 
     480           2 :       b3 = *source;
     481           2 :       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; /* never 4: the whole sequence is known to be ill-formed */
     482             :     }
     483             :     return 1;
     484             :   }
     485         400 :   if (b1 >= 0xF1 && b1 <= 0xF3) { /* 4-byte leads accepting any continuation byte */
     486          10 :     if (b2 >= 0x80 && b2 <= 0xBF) {
     487           4 :       if (source == sourceEnd) /* no third byte to inspect */
     488             :         return 2;
     489             : 
     490           2 :       b3 = *source;
     491           2 :       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
     492             :     }
     493             :     return 1;
     494             :   }
     495         390 :   if (b1 == 0xF4) { /* 4-byte lead whose second byte is limited to 80..8F */
     496          12 :     if (b2 >= 0x80 && b2 <= 0x8F) {
     497           4 :       if (source == sourceEnd) /* no third byte to inspect */
     498             :         return 2;
     499             : 
     500           2 :       b3 = *source;
     501           2 :       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
     502             :     }
     503             :     return 1;
     504             :   }
     505             : 
     506             :   assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); /* every other lead-byte range returned above; b1 < 0x80 is not expected because the input is ill-formed */
     507             :   /*
     508             :    * There are no valid sequences that start with these bytes.  Maximal subpart
     509             :    * is defined to have length 1 in these cases.
     510             :    */
     511             :   return 1;
     512             : }
     513             : 
     514             : /* --------------------------------------------------------------------- */
     515             : 
     516             : /*
     517             :  * Exported function to return the total number of bytes in a codepoint
     518             :  * represented in UTF-8, given the value of the first byte.
     519             :  */
     520    23744622 : unsigned getNumBytesForUTF8(UTF8 first) { /* total sequence length implied by the lead byte `first`; no validity check is made */
     521    23744622 :   return trailingBytesForUTF8[first] + 1; /* trailing-byte count from the table, plus the lead byte itself */
     522             : }
     523             : 
     524             : /* --------------------------------------------------------------------- */
     525             : 
     526             : /*
     527             :  * Exported function to return whether a UTF-8 string is legal or not.
     528             :  * This is not used here; it's just exported.
     529             :  */
     530    14737432 : Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
     531   159956274 :     while (*source != sourceEnd) {
     532   145218905 :         int length = trailingBytesForUTF8[**source] + 1;
     533   145218905 :         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
     534             :             return false;
     535   145218842 :         *source += length;
     536             :     }
     537             :     return true;
     538             : }
     539             : 
     540             : /* --------------------------------------------------------------------- */
     541             : 
     542        5309 : ConversionResult ConvertUTF8toUTF16 (
     543             :         const UTF8** sourceStart, const UTF8* sourceEnd,
     544             :         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     545             :     ConversionResult result = conversionOK;
     546        5309 :     const UTF8* source = *sourceStart;
     547        5309 :     UTF16* target = *targetStart;
     548       38975 :     while (source < sourceEnd) {
     549             :         UTF32 ch = 0;
     550       33688 :         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     551       33688 :         if (extraBytesToRead >= sourceEnd - source) {
     552             :             result = sourceExhausted; break;
     553             :         }
     554             :         /* Do this check whether lenient or strict */
     555       33680 :         if (!isLegalUTF8(source, extraBytesToRead+1)) {
     556             :             result = sourceIllegal;
     557             :             break;
     558             :         }
     559             :         /*
     560             :          * The cases all fall through. See "Note A" below.
     561             :          */
     562       33666 :         switch (extraBytesToRead) {
     563           0 :             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     564           0 :             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     565           3 :             case 3: ch += *source++; ch <<= 6;
     566         101 :             case 2: ch += *source++; ch <<= 6;
     567         149 :             case 1: ch += *source++; ch <<= 6;
     568       33666 :             case 0: ch += *source++;
     569             :         }
     570       33666 :         ch -= offsetsFromUTF8[extraBytesToRead];
     571             : 
     572       33666 :         if (target >= targetEnd) {
     573           0 :             source -= (extraBytesToRead+1); /* Back up source pointer! */
     574           0 :             result = targetExhausted; break;
     575             :         }
     576       33666 :         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     577             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     578       33663 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     579           0 :                 if (flags == strictConversion) {
     580           0 :                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
     581             :                     result = sourceIllegal;
     582           0 :                     break;
     583             :                 } else {
     584           0 :                     *target++ = UNI_REPLACEMENT_CHAR;
     585             :                 }
     586             :             } else {
     587       33663 :                 *target++ = (UTF16)ch; /* normal case */
     588             :             }
     589           3 :         } else if (ch > UNI_MAX_UTF16) {
     590           0 :             if (flags == strictConversion) {
     591             :                 result = sourceIllegal;
     592           0 :                 source -= (extraBytesToRead+1); /* return to the start */
     593           0 :                 break; /* Bail out; shouldn't continue */
     594             :             } else {
     595           0 :                 *target++ = UNI_REPLACEMENT_CHAR;
     596             :             }
     597             :         } else {
     598             :             /* target is a character in range 0xFFFF - 0x10FFFF. */
     599           3 :             if (target + 1 >= targetEnd) {
     600           0 :                 source -= (extraBytesToRead+1); /* Back up source pointer! */
     601           0 :                 result = targetExhausted; break;
     602             :             }
     603           3 :             ch -= halfBase;
     604           3 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     605           3 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     606             :         }
     607             :     }
     608        5309 :     *sourceStart = source;
     609        5309 :     *targetStart = target;
     610        5309 :     return result;
     611             : }
     612             : 
     613             : /* --------------------------------------------------------------------- */
     614             : 
     615    23889036 : static ConversionResult ConvertUTF8toUTF32Impl(
     616             :         const UTF8** sourceStart, const UTF8* sourceEnd,
     617             :         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
     618             :         Boolean InputIsPartial) {
     619             :     ConversionResult result = conversionOK;
     620    23889036 :     const UTF8* source = *sourceStart;
     621    23889036 :     UTF32* target = *targetStart;
     622    47841876 :     while (source < sourceEnd) {
     623             :         UTF32 ch = 0;
     624    23952999 :         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     625    23952999 :         if (extraBytesToRead >= sourceEnd - source) {
     626         179 :             if (flags == strictConversion || InputIsPartial) {
     627             :                 result = sourceExhausted;
     628             :                 break;
     629             :             } else {
     630             :                 result = sourceIllegal;
     631             : 
     632             :                 /*
     633             :                  * Replace the maximal subpart of ill-formed sequence with
     634             :                  * replacement character.
     635             :                  */
     636         140 :                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
     637             :                                                                     sourceEnd);
     638         140 :                 *target++ = UNI_REPLACEMENT_CHAR;
     639         140 :                 continue;
     640             :             }
     641             :         }
     642    23952820 :         if (target >= targetEnd) {
     643             :             result = targetExhausted; break;
     644             :         }
     645             : 
     646             :         /* Do this check whether lenient or strict */
     647    23952802 :         if (!isLegalUTF8(source, extraBytesToRead+1)) {
     648             :             result = sourceIllegal;
     649         677 :             if (flags == strictConversion) {
     650             :                 /* Abort conversion. */
     651             :                 break;
     652             :             } else {
     653             :                 /*
     654             :                  * Replace the maximal subpart of ill-formed sequence with
     655             :                  * replacement character.
     656             :                  */
     657         575 :                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
     658             :                                                                     sourceEnd);
     659         575 :                 *target++ = UNI_REPLACEMENT_CHAR;
     660         575 :                 continue;
     661             :             }
     662             :         }
     663             :         /*
     664             :          * The cases all fall through. See "Note A" below.
     665             :          */
     666    23952125 :         switch (extraBytesToRead) {
     667           0 :             case 5: ch += *source++; ch <<= 6;
     668           0 :             case 4: ch += *source++; ch <<= 6;
     669         297 :             case 3: ch += *source++; ch <<= 6;
     670        2947 :             case 2: ch += *source++; ch <<= 6;
     671        6423 :             case 1: ch += *source++; ch <<= 6;
     672    23952125 :             case 0: ch += *source++;
     673             :         }
     674    23952125 :         ch -= offsetsFromUTF8[extraBytesToRead];
     675             : 
     676    23952125 :         if (ch <= UNI_MAX_LEGAL_UTF32) {
     677             :             /*
     678             :              * UTF-16 surrogate values are illegal in UTF-32, and anything
     679             :              * over Plane 17 (> 0x10FFFF) is illegal.
     680             :              */
     681    23952125 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     682           0 :                 if (flags == strictConversion) {
     683           0 :                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
     684             :                     result = sourceIllegal;
     685           0 :                     break;
     686             :                 } else {
     687           0 :                     *target++ = UNI_REPLACEMENT_CHAR;
     688             :                 }
     689             :             } else {
     690    23952125 :                 *target++ = ch;
     691             :             }
     692             :         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
     693             :             result = sourceIllegal;
     694           0 :             *target++ = UNI_REPLACEMENT_CHAR;
     695             :         }
     696             :     }
     697    23889036 :     *sourceStart = source;
     698    23889036 :     *targetStart = target;
     699    23889036 :     return result;
     700             : }
     701             : 
     702          18 : ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
     703             :                                            const UTF8 *sourceEnd,
     704             :                                            UTF32 **targetStart,
     705             :                                            UTF32 *targetEnd,
     706             :                                            ConversionFlags flags) {
     707          18 :   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
     708          18 :                                 flags, /*InputIsPartial=*/true);
     709             : }
     710             : 
     711    23889018 : ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
     712             :                                     const UTF8 *sourceEnd, UTF32 **targetStart,
     713             :                                     UTF32 *targetEnd, ConversionFlags flags) {
     714    23889018 :   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
     715    23889018 :                                 flags, /*InputIsPartial=*/false);
     716             : }
     717             : 
     718             : /* ---------------------------------------------------------------------
     719             : 
     720             :     Note A.
     721             :     The fall-through switches in UTF-8 reading code save a
     722             :     temp variable, some decrements & conditionals.  The switches
     723             :     are equivalent to the following loop:
     724             :         {
     725             :             int tmpBytesToRead = extraBytesToRead+1;
     726             :             do {
     727             :                 ch += *source++;
     728             :                 --tmpBytesToRead;
     729             :                 if (tmpBytesToRead) ch <<= 6;
     730             :             } while (tmpBytesToRead > 0);
     731             :         }
     732             :     In UTF-8 writing code, the switches on "bytesToWrite" are
     733             :     similarly unrolled loops.
     734             : 
     735             :    --------------------------------------------------------------------- */
     736             : 
     737             : } // namespace llvm
     738             : 
     739             : ConvertUTF_RESTORE_WARNINGS

Generated by: LCOV version 1.13