LLVM  15.0.0git
ConvertUTF.h
Go to the documentation of this file.
1 /*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *==------------------------------------------------------------------------==*/
8 /*
9  * Copyright 2001-2004 Unicode, Inc.
10  *
11  * Disclaimer
12  *
13  * This source code is provided as is by Unicode, Inc. No claims are
14  * made as to fitness for any particular purpose. No warranties of any
15  * kind are expressed or implied. The recipient agrees to determine
16  * applicability of information provided. If this file has been
17  * purchased on magnetic or optical media from Unicode, Inc., the
18  * sole remedy for any claim will be exchange of defective media
19  * within 90 days of receipt.
20  *
21  * Limitations on Rights to Redistribute This Code
22  *
23  * Unicode, Inc. hereby grants the right to freely use the information
24  * supplied in this file in the creation of products supporting the
25  * Unicode Standard, and to make copies of this file in any form
26  * for internal or external distribution as long as this notice
27  * remains attached.
28  */
29 
30 /* ---------------------------------------------------------------------
31 
32  Conversions between UTF-32, UTF-16, and UTF-8. Header file.
33 
34  Several functions are included here, forming a complete set of
35  conversions between the three formats. UTF-7 is not included
36  here, but is handled in a separate source file.
37 
38  Each of these routines takes pointers to input buffers and output
39  buffers. The input buffers are const.
40 
41  Each routine converts the text between *sourceStart and sourceEnd,
42  putting the result into the buffer between *targetStart and
43  targetEnd. Note: the end pointers are *after* the last item: e.g.
44  *(sourceEnd - 1) is the last item.
45 
46  The return result indicates whether the conversion was successful,
47  and if not, whether the problem was in the source or target buffers.
48  (Only the first encountered problem is indicated.)
49 
50  After the conversion, *sourceStart and *targetStart are both
51  updated to point to the end of the last text successfully converted
52  in the respective buffers.
53 
54  Input parameters:
55  sourceStart - pointer to a pointer to the source buffer.
56  The contents of this are modified on return so that
57  it points at the next thing to be converted.
58  targetStart - similarly, pointer to pointer to the target buffer.
59  sourceEnd, targetEnd - respectively pointers to the ends of the
60  two buffers, for overflow checking only.
61 
62  These conversion functions take a ConversionFlags argument. When this
63  flag is set to strict, both irregular sequences and isolated surrogates
64  will cause an error. When the flag is set to lenient, both irregular
65  sequences and isolated surrogates are converted.
66 
67  Whether the flag is strict or lenient, all illegal sequences will cause
68  an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
69  or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
70  must check for illegal sequences.
71 
72  When the flag is set to lenient, characters over 0x10FFFF are converted
73  to the replacement character; otherwise (when the flag is set to strict)
74  they constitute an error.
75 
76  Output parameters:
77  The value "sourceIllegal" is returned from some routines if the input
78  sequence is malformed. When "sourceIllegal" is returned, the source
79  value will point to the illegal value that caused the problem. E.g.,
80  in UTF-8 when a sequence is malformed, it points to the start of the
81  malformed sequence.
82 
83  Author: Mark E. Davis, 1994.
84  Rev History: Rick McGowan, fixes & updates May 2001.
85  Fixes & updates, Sept 2001.
86 
87 ------------------------------------------------------------------------ */
88 
89 #ifndef LLVM_SUPPORT_CONVERTUTF_H
90 #define LLVM_SUPPORT_CONVERTUTF_H
91 
92 #include <cstddef>
93 #include <string>
94 
95 #if defined(_WIN32)
96 #include <system_error>
97 #endif
98 
99 // Wrap everything in namespace llvm so that programs can link with llvm and
100 // their own version of the unicode libraries.
101 
102 namespace llvm {
103 
104 /* ---------------------------------------------------------------------
105  The following 4 definitions are compiler-specific.
106  The C standard does not guarantee that wchar_t has at least
107  16 bits, so wchar_t is no less portable than unsigned short!
108  All should be unsigned values to avoid sign extension during
109  bit mask & shift operations.
110 ------------------------------------------------------------------------ */
111 
112 typedef unsigned int UTF32; /* at least 32 bits */
113 typedef unsigned short UTF16; /* at least 16 bits */
114 typedef unsigned char UTF8; /* typically 8 bits */
115 typedef unsigned char Boolean; /* 0 or 1 */
116 
117 /* Some fundamental constants */
118 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
119 #define UNI_MAX_BMP (UTF32)0x0000FFFF
120 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
121 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
122 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
123 
124 #define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4
125 
126 #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
127 #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
128 
129 #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF
130 #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000
131 
132 typedef enum {
133  conversionOK, /* conversion successful */
134  sourceExhausted, /* partial character in source, but hit end */
135  targetExhausted, /* insuff. room in target for conversion */
136  sourceIllegal /* source sequence is illegal/malformed */
138 
139 typedef enum {
143 
145  const UTF8** sourceStart, const UTF8* sourceEnd,
146  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
147 
148 /**
149  * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
150  * incomplete code unit sequence, returns \c sourceExhausted.
151  */
153  const UTF8** sourceStart, const UTF8* sourceEnd,
154  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
155 
156 /**
157  * Convert a UTF8 sequence to UTF32. If the sequence ends in an
158  * incomplete code unit sequence, returns \c sourceIllegal.
159  */
161  const UTF8** sourceStart, const UTF8* sourceEnd,
162  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
163 
165  const UTF16** sourceStart, const UTF16* sourceEnd,
166  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
167 
169  const UTF32** sourceStart, const UTF32* sourceEnd,
170  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
171 
173  const UTF16** sourceStart, const UTF16* sourceEnd,
174  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
175 
177  const UTF32** sourceStart, const UTF32* sourceEnd,
178  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
179 
180 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
181 
182 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
183 
184 unsigned getNumBytesForUTF8(UTF8 firstByte);
185 
186 /*************************************************************************/
187 /* Below are LLVM-specific wrappers of the functions above. */
188 
189 template <typename T> class ArrayRef;
190 template <typename T> class SmallVectorImpl;
191 class StringRef;
192 
193 /**
194  * Convert a UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
195  * WideCharWidth. The converted data is written to ResultPtr, which needs to
196  * point to at least WideCharWidth * (Source.size() + 1) bytes. On success,
197  * ResultPtr will point one after the end of the copied string. On failure,
198  * ResultPtr will not be changed, and ErrorPtr will be set to the location of
199  * the first character which could not be converted.
200  * \return true on success.
201  */
202 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
203  char *&ResultPtr, const UTF8 *&ErrorPtr);
204 
205 /**
206 * Converts a UTF-8 StringRef to a std::wstring.
207 * \return true on success.
208 */
209 bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
210 
211 /**
212 * Converts a UTF-8 C-string to a std::wstring.
213 * \return true on success.
214 */
215 bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
216 
217 /**
218 * Converts a std::wstring to a UTF-8 encoded std::string.
219 * \return true on success.
220 */
221 bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
222 
223 
224 /**
225  * Convert a Unicode code point to a UTF8 sequence.
226  *
227  * \param Source a Unicode code point.
228  * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
229  * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
230  * updated to point one past the end of the converted sequence.
231  *
232  * \returns true on success.
233  */
234 bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
235 
236 /**
237  * Convert the first UTF8 sequence in the given source buffer to a UTF32
238  * code point.
239  *
240  * \param [in,out] source A pointer to the source buffer. If the conversion
241  * succeeds, this pointer will be updated to point to the byte just past the
242  * end of the converted sequence.
243  * \param sourceEnd A pointer just past the end of the source buffer.
244  * \param [out] target The converted code point.
245  * \param flags Whether the conversion is strict or lenient.
246  *
247  * \returns conversionOK on success
248  *
249  * \sa ConvertUTF8toUTF32
250  */
252  const UTF8 *sourceEnd,
253  UTF32 *target,
254  ConversionFlags flags) {
255  if (*source == sourceEnd)
256  return sourceExhausted;
257  unsigned size = getNumBytesForUTF8(**source);
258  if ((ptrdiff_t)size > sourceEnd - *source)
259  return sourceExhausted;
260  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
261 }
262 
263 /**
264  * Returns true if a blob of text starts with a UTF-16 big or little endian byte
265  * order mark.
266  */
267 bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
268 
269 /**
270  * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
271  *
272  * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
273  * \param [out] Out Converted UTF-8 is stored here on success.
274  * \returns true on success
275  */
276 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
277 
278 /**
279 * Converts a UTF16 string into a UTF8 std::string.
280 *
281 * \param [in] Src A buffer of UTF-16 encoded text.
282 * \param [out] Out Converted UTF-8 is stored here on success.
283 * \returns true on success
284 */
285 bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
286 
287 /**
288  * Converts a stream of raw bytes assumed to be UTF32 into a UTF8 std::string.
289  *
290  * \param [in] SrcBytes A buffer of what is assumed to be UTF-32 encoded text.
291  * \param [out] Out Converted UTF-8 is stored here on success.
292  * \returns true on success
293  */
294 bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
295 
296 /**
297  * Converts a UTF32 string into a UTF8 std::string.
298  *
299  * \param [in] Src A buffer of UTF-32 encoded text.
300  * \param [out] Out Converted UTF-8 is stored here on success.
301  * \returns true on success
302  */
303 bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
304 
305 /**
306  * Converts a UTF-8 string into a UTF-16 string with native endianness.
307  *
308  * \returns true on success
309  */
310 bool convertUTF8ToUTF16String(StringRef SrcUTF8,
311  SmallVectorImpl<UTF16> &DstUTF16);
312 
313 #if defined(_WIN32)
314 namespace sys {
315 namespace windows {
316 std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
317 /// Convert to UTF16 from the current code page used in the system
318 std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
319 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
320  SmallVectorImpl<char> &utf8);
321 /// Convert from UTF16 to the current code page used in the system
322 std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
323  SmallVectorImpl<char> &utf8);
324 } // namespace windows
325 } // namespace sys
326 #endif
327 
328 } /* end namespace llvm */
329 
330 #endif
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::ConvertUTF8toUTF32Partial
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:701
llvm::conversionOK
@ conversionOK
Definition: ConvertUTF.h:133
llvm::ConvertCodePointToUTF8
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr)
Convert an Unicode code point to UTF8 sequence.
Definition: ConvertUTFWrapper.cpp:65
llvm::hasUTF16ByteOrderMark
bool hasUTF16ByteOrderMark(ArrayRef< char > SrcBytes)
Returns true if a blob of text starts with a UTF-16 big or little endian byte order mark.
Definition: ConvertUTFWrapper.cpp:79
llvm::Boolean
unsigned char Boolean
Definition: ConvertUTF.h:115
llvm::sourceIllegal
@ sourceIllegal
Definition: ConvertUTF.h:136
llvm::ConvertUTF32toUTF8
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:317
ptrdiff_t
llvm::sourceExhausted
@ sourceExhausted
Definition: ConvertUTF.h:134
llvm::convertUTF8Sequence
ConversionResult convertUTF8Sequence(const UTF8 **source, const UTF8 *sourceEnd, UTF32 *target, ConversionFlags flags)
Convert the first UTF8 sequence in the given source buffer to a UTF32 code point.
Definition: ConvertUTF.h:251
llvm::ConversionFlags
ConversionFlags
Definition: ConvertUTF.h:139
llvm::targetExhausted
@ targetExhausted
Definition: ConvertUTF.h:135
llvm::convertUTF16ToUTF8String
bool convertUTF16ToUTF8String(ArrayRef< char > SrcBytes, std::string &Out)
Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
Definition: ConvertUTFWrapper.cpp:84
llvm::lenientConversion
@ lenientConversion
Definition: ConvertUTF.h:141
llvm::ConversionResult
ConversionResult
Definition: ConvertUTF.h:132
llvm::isLegalUTF8Sequence
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:412
llvm::ConvertUTF32toUTF16
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:144
llvm::ConvertUTF8toUTF32
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:710
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1598
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::convertWideToUTF8
bool convertWideToUTF8(const std::wstring &Source, std::string &Result)
Converts a std::wstring to a UTF-8 encoded std::string.
Definition: ConvertUTFWrapper.cpp:271
llvm::strictConversion
@ strictConversion
Definition: ConvertUTF.h:140
llvm::getNumBytesForUTF8
unsigned getNumBytesForUTF8(UTF8 firstByte)
Definition: ConvertUTF.cpp:519
llvm::convertUTF8ToUTF16String
bool convertUTF8ToUTF16String(StringRef SrcUTF8, SmallVectorImpl< UTF16 > &DstUTF16)
Converts a UTF-8 string into a UTF-16 string with native endianness.
Definition: ConvertUTFWrapper.cpp:200
llvm::ConvertUTF8toUTF16
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:541
llvm::ConvertUTF16toUTF8
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:247
llvm::isLegalUTF8String
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:529
llvm::ConvertUTF8toWide
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, char *&ResultPtr, const UTF8 *&ErrorPtr)
Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on WideCharWidth.
Definition: ConvertUTFWrapper.cpp:19
llvm::UTF32
unsigned int UTF32
Definition: ConvertUTF.h:112
llvm::UTF16
unsigned short UTF16
Definition: ConvertUTF.h:113
llvm::UTF8
unsigned char UTF8
Definition: ConvertUTF.h:114
llvm::ConvertUTF16toUTF32
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:193
llvm::convertUTF32ToUTF8String
bool convertUTF32ToUTF8String(ArrayRef< char > SrcBytes, std::string &Out)
Converts a stream of raw bytes assumed to be UTF32 into a UTF8 std::string.
Definition: ConvertUTFWrapper.cpp:142