LLVM 23.0.0git
TextEncoding.cpp
Go to the documentation of this file.
1//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides utility classes to convert between different character
11/// encodings.
12///
13//===----------------------------------------------------------------------===//
14
20#include <system_error>
21
22#if HAVE_ICU
23#if HAVE_WINDOWS_ICU
24#include <icu.h>
25#else
26#include <unicode/ucnv.h>
27#endif
28#elif HAVE_ICONV
29#include <iconv.h>
30#endif
31
32using namespace llvm;
33
34// Normalize the charset name with the charset alias matching algorithm proposed
35// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
37 SmallVectorImpl<char> &Normalized) {
38 bool PrevDigit = false;
39 for (auto Ch : CSName) {
40 if (isAlnum(Ch)) {
41 Ch = toLower(Ch);
42 if (Ch != '0' || PrevDigit) {
43 PrevDigit = isDigit(Ch);
44 Normalized.push_back(Ch);
45 }
46 }
47 }
48}
49
50// Maps the encoding name to enum constant if possible.
51static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
52 SmallString<16> Normalized;
53 normalizeCharSetName(Name, Normalized);
54 if (Normalized.equals("utf8"))
55 return TextEncoding::UTF8;
56 if (Normalized.equals("ibm1047"))
58 return std::nullopt;
59}
60
61[[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output,
62 size_t &OutputLength,
63 SmallVectorImpl<char> &Result) {
64 // No space left in output buffer. Double the size of the underlying
65 // memory in the SmallVectorImpl, adjust pointer and length and continue
66 // the conversion.
67 Capacity =
68 (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
69 Result.resize(0);
70 Result.resize_for_overwrite(Capacity);
71 Output = static_cast<char *>(Result.data());
72 OutputLength = Capacity;
73}
74
75namespace {
// Selects the direction of a table-driven conversion.
enum ConversionType {
  // Input is UTF-8, output is IBM-1047.
  UTF8ToIBM1047,
  // Input is IBM-1047, output is UTF-8.
  IBM1047ToUTF8,
};
80
81// Support conversion between EBCDIC 1047 and UTF-8. This class uses
82// built-in translation tables that allow for translation between the
83// aforementioned encodings. The use of tables for conversion is only
84// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
85// encodings are not supported.
86class TextEncodingConverterTable final
88 const ConversionType ConvType;
89
90public:
91 TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
92
93 std::error_code convertString(StringRef Source,
94 SmallVectorImpl<char> &Result) override;
95
96 void reset() override {}
97};
98
99std::error_code
100TextEncodingConverterTable::convertString(StringRef Source,
101 SmallVectorImpl<char> &Result) {
102 switch (ConvType) {
103 case IBM1047ToUTF8:
104 ConverterEBCDIC::convertToUTF8(Source, Result);
105 return std::error_code();
106 case UTF8ToIBM1047:
107 return ConverterEBCDIC::convertToEBCDIC(Source, Result);
108 }
109 llvm_unreachable("Invalid ConvType!");
110 return std::error_code();
111}
112
113#if HAVE_ICU
114struct UConverterDeleter {
115 void operator()(UConverter *Converter) const {
116 if (Converter)
117 ucnv_close(Converter);
118 }
119};
120using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
121
122class TextEncodingConverterICU final
123 : public details::TextEncodingConverterImplBase {
124 UConverterUniquePtr FromConvDesc;
125 UConverterUniquePtr ToConvDesc;
126
127public:
128 TextEncodingConverterICU(UConverterUniquePtr FromConverter,
129 UConverterUniquePtr ToConverter)
130 : FromConvDesc(std::move(FromConverter)),
131 ToConvDesc(std::move(ToConverter)) {}
132
133 std::error_code convertString(StringRef Source,
134 SmallVectorImpl<char> &Result) override;
135
136 void reset() override;
137};
138
139// TODO: The current implementation discards the partial result and restarts the
140// conversion from the beginning if there is a conversion error due to
141// insufficient buffer size. In the future, it would better to save the partial
142// result and resume the conversion for the remaining string.
143// TODO: Improve translation of ICU errors to error_code
144std::error_code
145TextEncodingConverterICU::convertString(StringRef Source,
146 SmallVectorImpl<char> &Result) {
147 // Setup the input in case it has no backing data.
148 size_t InputLength = Source.size();
149 const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
150
151 // Setup the output. We directly write into the SmallVector.
152 size_t Capacity = Result.capacity();
153 size_t OutputLength = Capacity;
154 Result.resize_for_overwrite(Capacity);
155 char *Output;
156 UErrorCode EC = U_ZERO_ERROR;
157
158 ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
159 &EC);
160 ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
161 NULL, &EC);
162 assert(U_SUCCESS(EC));
163
164 do {
165 EC = U_ZERO_ERROR;
166 const char *Input = In;
167
168 Output = static_cast<char *>(Result.data());
169 ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
170 In + InputLength, /*pivotStart=*/NULL,
171 /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
172 /*pivotLimit=*/NULL, /*reset=*/true,
173 /*flush=*/true, &EC);
174 if (U_FAILURE(EC)) {
175 if (EC == U_BUFFER_OVERFLOW_ERROR) {
176 if (Capacity < Result.max_size()) {
177 HandleOverflow(Capacity, Output, OutputLength, Result);
178 continue;
179 } else {
180 Result.resize(Output - Result.data());
181 return std::error_code(E2BIG, std::generic_category());
182 }
183 }
184 // Some other error occurred.
185 Result.resize(Output - Result.data());
186 return std::error_code(EILSEQ, std::generic_category());
187 }
188 break;
189 } while (true);
190
191 Result.resize(Output - Result.data());
192 return std::error_code();
193}
194
195void TextEncodingConverterICU::reset() {
196 ucnv_reset(&*FromConvDesc);
197 ucnv_reset(&*ToConvDesc);
198}
199
200#elif HAVE_ICONV
201class TextEncodingConverterIconv final
202 : public details::TextEncodingConverterImplBase {
203 class UniqueIconvT {
204 iconv_t ConvDesc;
205
206 public:
207 operator iconv_t() const { return ConvDesc; }
208 UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
209 ~UniqueIconvT() {
210 if (ConvDesc != (iconv_t)-1) {
211 iconv_close(ConvDesc);
212 ConvDesc = (iconv_t)-1;
213 }
214 }
215 UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
216 Other.ConvDesc = (iconv_t)-1;
217 }
218 UniqueIconvT &operator=(UniqueIconvT &&Other) {
219 if (&Other != this) {
220 ConvDesc = Other.ConvDesc;
221 Other.ConvDesc = (iconv_t)-1;
222 }
223 return *this;
224 }
225 };
226 UniqueIconvT ConvDesc;
227
228public:
229 TextEncodingConverterIconv(UniqueIconvT ConvDesc)
230 : ConvDesc(std::move(ConvDesc)) {}
231
232 std::error_code convertString(StringRef Source,
233 SmallVectorImpl<char> &Result) override;
234
235 void reset() override;
236};
237
238// TODO: The current implementation discards the partial result and restarts the
239// conversion from the beginning if there is a conversion error due to
240// insufficient buffer size. In the future, it would better to save the partial
241// result and resume the conversion for the remaining string.
242std::error_code
243TextEncodingConverterIconv::convertString(StringRef Source,
244 SmallVectorImpl<char> &Result) {
245 // Setup the output. We directly write into the SmallVector.
246 size_t Capacity = Result.capacity();
247 char *Output = static_cast<char *>(Result.data());
248 size_t OutputLength = Capacity;
249 Result.resize_for_overwrite(Capacity);
250
251 size_t Ret;
252 // Handle errors returned from iconv().
253 auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
254 this](size_t Ret) {
255 if (Ret == static_cast<size_t>(-1)) {
256 // An error occurred. Check if we can gracefully handle it.
257 if (errno == E2BIG && Capacity < Result.max_size()) {
258 HandleOverflow(Capacity, Output, OutputLength, Result);
259 // Reset converter
260 reset();
261 return std::error_code();
262 } else {
263 // Some other error occurred.
264 Result.resize(Output - Result.data());
265 return std::error_code(errno, std::generic_category());
266 }
267 } else {
268 // A positive return value indicates that some characters were converted
269 // in a nonreversible way, that is, replaced with a SUB symbol. Returning
270 // an error in this case makes sure that both conversion routines behave
271 // in the same way.
272 return std::make_error_code(std::errc::illegal_byte_sequence);
273 }
274 };
275
276 do {
277 size_t InputLength = Source.size();
278 char *Input = const_cast<char *>(Source.data());
279 Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
280 if (Ret != 0) {
281 if (auto EC = HandleError(Ret))
282 return EC;
283 continue;
284 }
285 // Flush the converter
286 Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
287 if (Ret != 0) {
288 if (auto EC = HandleError(Ret))
289 return EC;
290 continue;
291 }
292 break;
293 } while (true);
294
295 // Re-adjust size to actual size.
296 Result.resize(Output - Result.data());
297 return std::error_code();
298}
299
300inline void TextEncodingConverterIconv::reset() {
301 iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
302}
303
304#endif // HAVE_ICONV
305} // namespace
306
307ErrorOr<TextEncodingConverter>
309
310 // Text encodings should be distinct.
311 if (CPFrom == CPTo)
312 return std::make_error_code(std::errc::invalid_argument);
313
314 ConversionType Conversion;
315 if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
316 Conversion = UTF8ToIBM1047;
317 else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
318 Conversion = IBM1047ToUTF8;
319 else
320 return std::make_error_code(std::errc::invalid_argument);
321
322 return TextEncodingConverter(
323 std::make_unique<TextEncodingConverterTable>(Conversion));
324}
325
327 StringRef To) {
328 std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
329 std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
330 if (FromEncoding && ToEncoding) {
332 create(*FromEncoding, *ToEncoding);
333 if (Converter)
334 return Converter;
335 }
336#if HAVE_ICU
337 UErrorCode EC = U_ZERO_ERROR;
338 UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
339 if (U_FAILURE(EC))
340 return std::make_error_code(std::errc::invalid_argument);
341
342 UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
343 if (U_FAILURE(EC))
344 return std::make_error_code(std::errc::invalid_argument);
345
346 auto Converter = std::make_unique<TextEncodingConverterICU>(
347 std::move(FromConvDesc), std::move(ToConvDesc));
348 return TextEncodingConverter(std::move(Converter));
349#elif HAVE_ICONV
350 iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
351 if (ConvDesc == (iconv_t)-1)
352 return std::make_error_code(std::errc::invalid_argument);
353 return TextEncodingConverter(
354 std::make_unique<TextEncodingConverterIconv>(ConvDesc));
355#else
356 return std::make_error_code(std::errc::invalid_argument);
357#endif
358}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file provides utility functions for converting between EBCDIC-1047 and UTF-8.
Early If Converter
This file defines the SmallString class.
This file defines the SmallVector class.
This file contains some functions that are useful when dealing with strings.
static void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, SmallVectorImpl< char > &Result)
static std::optional< TextEncoding > getKnownEncoding(StringRef Name)
static void normalizeCharSetName(StringRef CSName, SmallVectorImpl< char > &Normalized)
This file provides a utility class to convert between different character set encodings.
X86 cmov Conversion
Represents either an error or a value T.
Definition ErrorOr.h:56
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
bool equals(StringRef RHS) const
Check for string equality.
Definition SmallString.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
static LLVM_ABI ErrorOr< TextEncodingConverter > create(TextEncoding From, TextEncoding To)
Creates a TextEncodingConverter instance.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI std::error_code convertToEBCDIC(StringRef Source, SmallVectorImpl< char > &Result)
LLVM_ABI void convertToUTF8(StringRef Source, SmallVectorImpl< char > &Result)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
char toLower(char x)
Returns the corresponding lowercase character if x is uppercase.
@ IBM1047
IBM EBCDIC 1047 character set encoding.
@ UTF8
UTF-8 character set encoding.
bool isDigit(char C)
Checks if character C is one of the 10 decimal digits.
bool isAlnum(char C)
Checks whether character C is either a decimal digit or an uppercase or lowercase letter as classifie...
@ Other
Any other memory.
Definition ModRef.h:68
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917