LLVM 22.0.0git
TextEncoding.cpp
Go to the documentation of this file.
1//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides utility classes to convert between different character
11/// encodings.
12///
13//===----------------------------------------------------------------------===//
14
20#include <system_error>
21
22#if HAVE_ICU
23#include <unicode/ucnv.h>
24#elif HAVE_ICONV
25#include <iconv.h>
26#endif
27
28using namespace llvm;
29
30// Normalize the charset name with the charset alias matching algorithm proposed
31// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
33 SmallVectorImpl<char> &Normalized) {
34 bool PrevDigit = false;
35 for (auto Ch : CSName) {
36 if (isAlnum(Ch)) {
37 Ch = toLower(Ch);
38 if (Ch != '0' || PrevDigit) {
39 PrevDigit = isDigit(Ch);
40 Normalized.push_back(Ch);
41 }
42 }
43 }
44}
45
46// Maps the encoding name to enum constant if possible.
47static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
48 SmallString<16> Normalized;
49 normalizeCharSetName(Name, Normalized);
50 if (Normalized.equals("utf8"))
51 return TextEncoding::UTF8;
52 if (Normalized.equals("ibm1047"))
53 return TextEncoding::IBM1047;
54 return std::nullopt;
55}
56
57LLVM_ATTRIBUTE_UNUSED static void
58HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
59 SmallVectorImpl<char> &Result) {
60 // No space left in output buffer. Double the size of the underlying
61 // memory in the SmallVectorImpl, adjust pointer and length and continue
62 // the conversion.
63 Capacity =
64 (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
65 Result.resize(0);
66 Result.resize_for_overwrite(Capacity);
67 Output = static_cast<char *>(Result.data());
68 OutputLength = Capacity;
69}
70
71namespace {
// Direction of a table-driven conversion.
enum ConversionType {
  UTF8ToIBM1047, // Source is UTF-8, result is IBM-1047.
  IBM1047ToUTF8, // Source is IBM-1047, result is UTF-8.
};
76
77// Support conversion between EBCDIC 1047 and UTF-8. This class uses
78// built-in translation tables that allow for translation between the
79// aforementioned encodings. The use of tables for conversion is only
80// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
81// encodings are not supported.
82class TextEncodingConverterTable final
84 const ConversionType ConvType;
85
86public:
87 TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
88
89 std::error_code convertString(StringRef Source,
90 SmallVectorImpl<char> &Result) override;
91
92 void reset() override {}
93};
94
95std::error_code
96TextEncodingConverterTable::convertString(StringRef Source,
97 SmallVectorImpl<char> &Result) {
98 switch (ConvType) {
99 case IBM1047ToUTF8:
100 ConverterEBCDIC::convertToUTF8(Source, Result);
101 return std::error_code();
102 case UTF8ToIBM1047:
103 return ConverterEBCDIC::convertToEBCDIC(Source, Result);
104 }
105 llvm_unreachable("Invalid ConvType!");
106 return std::error_code();
107}
108
109#if HAVE_ICU
110struct UConverterDeleter {
111 void operator()(UConverter *Converter) const {
112 if (Converter)
113 ucnv_close(Converter);
114 }
115};
116using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
117
118class TextEncodingConverterICU final
120 UConverterUniquePtr FromConvDesc;
121 UConverterUniquePtr ToConvDesc;
122
123public:
124 TextEncodingConverterICU(UConverterUniquePtr FromConverter,
125 UConverterUniquePtr ToConverter)
126 : FromConvDesc(std::move(FromConverter)),
127 ToConvDesc(std::move(ToConverter)) {}
128
129 std::error_code convertString(StringRef Source,
130 SmallVectorImpl<char> &Result) override;
131
132 void reset() override;
133};
134
135// TODO: The current implementation discards the partial result and restarts the
136// conversion from the beginning if there is a conversion error due to
137// insufficient buffer size. In the future, it would better to save the partial
138// result and resume the conversion for the remaining string.
139// TODO: Improve translation of ICU errors to error_code
140std::error_code
141TextEncodingConverterICU::convertString(StringRef Source,
142 SmallVectorImpl<char> &Result) {
143 // Setup the input in case it has no backing data.
144 size_t InputLength = Source.size();
145 const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
146
147 // Setup the output. We directly write into the SmallVector.
148 size_t Capacity = Result.capacity();
149 size_t OutputLength = Capacity;
150 Result.resize_for_overwrite(Capacity);
151 char *Output;
152 UErrorCode EC = U_ZERO_ERROR;
153
154 ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
155 &EC);
156 ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
157 NULL, &EC);
158 assert(U_SUCCESS(EC));
159
160 do {
161 EC = U_ZERO_ERROR;
162 const char *Input = In;
163
164 Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
165 ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
166 In + InputLength, /*pivotStart=*/NULL,
167 /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
168 /*pivotLimit=*/NULL, /*reset=*/true,
169 /*flush=*/true, &EC);
170 if (U_FAILURE(EC)) {
171 if (EC == U_BUFFER_OVERFLOW_ERROR) {
172 if (Capacity < Result.max_size()) {
173 HandleOverflow(Capacity, Output, OutputLength, Result);
174 continue;
175 } else
176 return std::error_code(E2BIG, std::generic_category());
177 }
178 // Some other error occured.
179 Result.resize(Output - Result.data());
180 return std::error_code(EILSEQ, std::generic_category());
181 }
182 break;
183 } while (true);
184
185 Result.resize(Output - Result.data());
186 return std::error_code();
187}
188
189void TextEncodingConverterICU::reset() {
190 ucnv_reset(&*FromConvDesc);
191 ucnv_reset(&*ToConvDesc);
192}
193
194#elif HAVE_ICONV
195class TextEncodingConverterIconv final
// Move-only owner of an iconv conversion descriptor. The descriptor is
// closed with iconv_close() when the owner is destroyed or is assigned a new
// descriptor. The value (iconv_t)-1 marks an owner that holds nothing.
class UniqueIconvT {
  iconv_t ConvDesc;

  // Closes the held descriptor, if any, and marks this owner as empty.
  void close() {
    if (ConvDesc != (iconv_t)-1) {
      iconv_close(ConvDesc);
      ConvDesc = (iconv_t)-1;
    }
  }

public:
  // Gives access to the raw descriptor for calls into the iconv API.
  operator iconv_t() const { return ConvDesc; }

  // Takes ownership of \p CD. Intentionally implicit so that a raw iconv_t
  // can be passed where a UniqueIconvT is expected.
  UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}

  ~UniqueIconvT() { close(); }

  // Transfers ownership from \p Other, leaving it empty.
  UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
    Other.ConvDesc = (iconv_t)-1;
  }

  // Transfers ownership from \p Other, leaving it empty. The descriptor held
  // before the assignment is closed first so that it is not leaked.
  UniqueIconvT &operator=(UniqueIconvT &&Other) {
    if (&Other != this) {
      close();
      ConvDesc = Other.ConvDesc;
      Other.ConvDesc = (iconv_t)-1;
    }
    return *this;
  }
};
220 UniqueIconvT ConvDesc;
221
222public:
223 TextEncodingConverterIconv(UniqueIconvT ConvDesc)
224 : ConvDesc(std::move(ConvDesc)) {}
225
226 std::error_code convertString(StringRef Source,
227 SmallVectorImpl<char> &Result) override;
228
229 void reset() override;
230};
231
232// TODO: The current implementation discards the partial result and restarts the
233// conversion from the beginning if there is a conversion error due to
234// insufficient buffer size. In the future, it would better to save the partial
235// result and resume the conversion for the remaining string.
236std::error_code
237TextEncodingConverterIconv::convertString(StringRef Source,
238 SmallVectorImpl<char> &Result) {
239 // Setup the output. We directly write into the SmallVector.
240 size_t Capacity = Result.capacity();
241 char *Output = static_cast<char *>(Result.data());
242 size_t OutputLength = Capacity;
243 Result.resize_for_overwrite(Capacity);
244
245 size_t Ret;
246 // Handle errors returned from iconv().
247 auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
248 this](size_t Ret) {
249 if (Ret == static_cast<size_t>(-1)) {
250 // An error occured. Check if we can gracefully handle it.
251 if (errno == E2BIG && Capacity < Result.max_size()) {
252 HandleOverflow(Capacity, Output, OutputLength, Result);
253 // Reset converter
254 reset();
255 return std::error_code();
256 } else {
257 // Some other error occured.
258 Result.resize(Output - Result.data());
259 return std::error_code(errno, std::generic_category());
260 }
261 } else {
262 // A positive return value indicates that some characters were converted
263 // in a nonreversible way, that is, replaced with a SUB symbol. Returning
264 // an error in this case makes sure that both conversion routines behave
265 // in the same way.
266 return std::make_error_code(std::errc::illegal_byte_sequence);
267 }
268 };
269
270 do {
271 // Setup the input. Use nullptr to reset iconv state if input length is
272 // zero.
273 size_t InputLength = Source.size();
274 char *Input = const_cast<char *>(InputLength ? Source.data() : "");
275 Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
276 if (Ret != 0) {
277 if (auto EC = HandleError(Ret))
278 return EC;
279 continue;
280 }
281 // Flush the converter
282 Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
283 if (Ret != 0) {
284 if (auto EC = HandleError(Ret))
285 return EC;
286 continue;
287 }
288 break;
289 } while (true);
290
291 // Re-adjust size to actual size.
292 Result.resize(Output - Result.data());
293 return std::error_code();
294}
295
296inline void TextEncodingConverterIconv::reset() {
297 iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
298}
299
300#endif // HAVE_ICONV
301} // namespace
302
305
306 // Text encodings should be distinct.
307 if (CPFrom == CPTo)
308 return std::make_error_code(std::errc::invalid_argument);
309
310 ConversionType Conversion;
311 if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
312 Conversion = UTF8ToIBM1047;
313 else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
314 Conversion = IBM1047ToUTF8;
315 else
316 return std::make_error_code(std::errc::invalid_argument);
317
319 std::make_unique<TextEncodingConverterTable>(Conversion));
320}
321
323 StringRef To) {
324 std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
325 std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
326 if (FromEncoding && ToEncoding) {
328 create(*FromEncoding, *ToEncoding);
329 if (Converter)
330 return Converter;
331 }
332#if HAVE_ICU
333 UErrorCode EC = U_ZERO_ERROR;
334 UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
335 if (U_FAILURE(EC))
336 return std::make_error_code(std::errc::invalid_argument);
337
338 UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
339 if (U_FAILURE(EC))
340 return std::make_error_code(std::errc::invalid_argument);
341
342 auto Converter = std::make_unique<TextEncodingConverterICU>(
343 std::move(FromConvDesc), std::move(ToConvDesc));
344 return TextEncodingConverter(std::move(Converter));
345#elif HAVE_ICONV
346 iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
347 if (ConvDesc == (iconv_t)-1)
348 return std::make_error_code(std::errc::invalid_argument);
350 std::make_unique<TextEncodingConverterIconv>(ConvDesc));
351#else
352 return std::make_error_code(std::errc::invalid_argument);
353#endif
354}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
BlockVerifier::State From
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:298
This file provides utility functions for converting between EBCDIC-1047 and UTF-8.
std::string Name
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1328
Early If Converter
static bool isDigit(const char C)
This file defines the SmallString class.
This file defines the SmallVector class.
This file contains some functions that are useful when dealing with strings.
static LLVM_ATTRIBUTE_UNUSED void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, SmallVectorImpl< char > &Result)
static std::optional< TextEncoding > getKnownEncoding(StringRef Name)
static void normalizeCharSetName(StringRef CSName, SmallVectorImpl< char > &Normalized)
This file provides a utility class to convert between different character set encodings.
X86 cmov Conversion
Represents either an error or a value T.
Definition: ErrorOr.h:56
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool equals(StringRef RHS) const
Check for string equality.
Definition: SmallString.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:233
Utility class to convert between different character encodings.
Definition: TextEncoding.h:82
static LLVM_ABI ErrorOr< TextEncodingConverter > create(TextEncoding From, TextEncoding To)
Creates a TextEncodingConverter instance.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI std::error_code convertToEBCDIC(StringRef Source, SmallVectorImpl< char > &Result)
LLVM_ABI void convertToUTF8(StringRef Source, SmallVectorImpl< char > &Result)
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
TextEncoding
Definition: TextEncoding.h:73
@ IBM1047
IBM EBCDIC 1047 character set encoding.
@ UTF8
UTF-8 character set encoding.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1886
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:851