LLVM 23.0.0git
TextEncoding.cpp
Go to the documentation of this file.
1//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides utility classes to convert between different character
11/// encodings.
12///
13//===----------------------------------------------------------------------===//
14
19#include "llvm/Config/config.h"
21#include <system_error>
22
23#if HAVE_ICU
24#if HAVE_WINDOWS_ICU
25#include <icu.h>
26#else
27#include <unicode/ucnv.h>
28#endif
29#elif HAVE_ICONV
30#include <iconv.h>
31#endif
32
33using namespace llvm;
34
35// Normalize the charset name with the charset alias matching algorithm proposed
36// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
38 SmallVectorImpl<char> &Normalized) {
39 bool PrevDigit = false;
40 for (auto Ch : CSName) {
41 if (isAlnum(Ch)) {
42 Ch = toLower(Ch);
43 if (Ch != '0' || PrevDigit) {
44 PrevDigit = isDigit(Ch);
45 Normalized.push_back(Ch);
46 }
47 }
48 }
49}
50
51// Maps the encoding name to enum constant if possible.
52static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
53 SmallString<16> Normalized;
54 normalizeCharSetName(Name, Normalized);
55 if (Normalized.equals("utf8"))
56 return TextEncoding::UTF8;
57 if (Normalized.equals("ibm1047"))
59 return std::nullopt;
60}
61
62[[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output,
63 size_t &OutputLength,
64 SmallVectorImpl<char> &Result) {
65 // No space left in output buffer. Double the size of the underlying
66 // memory in the SmallVectorImpl, adjust pointer and length and continue
67 // the conversion.
68 Capacity =
69 (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
70 Result.resize(0);
71 Result.resize_for_overwrite(Capacity);
72 Output = static_cast<char *>(Result.data());
73 OutputLength = Capacity;
74}
75
76namespace {
// Direction of a conversion performed with the built-in translation tables.
enum ConversionType {
  // Source is UTF-8, result is EBCDIC 1047.
  UTF8ToIBM1047,
  // Source is EBCDIC 1047, result is UTF-8.
  IBM1047ToUTF8,
};
81
82// Support conversion between EBCDIC 1047 and UTF-8. This class uses
83// built-in translation tables that allow for translation between the
84// aforementioned encodings. The use of tables for conversion is only
85// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
86// encodings are not supported.
87class TextEncodingConverterTable final
89 const ConversionType ConvType;
90
91public:
92 TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
93
94 std::error_code convertString(StringRef Source,
95 SmallVectorImpl<char> &Result) override;
96
97 void reset() override {}
98};
99
100std::error_code
101TextEncodingConverterTable::convertString(StringRef Source,
102 SmallVectorImpl<char> &Result) {
103 switch (ConvType) {
104 case IBM1047ToUTF8:
105 ConverterEBCDIC::convertToUTF8(Source, Result);
106 return std::error_code();
107 case UTF8ToIBM1047:
108 return ConverterEBCDIC::convertToEBCDIC(Source, Result);
109 }
110 llvm_unreachable("Invalid ConvType!");
111 return std::error_code();
112}
113
114#if HAVE_ICU
115struct UConverterDeleter {
116 void operator()(UConverter *Converter) const {
117 if (Converter)
118 ucnv_close(Converter);
119 }
120};
121using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
122
123class TextEncodingConverterICU final
124 : public details::TextEncodingConverterImplBase {
125 UConverterUniquePtr FromConvDesc;
126 UConverterUniquePtr ToConvDesc;
127
128public:
129 TextEncodingConverterICU(UConverterUniquePtr FromConverter,
130 UConverterUniquePtr ToConverter)
131 : FromConvDesc(std::move(FromConverter)),
132 ToConvDesc(std::move(ToConverter)) {}
133
134 std::error_code convertString(StringRef Source,
135 SmallVectorImpl<char> &Result) override;
136
137 void reset() override;
138};
139
140// TODO: The current implementation discards the partial result and restarts the
141// conversion from the beginning if there is a conversion error due to
142// insufficient buffer size. In the future, it would better to save the partial
143// result and resume the conversion for the remaining string.
144// TODO: Improve translation of ICU errors to error_code
145std::error_code
146TextEncodingConverterICU::convertString(StringRef Source,
147 SmallVectorImpl<char> &Result) {
148 // Setup the input in case it has no backing data.
149 size_t InputLength = Source.size();
150 const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
151
152 // Setup the output. We directly write into the SmallVector.
153 size_t Capacity = Result.capacity();
154 size_t OutputLength = Capacity;
155 Result.resize_for_overwrite(Capacity);
156 char *Output;
157 UErrorCode EC = U_ZERO_ERROR;
158
159 ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
160 &EC);
161 ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
162 NULL, &EC);
163 assert(U_SUCCESS(EC));
164
165 do {
166 EC = U_ZERO_ERROR;
167 const char *Input = In;
168
169 Output = static_cast<char *>(Result.data());
170 ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
171 In + InputLength, /*pivotStart=*/NULL,
172 /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
173 /*pivotLimit=*/NULL, /*reset=*/true,
174 /*flush=*/true, &EC);
175 if (U_FAILURE(EC)) {
176 if (EC == U_BUFFER_OVERFLOW_ERROR) {
177 if (Capacity < Result.max_size()) {
178 HandleOverflow(Capacity, Output, OutputLength, Result);
179 continue;
180 } else {
181 Result.resize(Output - Result.data());
182 return std::error_code(E2BIG, std::generic_category());
183 }
184 }
185 // Some other error occurred.
186 Result.resize(Output - Result.data());
187 return std::error_code(EILSEQ, std::generic_category());
188 }
189 break;
190 } while (true);
191
192 Result.resize(Output - Result.data());
193 return std::error_code();
194}
195
196void TextEncodingConverterICU::reset() {
197 ucnv_reset(&*FromConvDesc);
198 ucnv_reset(&*ToConvDesc);
199}
200
201#elif HAVE_ICONV
202class TextEncodingConverterIconv final
203 : public details::TextEncodingConverterImplBase {
204 class UniqueIconvT {
205 iconv_t ConvDesc;
206
207 public:
208 operator iconv_t() const { return ConvDesc; }
209 UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
210 ~UniqueIconvT() {
211 if (ConvDesc != (iconv_t)-1) {
212 iconv_close(ConvDesc);
213 ConvDesc = (iconv_t)-1;
214 }
215 }
216 UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
217 Other.ConvDesc = (iconv_t)-1;
218 }
219 UniqueIconvT &operator=(UniqueIconvT &&Other) {
220 if (&Other != this) {
221 ConvDesc = Other.ConvDesc;
222 Other.ConvDesc = (iconv_t)-1;
223 }
224 return *this;
225 }
226 };
227 UniqueIconvT ConvDesc;
228
229public:
230 TextEncodingConverterIconv(UniqueIconvT ConvDesc)
231 : ConvDesc(std::move(ConvDesc)) {}
232
233 std::error_code convertString(StringRef Source,
234 SmallVectorImpl<char> &Result) override;
235
236 void reset() override;
237};
238
239// TODO: The current implementation discards the partial result and restarts the
240// conversion from the beginning if there is a conversion error due to
241// insufficient buffer size. In the future, it would better to save the partial
242// result and resume the conversion for the remaining string.
243std::error_code
244TextEncodingConverterIconv::convertString(StringRef Source,
245 SmallVectorImpl<char> &Result) {
246 // Setup the output. We directly write into the SmallVector.
247 size_t Capacity = Result.capacity();
248 char *Output = static_cast<char *>(Result.data());
249 size_t OutputLength = Capacity;
250 Result.resize_for_overwrite(Capacity);
251
252 size_t Ret;
253 // Handle errors returned from iconv().
254 auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
255 this](size_t Ret) {
256 if (Ret == static_cast<size_t>(-1)) {
257 // An error occurred. Check if we can gracefully handle it.
258 if (errno == E2BIG && Capacity < Result.max_size()) {
259 HandleOverflow(Capacity, Output, OutputLength, Result);
260 // Reset converter
261 reset();
262 return std::error_code();
263 } else {
264 // Some other error occurred.
265 Result.resize(Output - Result.data());
266 return std::error_code(errno, std::generic_category());
267 }
268 } else {
269 // A positive return value indicates that some characters were converted
270 // in a nonreversible way, that is, replaced with a SUB symbol. Returning
271 // an error in this case makes sure that both conversion routines behave
272 // in the same way.
273 return std::make_error_code(std::errc::illegal_byte_sequence);
274 }
275 };
276
277 do {
278 size_t InputLength = Source.size();
279 char *Input = const_cast<char *>(Source.data());
280 Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
281 if (Ret != 0) {
282 if (auto EC = HandleError(Ret))
283 return EC;
284 continue;
285 }
286 // Flush the converter
287 Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
288 if (Ret != 0) {
289 if (auto EC = HandleError(Ret))
290 return EC;
291 continue;
292 }
293 break;
294 } while (true);
295
296 // Re-adjust size to actual size.
297 Result.resize(Output - Result.data());
298 return std::error_code();
299}
300
301inline void TextEncodingConverterIconv::reset() {
302 iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
303}
304
305#endif // HAVE_ICONV
306} // namespace
307
308ErrorOr<TextEncodingConverter>
310
311 // Text encodings should be distinct.
312 if (CPFrom == CPTo)
313 return std::make_error_code(std::errc::invalid_argument);
314
315 ConversionType Conversion;
316 if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
317 Conversion = UTF8ToIBM1047;
318 else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
319 Conversion = IBM1047ToUTF8;
320 else
321 return std::make_error_code(std::errc::invalid_argument);
322
323 return TextEncodingConverter(
324 std::make_unique<TextEncodingConverterTable>(Conversion));
325}
326
328 StringRef To) {
329 std::optional<TextEncoding> FromEncoding = getKnownEncoding(From);
330 std::optional<TextEncoding> ToEncoding = getKnownEncoding(To);
331 if (FromEncoding && ToEncoding) {
333 create(*FromEncoding, *ToEncoding);
334 if (Converter)
335 return Converter;
336 }
337#if HAVE_ICU
338 UErrorCode EC = U_ZERO_ERROR;
339 UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
340 if (U_FAILURE(EC))
341 return std::make_error_code(std::errc::invalid_argument);
342
343 UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
344 if (U_FAILURE(EC))
345 return std::make_error_code(std::errc::invalid_argument);
346
347 auto Converter = std::make_unique<TextEncodingConverterICU>(
348 std::move(FromConvDesc), std::move(ToConvDesc));
349 return TextEncodingConverter(std::move(Converter));
350#elif HAVE_ICONV
351 iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
352 if (ConvDesc == (iconv_t)-1)
353 return std::make_error_code(std::errc::invalid_argument);
354 return TextEncodingConverter(
355 std::make_unique<TextEncodingConverterIconv>(ConvDesc));
356#else
357 return std::make_error_code(std::errc::invalid_argument);
358#endif
359}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file provides utility functions for converting between EBCDIC-1047 and UTF-8.
Early If Converter
This file defines the SmallString class.
This file defines the SmallVector class.
This file contains some functions that are useful when dealing with strings.
static void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, SmallVectorImpl< char > &Result)
static std::optional< TextEncoding > getKnownEncoding(StringRef Name)
static void normalizeCharSetName(StringRef CSName, SmallVectorImpl< char > &Normalized)
This file provides a utility class to convert between different character set encodings.
X86 cmov Conversion
Represents either an error or a value T.
Definition ErrorOr.h:56
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
bool equals(StringRef RHS) const
Check for string equality.
Definition SmallString.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::string str() const
Get the contents as an std::string.
Definition StringRef.h:222
static LLVM_ABI ErrorOr< TextEncodingConverter > create(TextEncoding From, TextEncoding To)
Creates a TextEncodingConverter instance.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI std::error_code convertToEBCDIC(StringRef Source, SmallVectorImpl< char > &Result)
LLVM_ABI void convertToUTF8(StringRef Source, SmallVectorImpl< char > &Result)
This is an optimization pass for GlobalISel generic memory operations.
char toLower(char x)
Returns the corresponding lowercase character if x is uppercase.
@ IBM1047
IBM EBCDIC 1047 character set encoding.
@ UTF8
UTF-8 character set encoding.
bool isDigit(char C)
Checks if character C is one of the 10 decimal digits.
bool isAlnum(char C)
Checks whether character C is either a decimal digit or an uppercase or lowercase letter as classifie...
@ Other
Any other memory.
Definition ModRef.h:68
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917