LLVM 17.0.0git
UnicodeNameToCodepoint.cpp
Go to the documentation of this file.
1//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2//-*- C++ -*-===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements functions to map the name or alias of a unicode
11// character to its codepoint.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/StringRef.h"
19
20namespace llvm {
21namespace sys {
22namespace unicode {
23
24extern const char *UnicodeNameToCodepointDict;
25extern const uint8_t *UnicodeNameToCodepointIndex;
26extern const std::size_t UnicodeNameToCodepointIndexSize;
27extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28
30
31struct Node {
32 bool IsRoot = false;
33 char32_t Value = 0xFFFFFFFF;
35 bool HasSibling = false;
38 const Node *Parent = nullptr;
39
40 constexpr bool isValid() const {
41 return !Name.empty() || Value == 0xFFFFFFFF;
42 }
43 constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44
45 std::string fullName() const {
46 std::string S;
47 // Reserve enough space for most unicode code points.
48 // The chosen value represent the 99th percentile of name size as of
49 // Unicode 15.0.
50 S.reserve(46);
51 const Node *N = this;
52 while (N) {
53 std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
54 N = N->Parent;
55 }
56 std::reverse(S.begin(), S.end());
57 return S;
58 }
59};
60
61static Node createRoot() {
62 Node N;
63 N.IsRoot = true;
64 N.ChildrenOffset = 1;
65 N.Size = 1;
66 return N;
67}
68
69static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70 if (Offset == 0)
71 return createRoot();
72
73 uint32_t Origin = Offset;
74 Node N;
75 N.Parent = Parent;
76 uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
78 return N;
79
80 bool LongName = NameInfo & 0x40;
81 bool HasValue = NameInfo & 0x80;
82 std::size_t Size = NameInfo & ~0xC0;
83 if (LongName) {
84 uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85 NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86 N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87 } else {
89 }
90 if (HasValue) {
94 N.Value = ((H << 16) | (M << 8) | L) >> 3;
95
96 bool HasChildren = L & 0x02;
97 N.HasSibling = L & 0x01;
98
99 if (HasChildren) {
100 N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103 }
104 } else {
106 N.HasSibling = H & 0x80;
107 bool HasChildren = H & 0x40;
108 H &= uint8_t(~0xC0);
109 if (HasChildren) {
110 N.ChildrenOffset = (H << 16);
111 N.ChildrenOffset |=
113 N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114 }
115 }
116 N.Size = Offset - Origin;
117 return N;
118}
119
120static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121 std::size_t &Consummed, char &PreviousCharInName,
122 char &PreviousCharInNeedle, bool IsPrefix = false) {
123
124 Consummed = 0;
125 if (Strict) {
126 if (!Name.startswith(Needle))
127 return false;
128 Consummed = Needle.size();
129 return true;
130 }
131 if (Needle.empty())
132 return true;
133
134 auto NamePos = Name.begin();
135 auto NeedlePos = Needle.begin();
136
137 char PreviousCharInNameOrigin = PreviousCharInName;
138 char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
139
140 auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
141 bool IgnoreEnd = false) {
142 while (It != End) {
143 const auto Next = std::next(It);
144 // Ignore spaces, underscore, medial hyphens
145 // https://unicode.org/reports/tr44/#UAX44-LM2.
146 bool Ignore =
147 *It == ' ' || *It == '_' ||
148 (*It == '-' && isAlnum(PreviousChar) &&
149 ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
150 PreviousChar = *It;
151 if (!Ignore)
152 break;
153 ++It;
154 }
155 return It;
156 };
157
158 while (true) {
159 NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160 NeedlePos =
161 IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162 if (NeedlePos == Needle.end())
163 break;
164 if (NamePos == Name.end())
165 break;
166 if (toUpper(*NeedlePos) != toUpper(*NamePos))
167 break;
168 NeedlePos++;
169 NamePos++;
170 }
171 Consummed = std::distance(Name.begin(), NamePos);
172 if (NeedlePos != Needle.end()) {
173 PreviousCharInName = PreviousCharInNameOrigin;
174 PreviousCharInNeedle = PreviousCharInNeedleOrigin;
175 }
176 return NeedlePos == Needle.end();
177}
178
179static std::tuple<Node, bool, uint32_t>
181 char PreviousCharInName, char PreviousCharInNeedle,
182 BufferType &Buffer, const Node *Parent = nullptr) {
183 Node N = readNode(Offset, Parent);
184 std::size_t Consummed = 0;
185 bool DoesStartWith =
186 N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
187 PreviousCharInName, PreviousCharInNeedle);
188 if (!DoesStartWith)
189 return std::make_tuple(N, false, 0);
190
191 if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
192 return std::make_tuple(N, true, N.Value);
193
194 if (N.hasChildren()) {
195 uint32_t ChildOffset = N.ChildrenOffset;
196 for (;;) {
197 Node C;
198 bool Matches;
200 std::tie(C, Matches, Value) =
201 compareNode(ChildOffset, Name.substr(Consummed), Strict,
202 PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
203 if (Matches) {
204 std::reverse_copy(C.Name.begin(), C.Name.end(),
205 std::back_inserter(Buffer));
206 return std::make_tuple(N, true, Value);
207 }
208 ChildOffset += C.Size;
209 if (!C.HasSibling)
210 break;
211 }
212 }
213 return std::make_tuple(N, false, 0);
214}
215
216static std::tuple<Node, bool, uint32_t>
218 return compareNode(Offset, Name, Strict, 0, 0, Buffer);
219}
220
221// clang-format off
222constexpr const char *const HangulSyllables[][3] = {
223 { "G", "A", "" },
224 { "GG", "AE", "G" },
225 { "N", "YA", "GG" },
226 { "D", "YAE", "GS" },
227 { "DD", "EO", "N", },
228 { "R", "E", "NJ" },
229 { "M", "YEO", "NH" },
230 { "B", "YE", "D" },
231 { "BB", "O", "L" },
232 { "S", "WA", "LG" },
233 { "SS", "WAE", "LM" },
234 { "", "OE", "LB" },
235 { "J", "YO", "LS" },
236 { "JJ", "U", "LT" },
237 { "C", "WEO", "LP" },
238 { "K", "WE", "LH" },
239 { "T", "WI", "M" },
240 { "P", "YU", "B" },
241 { "H", "EU", "BS" },
242 { 0, "YI", "S" },
243 { 0, "I", "SS" },
244 { 0, 0, "NG" },
245 { 0, 0, "J" },
246 { 0, 0, "C" },
247 { 0, 0, "K" },
248 { 0, 0, "T" },
249 { 0, 0, "P" },
250 { 0, 0, "H" }
251 };
252// clang-format on
253
254// Unicode 15.0
255// 3.12 Conjoining Jamo Behavior Common constants
256constexpr const char32_t SBase = 0xAC00;
257constexpr const uint32_t LCount = 19;
258constexpr const uint32_t VCount = 21;
259constexpr const uint32_t TCount = 28;
260
261static std::size_t findSyllable(StringRef Name, bool Strict,
262 char &PreviousInName, int &Pos, int Column) {
263 assert(Column == 0 || Column == 1 || Column == 2);
264 static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
265 char NeedleStart = 0;
266 int Len = -1;
267 int Prev = PreviousInName;
268 for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
269 StringRef Syllable(HangulSyllables[I][Column]);
270 if (int(Syllable.size()) <= Len)
271 continue;
272 std::size_t Consummed = 0;
273 char PreviousInNameCopy = PreviousInName;
274 bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
275 PreviousInNameCopy, NeedleStart);
276 if (!DoesStartWith)
277 continue;
278 Len = Consummed;
279 Pos = I;
280 Prev = PreviousInNameCopy;
281 }
282 if (Len == -1)
283 return 0;
284 PreviousInName = Prev;
285 return size_t(Len);
286}
287
288static std::optional<char32_t>
290 Buffer.clear();
291 // Hangul Syllable Decomposition
292 std::size_t Consummed = 0;
293 char NameStart = 0, NeedleStart = 0;
294 bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
295 NameStart, NeedleStart);
296 if (!DoesStartWith)
297 return std::nullopt;
298 Name = Name.substr(Consummed);
299 int L = -1, V = -1, T = -1;
300 Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
301 Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
302 Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
303 if (L != -1 && V != -1 && T != -1 && Name.empty()) {
304 if (!Strict) {
305 Buffer.append("HANGUL SYLLABLE ");
306 if (L != -1)
307 Buffer.append(HangulSyllables[L][0]);
308 if (V != -1)
309 Buffer.append(HangulSyllables[V][1]);
310 if (T != -1)
311 Buffer.append(HangulSyllables[T][2]);
312 }
313 return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
314 std::uint32_t(T);
315 }
316 // Otherwise, it's an illegal syllable name.
317 return std::nullopt;
318}
319
324};
325
326// Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings
328 {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
329 {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
330 {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
331 {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
332 {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
333 {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
334 {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
335 {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
336 {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
337 {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
338 {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
339 {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
340 {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
341 {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
342 {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
343 {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
344};
345
346static std::optional<char32_t>
348 for (auto &&Item : GeneratedNamesDataTable) {
349 Buffer.clear();
350 std::size_t Consummed = 0;
351 char NameStart = 0, NeedleStart = 0;
352 bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
353 NameStart, NeedleStart, /*isPrefix*/ true);
354 if (!DoesStartWith)
355 continue;
356 auto Number = Name.substr(Consummed);
357 unsigned long long V = 0;
358 // Be consistent about mandating upper casing.
359 if (Strict &&
360 llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
361 return {};
362 if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
363 continue;
364 if (!Strict) {
365 Buffer.append(Item.Prefix);
366 Buffer.append(utohexstr(V, true));
367 }
368 return V;
369 }
370 return std::nullopt;
371}
372
373static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
374 BufferType &Buffer) {
375 if (Name.empty())
376 return std::nullopt;
377
378 std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
379 if (!Res)
380 Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
381 if (Res)
382 return *Res;
383
384 Buffer.clear();
385 Node Node;
386 bool Matches;
388 std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
389 if (Matches) {
390 std::reverse(Buffer.begin(), Buffer.end());
391 // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
392 // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
393 if (!Strict && Value == 0x116c &&
394 Name.find_insensitive("O-E") != StringRef::npos) {
395 Buffer = "HANGUL JUNGSEONG O-E";
396 Value = 0x1180;
397 }
398 return Value;
399 }
400 return std::nullopt;
401}
402
403std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
404
405 BufferType Buffer;
406 auto Opt = nameToCodepoint(Name, true, Buffer);
407 return Opt;
408}
409
410std::optional<LooseMatchingResult>
412 BufferType Buffer;
413 auto Opt = nameToCodepoint(Name, false, Buffer);
414 if (!Opt)
415 return std::nullopt;
416 return LooseMatchingResult{*Opt, Buffer};
417}
418
419// Find the unicode character whose editing distance to Pattern
420// is shortest, using the Wagner–Fischer algorithm.
422nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
423 // We maintain a fixed size vector of matches,
424 // sorted by distance
425 // The worst match (with the biggest distance) are discarded when new elements
426 // are added.
427 std::size_t LargestEditDistance = 0;
429 Matches.reserve(MaxMatchesCount + 1);
430
431 auto Insert = [&](const Node &Node, uint32_t Distance,
432 char32_t Value) -> bool {
433 if (Distance > LargestEditDistance) {
434 if (Matches.size() == MaxMatchesCount)
435 return false;
436 LargestEditDistance = Distance;
437 }
438 // To avoid allocations, the creation of the name is delayed
439 // as much as possible.
440 std::string Name;
441 auto GetName = [&] {
442 if (Name.empty())
443 Name = Node.fullName();
444 return Name;
445 };
446
447 auto It = llvm::lower_bound(
448 Matches, Distance,
449 [&](const MatchForCodepointName &a, std::size_t Distance) {
450 if (Distance == a.Distance)
451 return a.Name < GetName();
452 return a.Distance < Distance;
453 });
454 if (It == Matches.end() && Matches.size() == MaxMatchesCount)
455 return false;
456
457 MatchForCodepointName M{GetName(), Distance, Value};
458 Matches.insert(It, std::move(M));
459 if (Matches.size() > MaxMatchesCount)
460 Matches.pop_back();
461 return true;
462 };
463
464 // We ignore case, space, hyphens, etc,
465 // in both the search pattern and the prospective names.
466 auto Normalize = [](StringRef Name) {
467 std::string Out;
468 Out.reserve(Name.size());
469 for (char C : Name) {
470 if (isAlnum(C))
471 Out.push_back(toUpper(C));
472 }
473 return Out;
474 };
475 std::string NormalizedName = Normalize(Pattern);
476
477 // Allocate a matrix big enough for longest names.
478 const std::size_t Columns =
479 std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
480 1;
481
482 LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
484
485 std::vector<char> Distances(
486 Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
487
488 auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
489 assert(Column < Columns);
490 assert(Row < Rows);
491 return Distances[Row * Columns + Column];
492 };
493
494 for (std::size_t I = 0; I < Columns; I++)
495 Get(I, 0) = I;
496
497 // Visit the childrens,
498 // Filling (and overriding) the matrix for the name fragment of each node
499 // iteratively. CompleteName is used to collect the actual name of potential
500 // match, respecting case and spacing.
501 auto VisitNode = [&](const Node &N, std::size_t Row,
502 auto &VisitNode) -> void {
503 std::size_t J = 0;
504 for (; J < N.Name.size(); J++) {
505 if (!isAlnum(N.Name[J]))
506 continue;
507
508 Get(0, Row) = Row;
509
510 for (std::size_t I = 1; I < Columns; I++) {
511 const int Delete = Get(I - 1, Row) + 1;
512 const int Insert = Get(I, Row - 1) + 1;
513
514 const int Replace =
515 Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
516
517 Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
518 }
519
520 Row++;
521 }
522
523 unsigned Cost = Get(Columns - 1, Row - 1);
524 if (N.Value != 0xFFFFFFFF) {
525 Insert(N, Cost, N.Value);
526 }
527
528 if (N.hasChildren()) {
529 auto ChildOffset = N.ChildrenOffset;
530 for (;;) {
531 Node C = readNode(ChildOffset, &N);
532 ChildOffset += C.Size;
533 if (!C.isValid())
534 break;
535 VisitNode(C, Row, VisitNode);
536 if (!C.HasSibling)
537 break;
538 }
539 }
540 };
541
542 Node Root = createRoot();
543 VisitNode(Root, 1, VisitNode);
544 return Matches;
545}
546
547} // namespace unicode
548
549} // namespace sys
550} // namespace llvm
ReachingDefAnalysis InstSet InstSet & Ignore
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:172
std::string Name
uint64_t Size
#define I(x, y, z)
Definition: MD5.cpp:58
#define H(x, y, z)
Definition: MD5.cpp:57
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
@ Normalize
Normalize - Normalize according to the given loops.
This file contains some functions that are useful when dealing with strings.
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
size_t size() const
Definition: SmallVector.h:91
void reserve(size_type N)
Definition: SmallVector.h:667
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
iterator begin() const
Definition: StringRef.h:111
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
iterator end() const
Definition: StringRef.h:113
static constexpr size_t npos
Definition: StringRef.h:52
LLVM Value Representation.
Definition: Value.h:74
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
static std::tuple< Node, bool, uint32_t > compareNode(uint32_t Offset, StringRef Name, bool Strict, char PreviousCharInName, char PreviousCharInNeedle, BufferType &Buffer, const Node *Parent=nullptr)
static Node readNode(uint32_t Offset, const Node *Parent=nullptr)
constexpr const uint32_t TCount
const std::size_t UnicodeNameToCodepointLargestNameSize
std::optional< char32_t > nameToCodepointStrict(StringRef Name)
Maps the name or the alias of a Unicode character to its associated codepoints.
SmallVector< MatchForCodepointName > nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount)
const std::size_t UnicodeNameToCodepointIndexSize
constexpr const char *const HangulSyllables[][3]
std::optional< LooseMatchingResult > nameToCodepointLooseMatching(StringRef Name)
constexpr const uint32_t LCount
static std::optional< char32_t > nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer)
constexpr const uint32_t VCount
static const GeneratedNamesData GeneratedNamesDataTable[]
static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column)
static std::optional< char32_t > nameToCodepoint(StringRef Name, bool Strict, BufferType &Buffer)
static std::optional< char32_t > nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer)
constexpr const char32_t SBase
static bool startsWith(StringRef Name, StringRef Needle, bool Strict, std::size_t &Consummed, char &PreviousCharInName, char &PreviousCharInNeedle, bool IsPrefix=false)
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:406
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
detail::ValueMatchesPoly< M > HasValue(M Matcher)
Definition: Error.h:221
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1923
bool getAsUnsignedInteger(StringRef Str, unsigned Radix, unsigned long long &Result)
Helper functions for StringRef::getAsInteger.
Definition: StringRef.cpp:492
#define N
constexpr bool isValid() const
constexpr bool hasChildren() const