LLVM  16.0.0git
Unicode.cpp
Go to the documentation of this file.
1 //===- llvm/Support/Unicode.cpp - Unicode character properties -*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions that allow querying certain properties of
10 // Unicode characters.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "llvm/Support/Unicode.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/UnicodeCharRanges.h"
17 
18 namespace llvm {
19 namespace sys {
20 namespace unicode {
21 
22 /// Unicode code points of the categories L, M, N, P, S and Zs are considered
23 /// printable.
24 /// In addition, U+00AD SOFT HYPHEN is also considered printable, as
25 /// it's actually displayed on most terminals. \return true if the character is
26 /// considered printable.
27 bool isPrintable(int UCS) {
28  // https://unicode.org/Public/14.0.0/ucdxml/
29  static const UnicodeCharRange PrintableRanges[] = {
30  {0x0020, 0x007E}, {0x00A0, 0x00AC}, {0x00AE, 0x0377},
31  {0x037A, 0x037F}, {0x0384, 0x038A}, {0x038C, 0x038C},
32  {0x038E, 0x03A1}, {0x03A3, 0x052F}, {0x0531, 0x0556},
33  {0x0559, 0x058A}, {0x058D, 0x058F}, {0x0591, 0x05C7},
34  {0x05D0, 0x05EA}, {0x05EF, 0x05F4}, {0x0606, 0x061B},
35  {0x061D, 0x06DC}, {0x06DE, 0x070D}, {0x0710, 0x074A},
36  {0x074D, 0x07B1}, {0x07C0, 0x07FA}, {0x07FD, 0x082D},
37  {0x0830, 0x083E}, {0x0840, 0x085B}, {0x085E, 0x085E},
38  {0x0860, 0x086A}, {0x0870, 0x088E}, {0x0898, 0x08E1},
39  {0x08E3, 0x0983}, {0x0985, 0x098C}, {0x098F, 0x0990},
40  {0x0993, 0x09A8}, {0x09AA, 0x09B0}, {0x09B2, 0x09B2},
41  {0x09B6, 0x09B9}, {0x09BC, 0x09C4}, {0x09C7, 0x09C8},
42  {0x09CB, 0x09CE}, {0x09D7, 0x09D7}, {0x09DC, 0x09DD},
43  {0x09DF, 0x09E3}, {0x09E6, 0x09FE}, {0x0A01, 0x0A03},
44  {0x0A05, 0x0A0A}, {0x0A0F, 0x0A10}, {0x0A13, 0x0A28},
45  {0x0A2A, 0x0A30}, {0x0A32, 0x0A33}, {0x0A35, 0x0A36},
46  {0x0A38, 0x0A39}, {0x0A3C, 0x0A3C}, {0x0A3E, 0x0A42},
47  {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A51, 0x0A51},
48  {0x0A59, 0x0A5C}, {0x0A5E, 0x0A5E}, {0x0A66, 0x0A76},
49  {0x0A81, 0x0A83}, {0x0A85, 0x0A8D}, {0x0A8F, 0x0A91},
50  {0x0A93, 0x0AA8}, {0x0AAA, 0x0AB0}, {0x0AB2, 0x0AB3},
51  {0x0AB5, 0x0AB9}, {0x0ABC, 0x0AC5}, {0x0AC7, 0x0AC9},
52  {0x0ACB, 0x0ACD}, {0x0AD0, 0x0AD0}, {0x0AE0, 0x0AE3},
53  {0x0AE6, 0x0AF1}, {0x0AF9, 0x0AFF}, {0x0B01, 0x0B03},
54  {0x0B05, 0x0B0C}, {0x0B0F, 0x0B10}, {0x0B13, 0x0B28},
55  {0x0B2A, 0x0B30}, {0x0B32, 0x0B33}, {0x0B35, 0x0B39},
56  {0x0B3C, 0x0B44}, {0x0B47, 0x0B48}, {0x0B4B, 0x0B4D},
57  {0x0B55, 0x0B57}, {0x0B5C, 0x0B5D}, {0x0B5F, 0x0B63},
58  {0x0B66, 0x0B77}, {0x0B82, 0x0B83}, {0x0B85, 0x0B8A},
59  {0x0B8E, 0x0B90}, {0x0B92, 0x0B95}, {0x0B99, 0x0B9A},
60  {0x0B9C, 0x0B9C}, {0x0B9E, 0x0B9F}, {0x0BA3, 0x0BA4},
61  {0x0BA8, 0x0BAA}, {0x0BAE, 0x0BB9}, {0x0BBE, 0x0BC2},
62  {0x0BC6, 0x0BC8}, {0x0BCA, 0x0BCD}, {0x0BD0, 0x0BD0},
63  {0x0BD7, 0x0BD7}, {0x0BE6, 0x0BFA}, {0x0C00, 0x0C0C},
64  {0x0C0E, 0x0C10}, {0x0C12, 0x0C28}, {0x0C2A, 0x0C39},
65  {0x0C3C, 0x0C44}, {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D},
66  {0x0C55, 0x0C56}, {0x0C58, 0x0C5A}, {0x0C5D, 0x0C5D},
67  {0x0C60, 0x0C63}, {0x0C66, 0x0C6F}, {0x0C77, 0x0C8C},
68  {0x0C8E, 0x0C90}, {0x0C92, 0x0CA8}, {0x0CAA, 0x0CB3},
69  {0x0CB5, 0x0CB9}, {0x0CBC, 0x0CC4}, {0x0CC6, 0x0CC8},
70  {0x0CCA, 0x0CCD}, {0x0CD5, 0x0CD6}, {0x0CDD, 0x0CDE},
71  {0x0CE0, 0x0CE3}, {0x0CE6, 0x0CEF}, {0x0CF1, 0x0CF2},
72  {0x0D00, 0x0D0C}, {0x0D0E, 0x0D10}, {0x0D12, 0x0D44},
73  {0x0D46, 0x0D48}, {0x0D4A, 0x0D4F}, {0x0D54, 0x0D63},
74  {0x0D66, 0x0D7F}, {0x0D81, 0x0D83}, {0x0D85, 0x0D96},
75  {0x0D9A, 0x0DB1}, {0x0DB3, 0x0DBB}, {0x0DBD, 0x0DBD},
76  {0x0DC0, 0x0DC6}, {0x0DCA, 0x0DCA}, {0x0DCF, 0x0DD4},
77  {0x0DD6, 0x0DD6}, {0x0DD8, 0x0DDF}, {0x0DE6, 0x0DEF},
78  {0x0DF2, 0x0DF4}, {0x0E01, 0x0E3A}, {0x0E3F, 0x0E5B},
79  {0x0E81, 0x0E82}, {0x0E84, 0x0E84}, {0x0E86, 0x0E8A},
80  {0x0E8C, 0x0EA3}, {0x0EA5, 0x0EA5}, {0x0EA7, 0x0EBD},
81  {0x0EC0, 0x0EC4}, {0x0EC6, 0x0EC6}, {0x0EC8, 0x0ECD},
82  {0x0ED0, 0x0ED9}, {0x0EDC, 0x0EDF}, {0x0F00, 0x0F47},
83  {0x0F49, 0x0F6C}, {0x0F71, 0x0F97}, {0x0F99, 0x0FBC},
84  {0x0FBE, 0x0FCC}, {0x0FCE, 0x0FDA}, {0x1000, 0x10C5},
85  {0x10C7, 0x10C7}, {0x10CD, 0x10CD}, {0x10D0, 0x1248},
86  {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258},
87  {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D},
88  {0x1290, 0x12B0}, {0x12B2, 0x12B5}, {0x12B8, 0x12BE},
89  {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6},
90  {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A},
91  {0x135D, 0x137C}, {0x1380, 0x1399}, {0x13A0, 0x13F5},
92  {0x13F8, 0x13FD}, {0x1400, 0x169C}, {0x16A0, 0x16F8},
93  {0x1700, 0x1715}, {0x171F, 0x1736}, {0x1740, 0x1753},
94  {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1772, 0x1773},
95  {0x1780, 0x17DD}, {0x17E0, 0x17E9}, {0x17F0, 0x17F9},
96  {0x1800, 0x180D}, {0x180F, 0x1819}, {0x1820, 0x1878},
97  {0x1880, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E},
98  {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1940, 0x1940},
99  {0x1944, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
100  {0x19B0, 0x19C9}, {0x19D0, 0x19DA}, {0x19DE, 0x1A1B},
101  {0x1A1E, 0x1A5E}, {0x1A60, 0x1A7C}, {0x1A7F, 0x1A89},
102  {0x1A90, 0x1A99}, {0x1AA0, 0x1AAD}, {0x1AB0, 0x1ACE},
103  {0x1B00, 0x1B4C}, {0x1B50, 0x1B7E}, {0x1B80, 0x1BF3},
104  {0x1BFC, 0x1C37}, {0x1C3B, 0x1C49}, {0x1C4D, 0x1C88},
105  {0x1C90, 0x1CBA}, {0x1CBD, 0x1CC7}, {0x1CD0, 0x1CFA},
106  {0x1D00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45},
107  {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59},
108  {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D},
109  {0x1F80, 0x1FB4}, {0x1FB6, 0x1FC4}, {0x1FC6, 0x1FD3},
110  {0x1FD6, 0x1FDB}, {0x1FDD, 0x1FEF}, {0x1FF2, 0x1FF4},
111  {0x1FF6, 0x1FFE}, {0x2000, 0x200A}, {0x2010, 0x2027},
112  {0x202F, 0x205F}, {0x2070, 0x2071}, {0x2074, 0x208E},
113  {0x2090, 0x209C}, {0x20A0, 0x20C0}, {0x20D0, 0x20F0},
114  {0x2100, 0x218B}, {0x2190, 0x2426}, {0x2440, 0x244A},
115  {0x2460, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2CF3},
116  {0x2CF9, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D},
117  {0x2D30, 0x2D67}, {0x2D6F, 0x2D70}, {0x2D7F, 0x2D96},
118  {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6},
119  {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE},
120  {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2DE0, 0x2E5D},
121  {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, {0x2F00, 0x2FD5},
122  {0x2FF0, 0x2FFB}, {0x3000, 0x303F}, {0x3041, 0x3096},
123  {0x3099, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E},
124  {0x3190, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0xA48C},
125  {0xA490, 0xA4C6}, {0xA4D0, 0xA62B}, {0xA640, 0xA6F7},
126  {0xA700, 0xA7CA}, {0xA7D0, 0xA7D1}, {0xA7D3, 0xA7D3},
127  {0xA7D5, 0xA7D9}, {0xA7F2, 0xA82C}, {0xA830, 0xA839},
128  {0xA840, 0xA877}, {0xA880, 0xA8C5}, {0xA8CE, 0xA8D9},
129  {0xA8E0, 0xA953}, {0xA95F, 0xA97C}, {0xA980, 0xA9CD},
130  {0xA9CF, 0xA9D9}, {0xA9DE, 0xA9FE}, {0xAA00, 0xAA36},
131  {0xAA40, 0xAA4D}, {0xAA50, 0xAA59}, {0xAA5C, 0xAAC2},
132  {0xAADB, 0xAAF6}, {0xAB01, 0xAB06}, {0xAB09, 0xAB0E},
133  {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E},
134  {0xAB30, 0xAB6B}, {0xAB70, 0xABED}, {0xABF0, 0xABF9},
135  {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
136  {0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06},
137  {0xFB13, 0xFB17}, {0xFB1D, 0xFB36}, {0xFB38, 0xFB3C},
138  {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
139  {0xFB46, 0xFBC2}, {0xFBD3, 0xFD8F}, {0xFD92, 0xFDC7},
140  {0xFDCF, 0xFDCF}, {0xFDF0, 0xFE19}, {0xFE20, 0xFE52},
141  {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFE70, 0xFE74},
142  {0xFE76, 0xFEFC}, {0xFF01, 0xFFBE}, {0xFFC2, 0xFFC7},
143  {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC},
144  {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD},
145  {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A},
146  {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D},
147  {0x10080, 0x100FA}, {0x10100, 0x10102}, {0x10107, 0x10133},
148  {0x10137, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0},
149  {0x101D0, 0x101FD}, {0x10280, 0x1029C}, {0x102A0, 0x102D0},
150  {0x102E0, 0x102FB}, {0x10300, 0x10323}, {0x1032D, 0x1034A},
151  {0x10350, 0x1037A}, {0x10380, 0x1039D}, {0x1039F, 0x103C3},
152  {0x103C8, 0x103D5}, {0x10400, 0x1049D}, {0x104A0, 0x104A9},
153  {0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527},
154  {0x10530, 0x10563}, {0x1056F, 0x1057A}, {0x1057C, 0x1058A},
155  {0x1058C, 0x10592}, {0x10594, 0x10595}, {0x10597, 0x105A1},
156  {0x105A3, 0x105B1}, {0x105B3, 0x105B9}, {0x105BB, 0x105BC},
157  {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767},
158  {0x10780, 0x10785}, {0x10787, 0x107B0}, {0x107B2, 0x107BA},
159  {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
160  {0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855},
161  {0x10857, 0x1089E}, {0x108A7, 0x108AF}, {0x108E0, 0x108F2},
162  {0x108F4, 0x108F5}, {0x108FB, 0x1091B}, {0x1091F, 0x10939},
163  {0x1093F, 0x1093F}, {0x10980, 0x109B7}, {0x109BC, 0x109CF},
164  {0x109D2, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A13},
165  {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A38, 0x10A3A},
166  {0x10A3F, 0x10A48}, {0x10A50, 0x10A58}, {0x10A60, 0x10A9F},
167  {0x10AC0, 0x10AE6}, {0x10AEB, 0x10AF6}, {0x10B00, 0x10B35},
168  {0x10B39, 0x10B55}, {0x10B58, 0x10B72}, {0x10B78, 0x10B91},
169  {0x10B99, 0x10B9C}, {0x10BA9, 0x10BAF}, {0x10C00, 0x10C48},
170  {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10CFA, 0x10D27},
171  {0x10D30, 0x10D39}, {0x10E60, 0x10E7E}, {0x10E80, 0x10EA9},
172  {0x10EAB, 0x10EAD}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F27},
173  {0x10F30, 0x10F59}, {0x10F70, 0x10F89}, {0x10FB0, 0x10FCB},
174  {0x10FE0, 0x10FF6}, {0x11000, 0x1104D}, {0x11052, 0x11075},
175  {0x1107F, 0x110BC}, {0x110BE, 0x110C2}, {0x110D0, 0x110E8},
176  {0x110F0, 0x110F9}, {0x11100, 0x11134}, {0x11136, 0x11147},
177  {0x11150, 0x11176}, {0x11180, 0x111DF}, {0x111E1, 0x111F4},
178  {0x11200, 0x11211}, {0x11213, 0x1123E}, {0x11280, 0x11286},
179  {0x11288, 0x11288}, {0x1128A, 0x1128D}, {0x1128F, 0x1129D},
180  {0x1129F, 0x112A9}, {0x112B0, 0x112EA}, {0x112F0, 0x112F9},
181  {0x11300, 0x11303}, {0x11305, 0x1130C}, {0x1130F, 0x11310},
182  {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333},
183  {0x11335, 0x11339}, {0x1133B, 0x11344}, {0x11347, 0x11348},
184  {0x1134B, 0x1134D}, {0x11350, 0x11350}, {0x11357, 0x11357},
185  {0x1135D, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374},
186  {0x11400, 0x1145B}, {0x1145D, 0x11461}, {0x11480, 0x114C7},
187  {0x114D0, 0x114D9}, {0x11580, 0x115B5}, {0x115B8, 0x115DD},
188  {0x11600, 0x11644}, {0x11650, 0x11659}, {0x11660, 0x1166C},
189  {0x11680, 0x116B9}, {0x116C0, 0x116C9}, {0x11700, 0x1171A},
190  {0x1171D, 0x1172B}, {0x11730, 0x11746}, {0x11800, 0x1183B},
191  {0x118A0, 0x118F2}, {0x118FF, 0x11906}, {0x11909, 0x11909},
192  {0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x11935},
193  {0x11937, 0x11938}, {0x1193B, 0x11946}, {0x11950, 0x11959},
194  {0x119A0, 0x119A7}, {0x119AA, 0x119D7}, {0x119DA, 0x119E4},
195  {0x11A00, 0x11A47}, {0x11A50, 0x11AA2}, {0x11AB0, 0x11AF8},
196  {0x11C00, 0x11C08}, {0x11C0A, 0x11C36}, {0x11C38, 0x11C45},
197  {0x11C50, 0x11C6C}, {0x11C70, 0x11C8F}, {0x11C92, 0x11CA7},
198  {0x11CA9, 0x11CB6}, {0x11D00, 0x11D06}, {0x11D08, 0x11D09},
199  {0x11D0B, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D},
200  {0x11D3F, 0x11D47}, {0x11D50, 0x11D59}, {0x11D60, 0x11D65},
201  {0x11D67, 0x11D68}, {0x11D6A, 0x11D8E}, {0x11D90, 0x11D91},
202  {0x11D93, 0x11D98}, {0x11DA0, 0x11DA9}, {0x11EE0, 0x11EF8},
203  {0x11FB0, 0x11FB0}, {0x11FC0, 0x11FF1}, {0x11FFF, 0x12399},
204  {0x12400, 0x1246E}, {0x12470, 0x12474}, {0x12480, 0x12543},
205  {0x12F90, 0x12FF2}, {0x13000, 0x1342E}, {0x14400, 0x14646},
206  {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16A60, 0x16A69},
207  {0x16A6E, 0x16ABE}, {0x16AC0, 0x16AC9}, {0x16AD0, 0x16AED},
208  {0x16AF0, 0x16AF5}, {0x16B00, 0x16B45}, {0x16B50, 0x16B59},
209  {0x16B5B, 0x16B61}, {0x16B63, 0x16B77}, {0x16B7D, 0x16B8F},
210  {0x16E40, 0x16E9A}, {0x16F00, 0x16F4A}, {0x16F4F, 0x16F87},
211  {0x16F8F, 0x16F9F}, {0x16FE0, 0x16FE4}, {0x16FF0, 0x16FF1},
212  {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
213  {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE},
214  {0x1B000, 0x1B122}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167},
215  {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C},
216  {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1BC9C, 0x1BC9F},
217  {0x1CF00, 0x1CF2D}, {0x1CF30, 0x1CF46}, {0x1CF50, 0x1CFC3},
218  {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D172},
219  {0x1D17B, 0x1D1EA}, {0x1D200, 0x1D245}, {0x1D2E0, 0x1D2F3},
220  {0x1D300, 0x1D356}, {0x1D360, 0x1D378}, {0x1D400, 0x1D454},
221  {0x1D456, 0x1D49C}, {0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2},
222  {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9},
223  {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505},
224  {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, {0x1D516, 0x1D51C},
225  {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544},
226  {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5},
227  {0x1D6A8, 0x1D7CB}, {0x1D7CE, 0x1DA8B}, {0x1DA9B, 0x1DA9F},
228  {0x1DAA1, 0x1DAAF}, {0x1DF00, 0x1DF1E}, {0x1E000, 0x1E006},
229  {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024},
230  {0x1E026, 0x1E02A}, {0x1E100, 0x1E12C}, {0x1E130, 0x1E13D},
231  {0x1E140, 0x1E149}, {0x1E14E, 0x1E14F}, {0x1E290, 0x1E2AE},
232  {0x1E2C0, 0x1E2F9}, {0x1E2FF, 0x1E2FF}, {0x1E7E0, 0x1E7E6},
233  {0x1E7E8, 0x1E7EB}, {0x1E7ED, 0x1E7EE}, {0x1E7F0, 0x1E7FE},
234  {0x1E800, 0x1E8C4}, {0x1E8C7, 0x1E8D6}, {0x1E900, 0x1E94B},
235  {0x1E950, 0x1E959}, {0x1E95E, 0x1E95F}, {0x1EC71, 0x1ECB4},
236  {0x1ED01, 0x1ED3D}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F},
237  {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
238  {0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39},
239  {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47},
240  {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F},
241  {0x1EE51, 0x1EE52}, {0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57},
242  {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D},
243  {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64},
244  {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, {0x1EE74, 0x1EE77},
245  {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89},
246  {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9},
247  {0x1EEAB, 0x1EEBB}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B},
248  {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF},
249  {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F100, 0x1F1AD},
250  {0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248},
251  {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7},
252  {0x1F6DD, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773},
253  {0x1F780, 0x1F7D8}, {0x1F7E0, 0x1F7EB}, {0x1F7F0, 0x1F7F0},
254  {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859},
255  {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1},
256  {0x1F900, 0x1FA53}, {0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74},
257  {0x1FA78, 0x1FA7C}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAAC},
258  {0x1FAB0, 0x1FABA}, {0x1FAC0, 0x1FAC5}, {0x1FAD0, 0x1FAD9},
259  {0x1FAE0, 0x1FAE7}, {0x1FAF0, 0x1FAF6}, {0x1FB00, 0x1FB92},
260  {0x1FB94, 0x1FBCA}, {0x1FBF0, 0x1FBF9}, {0x20000, 0x2A6DF},
261  {0x2A700, 0x2B738}, {0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1},
262  {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
263  {0xE0100, 0xE01EF}};
264 
265  static const UnicodeCharSet Printables(PrintableRanges);
266  // Clang special cases 0x00AD (SOFT HYPHEN) which is rendered as an actual
267  // hyphen in most terminals.
268  return UCS == 0x00AD || Printables.contains(UCS);
269 }
270 
271 /// Unicode code points of the Cf category are considered
272 /// formatting characters.
273 bool isFormatting(int UCS) {
274 
275  // https://unicode.org/Public/14.0.0/ucdxml/
276  static const UnicodeCharRange Cf[] = {
277  {0x00AD, 0x00AD}, {0x0600, 0x0605}, {0x061C, 0x061C},
278  {0x06DD, 0x06DD}, {0x070F, 0x070F}, {0x0890, 0x0891},
279  {0x08E2, 0x08E2}, {0x180E, 0x180E}, {0x200B, 0x200F},
280  {0x202A, 0x202E}, {0x2060, 0x2064}, {0x2066, 0x206F},
281  {0xFEFF, 0xFEFF}, {0xFFF9, 0xFFFB}, {0x110BD, 0x110BD},
282  {0x110CD, 0x110CD}, {0x13430, 0x13438}, {0x1BCA0, 0x1BCA3},
283  {0x1D173, 0x1D17A}, {0xE0001, 0xE0001}, {0xE0020, 0xE007F}};
284 
285  static const UnicodeCharSet Format(Cf);
286  return Format.contains(UCS);
287 }
288 
289 /// Gets the number of positions a character is likely to occupy when output
290 /// on a terminal ("character width"). This depends on the implementation of the
291 /// terminal, and there's no standard definition of character width.
292 /// The implementation defines it in a way that is expected to be compatible
293 /// with a generic Unicode-capable terminal.
294 /// \return Character width:
295 /// * ErrorNonPrintableCharacter (-1) for non-printable characters (as
296 /// identified by isPrintable);
297 /// * 0 for non-spacing and enclosing combining marks;
298 /// * 2 for CJK characters excluding halfwidth forms;
299 /// * 1 for all remaining characters.
300 static inline int charWidth(int UCS)
301 {
302  if (!isPrintable(UCS))
304 
305  // Sorted list of non-spacing and enclosing combining mark intervals as
306  // defined in "3.6 Combination" of
307  // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
308  static const UnicodeCharRange CombiningCharacterRanges[] = {
309  { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
310  { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
311  { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F },
312  { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 },
313  { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 },
314  { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 },
315  { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
316  { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE },
317  { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C },
318  { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 },
319  { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },
320  { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },
321  { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },
322  { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 },
323  { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 },
324  { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
325  { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
326  { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
327  { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 },
328  { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
329  { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
330  { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC },
331  { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
332  { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
333  { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
334  { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
335  { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
336  { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
337  { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
338  { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
339  { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
340  { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
341  { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
342  { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
343  { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
344  { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
345  { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
346  { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
347  { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
348  { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
349  { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
350  { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
351  { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 },
352  { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
353  { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 },
354  { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB },
355  { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED },
356  { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 },
357  { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 },
358  { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 },
359  { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 },
360  { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
361  { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
362  { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
363  { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
364  { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D },
365  { 0xA947, 0xA951 }, { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 },
366  { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E },
367  { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
368  { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 },
369  { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 },
370  { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 },
371  { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E },
372  { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD },
373  { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
374  { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 },
375  { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 },
376  { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B },
377  { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE },
378  { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
379  { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 },
380  { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
381  { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF },
382  };
383  static const UnicodeCharSet CombiningCharacters(CombiningCharacterRanges);
384 
385  if (CombiningCharacters.contains(UCS))
386  return 0;
387 
388  static const UnicodeCharRange DoubleWidthCharacterRanges[] = {
389  // Hangul Jamo
390  { 0x1100, 0x11FF },
391  // Deprecated fullwidth angle brackets
392  { 0x2329, 0x232A },
393  // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
394  // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
395  { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
396  // Hangul
397  { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
398  // CJK Unified Ideographs
399  { 0xF900, 0xFAFF },
400  // Vertical forms
401  { 0xFE10, 0xFE19 },
402  // CJK Compatibility Forms + Small Form Variants
403  { 0xFE30, 0xFE6F },
404  // Fullwidth forms
405  { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
406  // CJK Unified Ideographs
407  { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
408  };
409  static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges);
410 
411  if (DoubleWidthCharacters.contains(UCS))
412  return 2;
413  return 1;
414 }
415 
/// Returns true when \p c is a printable ASCII character, i.e. its value lies
/// strictly between 31 and 127.
static bool isprintableascii(char c) {
  if (c <= 31)
    return false;
  return c < 127;
}
417 
419  unsigned ColumnWidth = 0;
420  unsigned Length;
421  for (size_t i = 0, e = Text.size(); i < e; i += Length) {
422  Length = getNumBytesForUTF8(Text[i]);
423 
424  // fast path for ASCII characters
425  if (Length == 1) {
426  if (!isprintableascii(Text[i]))
428  ColumnWidth += 1;
429  continue;
430  }
431 
432  if (Length <= 0 || i + Length > Text.size())
433  return ErrorInvalidUTF8;
434  UTF32 buf[1];
435  const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i);
436  UTF32 *Target = &buf[0];
437  if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target,
438  Target + 1, strictConversion))
439  return ErrorInvalidUTF8;
440  int Width = charWidth(buf[0]);
441  if (Width < 0)
443  ColumnWidth += Width;
444  }
445  return ColumnWidth;
446 }
447 
448 } // namespace unicode
449 } // namespace sys
450 } // namespace llvm
451 
i
i
Definition: README.txt:29
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::conversionOK
@ conversionOK
Definition: ConvertUTF.h:149
llvm::sys::unicode::ErrorInvalidUTF8
@ ErrorInvalidUTF8
Definition: Unicode.h:28
llvm::Target
Target - Wrapper for Target specific information.
Definition: TargetRegistry.h:145
llvm::StringRef::data
const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:123
llvm::sys::unicode::isprintableascii
static bool isprintableascii(char c)
Definition: Unicode.cpp:416
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
llvm::sys::UnicodeCharRange
Represents a closed range of Unicode code points [Lower, Upper].
Definition: UnicodeCharRanges.h:23
llvm::sys::UnicodeCharSet
Holds a reference to an ordered array of UnicodeCharRange and allows to quickly check if a code point...
Definition: UnicodeCharRanges.h:38
llvm::sys::unicode::isFormatting
bool isFormatting(int UCS)
Unicode code points of the Cf category are considered formatting characters.
Definition: Unicode.cpp:273
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:53
UnicodeCharRanges.h
llvm::ConvertUTF8toUTF32
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:736
llvm::sys::UnicodeCharSet::contains
bool contains(uint32_t C) const
Returns true if the character set contains the Unicode code point C.
Definition: UnicodeCharRanges.h:64
llvm::sys::unicode::columnWidthUTF8
int columnWidthUTF8(StringRef Text)
Gets the number of positions the UTF8-encoded Text is likely to occupy when output on a terminal ("ch...
Definition: Unicode.cpp:418
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
llvm::strictConversion
@ strictConversion
Definition: ConvertUTF.h:156
llvm::sys::unicode::charWidth
static int charWidth(int UCS)
Gets the number of positions a character is likely to occupy when output on a terminal ("character wi...
Definition: Unicode.cpp:300
llvm::getNumBytesForUTF8
unsigned getNumBytesForUTF8(UTF8 firstByte)
Definition: ConvertUTF.cpp:545
Unicode.h
ConvertUTF.h
llvm::StringRef::size
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:129
llvm::AMDGPU::Hwreg::Width
Width
Definition: SIDefines.h:439
llvm::UTF32
unsigned int UTF32
Definition: ConvertUTF.h:128
llvm::sys::unicode::ErrorNonPrintableCharacter
@ ErrorNonPrintableCharacter
Definition: Unicode.h:29
llvm::sys::unicode::isPrintable
bool isPrintable(int UCS)
Determines if a character is likely to be displayed correctly on the terminal.
Definition: Unicode.cpp:27
llvm::UTF8
unsigned char UTF8
Definition: ConvertUTF.h:130