Classes
struct	GeneratedNamesData

struct	LooseMatchingResult

struct	MatchForCodepointName

struct	Node

Typedefs
using	BufferType = SmallString< 64 >

Enumerations
enum	ColumnWidthErrors { ErrorInvalidUTF8 = -2 , ErrorNonPrintableCharacter = -1 }

Functions
bool	isPrintable (int UCS)
	Determines if a character is likely to be displayed correctly on the terminal.

bool	isFormatting (int UCS)
	Unicode code points of the Cf category are considered formatting characters.

int	columnWidthUTF8 (StringRef Text)
	Gets the number of positions the UTF8-encoded `Text` is likely to occupy when output on a terminal ("character width").

int	foldCharSimple (int C)
	Folds the input Unicode character according to the Unicode simple case folding rules.

std::optional< char32_t >	nameToCodepointStrict (StringRef Name)
	Maps the name or the alias of a Unicode character to its associated codepoint.

std::optional< LooseMatchingResult >	nameToCodepointLooseMatching (StringRef Name)

SmallVector< MatchForCodepointName >	nearestMatchesForCodepointName (StringRef Pattern, std::size_t MaxMatchesCount)

static int	charWidth (int UCS)
	Gets the number of positions a character is likely to occupy when output on a terminal ("character width").

static bool	isprintableascii (char c)

static Node	createRoot ()

static Node	readNode (uint32_t Offset, const Node *Parent=nullptr)

static bool	startsWith (StringRef Name, StringRef Needle, bool Strict, std::size_t &Consummed, char &PreviousCharInName, bool IsPrefix=false)

static std::tuple< Node, bool, uint32_t >	compareNode (uint32_t Offset, StringRef Name, bool Strict, char PreviousCharInName, BufferType &Buffer, const Node *Parent=nullptr)

static std::tuple< Node, bool, uint32_t >	compareNode (uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer)

static std::size_t	findSyllable (StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column)

static std::optional< char32_t >	nameToHangulCodePoint (StringRef Name, bool Strict, BufferType &Buffer)

static std::optional< char32_t >	nameToGeneratedCodePoint (StringRef Name, bool Strict, BufferType &Buffer)

static std::optional< char32_t >	nameToCodepoint (StringRef Name, bool Strict, BufferType &Buffer)

Variables
const char *	UnicodeNameToCodepointDict

const uint8_t *	UnicodeNameToCodepointIndex = UnicodeNameToCodepointIndex_

const std::size_t	UnicodeNameToCodepointIndexSize = 242258

const std::size_t	UnicodeNameToCodepointLargestNameSize = 74

constexpr const char *const	HangulSyllables [][3]

constexpr const char32_t	SBase = 0xAC00

constexpr const uint32_t	LCount = 19

constexpr const uint32_t	VCount = 21

constexpr const uint32_t	TCount = 28

static const GeneratedNamesData	GeneratedNamesDataTable []

uint8_t	UnicodeNameToCodepointIndex_ [242258]

Typedef Documentation

◆ BufferType

using llvm::sys::unicode::BufferType = SmallString<64>

Definition at line 29 of file UnicodeNameToCodepoint.cpp.

Enumeration Type Documentation

◆ ColumnWidthErrors

enum llvm::sys::unicode::ColumnWidthErrors

Enumerator
ErrorInvalidUTF8
ErrorNonPrintableCharacter

Definition at line 27 of file Unicode.h.

Function Documentation

◆ charWidth()

static int llvm::sys::unicode::charWidth ( int UCS )

inline static

Gets the number of positions a character is likely to occupy when output on a terminal ("character width").

This depends on the implementation of the terminal, and there's no standard definition of character width. The implementation defines it in a way that is expected to be compatible with a generic Unicode-capable terminal.

Returns

Character width:

ErrorNonPrintableCharacter (-1) for non-printable characters (as identified by isPrintable);
0 for non-spacing and enclosing combining marks;
2 for CJK characters excluding halfwidth forms;
1 for all remaining characters.

Definition at line 304 of file Unicode.cpp.

References llvm::sys::UnicodeCharSet::contains(), ErrorNonPrintableCharacter, and isPrintable().

Referenced by columnWidthUTF8().

◆ columnWidthUTF8()

int llvm::sys::unicode::columnWidthUTF8 ( StringRef Text )

Gets the number of positions the UTF8-encoded Text is likely to occupy when output on a terminal ("character width").

This depends on the implementation of the terminal, and there's no standard definition of character width.

The implementation defines it in a way that is expected to be compatible with a generic Unicode-capable terminal.

Returns

Character width:

ErrorNonPrintableCharacter (-1) if Text contains non-printable characters (as identified by isPrintable);
0 for each non-spacing and enclosing combining mark;
2 for each CJK character excluding halfwidth forms;
1 for each of the remaining characters.

Definition at line 481 of file Unicode.cpp.

References charWidth(), llvm::conversionOK, llvm::ConvertUTF8toUTF32(), ErrorInvalidUTF8, ErrorNonPrintableCharacter, llvm::getNumBytesForUTF8(), isprintableascii(), llvm::Length, and llvm::strictConversion.

Referenced by llvm::sys::locale::columnWidth().

◆ compareNode() [1/2]

static std::tuple< Node, bool, uint32_t > llvm::sys::unicode::compareNode	(	uint32_t	Offset,
		StringRef	Name,
		bool	Strict,
		BufferType &	Buffer
	)

static

Definition at line 215 of file UnicodeNameToCodepoint.cpp.

References compareNode(), Name, and llvm::Offset.

◆ compareNode() [2/2]

static std::tuple< Node, bool, uint32_t > llvm::sys::unicode::compareNode	(	uint32_t	Offset,
		StringRef	Name,
		bool	Strict,
		char	PreviousCharInName,
		BufferType &	Buffer,
		const Node *	Parent = `nullptr`
	)

static

Definition at line 179 of file UnicodeNameToCodepoint.cpp.

References llvm::CallingConv::C, compareNode(), N, Name, llvm::Offset, readNode(), and startsWith().

Referenced by compareNode(), and nameToCodepoint().

◆ createRoot()

static Node llvm::sys::unicode::createRoot ( )

static

Definition at line 61 of file UnicodeNameToCodepoint.cpp.

References N.

Referenced by nearestMatchesForCodepointName(), and readNode().

◆ findSyllable()

static std::size_t llvm::sys::unicode::findSyllable	(	StringRef	Name,
		bool	Strict,
		char &	PreviousInName,
		int &	Pos,
		int	Column
	)

static

Definition at line 259 of file UnicodeNameToCodepoint.cpp.

References assert(), HangulSyllables, I, LCount, Name, llvm::StringRef::size(), startsWith(), TCount, and VCount.

Referenced by nameToHangulCodePoint().

◆ foldCharSimple()

int llvm::sys::unicode::foldCharSimple ( int C )

Folds the input Unicode character according to the Unicode simple case folding rules.

Definition at line 16 of file UnicodeCaseFold.cpp.

References C.

Referenced by foldCharDwarf().

◆ isFormatting()

bool llvm::sys::unicode::isFormatting ( int UCS )

Unicode code points of the Cf category are considered formatting characters.

Definition at line 277 of file Unicode.cpp.

References llvm::Format.

◆ isPrintable()

bool llvm::sys::unicode::isPrintable ( int UCS )

Determines if a character is likely to be displayed correctly on the terminal.

Unicode code points of the categories L, M, N, P, S and Zs are considered printable.

An exact implementation would have to depend on the specific terminal, so we define semantics that should be suitable for the generic case of a terminal capable of outputting Unicode characters.

Printable codepoints are those in the categories L, M, N, P, S and Zs.

Returns: true if the character is considered printable.

In addition, U+00AD SOFT HYPHEN is also considered printable, as it's actually displayed on most terminals.

Returns: true if the character is considered printable.

Definition at line 27 of file Unicode.cpp.

References llvm::sys::UnicodeCharSet::contains().

Referenced by charWidth(), llvm::yaml::escape(), and llvm::sys::locale::isPrint().

◆ isprintableascii()

static bool llvm::sys::unicode::isprintableascii ( char c )

static

Definition at line 479 of file Unicode.cpp.

Referenced by columnWidthUTF8().

◆ nameToCodepoint()

static std::optional< char32_t > llvm::sys::unicode::nameToCodepoint	(	StringRef	Name,
		bool	Strict,
		BufferType &	Buffer
	)

static

Definition at line 371 of file UnicodeNameToCodepoint.cpp.

References llvm::SmallVectorTemplateCommon< T, typename >::begin(), llvm::SmallVectorImpl< T >::clear(), compareNode(), llvm::SmallVectorTemplateCommon< T, typename >::end(), Name, nameToGeneratedCodePoint(), and nameToHangulCodePoint().

Referenced by nameToCodepointLooseMatching(), and nameToCodepointStrict().

◆ nameToCodepointLooseMatching()

std::optional< LooseMatchingResult > llvm::sys::unicode::nameToCodepointLooseMatching ( StringRef Name )

Definition at line 408 of file UnicodeNameToCodepoint.cpp.

References Name, and nameToCodepoint().

◆ nameToCodepointStrict()

std::optional< char32_t > llvm::sys::unicode::nameToCodepointStrict ( StringRef Name )

Maps the name or the alias of a Unicode character to its associated codepoint.

The names and aliases are derived from UnicodeData.txt and NameAliases.txt. For compatibility with the semantics of named character escape sequences in C++, this mapping performs an exact match that is sensitive to casing and spacing.

Returns: The codepoint of the corresponding character, if any.

Definition at line 400 of file UnicodeNameToCodepoint.cpp.

References Name, and nameToCodepoint().

◆ nameToGeneratedCodePoint()

static std::optional< char32_t > llvm::sys::unicode::nameToGeneratedCodePoint	(	StringRef	Name,
		bool	Strict,
		BufferType &	Buffer
	)

static

Definition at line 345 of file UnicodeNameToCodepoint.cpp.

References llvm::any_of(), llvm::SmallString< InternalLen >::append(), llvm::CallingConv::C, llvm::SmallVectorImpl< T >::clear(), GeneratedNamesDataTable, llvm::getAsUnsignedInteger(), Name, llvm::Number, and startsWith().

Referenced by nameToCodepoint().

◆ nameToHangulCodePoint()

static std::optional< char32_t > llvm::sys::unicode::nameToHangulCodePoint	(	StringRef	Name,
		bool	Strict,
		BufferType &	Buffer
	)

static

Definition at line 286 of file UnicodeNameToCodepoint.cpp.

References llvm::SmallString< InternalLen >::append(), llvm::SmallVectorImpl< T >::clear(), findSyllable(), HangulSyllables, Name, SBase, startsWith(), TCount, and VCount.

Referenced by nameToCodepoint().

◆ nearestMatchesForCodepointName()

llvm::SmallVector< MatchForCodepointName > llvm::sys::unicode::nearestMatchesForCodepointName	(	StringRef	Pattern,
		std::size_t	MaxMatchesCount
	)

◆ readNode()

static Node llvm::sys::unicode::readNode	(	uint32_t	Offset,
		const Node *	Parent = `nullptr`
	)

static

Definition at line 69 of file UnicodeNameToCodepoint.cpp.

References createRoot(), H, llvm::HasValue(), N, llvm::Offset, Size, UnicodeNameToCodepointDict, UnicodeNameToCodepointIndex, and UnicodeNameToCodepointIndexSize.

Referenced by compareNode(), and nearestMatchesForCodepointName().

◆ startsWith()

static bool llvm::sys::unicode::startsWith	(	StringRef	Name,
		StringRef	Needle,
		bool	Strict,
		std::size_t &	Consummed,
		char &	PreviousCharInName,
		bool	IsPrefix = `false`
	)

static

Definition at line 120 of file UnicodeNameToCodepoint.cpp.

References llvm::StringRef::begin(), llvm::StringRef::empty(), llvm::StringRef::end(), End, Ignore, Name, and llvm::StringRef::size().

Referenced by compareNode(), findSyllable(), nameToGeneratedCodePoint(), and nameToHangulCodePoint().

Variable Documentation

◆ GeneratedNamesDataTable

const GeneratedNamesData llvm::sys::unicode::GeneratedNamesDataTable[]

static

Initial value:

= {
    {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
    {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
    {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
    {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
    {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
    {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
    {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
    {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
    {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
    {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
    {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
    {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
    {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
    {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
    {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
    {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
    {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
}

Definition at line 324 of file UnicodeNameToCodepoint.cpp.

Referenced by nameToGeneratedCodePoint().

◆ HangulSyllables

constexpr const char* const llvm::sys::unicode::HangulSyllables[][3]

constexpr

Initial value:

= {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
    }