14 #include "llvm/ADT/StringExtras.h" 15 #include "llvm/ADT/StringSwitch.h" 16 #include "llvm/Support/ConvertUTF.h" 17 #include "llvm/Support/ErrorHandling.h" 23 llvm::errs() <<
"comments::Token Kind=" <<
Kind <<
" ";
25 llvm::errs() <<
" " << Length <<
" \"" << L.
getSpelling(*
this, SM) <<
"\"\n";
41 llvm::BumpPtrAllocator &Allocator,
43 char *Resolved = Allocator.Allocate<
char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44 char *ResolvedPtr = Resolved;
45 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46 return StringRef(Resolved, ResolvedPtr - Resolved);
53 #include "clang/AST/CommentHTMLTags.inc" 54 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 58 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name)
const {
60 return llvm::StringSwitch<StringRef>(Name)
67 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
70 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name)
const {
71 unsigned CodePoint = 0;
72 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
75 CodePoint += Name[i] -
'0';
80 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name)
const {
81 unsigned CodePoint = 0;
82 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
84 const char C = Name[i];
86 CodePoint += llvm::hexDigitValue(C);
91 void Lexer::skipLineStartingDecorations() {
93 assert(CommentState == LCS_InsideCComment);
95 if (BufferPtr == CommentEnd)
103 const char *NewBufferPtr = BufferPtr;
105 if (NewBufferPtr == CommentEnd)
108 char C = *NewBufferPtr;
111 if (NewBufferPtr == CommentEnd)
116 BufferPtr = NewBufferPtr + 1;
127 const char *findNewline(
const char *BufferPtr,
const char *BufferEnd) {
128 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
135 const char *skipNewline(
const char *BufferPtr,
const char *BufferEnd) {
136 if (BufferPtr == BufferEnd)
139 if (*BufferPtr ==
'\n')
142 assert(*BufferPtr ==
'\r');
144 if (BufferPtr != BufferEnd && *BufferPtr ==
'\n')
150 const char *skipNamedCharacterReference(
const char *BufferPtr,
151 const char *BufferEnd) {
152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
159 const char *skipDecimalCharacterReference(
const char *BufferPtr,
160 const char *BufferEnd) {
161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
168 const char *skipHexCharacterReference(
const char *BufferPtr,
169 const char *BufferEnd) {
170 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
177 bool isHTMLIdentifierStartingCharacter(
char C) {
181 bool isHTMLIdentifierCharacter(
char C) {
185 const char *skipHTMLIdentifier(
const char *BufferPtr,
const char *BufferEnd) {
186 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
187 if (!isHTMLIdentifierCharacter(*BufferPtr))
197 const char *skipHTMLQuotedString(
const char *BufferPtr,
const char *BufferEnd)
199 const char Quote = *BufferPtr;
200 assert(Quote ==
'\"' || Quote ==
'\'');
203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204 const char C = *BufferPtr;
205 if (C == Quote && BufferPtr[-1] !=
'\\')
211 const char *
skipWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
212 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 bool isWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
223 bool isCommandNameStartCharacter(
char C) {
227 bool isCommandNameCharacter(
char C) {
231 const char *skipCommandName(
const char *BufferPtr,
const char *BufferEnd) {
232 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
233 if (!isCommandNameCharacter(*BufferPtr))
241 const char *findBCPLCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
242 const char *CurPtr = BufferPtr;
243 while (CurPtr != BufferEnd) {
246 if (CurPtr == BufferEnd)
250 const char *EscapePtr = CurPtr - 1;
254 if (*EscapePtr ==
'\\' ||
255 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] ==
'/' &&
256 EscapePtr[-1] ==
'?' && EscapePtr[-2] ==
'?')) {
258 CurPtr = skipNewline(CurPtr, BufferEnd);
267 const char *findCCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
268 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
269 if (*BufferPtr ==
'*') {
270 assert(BufferPtr + 1 != BufferEnd);
271 if (*(BufferPtr + 1) ==
'/')
275 llvm_unreachable(
"buffer end hit before '*/' was seen");
280 void Lexer::formTokenWithChars(
Token &Result,
const char *TokEnd,
282 const unsigned TokLen = TokEnd - BufferPtr;
287 Result.TextPtr =
"<UNSET>";
293 void Lexer::lexCommentText(
Token &T) {
294 assert(CommentState == LCS_InsideBCPLComment ||
295 CommentState == LCS_InsideCComment);
298 auto HandleNonCommandToken = [&]() ->
void {
299 assert(
State == LS_Normal);
301 const char *TokenPtr = BufferPtr;
302 assert(TokenPtr < CommentEnd);
306 TokenPtr = skipNewline(TokenPtr, CommentEnd);
309 if (CommentState == LCS_InsideCComment)
310 skipLineStartingDecorations();
314 StringRef TokStartSymbols = ParseCommands ?
"\n\r\\@&<" :
"\n\r";
315 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
316 .find_first_of(TokStartSymbols);
317 if (End != StringRef::npos)
320 TokenPtr = CommentEnd;
321 formTextToken(T, TokenPtr);
328 return HandleNonCommandToken();
333 case LS_VerbatimBlockFirstLine:
334 lexVerbatimBlockFirstLine(T);
336 case LS_VerbatimBlockBody:
337 lexVerbatimBlockBody(T);
339 case LS_VerbatimLineText:
340 lexVerbatimLineText(T);
342 case LS_HTMLStartTag:
350 assert(
State == LS_Normal);
351 const char *TokenPtr = BufferPtr;
352 assert(TokenPtr < CommentEnd);
362 if (TokenPtr == CommentEnd) {
363 formTextToken(T, TokenPtr);
371 case '\\':
case '@':
case '&':
case '$':
372 case '#':
case '<':
case '>':
case '%':
373 case '\"':
case '.':
case ':':
376 if (C ==
':' && TokenPtr != CommentEnd && *TokenPtr ==
':') {
380 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
381 formTokenWithChars(T, TokenPtr,
tok::text);
387 if (!isCommandNameStartCharacter(*TokenPtr)) {
388 formTextToken(T, TokenPtr);
392 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
393 unsigned Length = TokenPtr - (BufferPtr + 1);
397 if (Length == 1 && TokenPtr[-1] ==
'f' && TokenPtr != CommentEnd) {
399 if (C ==
'$' || C ==
'[' || C ==
']' || C ==
'{' || C ==
'}') {
405 StringRef CommandName(BufferPtr + 1, Length);
407 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
409 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
410 StringRef CorrectedName = Info->
Name;
415 Diag(Loc, diag::warn_correct_comment_command_name)
416 << FullRange << CommandName << CorrectedName
427 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
431 setupAndLexVerbatimLine(T, TokenPtr, Info);
434 formTokenWithChars(T, TokenPtr, CommandKind);
440 lexHTMLCharacterReference(T);
445 if (TokenPtr == CommentEnd) {
446 formTextToken(T, TokenPtr);
449 const char C = *TokenPtr;
450 if (isHTMLIdentifierStartingCharacter(C))
451 setupAndLexHTMLStartTag(T);
453 setupAndLexHTMLEndTag(T);
455 formTextToken(T, TokenPtr);
460 return HandleNonCommandToken();
464 void Lexer::setupAndLexVerbatimBlock(
Token &T,
465 const char *TextBegin,
469 VerbatimBlockEndCommandName.clear();
470 VerbatimBlockEndCommandName.append(Marker ==
'\\' ?
"\\" :
"@");
479 if (BufferPtr != CommentEnd &&
481 BufferPtr = skipNewline(BufferPtr, CommentEnd);
482 State = LS_VerbatimBlockBody;
486 State = LS_VerbatimBlockFirstLine;
489 void Lexer::lexVerbatimBlockFirstLine(
Token &T) {
491 assert(BufferPtr < CommentEnd);
497 const char *Newline = findNewline(BufferPtr, CommentEnd);
498 StringRef
Line(BufferPtr, Newline - BufferPtr);
501 size_t Pos = Line.find(VerbatimBlockEndCommandName);
503 const char *NextLine;
504 if (Pos == StringRef::npos) {
507 NextLine = skipNewline(Newline, CommentEnd);
508 }
else if (Pos == 0) {
510 const char *
End = BufferPtr + VerbatimBlockEndCommandName.size();
511 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
518 TextEnd = BufferPtr + Pos;
527 StringRef
Text(BufferPtr, TextEnd - BufferPtr);
531 State = LS_VerbatimBlockBody;
534 void Lexer::lexVerbatimBlockBody(
Token &T) {
535 assert(
State == LS_VerbatimBlockBody);
537 if (CommentState == LCS_InsideCComment)
538 skipLineStartingDecorations();
540 if (BufferPtr == CommentEnd) {
546 lexVerbatimBlockFirstLine(T);
549 void Lexer::setupAndLexVerbatimLine(
Token &T,
const char *TextBegin,
555 State = LS_VerbatimLineText;
558 void Lexer::lexVerbatimLineText(
Token &T) {
559 assert(
State == LS_VerbatimLineText);
562 const char *Newline = findNewline(BufferPtr, CommentEnd);
563 StringRef
Text(BufferPtr, Newline - BufferPtr);
570 void Lexer::lexHTMLCharacterReference(
Token &T) {
571 const char *TokenPtr = BufferPtr;
572 assert(*TokenPtr ==
'&');
574 if (TokenPtr == CommentEnd) {
575 formTextToken(T, TokenPtr);
580 bool isDecimal =
false;
584 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
586 }
else if (C ==
'#') {
588 if (TokenPtr == CommentEnd) {
589 formTextToken(T, TokenPtr);
595 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
597 }
else if (C ==
'x' || C ==
'X') {
600 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
602 formTextToken(T, TokenPtr);
606 formTextToken(T, TokenPtr);
609 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
611 formTextToken(T, TokenPtr);
614 StringRef Name(NamePtr, TokenPtr - NamePtr);
618 Resolved = resolveHTMLNamedCharacterReference(Name);
620 Resolved = resolveHTMLDecimalCharacterReference(Name);
622 Resolved = resolveHTMLHexCharacterReference(Name);
624 if (Resolved.empty()) {
625 formTextToken(T, TokenPtr);
628 formTokenWithChars(T, TokenPtr,
tok::text);
632 void Lexer::setupAndLexHTMLStartTag(
Token &T) {
633 assert(BufferPtr[0] ==
'<' &&
634 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
635 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
636 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
637 if (!isHTMLTagName(Name)) {
638 formTextToken(T, TagNameEnd);
647 const char C = *BufferPtr;
648 if (BufferPtr != CommentEnd &&
649 (C ==
'>' || C ==
'/' || isHTMLIdentifierStartingCharacter(C)))
650 State = LS_HTMLStartTag;
653 void Lexer::lexHTMLStartTag(
Token &T) {
654 assert(
State == LS_HTMLStartTag);
656 const char *TokenPtr = BufferPtr;
658 if (isHTMLIdentifierCharacter(C)) {
659 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
660 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
671 const char *OpenQuote = TokenPtr;
672 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
673 const char *ClosingQuote = TokenPtr;
674 if (TokenPtr != CommentEnd)
678 ClosingQuote - (OpenQuote + 1)));
688 if (TokenPtr != CommentEnd && *TokenPtr ==
'>') {
692 formTextToken(T, TokenPtr);
702 if (BufferPtr == CommentEnd) {
708 if (!isHTMLIdentifierStartingCharacter(C) &&
709 C !=
'=' && C !=
'\"' && C !=
'\'' && C !=
'>') {
715 void Lexer::setupAndLexHTMLEndTag(
Token &T) {
716 assert(BufferPtr[0] ==
'<' && BufferPtr[1] ==
'/');
718 const char *TagNameBegin =
skipWhitespace(BufferPtr + 2, CommentEnd);
719 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
720 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
721 if (!isHTMLTagName(Name)) {
722 formTextToken(T, TagNameEnd);
731 if (BufferPtr != CommentEnd && *BufferPtr ==
'>')
732 State = LS_HTMLEndTag;
735 void Lexer::lexHTMLEndTag(
Token &T) {
736 assert(BufferPtr != CommentEnd && *BufferPtr ==
'>');
744 const char *BufferStart,
const char *BufferEnd,
746 : Allocator(Allocator), Diags(Diags), Traits(Traits),
747 BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
748 BufferPtr(BufferStart), CommentState(LCS_BeforeComment),
State(LS_Normal),
749 ParseCommands(ParseCommands) {}
753 switch (CommentState) {
754 case LCS_BeforeComment:
755 if (BufferPtr == BufferEnd) {
756 formTokenWithChars(T, BufferPtr,
tok::eof);
760 assert(*BufferPtr ==
'/');
766 if (BufferPtr != BufferEnd) {
771 const char C = *BufferPtr;
772 if (C ==
'/' || C ==
'!')
779 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
782 CommentState = LCS_InsideBCPLComment;
783 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
785 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
792 const char C = *BufferPtr;
793 if ((C ==
'*' && *(BufferPtr + 1) !=
'/') || C ==
'!')
797 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
800 CommentState = LCS_InsideCComment;
802 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
806 llvm_unreachable(
"second character of comment should be '/' or '*'");
809 case LCS_BetweenComments: {
812 const char *EndWhitespace = BufferPtr;
813 while(EndWhitespace != BufferEnd && *EndWhitespace !=
'/')
822 CommentState = LCS_BeforeComment;
826 case LCS_InsideBCPLComment:
827 case LCS_InsideCComment:
828 if (BufferPtr != CommentEnd) {
833 if (CommentState == LCS_InsideCComment) {
834 assert(BufferPtr[0] ==
'*' && BufferPtr[1] ==
'/');
836 assert(BufferPtr <= BufferEnd);
842 CommentState = LCS_BetweenComments;
846 CommentState = LCS_BetweenComments;
855 bool *Invalid)
const {
859 bool InvalidTemp =
false;
860 StringRef File = SourceMgr.
getBufferData(LocInfo.first, &InvalidTemp);
866 const char *
Begin = File.data() + LocInfo.second;
867 return StringRef(Begin, Tok.
getLength());
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
Concrete class used by the front-end to report problems and issues.
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
void dump(const SourceManager &SM) const
const AnnotatedLine * Line
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Encodes a location in the source.
Dataflow Directional Tag Classes.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.