LLVM API Documentation
00001 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements a YAML parser. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #include "llvm/Support/YAMLParser.h" 00015 #include "llvm/ADT/SmallVector.h" 00016 #include "llvm/ADT/StringExtras.h" 00017 #include "llvm/ADT/Twine.h" 00018 #include "llvm/ADT/ilist.h" 00019 #include "llvm/ADT/ilist_node.h" 00020 #include "llvm/Support/ErrorHandling.h" 00021 #include "llvm/Support/MemoryBuffer.h" 00022 #include "llvm/Support/SourceMgr.h" 00023 #include "llvm/Support/raw_ostream.h" 00024 00025 using namespace llvm; 00026 using namespace yaml; 00027 00028 enum UnicodeEncodingForm { 00029 UEF_UTF32_LE, ///< UTF-32 Little Endian 00030 UEF_UTF32_BE, ///< UTF-32 Big Endian 00031 UEF_UTF16_LE, ///< UTF-16 Little Endian 00032 UEF_UTF16_BE, ///< UTF-16 Big Endian 00033 UEF_UTF8, ///< UTF-8 or ascii. 00034 UEF_Unknown ///< Not a valid Unicode encoding. 00035 }; 00036 00037 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 00038 /// it exists. Length is in {0, 2, 3, 4}. 00039 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 00040 00041 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 00042 /// encoding form of \a Input. 00043 /// 00044 /// @param Input A string of length 0 or more. 00045 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 00046 /// and how long the byte order mark is if one exists. 00047 static EncodingInfo getUnicodeEncoding(StringRef Input) { 00048 if (Input.size() == 0) 00049 return std::make_pair(UEF_Unknown, 0); 00050 00051 switch (uint8_t(Input[0])) { 00052 case 0x00: 00053 if (Input.size() >= 4) { 00054 if ( Input[1] == 0 00055 && uint8_t(Input[2]) == 0xFE 00056 && uint8_t(Input[3]) == 0xFF) 00057 return std::make_pair(UEF_UTF32_BE, 4); 00058 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 00059 return std::make_pair(UEF_UTF32_BE, 0); 00060 } 00061 00062 if (Input.size() >= 2 && Input[1] != 0) 00063 return std::make_pair(UEF_UTF16_BE, 0); 00064 return std::make_pair(UEF_Unknown, 0); 00065 case 0xFF: 00066 if ( Input.size() >= 4 00067 && uint8_t(Input[1]) == 0xFE 00068 && Input[2] == 0 00069 && Input[3] == 0) 00070 return std::make_pair(UEF_UTF32_LE, 4); 00071 00072 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 00073 return std::make_pair(UEF_UTF16_LE, 2); 00074 return std::make_pair(UEF_Unknown, 0); 00075 case 0xFE: 00076 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 00077 return std::make_pair(UEF_UTF16_BE, 2); 00078 return std::make_pair(UEF_Unknown, 0); 00079 case 0xEF: 00080 if ( Input.size() >= 3 00081 && uint8_t(Input[1]) == 0xBB 00082 && uint8_t(Input[2]) == 0xBF) 00083 return std::make_pair(UEF_UTF8, 3); 00084 return std::make_pair(UEF_Unknown, 0); 00085 } 00086 00087 // It could still be utf-32 or utf-16. 00088 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 00089 return std::make_pair(UEF_UTF32_LE, 0); 00090 00091 if (Input.size() >= 2 && Input[1] == 0) 00092 return std::make_pair(UEF_UTF16_LE, 0); 00093 00094 return std::make_pair(UEF_UTF8, 0); 00095 } 00096 00097 namespace llvm { 00098 namespace yaml { 00099 /// Token - A single YAML token. 00100 struct Token : ilist_node<Token> { 00101 enum TokenKind { 00102 TK_Error, // Uninitialized token. 00103 TK_StreamStart, 00104 TK_StreamEnd, 00105 TK_VersionDirective, 00106 TK_TagDirective, 00107 TK_DocumentStart, 00108 TK_DocumentEnd, 00109 TK_BlockEntry, 00110 TK_BlockEnd, 00111 TK_BlockSequenceStart, 00112 TK_BlockMappingStart, 00113 TK_FlowEntry, 00114 TK_FlowSequenceStart, 00115 TK_FlowSequenceEnd, 00116 TK_FlowMappingStart, 00117 TK_FlowMappingEnd, 00118 TK_Key, 00119 TK_Value, 00120 TK_Scalar, 00121 TK_Alias, 00122 TK_Anchor, 00123 TK_Tag 00124 } Kind; 00125 00126 /// A string of length 0 or more whose begin() points to the logical location 00127 /// of the token in the input. 00128 StringRef Range; 00129 00130 Token() : Kind(TK_Error) {} 00131 }; 00132 } 00133 } 00134 00135 namespace llvm { 00136 template<> 00137 struct ilist_sentinel_traits<Token> { 00138 Token *createSentinel() const { 00139 return &Sentinel; 00140 } 00141 static void destroySentinel(Token*) {} 00142 00143 Token *provideInitialHead() const { return createSentinel(); } 00144 Token *ensureHead(Token*) const { return createSentinel(); } 00145 static void noteHead(Token*, Token*) {} 00146 00147 private: 00148 mutable Token Sentinel; 00149 }; 00150 00151 template<> 00152 struct ilist_node_traits<Token> { 00153 Token *createNode(const Token &V) { 00154 return new (Alloc.Allocate<Token>()) Token(V); 00155 } 00156 static void deleteNode(Token *V) {} 00157 00158 void addNodeToList(Token *) {} 00159 void removeNodeFromList(Token *) {} 00160 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 00161 ilist_iterator<Token> /*first*/, 00162 ilist_iterator<Token> /*last*/) {} 00163 00164 BumpPtrAllocator Alloc; 00165 }; 00166 } 00167 00168 typedef ilist<Token> TokenQueueT; 00169 00170 namespace { 00171 /// @brief This struct is used to track simple keys. 00172 /// 00173 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 00174 /// which could legally be the start of a simple key. When peekNext is called, 00175 /// if the Token To be returned is referenced by a SimpleKey, we continue 00176 /// tokenizing until that potential simple key has either been found to not be 00177 /// a simple key (we moved on to the next line or went further than 1024 chars). 00178 /// Or when we run into a Value, and then insert a Key token (and possibly 00179 /// others) before the SimpleKey's Tok. 00180 struct SimpleKey { 00181 TokenQueueT::iterator Tok; 00182 unsigned Column; 00183 unsigned Line; 00184 unsigned FlowLevel; 00185 bool IsRequired; 00186 00187 bool operator ==(const SimpleKey &Other) { 00188 return Tok == Other.Tok; 00189 } 00190 }; 00191 } 00192 00193 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 00194 /// subsequence and the subsequence's length in code units (uint8_t). 00195 /// A length of 0 represents an error. 00196 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 00197 00198 static UTF8Decoded decodeUTF8(StringRef Range) { 00199 StringRef::iterator Position= Range.begin(); 00200 StringRef::iterator End = Range.end(); 00201 // 1 byte: [0x00, 0x7f] 00202 // Bit pattern: 0xxxxxxx 00203 if ((*Position & 0x80) == 0) { 00204 return std::make_pair(*Position, 1); 00205 } 00206 // 2 bytes: [0x80, 0x7ff] 00207 // Bit pattern: 110xxxxx 10xxxxxx 00208 if (Position + 1 != End && 00209 ((*Position & 0xE0) == 0xC0) && 00210 ((*(Position + 1) & 0xC0) == 0x80)) { 00211 uint32_t codepoint = ((*Position & 0x1F) << 6) | 00212 (*(Position + 1) & 0x3F); 00213 if (codepoint >= 0x80) 00214 return std::make_pair(codepoint, 2); 00215 } 00216 // 3 bytes: [0x8000, 0xffff] 00217 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 00218 if (Position + 2 != End && 00219 ((*Position & 0xF0) == 0xE0) && 00220 ((*(Position + 1) & 0xC0) == 0x80) && 00221 ((*(Position + 2) & 0xC0) == 0x80)) { 00222 uint32_t codepoint = ((*Position & 0x0F) << 12) | 00223 ((*(Position + 1) & 0x3F) << 6) | 00224 (*(Position + 2) & 0x3F); 00225 // Codepoints between 0xD800 and 0xDFFF are invalid, as 00226 // they are high / low surrogate halves used by UTF-16. 00227 if (codepoint >= 0x800 && 00228 (codepoint < 0xD800 || codepoint > 0xDFFF)) 00229 return std::make_pair(codepoint, 3); 00230 } 00231 // 4 bytes: [0x10000, 0x10FFFF] 00232 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00233 if (Position + 3 != End && 00234 ((*Position & 0xF8) == 0xF0) && 00235 ((*(Position + 1) & 0xC0) == 0x80) && 00236 ((*(Position + 2) & 0xC0) == 0x80) && 00237 ((*(Position + 3) & 0xC0) == 0x80)) { 00238 uint32_t codepoint = ((*Position & 0x07) << 18) | 00239 ((*(Position + 1) & 0x3F) << 12) | 00240 ((*(Position + 2) & 0x3F) << 6) | 00241 (*(Position + 3) & 0x3F); 00242 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 00243 return std::make_pair(codepoint, 4); 00244 } 00245 return std::make_pair(0, 0); 00246 } 00247 00248 namespace llvm { 00249 namespace yaml { 00250 /// @brief Scans YAML tokens from a MemoryBuffer. 00251 class Scanner { 00252 public: 00253 Scanner(const StringRef Input, SourceMgr &SM); 00254 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 00255 00256 /// @brief Parse the next token and return it without popping it. 00257 Token &peekNext(); 00258 00259 /// @brief Parse the next token and pop it from the queue. 00260 Token getNext(); 00261 00262 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 00263 ArrayRef<SMRange> Ranges = None) { 00264 SM.PrintMessage(Loc, Kind, Message, Ranges); 00265 } 00266 00267 void setError(const Twine &Message, StringRef::iterator Position) { 00268 if (Current >= End) 00269 Current = End - 1; 00270 00271 // Don't print out more errors after the first one we encounter. The rest 00272 // are just the result of the first, and have no meaning. 00273 if (!Failed) 00274 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 00275 Failed = true; 00276 } 00277 00278 void setError(const Twine &Message) { 00279 setError(Message, Current); 00280 } 00281 00282 /// @brief Returns true if an error occurred while parsing. 00283 bool failed() { 00284 return Failed; 00285 } 00286 00287 private: 00288 StringRef currentInput() { 00289 return StringRef(Current, End - Current); 00290 } 00291 00292 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 00293 /// at \a Position. 00294 /// 00295 /// If the UTF-8 code units starting at Position do not form a well-formed 00296 /// code unit subsequence, then the Unicode scalar value is 0, and the length 00297 /// is 0. 00298 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 00299 return ::decodeUTF8(StringRef(Position, End - Position)); 00300 } 00301 00302 // The following functions are based on the gramar rules in the YAML spec. The 00303 // style of the function names it meant to closely match how they are written 00304 // in the spec. The number within the [] is the number of the grammar rule in 00305 // the spec. 00306 // 00307 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 00308 // 00309 // c- 00310 // A production starting and ending with a special character. 00311 // b- 00312 // A production matching a single line break. 00313 // nb- 00314 // A production starting and ending with a non-break character. 00315 // s- 00316 // A production starting and ending with a white space character. 00317 // ns- 00318 // A production starting and ending with a non-space character. 00319 // l- 00320 // A production matching complete line(s). 00321 00322 /// @brief Skip a single nb-char[27] starting at Position. 00323 /// 00324 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 00325 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 00326 /// 00327 /// @returns The code unit after the nb-char, or Position if it's not an 00328 /// nb-char. 00329 StringRef::iterator skip_nb_char(StringRef::iterator Position); 00330 00331 /// @brief Skip a single b-break[28] starting at Position. 00332 /// 00333 /// A b-break is 0xD 0xA | 0xD | 0xA 00334 /// 00335 /// @returns The code unit after the b-break, or Position if it's not a 00336 /// b-break. 00337 StringRef::iterator skip_b_break(StringRef::iterator Position); 00338 00339 /// @brief Skip a single s-white[33] starting at Position. 00340 /// 00341 /// A s-white is 0x20 | 0x9 00342 /// 00343 /// @returns The code unit after the s-white, or Position if it's not a 00344 /// s-white. 00345 StringRef::iterator skip_s_white(StringRef::iterator Position); 00346 00347 /// @brief Skip a single ns-char[34] starting at Position. 00348 /// 00349 /// A ns-char is nb-char - s-white 00350 /// 00351 /// @returns The code unit after the ns-char, or Position if it's not a 00352 /// ns-char. 00353 StringRef::iterator skip_ns_char(StringRef::iterator Position); 00354 00355 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 00356 /// @brief Skip minimal well-formed code unit subsequences until Func 00357 /// returns its input. 00358 /// 00359 /// @returns The code unit after the last minimal well-formed code unit 00360 /// subsequence that Func accepted. 00361 StringRef::iterator skip_while( SkipWhileFunc Func 00362 , StringRef::iterator Position); 00363 00364 /// @brief Scan ns-uri-char[39]s starting at Cur. 00365 /// 00366 /// This updates Cur and Column while scanning. 00367 /// 00368 /// @returns A StringRef starting at Cur which covers the longest contiguous 00369 /// sequence of ns-uri-char. 00370 StringRef scan_ns_uri_char(); 00371 00372 /// @brief Scan ns-plain-one-line[133] starting at \a Cur. 00373 StringRef scan_ns_plain_one_line(); 00374 00375 /// @brief Consume a minimal well-formed code unit subsequence starting at 00376 /// \a Cur. Return false if it is not the same Unicode scalar value as 00377 /// \a Expected. This updates \a Column. 00378 bool consume(uint32_t Expected); 00379 00380 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 00381 void skip(uint32_t Distance); 00382 00383 /// @brief Return true if the minimal well-formed code unit subsequence at 00384 /// Pos is whitespace or a new line 00385 bool isBlankOrBreak(StringRef::iterator Position); 00386 00387 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 00388 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 00389 , unsigned AtColumn 00390 , bool IsRequired); 00391 00392 /// @brief Remove simple keys that can no longer be valid simple keys. 00393 /// 00394 /// Invalid simple keys are not on the current line or are further than 1024 00395 /// columns back. 00396 void removeStaleSimpleKeyCandidates(); 00397 00398 /// @brief Remove all simple keys on FlowLevel \a Level. 00399 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 00400 00401 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 00402 /// tokens if needed. 00403 bool unrollIndent(int ToColumn); 00404 00405 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 00406 /// if needed. 00407 bool rollIndent( int ToColumn 00408 , Token::TokenKind Kind 00409 , TokenQueueT::iterator InsertPoint); 00410 00411 /// @brief Skip whitespace and comments until the start of the next token. 00412 void scanToNextToken(); 00413 00414 /// @brief Must be the first token generated. 00415 bool scanStreamStart(); 00416 00417 /// @brief Generate tokens needed to close out the stream. 00418 bool scanStreamEnd(); 00419 00420 /// @brief Scan a %BLAH directive. 00421 bool scanDirective(); 00422 00423 /// @brief Scan a ... or ---. 00424 bool scanDocumentIndicator(bool IsStart); 00425 00426 /// @brief Scan a [ or { and generate the proper flow collection start token. 00427 bool scanFlowCollectionStart(bool IsSequence); 00428 00429 /// @brief Scan a ] or } and generate the proper flow collection end token. 00430 bool scanFlowCollectionEnd(bool IsSequence); 00431 00432 /// @brief Scan the , that separates entries in a flow collection. 00433 bool scanFlowEntry(); 00434 00435 /// @brief Scan the - that starts block sequence entries. 00436 bool scanBlockEntry(); 00437 00438 /// @brief Scan an explicit ? indicating a key. 00439 bool scanKey(); 00440 00441 /// @brief Scan an explicit : indicating a value. 00442 bool scanValue(); 00443 00444 /// @brief Scan a quoted scalar. 00445 bool scanFlowScalar(bool IsDoubleQuoted); 00446 00447 /// @brief Scan an unquoted scalar. 00448 bool scanPlainScalar(); 00449 00450 /// @brief Scan an Alias or Anchor starting with * or &. 00451 bool scanAliasOrAnchor(bool IsAlias); 00452 00453 /// @brief Scan a block scalar starting with | or >. 00454 bool scanBlockScalar(bool IsLiteral); 00455 00456 /// @brief Scan a tag of the form !stuff. 00457 bool scanTag(); 00458 00459 /// @brief Dispatch to the next scanning function based on \a *Cur. 00460 bool fetchMoreTokens(); 00461 00462 /// @brief The SourceMgr used for diagnostics and buffer management. 00463 SourceMgr &SM; 00464 00465 /// @brief The original input. 00466 MemoryBuffer *InputBuffer; 00467 00468 /// @brief The current position of the scanner. 00469 StringRef::iterator Current; 00470 00471 /// @brief The end of the input (one past the last character). 00472 StringRef::iterator End; 00473 00474 /// @brief Current YAML indentation level in spaces. 00475 int Indent; 00476 00477 /// @brief Current column number in Unicode code points. 00478 unsigned Column; 00479 00480 /// @brief Current line number. 00481 unsigned Line; 00482 00483 /// @brief How deep we are in flow style containers. 0 Means at block level. 00484 unsigned FlowLevel; 00485 00486 /// @brief Are we at the start of the stream? 00487 bool IsStartOfStream; 00488 00489 /// @brief Can the next token be the start of a simple key? 00490 bool IsSimpleKeyAllowed; 00491 00492 /// @brief True if an error has occurred. 00493 bool Failed; 00494 00495 /// @brief Queue of tokens. This is required to queue up tokens while looking 00496 /// for the end of a simple key. And for cases where a single character 00497 /// can produce multiple tokens (e.g. BlockEnd). 00498 TokenQueueT TokenQueue; 00499 00500 /// @brief Indentation levels. 00501 SmallVector<int, 4> Indents; 00502 00503 /// @brief Potential simple keys. 00504 SmallVector<SimpleKey, 4> SimpleKeys; 00505 }; 00506 00507 } // end namespace yaml 00508 } // end namespace llvm 00509 00510 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 00511 static void encodeUTF8( uint32_t UnicodeScalarValue 00512 , SmallVectorImpl<char> &Result) { 00513 if (UnicodeScalarValue <= 0x7F) { 00514 Result.push_back(UnicodeScalarValue & 0x7F); 00515 } else if (UnicodeScalarValue <= 0x7FF) { 00516 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 00517 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 00518 Result.push_back(FirstByte); 00519 Result.push_back(SecondByte); 00520 } else if (UnicodeScalarValue <= 0xFFFF) { 00521 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 00522 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 00523 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 00524 Result.push_back(FirstByte); 00525 Result.push_back(SecondByte); 00526 Result.push_back(ThirdByte); 00527 } else if (UnicodeScalarValue <= 0x10FFFF) { 00528 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 00529 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 00530 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 00531 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 00532 Result.push_back(FirstByte); 00533 Result.push_back(SecondByte); 00534 Result.push_back(ThirdByte); 00535 Result.push_back(FourthByte); 00536 } 00537 } 00538 00539 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 00540 SourceMgr SM; 00541 Scanner scanner(Input, SM); 00542 while (true) { 00543 Token T = scanner.getNext(); 00544 switch (T.Kind) { 00545 case Token::TK_StreamStart: 00546 OS << "Stream-Start: "; 00547 break; 00548 case Token::TK_StreamEnd: 00549 OS << "Stream-End: "; 00550 break; 00551 case Token::TK_VersionDirective: 00552 OS << "Version-Directive: "; 00553 break; 00554 case Token::TK_TagDirective: 00555 OS << "Tag-Directive: "; 00556 break; 00557 case Token::TK_DocumentStart: 00558 OS << "Document-Start: "; 00559 break; 00560 case Token::TK_DocumentEnd: 00561 OS << "Document-End: "; 00562 break; 00563 case Token::TK_BlockEntry: 00564 OS << "Block-Entry: "; 00565 break; 00566 case Token::TK_BlockEnd: 00567 OS << "Block-End: "; 00568 break; 00569 case Token::TK_BlockSequenceStart: 00570 OS << "Block-Sequence-Start: "; 00571 break; 00572 case Token::TK_BlockMappingStart: 00573 OS << "Block-Mapping-Start: "; 00574 break; 00575 case Token::TK_FlowEntry: 00576 OS << "Flow-Entry: "; 00577 break; 00578 case Token::TK_FlowSequenceStart: 00579 OS << "Flow-Sequence-Start: "; 00580 break; 00581 case Token::TK_FlowSequenceEnd: 00582 OS << "Flow-Sequence-End: "; 00583 break; 00584 case Token::TK_FlowMappingStart: 00585 OS << "Flow-Mapping-Start: "; 00586 break; 00587 case Token::TK_FlowMappingEnd: 00588 OS << "Flow-Mapping-End: "; 00589 break; 00590 case Token::TK_Key: 00591 OS << "Key: "; 00592 break; 00593 case Token::TK_Value: 00594 OS << "Value: "; 00595 break; 00596 case Token::TK_Scalar: 00597 OS << "Scalar: "; 00598 break; 00599 case Token::TK_Alias: 00600 OS << "Alias: "; 00601 break; 00602 case Token::TK_Anchor: 00603 OS << "Anchor: "; 00604 break; 00605 case Token::TK_Tag: 00606 OS << "Tag: "; 00607 break; 00608 case Token::TK_Error: 00609 break; 00610 } 00611 OS << T.Range << "\n"; 00612 if (T.Kind == Token::TK_StreamEnd) 00613 break; 00614 else if (T.Kind == Token::TK_Error) 00615 return false; 00616 } 00617 return true; 00618 } 00619 00620 bool yaml::scanTokens(StringRef Input) { 00621 llvm::SourceMgr SM; 00622 llvm::yaml::Scanner scanner(Input, SM); 00623 for (;;) { 00624 llvm::yaml::Token T = scanner.getNext(); 00625 if (T.Kind == Token::TK_StreamEnd) 00626 break; 00627 else if (T.Kind == Token::TK_Error) 00628 return false; 00629 } 00630 return true; 00631 } 00632 00633 std::string yaml::escape(StringRef Input) { 00634 std::string EscapedInput; 00635 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 00636 if (*i == '\\') 00637 EscapedInput += "\\\\"; 00638 else if (*i == '"') 00639 EscapedInput += "\\\""; 00640 else if (*i == 0) 00641 EscapedInput += "\\0"; 00642 else if (*i == 0x07) 00643 EscapedInput += "\\a"; 00644 else if (*i == 0x08) 00645 EscapedInput += "\\b"; 00646 else if (*i == 0x09) 00647 EscapedInput += "\\t"; 00648 else if (*i == 0x0A) 00649 EscapedInput += "\\n"; 00650 else if (*i == 0x0B) 00651 EscapedInput += "\\v"; 00652 else if (*i == 0x0C) 00653 EscapedInput += "\\f"; 00654 else if (*i == 0x0D) 00655 EscapedInput += "\\r"; 00656 else if (*i == 0x1B) 00657 EscapedInput += "\\e"; 00658 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 00659 std::string HexStr = utohexstr(*i); 00660 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 00661 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 00662 UTF8Decoded UnicodeScalarValue 00663 = decodeUTF8(StringRef(i, Input.end() - i)); 00664 if (UnicodeScalarValue.second == 0) { 00665 // Found invalid char. 00666 SmallString<4> Val; 00667 encodeUTF8(0xFFFD, Val); 00668 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 00669 // FIXME: Error reporting. 00670 return EscapedInput; 00671 } 00672 if (UnicodeScalarValue.first == 0x85) 00673 EscapedInput += "\\N"; 00674 else if (UnicodeScalarValue.first == 0xA0) 00675 EscapedInput += "\\_"; 00676 else if (UnicodeScalarValue.first == 0x2028) 00677 EscapedInput += "\\L"; 00678 else if (UnicodeScalarValue.first == 0x2029) 00679 EscapedInput += "\\P"; 00680 else { 00681 std::string HexStr = utohexstr(UnicodeScalarValue.first); 00682 if (HexStr.size() <= 2) 00683 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 00684 else if (HexStr.size() <= 4) 00685 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 00686 else if (HexStr.size() <= 8) 00687 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 00688 } 00689 i += UnicodeScalarValue.second - 1; 00690 } else 00691 EscapedInput.push_back(*i); 00692 } 00693 return EscapedInput; 00694 } 00695 00696 Scanner::Scanner(StringRef Input, SourceMgr &sm) 00697 : SM(sm) 00698 , Indent(-1) 00699 , Column(0) 00700 , Line(0) 00701 , FlowLevel(0) 00702 , IsStartOfStream(true) 00703 , IsSimpleKeyAllowed(true) 00704 , Failed(false) { 00705 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 00706 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 00707 Current = InputBuffer->getBufferStart(); 00708 End = InputBuffer->getBufferEnd(); 00709 } 00710 00711 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 00712 : SM(SM_) 00713 , InputBuffer(Buffer) 00714 , Current(InputBuffer->getBufferStart()) 00715 , End(InputBuffer->getBufferEnd()) 00716 , Indent(-1) 00717 , Column(0) 00718 , Line(0) 00719 , FlowLevel(0) 00720 , IsStartOfStream(true) 00721 , IsSimpleKeyAllowed(true) 00722 , Failed(false) { 00723 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 00724 } 00725 00726 Token &Scanner::peekNext() { 00727 // If the current token is a possible simple key, keep parsing until we 00728 // can confirm. 00729 bool NeedMore = false; 00730 while (true) { 00731 if (TokenQueue.empty() || NeedMore) { 00732 if (!fetchMoreTokens()) { 00733 TokenQueue.clear(); 00734 TokenQueue.push_back(Token()); 00735 return TokenQueue.front(); 00736 } 00737 } 00738 assert(!TokenQueue.empty() && 00739 "fetchMoreTokens lied about getting tokens!"); 00740 00741 removeStaleSimpleKeyCandidates(); 00742 SimpleKey SK; 00743 SK.Tok = TokenQueue.front(); 00744 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 00745 == SimpleKeys.end()) 00746 break; 00747 else 00748 NeedMore = true; 00749 } 00750 return TokenQueue.front(); 00751 } 00752 00753 Token Scanner::getNext() { 00754 Token Ret = peekNext(); 00755 // TokenQueue can be empty if there was an error getting the next token. 00756 if (!TokenQueue.empty()) 00757 TokenQueue.pop_front(); 00758 00759 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 00760 // quick deallocation of them all. 00761 if (TokenQueue.empty()) { 00762 TokenQueue.Alloc.Reset(); 00763 } 00764 00765 return Ret; 00766 } 00767 00768 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 00769 if (Position == End) 00770 return Position; 00771 // Check 7 bit c-printable - b-char. 00772 if ( *Position == 0x09 00773 || (*Position >= 0x20 && *Position <= 0x7E)) 00774 return Position + 1; 00775 00776 // Check for valid UTF-8. 00777 if (uint8_t(*Position) & 0x80) { 00778 UTF8Decoded u8d = decodeUTF8(Position); 00779 if ( u8d.second != 0 00780 && u8d.first != 0xFEFF 00781 && ( u8d.first == 0x85 00782 || ( u8d.first >= 0xA0 00783 && u8d.first <= 0xD7FF) 00784 || ( u8d.first >= 0xE000 00785 && u8d.first <= 0xFFFD) 00786 || ( u8d.first >= 0x10000 00787 && u8d.first <= 0x10FFFF))) 00788 return Position + u8d.second; 00789 } 00790 return Position; 00791 } 00792 00793 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 00794 if (Position == End) 00795 return Position; 00796 if (*Position == 0x0D) { 00797 if (Position + 1 != End && *(Position + 1) == 0x0A) 00798 return Position + 2; 00799 return Position + 1; 00800 } 00801 00802 if (*Position == 0x0A) 00803 return Position + 1; 00804 return Position; 00805 } 00806 00807 00808 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 00809 if (Position == End) 00810 return Position; 00811 if (*Position == ' ' || *Position == '\t') 00812 return Position + 1; 00813 return Position; 00814 } 00815 00816 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 00817 if (Position == End) 00818 return Position; 00819 if (*Position == ' ' || *Position == '\t') 00820 return Position; 00821 return skip_nb_char(Position); 00822 } 00823 00824 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 00825 , StringRef::iterator Position) { 00826 while (true) { 00827 StringRef::iterator i = (this->*Func)(Position); 00828 if (i == Position) 00829 break; 00830 Position = i; 00831 } 00832 return Position; 00833 } 00834 00835 static bool is_ns_hex_digit(const char C) { 00836 return (C >= '0' && C <= '9') 00837 || (C >= 'a' && C <= 'z') 00838 || (C >= 'A' && C <= 'Z'); 00839 } 00840 00841 static bool is_ns_word_char(const char C) { 00842 return C == '-' 00843 || (C >= 'a' && C <= 'z') 00844 || (C >= 'A' && C <= 'Z'); 00845 } 00846 00847 StringRef Scanner::scan_ns_uri_char() { 00848 StringRef::iterator Start = Current; 00849 while (true) { 00850 if (Current == End) 00851 break; 00852 if (( *Current == '%' 00853 && Current + 2 < End 00854 && is_ns_hex_digit(*(Current + 1)) 00855 && is_ns_hex_digit(*(Current + 2))) 00856 || is_ns_word_char(*Current) 00857 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 00858 != StringRef::npos) { 00859 ++Current; 00860 ++Column; 00861 } else 00862 break; 00863 } 00864 return StringRef(Start, Current - Start); 00865 } 00866 00867 StringRef Scanner::scan_ns_plain_one_line() { 00868 StringRef::iterator start = Current; 00869 // The first character must already be verified. 00870 ++Current; 00871 while (true) { 00872 if (Current == End) { 00873 break; 00874 } else if (*Current == ':') { 00875 // Check if the next character is a ns-char. 00876 if (Current + 1 == End) 00877 break; 00878 StringRef::iterator i = skip_ns_char(Current + 1); 00879 if (Current + 1 != i) { 00880 Current = i; 00881 Column += 2; // Consume both the ':' and ns-char. 00882 } else 00883 break; 00884 } else if (*Current == '#') { 00885 // Check if the previous character was a ns-char. 00886 // The & 0x80 check is to check for the trailing byte of a utf-8 00887 if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { 00888 ++Current; 00889 ++Column; 00890 } else 00891 break; 00892 } else { 00893 StringRef::iterator i = skip_nb_char(Current); 00894 if (i == Current) 00895 break; 00896 Current = i; 00897 ++Column; 00898 } 00899 } 00900 return StringRef(start, Current - start); 00901 } 00902 00903 bool Scanner::consume(uint32_t Expected) { 00904 if (Expected >= 0x80) 00905 report_fatal_error("Not dealing with this yet"); 00906 if (Current == End) 00907 return false; 00908 if (uint8_t(*Current) >= 0x80) 00909 report_fatal_error("Not dealing with this yet"); 00910 if (uint8_t(*Current) == Expected) { 00911 ++Current; 00912 ++Column; 00913 return true; 00914 } 00915 return false; 00916 } 00917 00918 void Scanner::skip(uint32_t Distance) { 00919 Current += Distance; 00920 Column += Distance; 00921 assert(Current <= End && "Skipped past the end"); 00922 } 00923 00924 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 00925 if (Position == End) 00926 return false; 00927 if ( *Position == ' ' || *Position == '\t' 00928 || *Position == '\r' || *Position == '\n') 00929 return true; 00930 return false; 00931 } 00932 00933 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 00934 , unsigned AtColumn 00935 , bool IsRequired) { 00936 if (IsSimpleKeyAllowed) { 00937 SimpleKey SK; 00938 SK.Tok = Tok; 00939 SK.Line = Line; 00940 SK.Column = AtColumn; 00941 SK.IsRequired = IsRequired; 00942 SK.FlowLevel = FlowLevel; 00943 SimpleKeys.push_back(SK); 00944 } 00945 } 00946 00947 void Scanner::removeStaleSimpleKeyCandidates() { 00948 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 00949 i != SimpleKeys.end();) { 00950 if (i->Line != Line || i->Column + 1024 < Column) { 00951 if (i->IsRequired) 00952 setError( "Could not find expected : for simple key" 00953 , i->Tok->Range.begin()); 00954 i = SimpleKeys.erase(i); 00955 } else 00956 ++i; 00957 } 00958 } 00959 00960 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 00961 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 00962 SimpleKeys.pop_back(); 00963 } 00964 00965 bool Scanner::unrollIndent(int ToColumn) { 00966 Token T; 00967 // Indentation is ignored in flow. 00968 if (FlowLevel != 0) 00969 return true; 00970 00971 while (Indent > ToColumn) { 00972 T.Kind = Token::TK_BlockEnd; 00973 T.Range = StringRef(Current, 1); 00974 TokenQueue.push_back(T); 00975 Indent = Indents.pop_back_val(); 00976 } 00977 00978 return true; 00979 } 00980 00981 bool Scanner::rollIndent( int ToColumn 00982 , Token::TokenKind Kind 00983 , TokenQueueT::iterator InsertPoint) { 00984 if (FlowLevel) 00985 return true; 00986 if (Indent < ToColumn) { 00987 Indents.push_back(Indent); 00988 Indent = ToColumn; 00989 00990 Token T; 00991 T.Kind = Kind; 00992 T.Range = StringRef(Current, 0); 00993 TokenQueue.insert(InsertPoint, T); 00994 } 00995 return true; 00996 } 00997 00998 void Scanner::scanToNextToken() { 00999 while (true) { 01000 while (*Current == ' ' || *Current == '\t') { 01001 skip(1); 01002 } 01003 01004 // Skip comment. 01005 if (*Current == '#') { 01006 while (true) { 01007 // This may skip more than one byte, thus Column is only incremented 01008 // for code points. 01009 StringRef::iterator i = skip_nb_char(Current); 01010 if (i == Current) 01011 break; 01012 Current = i; 01013 ++Column; 01014 } 01015 } 01016 01017 // Skip EOL. 01018 StringRef::iterator i = skip_b_break(Current); 01019 if (i == Current) 01020 break; 01021 Current = i; 01022 ++Line; 01023 Column = 0; 01024 // New lines may start a simple key. 01025 if (!FlowLevel) 01026 IsSimpleKeyAllowed = true; 01027 } 01028 } 01029 01030 bool Scanner::scanStreamStart() { 01031 IsStartOfStream = false; 01032 01033 EncodingInfo EI = getUnicodeEncoding(currentInput()); 01034 01035 Token T; 01036 T.Kind = Token::TK_StreamStart; 01037 T.Range = StringRef(Current, EI.second); 01038 TokenQueue.push_back(T); 01039 Current += EI.second; 01040 return true; 01041 } 01042 01043 bool Scanner::scanStreamEnd() { 01044 // Force an ending new line if one isn't present. 01045 if (Column != 0) { 01046 Column = 0; 01047 ++Line; 01048 } 01049 01050 unrollIndent(-1); 01051 SimpleKeys.clear(); 01052 IsSimpleKeyAllowed = false; 01053 01054 Token T; 01055 T.Kind = Token::TK_StreamEnd; 01056 T.Range = StringRef(Current, 0); 01057 TokenQueue.push_back(T); 01058 return true; 01059 } 01060 01061 bool Scanner::scanDirective() { 01062 // Reset the indentation level. 01063 unrollIndent(-1); 01064 SimpleKeys.clear(); 01065 IsSimpleKeyAllowed = false; 01066 01067 StringRef::iterator Start = Current; 01068 consume('%'); 01069 StringRef::iterator NameStart = Current; 01070 Current = skip_while(&Scanner::skip_ns_char, Current); 01071 StringRef Name(NameStart, Current - NameStart); 01072 Current = skip_while(&Scanner::skip_s_white, Current); 01073 01074 if (Name == "YAML") { 01075 Current = skip_while(&Scanner::skip_ns_char, Current); 01076 Token T; 01077 T.Kind = Token::TK_VersionDirective; 01078 T.Range = StringRef(Start, Current - Start); 01079 TokenQueue.push_back(T); 01080 return true; 01081 } 01082 return false; 01083 } 01084 01085 bool Scanner::scanDocumentIndicator(bool IsStart) { 01086 unrollIndent(-1); 01087 SimpleKeys.clear(); 01088 IsSimpleKeyAllowed = false; 01089 01090 Token T; 01091 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 01092 T.Range = StringRef(Current, 3); 01093 skip(3); 01094 TokenQueue.push_back(T); 01095 return true; 01096 } 01097 01098 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 01099 Token T; 01100 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 01101 : Token::TK_FlowMappingStart; 01102 T.Range = StringRef(Current, 1); 01103 skip(1); 01104 TokenQueue.push_back(T); 01105 01106 // [ and { may begin a simple key. 01107 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 01108 01109 // And may also be followed by a simple key. 01110 IsSimpleKeyAllowed = true; 01111 ++FlowLevel; 01112 return true; 01113 } 01114 01115 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 01116 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01117 IsSimpleKeyAllowed = false; 01118 Token T; 01119 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 01120 : Token::TK_FlowMappingEnd; 01121 T.Range = StringRef(Current, 1); 01122 skip(1); 01123 TokenQueue.push_back(T); 01124 if (FlowLevel) 01125 --FlowLevel; 01126 return true; 01127 } 01128 01129 bool Scanner::scanFlowEntry() { 01130 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01131 IsSimpleKeyAllowed = true; 01132 Token T; 01133 T.Kind = Token::TK_FlowEntry; 01134 T.Range = StringRef(Current, 1); 01135 skip(1); 01136 TokenQueue.push_back(T); 01137 return true; 01138 } 01139 01140 bool Scanner::scanBlockEntry() { 01141 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 01142 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01143 IsSimpleKeyAllowed = true; 01144 Token T; 01145 T.Kind = Token::TK_BlockEntry; 01146 T.Range = StringRef(Current, 1); 01147 skip(1); 01148 TokenQueue.push_back(T); 01149 return true; 01150 } 01151 01152 bool Scanner::scanKey() { 01153 if (!FlowLevel) 01154 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 01155 01156 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01157 IsSimpleKeyAllowed = !FlowLevel; 01158 01159 Token T; 01160 T.Kind = Token::TK_Key; 01161 T.Range = StringRef(Current, 1); 01162 skip(1); 01163 TokenQueue.push_back(T); 01164 return true; 01165 } 01166 01167 bool Scanner::scanValue() { 01168 // If the previous token could have been a simple key, insert the key token 01169 // into the token queue. 01170 if (!SimpleKeys.empty()) { 01171 SimpleKey SK = SimpleKeys.pop_back_val(); 01172 Token T; 01173 T.Kind = Token::TK_Key; 01174 T.Range = SK.Tok->Range; 01175 TokenQueueT::iterator i, e; 01176 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 01177 if (i == SK.Tok) 01178 break; 01179 } 01180 assert(i != e && "SimpleKey not in token queue!"); 01181 i = TokenQueue.insert(i, T); 01182 01183 // We may also need to add a Block-Mapping-Start token. 01184 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 01185 01186 IsSimpleKeyAllowed = false; 01187 } else { 01188 if (!FlowLevel) 01189 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 01190 IsSimpleKeyAllowed = !FlowLevel; 01191 } 01192 01193 Token T; 01194 T.Kind = Token::TK_Value; 01195 T.Range = StringRef(Current, 1); 01196 skip(1); 01197 TokenQueue.push_back(T); 01198 return true; 01199 } 01200 01201 // Forbidding inlining improves performance by roughly 20%. 01202 // FIXME: Remove once llvm optimizes this to the faster version without hints. 01203 LLVM_ATTRIBUTE_NOINLINE static bool 01204 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 01205 01206 // Returns whether a character at 'Position' was escaped with a leading '\'. 01207 // 'First' specifies the position of the first character in the string. 01208 static bool wasEscaped(StringRef::iterator First, 01209 StringRef::iterator Position) { 01210 assert(Position - 1 >= First); 01211 StringRef::iterator I = Position - 1; 01212 // We calculate the number of consecutive '\'s before the current position 01213 // by iterating backwards through our string. 01214 while (I >= First && *I == '\\') --I; 01215 // (Position - 1 - I) now contains the number of '\'s before the current 01216 // position. If it is odd, the character at 'Position' was escaped. 01217 return (Position - 1 - I) % 2 == 1; 01218 } 01219 01220 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 01221 StringRef::iterator Start = Current; 01222 unsigned ColStart = Column; 01223 if (IsDoubleQuoted) { 01224 do { 01225 ++Current; 01226 while (Current != End && *Current != '"') 01227 ++Current; 01228 // Repeat until the previous character was not a '\' or was an escaped 01229 // backslash. 01230 } while ( Current != End 01231 && *(Current - 1) == '\\' 01232 && wasEscaped(Start + 1, Current)); 01233 } else { 01234 skip(1); 01235 while (true) { 01236 // Skip a ' followed by another '. 01237 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 01238 skip(2); 01239 continue; 01240 } else if (*Current == '\'') 01241 break; 01242 StringRef::iterator i = skip_nb_char(Current); 01243 if (i == Current) { 01244 i = skip_b_break(Current); 01245 if (i == Current) 01246 break; 01247 Current = i; 01248 Column = 0; 01249 ++Line; 01250 } else { 01251 if (i == End) 01252 break; 01253 Current = i; 01254 ++Column; 01255 } 01256 } 01257 } 01258 01259 if (Current == End) { 01260 setError("Expected quote at end of scalar", Current); 01261 return false; 01262 } 01263 01264 skip(1); // Skip ending quote. 01265 Token T; 01266 T.Kind = Token::TK_Scalar; 01267 T.Range = StringRef(Start, Current - Start); 01268 TokenQueue.push_back(T); 01269 01270 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01271 01272 IsSimpleKeyAllowed = false; 01273 01274 return true; 01275 } 01276 01277 bool Scanner::scanPlainScalar() { 01278 StringRef::iterator Start = Current; 01279 unsigned ColStart = Column; 01280 unsigned LeadingBlanks = 0; 01281 assert(Indent >= -1 && "Indent must be >= -1 !"); 01282 unsigned indent = static_cast<unsigned>(Indent + 1); 01283 while (true) { 01284 if (*Current == '#') 01285 break; 01286 01287 while (!isBlankOrBreak(Current)) { 01288 if ( FlowLevel && *Current == ':' 01289 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 01290 setError("Found unexpected ':' while scanning a plain scalar", Current); 01291 return false; 01292 } 01293 01294 // Check for the end of the plain scalar. 01295 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 01296 || ( FlowLevel 01297 && (StringRef(Current, 1).find_first_of(",:?[]{}") 01298 != StringRef::npos))) 01299 break; 01300 01301 StringRef::iterator i = skip_nb_char(Current); 01302 if (i == Current) 01303 break; 01304 Current = i; 01305 ++Column; 01306 } 01307 01308 // Are we at the end? 01309 if (!isBlankOrBreak(Current)) 01310 break; 01311 01312 // Eat blanks. 01313 StringRef::iterator Tmp = Current; 01314 while (isBlankOrBreak(Tmp)) { 01315 StringRef::iterator i = skip_s_white(Tmp); 01316 if (i != Tmp) { 01317 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 01318 setError("Found invalid tab character in indentation", Tmp); 01319 return false; 01320 } 01321 Tmp = i; 01322 ++Column; 01323 } else { 01324 i = skip_b_break(Tmp); 01325 if (!LeadingBlanks) 01326 LeadingBlanks = 1; 01327 Tmp = i; 01328 Column = 0; 01329 ++Line; 01330 } 01331 } 01332 01333 if (!FlowLevel && Column < indent) 01334 break; 01335 01336 Current = Tmp; 01337 } 01338 if (Start == Current) { 01339 setError("Got empty plain scalar", Start); 01340 return false; 01341 } 01342 Token T; 01343 T.Kind = Token::TK_Scalar; 01344 T.Range = StringRef(Start, Current - Start); 01345 TokenQueue.push_back(T); 01346 01347 // Plain scalars can be simple keys. 01348 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01349 01350 IsSimpleKeyAllowed = false; 01351 01352 return true; 01353 } 01354 01355 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 01356 StringRef::iterator Start = Current; 01357 unsigned ColStart = Column; 01358 skip(1); 01359 while(true) { 01360 if ( *Current == '[' || *Current == ']' 01361 || *Current == '{' || *Current == '}' 01362 || *Current == ',' 01363 || *Current == ':') 01364 break; 01365 StringRef::iterator i = skip_ns_char(Current); 01366 if (i == Current) 01367 break; 01368 Current = i; 01369 ++Column; 01370 } 01371 01372 if (Start == Current) { 01373 setError("Got empty alias or anchor", Start); 01374 return false; 01375 } 01376 01377 Token T; 01378 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 01379 T.Range = StringRef(Start, Current - Start); 01380 TokenQueue.push_back(T); 01381 01382 // Alias and anchors can be simple keys. 01383 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01384 01385 IsSimpleKeyAllowed = false; 01386 01387 return true; 01388 } 01389 01390 bool Scanner::scanBlockScalar(bool IsLiteral) { 01391 StringRef::iterator Start = Current; 01392 skip(1); // Eat | or > 01393 while(true) { 01394 StringRef::iterator i = skip_nb_char(Current); 01395 if (i == Current) { 01396 if (Column == 0) 01397 break; 01398 i = skip_b_break(Current); 01399 if (i != Current) { 01400 // We got a line break. 01401 Column = 0; 01402 ++Line; 01403 Current = i; 01404 continue; 01405 } else { 01406 // There was an error, which should already have been printed out. 01407 return false; 01408 } 01409 } 01410 Current = i; 01411 ++Column; 01412 } 01413 01414 if (Start == Current) { 01415 setError("Got empty block scalar", Start); 01416 return false; 01417 } 01418 01419 Token T; 01420 T.Kind = Token::TK_Scalar; 01421 T.Range = StringRef(Start, Current - Start); 01422 TokenQueue.push_back(T); 01423 return true; 01424 } 01425 01426 bool Scanner::scanTag() { 01427 StringRef::iterator Start = Current; 01428 unsigned ColStart = Column; 01429 skip(1); // Eat !. 01430 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 01431 else if (*Current == '<') { 01432 skip(1); 01433 scan_ns_uri_char(); 01434 if (!consume('>')) 01435 return false; 01436 } else { 01437 // FIXME: Actually parse the c-ns-shorthand-tag rule. 01438 Current = skip_while(&Scanner::skip_ns_char, Current); 01439 } 01440 01441 Token T; 01442 T.Kind = Token::TK_Tag; 01443 T.Range = StringRef(Start, Current - Start); 01444 TokenQueue.push_back(T); 01445 01446 // Tags can be simple keys. 01447 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01448 01449 IsSimpleKeyAllowed = false; 01450 01451 return true; 01452 } 01453 01454 bool Scanner::fetchMoreTokens() { 01455 if (IsStartOfStream) 01456 return scanStreamStart(); 01457 01458 scanToNextToken(); 01459 01460 if (Current == End) 01461 return scanStreamEnd(); 01462 01463 removeStaleSimpleKeyCandidates(); 01464 01465 unrollIndent(Column); 01466 01467 if (Column == 0 && *Current == '%') 01468 return scanDirective(); 01469 01470 if (Column == 0 && Current + 4 <= End 01471 && *Current == '-' 01472 && *(Current + 1) == '-' 01473 && *(Current + 2) == '-' 01474 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 01475 return scanDocumentIndicator(true); 01476 01477 if (Column == 0 && Current + 4 <= End 01478 && *Current == '.' 01479 && *(Current + 1) == '.' 01480 && *(Current + 2) == '.' 01481 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 01482 return scanDocumentIndicator(false); 01483 01484 if (*Current == '[') 01485 return scanFlowCollectionStart(true); 01486 01487 if (*Current == '{') 01488 return scanFlowCollectionStart(false); 01489 01490 if (*Current == ']') 01491 return scanFlowCollectionEnd(true); 01492 01493 if (*Current == '}') 01494 return scanFlowCollectionEnd(false); 01495 01496 if (*Current == ',') 01497 return scanFlowEntry(); 01498 01499 if (*Current == '-' && isBlankOrBreak(Current + 1)) 01500 return scanBlockEntry(); 01501 01502 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 01503 return scanKey(); 01504 01505 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 01506 return scanValue(); 01507 01508 if (*Current == '*') 01509 return scanAliasOrAnchor(true); 01510 01511 if (*Current == '&') 01512 return scanAliasOrAnchor(false); 01513 01514 if (*Current == '!') 01515 return scanTag(); 01516 01517 if (*Current == '|' && !FlowLevel) 01518 return scanBlockScalar(true); 01519 01520 if (*Current == '>' && !FlowLevel) 01521 return scanBlockScalar(false); 01522 01523 if (*Current == '\'') 01524 return scanFlowScalar(false); 01525 01526 if (*Current == '"') 01527 return scanFlowScalar(true); 01528 01529 // Get a plain scalar. 01530 StringRef FirstChar(Current, 1); 01531 if (!(isBlankOrBreak(Current) 01532 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 01533 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 01534 || (!FlowLevel && (*Current == '?' || *Current == ':') 01535 && isBlankOrBreak(Current + 1)) 01536 || (!FlowLevel && *Current == ':' 01537 && Current + 2 < End 01538 && *(Current + 1) == ':' 01539 && !isBlankOrBreak(Current + 2))) 01540 return scanPlainScalar(); 01541 01542 setError("Unrecognized character while tokenizing."); 01543 return false; 01544 } 01545 01546 Stream::Stream(StringRef Input, SourceMgr &SM) 01547 : scanner(new Scanner(Input, SM)) 01548 , CurrentDoc(0) {} 01549 01550 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 01551 : scanner(new Scanner(InputBuffer, SM)) 01552 , CurrentDoc(0) {} 01553 01554 Stream::~Stream() {} 01555 01556 bool Stream::failed() { return scanner->failed(); } 01557 01558 void Stream::printError(Node *N, const Twine &Msg) { 01559 SmallVector<SMRange, 1> Ranges; 01560 Ranges.push_back(N->getSourceRange()); 01561 scanner->printError( N->getSourceRange().Start 01562 , SourceMgr::DK_Error 01563 , Msg 01564 , Ranges); 01565 } 01566 01567 void Stream::handleYAMLDirective(const Token &t) { 01568 // TODO: Ensure version is 1.x. 01569 } 01570 01571 document_iterator Stream::begin() { 01572 if (CurrentDoc) 01573 report_fatal_error("Can only iterate over the stream once"); 01574 01575 // Skip Stream-Start. 01576 scanner->getNext(); 01577 01578 CurrentDoc.reset(new Document(*this)); 01579 return document_iterator(CurrentDoc); 01580 } 01581 01582 document_iterator Stream::end() { 01583 return document_iterator(); 01584 } 01585 01586 void Stream::skip() { 01587 for (document_iterator i = begin(), e = end(); i != e; ++i) 01588 i->skip(); 01589 } 01590 01591 Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A) 01592 : Doc(D) 01593 , TypeID(Type) 01594 , Anchor(A) { 01595 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 01596 SourceRange = SMRange(Start, Start); 01597 } 01598 01599 Token &Node::peekNext() { 01600 return Doc->peekNext(); 01601 } 01602 01603 Token Node::getNext() { 01604 return Doc->getNext(); 01605 } 01606 01607 Node *Node::parseBlockNode() { 01608 return Doc->parseBlockNode(); 01609 } 01610 01611 BumpPtrAllocator &Node::getAllocator() { 01612 return Doc->NodeAllocator; 01613 } 01614 01615 void Node::setError(const Twine &Msg, Token &Tok) const { 01616 Doc->setError(Msg, Tok); 01617 } 01618 01619 bool Node::failed() const { 01620 return Doc->failed(); 01621 } 01622 01623 01624 01625 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 01626 // TODO: Handle newlines properly. We need to remove leading whitespace. 01627 if (Value[0] == '"') { // Double quoted. 01628 // Pull off the leading and trailing "s. 01629 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 01630 // Search for characters that would require unescaping the value. 01631 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 01632 if (i != StringRef::npos) 01633 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 01634 return UnquotedValue; 01635 } else if (Value[0] == '\'') { // Single quoted. 01636 // Pull off the leading and trailing 's. 01637 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 01638 StringRef::size_type i = UnquotedValue.find('\''); 01639 if (i != StringRef::npos) { 01640 // We're going to need Storage. 01641 Storage.clear(); 01642 Storage.reserve(UnquotedValue.size()); 01643 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 01644 StringRef Valid(UnquotedValue.begin(), i); 01645 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 01646 Storage.push_back('\''); 01647 UnquotedValue = UnquotedValue.substr(i + 2); 01648 } 01649 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 01650 return StringRef(Storage.begin(), Storage.size()); 01651 } 01652 return UnquotedValue; 01653 } 01654 // Plain or block. 01655 return Value.rtrim(" "); 01656 } 01657 01658 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 01659 , StringRef::size_type i 01660 , SmallVectorImpl<char> &Storage) 01661 const { 01662 // Use Storage to build proper value. 01663 Storage.clear(); 01664 Storage.reserve(UnquotedValue.size()); 01665 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 01666 // Insert all previous chars into Storage. 01667 StringRef Valid(UnquotedValue.begin(), i); 01668 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 01669 // Chop off inserted chars. 01670 UnquotedValue = UnquotedValue.substr(i); 01671 01672 assert(!UnquotedValue.empty() && "Can't be empty!"); 01673 01674 // Parse escape or line break. 01675 switch (UnquotedValue[0]) { 01676 case '\r': 01677 case '\n': 01678 Storage.push_back('\n'); 01679 if ( UnquotedValue.size() > 1 01680 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 01681 UnquotedValue = UnquotedValue.substr(1); 01682 UnquotedValue = UnquotedValue.substr(1); 01683 break; 01684 default: 01685 if (UnquotedValue.size() == 1) 01686 // TODO: Report error. 01687 break; 01688 UnquotedValue = UnquotedValue.substr(1); 01689 switch (UnquotedValue[0]) { 01690 default: { 01691 Token T; 01692 T.Range = StringRef(UnquotedValue.begin(), 1); 01693 setError("Unrecognized escape code!", T); 01694 return ""; 01695 } 01696 case '\r': 01697 case '\n': 01698 // Remove the new line. 01699 if ( UnquotedValue.size() > 1 01700 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 01701 UnquotedValue = UnquotedValue.substr(1); 01702 // If this was just a single byte newline, it will get skipped 01703 // below. 01704 break; 01705 case '0': 01706 Storage.push_back(0x00); 01707 break; 01708 case 'a': 01709 Storage.push_back(0x07); 01710 break; 01711 case 'b': 01712 Storage.push_back(0x08); 01713 break; 01714 case 't': 01715 case 0x09: 01716 Storage.push_back(0x09); 01717 break; 01718 case 'n': 01719 Storage.push_back(0x0A); 01720 break; 01721 case 'v': 01722 Storage.push_back(0x0B); 01723 break; 01724 case 'f': 01725 Storage.push_back(0x0C); 01726 break; 01727 case 'r': 01728 Storage.push_back(0x0D); 01729 break; 01730 case 'e': 01731 Storage.push_back(0x1B); 01732 break; 01733 case ' ': 01734 Storage.push_back(0x20); 01735 break; 01736 case '"': 01737 Storage.push_back(0x22); 01738 break; 01739 case '/': 01740 Storage.push_back(0x2F); 01741 break; 01742 case '\\': 01743 Storage.push_back(0x5C); 01744 break; 01745 case 'N': 01746 encodeUTF8(0x85, Storage); 01747 break; 01748 case '_': 01749 encodeUTF8(0xA0, Storage); 01750 break; 01751 case 'L': 01752 encodeUTF8(0x2028, Storage); 01753 break; 01754 case 'P': 01755 encodeUTF8(0x2029, Storage); 01756 break; 01757 case 'x': { 01758 if (UnquotedValue.size() < 3) 01759 // TODO: Report error. 01760 break; 01761 unsigned int UnicodeScalarValue; 01762 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 01763 // TODO: Report error. 01764 UnicodeScalarValue = 0xFFFD; 01765 encodeUTF8(UnicodeScalarValue, Storage); 01766 UnquotedValue = UnquotedValue.substr(2); 01767 break; 01768 } 01769 case 'u': { 01770 if (UnquotedValue.size() < 5) 01771 // TODO: Report error. 01772 break; 01773 unsigned int UnicodeScalarValue; 01774 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 01775 // TODO: Report error. 01776 UnicodeScalarValue = 0xFFFD; 01777 encodeUTF8(UnicodeScalarValue, Storage); 01778 UnquotedValue = UnquotedValue.substr(4); 01779 break; 01780 } 01781 case 'U': { 01782 if (UnquotedValue.size() < 9) 01783 // TODO: Report error. 01784 break; 01785 unsigned int UnicodeScalarValue; 01786 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 01787 // TODO: Report error. 01788 UnicodeScalarValue = 0xFFFD; 01789 encodeUTF8(UnicodeScalarValue, Storage); 01790 UnquotedValue = UnquotedValue.substr(8); 01791 break; 01792 } 01793 } 01794 UnquotedValue = UnquotedValue.substr(1); 01795 } 01796 } 01797 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 01798 return StringRef(Storage.begin(), Storage.size()); 01799 } 01800 01801 Node *KeyValueNode::getKey() { 01802 if (Key) 01803 return Key; 01804 // Handle implicit null keys. 01805 { 01806 Token &t = peekNext(); 01807 if ( t.Kind == Token::TK_BlockEnd 01808 || t.Kind == Token::TK_Value 01809 || t.Kind == Token::TK_Error) { 01810 return Key = new (getAllocator()) NullNode(Doc); 01811 } 01812 if (t.Kind == Token::TK_Key) 01813 getNext(); // skip TK_Key. 01814 } 01815 01816 // Handle explicit null keys. 01817 Token &t = peekNext(); 01818 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 01819 return Key = new (getAllocator()) NullNode(Doc); 01820 } 01821 01822 // We've got a normal key. 01823 return Key = parseBlockNode(); 01824 } 01825 01826 Node *KeyValueNode::getValue() { 01827 if (Value) 01828 return Value; 01829 getKey()->skip(); 01830 if (failed()) 01831 return Value = new (getAllocator()) NullNode(Doc); 01832 01833 // Handle implicit null values. 01834 { 01835 Token &t = peekNext(); 01836 if ( t.Kind == Token::TK_BlockEnd 01837 || t.Kind == Token::TK_FlowMappingEnd 01838 || t.Kind == Token::TK_Key 01839 || t.Kind == Token::TK_FlowEntry 01840 || t.Kind == Token::TK_Error) { 01841 return Value = new (getAllocator()) NullNode(Doc); 01842 } 01843 01844 if (t.Kind != Token::TK_Value) { 01845 setError("Unexpected token in Key Value.", t); 01846 return Value = new (getAllocator()) NullNode(Doc); 01847 } 01848 getNext(); // skip TK_Value. 01849 } 01850 01851 // Handle explicit null values. 01852 Token &t = peekNext(); 01853 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 01854 return Value = new (getAllocator()) NullNode(Doc); 01855 } 01856 01857 // We got a normal value. 01858 return Value = parseBlockNode(); 01859 } 01860 01861 void MappingNode::increment() { 01862 if (failed()) { 01863 IsAtEnd = true; 01864 CurrentEntry = 0; 01865 return; 01866 } 01867 if (CurrentEntry) { 01868 CurrentEntry->skip(); 01869 if (Type == MT_Inline) { 01870 IsAtEnd = true; 01871 CurrentEntry = 0; 01872 return; 01873 } 01874 } 01875 Token T = peekNext(); 01876 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 01877 // KeyValueNode eats the TK_Key. That way it can detect null keys. 01878 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 01879 } else if (Type == MT_Block) { 01880 switch (T.Kind) { 01881 case Token::TK_BlockEnd: 01882 getNext(); 01883 IsAtEnd = true; 01884 CurrentEntry = 0; 01885 break; 01886 default: 01887 setError("Unexpected token. Expected Key or Block End", T); 01888 case Token::TK_Error: 01889 IsAtEnd = true; 01890 CurrentEntry = 0; 01891 } 01892 } else { 01893 switch (T.Kind) { 01894 case Token::TK_FlowEntry: 01895 // Eat the flow entry and recurse. 01896 getNext(); 01897 return increment(); 01898 case Token::TK_FlowMappingEnd: 01899 getNext(); 01900 case Token::TK_Error: 01901 // Set this to end iterator. 01902 IsAtEnd = true; 01903 CurrentEntry = 0; 01904 break; 01905 default: 01906 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 01907 "Mapping End." 01908 , T); 01909 IsAtEnd = true; 01910 CurrentEntry = 0; 01911 } 01912 } 01913 } 01914 01915 void SequenceNode::increment() { 01916 if (failed()) { 01917 IsAtEnd = true; 01918 CurrentEntry = 0; 01919 return; 01920 } 01921 if (CurrentEntry) 01922 CurrentEntry->skip(); 01923 Token T = peekNext(); 01924 if (SeqType == ST_Block) { 01925 switch (T.Kind) { 01926 case Token::TK_BlockEntry: 01927 getNext(); 01928 CurrentEntry = parseBlockNode(); 01929 if (CurrentEntry == 0) { // An error occurred. 01930 IsAtEnd = true; 01931 CurrentEntry = 0; 01932 } 01933 break; 01934 case Token::TK_BlockEnd: 01935 getNext(); 01936 IsAtEnd = true; 01937 CurrentEntry = 0; 01938 break; 01939 default: 01940 setError( "Unexpected token. Expected Block Entry or Block End." 01941 , T); 01942 case Token::TK_Error: 01943 IsAtEnd = true; 01944 CurrentEntry = 0; 01945 } 01946 } else if (SeqType == ST_Indentless) { 01947 switch (T.Kind) { 01948 case Token::TK_BlockEntry: 01949 getNext(); 01950 CurrentEntry = parseBlockNode(); 01951 if (CurrentEntry == 0) { // An error occurred. 01952 IsAtEnd = true; 01953 CurrentEntry = 0; 01954 } 01955 break; 01956 default: 01957 case Token::TK_Error: 01958 IsAtEnd = true; 01959 CurrentEntry = 0; 01960 } 01961 } else if (SeqType == ST_Flow) { 01962 switch (T.Kind) { 01963 case Token::TK_FlowEntry: 01964 // Eat the flow entry and recurse. 01965 getNext(); 01966 WasPreviousTokenFlowEntry = true; 01967 return increment(); 01968 case Token::TK_FlowSequenceEnd: 01969 getNext(); 01970 case Token::TK_Error: 01971 // Set this to end iterator. 01972 IsAtEnd = true; 01973 CurrentEntry = 0; 01974 break; 01975 case Token::TK_StreamEnd: 01976 case Token::TK_DocumentEnd: 01977 case Token::TK_DocumentStart: 01978 setError("Could not find closing ]!", T); 01979 // Set this to end iterator. 01980 IsAtEnd = true; 01981 CurrentEntry = 0; 01982 break; 01983 default: 01984 if (!WasPreviousTokenFlowEntry) { 01985 setError("Expected , between entries!", T); 01986 IsAtEnd = true; 01987 CurrentEntry = 0; 01988 break; 01989 } 01990 // Otherwise it must be a flow entry. 01991 CurrentEntry = parseBlockNode(); 01992 if (!CurrentEntry) { 01993 IsAtEnd = true; 01994 } 01995 WasPreviousTokenFlowEntry = false; 01996 break; 01997 } 01998 } 01999 } 02000 02001 Document::Document(Stream &S) : stream(S), Root(0) { 02002 if (parseDirectives()) 02003 expectToken(Token::TK_DocumentStart); 02004 Token &T = peekNext(); 02005 if (T.Kind == Token::TK_DocumentStart) 02006 getNext(); 02007 } 02008 02009 bool Document::skip() { 02010 if (stream.scanner->failed()) 02011 return false; 02012 if (!Root) 02013 getRoot(); 02014 Root->skip(); 02015 Token &T = peekNext(); 02016 if (T.Kind == Token::TK_StreamEnd) 02017 return false; 02018 if (T.Kind == Token::TK_DocumentEnd) { 02019 getNext(); 02020 return skip(); 02021 } 02022 return true; 02023 } 02024 02025 Token &Document::peekNext() { 02026 return stream.scanner->peekNext(); 02027 } 02028 02029 Token Document::getNext() { 02030 return stream.scanner->getNext(); 02031 } 02032 02033 void Document::setError(const Twine &Message, Token &Location) const { 02034 stream.scanner->setError(Message, Location.Range.begin()); 02035 } 02036 02037 bool Document::failed() const { 02038 return stream.scanner->failed(); 02039 } 02040 02041 Node *Document::parseBlockNode() { 02042 Token T = peekNext(); 02043 // Handle properties. 02044 Token AnchorInfo; 02045 parse_property: 02046 switch (T.Kind) { 02047 case Token::TK_Alias: 02048 getNext(); 02049 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 02050 case Token::TK_Anchor: 02051 if (AnchorInfo.Kind == Token::TK_Anchor) { 02052 setError("Already encountered an anchor for this node!", T); 02053 return 0; 02054 } 02055 AnchorInfo = getNext(); // Consume TK_Anchor. 02056 T = peekNext(); 02057 goto parse_property; 02058 case Token::TK_Tag: 02059 getNext(); // Skip TK_Tag. 02060 T = peekNext(); 02061 goto parse_property; 02062 default: 02063 break; 02064 } 02065 02066 switch (T.Kind) { 02067 case Token::TK_BlockEntry: 02068 // We got an unindented BlockEntry sequence. This is not terminated with 02069 // a BlockEnd. 02070 // Don't eat the TK_BlockEntry, SequenceNode needs it. 02071 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 02072 , AnchorInfo.Range.substr(1) 02073 , SequenceNode::ST_Indentless); 02074 case Token::TK_BlockSequenceStart: 02075 getNext(); 02076 return new (NodeAllocator) 02077 SequenceNode( stream.CurrentDoc 02078 , AnchorInfo.Range.substr(1) 02079 , SequenceNode::ST_Block); 02080 case Token::TK_BlockMappingStart: 02081 getNext(); 02082 return new (NodeAllocator) 02083 MappingNode( stream.CurrentDoc 02084 , AnchorInfo.Range.substr(1) 02085 , MappingNode::MT_Block); 02086 case Token::TK_FlowSequenceStart: 02087 getNext(); 02088 return new (NodeAllocator) 02089 SequenceNode( stream.CurrentDoc 02090 , AnchorInfo.Range.substr(1) 02091 , SequenceNode::ST_Flow); 02092 case Token::TK_FlowMappingStart: 02093 getNext(); 02094 return new (NodeAllocator) 02095 MappingNode( stream.CurrentDoc 02096 , AnchorInfo.Range.substr(1) 02097 , MappingNode::MT_Flow); 02098 case Token::TK_Scalar: 02099 getNext(); 02100 return new (NodeAllocator) 02101 ScalarNode( stream.CurrentDoc 02102 , AnchorInfo.Range.substr(1) 02103 , T.Range); 02104 case Token::TK_Key: 02105 // Don't eat the TK_Key, KeyValueNode expects it. 02106 return new (NodeAllocator) 02107 MappingNode( stream.CurrentDoc 02108 , AnchorInfo.Range.substr(1) 02109 , MappingNode::MT_Inline); 02110 case Token::TK_DocumentStart: 02111 case Token::TK_DocumentEnd: 02112 case Token::TK_StreamEnd: 02113 default: 02114 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 02115 // !!null null. 02116 return new (NodeAllocator) NullNode(stream.CurrentDoc); 02117 case Token::TK_Error: 02118 return 0; 02119 } 02120 llvm_unreachable("Control flow shouldn't reach here."); 02121 return 0; 02122 } 02123 02124 bool Document::parseDirectives() { 02125 bool isDirective = false; 02126 while (true) { 02127 Token T = peekNext(); 02128 if (T.Kind == Token::TK_TagDirective) { 02129 handleTagDirective(getNext()); 02130 isDirective = true; 02131 } else if (T.Kind == Token::TK_VersionDirective) { 02132 stream.handleYAMLDirective(getNext()); 02133 isDirective = true; 02134 } else 02135 break; 02136 } 02137 return isDirective; 02138 } 02139 02140 bool Document::expectToken(int TK) { 02141 Token T = getNext(); 02142 if (T.Kind != TK) { 02143 setError("Unexpected token", T); 02144 return false; 02145 } 02146 return true; 02147 }