LLVM API Documentation

YAMLParser.cpp
Go to the documentation of this file.
00001 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 //  This file implements a YAML parser.
00011 //
00012 //===----------------------------------------------------------------------===//
00013 
00014 #include "llvm/Support/YAMLParser.h"
00015 #include "llvm/ADT/SmallVector.h"
00016 #include "llvm/ADT/StringExtras.h"
00017 #include "llvm/ADT/Twine.h"
00018 #include "llvm/ADT/ilist.h"
00019 #include "llvm/ADT/ilist_node.h"
00020 #include "llvm/Support/ErrorHandling.h"
00021 #include "llvm/Support/MemoryBuffer.h"
00022 #include "llvm/Support/SourceMgr.h"
00023 #include "llvm/Support/raw_ostream.h"
00024 
00025 using namespace llvm;
00026 using namespace yaml;
00027 
/// UnicodeEncodingForm - The Unicode encoding forms that getUnicodeEncoding
///                       can report for an input buffer.
enum UnicodeEncodingForm {
  /// UTF-32 Little Endian
  UEF_UTF32_LE,
  /// UTF-32 Big Endian
  UEF_UTF32_BE,
  /// UTF-16 Little Endian
  UEF_UTF16_LE,
  /// UTF-16 Big Endian
  UEF_UTF16_BE,
  /// UTF-8 or ascii.
  UEF_UTF8,
  /// Not a valid Unicode encoding.
  UEF_Unknown
};
00036 
00037 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
00038 ///                it exists. Length is in {0, 2, 3, 4}.
00039 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
00040 
00041 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
00042 ///                      encoding form of \a Input.
00043 ///
00044 /// @param Input A string of length 0 or more.
00045 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
00046 ///          and how long the byte order mark is if one exists.
00047 static EncodingInfo getUnicodeEncoding(StringRef Input) {
00048   if (Input.size() == 0)
00049     return std::make_pair(UEF_Unknown, 0);
00050 
00051   switch (uint8_t(Input[0])) {
00052   case 0x00:
00053     if (Input.size() >= 4) {
00054       if (  Input[1] == 0
00055          && uint8_t(Input[2]) == 0xFE
00056          && uint8_t(Input[3]) == 0xFF)
00057         return std::make_pair(UEF_UTF32_BE, 4);
00058       if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
00059         return std::make_pair(UEF_UTF32_BE, 0);
00060     }
00061 
00062     if (Input.size() >= 2 && Input[1] != 0)
00063       return std::make_pair(UEF_UTF16_BE, 0);
00064     return std::make_pair(UEF_Unknown, 0);
00065   case 0xFF:
00066     if (  Input.size() >= 4
00067        && uint8_t(Input[1]) == 0xFE
00068        && Input[2] == 0
00069        && Input[3] == 0)
00070       return std::make_pair(UEF_UTF32_LE, 4);
00071 
00072     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
00073       return std::make_pair(UEF_UTF16_LE, 2);
00074     return std::make_pair(UEF_Unknown, 0);
00075   case 0xFE:
00076     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
00077       return std::make_pair(UEF_UTF16_BE, 2);
00078     return std::make_pair(UEF_Unknown, 0);
00079   case 0xEF:
00080     if (  Input.size() >= 3
00081        && uint8_t(Input[1]) == 0xBB
00082        && uint8_t(Input[2]) == 0xBF)
00083       return std::make_pair(UEF_UTF8, 3);
00084     return std::make_pair(UEF_Unknown, 0);
00085   }
00086 
00087   // It could still be utf-32 or utf-16.
00088   if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
00089     return std::make_pair(UEF_UTF32_LE, 0);
00090 
00091   if (Input.size() >= 2 && Input[1] == 0)
00092     return std::make_pair(UEF_UTF16_LE, 0);
00093 
00094   return std::make_pair(UEF_UTF8, 0);
00095 }
00096 
00097 namespace llvm {
00098 namespace yaml {
00099 /// Token - A single YAML token.
00100 struct Token : ilist_node<Token> {
00101   enum TokenKind {
00102     TK_Error, // Uninitialized token.
00103     TK_StreamStart,
00104     TK_StreamEnd,
00105     TK_VersionDirective,
00106     TK_TagDirective,
00107     TK_DocumentStart,
00108     TK_DocumentEnd,
00109     TK_BlockEntry,
00110     TK_BlockEnd,
00111     TK_BlockSequenceStart,
00112     TK_BlockMappingStart,
00113     TK_FlowEntry,
00114     TK_FlowSequenceStart,
00115     TK_FlowSequenceEnd,
00116     TK_FlowMappingStart,
00117     TK_FlowMappingEnd,
00118     TK_Key,
00119     TK_Value,
00120     TK_Scalar,
00121     TK_Alias,
00122     TK_Anchor,
00123     TK_Tag
00124   } Kind;
00125 
00126   /// A string of length 0 or more whose begin() points to the logical location
00127   /// of the token in the input.
00128   StringRef Range;
00129 
00130   Token() : Kind(TK_Error) {}
00131 };
00132 }
00133 }
00134 
00135 namespace llvm {
00136 template<>
00137 struct ilist_sentinel_traits<Token> {
00138   Token *createSentinel() const {
00139     return &Sentinel;
00140   }
00141   static void destroySentinel(Token*) {}
00142 
00143   Token *provideInitialHead() const { return createSentinel(); }
00144   Token *ensureHead(Token*) const { return createSentinel(); }
00145   static void noteHead(Token*, Token*) {}
00146 
00147 private:
00148   mutable Token Sentinel;
00149 };
00150 
00151 template<>
00152 struct ilist_node_traits<Token> {
00153   Token *createNode(const Token &V) {
00154     return new (Alloc.Allocate<Token>()) Token(V);
00155   }
00156   static void deleteNode(Token *V) {}
00157 
00158   void addNodeToList(Token *) {}
00159   void removeNodeFromList(Token *) {}
00160   void transferNodesFromList(ilist_node_traits &    /*SrcTraits*/,
00161                              ilist_iterator<Token> /*first*/,
00162                              ilist_iterator<Token> /*last*/) {}
00163 
00164   BumpPtrAllocator Alloc;
00165 };
00166 }
00167 
00168 typedef ilist<Token> TokenQueueT;
00169 
00170 namespace {
00171 /// @brief This struct is used to track simple keys.
00172 ///
00173 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
00174 /// which could legally be the start of a simple key. When peekNext is called,
00175 /// if the Token To be returned is referenced by a SimpleKey, we continue
00176 /// tokenizing until that potential simple key has either been found to not be
00177 /// a simple key (we moved on to the next line or went further than 1024 chars).
00178 /// Or when we run into a Value, and then insert a Key token (and possibly
00179 /// others) before the SimpleKey's Tok.
00180 struct SimpleKey {
00181   TokenQueueT::iterator Tok;
00182   unsigned Column;
00183   unsigned Line;
00184   unsigned FlowLevel;
00185   bool IsRequired;
00186 
00187   bool operator ==(const SimpleKey &Other) {
00188     return Tok == Other.Tok;
00189   }
00190 };
00191 }
00192 
00193 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
00194 ///        subsequence and the subsequence's length in code units (uint8_t).
00195 ///        A length of 0 represents an error.
00196 typedef std::pair<uint32_t, unsigned> UTF8Decoded;
00197 
00198 static UTF8Decoded decodeUTF8(StringRef Range) {
00199   StringRef::iterator Position= Range.begin();
00200   StringRef::iterator End = Range.end();
00201   // 1 byte: [0x00, 0x7f]
00202   // Bit pattern: 0xxxxxxx
00203   if ((*Position & 0x80) == 0) {
00204      return std::make_pair(*Position, 1);
00205   }
00206   // 2 bytes: [0x80, 0x7ff]
00207   // Bit pattern: 110xxxxx 10xxxxxx
00208   if (Position + 1 != End &&
00209       ((*Position & 0xE0) == 0xC0) &&
00210       ((*(Position + 1) & 0xC0) == 0x80)) {
00211     uint32_t codepoint = ((*Position & 0x1F) << 6) |
00212                           (*(Position + 1) & 0x3F);
00213     if (codepoint >= 0x80)
00214       return std::make_pair(codepoint, 2);
00215   }
00216   // 3 bytes: [0x8000, 0xffff]
00217   // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
00218   if (Position + 2 != End &&
00219       ((*Position & 0xF0) == 0xE0) &&
00220       ((*(Position + 1) & 0xC0) == 0x80) &&
00221       ((*(Position + 2) & 0xC0) == 0x80)) {
00222     uint32_t codepoint = ((*Position & 0x0F) << 12) |
00223                          ((*(Position + 1) & 0x3F) << 6) |
00224                           (*(Position + 2) & 0x3F);
00225     // Codepoints between 0xD800 and 0xDFFF are invalid, as
00226     // they are high / low surrogate halves used by UTF-16.
00227     if (codepoint >= 0x800 &&
00228         (codepoint < 0xD800 || codepoint > 0xDFFF))
00229       return std::make_pair(codepoint, 3);
00230   }
00231   // 4 bytes: [0x10000, 0x10FFFF]
00232   // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00233   if (Position + 3 != End &&
00234       ((*Position & 0xF8) == 0xF0) &&
00235       ((*(Position + 1) & 0xC0) == 0x80) &&
00236       ((*(Position + 2) & 0xC0) == 0x80) &&
00237       ((*(Position + 3) & 0xC0) == 0x80)) {
00238     uint32_t codepoint = ((*Position & 0x07) << 18) |
00239                          ((*(Position + 1) & 0x3F) << 12) |
00240                          ((*(Position + 2) & 0x3F) << 6) |
00241                           (*(Position + 3) & 0x3F);
00242     if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
00243       return std::make_pair(codepoint, 4);
00244   }
00245   return std::make_pair(0, 0);
00246 }
00247 
00248 namespace llvm {
00249 namespace yaml {
00250 /// @brief Scans YAML tokens from a MemoryBuffer.
00251 class Scanner {
00252 public:
00253   Scanner(const StringRef Input, SourceMgr &SM);
00254   Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
00255 
00256   /// @brief Parse the next token and return it without popping it.
00257   Token &peekNext();
00258 
00259   /// @brief Parse the next token and pop it from the queue.
00260   Token getNext();
00261 
00262   void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
00263                   ArrayRef<SMRange> Ranges = None) {
00264     SM.PrintMessage(Loc, Kind, Message, Ranges);
00265   }
00266 
00267   void setError(const Twine &Message, StringRef::iterator Position) {
00268     if (Current >= End)
00269       Current = End - 1;
00270 
00271     // Don't print out more errors after the first one we encounter. The rest
00272     // are just the result of the first, and have no meaning.
00273     if (!Failed)
00274       printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
00275     Failed = true;
00276   }
00277 
00278   void setError(const Twine &Message) {
00279     setError(Message, Current);
00280   }
00281 
00282   /// @brief Returns true if an error occurred while parsing.
00283   bool failed() {
00284     return Failed;
00285   }
00286 
00287 private:
00288   StringRef currentInput() {
00289     return StringRef(Current, End - Current);
00290   }
00291 
00292   /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
00293   ///        at \a Position.
00294   ///
00295   /// If the UTF-8 code units starting at Position do not form a well-formed
00296   /// code unit subsequence, then the Unicode scalar value is 0, and the length
00297   /// is 0.
00298   UTF8Decoded decodeUTF8(StringRef::iterator Position) {
00299     return ::decodeUTF8(StringRef(Position, End - Position));
00300   }
00301 
00302   // The following functions are based on the gramar rules in the YAML spec. The
00303   // style of the function names it meant to closely match how they are written
00304   // in the spec. The number within the [] is the number of the grammar rule in
00305   // the spec.
00306   //
00307   // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
00308   //
00309   // c-
00310   //   A production starting and ending with a special character.
00311   // b-
00312   //   A production matching a single line break.
00313   // nb-
00314   //   A production starting and ending with a non-break character.
00315   // s-
00316   //   A production starting and ending with a white space character.
00317   // ns-
00318   //   A production starting and ending with a non-space character.
00319   // l-
00320   //   A production matching complete line(s).
00321 
00322   /// @brief Skip a single nb-char[27] starting at Position.
00323   ///
00324   /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
00325   ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
00326   ///
00327   /// @returns The code unit after the nb-char, or Position if it's not an
00328   ///          nb-char.
00329   StringRef::iterator skip_nb_char(StringRef::iterator Position);
00330 
00331   /// @brief Skip a single b-break[28] starting at Position.
00332   ///
00333   /// A b-break is 0xD 0xA | 0xD | 0xA
00334   ///
00335   /// @returns The code unit after the b-break, or Position if it's not a
00336   ///          b-break.
00337   StringRef::iterator skip_b_break(StringRef::iterator Position);
00338 
00339   /// @brief Skip a single s-white[33] starting at Position.
00340   ///
00341   /// A s-white is 0x20 | 0x9
00342   ///
00343   /// @returns The code unit after the s-white, or Position if it's not a
00344   ///          s-white.
00345   StringRef::iterator skip_s_white(StringRef::iterator Position);
00346 
00347   /// @brief Skip a single ns-char[34] starting at Position.
00348   ///
00349   /// A ns-char is nb-char - s-white
00350   ///
00351   /// @returns The code unit after the ns-char, or Position if it's not a
00352   ///          ns-char.
00353   StringRef::iterator skip_ns_char(StringRef::iterator Position);
00354 
00355   typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
00356   /// @brief Skip minimal well-formed code unit subsequences until Func
00357   ///        returns its input.
00358   ///
00359   /// @returns The code unit after the last minimal well-formed code unit
00360   ///          subsequence that Func accepted.
00361   StringRef::iterator skip_while( SkipWhileFunc Func
00362                                 , StringRef::iterator Position);
00363 
00364   /// @brief Scan ns-uri-char[39]s starting at Cur.
00365   ///
00366   /// This updates Cur and Column while scanning.
00367   ///
00368   /// @returns A StringRef starting at Cur which covers the longest contiguous
00369   ///          sequence of ns-uri-char.
00370   StringRef scan_ns_uri_char();
00371 
00372   /// @brief Scan ns-plain-one-line[133] starting at \a Cur.
00373   StringRef scan_ns_plain_one_line();
00374 
00375   /// @brief Consume a minimal well-formed code unit subsequence starting at
00376   ///        \a Cur. Return false if it is not the same Unicode scalar value as
00377   ///        \a Expected. This updates \a Column.
00378   bool consume(uint32_t Expected);
00379 
00380   /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
00381   void skip(uint32_t Distance);
00382 
00383   /// @brief Return true if the minimal well-formed code unit subsequence at
00384   ///        Pos is whitespace or a new line
00385   bool isBlankOrBreak(StringRef::iterator Position);
00386 
00387   /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
00388   void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
00389                              , unsigned AtColumn
00390                              , bool IsRequired);
00391 
00392   /// @brief Remove simple keys that can no longer be valid simple keys.
00393   ///
00394   /// Invalid simple keys are not on the current line or are further than 1024
00395   /// columns back.
00396   void removeStaleSimpleKeyCandidates();
00397 
00398   /// @brief Remove all simple keys on FlowLevel \a Level.
00399   void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
00400 
00401   /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
00402   ///        tokens if needed.
00403   bool unrollIndent(int ToColumn);
00404 
00405   /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
00406   ///        if needed.
00407   bool rollIndent( int ToColumn
00408                  , Token::TokenKind Kind
00409                  , TokenQueueT::iterator InsertPoint);
00410 
00411   /// @brief Skip whitespace and comments until the start of the next token.
00412   void scanToNextToken();
00413 
00414   /// @brief Must be the first token generated.
00415   bool scanStreamStart();
00416 
00417   /// @brief Generate tokens needed to close out the stream.
00418   bool scanStreamEnd();
00419 
00420   /// @brief Scan a %BLAH directive.
00421   bool scanDirective();
00422 
00423   /// @brief Scan a ... or ---.
00424   bool scanDocumentIndicator(bool IsStart);
00425 
00426   /// @brief Scan a [ or { and generate the proper flow collection start token.
00427   bool scanFlowCollectionStart(bool IsSequence);
00428 
00429   /// @brief Scan a ] or } and generate the proper flow collection end token.
00430   bool scanFlowCollectionEnd(bool IsSequence);
00431 
00432   /// @brief Scan the , that separates entries in a flow collection.
00433   bool scanFlowEntry();
00434 
00435   /// @brief Scan the - that starts block sequence entries.
00436   bool scanBlockEntry();
00437 
00438   /// @brief Scan an explicit ? indicating a key.
00439   bool scanKey();
00440 
00441   /// @brief Scan an explicit : indicating a value.
00442   bool scanValue();
00443 
00444   /// @brief Scan a quoted scalar.
00445   bool scanFlowScalar(bool IsDoubleQuoted);
00446 
00447   /// @brief Scan an unquoted scalar.
00448   bool scanPlainScalar();
00449 
00450   /// @brief Scan an Alias or Anchor starting with * or &.
00451   bool scanAliasOrAnchor(bool IsAlias);
00452 
00453   /// @brief Scan a block scalar starting with | or >.
00454   bool scanBlockScalar(bool IsLiteral);
00455 
00456   /// @brief Scan a tag of the form !stuff.
00457   bool scanTag();
00458 
00459   /// @brief Dispatch to the next scanning function based on \a *Cur.
00460   bool fetchMoreTokens();
00461 
00462   /// @brief The SourceMgr used for diagnostics and buffer management.
00463   SourceMgr &SM;
00464 
00465   /// @brief The original input.
00466   MemoryBuffer *InputBuffer;
00467 
00468   /// @brief The current position of the scanner.
00469   StringRef::iterator Current;
00470 
00471   /// @brief The end of the input (one past the last character).
00472   StringRef::iterator End;
00473 
00474   /// @brief Current YAML indentation level in spaces.
00475   int Indent;
00476 
00477   /// @brief Current column number in Unicode code points.
00478   unsigned Column;
00479 
00480   /// @brief Current line number.
00481   unsigned Line;
00482 
00483   /// @brief How deep we are in flow style containers. 0 Means at block level.
00484   unsigned FlowLevel;
00485 
00486   /// @brief Are we at the start of the stream?
00487   bool IsStartOfStream;
00488 
00489   /// @brief Can the next token be the start of a simple key?
00490   bool IsSimpleKeyAllowed;
00491 
00492   /// @brief True if an error has occurred.
00493   bool Failed;
00494 
00495   /// @brief Queue of tokens. This is required to queue up tokens while looking
00496   ///        for the end of a simple key. And for cases where a single character
00497   ///        can produce multiple tokens (e.g. BlockEnd).
00498   TokenQueueT TokenQueue;
00499 
00500   /// @brief Indentation levels.
00501   SmallVector<int, 4> Indents;
00502 
00503   /// @brief Potential simple keys.
00504   SmallVector<SimpleKey, 4> SimpleKeys;
00505 };
00506 
00507 } // end namespace yaml
00508 } // end namespace llvm
00509 
00510 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
00511 static void encodeUTF8( uint32_t UnicodeScalarValue
00512                       , SmallVectorImpl<char> &Result) {
00513   if (UnicodeScalarValue <= 0x7F) {
00514     Result.push_back(UnicodeScalarValue & 0x7F);
00515   } else if (UnicodeScalarValue <= 0x7FF) {
00516     uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
00517     uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
00518     Result.push_back(FirstByte);
00519     Result.push_back(SecondByte);
00520   } else if (UnicodeScalarValue <= 0xFFFF) {
00521     uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
00522     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
00523     uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
00524     Result.push_back(FirstByte);
00525     Result.push_back(SecondByte);
00526     Result.push_back(ThirdByte);
00527   } else if (UnicodeScalarValue <= 0x10FFFF) {
00528     uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
00529     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
00530     uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
00531     uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
00532     Result.push_back(FirstByte);
00533     Result.push_back(SecondByte);
00534     Result.push_back(ThirdByte);
00535     Result.push_back(FourthByte);
00536   }
00537 }
00538 
00539 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
00540   SourceMgr SM;
00541   Scanner scanner(Input, SM);
00542   while (true) {
00543     Token T = scanner.getNext();
00544     switch (T.Kind) {
00545     case Token::TK_StreamStart:
00546       OS << "Stream-Start: ";
00547       break;
00548     case Token::TK_StreamEnd:
00549       OS << "Stream-End: ";
00550       break;
00551     case Token::TK_VersionDirective:
00552       OS << "Version-Directive: ";
00553       break;
00554     case Token::TK_TagDirective:
00555       OS << "Tag-Directive: ";
00556       break;
00557     case Token::TK_DocumentStart:
00558       OS << "Document-Start: ";
00559       break;
00560     case Token::TK_DocumentEnd:
00561       OS << "Document-End: ";
00562       break;
00563     case Token::TK_BlockEntry:
00564       OS << "Block-Entry: ";
00565       break;
00566     case Token::TK_BlockEnd:
00567       OS << "Block-End: ";
00568       break;
00569     case Token::TK_BlockSequenceStart:
00570       OS << "Block-Sequence-Start: ";
00571       break;
00572     case Token::TK_BlockMappingStart:
00573       OS << "Block-Mapping-Start: ";
00574       break;
00575     case Token::TK_FlowEntry:
00576       OS << "Flow-Entry: ";
00577       break;
00578     case Token::TK_FlowSequenceStart:
00579       OS << "Flow-Sequence-Start: ";
00580       break;
00581     case Token::TK_FlowSequenceEnd:
00582       OS << "Flow-Sequence-End: ";
00583       break;
00584     case Token::TK_FlowMappingStart:
00585       OS << "Flow-Mapping-Start: ";
00586       break;
00587     case Token::TK_FlowMappingEnd:
00588       OS << "Flow-Mapping-End: ";
00589       break;
00590     case Token::TK_Key:
00591       OS << "Key: ";
00592       break;
00593     case Token::TK_Value:
00594       OS << "Value: ";
00595       break;
00596     case Token::TK_Scalar:
00597       OS << "Scalar: ";
00598       break;
00599     case Token::TK_Alias:
00600       OS << "Alias: ";
00601       break;
00602     case Token::TK_Anchor:
00603       OS << "Anchor: ";
00604       break;
00605     case Token::TK_Tag:
00606       OS << "Tag: ";
00607       break;
00608     case Token::TK_Error:
00609       break;
00610     }
00611     OS << T.Range << "\n";
00612     if (T.Kind == Token::TK_StreamEnd)
00613       break;
00614     else if (T.Kind == Token::TK_Error)
00615       return false;
00616   }
00617   return true;
00618 }
00619 
00620 bool yaml::scanTokens(StringRef Input) {
00621   llvm::SourceMgr SM;
00622   llvm::yaml::Scanner scanner(Input, SM);
00623   for (;;) {
00624     llvm::yaml::Token T = scanner.getNext();
00625     if (T.Kind == Token::TK_StreamEnd)
00626       break;
00627     else if (T.Kind == Token::TK_Error)
00628       return false;
00629   }
00630   return true;
00631 }
00632 
00633 std::string yaml::escape(StringRef Input) {
00634   std::string EscapedInput;
00635   for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
00636     if (*i == '\\')
00637       EscapedInput += "\\\\";
00638     else if (*i == '"')
00639       EscapedInput += "\\\"";
00640     else if (*i == 0)
00641       EscapedInput += "\\0";
00642     else if (*i == 0x07)
00643       EscapedInput += "\\a";
00644     else if (*i == 0x08)
00645       EscapedInput += "\\b";
00646     else if (*i == 0x09)
00647       EscapedInput += "\\t";
00648     else if (*i == 0x0A)
00649       EscapedInput += "\\n";
00650     else if (*i == 0x0B)
00651       EscapedInput += "\\v";
00652     else if (*i == 0x0C)
00653       EscapedInput += "\\f";
00654     else if (*i == 0x0D)
00655       EscapedInput += "\\r";
00656     else if (*i == 0x1B)
00657       EscapedInput += "\\e";
00658     else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
00659       std::string HexStr = utohexstr(*i);
00660       EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
00661     } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
00662       UTF8Decoded UnicodeScalarValue
00663         = decodeUTF8(StringRef(i, Input.end() - i));
00664       if (UnicodeScalarValue.second == 0) {
00665         // Found invalid char.
00666         SmallString<4> Val;
00667         encodeUTF8(0xFFFD, Val);
00668         EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
00669         // FIXME: Error reporting.
00670         return EscapedInput;
00671       }
00672       if (UnicodeScalarValue.first == 0x85)
00673         EscapedInput += "\\N";
00674       else if (UnicodeScalarValue.first == 0xA0)
00675         EscapedInput += "\\_";
00676       else if (UnicodeScalarValue.first == 0x2028)
00677         EscapedInput += "\\L";
00678       else if (UnicodeScalarValue.first == 0x2029)
00679         EscapedInput += "\\P";
00680       else {
00681         std::string HexStr = utohexstr(UnicodeScalarValue.first);
00682         if (HexStr.size() <= 2)
00683           EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
00684         else if (HexStr.size() <= 4)
00685           EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
00686         else if (HexStr.size() <= 8)
00687           EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
00688       }
00689       i += UnicodeScalarValue.second - 1;
00690     } else
00691       EscapedInput.push_back(*i);
00692   }
00693   return EscapedInput;
00694 }
00695 
00696 Scanner::Scanner(StringRef Input, SourceMgr &sm)
00697   : SM(sm)
00698   , Indent(-1)
00699   , Column(0)
00700   , Line(0)
00701   , FlowLevel(0)
00702   , IsStartOfStream(true)
00703   , IsSimpleKeyAllowed(true)
00704   , Failed(false) {
00705   InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
00706   SM.AddNewSourceBuffer(InputBuffer, SMLoc());
00707   Current = InputBuffer->getBufferStart();
00708   End = InputBuffer->getBufferEnd();
00709 }
00710 
00711 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_)
00712   : SM(SM_)
00713   , InputBuffer(Buffer)
00714   , Current(InputBuffer->getBufferStart())
00715   , End(InputBuffer->getBufferEnd())
00716   , Indent(-1)
00717   , Column(0)
00718   , Line(0)
00719   , FlowLevel(0)
00720   , IsStartOfStream(true)
00721   , IsSimpleKeyAllowed(true)
00722   , Failed(false) {
00723     SM.AddNewSourceBuffer(InputBuffer, SMLoc());
00724 }
00725 
00726 Token &Scanner::peekNext() {
00727   // If the current token is a possible simple key, keep parsing until we
00728   // can confirm.
00729   bool NeedMore = false;
00730   while (true) {
00731     if (TokenQueue.empty() || NeedMore) {
00732       if (!fetchMoreTokens()) {
00733         TokenQueue.clear();
00734         TokenQueue.push_back(Token());
00735         return TokenQueue.front();
00736       }
00737     }
00738     assert(!TokenQueue.empty() &&
00739             "fetchMoreTokens lied about getting tokens!");
00740 
00741     removeStaleSimpleKeyCandidates();
00742     SimpleKey SK;
00743     SK.Tok = TokenQueue.front();
00744     if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
00745         == SimpleKeys.end())
00746       break;
00747     else
00748       NeedMore = true;
00749   }
00750   return TokenQueue.front();
00751 }
00752 
00753 Token Scanner::getNext() {
00754   Token Ret = peekNext();
00755   // TokenQueue can be empty if there was an error getting the next token.
00756   if (!TokenQueue.empty())
00757     TokenQueue.pop_front();
00758 
00759   // There cannot be any referenced Token's if the TokenQueue is empty. So do a
00760   // quick deallocation of them all.
00761   if (TokenQueue.empty()) {
00762     TokenQueue.Alloc.Reset();
00763   }
00764 
00765   return Ret;
00766 }
00767 
00768 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
00769   if (Position == End)
00770     return Position;
00771   // Check 7 bit c-printable - b-char.
00772   if (   *Position == 0x09
00773       || (*Position >= 0x20 && *Position <= 0x7E))
00774     return Position + 1;
00775 
00776   // Check for valid UTF-8.
00777   if (uint8_t(*Position) & 0x80) {
00778     UTF8Decoded u8d = decodeUTF8(Position);
00779     if (   u8d.second != 0
00780         && u8d.first != 0xFEFF
00781         && ( u8d.first == 0x85
00782           || ( u8d.first >= 0xA0
00783             && u8d.first <= 0xD7FF)
00784           || ( u8d.first >= 0xE000
00785             && u8d.first <= 0xFFFD)
00786           || ( u8d.first >= 0x10000
00787             && u8d.first <= 0x10FFFF)))
00788       return Position + u8d.second;
00789   }
00790   return Position;
00791 }
00792 
00793 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
00794   if (Position == End)
00795     return Position;
00796   if (*Position == 0x0D) {
00797     if (Position + 1 != End && *(Position + 1) == 0x0A)
00798       return Position + 2;
00799     return Position + 1;
00800   }
00801 
00802   if (*Position == 0x0A)
00803     return Position + 1;
00804   return Position;
00805 }
00806 
00807 
00808 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
00809   if (Position == End)
00810     return Position;
00811   if (*Position == ' ' || *Position == '\t')
00812     return Position + 1;
00813   return Position;
00814 }
00815 
00816 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
00817   if (Position == End)
00818     return Position;
00819   if (*Position == ' ' || *Position == '\t')
00820     return Position;
00821   return skip_nb_char(Position);
00822 }
00823 
00824 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
00825                                        , StringRef::iterator Position) {
00826   while (true) {
00827     StringRef::iterator i = (this->*Func)(Position);
00828     if (i == Position)
00829       break;
00830     Position = i;
00831   }
00832   return Position;
00833 }
00834 
/// Returns true if \a C is an ns-hex-digit: a decimal digit or one of the
/// letters a-f / A-F.
static bool is_ns_hex_digit(const char C) {
  return    (C >= '0' && C <= '9')
         || (C >= 'a' && C <= 'f')
         || (C >= 'A' && C <= 'F');
}
00840 
/// Returns true if \a C is an ns-word-char: a decimal digit, an ASCII letter
/// or '-'.
static bool is_ns_word_char(const char C) {
  return    C == '-'
         || (C >= '0' && C <= '9')
         || (C >= 'a' && C <= 'z')
         || (C >= 'A' && C <= 'Z');
}
00846 
00847 StringRef Scanner::scan_ns_uri_char() {
00848   StringRef::iterator Start = Current;
00849   while (true) {
00850     if (Current == End)
00851       break;
00852     if ((   *Current == '%'
00853           && Current + 2 < End
00854           && is_ns_hex_digit(*(Current + 1))
00855           && is_ns_hex_digit(*(Current + 2)))
00856         || is_ns_word_char(*Current)
00857         || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
00858           != StringRef::npos) {
00859       ++Current;
00860       ++Column;
00861     } else
00862       break;
00863   }
00864   return StringRef(Start, Current - Start);
00865 }
00866 
00867 StringRef Scanner::scan_ns_plain_one_line() {
00868   StringRef::iterator start = Current;
00869   // The first character must already be verified.
00870   ++Current;
00871   while (true) {
00872     if (Current == End) {
00873       break;
00874     } else if (*Current == ':') {
00875       // Check if the next character is a ns-char.
00876       if (Current + 1 == End)
00877         break;
00878       StringRef::iterator i = skip_ns_char(Current + 1);
00879       if (Current + 1 != i) {
00880         Current = i;
00881         Column += 2; // Consume both the ':' and ns-char.
00882       } else
00883         break;
00884     } else if (*Current == '#') {
00885       // Check if the previous character was a ns-char.
00886       // The & 0x80 check is to check for the trailing byte of a utf-8
00887       if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) {
00888         ++Current;
00889         ++Column;
00890       } else
00891         break;
00892     } else {
00893       StringRef::iterator i = skip_nb_char(Current);
00894       if (i == Current)
00895         break;
00896       Current = i;
00897       ++Column;
00898     }
00899   }
00900   return StringRef(start, Current - start);
00901 }
00902 
00903 bool Scanner::consume(uint32_t Expected) {
00904   if (Expected >= 0x80)
00905     report_fatal_error("Not dealing with this yet");
00906   if (Current == End)
00907     return false;
00908   if (uint8_t(*Current) >= 0x80)
00909     report_fatal_error("Not dealing with this yet");
00910   if (uint8_t(*Current) == Expected) {
00911     ++Current;
00912     ++Column;
00913     return true;
00914   }
00915   return false;
00916 }
00917 
00918 void Scanner::skip(uint32_t Distance) {
00919   Current += Distance;
00920   Column += Distance;
00921   assert(Current <= End && "Skipped past the end");
00922 }
00923 
00924 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
00925   if (Position == End)
00926     return false;
00927   if (   *Position == ' ' || *Position == '\t'
00928       || *Position == '\r' || *Position == '\n')
00929     return true;
00930   return false;
00931 }
00932 
00933 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
00934                                     , unsigned AtColumn
00935                                     , bool IsRequired) {
00936   if (IsSimpleKeyAllowed) {
00937     SimpleKey SK;
00938     SK.Tok = Tok;
00939     SK.Line = Line;
00940     SK.Column = AtColumn;
00941     SK.IsRequired = IsRequired;
00942     SK.FlowLevel = FlowLevel;
00943     SimpleKeys.push_back(SK);
00944   }
00945 }
00946 
00947 void Scanner::removeStaleSimpleKeyCandidates() {
00948   for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
00949                                             i != SimpleKeys.end();) {
00950     if (i->Line != Line || i->Column + 1024 < Column) {
00951       if (i->IsRequired)
00952         setError( "Could not find expected : for simple key"
00953                 , i->Tok->Range.begin());
00954       i = SimpleKeys.erase(i);
00955     } else
00956       ++i;
00957   }
00958 }
00959 
00960 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
00961   if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
00962     SimpleKeys.pop_back();
00963 }
00964 
00965 bool Scanner::unrollIndent(int ToColumn) {
00966   Token T;
00967   // Indentation is ignored in flow.
00968   if (FlowLevel != 0)
00969     return true;
00970 
00971   while (Indent > ToColumn) {
00972     T.Kind = Token::TK_BlockEnd;
00973     T.Range = StringRef(Current, 1);
00974     TokenQueue.push_back(T);
00975     Indent = Indents.pop_back_val();
00976   }
00977 
00978   return true;
00979 }
00980 
00981 bool Scanner::rollIndent( int ToColumn
00982                         , Token::TokenKind Kind
00983                         , TokenQueueT::iterator InsertPoint) {
00984   if (FlowLevel)
00985     return true;
00986   if (Indent < ToColumn) {
00987     Indents.push_back(Indent);
00988     Indent = ToColumn;
00989 
00990     Token T;
00991     T.Kind = Kind;
00992     T.Range = StringRef(Current, 0);
00993     TokenQueue.insert(InsertPoint, T);
00994   }
00995   return true;
00996 }
00997 
00998 void Scanner::scanToNextToken() {
00999   while (true) {
01000     while (*Current == ' ' || *Current == '\t') {
01001       skip(1);
01002     }
01003 
01004     // Skip comment.
01005     if (*Current == '#') {
01006       while (true) {
01007         // This may skip more than one byte, thus Column is only incremented
01008         // for code points.
01009         StringRef::iterator i = skip_nb_char(Current);
01010         if (i == Current)
01011           break;
01012         Current = i;
01013         ++Column;
01014       }
01015     }
01016 
01017     // Skip EOL.
01018     StringRef::iterator i = skip_b_break(Current);
01019     if (i == Current)
01020       break;
01021     Current = i;
01022     ++Line;
01023     Column = 0;
01024     // New lines may start a simple key.
01025     if (!FlowLevel)
01026       IsSimpleKeyAllowed = true;
01027   }
01028 }
01029 
01030 bool Scanner::scanStreamStart() {
01031   IsStartOfStream = false;
01032 
01033   EncodingInfo EI = getUnicodeEncoding(currentInput());
01034 
01035   Token T;
01036   T.Kind = Token::TK_StreamStart;
01037   T.Range = StringRef(Current, EI.second);
01038   TokenQueue.push_back(T);
01039   Current += EI.second;
01040   return true;
01041 }
01042 
01043 bool Scanner::scanStreamEnd() {
01044   // Force an ending new line if one isn't present.
01045   if (Column != 0) {
01046     Column = 0;
01047     ++Line;
01048   }
01049 
01050   unrollIndent(-1);
01051   SimpleKeys.clear();
01052   IsSimpleKeyAllowed = false;
01053 
01054   Token T;
01055   T.Kind = Token::TK_StreamEnd;
01056   T.Range = StringRef(Current, 0);
01057   TokenQueue.push_back(T);
01058   return true;
01059 }
01060 
01061 bool Scanner::scanDirective() {
01062   // Reset the indentation level.
01063   unrollIndent(-1);
01064   SimpleKeys.clear();
01065   IsSimpleKeyAllowed = false;
01066 
01067   StringRef::iterator Start = Current;
01068   consume('%');
01069   StringRef::iterator NameStart = Current;
01070   Current = skip_while(&Scanner::skip_ns_char, Current);
01071   StringRef Name(NameStart, Current - NameStart);
01072   Current = skip_while(&Scanner::skip_s_white, Current);
01073 
01074   if (Name == "YAML") {
01075     Current = skip_while(&Scanner::skip_ns_char, Current);
01076     Token T;
01077     T.Kind = Token::TK_VersionDirective;
01078     T.Range = StringRef(Start, Current - Start);
01079     TokenQueue.push_back(T);
01080     return true;
01081   }
01082   return false;
01083 }
01084 
01085 bool Scanner::scanDocumentIndicator(bool IsStart) {
01086   unrollIndent(-1);
01087   SimpleKeys.clear();
01088   IsSimpleKeyAllowed = false;
01089 
01090   Token T;
01091   T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
01092   T.Range = StringRef(Current, 3);
01093   skip(3);
01094   TokenQueue.push_back(T);
01095   return true;
01096 }
01097 
01098 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
01099   Token T;
01100   T.Kind = IsSequence ? Token::TK_FlowSequenceStart
01101                       : Token::TK_FlowMappingStart;
01102   T.Range = StringRef(Current, 1);
01103   skip(1);
01104   TokenQueue.push_back(T);
01105 
01106   // [ and { may begin a simple key.
01107   saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
01108 
01109   // And may also be followed by a simple key.
01110   IsSimpleKeyAllowed = true;
01111   ++FlowLevel;
01112   return true;
01113 }
01114 
01115 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
01116   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
01117   IsSimpleKeyAllowed = false;
01118   Token T;
01119   T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
01120                       : Token::TK_FlowMappingEnd;
01121   T.Range = StringRef(Current, 1);
01122   skip(1);
01123   TokenQueue.push_back(T);
01124   if (FlowLevel)
01125     --FlowLevel;
01126   return true;
01127 }
01128 
01129 bool Scanner::scanFlowEntry() {
01130   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
01131   IsSimpleKeyAllowed = true;
01132   Token T;
01133   T.Kind = Token::TK_FlowEntry;
01134   T.Range = StringRef(Current, 1);
01135   skip(1);
01136   TokenQueue.push_back(T);
01137   return true;
01138 }
01139 
01140 bool Scanner::scanBlockEntry() {
01141   rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
01142   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
01143   IsSimpleKeyAllowed = true;
01144   Token T;
01145   T.Kind = Token::TK_BlockEntry;
01146   T.Range = StringRef(Current, 1);
01147   skip(1);
01148   TokenQueue.push_back(T);
01149   return true;
01150 }
01151 
01152 bool Scanner::scanKey() {
01153   if (!FlowLevel)
01154     rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
01155 
01156   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
01157   IsSimpleKeyAllowed = !FlowLevel;
01158 
01159   Token T;
01160   T.Kind = Token::TK_Key;
01161   T.Range = StringRef(Current, 1);
01162   skip(1);
01163   TokenQueue.push_back(T);
01164   return true;
01165 }
01166 
01167 bool Scanner::scanValue() {
01168   // If the previous token could have been a simple key, insert the key token
01169   // into the token queue.
01170   if (!SimpleKeys.empty()) {
01171     SimpleKey SK = SimpleKeys.pop_back_val();
01172     Token T;
01173     T.Kind = Token::TK_Key;
01174     T.Range = SK.Tok->Range;
01175     TokenQueueT::iterator i, e;
01176     for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
01177       if (i == SK.Tok)
01178         break;
01179     }
01180     assert(i != e && "SimpleKey not in token queue!");
01181     i = TokenQueue.insert(i, T);
01182 
01183     // We may also need to add a Block-Mapping-Start token.
01184     rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
01185 
01186     IsSimpleKeyAllowed = false;
01187   } else {
01188     if (!FlowLevel)
01189       rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
01190     IsSimpleKeyAllowed = !FlowLevel;
01191   }
01192 
01193   Token T;
01194   T.Kind = Token::TK_Value;
01195   T.Range = StringRef(Current, 1);
01196   skip(1);
01197   TokenQueue.push_back(T);
01198   return true;
01199 }
01200 
01201 // Forbidding inlining improves performance by roughly 20%.
01202 // FIXME: Remove once llvm optimizes this to the faster version without hints.
01203 LLVM_ATTRIBUTE_NOINLINE static bool
01204 wasEscaped(StringRef::iterator First, StringRef::iterator Position);
01205 
01206 // Returns whether a character at 'Position' was escaped with a leading '\'.
01207 // 'First' specifies the position of the first character in the string.
01208 static bool wasEscaped(StringRef::iterator First,
01209                        StringRef::iterator Position) {
01210   assert(Position - 1 >= First);
01211   StringRef::iterator I = Position - 1;
01212   // We calculate the number of consecutive '\'s before the current position
01213   // by iterating backwards through our string.
01214   while (I >= First && *I == '\\') --I;
01215   // (Position - 1 - I) now contains the number of '\'s before the current
01216   // position. If it is odd, the character at 'Position' was escaped.
01217   return (Position - 1 - I) % 2 == 1;
01218 }
01219 
01220 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
01221   StringRef::iterator Start = Current;
01222   unsigned ColStart = Column;
01223   if (IsDoubleQuoted) {
01224     do {
01225       ++Current;
01226       while (Current != End && *Current != '"')
01227         ++Current;
01228       // Repeat until the previous character was not a '\' or was an escaped
01229       // backslash.
01230     } while (   Current != End
01231              && *(Current - 1) == '\\'
01232              && wasEscaped(Start + 1, Current));
01233   } else {
01234     skip(1);
01235     while (true) {
01236       // Skip a ' followed by another '.
01237       if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
01238         skip(2);
01239         continue;
01240       } else if (*Current == '\'')
01241         break;
01242       StringRef::iterator i = skip_nb_char(Current);
01243       if (i == Current) {
01244         i = skip_b_break(Current);
01245         if (i == Current)
01246           break;
01247         Current = i;
01248         Column = 0;
01249         ++Line;
01250       } else {
01251         if (i == End)
01252           break;
01253         Current = i;
01254         ++Column;
01255       }
01256     }
01257   }
01258 
01259   if (Current == End) {
01260     setError("Expected quote at end of scalar", Current);
01261     return false;
01262   }
01263 
01264   skip(1); // Skip ending quote.
01265   Token T;
01266   T.Kind = Token::TK_Scalar;
01267   T.Range = StringRef(Start, Current - Start);
01268   TokenQueue.push_back(T);
01269 
01270   saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
01271 
01272   IsSimpleKeyAllowed = false;
01273 
01274   return true;
01275 }
01276 
01277 bool Scanner::scanPlainScalar() {
01278   StringRef::iterator Start = Current;
01279   unsigned ColStart = Column;
01280   unsigned LeadingBlanks = 0;
01281   assert(Indent >= -1 && "Indent must be >= -1 !");
01282   unsigned indent = static_cast<unsigned>(Indent + 1);
01283   while (true) {
01284     if (*Current == '#')
01285       break;
01286 
01287     while (!isBlankOrBreak(Current)) {
01288       if (  FlowLevel && *Current == ':'
01289           && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
01290         setError("Found unexpected ':' while scanning a plain scalar", Current);
01291         return false;
01292       }
01293 
01294       // Check for the end of the plain scalar.
01295       if (  (*Current == ':' && isBlankOrBreak(Current + 1))
01296           || (  FlowLevel
01297           && (StringRef(Current, 1).find_first_of(",:?[]{}")
01298               != StringRef::npos)))
01299         break;
01300 
01301       StringRef::iterator i = skip_nb_char(Current);
01302       if (i == Current)
01303         break;
01304       Current = i;
01305       ++Column;
01306     }
01307 
01308     // Are we at the end?
01309     if (!isBlankOrBreak(Current))
01310       break;
01311 
01312     // Eat blanks.
01313     StringRef::iterator Tmp = Current;
01314     while (isBlankOrBreak(Tmp)) {
01315       StringRef::iterator i = skip_s_white(Tmp);
01316       if (i != Tmp) {
01317         if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
01318           setError("Found invalid tab character in indentation", Tmp);
01319           return false;
01320         }
01321         Tmp = i;
01322         ++Column;
01323       } else {
01324         i = skip_b_break(Tmp);
01325         if (!LeadingBlanks)
01326           LeadingBlanks = 1;
01327         Tmp = i;
01328         Column = 0;
01329         ++Line;
01330       }
01331     }
01332 
01333     if (!FlowLevel && Column < indent)
01334       break;
01335 
01336     Current = Tmp;
01337   }
01338   if (Start == Current) {
01339     setError("Got empty plain scalar", Start);
01340     return false;
01341   }
01342   Token T;
01343   T.Kind = Token::TK_Scalar;
01344   T.Range = StringRef(Start, Current - Start);
01345   TokenQueue.push_back(T);
01346 
01347   // Plain scalars can be simple keys.
01348   saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
01349 
01350   IsSimpleKeyAllowed = false;
01351 
01352   return true;
01353 }
01354 
01355 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
01356   StringRef::iterator Start = Current;
01357   unsigned ColStart = Column;
01358   skip(1);
01359   while(true) {
01360     if (   *Current == '[' || *Current == ']'
01361         || *Current == '{' || *Current == '}'
01362         || *Current == ','
01363         || *Current == ':')
01364       break;
01365     StringRef::iterator i = skip_ns_char(Current);
01366     if (i == Current)
01367       break;
01368     Current = i;
01369     ++Column;
01370   }
01371 
01372   if (Start == Current) {
01373     setError("Got empty alias or anchor", Start);
01374     return false;
01375   }
01376 
01377   Token T;
01378   T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
01379   T.Range = StringRef(Start, Current - Start);
01380   TokenQueue.push_back(T);
01381 
01382   // Alias and anchors can be simple keys.
01383   saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
01384 
01385   IsSimpleKeyAllowed = false;
01386 
01387   return true;
01388 }
01389 
01390 bool Scanner::scanBlockScalar(bool IsLiteral) {
01391   StringRef::iterator Start = Current;
01392   skip(1); // Eat | or >
01393   while(true) {
01394     StringRef::iterator i = skip_nb_char(Current);
01395     if (i == Current) {
01396       if (Column == 0)
01397         break;
01398       i = skip_b_break(Current);
01399       if (i != Current) {
01400         // We got a line break.
01401         Column = 0;
01402         ++Line;
01403         Current = i;
01404         continue;
01405       } else {
01406         // There was an error, which should already have been printed out.
01407         return false;
01408       }
01409     }
01410     Current = i;
01411     ++Column;
01412   }
01413 
01414   if (Start == Current) {
01415     setError("Got empty block scalar", Start);
01416     return false;
01417   }
01418 
01419   Token T;
01420   T.Kind = Token::TK_Scalar;
01421   T.Range = StringRef(Start, Current - Start);
01422   TokenQueue.push_back(T);
01423   return true;
01424 }
01425 
01426 bool Scanner::scanTag() {
01427   StringRef::iterator Start = Current;
01428   unsigned ColStart = Column;
01429   skip(1); // Eat !.
01430   if (Current == End || isBlankOrBreak(Current)); // An empty tag.
01431   else if (*Current == '<') {
01432     skip(1);
01433     scan_ns_uri_char();
01434     if (!consume('>'))
01435       return false;
01436   } else {
01437     // FIXME: Actually parse the c-ns-shorthand-tag rule.
01438     Current = skip_while(&Scanner::skip_ns_char, Current);
01439   }
01440 
01441   Token T;
01442   T.Kind = Token::TK_Tag;
01443   T.Range = StringRef(Start, Current - Start);
01444   TokenQueue.push_back(T);
01445 
01446   // Tags can be simple keys.
01447   saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
01448 
01449   IsSimpleKeyAllowed = false;
01450 
01451   return true;
01452 }
01453 
01454 bool Scanner::fetchMoreTokens() {
01455   if (IsStartOfStream)
01456     return scanStreamStart();
01457 
01458   scanToNextToken();
01459 
01460   if (Current == End)
01461     return scanStreamEnd();
01462 
01463   removeStaleSimpleKeyCandidates();
01464 
01465   unrollIndent(Column);
01466 
01467   if (Column == 0 && *Current == '%')
01468     return scanDirective();
01469 
01470   if (Column == 0 && Current + 4 <= End
01471       && *Current == '-'
01472       && *(Current + 1) == '-'
01473       && *(Current + 2) == '-'
01474       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
01475     return scanDocumentIndicator(true);
01476 
01477   if (Column == 0 && Current + 4 <= End
01478       && *Current == '.'
01479       && *(Current + 1) == '.'
01480       && *(Current + 2) == '.'
01481       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
01482     return scanDocumentIndicator(false);
01483 
01484   if (*Current == '[')
01485     return scanFlowCollectionStart(true);
01486 
01487   if (*Current == '{')
01488     return scanFlowCollectionStart(false);
01489 
01490   if (*Current == ']')
01491     return scanFlowCollectionEnd(true);
01492 
01493   if (*Current == '}')
01494     return scanFlowCollectionEnd(false);
01495 
01496   if (*Current == ',')
01497     return scanFlowEntry();
01498 
01499   if (*Current == '-' && isBlankOrBreak(Current + 1))
01500     return scanBlockEntry();
01501 
01502   if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
01503     return scanKey();
01504 
01505   if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
01506     return scanValue();
01507 
01508   if (*Current == '*')
01509     return scanAliasOrAnchor(true);
01510 
01511   if (*Current == '&')
01512     return scanAliasOrAnchor(false);
01513 
01514   if (*Current == '!')
01515     return scanTag();
01516 
01517   if (*Current == '|' && !FlowLevel)
01518     return scanBlockScalar(true);
01519 
01520   if (*Current == '>' && !FlowLevel)
01521     return scanBlockScalar(false);
01522 
01523   if (*Current == '\'')
01524     return scanFlowScalar(false);
01525 
01526   if (*Current == '"')
01527     return scanFlowScalar(true);
01528 
01529   // Get a plain scalar.
01530   StringRef FirstChar(Current, 1);
01531   if (!(isBlankOrBreak(Current)
01532         || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
01533       || (*Current == '-' && !isBlankOrBreak(Current + 1))
01534       || (!FlowLevel && (*Current == '?' || *Current == ':')
01535           && isBlankOrBreak(Current + 1))
01536       || (!FlowLevel && *Current == ':'
01537                       && Current + 2 < End
01538                       && *(Current + 1) == ':'
01539                       && !isBlankOrBreak(Current + 2)))
01540     return scanPlainScalar();
01541 
01542   setError("Unrecognized character while tokenizing.");
01543   return false;
01544 }
01545 
/// Construct a Stream whose Scanner is created from \a Input and \a SM.
/// No document is current until begin() is called.
01546 Stream::Stream(StringRef Input, SourceMgr &SM)
01547   : scanner(new Scanner(Input, SM))
01548   , CurrentDoc(0) {}
01549 
/// Construct a Stream whose Scanner is created from \a InputBuffer and \a SM.
/// No document is current until begin() is called.
01550 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM)
01551   : scanner(new Scanner(InputBuffer, SM))
01552   , CurrentDoc(0) {}
01553 
/// Destructor. The body is empty; it is defined out of line in this file.
01554 Stream::~Stream() {}
01555 
01556 bool Stream::failed() { return scanner->failed(); }
01557 
01558 void Stream::printError(Node *N, const Twine &Msg) {
01559   SmallVector<SMRange, 1> Ranges;
01560   Ranges.push_back(N->getSourceRange());
01561   scanner->printError( N->getSourceRange().Start
01562                      , SourceMgr::DK_Error
01563                      , Msg
01564                      , Ranges);
01565 }
01566 
/// Handle a %YAML version directive token. Currently a no-op: the token is
/// accepted without inspecting the version.
01567 void Stream::handleYAMLDirective(const Token &t) {
01568   // TODO: Ensure version is 1.x.
01569 }
01570 
01571 document_iterator Stream::begin() {
01572   if (CurrentDoc)
01573     report_fatal_error("Can only iterate over the stream once");
01574 
01575   // Skip Stream-Start.
01576   scanner->getNext();
01577 
01578   CurrentDoc.reset(new Document(*this));
01579   return document_iterator(CurrentDoc);
01580 }
01581 
/// Returns the past-the-end iterator: a default constructed
/// document_iterator.
01582 document_iterator Stream::end() {
01583   return document_iterator();
01584 }
01585 
01586 void Stream::skip() {
01587   for (document_iterator i = begin(), e = end(); i != e; ++i)
01588     i->skip();
01589 }
01590 
01591 Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A)
01592   : Doc(D)
01593   , TypeID(Type)
01594   , Anchor(A) {
01595   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
01596   SourceRange = SMRange(Start, Start);
01597 }
01598 
01599 Token &Node::peekNext() {
01600   return Doc->peekNext();
01601 }
01602 
01603 Token Node::getNext() {
01604   return Doc->getNext();
01605 }
01606 
01607 Node *Node::parseBlockNode() {
01608   return Doc->parseBlockNode();
01609 }
01610 
01611 BumpPtrAllocator &Node::getAllocator() {
01612   return Doc->NodeAllocator;
01613 }
01614 
/// Report the error \a Msg for token \a Tok by forwarding to
/// Document::setError() on the owning document.
01615 void Node::setError(const Twine &Msg, Token &Tok) const {
01616   Doc->setError(Msg, Tok);
01617 }
01618 
01619 bool Node::failed() const {
01620   return Doc->failed();
01621 }
01622 
01623 
01624 
01625 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
01626   // TODO: Handle newlines properly. We need to remove leading whitespace.
01627   if (Value[0] == '"') { // Double quoted.
01628     // Pull off the leading and trailing "s.
01629     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
01630     // Search for characters that would require unescaping the value.
01631     StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
01632     if (i != StringRef::npos)
01633       return unescapeDoubleQuoted(UnquotedValue, i, Storage);
01634     return UnquotedValue;
01635   } else if (Value[0] == '\'') { // Single quoted.
01636     // Pull off the leading and trailing 's.
01637     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
01638     StringRef::size_type i = UnquotedValue.find('\'');
01639     if (i != StringRef::npos) {
01640       // We're going to need Storage.
01641       Storage.clear();
01642       Storage.reserve(UnquotedValue.size());
01643       for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
01644         StringRef Valid(UnquotedValue.begin(), i);
01645         Storage.insert(Storage.end(), Valid.begin(), Valid.end());
01646         Storage.push_back('\'');
01647         UnquotedValue = UnquotedValue.substr(i + 2);
01648       }
01649       Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
01650       return StringRef(Storage.begin(), Storage.size());
01651     }
01652     return UnquotedValue;
01653   }
01654   // Plain or block.
01655   return Value.rtrim(" ");
01656 }
01657 
01658 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
01659                                           , StringRef::size_type i
01660                                           , SmallVectorImpl<char> &Storage)
01661                                           const {
01662   // Use Storage to build proper value.
01663   Storage.clear();
01664   Storage.reserve(UnquotedValue.size());
01665   for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
01666     // Insert all previous chars into Storage.
01667     StringRef Valid(UnquotedValue.begin(), i);
01668     Storage.insert(Storage.end(), Valid.begin(), Valid.end());
01669     // Chop off inserted chars.
01670     UnquotedValue = UnquotedValue.substr(i);
01671 
01672     assert(!UnquotedValue.empty() && "Can't be empty!");
01673 
01674     // Parse escape or line break.
01675     switch (UnquotedValue[0]) {
01676     case '\r':
01677     case '\n':
01678       Storage.push_back('\n');
01679       if (   UnquotedValue.size() > 1
01680           && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
01681         UnquotedValue = UnquotedValue.substr(1);
01682       UnquotedValue = UnquotedValue.substr(1);
01683       break;
01684     default:
01685       if (UnquotedValue.size() == 1)
01686         // TODO: Report error.
01687         break;
01688       UnquotedValue = UnquotedValue.substr(1);
01689       switch (UnquotedValue[0]) {
01690       default: {
01691           Token T;
01692           T.Range = StringRef(UnquotedValue.begin(), 1);
01693           setError("Unrecognized escape code!", T);
01694           return "";
01695         }
01696       case '\r':
01697       case '\n':
01698         // Remove the new line.
01699         if (   UnquotedValue.size() > 1
01700             && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
01701           UnquotedValue = UnquotedValue.substr(1);
01702         // If this was just a single byte newline, it will get skipped
01703         // below.
01704         break;
01705       case '0':
01706         Storage.push_back(0x00);
01707         break;
01708       case 'a':
01709         Storage.push_back(0x07);
01710         break;
01711       case 'b':
01712         Storage.push_back(0x08);
01713         break;
01714       case 't':
01715       case 0x09:
01716         Storage.push_back(0x09);
01717         break;
01718       case 'n':
01719         Storage.push_back(0x0A);
01720         break;
01721       case 'v':
01722         Storage.push_back(0x0B);
01723         break;
01724       case 'f':
01725         Storage.push_back(0x0C);
01726         break;
01727       case 'r':
01728         Storage.push_back(0x0D);
01729         break;
01730       case 'e':
01731         Storage.push_back(0x1B);
01732         break;
01733       case ' ':
01734         Storage.push_back(0x20);
01735         break;
01736       case '"':
01737         Storage.push_back(0x22);
01738         break;
01739       case '/':
01740         Storage.push_back(0x2F);
01741         break;
01742       case '\\':
01743         Storage.push_back(0x5C);
01744         break;
01745       case 'N':
01746         encodeUTF8(0x85, Storage);
01747         break;
01748       case '_':
01749         encodeUTF8(0xA0, Storage);
01750         break;
01751       case 'L':
01752         encodeUTF8(0x2028, Storage);
01753         break;
01754       case 'P':
01755         encodeUTF8(0x2029, Storage);
01756         break;
01757       case 'x': {
01758           if (UnquotedValue.size() < 3)
01759             // TODO: Report error.
01760             break;
01761           unsigned int UnicodeScalarValue;
01762           if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
01763             // TODO: Report error.
01764             UnicodeScalarValue = 0xFFFD;
01765           encodeUTF8(UnicodeScalarValue, Storage);
01766           UnquotedValue = UnquotedValue.substr(2);
01767           break;
01768         }
01769       case 'u': {
01770           if (UnquotedValue.size() < 5)
01771             // TODO: Report error.
01772             break;
01773           unsigned int UnicodeScalarValue;
01774           if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
01775             // TODO: Report error.
01776             UnicodeScalarValue = 0xFFFD;
01777           encodeUTF8(UnicodeScalarValue, Storage);
01778           UnquotedValue = UnquotedValue.substr(4);
01779           break;
01780         }
01781       case 'U': {
01782           if (UnquotedValue.size() < 9)
01783             // TODO: Report error.
01784             break;
01785           unsigned int UnicodeScalarValue;
01786           if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
01787             // TODO: Report error.
01788             UnicodeScalarValue = 0xFFFD;
01789           encodeUTF8(UnicodeScalarValue, Storage);
01790           UnquotedValue = UnquotedValue.substr(8);
01791           break;
01792         }
01793       }
01794       UnquotedValue = UnquotedValue.substr(1);
01795     }
01796   }
01797   Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
01798   return StringRef(Storage.begin(), Storage.size());
01799 }
01800 
01801 Node *KeyValueNode::getKey() {
01802   if (Key)
01803     return Key;
01804   // Handle implicit null keys.
01805   {
01806     Token &t = peekNext();
01807     if (   t.Kind == Token::TK_BlockEnd
01808         || t.Kind == Token::TK_Value
01809         || t.Kind == Token::TK_Error) {
01810       return Key = new (getAllocator()) NullNode(Doc);
01811     }
01812     if (t.Kind == Token::TK_Key)
01813       getNext(); // skip TK_Key.
01814   }
01815 
01816   // Handle explicit null keys.
01817   Token &t = peekNext();
01818   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
01819     return Key = new (getAllocator()) NullNode(Doc);
01820   }
01821 
01822   // We've got a normal key.
01823   return Key = parseBlockNode();
01824 }
01825 
01826 Node *KeyValueNode::getValue() {
01827   if (Value)
01828     return Value;
01829   getKey()->skip();
01830   if (failed())
01831     return Value = new (getAllocator()) NullNode(Doc);
01832 
01833   // Handle implicit null values.
01834   {
01835     Token &t = peekNext();
01836     if (   t.Kind == Token::TK_BlockEnd
01837         || t.Kind == Token::TK_FlowMappingEnd
01838         || t.Kind == Token::TK_Key
01839         || t.Kind == Token::TK_FlowEntry
01840         || t.Kind == Token::TK_Error) {
01841       return Value = new (getAllocator()) NullNode(Doc);
01842     }
01843 
01844     if (t.Kind != Token::TK_Value) {
01845       setError("Unexpected token in Key Value.", t);
01846       return Value = new (getAllocator()) NullNode(Doc);
01847     }
01848     getNext(); // skip TK_Value.
01849   }
01850 
01851   // Handle explicit null values.
01852   Token &t = peekNext();
01853   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
01854     return Value = new (getAllocator()) NullNode(Doc);
01855   }
01856 
01857   // We got a normal value.
01858   return Value = parseBlockNode();
01859 }
01860 
01861 void MappingNode::increment() {
01862   if (failed()) {
01863     IsAtEnd = true;
01864     CurrentEntry = 0;
01865     return;
01866   }
01867   if (CurrentEntry) {
01868     CurrentEntry->skip();
01869     if (Type == MT_Inline) {
01870       IsAtEnd = true;
01871       CurrentEntry = 0;
01872       return;
01873     }
01874   }
01875   Token T = peekNext();
01876   if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
01877     // KeyValueNode eats the TK_Key. That way it can detect null keys.
01878     CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
01879   } else if (Type == MT_Block) {
01880     switch (T.Kind) {
01881     case Token::TK_BlockEnd:
01882       getNext();
01883       IsAtEnd = true;
01884       CurrentEntry = 0;
01885       break;
01886     default:
01887       setError("Unexpected token. Expected Key or Block End", T);
01888     case Token::TK_Error:
01889       IsAtEnd = true;
01890       CurrentEntry = 0;
01891     }
01892   } else {
01893     switch (T.Kind) {
01894     case Token::TK_FlowEntry:
01895       // Eat the flow entry and recurse.
01896       getNext();
01897       return increment();
01898     case Token::TK_FlowMappingEnd:
01899       getNext();
01900     case Token::TK_Error:
01901       // Set this to end iterator.
01902       IsAtEnd = true;
01903       CurrentEntry = 0;
01904       break;
01905     default:
01906       setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
01907                 "Mapping End."
01908               , T);
01909       IsAtEnd = true;
01910       CurrentEntry = 0;
01911     }
01912   }
01913 }
01914 
01915 void SequenceNode::increment() {
01916   if (failed()) {
01917     IsAtEnd = true;
01918     CurrentEntry = 0;
01919     return;
01920   }
01921   if (CurrentEntry)
01922     CurrentEntry->skip();
01923   Token T = peekNext();
01924   if (SeqType == ST_Block) {
01925     switch (T.Kind) {
01926     case Token::TK_BlockEntry:
01927       getNext();
01928       CurrentEntry = parseBlockNode();
01929       if (CurrentEntry == 0) { // An error occurred.
01930         IsAtEnd = true;
01931         CurrentEntry = 0;
01932       }
01933       break;
01934     case Token::TK_BlockEnd:
01935       getNext();
01936       IsAtEnd = true;
01937       CurrentEntry = 0;
01938       break;
01939     default:
01940       setError( "Unexpected token. Expected Block Entry or Block End."
01941               , T);
01942     case Token::TK_Error:
01943       IsAtEnd = true;
01944       CurrentEntry = 0;
01945     }
01946   } else if (SeqType == ST_Indentless) {
01947     switch (T.Kind) {
01948     case Token::TK_BlockEntry:
01949       getNext();
01950       CurrentEntry = parseBlockNode();
01951       if (CurrentEntry == 0) { // An error occurred.
01952         IsAtEnd = true;
01953         CurrentEntry = 0;
01954       }
01955       break;
01956     default:
01957     case Token::TK_Error:
01958       IsAtEnd = true;
01959       CurrentEntry = 0;
01960     }
01961   } else if (SeqType == ST_Flow) {
01962     switch (T.Kind) {
01963     case Token::TK_FlowEntry:
01964       // Eat the flow entry and recurse.
01965       getNext();
01966       WasPreviousTokenFlowEntry = true;
01967       return increment();
01968     case Token::TK_FlowSequenceEnd:
01969       getNext();
01970     case Token::TK_Error:
01971       // Set this to end iterator.
01972       IsAtEnd = true;
01973       CurrentEntry = 0;
01974       break;
01975     case Token::TK_StreamEnd:
01976     case Token::TK_DocumentEnd:
01977     case Token::TK_DocumentStart:
01978       setError("Could not find closing ]!", T);
01979       // Set this to end iterator.
01980       IsAtEnd = true;
01981       CurrentEntry = 0;
01982       break;
01983     default:
01984       if (!WasPreviousTokenFlowEntry) {
01985         setError("Expected , between entries!", T);
01986         IsAtEnd = true;
01987         CurrentEntry = 0;
01988         break;
01989       }
01990       // Otherwise it must be a flow entry.
01991       CurrentEntry = parseBlockNode();
01992       if (!CurrentEntry) {
01993         IsAtEnd = true;
01994       }
01995       WasPreviousTokenFlowEntry = false;
01996       break;
01997     }
01998   }
01999 }
02000 
02001 Document::Document(Stream &S) : stream(S), Root(0) {
02002   if (parseDirectives())
02003     expectToken(Token::TK_DocumentStart);
02004   Token &T = peekNext();
02005   if (T.Kind == Token::TK_DocumentStart)
02006     getNext();
02007 }
02008 
02009 bool Document::skip()  {
02010   if (stream.scanner->failed())
02011     return false;
02012   if (!Root)
02013     getRoot();
02014   Root->skip();
02015   Token &T = peekNext();
02016   if (T.Kind == Token::TK_StreamEnd)
02017     return false;
02018   if (T.Kind == Token::TK_DocumentEnd) {
02019     getNext();
02020     return skip();
02021   }
02022   return true;
02023 }
02024 
02025 Token &Document::peekNext() {
02026   return stream.scanner->peekNext();
02027 }
02028 
02029 Token Document::getNext() {
02030   return stream.scanner->getNext();
02031 }
02032 
02033 void Document::setError(const Twine &Message, Token &Location) const {
02034   stream.scanner->setError(Message, Location.Range.begin());
02035 }
02036 
02037 bool Document::failed() const {
02038   return stream.scanner->failed();
02039 }
02040 
02041 Node *Document::parseBlockNode() {
02042   Token T = peekNext();
02043   // Handle properties.
02044   Token AnchorInfo;
02045 parse_property:
02046   switch (T.Kind) {
02047   case Token::TK_Alias:
02048     getNext();
02049     return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
02050   case Token::TK_Anchor:
02051     if (AnchorInfo.Kind == Token::TK_Anchor) {
02052       setError("Already encountered an anchor for this node!", T);
02053       return 0;
02054     }
02055     AnchorInfo = getNext(); // Consume TK_Anchor.
02056     T = peekNext();
02057     goto parse_property;
02058   case Token::TK_Tag:
02059     getNext(); // Skip TK_Tag.
02060     T = peekNext();
02061     goto parse_property;
02062   default:
02063     break;
02064   }
02065 
02066   switch (T.Kind) {
02067   case Token::TK_BlockEntry:
02068     // We got an unindented BlockEntry sequence. This is not terminated with
02069     // a BlockEnd.
02070     // Don't eat the TK_BlockEntry, SequenceNode needs it.
02071     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
02072                                            , AnchorInfo.Range.substr(1)
02073                                            , SequenceNode::ST_Indentless);
02074   case Token::TK_BlockSequenceStart:
02075     getNext();
02076     return new (NodeAllocator)
02077       SequenceNode( stream.CurrentDoc
02078                   , AnchorInfo.Range.substr(1)
02079                   , SequenceNode::ST_Block);
02080   case Token::TK_BlockMappingStart:
02081     getNext();
02082     return new (NodeAllocator)
02083       MappingNode( stream.CurrentDoc
02084                  , AnchorInfo.Range.substr(1)
02085                  , MappingNode::MT_Block);
02086   case Token::TK_FlowSequenceStart:
02087     getNext();
02088     return new (NodeAllocator)
02089       SequenceNode( stream.CurrentDoc
02090                   , AnchorInfo.Range.substr(1)
02091                   , SequenceNode::ST_Flow);
02092   case Token::TK_FlowMappingStart:
02093     getNext();
02094     return new (NodeAllocator)
02095       MappingNode( stream.CurrentDoc
02096                  , AnchorInfo.Range.substr(1)
02097                  , MappingNode::MT_Flow);
02098   case Token::TK_Scalar:
02099     getNext();
02100     return new (NodeAllocator)
02101       ScalarNode( stream.CurrentDoc
02102                 , AnchorInfo.Range.substr(1)
02103                 , T.Range);
02104   case Token::TK_Key:
02105     // Don't eat the TK_Key, KeyValueNode expects it.
02106     return new (NodeAllocator)
02107       MappingNode( stream.CurrentDoc
02108                  , AnchorInfo.Range.substr(1)
02109                  , MappingNode::MT_Inline);
02110   case Token::TK_DocumentStart:
02111   case Token::TK_DocumentEnd:
02112   case Token::TK_StreamEnd:
02113   default:
02114     // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
02115     //       !!null null.
02116     return new (NodeAllocator) NullNode(stream.CurrentDoc);
02117   case Token::TK_Error:
02118     return 0;
02119   }
02120   llvm_unreachable("Control flow shouldn't reach here.");
02121   return 0;
02122 }
02123 
02124 bool Document::parseDirectives() {
02125   bool isDirective = false;
02126   while (true) {
02127     Token T = peekNext();
02128     if (T.Kind == Token::TK_TagDirective) {
02129       handleTagDirective(getNext());
02130       isDirective = true;
02131     } else if (T.Kind == Token::TK_VersionDirective) {
02132       stream.handleYAMLDirective(getNext());
02133       isDirective = true;
02134     } else
02135       break;
02136   }
02137   return isDirective;
02138 }
02139 
02140 bool Document::expectToken(int TK) {
02141   Token T = getNext();
02142   if (T.Kind != TK) {
02143     setError("Unexpected token", T);
02144     return false;
02145   }
02146   return true;
02147 }