LLVM  4.0.0
YAMLParser.cpp
Go to the documentation of this file.
1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a YAML parser.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallString.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/StringExtras.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/ADT/AllocatorList.h"
23 #include "llvm/Support/SourceMgr.h"
25 
26 using namespace llvm;
27 using namespace yaml;
28 
30  UEF_UTF32_LE, ///< UTF-32 Little Endian
31  UEF_UTF32_BE, ///< UTF-32 Big Endian
32  UEF_UTF16_LE, ///< UTF-16 Little Endian
33  UEF_UTF16_BE, ///< UTF-16 Big Endian
34  UEF_UTF8, ///< UTF-8 or ascii.
35  UEF_Unknown ///< Not a valid Unicode encoding.
36 };
37 
38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
39 /// it exists. Length is in {0, 2, 3, 4}.
40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
41 
42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
43 /// encoding form of \a Input.
44 ///
45 /// @param Input A string of length 0 or more.
46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
47 /// and how long the byte order mark is if one exists.
49  if (Input.size() == 0)
50  return std::make_pair(UEF_Unknown, 0);
51 
52  switch (uint8_t(Input[0])) {
53  case 0x00:
54  if (Input.size() >= 4) {
55  if ( Input[1] == 0
56  && uint8_t(Input[2]) == 0xFE
57  && uint8_t(Input[3]) == 0xFF)
58  return std::make_pair(UEF_UTF32_BE, 4);
59  if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
60  return std::make_pair(UEF_UTF32_BE, 0);
61  }
62 
63  if (Input.size() >= 2 && Input[1] != 0)
64  return std::make_pair(UEF_UTF16_BE, 0);
65  return std::make_pair(UEF_Unknown, 0);
66  case 0xFF:
67  if ( Input.size() >= 4
68  && uint8_t(Input[1]) == 0xFE
69  && Input[2] == 0
70  && Input[3] == 0)
71  return std::make_pair(UEF_UTF32_LE, 4);
72 
73  if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
74  return std::make_pair(UEF_UTF16_LE, 2);
75  return std::make_pair(UEF_Unknown, 0);
76  case 0xFE:
77  if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
78  return std::make_pair(UEF_UTF16_BE, 2);
79  return std::make_pair(UEF_Unknown, 0);
80  case 0xEF:
81  if ( Input.size() >= 3
82  && uint8_t(Input[1]) == 0xBB
83  && uint8_t(Input[2]) == 0xBF)
84  return std::make_pair(UEF_UTF8, 3);
85  return std::make_pair(UEF_Unknown, 0);
86  }
87 
88  // It could still be utf-32 or utf-16.
89  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
90  return std::make_pair(UEF_UTF32_LE, 0);
91 
92  if (Input.size() >= 2 && Input[1] == 0)
93  return std::make_pair(UEF_UTF16_LE, 0);
94 
95  return std::make_pair(UEF_UTF8, 0);
96 }
97 
98 namespace llvm {
99 namespace yaml {
100 /// Pin the vtables to this file.
101 void Node::anchor() {}
102 void NullNode::anchor() {}
103 void ScalarNode::anchor() {}
104 void BlockScalarNode::anchor() {}
105 void KeyValueNode::anchor() {}
106 void MappingNode::anchor() {}
107 void SequenceNode::anchor() {}
108 void AliasNode::anchor() {}
109 
110 /// Token - A single YAML token.
111 struct Token {
112  enum TokenKind {
113  TK_Error, // Uninitialized token.
135  TK_Tag
136  } Kind;
137 
138  /// A string of length 0 or more whose begin() points to the logical location
139  /// of the token in the input.
141 
142  /// The value of a block scalar node.
143  std::string Value;
144 
145  Token() : Kind(TK_Error) {}
146 };
147 }
148 }
149 
151 
152 namespace {
153 /// @brief This struct is used to track simple keys.
154 ///
155 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
156 /// which could legally be the start of a simple key. When peekNext is called,
157 /// if the Token To be returned is referenced by a SimpleKey, we continue
158 /// tokenizing until that potential simple key has either been found to not be
159 /// a simple key (we moved on to the next line or went further than 1024 chars).
160 /// Or when we run into a Value, and then insert a Key token (and possibly
161 /// others) before the SimpleKey's Tok.
162 struct SimpleKey {
164  unsigned Column;
165  unsigned Line;
166  unsigned FlowLevel;
167  bool IsRequired;
168 
169  bool operator ==(const SimpleKey &Other) {
170  return Tok == Other.Tok;
171  }
172 };
173 }
174 
175 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
176 /// subsequence and the subsequence's length in code units (uint8_t).
177 /// A length of 0 represents an error.
178 typedef std::pair<uint32_t, unsigned> UTF8Decoded;
179 
181  StringRef::iterator Position= Range.begin();
182  StringRef::iterator End = Range.end();
183  // 1 byte: [0x00, 0x7f]
184  // Bit pattern: 0xxxxxxx
185  if ((*Position & 0x80) == 0) {
186  return std::make_pair(*Position, 1);
187  }
188  // 2 bytes: [0x80, 0x7ff]
189  // Bit pattern: 110xxxxx 10xxxxxx
190  if (Position + 1 != End &&
191  ((*Position & 0xE0) == 0xC0) &&
192  ((*(Position + 1) & 0xC0) == 0x80)) {
193  uint32_t codepoint = ((*Position & 0x1F) << 6) |
194  (*(Position + 1) & 0x3F);
195  if (codepoint >= 0x80)
196  return std::make_pair(codepoint, 2);
197  }
198  // 3 bytes: [0x8000, 0xffff]
199  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
200  if (Position + 2 != End &&
201  ((*Position & 0xF0) == 0xE0) &&
202  ((*(Position + 1) & 0xC0) == 0x80) &&
203  ((*(Position + 2) & 0xC0) == 0x80)) {
204  uint32_t codepoint = ((*Position & 0x0F) << 12) |
205  ((*(Position + 1) & 0x3F) << 6) |
206  (*(Position + 2) & 0x3F);
207  // Codepoints between 0xD800 and 0xDFFF are invalid, as
208  // they are high / low surrogate halves used by UTF-16.
209  if (codepoint >= 0x800 &&
210  (codepoint < 0xD800 || codepoint > 0xDFFF))
211  return std::make_pair(codepoint, 3);
212  }
213  // 4 bytes: [0x10000, 0x10FFFF]
214  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
215  if (Position + 3 != End &&
216  ((*Position & 0xF8) == 0xF0) &&
217  ((*(Position + 1) & 0xC0) == 0x80) &&
218  ((*(Position + 2) & 0xC0) == 0x80) &&
219  ((*(Position + 3) & 0xC0) == 0x80)) {
220  uint32_t codepoint = ((*Position & 0x07) << 18) |
221  ((*(Position + 1) & 0x3F) << 12) |
222  ((*(Position + 2) & 0x3F) << 6) |
223  (*(Position + 3) & 0x3F);
224  if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
225  return std::make_pair(codepoint, 4);
226  }
227  return std::make_pair(0, 0);
228 }
229 
230 namespace llvm {
231 namespace yaml {
232 /// @brief Scans YAML tokens from a MemoryBuffer.
233 class Scanner {
234 public:
235  Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
236  std::error_code *EC = nullptr);
237  Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
238  std::error_code *EC = nullptr);
239 
240  /// @brief Parse the next token and return it without popping it.
241  Token &peekNext();
242 
243  /// @brief Parse the next token and pop it from the queue.
244  Token getNext();
245 
246  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
247  ArrayRef<SMRange> Ranges = None) {
248  SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
249  }
250 
251  void setError(const Twine &Message, StringRef::iterator Position) {
252  if (Current >= End)
253  Current = End - 1;
254 
255  // propagate the error if possible
256  if (EC)
257  *EC = make_error_code(std::errc::invalid_argument);
258 
259  // Don't print out more errors after the first one we encounter. The rest
260  // are just the result of the first, and have no meaning.
261  if (!Failed)
262  printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
263  Failed = true;
264  }
265 
266  void setError(const Twine &Message) {
267  setError(Message, Current);
268  }
269 
270  /// @brief Returns true if an error occurred while parsing.
271  bool failed() {
272  return Failed;
273  }
274 
275 private:
276  void init(MemoryBufferRef Buffer);
277 
278  StringRef currentInput() {
279  return StringRef(Current, End - Current);
280  }
281 
282  /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
283  /// at \a Position.
284  ///
285  /// If the UTF-8 code units starting at Position do not form a well-formed
286  /// code unit subsequence, then the Unicode scalar value is 0, and the length
287  /// is 0.
289  return ::decodeUTF8(StringRef(Position, End - Position));
290  }
291 
292  // The following functions are based on the gramar rules in the YAML spec. The
293  // style of the function names it meant to closely match how they are written
294  // in the spec. The number within the [] is the number of the grammar rule in
295  // the spec.
296  //
297  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
298  //
299  // c-
300  // A production starting and ending with a special character.
301  // b-
302  // A production matching a single line break.
303  // nb-
304  // A production starting and ending with a non-break character.
305  // s-
306  // A production starting and ending with a white space character.
307  // ns-
308  // A production starting and ending with a non-space character.
309  // l-
310  // A production matching complete line(s).
311 
312  /// @brief Skip a single nb-char[27] starting at Position.
313  ///
314  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
315  /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
316  ///
317  /// @returns The code unit after the nb-char, or Position if it's not an
318  /// nb-char.
319  StringRef::iterator skip_nb_char(StringRef::iterator Position);
320 
321  /// @brief Skip a single b-break[28] starting at Position.
322  ///
323  /// A b-break is 0xD 0xA | 0xD | 0xA
324  ///
325  /// @returns The code unit after the b-break, or Position if it's not a
326  /// b-break.
327  StringRef::iterator skip_b_break(StringRef::iterator Position);
328 
329  /// Skip a single s-space[31] starting at Position.
330  ///
331  /// An s-space is 0x20
332  ///
333  /// @returns The code unit after the s-space, or Position if it's not a
334  /// s-space.
335  StringRef::iterator skip_s_space(StringRef::iterator Position);
336 
337  /// @brief Skip a single s-white[33] starting at Position.
338  ///
339  /// A s-white is 0x20 | 0x9
340  ///
341  /// @returns The code unit after the s-white, or Position if it's not a
342  /// s-white.
343  StringRef::iterator skip_s_white(StringRef::iterator Position);
344 
345  /// @brief Skip a single ns-char[34] starting at Position.
346  ///
347  /// A ns-char is nb-char - s-white
348  ///
349  /// @returns The code unit after the ns-char, or Position if it's not a
350  /// ns-char.
351  StringRef::iterator skip_ns_char(StringRef::iterator Position);
352 
353  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
354  /// @brief Skip minimal well-formed code unit subsequences until Func
355  /// returns its input.
356  ///
357  /// @returns The code unit after the last minimal well-formed code unit
358  /// subsequence that Func accepted.
359  StringRef::iterator skip_while( SkipWhileFunc Func
360  , StringRef::iterator Position);
361 
362  /// Skip minimal well-formed code unit subsequences until Func returns its
363  /// input.
364  void advanceWhile(SkipWhileFunc Func);
365 
366  /// @brief Scan ns-uri-char[39]s starting at Cur.
367  ///
368  /// This updates Cur and Column while scanning.
369  void scan_ns_uri_char();
370 
371  /// @brief Consume a minimal well-formed code unit subsequence starting at
372  /// \a Cur. Return false if it is not the same Unicode scalar value as
373  /// \a Expected. This updates \a Column.
374  bool consume(uint32_t Expected);
375 
376  /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
377  void skip(uint32_t Distance);
378 
379  /// @brief Return true if the minimal well-formed code unit subsequence at
380  /// Pos is whitespace or a new line
381  bool isBlankOrBreak(StringRef::iterator Position);
382 
383  /// Consume a single b-break[28] if it's present at the current position.
384  ///
385  /// Return false if the code unit at the current position isn't a line break.
386  bool consumeLineBreakIfPresent();
387 
388  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
389  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
390  , unsigned AtColumn
391  , bool IsRequired);
392 
393  /// @brief Remove simple keys that can no longer be valid simple keys.
394  ///
395  /// Invalid simple keys are not on the current line or are further than 1024
396  /// columns back.
397  void removeStaleSimpleKeyCandidates();
398 
399  /// @brief Remove all simple keys on FlowLevel \a Level.
400  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
401 
402  /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
403  /// tokens if needed.
404  bool unrollIndent(int ToColumn);
405 
406  /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
407  /// if needed.
408  bool rollIndent( int ToColumn
410  , TokenQueueT::iterator InsertPoint);
411 
412  /// @brief Skip a single-line comment when the comment starts at the current
413  /// position of the scanner.
414  void skipComment();
415 
416  /// @brief Skip whitespace and comments until the start of the next token.
417  void scanToNextToken();
418 
419  /// @brief Must be the first token generated.
420  bool scanStreamStart();
421 
422  /// @brief Generate tokens needed to close out the stream.
423  bool scanStreamEnd();
424 
425  /// @brief Scan a %BLAH directive.
426  bool scanDirective();
427 
428  /// @brief Scan a ... or ---.
429  bool scanDocumentIndicator(bool IsStart);
430 
431  /// @brief Scan a [ or { and generate the proper flow collection start token.
432  bool scanFlowCollectionStart(bool IsSequence);
433 
434  /// @brief Scan a ] or } and generate the proper flow collection end token.
435  bool scanFlowCollectionEnd(bool IsSequence);
436 
437  /// @brief Scan the , that separates entries in a flow collection.
438  bool scanFlowEntry();
439 
440  /// @brief Scan the - that starts block sequence entries.
441  bool scanBlockEntry();
442 
443  /// @brief Scan an explicit ? indicating a key.
444  bool scanKey();
445 
446  /// @brief Scan an explicit : indicating a value.
447  bool scanValue();
448 
449  /// @brief Scan a quoted scalar.
450  bool scanFlowScalar(bool IsDoubleQuoted);
451 
452  /// @brief Scan an unquoted scalar.
453  bool scanPlainScalar();
454 
455  /// @brief Scan an Alias or Anchor starting with * or &.
456  bool scanAliasOrAnchor(bool IsAlias);
457 
458  /// @brief Scan a block scalar starting with | or >.
459  bool scanBlockScalar(bool IsLiteral);
460 
461  /// Scan a chomping indicator in a block scalar header.
462  char scanBlockChompingIndicator();
463 
464  /// Scan an indentation indicator in a block scalar header.
465  unsigned scanBlockIndentationIndicator();
466 
467  /// Scan a block scalar header.
468  ///
469  /// Return false if an error occurred.
470  bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
471  bool &IsDone);
472 
473  /// Look for the indentation level of a block scalar.
474  ///
475  /// Return false if an error occurred.
476  bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
477  unsigned &LineBreaks, bool &IsDone);
478 
479  /// Scan the indentation of a text line in a block scalar.
480  ///
481  /// Return false if an error occurred.
482  bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
483  bool &IsDone);
484 
485  /// @brief Scan a tag of the form !stuff.
486  bool scanTag();
487 
488  /// @brief Dispatch to the next scanning function based on \a *Cur.
489  bool fetchMoreTokens();
490 
491  /// @brief The SourceMgr used for diagnostics and buffer management.
492  SourceMgr &SM;
493 
494  /// @brief The original input.
495  MemoryBufferRef InputBuffer;
496 
497  /// @brief The current position of the scanner.
498  StringRef::iterator Current;
499 
500  /// @brief The end of the input (one past the last character).
502 
503  /// @brief Current YAML indentation level in spaces.
504  int Indent;
505 
506  /// @brief Current column number in Unicode code points.
507  unsigned Column;
508 
509  /// @brief Current line number.
510  unsigned Line;
511 
512  /// @brief How deep we are in flow style containers. 0 Means at block level.
513  unsigned FlowLevel;
514 
515  /// @brief Are we at the start of the stream?
516  bool IsStartOfStream;
517 
518  /// @brief Can the next token be the start of a simple key?
519  bool IsSimpleKeyAllowed;
520 
521  /// @brief True if an error has occurred.
522  bool Failed;
523 
524  /// @brief Should colors be used when printing out the diagnostic messages?
525  bool ShowColors;
526 
527  /// @brief Queue of tokens. This is required to queue up tokens while looking
528  /// for the end of a simple key. And for cases where a single character
529  /// can produce multiple tokens (e.g. BlockEnd).
530  TokenQueueT TokenQueue;
531 
532  /// @brief Indentation levels.
533  SmallVector<int, 4> Indents;
534 
535  /// @brief Potential simple keys.
536  SmallVector<SimpleKey, 4> SimpleKeys;
537 
538  std::error_code *EC;
539 };
540 
541 } // end namespace yaml
542 } // end namespace llvm
543 
544 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
545 static void encodeUTF8( uint32_t UnicodeScalarValue
546  , SmallVectorImpl<char> &Result) {
547  if (UnicodeScalarValue <= 0x7F) {
548  Result.push_back(UnicodeScalarValue & 0x7F);
549  } else if (UnicodeScalarValue <= 0x7FF) {
550  uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
551  uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
552  Result.push_back(FirstByte);
553  Result.push_back(SecondByte);
554  } else if (UnicodeScalarValue <= 0xFFFF) {
555  uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
556  uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
557  uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
558  Result.push_back(FirstByte);
559  Result.push_back(SecondByte);
560  Result.push_back(ThirdByte);
561  } else if (UnicodeScalarValue <= 0x10FFFF) {
562  uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
563  uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
564  uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
565  uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
566  Result.push_back(FirstByte);
567  Result.push_back(SecondByte);
568  Result.push_back(ThirdByte);
569  Result.push_back(FourthByte);
570  }
571 }
572 
574  SourceMgr SM;
575  Scanner scanner(Input, SM);
576  while (true) {
577  Token T = scanner.getNext();
578  switch (T.Kind) {
580  OS << "Stream-Start: ";
581  break;
582  case Token::TK_StreamEnd:
583  OS << "Stream-End: ";
584  break;
586  OS << "Version-Directive: ";
587  break;
589  OS << "Tag-Directive: ";
590  break;
592  OS << "Document-Start: ";
593  break;
595  OS << "Document-End: ";
596  break;
598  OS << "Block-Entry: ";
599  break;
600  case Token::TK_BlockEnd:
601  OS << "Block-End: ";
602  break;
604  OS << "Block-Sequence-Start: ";
605  break;
607  OS << "Block-Mapping-Start: ";
608  break;
609  case Token::TK_FlowEntry:
610  OS << "Flow-Entry: ";
611  break;
613  OS << "Flow-Sequence-Start: ";
614  break;
616  OS << "Flow-Sequence-End: ";
617  break;
619  OS << "Flow-Mapping-Start: ";
620  break;
622  OS << "Flow-Mapping-End: ";
623  break;
624  case Token::TK_Key:
625  OS << "Key: ";
626  break;
627  case Token::TK_Value:
628  OS << "Value: ";
629  break;
630  case Token::TK_Scalar:
631  OS << "Scalar: ";
632  break;
634  OS << "Block Scalar: ";
635  break;
636  case Token::TK_Alias:
637  OS << "Alias: ";
638  break;
639  case Token::TK_Anchor:
640  OS << "Anchor: ";
641  break;
642  case Token::TK_Tag:
643  OS << "Tag: ";
644  break;
645  case Token::TK_Error:
646  break;
647  }
648  OS << T.Range << "\n";
649  if (T.Kind == Token::TK_StreamEnd)
650  break;
651  else if (T.Kind == Token::TK_Error)
652  return false;
653  }
654  return true;
655 }
656 
658  llvm::SourceMgr SM;
659  llvm::yaml::Scanner scanner(Input, SM);
660  for (;;) {
661  llvm::yaml::Token T = scanner.getNext();
662  if (T.Kind == Token::TK_StreamEnd)
663  break;
664  else if (T.Kind == Token::TK_Error)
665  return false;
666  }
667  return true;
668 }
669 
671  std::string EscapedInput;
672  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
673  if (*i == '\\')
674  EscapedInput += "\\\\";
675  else if (*i == '"')
676  EscapedInput += "\\\"";
677  else if (*i == 0)
678  EscapedInput += "\\0";
679  else if (*i == 0x07)
680  EscapedInput += "\\a";
681  else if (*i == 0x08)
682  EscapedInput += "\\b";
683  else if (*i == 0x09)
684  EscapedInput += "\\t";
685  else if (*i == 0x0A)
686  EscapedInput += "\\n";
687  else if (*i == 0x0B)
688  EscapedInput += "\\v";
689  else if (*i == 0x0C)
690  EscapedInput += "\\f";
691  else if (*i == 0x0D)
692  EscapedInput += "\\r";
693  else if (*i == 0x1B)
694  EscapedInput += "\\e";
695  else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
696  std::string HexStr = utohexstr(*i);
697  EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
698  } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
699  UTF8Decoded UnicodeScalarValue
700  = decodeUTF8(StringRef(i, Input.end() - i));
701  if (UnicodeScalarValue.second == 0) {
702  // Found invalid char.
703  SmallString<4> Val;
704  encodeUTF8(0xFFFD, Val);
705  EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
706  // FIXME: Error reporting.
707  return EscapedInput;
708  }
709  if (UnicodeScalarValue.first == 0x85)
710  EscapedInput += "\\N";
711  else if (UnicodeScalarValue.first == 0xA0)
712  EscapedInput += "\\_";
713  else if (UnicodeScalarValue.first == 0x2028)
714  EscapedInput += "\\L";
715  else if (UnicodeScalarValue.first == 0x2029)
716  EscapedInput += "\\P";
717  else {
718  std::string HexStr = utohexstr(UnicodeScalarValue.first);
719  if (HexStr.size() <= 2)
720  EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
721  else if (HexStr.size() <= 4)
722  EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
723  else if (HexStr.size() <= 8)
724  EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
725  }
726  i += UnicodeScalarValue.second - 1;
727  } else
728  EscapedInput.push_back(*i);
729  }
730  return EscapedInput;
731 }
732 
734  std::error_code *EC)
735  : SM(sm), ShowColors(ShowColors), EC(EC) {
736  init(MemoryBufferRef(Input, "YAML"));
737 }
738 
739 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
740  std::error_code *EC)
741  : SM(SM_), ShowColors(ShowColors), EC(EC) {
742  init(Buffer);
743 }
744 
745 void Scanner::init(MemoryBufferRef Buffer) {
746  InputBuffer = Buffer;
747  Current = InputBuffer.getBufferStart();
748  End = InputBuffer.getBufferEnd();
749  Indent = -1;
750  Column = 0;
751  Line = 0;
752  FlowLevel = 0;
753  IsStartOfStream = true;
754  IsSimpleKeyAllowed = true;
755  Failed = false;
756  std::unique_ptr<MemoryBuffer> InputBufferOwner =
758  SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
759 }
760 
762  // If the current token is a possible simple key, keep parsing until we
763  // can confirm.
764  bool NeedMore = false;
765  while (true) {
766  if (TokenQueue.empty() || NeedMore) {
767  if (!fetchMoreTokens()) {
768  TokenQueue.clear();
769  TokenQueue.push_back(Token());
770  return TokenQueue.front();
771  }
772  }
773  assert(!TokenQueue.empty() &&
774  "fetchMoreTokens lied about getting tokens!");
775 
776  removeStaleSimpleKeyCandidates();
777  SimpleKey SK;
778  SK.Tok = TokenQueue.begin();
779  if (!is_contained(SimpleKeys, SK))
780  break;
781  else
782  NeedMore = true;
783  }
784  return TokenQueue.front();
785 }
786 
788  Token Ret = peekNext();
789  // TokenQueue can be empty if there was an error getting the next token.
790  if (!TokenQueue.empty())
791  TokenQueue.pop_front();
792 
793  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
794  // quick deallocation of them all.
795  if (TokenQueue.empty())
796  TokenQueue.resetAlloc();
797 
798  return Ret;
799 }
800 
801 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
802  if (Position == End)
803  return Position;
804  // Check 7 bit c-printable - b-char.
805  if ( *Position == 0x09
806  || (*Position >= 0x20 && *Position <= 0x7E))
807  return Position + 1;
808 
809  // Check for valid UTF-8.
810  if (uint8_t(*Position) & 0x80) {
811  UTF8Decoded u8d = decodeUTF8(Position);
812  if ( u8d.second != 0
813  && u8d.first != 0xFEFF
814  && ( u8d.first == 0x85
815  || ( u8d.first >= 0xA0
816  && u8d.first <= 0xD7FF)
817  || ( u8d.first >= 0xE000
818  && u8d.first <= 0xFFFD)
819  || ( u8d.first >= 0x10000
820  && u8d.first <= 0x10FFFF)))
821  return Position + u8d.second;
822  }
823  return Position;
824 }
825 
826 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
827  if (Position == End)
828  return Position;
829  if (*Position == 0x0D) {
830  if (Position + 1 != End && *(Position + 1) == 0x0A)
831  return Position + 2;
832  return Position + 1;
833  }
834 
835  if (*Position == 0x0A)
836  return Position + 1;
837  return Position;
838 }
839 
840 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
841  if (Position == End)
842  return Position;
843  if (*Position == ' ')
844  return Position + 1;
845  return Position;
846 }
847 
848 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
849  if (Position == End)
850  return Position;
851  if (*Position == ' ' || *Position == '\t')
852  return Position + 1;
853  return Position;
854 }
855 
856 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
857  if (Position == End)
858  return Position;
859  if (*Position == ' ' || *Position == '\t')
860  return Position;
861  return skip_nb_char(Position);
862 }
863 
864 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
865  , StringRef::iterator Position) {
866  while (true) {
867  StringRef::iterator i = (this->*Func)(Position);
868  if (i == Position)
869  break;
870  Position = i;
871  }
872  return Position;
873 }
874 
875 void Scanner::advanceWhile(SkipWhileFunc Func) {
876  auto Final = skip_while(Func, Current);
877  Column += Final - Current;
878  Current = Final;
879 }
880 
881 static bool is_ns_hex_digit(const char C) {
882  return (C >= '0' && C <= '9')
883  || (C >= 'a' && C <= 'z')
884  || (C >= 'A' && C <= 'Z');
885 }
886 
887 static bool is_ns_word_char(const char C) {
888  return C == '-'
889  || (C >= 'a' && C <= 'z')
890  || (C >= 'A' && C <= 'Z');
891 }
892 
893 void Scanner::scan_ns_uri_char() {
894  while (true) {
895  if (Current == End)
896  break;
897  if (( *Current == '%'
898  && Current + 2 < End
899  && is_ns_hex_digit(*(Current + 1))
900  && is_ns_hex_digit(*(Current + 2)))
901  || is_ns_word_char(*Current)
902  || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
903  != StringRef::npos) {
904  ++Current;
905  ++Column;
906  } else
907  break;
908  }
909 }
910 
911 bool Scanner::consume(uint32_t Expected) {
912  if (Expected >= 0x80)
913  report_fatal_error("Not dealing with this yet");
914  if (Current == End)
915  return false;
916  if (uint8_t(*Current) >= 0x80)
917  report_fatal_error("Not dealing with this yet");
918  if (uint8_t(*Current) == Expected) {
919  ++Current;
920  ++Column;
921  return true;
922  }
923  return false;
924 }
925 
926 void Scanner::skip(uint32_t Distance) {
927  Current += Distance;
928  Column += Distance;
929  assert(Current <= End && "Skipped past the end");
930 }
931 
932 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
933  if (Position == End)
934  return false;
935  return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
936  *Position == '\n';
937 }
938 
939 bool Scanner::consumeLineBreakIfPresent() {
940  auto Next = skip_b_break(Current);
941  if (Next == Current)
942  return false;
943  Column = 0;
944  ++Line;
945  Current = Next;
946  return true;
947 }
948 
949 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
950  , unsigned AtColumn
951  , bool IsRequired) {
952  if (IsSimpleKeyAllowed) {
953  SimpleKey SK;
954  SK.Tok = Tok;
955  SK.Line = Line;
956  SK.Column = AtColumn;
957  SK.IsRequired = IsRequired;
958  SK.FlowLevel = FlowLevel;
959  SimpleKeys.push_back(SK);
960  }
961 }
962 
963 void Scanner::removeStaleSimpleKeyCandidates() {
964  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
965  i != SimpleKeys.end();) {
966  if (i->Line != Line || i->Column + 1024 < Column) {
967  if (i->IsRequired)
968  setError( "Could not find expected : for simple key"
969  , i->Tok->Range.begin());
970  i = SimpleKeys.erase(i);
971  } else
972  ++i;
973  }
974 }
975 
976 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
977  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
978  SimpleKeys.pop_back();
979 }
980 
981 bool Scanner::unrollIndent(int ToColumn) {
982  Token T;
983  // Indentation is ignored in flow.
984  if (FlowLevel != 0)
985  return true;
986 
987  while (Indent > ToColumn) {
989  T.Range = StringRef(Current, 1);
990  TokenQueue.push_back(T);
991  Indent = Indents.pop_back_val();
992  }
993 
994  return true;
995 }
996 
997 bool Scanner::rollIndent( int ToColumn
999  , TokenQueueT::iterator InsertPoint) {
1000  if (FlowLevel)
1001  return true;
1002  if (Indent < ToColumn) {
1003  Indents.push_back(Indent);
1004  Indent = ToColumn;
1005 
1006  Token T;
1007  T.Kind = Kind;
1008  T.Range = StringRef(Current, 0);
1009  TokenQueue.insert(InsertPoint, T);
1010  }
1011  return true;
1012 }
1013 
1014 void Scanner::skipComment() {
1015  if (*Current != '#')
1016  return;
1017  while (true) {
1018  // This may skip more than one byte, thus Column is only incremented
1019  // for code points.
1020  StringRef::iterator I = skip_nb_char(Current);
1021  if (I == Current)
1022  break;
1023  Current = I;
1024  ++Column;
1025  }
1026 }
1027 
1028 void Scanner::scanToNextToken() {
1029  while (true) {
1030  while (*Current == ' ' || *Current == '\t') {
1031  skip(1);
1032  }
1033 
1034  skipComment();
1035 
1036  // Skip EOL.
1037  StringRef::iterator i = skip_b_break(Current);
1038  if (i == Current)
1039  break;
1040  Current = i;
1041  ++Line;
1042  Column = 0;
1043  // New lines may start a simple key.
1044  if (!FlowLevel)
1045  IsSimpleKeyAllowed = true;
1046  }
1047 }
1048 
1049 bool Scanner::scanStreamStart() {
1050  IsStartOfStream = false;
1051 
1052  EncodingInfo EI = getUnicodeEncoding(currentInput());
1053 
1054  Token T;
1056  T.Range = StringRef(Current, EI.second);
1057  TokenQueue.push_back(T);
1058  Current += EI.second;
1059  return true;
1060 }
1061 
1062 bool Scanner::scanStreamEnd() {
1063  // Force an ending new line if one isn't present.
1064  if (Column != 0) {
1065  Column = 0;
1066  ++Line;
1067  }
1068 
1069  unrollIndent(-1);
1070  SimpleKeys.clear();
1071  IsSimpleKeyAllowed = false;
1072 
1073  Token T;
1075  T.Range = StringRef(Current, 0);
1076  TokenQueue.push_back(T);
1077  return true;
1078 }
1079 
1080 bool Scanner::scanDirective() {
1081  // Reset the indentation level.
1082  unrollIndent(-1);
1083  SimpleKeys.clear();
1084  IsSimpleKeyAllowed = false;
1085 
1086  StringRef::iterator Start = Current;
1087  consume('%');
1088  StringRef::iterator NameStart = Current;
1089  Current = skip_while(&Scanner::skip_ns_char, Current);
1090  StringRef Name(NameStart, Current - NameStart);
1091  Current = skip_while(&Scanner::skip_s_white, Current);
1092 
1093  Token T;
1094  if (Name == "YAML") {
1095  Current = skip_while(&Scanner::skip_ns_char, Current);
1097  T.Range = StringRef(Start, Current - Start);
1098  TokenQueue.push_back(T);
1099  return true;
1100  } else if(Name == "TAG") {
1101  Current = skip_while(&Scanner::skip_ns_char, Current);
1102  Current = skip_while(&Scanner::skip_s_white, Current);
1103  Current = skip_while(&Scanner::skip_ns_char, Current);
1105  T.Range = StringRef(Start, Current - Start);
1106  TokenQueue.push_back(T);
1107  return true;
1108  }
1109  return false;
1110 }
1111 
1112 bool Scanner::scanDocumentIndicator(bool IsStart) {
1113  unrollIndent(-1);
1114  SimpleKeys.clear();
1115  IsSimpleKeyAllowed = false;
1116 
1117  Token T;
1119  T.Range = StringRef(Current, 3);
1120  skip(3);
1121  TokenQueue.push_back(T);
1122  return true;
1123 }
1124 
1125 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1126  Token T;
1127  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1129  T.Range = StringRef(Current, 1);
1130  skip(1);
1131  TokenQueue.push_back(T);
1132 
1133  // [ and { may begin a simple key.
1134  saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
1135 
1136  // And may also be followed by a simple key.
1137  IsSimpleKeyAllowed = true;
1138  ++FlowLevel;
1139  return true;
1140 }
1141 
1142 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1143  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1144  IsSimpleKeyAllowed = false;
1145  Token T;
1146  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1148  T.Range = StringRef(Current, 1);
1149  skip(1);
1150  TokenQueue.push_back(T);
1151  if (FlowLevel)
1152  --FlowLevel;
1153  return true;
1154 }
1155 
1156 bool Scanner::scanFlowEntry() {
1157  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1158  IsSimpleKeyAllowed = true;
1159  Token T;
1161  T.Range = StringRef(Current, 1);
1162  skip(1);
1163  TokenQueue.push_back(T);
1164  return true;
1165 }
1166 
1167 bool Scanner::scanBlockEntry() {
1168  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1169  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1170  IsSimpleKeyAllowed = true;
1171  Token T;
1173  T.Range = StringRef(Current, 1);
1174  skip(1);
1175  TokenQueue.push_back(T);
1176  return true;
1177 }
1178 
1179 bool Scanner::scanKey() {
1180  if (!FlowLevel)
1181  rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1182 
1183  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1184  IsSimpleKeyAllowed = !FlowLevel;
1185 
1186  Token T;
1187  T.Kind = Token::TK_Key;
1188  T.Range = StringRef(Current, 1);
1189  skip(1);
1190  TokenQueue.push_back(T);
1191  return true;
1192 }
1193 
1194 bool Scanner::scanValue() {
1195  // If the previous token could have been a simple key, insert the key token
1196  // into the token queue.
1197  if (!SimpleKeys.empty()) {
1198  SimpleKey SK = SimpleKeys.pop_back_val();
1199  Token T;
1200  T.Kind = Token::TK_Key;
1201  T.Range = SK.Tok->Range;
1203  for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1204  if (i == SK.Tok)
1205  break;
1206  }
1207  assert(i != e && "SimpleKey not in token queue!");
1208  i = TokenQueue.insert(i, T);
1209 
1210  // We may also need to add a Block-Mapping-Start token.
1211  rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1212 
1213  IsSimpleKeyAllowed = false;
1214  } else {
1215  if (!FlowLevel)
1216  rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1217  IsSimpleKeyAllowed = !FlowLevel;
1218  }
1219 
1220  Token T;
1221  T.Kind = Token::TK_Value;
1222  T.Range = StringRef(Current, 1);
1223  skip(1);
1224  TokenQueue.push_back(T);
1225  return true;
1226 }
1227 
1228 // Forbidding inlining improves performance by roughly 20%.
1229 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1230 LLVM_ATTRIBUTE_NOINLINE static bool
1232 
1233 // Returns whether a character at 'Position' was escaped with a leading '\'.
1234 // 'First' specifies the position of the first character in the string.
1236  StringRef::iterator Position) {
1237  assert(Position - 1 >= First);
1238  StringRef::iterator I = Position - 1;
1239  // We calculate the number of consecutive '\'s before the current position
1240  // by iterating backwards through our string.
1241  while (I >= First && *I == '\\') --I;
1242  // (Position - 1 - I) now contains the number of '\'s before the current
1243  // position. If it is odd, the character at 'Position' was escaped.
1244  return (Position - 1 - I) % 2 == 1;
1245 }
1246 
1247 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1248  StringRef::iterator Start = Current;
1249  unsigned ColStart = Column;
1250  if (IsDoubleQuoted) {
1251  do {
1252  ++Current;
1253  while (Current != End && *Current != '"')
1254  ++Current;
1255  // Repeat until the previous character was not a '\' or was an escaped
1256  // backslash.
1257  } while ( Current != End
1258  && *(Current - 1) == '\\'
1259  && wasEscaped(Start + 1, Current));
1260  } else {
1261  skip(1);
1262  while (true) {
1263  // Skip a ' followed by another '.
1264  if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1265  skip(2);
1266  continue;
1267  } else if (*Current == '\'')
1268  break;
1269  StringRef::iterator i = skip_nb_char(Current);
1270  if (i == Current) {
1271  i = skip_b_break(Current);
1272  if (i == Current)
1273  break;
1274  Current = i;
1275  Column = 0;
1276  ++Line;
1277  } else {
1278  if (i == End)
1279  break;
1280  Current = i;
1281  ++Column;
1282  }
1283  }
1284  }
1285 
1286  if (Current == End) {
1287  setError("Expected quote at end of scalar", Current);
1288  return false;
1289  }
1290 
1291  skip(1); // Skip ending quote.
1292  Token T;
1293  T.Kind = Token::TK_Scalar;
1294  T.Range = StringRef(Start, Current - Start);
1295  TokenQueue.push_back(T);
1296 
1297  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1298 
1299  IsSimpleKeyAllowed = false;
1300 
1301  return true;
1302 }
1303 
1304 bool Scanner::scanPlainScalar() {
1305  StringRef::iterator Start = Current;
1306  unsigned ColStart = Column;
1307  unsigned LeadingBlanks = 0;
1308  assert(Indent >= -1 && "Indent must be >= -1 !");
1309  unsigned indent = static_cast<unsigned>(Indent + 1);
1310  while (true) {
1311  if (*Current == '#')
1312  break;
1313 
1314  while (!isBlankOrBreak(Current)) {
1315  if ( FlowLevel && *Current == ':'
1316  && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
1317  setError("Found unexpected ':' while scanning a plain scalar", Current);
1318  return false;
1319  }
1320 
1321  // Check for the end of the plain scalar.
1322  if ( (*Current == ':' && isBlankOrBreak(Current + 1))
1323  || ( FlowLevel
1324  && (StringRef(Current, 1).find_first_of(",:?[]{}")
1325  != StringRef::npos)))
1326  break;
1327 
1328  StringRef::iterator i = skip_nb_char(Current);
1329  if (i == Current)
1330  break;
1331  Current = i;
1332  ++Column;
1333  }
1334 
1335  // Are we at the end?
1336  if (!isBlankOrBreak(Current))
1337  break;
1338 
1339  // Eat blanks.
1340  StringRef::iterator Tmp = Current;
1341  while (isBlankOrBreak(Tmp)) {
1342  StringRef::iterator i = skip_s_white(Tmp);
1343  if (i != Tmp) {
1344  if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1345  setError("Found invalid tab character in indentation", Tmp);
1346  return false;
1347  }
1348  Tmp = i;
1349  ++Column;
1350  } else {
1351  i = skip_b_break(Tmp);
1352  if (!LeadingBlanks)
1353  LeadingBlanks = 1;
1354  Tmp = i;
1355  Column = 0;
1356  ++Line;
1357  }
1358  }
1359 
1360  if (!FlowLevel && Column < indent)
1361  break;
1362 
1363  Current = Tmp;
1364  }
1365  if (Start == Current) {
1366  setError("Got empty plain scalar", Start);
1367  return false;
1368  }
1369  Token T;
1370  T.Kind = Token::TK_Scalar;
1371  T.Range = StringRef(Start, Current - Start);
1372  TokenQueue.push_back(T);
1373 
1374  // Plain scalars can be simple keys.
1375  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1376 
1377  IsSimpleKeyAllowed = false;
1378 
1379  return true;
1380 }
1381 
1382 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1383  StringRef::iterator Start = Current;
1384  unsigned ColStart = Column;
1385  skip(1);
1386  while(true) {
1387  if ( *Current == '[' || *Current == ']'
1388  || *Current == '{' || *Current == '}'
1389  || *Current == ','
1390  || *Current == ':')
1391  break;
1392  StringRef::iterator i = skip_ns_char(Current);
1393  if (i == Current)
1394  break;
1395  Current = i;
1396  ++Column;
1397  }
1398 
1399  if (Start == Current) {
1400  setError("Got empty alias or anchor", Start);
1401  return false;
1402  }
1403 
1404  Token T;
1405  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1406  T.Range = StringRef(Start, Current - Start);
1407  TokenQueue.push_back(T);
1408 
1409  // Alias and anchors can be simple keys.
1410  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1411 
1412  IsSimpleKeyAllowed = false;
1413 
1414  return true;
1415 }
1416 
1417 char Scanner::scanBlockChompingIndicator() {
1418  char Indicator = ' ';
1419  if (Current != End && (*Current == '+' || *Current == '-')) {
1420  Indicator = *Current;
1421  skip(1);
1422  }
1423  return Indicator;
1424 }
1425 
1426 /// Get the number of line breaks after chomping.
1427 ///
1428 /// Return the number of trailing line breaks to emit, depending on
1429 /// \p ChompingIndicator.
1430 static unsigned getChompedLineBreaks(char ChompingIndicator,
1431  unsigned LineBreaks, StringRef Str) {
1432  if (ChompingIndicator == '-') // Strip all line breaks.
1433  return 0;
1434  if (ChompingIndicator == '+') // Keep all line breaks.
1435  return LineBreaks;
1436  // Clip trailing lines.
1437  return Str.empty() ? 0 : 1;
1438 }
1439 
1440 unsigned Scanner::scanBlockIndentationIndicator() {
1441  unsigned Indent = 0;
1442  if (Current != End && (*Current >= '1' && *Current <= '9')) {
1443  Indent = unsigned(*Current - '0');
1444  skip(1);
1445  }
1446  return Indent;
1447 }
1448 
1449 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1450  unsigned &IndentIndicator, bool &IsDone) {
1451  auto Start = Current;
1452 
1453  ChompingIndicator = scanBlockChompingIndicator();
1454  IndentIndicator = scanBlockIndentationIndicator();
1455  // Check for the chomping indicator once again.
1456  if (ChompingIndicator == ' ')
1457  ChompingIndicator = scanBlockChompingIndicator();
1458  Current = skip_while(&Scanner::skip_s_white, Current);
1459  skipComment();
1460 
1461  if (Current == End) { // EOF, we have an empty scalar.
1462  Token T;
1464  T.Range = StringRef(Start, Current - Start);
1465  TokenQueue.push_back(T);
1466  IsDone = true;
1467  return true;
1468  }
1469 
1470  if (!consumeLineBreakIfPresent()) {
1471  setError("Expected a line break after block scalar header", Current);
1472  return false;
1473  }
1474  return true;
1475 }
1476 
1477 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1478  unsigned BlockExitIndent,
1479  unsigned &LineBreaks, bool &IsDone) {
1480  unsigned MaxAllSpaceLineCharacters = 0;
1481  StringRef::iterator LongestAllSpaceLine;
1482 
1483  while (true) {
1484  advanceWhile(&Scanner::skip_s_space);
1485  if (skip_nb_char(Current) != Current) {
1486  // This line isn't empty, so try and find the indentation.
1487  if (Column <= BlockExitIndent) { // End of the block literal.
1488  IsDone = true;
1489  return true;
1490  }
1491  // We found the block's indentation.
1492  BlockIndent = Column;
1493  if (MaxAllSpaceLineCharacters > BlockIndent) {
1494  setError(
1495  "Leading all-spaces line must be smaller than the block indent",
1496  LongestAllSpaceLine);
1497  return false;
1498  }
1499  return true;
1500  }
1501  if (skip_b_break(Current) != Current &&
1502  Column > MaxAllSpaceLineCharacters) {
1503  // Record the longest all-space line in case it's longer than the
1504  // discovered block indent.
1505  MaxAllSpaceLineCharacters = Column;
1506  LongestAllSpaceLine = Current;
1507  }
1508 
1509  // Check for EOF.
1510  if (Current == End) {
1511  IsDone = true;
1512  return true;
1513  }
1514 
1515  if (!consumeLineBreakIfPresent()) {
1516  IsDone = true;
1517  return true;
1518  }
1519  ++LineBreaks;
1520  }
1521  return true;
1522 }
1523 
1524 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1525  unsigned BlockExitIndent, bool &IsDone) {
1526  // Skip the indentation.
1527  while (Column < BlockIndent) {
1528  auto I = skip_s_space(Current);
1529  if (I == Current)
1530  break;
1531  Current = I;
1532  ++Column;
1533  }
1534 
1535  if (skip_nb_char(Current) == Current)
1536  return true;
1537 
1538  if (Column <= BlockExitIndent) { // End of the block literal.
1539  IsDone = true;
1540  return true;
1541  }
1542 
1543  if (Column < BlockIndent) {
1544  if (Current != End && *Current == '#') { // Trailing comment.
1545  IsDone = true;
1546  return true;
1547  }
1548  setError("A text line is less indented than the block scalar", Current);
1549  return false;
1550  }
1551  return true; // A normal text line.
1552 }
1553 
1554 bool Scanner::scanBlockScalar(bool IsLiteral) {
1555  // Eat '|' or '>'
1556  assert(*Current == '|' || *Current == '>');
1557  skip(1);
1558 
1559  char ChompingIndicator;
1560  unsigned BlockIndent;
1561  bool IsDone = false;
1562  if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
1563  return false;
1564  if (IsDone)
1565  return true;
1566 
1567  auto Start = Current;
1568  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1569  unsigned LineBreaks = 0;
1570  if (BlockIndent == 0) {
1571  if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1572  IsDone))
1573  return false;
1574  }
1575 
1576  // Scan the block's scalars body.
1577  SmallString<256> Str;
1578  while (!IsDone) {
1579  if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1580  return false;
1581  if (IsDone)
1582  break;
1583 
1584  // Parse the current line.
1585  auto LineStart = Current;
1586  advanceWhile(&Scanner::skip_nb_char);
1587  if (LineStart != Current) {
1588  Str.append(LineBreaks, '\n');
1589  Str.append(StringRef(LineStart, Current - LineStart));
1590  LineBreaks = 0;
1591  }
1592 
1593  // Check for EOF.
1594  if (Current == End)
1595  break;
1596 
1597  if (!consumeLineBreakIfPresent())
1598  break;
1599  ++LineBreaks;
1600  }
1601 
1602  if (Current == End && !LineBreaks)
1603  // Ensure that there is at least one line break before the end of file.
1604  LineBreaks = 1;
1605  Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1606 
1607  // New lines may start a simple key.
1608  if (!FlowLevel)
1609  IsSimpleKeyAllowed = true;
1610 
1611  Token T;
1613  T.Range = StringRef(Start, Current - Start);
1614  T.Value = Str.str().str();
1615  TokenQueue.push_back(T);
1616  return true;
1617 }
1618 
1619 bool Scanner::scanTag() {
1620  StringRef::iterator Start = Current;
1621  unsigned ColStart = Column;
1622  skip(1); // Eat !.
1623  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1624  else if (*Current == '<') {
1625  skip(1);
1626  scan_ns_uri_char();
1627  if (!consume('>'))
1628  return false;
1629  } else {
1630  // FIXME: Actually parse the c-ns-shorthand-tag rule.
1631  Current = skip_while(&Scanner::skip_ns_char, Current);
1632  }
1633 
1634  Token T;
1635  T.Kind = Token::TK_Tag;
1636  T.Range = StringRef(Start, Current - Start);
1637  TokenQueue.push_back(T);
1638 
1639  // Tags can be simple keys.
1640  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1641 
1642  IsSimpleKeyAllowed = false;
1643 
1644  return true;
1645 }
1646 
1647 bool Scanner::fetchMoreTokens() {
1648  if (IsStartOfStream)
1649  return scanStreamStart();
1650 
1651  scanToNextToken();
1652 
1653  if (Current == End)
1654  return scanStreamEnd();
1655 
1656  removeStaleSimpleKeyCandidates();
1657 
1658  unrollIndent(Column);
1659 
1660  if (Column == 0 && *Current == '%')
1661  return scanDirective();
1662 
1663  if (Column == 0 && Current + 4 <= End
1664  && *Current == '-'
1665  && *(Current + 1) == '-'
1666  && *(Current + 2) == '-'
1667  && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1668  return scanDocumentIndicator(true);
1669 
1670  if (Column == 0 && Current + 4 <= End
1671  && *Current == '.'
1672  && *(Current + 1) == '.'
1673  && *(Current + 2) == '.'
1674  && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1675  return scanDocumentIndicator(false);
1676 
1677  if (*Current == '[')
1678  return scanFlowCollectionStart(true);
1679 
1680  if (*Current == '{')
1681  return scanFlowCollectionStart(false);
1682 
1683  if (*Current == ']')
1684  return scanFlowCollectionEnd(true);
1685 
1686  if (*Current == '}')
1687  return scanFlowCollectionEnd(false);
1688 
1689  if (*Current == ',')
1690  return scanFlowEntry();
1691 
1692  if (*Current == '-' && isBlankOrBreak(Current + 1))
1693  return scanBlockEntry();
1694 
1695  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1696  return scanKey();
1697 
1698  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1699  return scanValue();
1700 
1701  if (*Current == '*')
1702  return scanAliasOrAnchor(true);
1703 
1704  if (*Current == '&')
1705  return scanAliasOrAnchor(false);
1706 
1707  if (*Current == '!')
1708  return scanTag();
1709 
1710  if (*Current == '|' && !FlowLevel)
1711  return scanBlockScalar(true);
1712 
1713  if (*Current == '>' && !FlowLevel)
1714  return scanBlockScalar(false);
1715 
1716  if (*Current == '\'')
1717  return scanFlowScalar(false);
1718 
1719  if (*Current == '"')
1720  return scanFlowScalar(true);
1721 
1722  // Get a plain scalar.
1723  StringRef FirstChar(Current, 1);
1724  if (!(isBlankOrBreak(Current)
1725  || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1726  || (*Current == '-' && !isBlankOrBreak(Current + 1))
1727  || (!FlowLevel && (*Current == '?' || *Current == ':')
1728  && isBlankOrBreak(Current + 1))
1729  || (!FlowLevel && *Current == ':'
1730  && Current + 2 < End
1731  && *(Current + 1) == ':'
1732  && !isBlankOrBreak(Current + 2)))
1733  return scanPlainScalar();
1734 
1735  setError("Unrecognized character while tokenizing.");
1736  return false;
1737 }
1738 
1739 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
1740  std::error_code *EC)
1741  : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {}
1742 
1743 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
1744  std::error_code *EC)
1745  : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {}
1746 
1748 
1749 bool Stream::failed() { return scanner->failed(); }
1750 
1751 void Stream::printError(Node *N, const Twine &Msg) {
1752  scanner->printError( N->getSourceRange().Start
1754  , Msg
1755  , N->getSourceRange());
1756 }
1757 
1759  if (CurrentDoc)
1760  report_fatal_error("Can only iterate over the stream once");
1761 
1762  // Skip Stream-Start.
1763  scanner->getNext();
1764 
1765  CurrentDoc.reset(new Document(*this));
1766  return document_iterator(CurrentDoc);
1767 }
1768 
1770  return document_iterator();
1771 }
1772 
1774  for (document_iterator i = begin(), e = end(); i != e; ++i)
1775  i->skip();
1776 }
1777 
1778 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1779  StringRef T)
1780  : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1781  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1782  SourceRange = SMRange(Start, Start);
1783 }
1784 
1785 std::string Node::getVerbatimTag() const {
1786  StringRef Raw = getRawTag();
1787  if (!Raw.empty() && Raw != "!") {
1788  std::string Ret;
1789  if (Raw.find_last_of('!') == 0) {
1790  Ret = Doc->getTagMap().find("!")->second;
1791  Ret += Raw.substr(1);
1792  return Ret;
1793  } else if (Raw.startswith("!!")) {
1794  Ret = Doc->getTagMap().find("!!")->second;
1795  Ret += Raw.substr(2);
1796  return Ret;
1797  } else {
1798  StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1799  std::map<StringRef, StringRef>::const_iterator It =
1800  Doc->getTagMap().find(TagHandle);
1801  if (It != Doc->getTagMap().end())
1802  Ret = It->second;
1803  else {
1804  Token T;
1805  T.Kind = Token::TK_Tag;
1806  T.Range = TagHandle;
1807  setError(Twine("Unknown tag handle ") + TagHandle, T);
1808  }
1809  Ret += Raw.substr(Raw.find_last_of('!') + 1);
1810  return Ret;
1811  }
1812  }
1813 
1814  switch (getType()) {
1815  case NK_Null:
1816  return "tag:yaml.org,2002:null";
1817  case NK_Scalar:
1818  case NK_BlockScalar:
1819  // TODO: Tag resolution.
1820  return "tag:yaml.org,2002:str";
1821  case NK_Mapping:
1822  return "tag:yaml.org,2002:map";
1823  case NK_Sequence:
1824  return "tag:yaml.org,2002:seq";
1825  }
1826 
1827  return "";
1828 }
1829 
1830 Token &Node::peekNext() {
1831  return Doc->peekNext();
1832 }
1833 
1834 Token Node::getNext() {
1835  return Doc->getNext();
1836 }
1837 
1838 Node *Node::parseBlockNode() {
1839  return Doc->parseBlockNode();
1840 }
1841 
1842 BumpPtrAllocator &Node::getAllocator() {
1843  return Doc->NodeAllocator;
1844 }
1845 
1846 void Node::setError(const Twine &Msg, Token &Tok) const {
1847  Doc->setError(Msg, Tok);
1848 }
1849 
1850 bool Node::failed() const {
1851  return Doc->failed();
1852 }
1853 
1854 
1855 
1856 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1857  // TODO: Handle newlines properly. We need to remove leading whitespace.
1858  if (Value[0] == '"') { // Double quoted.
1859  // Pull off the leading and trailing "s.
1860  StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1861  // Search for characters that would require unescaping the value.
1862  StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1863  if (i != StringRef::npos)
1864  return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1865  return UnquotedValue;
1866  } else if (Value[0] == '\'') { // Single quoted.
1867  // Pull off the leading and trailing 's.
1868  StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1869  StringRef::size_type i = UnquotedValue.find('\'');
1870  if (i != StringRef::npos) {
1871  // We're going to need Storage.
1872  Storage.clear();
1873  Storage.reserve(UnquotedValue.size());
1874  for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1875  StringRef Valid(UnquotedValue.begin(), i);
1876  Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1877  Storage.push_back('\'');
1878  UnquotedValue = UnquotedValue.substr(i + 2);
1879  }
1880  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1881  return StringRef(Storage.begin(), Storage.size());
1882  }
1883  return UnquotedValue;
1884  }
1885  // Plain or block.
1886  return Value.rtrim(' ');
1887 }
1888 
1889 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1891  , SmallVectorImpl<char> &Storage)
1892  const {
1893  // Use Storage to build proper value.
1894  Storage.clear();
1895  Storage.reserve(UnquotedValue.size());
1896  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1897  // Insert all previous chars into Storage.
1898  StringRef Valid(UnquotedValue.begin(), i);
1899  Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1900  // Chop off inserted chars.
1901  UnquotedValue = UnquotedValue.substr(i);
1902 
1903  assert(!UnquotedValue.empty() && "Can't be empty!");
1904 
1905  // Parse escape or line break.
1906  switch (UnquotedValue[0]) {
1907  case '\r':
1908  case '\n':
1909  Storage.push_back('\n');
1910  if ( UnquotedValue.size() > 1
1911  && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1912  UnquotedValue = UnquotedValue.substr(1);
1913  UnquotedValue = UnquotedValue.substr(1);
1914  break;
1915  default:
1916  if (UnquotedValue.size() == 1)
1917  // TODO: Report error.
1918  break;
1919  UnquotedValue = UnquotedValue.substr(1);
1920  switch (UnquotedValue[0]) {
1921  default: {
1922  Token T;
1923  T.Range = StringRef(UnquotedValue.begin(), 1);
1924  setError("Unrecognized escape code!", T);
1925  return "";
1926  }
1927  case '\r':
1928  case '\n':
1929  // Remove the new line.
1930  if ( UnquotedValue.size() > 1
1931  && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1932  UnquotedValue = UnquotedValue.substr(1);
1933  // If this was just a single byte newline, it will get skipped
1934  // below.
1935  break;
1936  case '0':
1937  Storage.push_back(0x00);
1938  break;
1939  case 'a':
1940  Storage.push_back(0x07);
1941  break;
1942  case 'b':
1943  Storage.push_back(0x08);
1944  break;
1945  case 't':
1946  case 0x09:
1947  Storage.push_back(0x09);
1948  break;
1949  case 'n':
1950  Storage.push_back(0x0A);
1951  break;
1952  case 'v':
1953  Storage.push_back(0x0B);
1954  break;
1955  case 'f':
1956  Storage.push_back(0x0C);
1957  break;
1958  case 'r':
1959  Storage.push_back(0x0D);
1960  break;
1961  case 'e':
1962  Storage.push_back(0x1B);
1963  break;
1964  case ' ':
1965  Storage.push_back(0x20);
1966  break;
1967  case '"':
1968  Storage.push_back(0x22);
1969  break;
1970  case '/':
1971  Storage.push_back(0x2F);
1972  break;
1973  case '\\':
1974  Storage.push_back(0x5C);
1975  break;
1976  case 'N':
1977  encodeUTF8(0x85, Storage);
1978  break;
1979  case '_':
1980  encodeUTF8(0xA0, Storage);
1981  break;
1982  case 'L':
1983  encodeUTF8(0x2028, Storage);
1984  break;
1985  case 'P':
1986  encodeUTF8(0x2029, Storage);
1987  break;
1988  case 'x': {
1989  if (UnquotedValue.size() < 3)
1990  // TODO: Report error.
1991  break;
1992  unsigned int UnicodeScalarValue;
1993  if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
1994  // TODO: Report error.
1995  UnicodeScalarValue = 0xFFFD;
1996  encodeUTF8(UnicodeScalarValue, Storage);
1997  UnquotedValue = UnquotedValue.substr(2);
1998  break;
1999  }
2000  case 'u': {
2001  if (UnquotedValue.size() < 5)
2002  // TODO: Report error.
2003  break;
2004  unsigned int UnicodeScalarValue;
2005  if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
2006  // TODO: Report error.
2007  UnicodeScalarValue = 0xFFFD;
2008  encodeUTF8(UnicodeScalarValue, Storage);
2009  UnquotedValue = UnquotedValue.substr(4);
2010  break;
2011  }
2012  case 'U': {
2013  if (UnquotedValue.size() < 9)
2014  // TODO: Report error.
2015  break;
2016  unsigned int UnicodeScalarValue;
2017  if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
2018  // TODO: Report error.
2019  UnicodeScalarValue = 0xFFFD;
2020  encodeUTF8(UnicodeScalarValue, Storage);
2021  UnquotedValue = UnquotedValue.substr(8);
2022  break;
2023  }
2024  }
2025  UnquotedValue = UnquotedValue.substr(1);
2026  }
2027  }
2028  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
2029  return StringRef(Storage.begin(), Storage.size());
2030 }
2031 
2032 Node *KeyValueNode::getKey() {
2033  if (Key)
2034  return Key;
2035  // Handle implicit null keys.
2036  {
2037  Token &t = peekNext();
2038  if ( t.Kind == Token::TK_BlockEnd
2039  || t.Kind == Token::TK_Value
2040  || t.Kind == Token::TK_Error) {
2041  return Key = new (getAllocator()) NullNode(Doc);
2042  }
2043  if (t.Kind == Token::TK_Key)
2044  getNext(); // skip TK_Key.
2045  }
2046 
2047  // Handle explicit null keys.
2048  Token &t = peekNext();
2049  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
2050  return Key = new (getAllocator()) NullNode(Doc);
2051  }
2052 
2053  // We've got a normal key.
2054  return Key = parseBlockNode();
2055 }
2056 
2057 Node *KeyValueNode::getValue() {
2058  if (Value)
2059  return Value;
2060  getKey()->skip();
2061  if (failed())
2062  return Value = new (getAllocator()) NullNode(Doc);
2063 
2064  // Handle implicit null values.
2065  {
2066  Token &t = peekNext();
2067  if ( t.Kind == Token::TK_BlockEnd
2068  || t.Kind == Token::TK_FlowMappingEnd
2069  || t.Kind == Token::TK_Key
2070  || t.Kind == Token::TK_FlowEntry
2071  || t.Kind == Token::TK_Error) {
2072  return Value = new (getAllocator()) NullNode(Doc);
2073  }
2074 
2075  if (t.Kind != Token::TK_Value) {
2076  setError("Unexpected token in Key Value.", t);
2077  return Value = new (getAllocator()) NullNode(Doc);
2078  }
2079  getNext(); // skip TK_Value.
2080  }
2081 
2082  // Handle explicit null values.
2083  Token &t = peekNext();
2084  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
2085  return Value = new (getAllocator()) NullNode(Doc);
2086  }
2087 
2088  // We got a normal value.
2089  return Value = parseBlockNode();
2090 }
2091 
2092 void MappingNode::increment() {
2093  if (failed()) {
2094  IsAtEnd = true;
2095  CurrentEntry = nullptr;
2096  return;
2097  }
2098  if (CurrentEntry) {
2099  CurrentEntry->skip();
2100  if (Type == MT_Inline) {
2101  IsAtEnd = true;
2102  CurrentEntry = nullptr;
2103  return;
2104  }
2105  }
2106  Token T = peekNext();
2107  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
2108  // KeyValueNode eats the TK_Key. That way it can detect null keys.
2109  CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
2110  } else if (Type == MT_Block) {
2111  switch (T.Kind) {
2112  case Token::TK_BlockEnd:
2113  getNext();
2114  IsAtEnd = true;
2115  CurrentEntry = nullptr;
2116  break;
2117  default:
2118  setError("Unexpected token. Expected Key or Block End", T);
2119  case Token::TK_Error:
2120  IsAtEnd = true;
2121  CurrentEntry = nullptr;
2122  }
2123  } else {
2124  switch (T.Kind) {
2125  case Token::TK_FlowEntry:
2126  // Eat the flow entry and recurse.
2127  getNext();
2128  return increment();
2129  case Token::TK_FlowMappingEnd:
2130  getNext();
2131  case Token::TK_Error:
2132  // Set this to end iterator.
2133  IsAtEnd = true;
2134  CurrentEntry = nullptr;
2135  break;
2136  default:
2137  setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2138  "Mapping End."
2139  , T);
2140  IsAtEnd = true;
2141  CurrentEntry = nullptr;
2142  }
2143  }
2144 }
2145 
2146 void SequenceNode::increment() {
2147  if (failed()) {
2148  IsAtEnd = true;
2149  CurrentEntry = nullptr;
2150  return;
2151  }
2152  if (CurrentEntry)
2153  CurrentEntry->skip();
2154  Token T = peekNext();
2155  if (SeqType == ST_Block) {
2156  switch (T.Kind) {
2157  case Token::TK_BlockEntry:
2158  getNext();
2159  CurrentEntry = parseBlockNode();
2160  if (!CurrentEntry) { // An error occurred.
2161  IsAtEnd = true;
2162  CurrentEntry = nullptr;
2163  }
2164  break;
2165  case Token::TK_BlockEnd:
2166  getNext();
2167  IsAtEnd = true;
2168  CurrentEntry = nullptr;
2169  break;
2170  default:
2171  setError( "Unexpected token. Expected Block Entry or Block End."
2172  , T);
2173  case Token::TK_Error:
2174  IsAtEnd = true;
2175  CurrentEntry = nullptr;
2176  }
2177  } else if (SeqType == ST_Indentless) {
2178  switch (T.Kind) {
2179  case Token::TK_BlockEntry:
2180  getNext();
2181  CurrentEntry = parseBlockNode();
2182  if (!CurrentEntry) { // An error occurred.
2183  IsAtEnd = true;
2184  CurrentEntry = nullptr;
2185  }
2186  break;
2187  default:
2188  case Token::TK_Error:
2189  IsAtEnd = true;
2190  CurrentEntry = nullptr;
2191  }
2192  } else if (SeqType == ST_Flow) {
2193  switch (T.Kind) {
2194  case Token::TK_FlowEntry:
2195  // Eat the flow entry and recurse.
2196  getNext();
2197  WasPreviousTokenFlowEntry = true;
2198  return increment();
2199  case Token::TK_FlowSequenceEnd:
2200  getNext();
2201  case Token::TK_Error:
2202  // Set this to end iterator.
2203  IsAtEnd = true;
2204  CurrentEntry = nullptr;
2205  break;
2206  case Token::TK_StreamEnd:
2207  case Token::TK_DocumentEnd:
2208  case Token::TK_DocumentStart:
2209  setError("Could not find closing ]!", T);
2210  // Set this to end iterator.
2211  IsAtEnd = true;
2212  CurrentEntry = nullptr;
2213  break;
2214  default:
2215  if (!WasPreviousTokenFlowEntry) {
2216  setError("Expected , between entries!", T);
2217  IsAtEnd = true;
2218  CurrentEntry = nullptr;
2219  break;
2220  }
2221  // Otherwise it must be a flow entry.
2222  CurrentEntry = parseBlockNode();
2223  if (!CurrentEntry) {
2224  IsAtEnd = true;
2225  }
2226  WasPreviousTokenFlowEntry = false;
2227  break;
2228  }
2229  }
2230 }
2231 
2232 Document::Document(Stream &S) : stream(S), Root(nullptr) {
2233  // Tag maps starts with two default mappings.
2234  TagMap["!"] = "!";
2235  TagMap["!!"] = "tag:yaml.org,2002:";
2236 
2237  if (parseDirectives())
2238  expectToken(Token::TK_DocumentStart);
2239  Token &T = peekNext();
2240  if (T.Kind == Token::TK_DocumentStart)
2241  getNext();
2242 }
2243 
2245  if (stream.scanner->failed())
2246  return false;
2247  if (!Root)
2248  getRoot();
2249  Root->skip();
2250  Token &T = peekNext();
2251  if (T.Kind == Token::TK_StreamEnd)
2252  return false;
2253  if (T.Kind == Token::TK_DocumentEnd) {
2254  getNext();
2255  return skip();
2256  }
2257  return true;
2258 }
2259 
2260 Token &Document::peekNext() {
2261  return stream.scanner->peekNext();
2262 }
2263 
2264 Token Document::getNext() {
2265  return stream.scanner->getNext();
2266 }
2267 
2268 void Document::setError(const Twine &Message, Token &Location) const {
2269  stream.scanner->setError(Message, Location.Range.begin());
2270 }
2271 
2272 bool Document::failed() const {
2273  return stream.scanner->failed();
2274 }
2275 
2277  Token T = peekNext();
2278  // Handle properties.
2279  Token AnchorInfo;
2280  Token TagInfo;
2281 parse_property:
2282  switch (T.Kind) {
2283  case Token::TK_Alias:
2284  getNext();
2285  return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2286  case Token::TK_Anchor:
2287  if (AnchorInfo.Kind == Token::TK_Anchor) {
2288  setError("Already encountered an anchor for this node!", T);
2289  return nullptr;
2290  }
2291  AnchorInfo = getNext(); // Consume TK_Anchor.
2292  T = peekNext();
2293  goto parse_property;
2294  case Token::TK_Tag:
2295  if (TagInfo.Kind == Token::TK_Tag) {
2296  setError("Already encountered a tag for this node!", T);
2297  return nullptr;
2298  }
2299  TagInfo = getNext(); // Consume TK_Tag.
2300  T = peekNext();
2301  goto parse_property;
2302  default:
2303  break;
2304  }
2305 
2306  switch (T.Kind) {
2307  case Token::TK_BlockEntry:
2308  // We got an unindented BlockEntry sequence. This is not terminated with
2309  // a BlockEnd.
2310  // Don't eat the TK_BlockEntry, SequenceNode needs it.
2311  return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2312  , AnchorInfo.Range.substr(1)
2313  , TagInfo.Range
2316  getNext();
2317  return new (NodeAllocator)
2318  SequenceNode( stream.CurrentDoc
2319  , AnchorInfo.Range.substr(1)
2320  , TagInfo.Range
2323  getNext();
2324  return new (NodeAllocator)
2325  MappingNode( stream.CurrentDoc
2326  , AnchorInfo.Range.substr(1)
2327  , TagInfo.Range
2330  getNext();
2331  return new (NodeAllocator)
2332  SequenceNode( stream.CurrentDoc
2333  , AnchorInfo.Range.substr(1)
2334  , TagInfo.Range
2337  getNext();
2338  return new (NodeAllocator)
2339  MappingNode( stream.CurrentDoc
2340  , AnchorInfo.Range.substr(1)
2341  , TagInfo.Range
2343  case Token::TK_Scalar:
2344  getNext();
2345  return new (NodeAllocator)
2346  ScalarNode( stream.CurrentDoc
2347  , AnchorInfo.Range.substr(1)
2348  , TagInfo.Range
2349  , T.Range);
2350  case Token::TK_BlockScalar: {
2351  getNext();
2352  StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
2353  StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
2354  return new (NodeAllocator)
2355  BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2356  TagInfo.Range, StrCopy, T.Range);
2357  }
2358  case Token::TK_Key:
2359  // Don't eat the TK_Key, KeyValueNode expects it.
2360  return new (NodeAllocator)
2361  MappingNode( stream.CurrentDoc
2362  , AnchorInfo.Range.substr(1)
2363  , TagInfo.Range
2366  case Token::TK_DocumentEnd:
2367  case Token::TK_StreamEnd:
2368  default:
2369  // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2370  // !!null null.
2371  return new (NodeAllocator) NullNode(stream.CurrentDoc);
2372  case Token::TK_Error:
2373  return nullptr;
2374  }
2375  llvm_unreachable("Control flow shouldn't reach here.");
2376  return nullptr;
2377 }
2378 
2379 bool Document::parseDirectives() {
2380  bool isDirective = false;
2381  while (true) {
2382  Token T = peekNext();
2383  if (T.Kind == Token::TK_TagDirective) {
2384  parseTAGDirective();
2385  isDirective = true;
2386  } else if (T.Kind == Token::TK_VersionDirective) {
2387  parseYAMLDirective();
2388  isDirective = true;
2389  } else
2390  break;
2391  }
2392  return isDirective;
2393 }
2394 
2395 void Document::parseYAMLDirective() {
2396  getNext(); // Eat %YAML <version>
2397 }
2398 
2399 void Document::parseTAGDirective() {
2400  Token Tag = getNext(); // %TAG <handle> <prefix>
2401  StringRef T = Tag.Range;
2402  // Strip %TAG
2403  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2404  std::size_t HandleEnd = T.find_first_of(" \t");
2405  StringRef TagHandle = T.substr(0, HandleEnd);
2406  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2407  TagMap[TagHandle] = TagPrefix;
2408 }
2409 
2410 bool Document::expectToken(int TK) {
2411  Token T = getNext();
2412  if (T.Kind != TK) {
2413  setError("Unexpected token", T);
2414  return false;
2415  }
2416  return true;
2417 }
const NoneType None
Definition: None.h:23
std::enable_if< std::numeric_limits< T >::is_signed, bool >::type getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:494
Represents a range in source code.
Definition: SMLoc.h:49
void push_back(const T &Elt)
Definition: SmallVector.h:211
std::unique_ptr< Document > & Doc
Definition: YAMLParser.h:160
std::string getVerbatimTag() const
Get the verbatium tag for a given Node.
bool operator==(const BinaryRef &LHS, const BinaryRef &RHS)
Definition: YAML.h:88
uint64_t Token
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
size_t i
llvm::BumpPtrList< Token > TokenQueueT
Definition: YAMLParser.cpp:150
A linked-list with a custom, local allocator.
Definition: AllocatorList.h:29
Node(unsigned int Type, std::unique_ptr< Document > &, StringRef Anchor, StringRef Tag)
Not a valid Unicode encoding.
Definition: YAMLParser.cpp:35
Scanner(StringRef Input, SourceMgr &SM, bool ShowColors=true, std::error_code *EC=nullptr)
Definition: YAMLParser.cpp:733
bool scanTokens(StringRef Input)
Scans all tokens in input without outputting anything.
Definition: YAMLParser.cpp:657
static std::unique_ptr< MemoryBuffer > getMemBuffer(StringRef InputData, StringRef BufferName="", bool RequiresNullTerminator=true)
Open the specified memory range as a MemoryBuffer.
std::string Value
The value of a block scalar node.
Definition: YAMLParser.cpp:143
StringRef getRawTag() const
Get the tag as it was written in the document.
Definition: YAMLParser.h:128
Represents a YAML sequence created from either a block sequence for a flow sequence.
Definition: YAMLParser.h:437
static LLVM_ATTRIBUTE_NOINLINE bool wasEscaped(StringRef::iterator First, StringRef::iterator Position)
UTF-8 or ascii.
Definition: YAMLParser.cpp:34
const char * getBufferStart() const
Definition: MemoryBuffer.h:173
void resetAlloc()
Reset the underlying allocator.
Node * parseBlockNode()
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:905
std::pair< uint32_t, unsigned > UTF8Decoded
The Unicode scalar value of a UTF-8 minimal well-formed code unit subsequence and the subsequence's l...
Definition: YAMLParser.cpp:178
document_iterator begin()
Represents an alias to a Node with an anchor.
Definition: YAMLParser.h:490
void skip(CollectionType &C)
Definition: YAMLParser.h:374
void reserve(size_type N)
Definition: SmallVector.h:377
UTF-32 Little Endian.
Definition: YAMLParser.cpp:30
document_iterator end()
void push_back(T &&V)
void setError(const Twine &Message, Token &Location) const
SMLoc Start
Definition: SMLoc.h:51
static bool is_ns_hex_digit(const char C)
Definition: YAMLParser.cpp:881
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static F t[256]
bool failed()
Returns true if an error occurred while parsing.
Definition: YAMLParser.cpp:271
static GCRegistry::Add< StatepointGC > D("statepoint-example","an example strategy for statepoint")
std::error_code make_error_code(BitcodeError E)
TypeID
Definitions of all of the base types for the Type system.
Definition: Type.h:54
Tagged union holding either a T or a Error.
static EncodingInfo getUnicodeEncoding(StringRef Input)
getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode encoding form of Input...
Definition: YAMLParser.cpp:48
ELFYAML::ELF_STO Other
Definition: ELFYAML.cpp:662
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:60
The Input class is used to parse a yaml document into in-memory structs and vectors.
Definition: YAMLTraits.h:1099
virtual void skip()
Definition: YAMLParser.h:145
A key and value pair.
Definition: YAMLParser.h:266
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:264
Node * getRoot()
Parse and return the root level node.
Definition: YAMLParser.h:520
#define T
unsigned AddNewSourceBuffer(std::unique_ptr< MemoryBuffer > F, SMLoc IncludeLoc)
Add a new source buffer to this source manager.
Definition: SourceMgr.h:118
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE size_t size() const
size - Get the string size.
Definition: StringRef.h:135
iterator begin() const
Definition: StringRef.h:103
void append(in_iter S, in_iter E)
Append from an iterator pair.
Definition: SmallString.h:75
bool dumpTokens(StringRef Input, raw_ostream &)
Dump all the tokens in this stream to OS.
Definition: YAMLParser.cpp:573
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE size_t find(char C, size_t From=0) const
Search for the first character C in the string.
Definition: StringRef.h:295
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
void printError(Node *N, const Twine &Msg)
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:138
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:115
const char * getBufferEnd() const
Definition: MemoryBuffer.h:174
LLVM_NODISCARD size_t find_last_of(char C, size_t From=npos) const
Find the last character in the string that is C, or npos if not found.
Definition: StringRef.h:418
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:587
std::string escape(StringRef Input)
Escape Input for a double quoted scalar.
Definition: YAMLParser.cpp:670
static unsigned getChompedLineBreaks(char ChompingIndicator, unsigned LineBreaks, StringRef Str)
Get the number of line breaks after chomping.
static const unsigned End
StringRef Range
A string of length 0 or more whose begin() points to the logical location of the token in the input...
Definition: YAMLParser.cpp:140
void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, ArrayRef< SMRange > Ranges=None)
Definition: YAMLParser.cpp:246
Token getNext()
Parse the next token and pop it from the queue.
Definition: YAMLParser.cpp:787
This owns the files read by a parser, handles include stacks, and handles diagnostic wrangling...
Definition: SourceMgr.h:35
LLVM_NODISCARD std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:225
iterator erase(const_iterator CI)
Definition: SmallVector.h:431
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
A scalar node is an opaque datum that can be presented as a series of zero or more Unicode scalar val...
Definition: YAMLParser.h:193
UTF-16 Little Endian.
Definition: YAMLParser.cpp:32
static Cursor skipComment(Cursor C)
Skip a line comment and return the updated cursor.
Definition: MILexer.cpp:94
A null value.
Definition: YAMLParser.h:178
void setError(const Twine &Message, StringRef::iterator Position)
Definition: YAMLParser.cpp:251
const char * iterator
Definition: StringRef.h:49
bool skip()
Finish parsing the current document and return true if there are more.
enum llvm::yaml::Token::TokenKind Kind
Token & peekNext()
Parse the next token and return it without popping it.
Definition: YAMLParser.cpp:761
Token & peekNext()
This class represents a YAML stream potentially containing multiple documents.
Definition: YAMLParser.h:76
#define LLVM_ATTRIBUTE_NOINLINE
LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so, mark a method "not for inl...
Definition: Compiler.h:192
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:382
static GCRegistry::Add< ShadowStackGC > C("shadow-stack","Very portable GC for uncooperative code generators")
iterator insert(iterator I, T &&V)
static void encodeUTF8(uint32_t UnicodeScalarValue, SmallVectorImpl< char > &Result)
encodeUTF8 - Encode UnicodeScalarValue in UTF-8 and append it to result.
Definition: YAMLParser.cpp:545
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:267
void setError(const Twine &Message)
Definition: YAMLParser.cpp:266
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:130
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:464
static UTF8Decoded decodeUTF8(StringRef Range)
Definition: YAMLParser.cpp:180
LLVM_NODISCARD StringRef copy(Allocator &A) const
Definition: StringRef.h:153
static SMLoc getFromPointer(const char *Ptr)
Definition: SMLoc.h:37
SMRange getSourceRange() const
Definition: YAMLParser.h:134
Stream(StringRef Input, SourceMgr &, bool ShowColors=true, std::error_code *EC=nullptr)
This keeps a reference to the string referenced by Input.
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:119
static const size_t npos
Definition: StringRef.h:51
size_t size_type
Definition: StringRef.h:52
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:135
LLVM_NODISCARD StringRef ltrim(char Char) const
Return string with consecutive Char characters starting from the the left removed.
Definition: StringRef.h:797
Token - A single YAML token.
Definition: YAMLParser.cpp:111
A block scalar node is an opaque datum that can be presented as a series of zero or more Unicode scal...
Definition: YAMLParser.h:236
Represents a YAML map created from either a block map for a flow map.
Definition: YAMLParser.h:390
IteratorImpl< T, typename list_type::iterator > iterator
UTF-16 Big Endian.
Definition: YAMLParser.cpp:33
static std::string utohexstr(uint64_t X, bool LowerCase=false)
Definition: StringExtras.h:48
const unsigned Kind
Scans YAML tokens from a MemoryBuffer.
Definition: YAMLParser.cpp:233
std::pair< UnicodeEncodingForm, unsigned > EncodingInfo
EncodingInfo - Holds the encoding type and length of the byte order mark if it exists.
Definition: YAMLParser.cpp:40
LLVM_NODISCARD size_t find_first_of(char C, size_t From=0) const
Find the first character in the string that is C, or npos if not found.
Definition: StringRef.h:392
Iterator abstraction for Documents over a Stream.
Definition: YAMLParser.h:565
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SMRange SourceRange
Definition: YAMLParser.h:161
LLVM Value Representation.
Definition: Value.h:71
friend class Document
Definition: YAMLParser.h:101
iterator end() const
Definition: StringRef.h:105
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:44
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:47
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:643
void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind, const Twine &Msg, ArrayRef< SMRange > Ranges=None, ArrayRef< SMFixIt > FixIts=None, bool ShowColors=true) const
Emit a message about the specified location with the specified string.
Definition: SourceMgr.cpp:216
Represents a location in source code.
Definition: SMLoc.h:24
An inline mapping node is used for "[key: value]".
Definition: YAMLParser.h:397
static GCRegistry::Add< ErlangGC > A("erlang","erlang-compatible garbage collector")
UTF-32 Big Endian.
Definition: YAMLParser.cpp:31
Node * parseBlockNode()
Root for parsing a node. Returns a single node.
UnicodeEncodingForm
Definition: YAMLParser.cpp:29
Abstract base class for all Nodes.
Definition: YAMLParser.h:105
static bool is_ns_word_char(const char C)
Definition: YAMLParser.cpp:887
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:783