LLVM  3.7.0
YAMLParser.cpp
Go to the documentation of this file.
1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a YAML parser.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/SmallString.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/StringExtras.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/ADT/ilist.h"
20 #include "llvm/ADT/ilist_node.h"
23 #include "llvm/Support/SourceMgr.h"
25 
26 using namespace llvm;
27 using namespace yaml;
28 
30  UEF_UTF32_LE, ///< UTF-32 Little Endian
31  UEF_UTF32_BE, ///< UTF-32 Big Endian
32  UEF_UTF16_LE, ///< UTF-16 Little Endian
33  UEF_UTF16_BE, ///< UTF-16 Big Endian
34  UEF_UTF8, ///< UTF-8 or ascii.
35  UEF_Unknown ///< Not a valid Unicode encoding.
36 };
37 
38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
39 /// it exists. Length is in {0, 2, 3, 4}.
40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
41 
42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
43 /// encoding form of \a Input.
44 ///
45 /// @param Input A string of length 0 or more.
46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
47 /// and how long the byte order mark is if one exists.
49  if (Input.size() == 0)
50  return std::make_pair(UEF_Unknown, 0);
51 
52  switch (uint8_t(Input[0])) {
53  case 0x00:
54  if (Input.size() >= 4) {
55  if ( Input[1] == 0
56  && uint8_t(Input[2]) == 0xFE
57  && uint8_t(Input[3]) == 0xFF)
58  return std::make_pair(UEF_UTF32_BE, 4);
59  if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
60  return std::make_pair(UEF_UTF32_BE, 0);
61  }
62 
63  if (Input.size() >= 2 && Input[1] != 0)
64  return std::make_pair(UEF_UTF16_BE, 0);
65  return std::make_pair(UEF_Unknown, 0);
66  case 0xFF:
67  if ( Input.size() >= 4
68  && uint8_t(Input[1]) == 0xFE
69  && Input[2] == 0
70  && Input[3] == 0)
71  return std::make_pair(UEF_UTF32_LE, 4);
72 
73  if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
74  return std::make_pair(UEF_UTF16_LE, 2);
75  return std::make_pair(UEF_Unknown, 0);
76  case 0xFE:
77  if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
78  return std::make_pair(UEF_UTF16_BE, 2);
79  return std::make_pair(UEF_Unknown, 0);
80  case 0xEF:
81  if ( Input.size() >= 3
82  && uint8_t(Input[1]) == 0xBB
83  && uint8_t(Input[2]) == 0xBF)
84  return std::make_pair(UEF_UTF8, 3);
85  return std::make_pair(UEF_Unknown, 0);
86  }
87 
88  // It could still be utf-32 or utf-16.
89  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
90  return std::make_pair(UEF_UTF32_LE, 0);
91 
92  if (Input.size() >= 2 && Input[1] == 0)
93  return std::make_pair(UEF_UTF16_LE, 0);
94 
95  return std::make_pair(UEF_UTF8, 0);
96 }
97 
98 namespace llvm {
99 namespace yaml {
100 /// Pin the vtables to this file.
///
/// Every node class gets an empty, out-of-line anchor() definition here.
/// NOTE(review): presumably anchor() is declared virtual in the header so that
/// each class's vtable is emitted only in this translation unit -- confirm
/// against the class declarations.
101 void Node::anchor() {}
102 void NullNode::anchor() {}
103 void ScalarNode::anchor() {}
104 void BlockScalarNode::anchor() {}
105 void KeyValueNode::anchor() {}
106 void MappingNode::anchor() {}
107 void SequenceNode::anchor() {}
108 void AliasNode::anchor() {}
109 
110 /// Token - A single YAML token.
111 struct Token : ilist_node<Token> {
112  enum TokenKind {
113  TK_Error, // Uninitialized token.
135  TK_Tag
136  } Kind;
137 
138  /// A string of length 0 or more whose begin() points to the logical location
139  /// of the token in the input.
141 
142  /// The value of a block scalar node.
143  std::string Value;
144 
145  Token() : Kind(TK_Error) {}
146 };
147 }
148 }
149 
150 namespace llvm {
151 template<>
154  return &Sentinel;
155  }
156  static void destroySentinel(Token*) {}
157 
158  Token *provideInitialHead() const { return createSentinel(); }
159  Token *ensureHead(Token*) const { return createSentinel(); }
160  static void noteHead(Token*, Token*) {}
161 
162 private:
163  mutable Token Sentinel;
164 };
165 
166 template<>
168  Token *createNode(const Token &V) {
169  return new (Alloc.Allocate<Token>()) Token(V);
170  }
171  static void deleteNode(Token *V) { V->~Token(); }
172 
173  void addNodeToList(Token *) {}
176  ilist_iterator<Token> /*first*/,
177  ilist_iterator<Token> /*last*/) {}
178 
180 };
181 }
182 
184 
185 namespace {
186 /// @brief This struct is used to track simple keys.
187 ///
188 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
189 /// which could legally be the start of a simple key. When peekNext is called,
190 /// if the Token to be returned is referenced by a SimpleKey, we continue
191 /// tokenizing until that potential simple key has either been found to not be
192 /// a simple key (we moved on to the next line or went further than 1024 chars),
193 /// or we run into a Value, in which case we insert a Key token (and possibly
194 /// others) before the SimpleKey's Tok.
195 struct SimpleKey {
197  unsigned Column;
198  unsigned Line;
199  unsigned FlowLevel;
200  bool IsRequired;
201 
202  bool operator ==(const SimpleKey &Other) {
203  return Tok == Other.Tok;
204  }
205 };
206 }
207 
/// @brief Result of decoding one UTF-8 minimal well-formed code unit
/// subsequence: \c first is the Unicode scalar value and \c second is the
/// number of code units (uint8_t) the subsequence occupies.
/// A length of 0 signals a decoding error.
using UTF8Decoded = std::pair<uint32_t, unsigned>;
212 
214  StringRef::iterator Position= Range.begin();
215  StringRef::iterator End = Range.end();
216  // 1 byte: [0x00, 0x7f]
217  // Bit pattern: 0xxxxxxx
218  if ((*Position & 0x80) == 0) {
219  return std::make_pair(*Position, 1);
220  }
221  // 2 bytes: [0x80, 0x7ff]
222  // Bit pattern: 110xxxxx 10xxxxxx
223  if (Position + 1 != End &&
224  ((*Position & 0xE0) == 0xC0) &&
225  ((*(Position + 1) & 0xC0) == 0x80)) {
226  uint32_t codepoint = ((*Position & 0x1F) << 6) |
227  (*(Position + 1) & 0x3F);
228  if (codepoint >= 0x80)
229  return std::make_pair(codepoint, 2);
230  }
231  // 3 bytes: [0x800, 0xffff]
232  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
233  if (Position + 2 != End &&
234  ((*Position & 0xF0) == 0xE0) &&
235  ((*(Position + 1) & 0xC0) == 0x80) &&
236  ((*(Position + 2) & 0xC0) == 0x80)) {
237  uint32_t codepoint = ((*Position & 0x0F) << 12) |
238  ((*(Position + 1) & 0x3F) << 6) |
239  (*(Position + 2) & 0x3F);
240  // Codepoints between 0xD800 and 0xDFFF are invalid, as
241  // they are high / low surrogate halves used by UTF-16.
242  if (codepoint >= 0x800 &&
243  (codepoint < 0xD800 || codepoint > 0xDFFF))
244  return std::make_pair(codepoint, 3);
245  }
246  // 4 bytes: [0x10000, 0x10FFFF]
247  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
248  if (Position + 3 != End &&
249  ((*Position & 0xF8) == 0xF0) &&
250  ((*(Position + 1) & 0xC0) == 0x80) &&
251  ((*(Position + 2) & 0xC0) == 0x80) &&
252  ((*(Position + 3) & 0xC0) == 0x80)) {
253  uint32_t codepoint = ((*Position & 0x07) << 18) |
254  ((*(Position + 1) & 0x3F) << 12) |
255  ((*(Position + 2) & 0x3F) << 6) |
256  (*(Position + 3) & 0x3F);
257  if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
258  return std::make_pair(codepoint, 4);
259  }
260  return std::make_pair(0, 0);
261 }
262 
263 namespace llvm {
264 namespace yaml {
265 /// @brief Scans YAML tokens from a MemoryBuffer.
266 class Scanner {
267 public:
268  Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true);
269  Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true);
270 
271  /// @brief Parse the next token and return it without popping it.
272  Token &peekNext();
273 
274  /// @brief Parse the next token and pop it from the queue.
275  Token getNext();
276 
277  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
278  ArrayRef<SMRange> Ranges = None) {
279  SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
280  }
281 
282  void setError(const Twine &Message, StringRef::iterator Position) {
283  if (Current >= End)
284  Current = End - 1;
285 
286  // Don't print out more errors after the first one we encounter. The rest
287  // are just the result of the first, and have no meaning.
288  if (!Failed)
289  printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
290  Failed = true;
291  }
292 
293  void setError(const Twine &Message) {
294  setError(Message, Current);
295  }
296 
297  /// @brief Returns true if an error occurred while parsing.
298  bool failed() {
299  return Failed;
300  }
301 
302 private:
303  void init(MemoryBufferRef Buffer);
304 
305  StringRef currentInput() {
306  return StringRef(Current, End - Current);
307  }
308 
309  /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
310  /// at \a Position.
311  ///
312  /// If the UTF-8 code units starting at Position do not form a well-formed
313  /// code unit subsequence, then the Unicode scalar value is 0, and the length
314  /// is 0.
316  return ::decodeUTF8(StringRef(Position, End - Position));
317  }
318 
319  // The following functions are based on the grammar rules in the YAML spec. The
320  // style of the function names is meant to closely match how they are written
321  // in the spec. The number within the [] is the number of the grammar rule in
322  // the spec.
323  //
324  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
325  //
326  // c-
327  // A production starting and ending with a special character.
328  // b-
329  // A production matching a single line break.
330  // nb-
331  // A production starting and ending with a non-break character.
332  // s-
333  // A production starting and ending with a white space character.
334  // ns-
335  // A production starting and ending with a non-space character.
336  // l-
337  // A production matching complete line(s).
338 
339  /// @brief Skip a single nb-char[27] starting at Position.
340  ///
341  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
342  /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
343  ///
344  /// @returns The code unit after the nb-char, or Position if it's not an
345  /// nb-char.
346  StringRef::iterator skip_nb_char(StringRef::iterator Position);
347 
348  /// @brief Skip a single b-break[28] starting at Position.
349  ///
350  /// A b-break is 0xD 0xA | 0xD | 0xA
351  ///
352  /// @returns The code unit after the b-break, or Position if it's not a
353  /// b-break.
354  StringRef::iterator skip_b_break(StringRef::iterator Position);
355 
356  /// Skip a single s-space[31] starting at Position.
357  ///
358  /// An s-space is 0x20
359  ///
360  /// @returns The code unit after the s-space, or Position if it's not a
361  /// s-space.
362  StringRef::iterator skip_s_space(StringRef::iterator Position);
363 
364  /// @brief Skip a single s-white[33] starting at Position.
365  ///
366  /// A s-white is 0x20 | 0x9
367  ///
368  /// @returns The code unit after the s-white, or Position if it's not a
369  /// s-white.
370  StringRef::iterator skip_s_white(StringRef::iterator Position);
371 
372  /// @brief Skip a single ns-char[34] starting at Position.
373  ///
374  /// A ns-char is nb-char - s-white
375  ///
376  /// @returns The code unit after the ns-char, or Position if it's not a
377  /// ns-char.
378  StringRef::iterator skip_ns_char(StringRef::iterator Position);
379 
380  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
381  /// @brief Skip minimal well-formed code unit subsequences until Func
382  /// returns its input.
383  ///
384  /// @returns The code unit after the last minimal well-formed code unit
385  /// subsequence that Func accepted.
386  StringRef::iterator skip_while( SkipWhileFunc Func
387  , StringRef::iterator Position);
388 
389  /// Skip minimal well-formed code unit subsequences until Func returns its
390  /// input.
391  void advanceWhile(SkipWhileFunc Func);
392 
393  /// @brief Scan ns-uri-char[39]s starting at Current.
394  ///
395  /// This updates Current and Column while scanning.
396  ///
397  /// @returns A StringRef starting at the initial value of Current which covers
398  /// the longest contiguous sequence of ns-uri-char.
399  StringRef scan_ns_uri_char();
400 
401  /// @brief Consume a minimal well-formed code unit subsequence starting at
402  /// \a Current. Return false if it is not the same Unicode scalar value as
403  /// \a Expected. This updates \a Column.
404  bool consume(uint32_t Expected);
405 
406  /// @brief Skip \a Distance UTF-8 code units. Updates \a Current and \a Column.
407  void skip(uint32_t Distance);
408 
409  /// @brief Return true if the minimal well-formed code unit subsequence at
410  /// \a Position is whitespace or a new line.
411  bool isBlankOrBreak(StringRef::iterator Position);
412 
413  /// Consume a single b-break[28] if it's present at the current position.
414  ///
415  /// Return false if the code unit at the current position isn't a line break.
416  bool consumeLineBreakIfPresent();
417 
418  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
419  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
420  , unsigned AtColumn
421  , bool IsRequired);
422 
423  /// @brief Remove simple keys that can no longer be valid simple keys.
424  ///
425  /// Invalid simple keys are not on the current line or are further than 1024
426  /// columns back.
427  void removeStaleSimpleKeyCandidates();
428 
429  /// @brief Remove all simple keys on FlowLevel \a Level.
430  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
431 
432  /// @brief Unroll indentation in \a Indents back to \a ToColumn. Creates
433  /// BlockEnd tokens if needed.
434  bool unrollIndent(int ToColumn);
435 
436  /// @brief Increase indent to \a ToColumn. Creates \a Kind token at
437  /// \a InsertPoint if needed.
438  bool rollIndent( int ToColumn
440  , TokenQueueT::iterator InsertPoint);
441 
442  /// @brief Skip a single-line comment when the comment starts at the current
443  /// position of the scanner.
444  void skipComment();
445 
446  /// @brief Skip whitespace and comments until the start of the next token.
447  void scanToNextToken();
448 
449  /// @brief Must be the first token generated.
450  bool scanStreamStart();
451 
452  /// @brief Generate tokens needed to close out the stream.
453  bool scanStreamEnd();
454 
455  /// @brief Scan a %BLAH directive.
456  bool scanDirective();
457 
458  /// @brief Scan a ... or ---.
459  bool scanDocumentIndicator(bool IsStart);
460 
461  /// @brief Scan a [ or { and generate the proper flow collection start token.
462  bool scanFlowCollectionStart(bool IsSequence);
463 
464  /// @brief Scan a ] or } and generate the proper flow collection end token.
465  bool scanFlowCollectionEnd(bool IsSequence);
466 
467  /// @brief Scan the , that separates entries in a flow collection.
468  bool scanFlowEntry();
469 
470  /// @brief Scan the - that starts block sequence entries.
471  bool scanBlockEntry();
472 
473  /// @brief Scan an explicit ? indicating a key.
474  bool scanKey();
475 
476  /// @brief Scan an explicit : indicating a value.
477  bool scanValue();
478 
479  /// @brief Scan a quoted scalar.
480  bool scanFlowScalar(bool IsDoubleQuoted);
481 
482  /// @brief Scan an unquoted scalar.
483  bool scanPlainScalar();
484 
485  /// @brief Scan an Alias or Anchor starting with * or &.
486  bool scanAliasOrAnchor(bool IsAlias);
487 
488  /// @brief Scan a block scalar starting with | or >.
489  bool scanBlockScalar(bool IsLiteral);
490 
491  /// Scan a chomping indicator in a block scalar header.
492  char scanBlockChompingIndicator();
493 
494  /// Scan an indentation indicator in a block scalar header.
495  unsigned scanBlockIndentationIndicator();
496 
497  /// Scan a block scalar header.
498  ///
499  /// Return false if an error occurred.
500  bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
501  bool &IsDone);
502 
503  /// Look for the indentation level of a block scalar.
504  ///
505  /// Return false if an error occurred.
506  bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
507  unsigned &LineBreaks, bool &IsDone);
508 
509  /// Scan the indentation of a text line in a block scalar.
510  ///
511  /// Return false if an error occurred.
512  bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
513  bool &IsDone);
514 
515  /// @brief Scan a tag of the form !stuff.
516  bool scanTag();
517 
518  /// @brief Dispatch to the next scanning function based on \a *Current.
519  bool fetchMoreTokens();
520 
521  /// @brief The SourceMgr used for diagnostics and buffer management.
522  SourceMgr &SM;
523 
524  /// @brief The original input.
525  MemoryBufferRef InputBuffer;
526 
527  /// @brief The current position of the scanner.
528  StringRef::iterator Current;
529 
530  /// @brief The end of the input (one past the last character).
532 
533  /// @brief Current YAML indentation level in spaces.
534  int Indent;
535 
536  /// @brief Current column number in Unicode code points.
537  unsigned Column;
538 
539  /// @brief Current line number.
540  unsigned Line;
541 
542  /// @brief How deep we are in flow style containers. 0 Means at block level.
543  unsigned FlowLevel;
544 
545  /// @brief Are we at the start of the stream?
546  bool IsStartOfStream;
547 
548  /// @brief Can the next token be the start of a simple key?
549  bool IsSimpleKeyAllowed;
550 
551  /// @brief True if an error has occurred.
552  bool Failed;
553 
554  /// @brief Should colors be used when printing out the diagnostic messages?
555  bool ShowColors;
556 
557  /// @brief Queue of tokens. This is required to queue up tokens while looking
558  /// for the end of a simple key. And for cases where a single character
559  /// can produce multiple tokens (e.g. BlockEnd).
560  TokenQueueT TokenQueue;
561 
562  /// @brief Indentation levels.
563  SmallVector<int, 4> Indents;
564 
565  /// @brief Potential simple keys.
566  SmallVector<SimpleKey, 4> SimpleKeys;
567 };
568 
569 } // end namespace yaml
570 } // end namespace llvm
571 
572 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
573 static void encodeUTF8( uint32_t UnicodeScalarValue
574  , SmallVectorImpl<char> &Result) {
575  if (UnicodeScalarValue <= 0x7F) {
576  Result.push_back(UnicodeScalarValue & 0x7F);
577  } else if (UnicodeScalarValue <= 0x7FF) {
578  uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
579  uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
580  Result.push_back(FirstByte);
581  Result.push_back(SecondByte);
582  } else if (UnicodeScalarValue <= 0xFFFF) {
583  uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
584  uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
585  uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
586  Result.push_back(FirstByte);
587  Result.push_back(SecondByte);
588  Result.push_back(ThirdByte);
589  } else if (UnicodeScalarValue <= 0x10FFFF) {
590  uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
591  uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
592  uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
593  uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
594  Result.push_back(FirstByte);
595  Result.push_back(SecondByte);
596  Result.push_back(ThirdByte);
597  Result.push_back(FourthByte);
598  }
599 }
600 
602  SourceMgr SM;
603  Scanner scanner(Input, SM);
604  while (true) {
605  Token T = scanner.getNext();
606  switch (T.Kind) {
608  OS << "Stream-Start: ";
609  break;
610  case Token::TK_StreamEnd:
611  OS << "Stream-End: ";
612  break;
614  OS << "Version-Directive: ";
615  break;
617  OS << "Tag-Directive: ";
618  break;
620  OS << "Document-Start: ";
621  break;
623  OS << "Document-End: ";
624  break;
626  OS << "Block-Entry: ";
627  break;
628  case Token::TK_BlockEnd:
629  OS << "Block-End: ";
630  break;
632  OS << "Block-Sequence-Start: ";
633  break;
635  OS << "Block-Mapping-Start: ";
636  break;
637  case Token::TK_FlowEntry:
638  OS << "Flow-Entry: ";
639  break;
641  OS << "Flow-Sequence-Start: ";
642  break;
644  OS << "Flow-Sequence-End: ";
645  break;
647  OS << "Flow-Mapping-Start: ";
648  break;
650  OS << "Flow-Mapping-End: ";
651  break;
652  case Token::TK_Key:
653  OS << "Key: ";
654  break;
655  case Token::TK_Value:
656  OS << "Value: ";
657  break;
658  case Token::TK_Scalar:
659  OS << "Scalar: ";
660  break;
662  OS << "Block Scalar: ";
663  break;
664  case Token::TK_Alias:
665  OS << "Alias: ";
666  break;
667  case Token::TK_Anchor:
668  OS << "Anchor: ";
669  break;
670  case Token::TK_Tag:
671  OS << "Tag: ";
672  break;
673  case Token::TK_Error:
674  break;
675  }
676  OS << T.Range << "\n";
677  if (T.Kind == Token::TK_StreamEnd)
678  break;
679  else if (T.Kind == Token::TK_Error)
680  return false;
681  }
682  return true;
683 }
684 
686  llvm::SourceMgr SM;
687  llvm::yaml::Scanner scanner(Input, SM);
688  for (;;) {
689  llvm::yaml::Token T = scanner.getNext();
690  if (T.Kind == Token::TK_StreamEnd)
691  break;
692  else if (T.Kind == Token::TK_Error)
693  return false;
694  }
695  return true;
696 }
697 
699  std::string EscapedInput;
700  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
701  if (*i == '\\')
702  EscapedInput += "\\\\";
703  else if (*i == '"')
704  EscapedInput += "\\\"";
705  else if (*i == 0)
706  EscapedInput += "\\0";
707  else if (*i == 0x07)
708  EscapedInput += "\\a";
709  else if (*i == 0x08)
710  EscapedInput += "\\b";
711  else if (*i == 0x09)
712  EscapedInput += "\\t";
713  else if (*i == 0x0A)
714  EscapedInput += "\\n";
715  else if (*i == 0x0B)
716  EscapedInput += "\\v";
717  else if (*i == 0x0C)
718  EscapedInput += "\\f";
719  else if (*i == 0x0D)
720  EscapedInput += "\\r";
721  else if (*i == 0x1B)
722  EscapedInput += "\\e";
723  else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
724  std::string HexStr = utohexstr(*i);
725  EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
726  } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
727  UTF8Decoded UnicodeScalarValue
728  = decodeUTF8(StringRef(i, Input.end() - i));
729  if (UnicodeScalarValue.second == 0) {
730  // Found invalid char.
731  SmallString<4> Val;
732  encodeUTF8(0xFFFD, Val);
733  EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
734  // FIXME: Error reporting.
735  return EscapedInput;
736  }
737  if (UnicodeScalarValue.first == 0x85)
738  EscapedInput += "\\N";
739  else if (UnicodeScalarValue.first == 0xA0)
740  EscapedInput += "\\_";
741  else if (UnicodeScalarValue.first == 0x2028)
742  EscapedInput += "\\L";
743  else if (UnicodeScalarValue.first == 0x2029)
744  EscapedInput += "\\P";
745  else {
746  std::string HexStr = utohexstr(UnicodeScalarValue.first);
747  if (HexStr.size() <= 2)
748  EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
749  else if (HexStr.size() <= 4)
750  EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
751  else if (HexStr.size() <= 8)
752  EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
753  }
754  i += UnicodeScalarValue.second - 1;
755  } else
756  EscapedInput.push_back(*i);
757  }
758  return EscapedInput;
759 }
760 
762  : SM(sm), ShowColors(ShowColors) {
763  init(MemoryBufferRef(Input, "YAML"));
764 }
765 
/// Construct a scanner over \a Buffer.
///
/// Stores \a SM_ (used by printError for diagnostics) and \a ShowColors, then
/// hands \a Buffer to init() to set up the scanning state.
766 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors)
767  : SM(SM_), ShowColors(ShowColors) {
768  init(Buffer);
769 }
770 
771 void Scanner::init(MemoryBufferRef Buffer) {
772  InputBuffer = Buffer;
773  Current = InputBuffer.getBufferStart();
774  End = InputBuffer.getBufferEnd();
775  Indent = -1;
776  Column = 0;
777  Line = 0;
778  FlowLevel = 0;
779  IsStartOfStream = true;
780  IsSimpleKeyAllowed = true;
781  Failed = false;
782  std::unique_ptr<MemoryBuffer> InputBufferOwner =
784  SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
785 }
786 
788  // If the current token is a possible simple key, keep parsing until we
789  // can confirm.
790  bool NeedMore = false;
791  while (true) {
792  if (TokenQueue.empty() || NeedMore) {
793  if (!fetchMoreTokens()) {
794  TokenQueue.clear();
795  TokenQueue.push_back(Token());
796  return TokenQueue.front();
797  }
798  }
799  assert(!TokenQueue.empty() &&
800  "fetchMoreTokens lied about getting tokens!");
801 
802  removeStaleSimpleKeyCandidates();
803  SimpleKey SK;
804  SK.Tok = TokenQueue.front();
805  if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
806  == SimpleKeys.end())
807  break;
808  else
809  NeedMore = true;
810  }
811  return TokenQueue.front();
812 }
813 
815  Token Ret = peekNext();
816  // TokenQueue can be empty if there was an error getting the next token.
817  if (!TokenQueue.empty())
818  TokenQueue.pop_front();
819 
820  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
821  // quick deallocation of them all.
822  if (TokenQueue.empty()) {
823  TokenQueue.Alloc.Reset();
824  }
825 
826  return Ret;
827 }
828 
829 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
830  if (Position == End)
831  return Position;
832  // Check 7 bit c-printable - b-char.
833  if ( *Position == 0x09
834  || (*Position >= 0x20 && *Position <= 0x7E))
835  return Position + 1;
836 
837  // Check for valid UTF-8.
838  if (uint8_t(*Position) & 0x80) {
839  UTF8Decoded u8d = decodeUTF8(Position);
840  if ( u8d.second != 0
841  && u8d.first != 0xFEFF
842  && ( u8d.first == 0x85
843  || ( u8d.first >= 0xA0
844  && u8d.first <= 0xD7FF)
845  || ( u8d.first >= 0xE000
846  && u8d.first <= 0xFFFD)
847  || ( u8d.first >= 0x10000
848  && u8d.first <= 0x10FFFF)))
849  return Position + u8d.second;
850  }
851  return Position;
852 }
853 
854 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
855  if (Position == End)
856  return Position;
857  if (*Position == 0x0D) {
858  if (Position + 1 != End && *(Position + 1) == 0x0A)
859  return Position + 2;
860  return Position + 1;
861  }
862 
863  if (*Position == 0x0A)
864  return Position + 1;
865  return Position;
866 }
867 
868 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
869  if (Position == End)
870  return Position;
871  if (*Position == ' ')
872  return Position + 1;
873  return Position;
874 }
875 
876 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
877  if (Position == End)
878  return Position;
879  if (*Position == ' ' || *Position == '\t')
880  return Position + 1;
881  return Position;
882 }
883 
884 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
885  if (Position == End)
886  return Position;
887  if (*Position == ' ' || *Position == '\t')
888  return Position;
889  return skip_nb_char(Position);
890 }
891 
892 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
893  , StringRef::iterator Position) {
894  while (true) {
895  StringRef::iterator i = (this->*Func)(Position);
896  if (i == Position)
897  break;
898  Position = i;
899  }
900  return Position;
901 }
902 
903 void Scanner::advanceWhile(SkipWhileFunc Func) {
904  auto Final = skip_while(Func, Current);
905  Column += Final - Current;
906  Current = Final;
907 }
908 
/// Return true if \a C is an ns-hex-digit[36]: a decimal digit or one of the
/// letters A-F / a-f.
///
/// Letters beyond 'f'/'F' are not hexadecimal digits and are rejected.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'f')
      || (C >= 'A' && C <= 'F');
}
914 
/// Return true if \a C is an ns-word-char[38]: a decimal digit, an ASCII
/// letter or '-'.
///
/// Digits are part of the production; without them a URI such as
/// "tag:yaml.org,2002:str" would stop being scanned at the first digit.
static bool is_ns_word_char(const char C) {
  return C == '-'
      || (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'z')
      || (C >= 'A' && C <= 'Z');
}
920 
921 StringRef Scanner::scan_ns_uri_char() {
922  StringRef::iterator Start = Current;
923  while (true) {
924  if (Current == End)
925  break;
926  if (( *Current == '%'
927  && Current + 2 < End
928  && is_ns_hex_digit(*(Current + 1))
929  && is_ns_hex_digit(*(Current + 2)))
930  || is_ns_word_char(*Current)
931  || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
932  != StringRef::npos) {
933  ++Current;
934  ++Column;
935  } else
936  break;
937  }
938  return StringRef(Start, Current - Start);
939 }
940 
941 bool Scanner::consume(uint32_t Expected) {
942  if (Expected >= 0x80)
943  report_fatal_error("Not dealing with this yet");
944  if (Current == End)
945  return false;
946  if (uint8_t(*Current) >= 0x80)
947  report_fatal_error("Not dealing with this yet");
948  if (uint8_t(*Current) == Expected) {
949  ++Current;
950  ++Column;
951  return true;
952  }
953  return false;
954 }
955 
956 void Scanner::skip(uint32_t Distance) {
957  Current += Distance;
958  Column += Distance;
959  assert(Current <= End && "Skipped past the end");
960 }
961 
962 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
963  if (Position == End)
964  return false;
965  if ( *Position == ' ' || *Position == '\t'
966  || *Position == '\r' || *Position == '\n')
967  return true;
968  return false;
969 }
970 
971 bool Scanner::consumeLineBreakIfPresent() {
972  auto Next = skip_b_break(Current);
973  if (Next == Current)
974  return false;
975  Column = 0;
976  ++Line;
977  Current = Next;
978  return true;
979 }
980 
981 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
982  , unsigned AtColumn
983  , bool IsRequired) {
984  if (IsSimpleKeyAllowed) {
985  SimpleKey SK;
986  SK.Tok = Tok;
987  SK.Line = Line;
988  SK.Column = AtColumn;
989  SK.IsRequired = IsRequired;
990  SK.FlowLevel = FlowLevel;
991  SimpleKeys.push_back(SK);
992  }
993 }
994 
995 void Scanner::removeStaleSimpleKeyCandidates() {
996  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
997  i != SimpleKeys.end();) {
998  if (i->Line != Line || i->Column + 1024 < Column) {
999  if (i->IsRequired)
1000  setError( "Could not find expected : for simple key"
1001  , i->Tok->Range.begin());
1002  i = SimpleKeys.erase(i);
1003  } else
1004  ++i;
1005  }
1006 }
1007 
1008 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
1009  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
1010  SimpleKeys.pop_back();
1011 }
1012 
1013 bool Scanner::unrollIndent(int ToColumn) {
1014  Token T;
1015  // Indentation is ignored in flow.
1016  if (FlowLevel != 0)
1017  return true;
1018 
1019  while (Indent > ToColumn) {
1021  T.Range = StringRef(Current, 1);
1022  TokenQueue.push_back(T);
1023  Indent = Indents.pop_back_val();
1024  }
1025 
1026  return true;
1027 }
1028 
1029 bool Scanner::rollIndent( int ToColumn
1031  , TokenQueueT::iterator InsertPoint) {
1032  if (FlowLevel)
1033  return true;
1034  if (Indent < ToColumn) {
1035  Indents.push_back(Indent);
1036  Indent = ToColumn;
1037 
1038  Token T;
1039  T.Kind = Kind;
1040  T.Range = StringRef(Current, 0);
1041  TokenQueue.insert(InsertPoint, T);
1042  }
1043  return true;
1044 }
1045 
1046 void Scanner::skipComment() {
1047  if (*Current != '#')
1048  return;
1049  while (true) {
1050  // This may skip more than one byte, thus Column is only incremented
1051  // for code points.
1052  StringRef::iterator I = skip_nb_char(Current);
1053  if (I == Current)
1054  break;
1055  Current = I;
1056  ++Column;
1057  }
1058 }
1059 
1060 void Scanner::scanToNextToken() {
1061  while (true) {
1062  while (*Current == ' ' || *Current == '\t') {
1063  skip(1);
1064  }
1065 
1066  skipComment();
1067 
1068  // Skip EOL.
1069  StringRef::iterator i = skip_b_break(Current);
1070  if (i == Current)
1071  break;
1072  Current = i;
1073  ++Line;
1074  Column = 0;
1075  // New lines may start a simple key.
1076  if (!FlowLevel)
1077  IsSimpleKeyAllowed = true;
1078  }
1079 }
1080 
1081 bool Scanner::scanStreamStart() {
1082  IsStartOfStream = false;
1083 
1084  EncodingInfo EI = getUnicodeEncoding(currentInput());
1085 
1086  Token T;
1088  T.Range = StringRef(Current, EI.second);
1089  TokenQueue.push_back(T);
1090  Current += EI.second;
1091  return true;
1092 }
1093 
1094 bool Scanner::scanStreamEnd() {
1095  // Force an ending new line if one isn't present.
1096  if (Column != 0) {
1097  Column = 0;
1098  ++Line;
1099  }
1100 
1101  unrollIndent(-1);
1102  SimpleKeys.clear();
1103  IsSimpleKeyAllowed = false;
1104 
1105  Token T;
1107  T.Range = StringRef(Current, 0);
1108  TokenQueue.push_back(T);
1109  return true;
1110 }
1111 
1112 bool Scanner::scanDirective() {
1113  // Reset the indentation level.
1114  unrollIndent(-1);
1115  SimpleKeys.clear();
1116  IsSimpleKeyAllowed = false;
1117 
1118  StringRef::iterator Start = Current;
1119  consume('%');
1120  StringRef::iterator NameStart = Current;
1121  Current = skip_while(&Scanner::skip_ns_char, Current);
1122  StringRef Name(NameStart, Current - NameStart);
1123  Current = skip_while(&Scanner::skip_s_white, Current);
1124 
1125  Token T;
1126  if (Name == "YAML") {
1127  Current = skip_while(&Scanner::skip_ns_char, Current);
1129  T.Range = StringRef(Start, Current - Start);
1130  TokenQueue.push_back(T);
1131  return true;
1132  } else if(Name == "TAG") {
1133  Current = skip_while(&Scanner::skip_ns_char, Current);
1134  Current = skip_while(&Scanner::skip_s_white, Current);
1135  Current = skip_while(&Scanner::skip_ns_char, Current);
1137  T.Range = StringRef(Start, Current - Start);
1138  TokenQueue.push_back(T);
1139  return true;
1140  }
1141  return false;
1142 }
1143 
1144 bool Scanner::scanDocumentIndicator(bool IsStart) {
1145  unrollIndent(-1);
1146  SimpleKeys.clear();
1147  IsSimpleKeyAllowed = false;
1148 
1149  Token T;
1151  T.Range = StringRef(Current, 3);
1152  skip(3);
1153  TokenQueue.push_back(T);
1154  return true;
1155 }
1156 
1157 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1158  Token T;
1159  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1161  T.Range = StringRef(Current, 1);
1162  skip(1);
1163  TokenQueue.push_back(T);
1164 
1165  // [ and { may begin a simple key.
1166  saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
1167 
1168  // And may also be followed by a simple key.
1169  IsSimpleKeyAllowed = true;
1170  ++FlowLevel;
1171  return true;
1172 }
1173 
1174 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1175  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1176  IsSimpleKeyAllowed = false;
1177  Token T;
1178  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1180  T.Range = StringRef(Current, 1);
1181  skip(1);
1182  TokenQueue.push_back(T);
1183  if (FlowLevel)
1184  --FlowLevel;
1185  return true;
1186 }
1187 
1188 bool Scanner::scanFlowEntry() {
1189  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1190  IsSimpleKeyAllowed = true;
1191  Token T;
1193  T.Range = StringRef(Current, 1);
1194  skip(1);
1195  TokenQueue.push_back(T);
1196  return true;
1197 }
1198 
1199 bool Scanner::scanBlockEntry() {
1200  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1201  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1202  IsSimpleKeyAllowed = true;
1203  Token T;
1205  T.Range = StringRef(Current, 1);
1206  skip(1);
1207  TokenQueue.push_back(T);
1208  return true;
1209 }
1210 
1211 bool Scanner::scanKey() {
1212  if (!FlowLevel)
1213  rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1214 
1215  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1216  IsSimpleKeyAllowed = !FlowLevel;
1217 
1218  Token T;
1219  T.Kind = Token::TK_Key;
1220  T.Range = StringRef(Current, 1);
1221  skip(1);
1222  TokenQueue.push_back(T);
1223  return true;
1224 }
1225 
1226 bool Scanner::scanValue() {
1227  // If the previous token could have been a simple key, insert the key token
1228  // into the token queue.
1229  if (!SimpleKeys.empty()) {
1230  SimpleKey SK = SimpleKeys.pop_back_val();
1231  Token T;
1232  T.Kind = Token::TK_Key;
1233  T.Range = SK.Tok->Range;
1234  TokenQueueT::iterator i, e;
1235  for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1236  if (i == SK.Tok)
1237  break;
1238  }
1239  assert(i != e && "SimpleKey not in token queue!");
1240  i = TokenQueue.insert(i, T);
1241 
1242  // We may also need to add a Block-Mapping-Start token.
1243  rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1244 
1245  IsSimpleKeyAllowed = false;
1246  } else {
1247  if (!FlowLevel)
1248  rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1249  IsSimpleKeyAllowed = !FlowLevel;
1250  }
1251 
1252  Token T;
1253  T.Kind = Token::TK_Value;
1254  T.Range = StringRef(Current, 1);
1255  skip(1);
1256  TokenQueue.push_back(T);
1257  return true;
1258 }
1259 
1260 // Forbidding inlining improves performance by roughly 20%.
1261 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1262 LLVM_ATTRIBUTE_NOINLINE static bool
1264 
1265 // Returns whether a character at 'Position' was escaped with a leading '\'.
1266 // 'First' specifies the position of the first character in the string.
1268  StringRef::iterator Position) {
1269  assert(Position - 1 >= First);
1270  StringRef::iterator I = Position - 1;
1271  // We calculate the number of consecutive '\'s before the current position
1272  // by iterating backwards through our string.
1273  while (I >= First && *I == '\\') --I;
1274  // (Position - 1 - I) now contains the number of '\'s before the current
1275  // position. If it is odd, the character at 'Position' was escaped.
1276  return (Position - 1 - I) % 2 == 1;
1277 }
1278 
1279 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1280  StringRef::iterator Start = Current;
1281  unsigned ColStart = Column;
1282  if (IsDoubleQuoted) {
1283  do {
1284  ++Current;
1285  while (Current != End && *Current != '"')
1286  ++Current;
1287  // Repeat until the previous character was not a '\' or was an escaped
1288  // backslash.
1289  } while ( Current != End
1290  && *(Current - 1) == '\\'
1291  && wasEscaped(Start + 1, Current));
1292  } else {
1293  skip(1);
1294  while (true) {
1295  // Skip a ' followed by another '.
1296  if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1297  skip(2);
1298  continue;
1299  } else if (*Current == '\'')
1300  break;
1301  StringRef::iterator i = skip_nb_char(Current);
1302  if (i == Current) {
1303  i = skip_b_break(Current);
1304  if (i == Current)
1305  break;
1306  Current = i;
1307  Column = 0;
1308  ++Line;
1309  } else {
1310  if (i == End)
1311  break;
1312  Current = i;
1313  ++Column;
1314  }
1315  }
1316  }
1317 
1318  if (Current == End) {
1319  setError("Expected quote at end of scalar", Current);
1320  return false;
1321  }
1322 
1323  skip(1); // Skip ending quote.
1324  Token T;
1325  T.Kind = Token::TK_Scalar;
1326  T.Range = StringRef(Start, Current - Start);
1327  TokenQueue.push_back(T);
1328 
1329  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1330 
1331  IsSimpleKeyAllowed = false;
1332 
1333  return true;
1334 }
1335 
1336 bool Scanner::scanPlainScalar() {
1337  StringRef::iterator Start = Current;
1338  unsigned ColStart = Column;
1339  unsigned LeadingBlanks = 0;
1340  assert(Indent >= -1 && "Indent must be >= -1 !");
1341  unsigned indent = static_cast<unsigned>(Indent + 1);
1342  while (true) {
1343  if (*Current == '#')
1344  break;
1345 
1346  while (!isBlankOrBreak(Current)) {
1347  if ( FlowLevel && *Current == ':'
1348  && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
1349  setError("Found unexpected ':' while scanning a plain scalar", Current);
1350  return false;
1351  }
1352 
1353  // Check for the end of the plain scalar.
1354  if ( (*Current == ':' && isBlankOrBreak(Current + 1))
1355  || ( FlowLevel
1356  && (StringRef(Current, 1).find_first_of(",:?[]{}")
1357  != StringRef::npos)))
1358  break;
1359 
1360  StringRef::iterator i = skip_nb_char(Current);
1361  if (i == Current)
1362  break;
1363  Current = i;
1364  ++Column;
1365  }
1366 
1367  // Are we at the end?
1368  if (!isBlankOrBreak(Current))
1369  break;
1370 
1371  // Eat blanks.
1372  StringRef::iterator Tmp = Current;
1373  while (isBlankOrBreak(Tmp)) {
1374  StringRef::iterator i = skip_s_white(Tmp);
1375  if (i != Tmp) {
1376  if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1377  setError("Found invalid tab character in indentation", Tmp);
1378  return false;
1379  }
1380  Tmp = i;
1381  ++Column;
1382  } else {
1383  i = skip_b_break(Tmp);
1384  if (!LeadingBlanks)
1385  LeadingBlanks = 1;
1386  Tmp = i;
1387  Column = 0;
1388  ++Line;
1389  }
1390  }
1391 
1392  if (!FlowLevel && Column < indent)
1393  break;
1394 
1395  Current = Tmp;
1396  }
1397  if (Start == Current) {
1398  setError("Got empty plain scalar", Start);
1399  return false;
1400  }
1401  Token T;
1402  T.Kind = Token::TK_Scalar;
1403  T.Range = StringRef(Start, Current - Start);
1404  TokenQueue.push_back(T);
1405 
1406  // Plain scalars can be simple keys.
1407  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1408 
1409  IsSimpleKeyAllowed = false;
1410 
1411  return true;
1412 }
1413 
1414 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1415  StringRef::iterator Start = Current;
1416  unsigned ColStart = Column;
1417  skip(1);
1418  while(true) {
1419  if ( *Current == '[' || *Current == ']'
1420  || *Current == '{' || *Current == '}'
1421  || *Current == ','
1422  || *Current == ':')
1423  break;
1424  StringRef::iterator i = skip_ns_char(Current);
1425  if (i == Current)
1426  break;
1427  Current = i;
1428  ++Column;
1429  }
1430 
1431  if (Start == Current) {
1432  setError("Got empty alias or anchor", Start);
1433  return false;
1434  }
1435 
1436  Token T;
1437  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1438  T.Range = StringRef(Start, Current - Start);
1439  TokenQueue.push_back(T);
1440 
1441  // Alias and anchors can be simple keys.
1442  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1443 
1444  IsSimpleKeyAllowed = false;
1445 
1446  return true;
1447 }
1448 
1449 char Scanner::scanBlockChompingIndicator() {
1450  char Indicator = ' ';
1451  if (Current != End && (*Current == '+' || *Current == '-')) {
1452  Indicator = *Current;
1453  skip(1);
1454  }
1455  return Indicator;
1456 }
1457 
1458 /// Get the number of line breaks after chomping.
1459 ///
1460 /// Return the number of trailing line breaks to emit, depending on
1461 /// \p ChompingIndicator.
1462 static unsigned getChompedLineBreaks(char ChompingIndicator,
1463  unsigned LineBreaks, StringRef Str) {
1464  if (ChompingIndicator == '-') // Strip all line breaks.
1465  return 0;
1466  if (ChompingIndicator == '+') // Keep all line breaks.
1467  return LineBreaks;
1468  // Clip trailing lines.
1469  return Str.empty() ? 0 : 1;
1470 }
1471 
1472 unsigned Scanner::scanBlockIndentationIndicator() {
1473  unsigned Indent = 0;
1474  if (Current != End && (*Current >= '1' && *Current <= '9')) {
1475  Indent = unsigned(*Current - '0');
1476  skip(1);
1477  }
1478  return Indent;
1479 }
1480 
1481 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1482  unsigned &IndentIndicator, bool &IsDone) {
1483  auto Start = Current;
1484 
1485  ChompingIndicator = scanBlockChompingIndicator();
1486  IndentIndicator = scanBlockIndentationIndicator();
1487  // Check for the chomping indicator once again.
1488  if (ChompingIndicator == ' ')
1489  ChompingIndicator = scanBlockChompingIndicator();
1490  Current = skip_while(&Scanner::skip_s_white, Current);
1491  skipComment();
1492 
1493  if (Current == End) { // EOF, we have an empty scalar.
1494  Token T;
1496  T.Range = StringRef(Start, Current - Start);
1497  TokenQueue.push_back(T);
1498  IsDone = true;
1499  return true;
1500  }
1501 
1502  if (!consumeLineBreakIfPresent()) {
1503  setError("Expected a line break after block scalar header", Current);
1504  return false;
1505  }
1506  return true;
1507 }
1508 
1509 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1510  unsigned BlockExitIndent,
1511  unsigned &LineBreaks, bool &IsDone) {
1512  unsigned MaxAllSpaceLineCharacters = 0;
1513  StringRef::iterator LongestAllSpaceLine;
1514 
1515  while (true) {
1516  advanceWhile(&Scanner::skip_s_space);
1517  if (skip_nb_char(Current) != Current) {
1518  // This line isn't empty, so try and find the indentation.
1519  if (Column <= BlockExitIndent) { // End of the block literal.
1520  IsDone = true;
1521  return true;
1522  }
1523  // We found the block's indentation.
1524  BlockIndent = Column;
1525  if (MaxAllSpaceLineCharacters > BlockIndent) {
1526  setError(
1527  "Leading all-spaces line must be smaller than the block indent",
1528  LongestAllSpaceLine);
1529  return false;
1530  }
1531  return true;
1532  }
1533  if (skip_b_break(Current) != Current &&
1534  Column > MaxAllSpaceLineCharacters) {
1535  // Record the longest all-space line in case it's longer than the
1536  // discovered block indent.
1537  MaxAllSpaceLineCharacters = Column;
1538  LongestAllSpaceLine = Current;
1539  }
1540 
1541  // Check for EOF.
1542  if (Current == End) {
1543  IsDone = true;
1544  return true;
1545  }
1546 
1547  if (!consumeLineBreakIfPresent()) {
1548  IsDone = true;
1549  return true;
1550  }
1551  ++LineBreaks;
1552  }
1553  return true;
1554 }
1555 
1556 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1557  unsigned BlockExitIndent, bool &IsDone) {
1558  // Skip the indentation.
1559  while (Column < BlockIndent) {
1560  auto I = skip_s_space(Current);
1561  if (I == Current)
1562  break;
1563  Current = I;
1564  ++Column;
1565  }
1566 
1567  if (skip_nb_char(Current) == Current)
1568  return true;
1569 
1570  if (Column <= BlockExitIndent) { // End of the block literal.
1571  IsDone = true;
1572  return true;
1573  }
1574 
1575  if (Column < BlockIndent) {
1576  if (Current != End && *Current == '#') { // Trailing comment.
1577  IsDone = true;
1578  return true;
1579  }
1580  setError("A text line is less indented than the block scalar", Current);
1581  return false;
1582  }
1583  return true; // A normal text line.
1584 }
1585 
1586 bool Scanner::scanBlockScalar(bool IsLiteral) {
1587  // Eat '|' or '>'
1588  assert(*Current == '|' || *Current == '>');
1589  skip(1);
1590 
1591  char ChompingIndicator;
1592  unsigned BlockIndent;
1593  bool IsDone = false;
1594  if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
1595  return false;
1596  if (IsDone)
1597  return true;
1598 
1599  auto Start = Current;
1600  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1601  unsigned LineBreaks = 0;
1602  if (BlockIndent == 0) {
1603  if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1604  IsDone))
1605  return false;
1606  }
1607 
1608  // Scan the block's scalars body.
1609  SmallString<256> Str;
1610  while (!IsDone) {
1611  if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1612  return false;
1613  if (IsDone)
1614  break;
1615 
1616  // Parse the current line.
1617  auto LineStart = Current;
1618  advanceWhile(&Scanner::skip_nb_char);
1619  if (LineStart != Current) {
1620  Str.append(LineBreaks, '\n');
1621  Str.append(StringRef(LineStart, Current - LineStart));
1622  LineBreaks = 0;
1623  }
1624 
1625  // Check for EOF.
1626  if (Current == End)
1627  break;
1628 
1629  if (!consumeLineBreakIfPresent())
1630  break;
1631  ++LineBreaks;
1632  }
1633 
1634  if (Current == End && !LineBreaks)
1635  // Ensure that there is at least one line break before the end of file.
1636  LineBreaks = 1;
1637  Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1638 
1639  // New lines may start a simple key.
1640  if (!FlowLevel)
1641  IsSimpleKeyAllowed = true;
1642 
1643  Token T;
1645  T.Range = StringRef(Start, Current - Start);
1646  T.Value = Str.str().str();
1647  TokenQueue.push_back(T);
1648  return true;
1649 }
1650 
1651 bool Scanner::scanTag() {
1652  StringRef::iterator Start = Current;
1653  unsigned ColStart = Column;
1654  skip(1); // Eat !.
1655  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1656  else if (*Current == '<') {
1657  skip(1);
1658  scan_ns_uri_char();
1659  if (!consume('>'))
1660  return false;
1661  } else {
1662  // FIXME: Actually parse the c-ns-shorthand-tag rule.
1663  Current = skip_while(&Scanner::skip_ns_char, Current);
1664  }
1665 
1666  Token T;
1667  T.Kind = Token::TK_Tag;
1668  T.Range = StringRef(Start, Current - Start);
1669  TokenQueue.push_back(T);
1670 
1671  // Tags can be simple keys.
1672  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
1673 
1674  IsSimpleKeyAllowed = false;
1675 
1676  return true;
1677 }
1678 
1679 bool Scanner::fetchMoreTokens() {
1680  if (IsStartOfStream)
1681  return scanStreamStart();
1682 
1683  scanToNextToken();
1684 
1685  if (Current == End)
1686  return scanStreamEnd();
1687 
1688  removeStaleSimpleKeyCandidates();
1689 
1690  unrollIndent(Column);
1691 
1692  if (Column == 0 && *Current == '%')
1693  return scanDirective();
1694 
1695  if (Column == 0 && Current + 4 <= End
1696  && *Current == '-'
1697  && *(Current + 1) == '-'
1698  && *(Current + 2) == '-'
1699  && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1700  return scanDocumentIndicator(true);
1701 
1702  if (Column == 0 && Current + 4 <= End
1703  && *Current == '.'
1704  && *(Current + 1) == '.'
1705  && *(Current + 2) == '.'
1706  && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1707  return scanDocumentIndicator(false);
1708 
1709  if (*Current == '[')
1710  return scanFlowCollectionStart(true);
1711 
1712  if (*Current == '{')
1713  return scanFlowCollectionStart(false);
1714 
1715  if (*Current == ']')
1716  return scanFlowCollectionEnd(true);
1717 
1718  if (*Current == '}')
1719  return scanFlowCollectionEnd(false);
1720 
1721  if (*Current == ',')
1722  return scanFlowEntry();
1723 
1724  if (*Current == '-' && isBlankOrBreak(Current + 1))
1725  return scanBlockEntry();
1726 
1727  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1728  return scanKey();
1729 
1730  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1731  return scanValue();
1732 
1733  if (*Current == '*')
1734  return scanAliasOrAnchor(true);
1735 
1736  if (*Current == '&')
1737  return scanAliasOrAnchor(false);
1738 
1739  if (*Current == '!')
1740  return scanTag();
1741 
1742  if (*Current == '|' && !FlowLevel)
1743  return scanBlockScalar(true);
1744 
1745  if (*Current == '>' && !FlowLevel)
1746  return scanBlockScalar(false);
1747 
1748  if (*Current == '\'')
1749  return scanFlowScalar(false);
1750 
1751  if (*Current == '"')
1752  return scanFlowScalar(true);
1753 
1754  // Get a plain scalar.
1755  StringRef FirstChar(Current, 1);
1756  if (!(isBlankOrBreak(Current)
1757  || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1758  || (*Current == '-' && !isBlankOrBreak(Current + 1))
1759  || (!FlowLevel && (*Current == '?' || *Current == ':')
1760  && isBlankOrBreak(Current + 1))
1761  || (!FlowLevel && *Current == ':'
1762  && Current + 2 < End
1763  && *(Current + 1) == ':'
1764  && !isBlankOrBreak(Current + 2)))
1765  return scanPlainScalar();
1766 
1767  setError("Unrecognized character while tokenizing.");
1768  return false;
1769 }
1770 
1771 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors)
1772  : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {}
1773 
1774 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors)
1775  : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {}
1776 
1778 
1779 bool Stream::failed() { return scanner->failed(); }
1780 
1781 void Stream::printError(Node *N, const Twine &Msg) {
1782  scanner->printError( N->getSourceRange().Start
1784  , Msg
1785  , N->getSourceRange());
1786 }
1787 
1789  if (CurrentDoc)
1790  report_fatal_error("Can only iterate over the stream once");
1791 
1792  // Skip Stream-Start.
1793  scanner->getNext();
1794 
1795  CurrentDoc.reset(new Document(*this));
1796  return document_iterator(CurrentDoc);
1797 }
1798 
1800  return document_iterator();
1801 }
1802 
1804  for (document_iterator i = begin(), e = end(); i != e; ++i)
1805  i->skip();
1806 }
1807 
1808 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1809  StringRef T)
1810  : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1811  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1812  SourceRange = SMRange(Start, Start);
1813 }
1814 
1815 std::string Node::getVerbatimTag() const {
1816  StringRef Raw = getRawTag();
1817  if (!Raw.empty() && Raw != "!") {
1818  std::string Ret;
1819  if (Raw.find_last_of('!') == 0) {
1820  Ret = Doc->getTagMap().find("!")->second;
1821  Ret += Raw.substr(1);
1822  return Ret;
1823  } else if (Raw.startswith("!!")) {
1824  Ret = Doc->getTagMap().find("!!")->second;
1825  Ret += Raw.substr(2);
1826  return Ret;
1827  } else {
1828  StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1829  std::map<StringRef, StringRef>::const_iterator It =
1830  Doc->getTagMap().find(TagHandle);
1831  if (It != Doc->getTagMap().end())
1832  Ret = It->second;
1833  else {
1834  Token T;
1835  T.Kind = Token::TK_Tag;
1836  T.Range = TagHandle;
1837  setError(Twine("Unknown tag handle ") + TagHandle, T);
1838  }
1839  Ret += Raw.substr(Raw.find_last_of('!') + 1);
1840  return Ret;
1841  }
1842  }
1843 
1844  switch (getType()) {
1845  case NK_Null:
1846  return "tag:yaml.org,2002:null";
1847  case NK_Scalar:
1848  case NK_BlockScalar:
1849  // TODO: Tag resolution.
1850  return "tag:yaml.org,2002:str";
1851  case NK_Mapping:
1852  return "tag:yaml.org,2002:map";
1853  case NK_Sequence:
1854  return "tag:yaml.org,2002:seq";
1855  }
1856 
1857  return "";
1858 }
1859 
1860 Token &Node::peekNext() {
1861  return Doc->peekNext();
1862 }
1863 
1864 Token Node::getNext() {
1865  return Doc->getNext();
1866 }
1867 
1868 Node *Node::parseBlockNode() {
1869  return Doc->parseBlockNode();
1870 }
1871 
1872 BumpPtrAllocator &Node::getAllocator() {
1873  return Doc->NodeAllocator;
1874 }
1875 
1876 void Node::setError(const Twine &Msg, Token &Tok) const {
1877  Doc->setError(Msg, Tok);
1878 }
1879 
1880 bool Node::failed() const {
1881  return Doc->failed();
1882 }
1883 
1884 
1885 
1886 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1887  // TODO: Handle newlines properly. We need to remove leading whitespace.
1888  if (Value[0] == '"') { // Double quoted.
1889  // Pull off the leading and trailing "s.
1890  StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1891  // Search for characters that would require unescaping the value.
1892  StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1893  if (i != StringRef::npos)
1894  return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1895  return UnquotedValue;
1896  } else if (Value[0] == '\'') { // Single quoted.
1897  // Pull off the leading and trailing 's.
1898  StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1899  StringRef::size_type i = UnquotedValue.find('\'');
1900  if (i != StringRef::npos) {
1901  // We're going to need Storage.
1902  Storage.clear();
1903  Storage.reserve(UnquotedValue.size());
1904  for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1905  StringRef Valid(UnquotedValue.begin(), i);
1906  Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1907  Storage.push_back('\'');
1908  UnquotedValue = UnquotedValue.substr(i + 2);
1909  }
1910  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
1911  return StringRef(Storage.begin(), Storage.size());
1912  }
1913  return UnquotedValue;
1914  }
1915  // Plain or block.
1916  return Value.rtrim(" ");
1917 }
1918 
1919 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1921  , SmallVectorImpl<char> &Storage)
1922  const {
1923  // Use Storage to build proper value.
1924  Storage.clear();
1925  Storage.reserve(UnquotedValue.size());
1926  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1927  // Insert all previous chars into Storage.
1928  StringRef Valid(UnquotedValue.begin(), i);
1929  Storage.insert(Storage.end(), Valid.begin(), Valid.end());
1930  // Chop off inserted chars.
1931  UnquotedValue = UnquotedValue.substr(i);
1932 
1933  assert(!UnquotedValue.empty() && "Can't be empty!");
1934 
1935  // Parse escape or line break.
1936  switch (UnquotedValue[0]) {
1937  case '\r':
1938  case '\n':
1939  Storage.push_back('\n');
1940  if ( UnquotedValue.size() > 1
1941  && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1942  UnquotedValue = UnquotedValue.substr(1);
1943  UnquotedValue = UnquotedValue.substr(1);
1944  break;
1945  default:
1946  if (UnquotedValue.size() == 1)
1947  // TODO: Report error.
1948  break;
1949  UnquotedValue = UnquotedValue.substr(1);
1950  switch (UnquotedValue[0]) {
1951  default: {
1952  Token T;
1953  T.Range = StringRef(UnquotedValue.begin(), 1);
1954  setError("Unrecognized escape code!", T);
1955  return "";
1956  }
1957  case '\r':
1958  case '\n':
1959  // Remove the new line.
1960  if ( UnquotedValue.size() > 1
1961  && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
1962  UnquotedValue = UnquotedValue.substr(1);
1963  // If this was just a single byte newline, it will get skipped
1964  // below.
1965  break;
1966  case '0':
1967  Storage.push_back(0x00);
1968  break;
1969  case 'a':
1970  Storage.push_back(0x07);
1971  break;
1972  case 'b':
1973  Storage.push_back(0x08);
1974  break;
1975  case 't':
1976  case 0x09:
1977  Storage.push_back(0x09);
1978  break;
1979  case 'n':
1980  Storage.push_back(0x0A);
1981  break;
1982  case 'v':
1983  Storage.push_back(0x0B);
1984  break;
1985  case 'f':
1986  Storage.push_back(0x0C);
1987  break;
1988  case 'r':
1989  Storage.push_back(0x0D);
1990  break;
1991  case 'e':
1992  Storage.push_back(0x1B);
1993  break;
1994  case ' ':
1995  Storage.push_back(0x20);
1996  break;
1997  case '"':
1998  Storage.push_back(0x22);
1999  break;
2000  case '/':
2001  Storage.push_back(0x2F);
2002  break;
2003  case '\\':
2004  Storage.push_back(0x5C);
2005  break;
2006  case 'N':
2007  encodeUTF8(0x85, Storage);
2008  break;
2009  case '_':
2010  encodeUTF8(0xA0, Storage);
2011  break;
2012  case 'L':
2013  encodeUTF8(0x2028, Storage);
2014  break;
2015  case 'P':
2016  encodeUTF8(0x2029, Storage);
2017  break;
2018  case 'x': {
2019  if (UnquotedValue.size() < 3)
2020  // TODO: Report error.
2021  break;
2022  unsigned int UnicodeScalarValue;
2023  if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
2024  // TODO: Report error.
2025  UnicodeScalarValue = 0xFFFD;
2026  encodeUTF8(UnicodeScalarValue, Storage);
2027  UnquotedValue = UnquotedValue.substr(2);
2028  break;
2029  }
2030  case 'u': {
2031  if (UnquotedValue.size() < 5)
2032  // TODO: Report error.
2033  break;
2034  unsigned int UnicodeScalarValue;
2035  if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
2036  // TODO: Report error.
2037  UnicodeScalarValue = 0xFFFD;
2038  encodeUTF8(UnicodeScalarValue, Storage);
2039  UnquotedValue = UnquotedValue.substr(4);
2040  break;
2041  }
2042  case 'U': {
2043  if (UnquotedValue.size() < 9)
2044  // TODO: Report error.
2045  break;
2046  unsigned int UnicodeScalarValue;
2047  if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
2048  // TODO: Report error.
2049  UnicodeScalarValue = 0xFFFD;
2050  encodeUTF8(UnicodeScalarValue, Storage);
2051  UnquotedValue = UnquotedValue.substr(8);
2052  break;
2053  }
2054  }
2055  UnquotedValue = UnquotedValue.substr(1);
2056  }
2057  }
2058  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
2059  return StringRef(Storage.begin(), Storage.size());
2060 }
2061 
2062 Node *KeyValueNode::getKey() {
2063  if (Key)
2064  return Key;
2065  // Handle implicit null keys.
2066  {
2067  Token &t = peekNext();
2068  if ( t.Kind == Token::TK_BlockEnd
2069  || t.Kind == Token::TK_Value
2070  || t.Kind == Token::TK_Error) {
2071  return Key = new (getAllocator()) NullNode(Doc);
2072  }
2073  if (t.Kind == Token::TK_Key)
2074  getNext(); // skip TK_Key.
2075  }
2076 
2077  // Handle explicit null keys.
2078  Token &t = peekNext();
2079  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
2080  return Key = new (getAllocator()) NullNode(Doc);
2081  }
2082 
2083  // We've got a normal key.
2084  return Key = parseBlockNode();
2085 }
2086 
2087 Node *KeyValueNode::getValue() {
2088  if (Value)
2089  return Value;
2090  getKey()->skip();
2091  if (failed())
2092  return Value = new (getAllocator()) NullNode(Doc);
2093 
2094  // Handle implicit null values.
2095  {
2096  Token &t = peekNext();
2097  if ( t.Kind == Token::TK_BlockEnd
2098  || t.Kind == Token::TK_FlowMappingEnd
2099  || t.Kind == Token::TK_Key
2100  || t.Kind == Token::TK_FlowEntry
2101  || t.Kind == Token::TK_Error) {
2102  return Value = new (getAllocator()) NullNode(Doc);
2103  }
2104 
2105  if (t.Kind != Token::TK_Value) {
2106  setError("Unexpected token in Key Value.", t);
2107  return Value = new (getAllocator()) NullNode(Doc);
2108  }
2109  getNext(); // skip TK_Value.
2110  }
2111 
2112  // Handle explicit null values.
2113  Token &t = peekNext();
2114  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
2115  return Value = new (getAllocator()) NullNode(Doc);
2116  }
2117 
2118  // We got a normal value.
2119  return Value = parseBlockNode();
2120 }
2121 
2122 void MappingNode::increment() {
2123  if (failed()) {
2124  IsAtEnd = true;
2125  CurrentEntry = nullptr;
2126  return;
2127  }
2128  if (CurrentEntry) {
2129  CurrentEntry->skip();
2130  if (Type == MT_Inline) {
2131  IsAtEnd = true;
2132  CurrentEntry = nullptr;
2133  return;
2134  }
2135  }
2136  Token T = peekNext();
2137  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
2138  // KeyValueNode eats the TK_Key. That way it can detect null keys.
2139  CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
2140  } else if (Type == MT_Block) {
2141  switch (T.Kind) {
2142  case Token::TK_BlockEnd:
2143  getNext();
2144  IsAtEnd = true;
2145  CurrentEntry = nullptr;
2146  break;
2147  default:
2148  setError("Unexpected token. Expected Key or Block End", T);
2149  case Token::TK_Error:
2150  IsAtEnd = true;
2151  CurrentEntry = nullptr;
2152  }
2153  } else {
2154  switch (T.Kind) {
2155  case Token::TK_FlowEntry:
2156  // Eat the flow entry and recurse.
2157  getNext();
2158  return increment();
2159  case Token::TK_FlowMappingEnd:
2160  getNext();
2161  case Token::TK_Error:
2162  // Set this to end iterator.
2163  IsAtEnd = true;
2164  CurrentEntry = nullptr;
2165  break;
2166  default:
2167  setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2168  "Mapping End."
2169  , T);
2170  IsAtEnd = true;
2171  CurrentEntry = nullptr;
2172  }
2173  }
2174 }
2175 
2176 void SequenceNode::increment() {
2177  if (failed()) {
2178  IsAtEnd = true;
2179  CurrentEntry = nullptr;
2180  return;
2181  }
2182  if (CurrentEntry)
2183  CurrentEntry->skip();
2184  Token T = peekNext();
2185  if (SeqType == ST_Block) {
2186  switch (T.Kind) {
2187  case Token::TK_BlockEntry:
2188  getNext();
2189  CurrentEntry = parseBlockNode();
2190  if (!CurrentEntry) { // An error occurred.
2191  IsAtEnd = true;
2192  CurrentEntry = nullptr;
2193  }
2194  break;
2195  case Token::TK_BlockEnd:
2196  getNext();
2197  IsAtEnd = true;
2198  CurrentEntry = nullptr;
2199  break;
2200  default:
2201  setError( "Unexpected token. Expected Block Entry or Block End."
2202  , T);
2203  case Token::TK_Error:
2204  IsAtEnd = true;
2205  CurrentEntry = nullptr;
2206  }
2207  } else if (SeqType == ST_Indentless) {
2208  switch (T.Kind) {
2209  case Token::TK_BlockEntry:
2210  getNext();
2211  CurrentEntry = parseBlockNode();
2212  if (!CurrentEntry) { // An error occurred.
2213  IsAtEnd = true;
2214  CurrentEntry = nullptr;
2215  }
2216  break;
2217  default:
2218  case Token::TK_Error:
2219  IsAtEnd = true;
2220  CurrentEntry = nullptr;
2221  }
2222  } else if (SeqType == ST_Flow) {
2223  switch (T.Kind) {
2224  case Token::TK_FlowEntry:
2225  // Eat the flow entry and recurse.
2226  getNext();
2227  WasPreviousTokenFlowEntry = true;
2228  return increment();
2229  case Token::TK_FlowSequenceEnd:
2230  getNext();
2231  case Token::TK_Error:
2232  // Set this to end iterator.
2233  IsAtEnd = true;
2234  CurrentEntry = nullptr;
2235  break;
2236  case Token::TK_StreamEnd:
2237  case Token::TK_DocumentEnd:
2238  case Token::TK_DocumentStart:
2239  setError("Could not find closing ]!", T);
2240  // Set this to end iterator.
2241  IsAtEnd = true;
2242  CurrentEntry = nullptr;
2243  break;
2244  default:
2245  if (!WasPreviousTokenFlowEntry) {
2246  setError("Expected , between entries!", T);
2247  IsAtEnd = true;
2248  CurrentEntry = nullptr;
2249  break;
2250  }
2251  // Otherwise it must be a flow entry.
2252  CurrentEntry = parseBlockNode();
2253  if (!CurrentEntry) {
2254  IsAtEnd = true;
2255  }
2256  WasPreviousTokenFlowEntry = false;
2257  break;
2258  }
2259  }
2260 }
2261 
2262 Document::Document(Stream &S) : stream(S), Root(nullptr) {
2263  // Tag maps starts with two default mappings.
2264  TagMap["!"] = "!";
2265  TagMap["!!"] = "tag:yaml.org,2002:";
2266 
2267  if (parseDirectives())
2268  expectToken(Token::TK_DocumentStart);
2269  Token &T = peekNext();
2270  if (T.Kind == Token::TK_DocumentStart)
2271  getNext();
2272 }
2273 
2275  if (stream.scanner->failed())
2276  return false;
2277  if (!Root)
2278  getRoot();
2279  Root->skip();
2280  Token &T = peekNext();
2281  if (T.Kind == Token::TK_StreamEnd)
2282  return false;
2283  if (T.Kind == Token::TK_DocumentEnd) {
2284  getNext();
2285  return skip();
2286  }
2287  return true;
2288 }
2289 
2290 Token &Document::peekNext() {
2291  return stream.scanner->peekNext();
2292 }
2293 
2294 Token Document::getNext() {
2295  return stream.scanner->getNext();
2296 }
2297 
2298 void Document::setError(const Twine &Message, Token &Location) const {
2299  stream.scanner->setError(Message, Location.Range.begin());
2300 }
2301 
2302 bool Document::failed() const {
2303  return stream.scanner->failed();
2304 }
2305 
// Body of the node parser that the index at the end of this listing names
// `Node * parseBlockNode()` ("Root for parsing a node. Returns a single
// node.").
// NOTE(review): the signature line (listing line 2306) is absent from this
// extract, as are several interior lines flagged below.
//
// Reads optional node properties (an anchor and/or a tag), then allocates
// the node that matches the next token from NodeAllocator. Returns nullptr
// when a property is repeated or when the next token is TK_Error.
2307  Token T = peekNext();
2308  // Handle properties.
2309  Token AnchorInfo;
2310  Token TagInfo;
// The switch below is re-entered through `goto parse_property` after each
// consumed property, so an anchor and a tag may appear in either order but
// each at most once.
2311 parse_property:
2312  switch (T.Kind) {
2313  case Token::TK_Alias:
// An alias is a complete node on its own: consume it and return at once.
// substr(1) drops the first character of the token text -- presumably the
// leading sigil; confirm against the scanner.
2314  getNext();
2315  return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2316  case Token::TK_Anchor:
2317  if (AnchorInfo.Kind == Token::TK_Anchor) {
2318  setError("Already encountered an anchor for this node!", T);
2319  return nullptr;
2320  }
2321  AnchorInfo = getNext(); // Consume TK_Anchor.
2322  T = peekNext();
2323  goto parse_property;
2324  case Token::TK_Tag:
2325  if (TagInfo.Kind == Token::TK_Tag) {
2326  setError("Already encountered a tag for this node!", T);
2327  return nullptr;
2328  }
2329  TagInfo = getNext(); // Consume TK_Tag.
2330  T = peekNext();
2331  goto parse_property;
2332  default:
2333  break;
2334  }
2335 
// Second phase: T is now the first non-property token; dispatch on it.
// Every visible branch passes the anchor text minus its first character
// and the raw tag text to the node constructor.
2336  switch (T.Kind) {
2337  case Token::TK_BlockEntry:
2338  // We got an unindented BlockEntry sequence. This is not terminated with
2339  // a BlockEnd.
2340  // Don't eat the TK_BlockEntry, SequenceNode needs it.
2341  return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2342  , AnchorInfo.Range.substr(1)
2343  , TagInfo.Range
// NOTE(review): listing lines 2344-2345 are missing here (the last
// constructor argument and the next case label are not visible).
2346  getNext();
2347  return new (NodeAllocator)
2348  SequenceNode( stream.CurrentDoc
2349  , AnchorInfo.Range.substr(1)
2350  , TagInfo.Range
// NOTE(review): listing lines 2351-2352 are missing here.
2353  getNext();
2354  return new (NodeAllocator)
2355  MappingNode( stream.CurrentDoc
2356  , AnchorInfo.Range.substr(1)
2357  , TagInfo.Range
// NOTE(review): listing lines 2358-2359 are missing here.
2360  getNext();
2361  return new (NodeAllocator)
2362  SequenceNode( stream.CurrentDoc
2363  , AnchorInfo.Range.substr(1)
2364  , TagInfo.Range
// NOTE(review): listing lines 2365-2366 are missing here.
2367  getNext();
2368  return new (NodeAllocator)
2369  MappingNode( stream.CurrentDoc
2370  , AnchorInfo.Range.substr(1)
2371  , TagInfo.Range
// NOTE(review): listing line 2372 is missing here.
2373  case Token::TK_Scalar:
2374  getNext();
2375  return new (NodeAllocator)
2376  ScalarNode( stream.CurrentDoc
2377  , AnchorInfo.Range.substr(1)
2378  , TagInfo.Range
2379  , T.Range);
2380  case Token::TK_BlockScalar: {
2381  getNext();
// Copy the scalar's value together with its terminating NUL into
// NodeAllocator, then drop the NUL from the view, so StrCopy refers to
// allocator-backed text that is followed by a NUL byte in memory.
2382  StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
2383  StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
2384  return new (NodeAllocator)
2385  BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2386  TagInfo.Range, StrCopy, T.Range);
2387  }
2388  case Token::TK_Key:
2389  // Don't eat the TK_Key, KeyValueNode expects it.
2390  return new (NodeAllocator)
2391  MappingNode( stream.CurrentDoc
2392  , AnchorInfo.Range.substr(1)
2393  , TagInfo.Range
// NOTE(review): listing lines 2394-2395 are missing here.
// Any token not handled above, including a document or stream end, yields
// a NullNode; the token is not consumed.
2396  case Token::TK_DocumentEnd:
2397  case Token::TK_StreamEnd:
2398  default:
2399  // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2400  // !!null null.
2401  return new (NodeAllocator) NullNode(stream.CurrentDoc);
2402  case Token::TK_Error:
// An error token produces no node at all.
2403  return nullptr;
2404  }
// Every visible branch of the switch above returns, so this point is not
// expected to be reached.
2405  llvm_unreachable("Control flow shouldn't reach here.");
2406  return nullptr;
2407 }
2408 
2409 bool Document::parseDirectives() {
2410  bool isDirective = false;
2411  while (true) {
2412  Token T = peekNext();
2413  if (T.Kind == Token::TK_TagDirective) {
2414  parseTAGDirective();
2415  isDirective = true;
2416  } else if (T.Kind == Token::TK_VersionDirective) {
2417  parseYAMLDirective();
2418  isDirective = true;
2419  } else
2420  break;
2421  }
2422  return isDirective;
2423 }
2424 
2425 void Document::parseYAMLDirective() {
2426  getNext(); // Eat %YAML <version>
2427 }
2428 
2429 void Document::parseTAGDirective() {
2430  Token Tag = getNext(); // %TAG <handle> <prefix>
2431  StringRef T = Tag.Range;
2432  // Strip %TAG
2433  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2434  std::size_t HandleEnd = T.find_first_of(" \t");
2435  StringRef TagHandle = T.substr(0, HandleEnd);
2436  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2437  TagMap[TagHandle] = TagPrefix;
2438 }
2439 
2440 bool Document::expectToken(int TK) {
2441  Token T = getNext();
2442  if (T.Kind != TK) {
2443  setError("Unexpected token", T);
2444  return false;
2445  }
2446  return true;
2447 }
const NoneType None
Definition: None.h:23
std::enable_if< std::numeric_limits< T >::is_signed, bool >::type getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:347
static void destroySentinel(Token *)
Definition: YAMLParser.cpp:156
Represents a range in source code.
Definition: SMLoc.h:47
void push_back(const T &Elt)
Definition: SmallVector.h:222
std::unique_ptr< Document > & Doc
Definition: YAMLParser.h:157
std::string getVerbatimTag() const
Get the verbatim tag for a given Node.
bool operator==(const BinaryRef &LHS, const BinaryRef &RHS)
Definition: YAML.h:79
iplist< Token >::iterator iterator
Definition: ilist.h:588
size_t size() const
size - Get the string size.
Definition: StringRef.h:113
Node(unsigned int Type, std::unique_ptr< Document > &, StringRef Anchor, StringRef Tag)
Not a valid Unicode encoding.
Definition: YAMLParser.cpp:35
bool scanTokens(StringRef Input)
Scans all tokens in input without outputting anything.
Definition: YAMLParser.cpp:685
size_t find(char C, size_t From=0) const
Search for the first character C in the string.
Definition: StringRef.h:240
static std::unique_ptr< MemoryBuffer > getMemBuffer(StringRef InputData, StringRef BufferName="", bool RequiresNullTerminator=true)
Open the specified memory range as a MemoryBuffer.
std::string Value
The value of a block scalar node.
Definition: YAMLParser.cpp:143
StringRef getRawTag() const
Get the tag as it was written in the document.
Definition: YAMLParser.h:126
Represents a YAML sequence created from either a block sequence or a flow sequence.
Definition: YAMLParser.h:421
static LLVM_ATTRIBUTE_NOINLINE bool wasEscaped(StringRef::iterator First, StringRef::iterator Position)
StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:405
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:188
UTF-8 or ascii.
Definition: YAMLParser.cpp:34
const char * getBufferStart() const
Definition: MemoryBuffer.h:161
Node * parseBlockNode()
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:822
std::pair< uint32_t, unsigned > UTF8Decoded
The Unicode scalar value of a UTF-8 minimal well-formed code unit subsequence and the subsequence's l...
Definition: YAMLParser.cpp:211
iterator begin()
Definition: ilist.h:359
document_iterator begin()
Represents an alias to a Node with an anchor.
Definition: YAMLParser.h:474
void skip(CollectionType &C)
Definition: YAMLParser.h:358
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:419
void reserve(size_type N)
Definition: SmallVector.h:401
UTF-32 Little Endian.
Definition: YAMLParser.cpp:30
document_iterator end()
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const char *reason, bool gen_crash_diag=true)
Reports a serious error, calling any installed error handler.
void setError(const Twine &Message, Token &Location) const
SMLoc Start
Definition: SMLoc.h:49
static bool is_ns_hex_digit(const char C)
Definition: YAMLParser.cpp:909
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:79
T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val()
Definition: SmallVector.h:406
bool failed()
Returns true if an error occurred while parsing.
Definition: YAMLParser.cpp:298
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:98
StringRef copy(Allocator &A) const
Definition: StringRef.h:128
TypeID
Definitions of all of the base types for the Type system.
Definition: Type.h:54
Token * ensureHead(Token *) const
Definition: YAMLParser.cpp:159
static EncodingInfo getUnicodeEncoding(StringRef Input)
getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode encoding form of Input...
Definition: YAMLParser.cpp:48
ELFYAML::ELF_STO Other
Definition: ELFYAML.cpp:591
void clear()
Definition: ilist.h:550
The Input class is used to parse a yaml document into in-memory structs and vectors.
Definition: YAMLTraits.h:970
static void noteHead(Token *, Token *)
Definition: YAMLParser.cpp:160
virtual void skip()
Definition: YAMLParser.h:143
A key and value pair.
Definition: YAMLParser.h:263
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: SmallVector.h:57
Node * getRoot()
Parse and return the root level node.
Definition: YAMLParser.h:504
#define T
unsigned AddNewSourceBuffer(std::unique_ptr< MemoryBuffer > F, SMLoc IncludeLoc)
Add a new source buffer to this source manager.
Definition: SourceMgr.h:123
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: ArrayRef.h:31
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:25
iterator begin() const
Definition: StringRef.h:90
void append(in_iter S, in_iter E)
Append from an iterator pair.
Definition: SmallString.h:74
Scanner(StringRef Input, SourceMgr &SM, bool ShowColors=true)
Definition: YAMLParser.cpp:761
bool dumpTokens(StringRef Input, raw_ostream &)
Dump all the tokens in this stream to OS.
Definition: YAMLParser.cpp:601
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:325
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
void printError(Node *N, const Twine &Msg)
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:135
ilist_sentinel_traits - A fragment for template traits for intrusive list that provides default senti...
Definition: ilist.h:76
const char * getBufferEnd() const
Definition: MemoryBuffer.h:162
iterator insert(iterator where, const NodeTy &val)
Definition: ilist.h:610
Token * createNode(const Token &V)
Definition: YAMLParser.cpp:168
std::string escape(StringRef Input)
Escape Input for a double quoted scalar.
Definition: YAMLParser.cpp:698
static unsigned getChompedLineBreaks(char ChompingIndicator, unsigned LineBreaks, StringRef Str)
Get the number of line breaks after chomping.
StringRef Range
A string of length 0 or more whose begin() points to the logical location of the token in the input...
Definition: YAMLParser.cpp:140
void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, ArrayRef< SMRange > Ranges=None)
Definition: YAMLParser.cpp:277
Token getNext()
Parse the next token and pop it from the queue.
Definition: YAMLParser.cpp:814
This owns the files read by a parser, handles include stacks, and handles diagnostic wrangling...
Definition: SourceMgr.h:35
ilist_node_traits - A fragment for template traits for intrusive list that provides default node rela...
Definition: ilist.h:111
iterator erase(iterator I)
Definition: SmallVector.h:455
A scalar node is an opaque datum that can be presented as a series of zero or more Unicode scalar val...
Definition: YAMLParser.h:190
UTF-16 Little Endian.
Definition: YAMLParser.cpp:32
A null value.
Definition: YAMLParser.h:175
void setError(const Twine &Message, StringRef::iterator Position)
Definition: YAMLParser.cpp:282
const char * iterator
Definition: StringRef.h:42
size_t find_last_of(char C, size_t From=npos) const
Find the last character in the string that is C, or npos if not found.
Definition: StringRef.h:301
void transferNodesFromList(ilist_node_traits &, ilist_iterator< Token >, ilist_iterator< Token >)
Definition: YAMLParser.cpp:175
bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:215
bool skip()
Finish parsing the current document and return true if there are more.
enum llvm::yaml::Token::TokenKind Kind
Token & peekNext()
Parse the next token and return it without popping it.
Definition: YAMLParser.cpp:787
Token & peekNext()
This class represents a YAML stream potentially containing multiple documents.
Definition: YAMLParser.h:76
#define LLVM_ATTRIBUTE_NOINLINE
LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so, mark a method "not for inl...
Definition: Compiler.h:184
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: ilist.h:385
Stream(StringRef Input, SourceMgr &, bool ShowColors=true)
This keeps a reference to the string referenced by Input.
ilist< Token > TokenQueueT
Definition: YAMLParser.cpp:183
static void encodeUTF8(uint32_t UnicodeScalarValue, SmallVectorImpl< char > &Result)
encodeUTF8 - Encode UnicodeScalarValue in UTF-8 and append it to result.
Definition: YAMLParser.cpp:573
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:267
void setError(const Twine &Message)
Definition: YAMLParser.cpp:293
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:481
static UTF8Decoded decodeUTF8(StringRef Range)
Definition: YAMLParser.cpp:213
reference front()
Definition: ilist.h:390
static SMLoc getFromPointer(const char *Ptr)
Definition: SMLoc.h:35
SI Fix SGPR Live Ranges
SMRange getSourceRange() const
Definition: YAMLParser.h:132
static const size_t npos
Definition: StringRef.h:44
ilist_node - Base class that provides next/prev services for nodes that use ilist_nextprev_traits or ...
Definition: ilist_node.h:43
size_t size_type
Definition: StringRef.h:45
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
Token - A single YAML token.
Definition: YAMLParser.cpp:111
A block scalar node is an opaque datum that can be presented as a series of zero or more Unicode scal...
Definition: YAMLParser.h:233
Represents a YAML map created from either a block map or a flow map.
Definition: YAMLParser.h:374
static void deleteNode(Token *V)
Definition: YAMLParser.cpp:171
size_t find_first_of(char C, size_t From=0) const
Find the first character in the string that is C, or npos if not found.
Definition: StringRef.h:279
reference back()
Definition: ilist.h:398
UTF-16 Big Endian.
Definition: YAMLParser.cpp:33
static std::string utohexstr(uint64_t X, bool LowerCase=false)
Definition: StringExtras.h:72
Scans YAML tokens from a MemoryBuffer.
Definition: YAMLParser.cpp:266
std::pair< UnicodeEncodingForm, unsigned > EncodingInfo
EncodingInfo - Holds the encoding type and length of the byte order mark if it exists.
Definition: YAMLParser.cpp:40
Iterator abstraction for Documents over a Stream.
Definition: YAMLParser.h:549
const ARM::ArchExtKind Kind
iterator end()
Definition: ilist.h:367
SMRange SourceRange
Definition: YAMLParser.h:158
LLVM Value Representation.
Definition: Value.h:69
void pop_front()
Definition: ilist.h:555
friend class Document
Definition: YAMLParser.h:99
iterator end() const
Definition: StringRef.h:92
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:38
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:40
void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind, const Twine &Msg, ArrayRef< SMRange > Ranges=None, ArrayRef< SMFixIt > FixIts=None, bool ShowColors=true) const
Emit a message about the specified location with the specified string.
Definition: SourceMgr.cpp:215
Represents a location in source code.
Definition: SMLoc.h:23
An inline mapping node is used for "[key: value]".
Definition: YAMLParser.h:381
UTF-32 Big Endian.
Definition: YAMLParser.cpp:31
Node * parseBlockNode()
Root for parsing a node. Returns a single node.
StringRef ltrim(StringRef Chars=" \t\n\v\f\r") const
Return string with consecutive characters in Chars starting from the left removed.
Definition: StringRef.h:511
void push_back(const NodeTy &val)
Definition: ilist.h:617
UnicodeEncodingForm
Definition: YAMLParser.cpp:29
bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:110
Abstract base class for all Nodes.
Definition: YAMLParser.h:103
static bool is_ns_word_char(const char C)
Definition: YAMLParser.cpp:915