Line data Source code
1 : //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This is a YAML 1.2 parser.
11 : //
12 : // See http://www.yaml.org/spec/1.2/spec.html for the full standard.
13 : //
14 : // This currently does not implement the following:
15 : // * Multi-line literal folding.
16 : // * Tag resolution.
17 : // * UTF-16.
18 : // * BOMs anywhere other than the first Unicode scalar value in the file.
19 : //
20 : // The most important class here is Stream. This represents a YAML stream with
21 : // 0, 1, or many documents.
22 : //
23 : // SourceMgr sm;
24 : // StringRef input = getInput();
25 : // yaml::Stream stream(input, sm);
26 : //
27 : // for (yaml::document_iterator di = stream.begin(), de = stream.end();
28 : // di != de; ++di) {
29 : // yaml::Node *n = di->getRoot();
30 : // if (n) {
31 : // // Do something with n...
32 : // } else
33 : // break;
34 : // }
35 : //
36 : //===----------------------------------------------------------------------===//
37 :
38 : #ifndef LLVM_SUPPORT_YAMLPARSER_H
39 : #define LLVM_SUPPORT_YAMLPARSER_H
40 :
41 : #include "llvm/ADT/StringRef.h"
42 : #include "llvm/Support/Allocator.h"
43 : #include "llvm/Support/SMLoc.h"
44 : #include <cassert>
45 : #include <cstddef>
46 : #include <iterator>
47 : #include <map>
48 : #include <memory>
49 : #include <string>
50 : #include <system_error>
51 :
52 : namespace llvm {
53 :
54 : class MemoryBufferRef;
55 : class SourceMgr;
56 : class raw_ostream;
57 : class Twine;
58 :
59 : namespace yaml {
60 :
61 : class Document;
62 : class document_iterator;
63 : class Node;
64 : class Scanner;
65 : struct Token;
66 :
67 : /// Dump all the tokens in this stream to OS.
68 : /// \returns true if there was an error, false otherwise.
69 : bool dumpTokens(StringRef Input, raw_ostream &);
70 :
71 : /// Scans all tokens in input without outputting anything. This is used
72 : /// for benchmarking the tokenizer.
73 : /// \returns true if there was an error, false otherwise.
74 : bool scanTokens(StringRef Input);
75 :
76 : /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
77 : /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
78 : /// false, those UTF8 sequences encoding printable unicode scalars will not be
79 : /// escaped, but emitted verbatim.
80 : std::string escape(StringRef Input, bool EscapePrintable = true);
81 :
82 : /// This class represents a YAML stream potentially containing multiple
83 : /// documents.
84 6291 : class Stream {
85 : public:
86 : /// This keeps a reference to the string referenced by \p Input.
87 : Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
88 : std::error_code *EC = nullptr);
89 :
90 : Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
91 : std::error_code *EC = nullptr);
92 : ~Stream();
93 :
94 : document_iterator begin();
95 : document_iterator end();
96 : void skip();
97 : bool failed();
98 :
99 : bool validate() {
100 53 : skip();
101 53 : return !failed();
102 : }
103 :
104 : void printError(Node *N, const Twine &Msg);
105 :
106 : private:
107 : friend class Document;
108 :
109 : std::unique_ptr<Scanner> scanner;
110 : std::unique_ptr<Document> CurrentDoc;
111 : };
112 :
113 : /// Abstract base class for all Nodes.
114 : class Node {
115 : virtual void anchor();
116 :
117 : public:
118 : enum NodeKind {
119 : NK_Null,
120 : NK_Scalar,
121 : NK_BlockScalar,
122 : NK_KeyValue,
123 : NK_Mapping,
124 : NK_Sequence,
125 : NK_Alias
126 : };
127 :
128 : Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
129 : StringRef Tag);
130 :
131 : // It's not safe to copy YAML nodes; the document is streamed and the position
132 : // is part of the state.
133 : Node(const Node &) = delete;
134 : void operator=(const Node &) = delete;
135 :
136 : void *operator new(size_t Size, BumpPtrAllocator &Alloc,
137 : size_t Alignment = 16) noexcept {
138 998782 : return Alloc.Allocate(Size, Alignment);
139 : }
140 :
141 : void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
142 : size_t Size) noexcept {
143 : Alloc.Deallocate(Ptr, Size);
144 : }
145 :
146 : void operator delete(void *) noexcept = delete;
147 :
148 : /// Get the value of the anchor attached to this node. If it does not
149 : /// have one, getAnchor().size() will be 0.
150 0 : StringRef getAnchor() const { return Anchor; }
151 :
152 : /// Get the tag as it was written in the document. This does not
153 : /// perform tag resolution.
154 0 : StringRef getRawTag() const { return Tag; }
155 :
156 : /// Get the verbatium tag for a given Node. This performs tag resoluton
157 : /// and substitution.
158 : std::string getVerbatimTag() const;
159 :
160 0 : SMRange getSourceRange() const { return SourceRange; }
161 : void setSourceRange(SMRange SR) { SourceRange = SR; }
162 :
163 : // These functions forward to Document and Scanner.
164 : Token &peekNext();
165 : Token getNext();
166 : Node *parseBlockNode();
167 : BumpPtrAllocator &getAllocator();
168 : void setError(const Twine &Message, Token &Location) const;
169 : bool failed() const;
170 :
171 1137898 : virtual void skip() {}
172 :
173 0 : unsigned int getType() const { return TypeID; }
174 :
175 : protected:
176 : std::unique_ptr<Document> &Doc;
177 : SMRange SourceRange;
178 :
179 : ~Node() = default;
180 :
181 : private:
182 : unsigned int TypeID;
183 : StringRef Anchor;
184 : /// The tag as typed in the document.
185 : StringRef Tag;
186 : };
187 :
188 : /// A null value.
189 : ///
190 : /// Example:
191 : /// !!null null
192 : class NullNode final : public Node {
193 : void anchor() override;
194 :
195 : public:
196 3163 : NullNode(std::unique_ptr<Document> &D)
197 3163 : : Node(NK_Null, D, StringRef(), StringRef()) {}
198 :
199 8407 : static bool classof(const Node *N) { return N->getType() == NK_Null; }
200 : };
201 :
202 : /// A scalar node is an opaque datum that can be presented as a
203 : /// series of zero or more Unicode scalar values.
204 : ///
205 : /// Example:
206 : /// Adena
207 : class ScalarNode final : public Node {
208 : void anchor() override;
209 :
210 : public:
211 : ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
212 : StringRef Val)
213 586096 : : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
214 : SMLoc Start = SMLoc::getFromPointer(Val.begin());
215 : SMLoc End = SMLoc::getFromPointer(Val.end());
216 586096 : SourceRange = SMRange(Start, End);
217 : }
218 :
219 : // Return Value without any escaping or folding or other fun YAML stuff. This
220 : // is the exact bytes that are contained in the file (after conversion to
221 : // utf8).
222 0 : StringRef getRawValue() const { return Value; }
223 :
224 : /// Gets the value of this node as a StringRef.
225 : ///
226 : /// \param Storage is used to store the content of the returned StringRef iff
227 : /// it requires any modification from how it appeared in the source.
228 : /// This happens with escaped characters and multi-line literals.
229 : StringRef getValue(SmallVectorImpl<char> &Storage) const;
230 :
231 : static bool classof(const Node *N) {
232 1127268 : return N->getType() == NK_Scalar;
233 : }
234 :
235 : private:
236 : StringRef Value;
237 :
238 : StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
239 : StringRef::size_type Start,
240 : SmallVectorImpl<char> &Storage) const;
241 : };
242 :
243 : /// A block scalar node is an opaque datum that can be presented as a
244 : /// series of zero or more Unicode scalar values.
245 : ///
246 : /// Example:
247 : /// |
248 : /// Hello
249 : /// World
250 : class BlockScalarNode final : public Node {
251 : void anchor() override;
252 :
253 : public:
254 : BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
255 : StringRef Value, StringRef RawVal)
256 5548 : : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
257 : SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
258 : SMLoc End = SMLoc::getFromPointer(RawVal.end());
259 5548 : SourceRange = SMRange(Start, End);
260 : }
261 :
262 : /// Gets the value of this node as a StringRef.
263 0 : StringRef getValue() const { return Value; }
264 :
265 : static bool classof(const Node *N) {
266 1256 : return N->getType() == NK_BlockScalar;
267 : }
268 :
269 : private:
270 : StringRef Value;
271 : };
272 :
273 : /// A key and value pair. While not technically a Node under the YAML
274 : /// representation graph, it is easier to treat them this way.
275 : ///
276 : /// TODO: Consider making this not a child of Node.
277 : ///
278 : /// Example:
279 : /// Section: .text
280 : class KeyValueNode final : public Node {
281 : void anchor() override;
282 :
283 : public:
284 299006 : KeyValueNode(std::unique_ptr<Document> &D)
285 299006 : : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
286 :
287 : /// Parse and return the key.
288 : ///
289 : /// This may be called multiple times.
290 : ///
291 : /// \returns The key, or nullptr if failed() == true.
292 : Node *getKey();
293 :
294 : /// Parse and return the value.
295 : ///
296 : /// This may be called multiple times.
297 : ///
298 : /// \returns The value, or nullptr if failed() == true.
299 : Node *getValue();
300 :
301 415192 : void skip() override {
302 415192 : if (Node *Key = getKey()) {
303 415191 : Key->skip();
304 415191 : if (Node *Val = getValue())
305 415187 : Val->skip();
306 : }
307 415192 : }
308 :
309 : static bool classof(const Node *N) {
310 : return N->getType() == NK_KeyValue;
311 : }
312 :
313 : private:
314 : Node *Key = nullptr;
315 : Node *Value = nullptr;
316 : };
317 :
318 : /// This is an iterator abstraction over YAML collections shared by both
319 : /// sequences and maps.
320 : ///
321 : /// BaseT must have a ValueT* member named CurrentEntry and a member function
322 : /// increment() which must set CurrentEntry to 0 to create an end iterator.
323 : template <class BaseT, class ValueT>
324 : class basic_collection_iterator
325 : : public std::iterator<std::input_iterator_tag, ValueT> {
326 : public:
327 : basic_collection_iterator() = default;
328 : basic_collection_iterator(BaseT *B) : Base(B) {}
329 :
330 0 : ValueT *operator->() const {
331 : assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
332 5718 : return Base->CurrentEntry;
333 : }
334 0 :
335 0 : ValueT &operator*() const {
336 0 : assert(Base && Base->CurrentEntry &&
337 : "Attempted to dereference end iterator!");
338 343337 : return *Base->CurrentEntry;
339 : }
340 0 :
341 0 : operator ValueT *() const {
342 : assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
343 303 : return Base->CurrentEntry;
344 : }
345 0 :
346 : /// Note on EqualityComparable:
347 : ///
348 0 : /// The iterator is not re-entrant,
349 : /// it is meant to be used for parsing YAML on-demand
350 : /// Once iteration started - it can point only to one entry at a time
351 : /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
352 : /// iff Base and Other.Base are equal.
353 0 : bool operator==(const basic_collection_iterator &Other) const {
354 0 : if (Base && (Base == Other.Base)) {
355 : assert((Base->CurrentEntry == Other.Base->CurrentEntry)
356 : && "Equal Bases expected to point to equal Entries");
357 : }
358 :
359 0 : return Base == Other.Base;
360 : }
361 :
362 0 : bool operator!=(const basic_collection_iterator &Other) const {
363 3 : return !(Base == Other.Base);
364 0 : }
365 0 :
366 0 : basic_collection_iterator &operator++() {
367 : assert(Base && "Attempted to advance iterator past end!");
368 86 : Base->increment();
369 0 : // Create an end iterator.
370 131 : if (!Base->CurrentEntry)
371 0 : Base = nullptr;
372 0 : return *this;
373 0 : }
374 679 :
375 0 : private:
376 977 : BaseT *Base = nullptr;
377 0 : };
378 0 :
379 0 : // The following two templates are used for both MappingNode and Sequence Node.
380 0 : template <class CollectionType>
381 : typename CollectionType::iterator begin(CollectionType &C) {
382 4947 : assert(C.IsAtBeginning && "You may only iterate over a collection once!");
383 45 : C.IsAtBeginning = false;
384 391103 : typename CollectionType::iterator ret(&C);
385 0 : ++ret;
386 486255 : return ret;
387 : }
388 0 :
389 298 : template <class CollectionType> void skip(CollectionType &C) {
390 : // TODO: support skipping from the middle of a parsed collection ;/
391 0 : assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
392 0 : if (C.IsAtBeginning)
393 : for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
394 0 : ++i)
395 0 : i->skip();
396 : }
397 1875 :
398 : /// Represents a YAML map created from either a block map for a flow map.
399 101974 : ///
400 0 : /// This parses the YAML stream as increment() is called.
401 : ///
402 0 : /// Example:
403 151055 : /// Name: _main
404 : /// Scope: Global
405 : class MappingNode final : public Node {
406 151055 : void anchor() override;
407 6822 :
408 : public:
409 4947 : enum MappingType {
410 151055 : MT_Block,
411 14403 : MT_Flow,
412 : MT_Inline ///< An inline mapping node is used for "[key: value]".
413 : };
414 14403 :
415 1381 : MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
416 : MappingType MT)
417 709 : : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
418 14403 :
419 136652 : friend class basic_collection_iterator<MappingNode, KeyValueNode>;
420 :
421 : using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
422 136652 :
423 5441 : template <class T> friend typename T::iterator yaml::begin(T &);
424 : template <class T> friend void yaml::skip(T &);
425 4238 :
426 136652 : iterator begin() { return yaml::begin(*this); }
427 :
428 0 : iterator end() { return iterator(); }
429 :
430 : void skip() override { yaml::skip(*this); }
431 :
432 : static bool classof(const Node *N) {
433 28 : return N->getType() == NK_Mapping;
434 0 : }
435 :
436 : private:
437 : MappingType Type;
438 : bool IsAtBeginning = true;
439 0 : bool IsAtEnd = false;
440 : KeyValueNode *CurrentEntry = nullptr;
441 :
442 : void increment();
443 : };
444 0 :
445 : /// Represents a YAML sequence created from either a block sequence for a
446 : /// flow sequence.
447 91037 : ///
448 : /// This parses the YAML stream as increment() is called.
449 461151 : ///
450 : /// Example:
451 : /// - Hello
452 : /// - World
453 : class SequenceNode final : public Node {
454 : void anchor() override;
455 :
456 : public:
457 : enum SequenceType {
458 0 : ST_Block,
459 : ST_Flow,
460 136652 : // Use for:
461 : //
462 : // key:
463 : // - val1
464 : // - val2
465 0 : //
466 : // As a BlockMappingEntry and BlockEnd are not created in this case.
467 : ST_Indentless
468 : };
469 :
470 : SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
471 : SequenceType ST)
472 : : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
473 :
474 : friend class basic_collection_iterator<SequenceNode, Node>;
475 :
476 : using iterator = basic_collection_iterator<SequenceNode, Node>;
477 :
478 : template <class T> friend typename T::iterator yaml::begin(T &);
479 : template <class T> friend void yaml::skip(T &);
480 :
481 : void increment();
482 :
483 : iterator begin() { return yaml::begin(*this); }
484 :
485 0 : iterator end() { return iterator(); }
486 :
487 : void skip() override { yaml::skip(*this); }
488 :
489 : static bool classof(const Node *N) {
490 16 : return N->getType() == NK_Sequence;
491 0 : }
492 :
493 : private:
494 : SequenceType SeqType;
495 : bool IsAtBeginning = true;
496 0 : bool IsAtEnd = false;
497 : bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
498 : Node *CurrentEntry = nullptr;
499 : };
500 :
501 0 : /// Represents an alias to a Node with an anchor.
502 13916 : ///
503 : /// Example:
504 : /// *AnchorName
505 : class AliasNode final : public Node {
506 99590 : void anchor() override;
507 :
508 : public:
509 : AliasNode(std::unique_ptr<Document> &D, StringRef Val)
510 : : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
511 :
512 : StringRef getName() const { return Name; }
513 : Node *getTarget();
514 :
515 0 : static bool classof(const Node *N) { return N->getType() == NK_Alias; }
516 :
517 14403 : private:
518 0 : StringRef Name;
519 : };
520 :
521 0 : /// A YAML Stream is a sequence of Documents. A document contains a root
522 0 : /// node.
523 0 : class Document {
524 : public:
525 : Document(Stream &ParentStream);
526 :
527 : /// Root for parsing a node. Returns a single node.
528 : Node *parseBlockNode();
529 0 :
530 : /// Finish parsing the current document and return true if there are
531 : /// more. Return false otherwise.
532 : bool skip();
533 :
534 : /// Parse and return the root level node.
535 : Node *getRoot() {
536 23 : if (Root)
537 : return Root;
538 21 : return Root = parseBlockNode();
539 16 : }
540 16 :
541 : const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
542 191 :
543 : private:
544 191 : friend class Node;
545 : friend class document_iterator;
546 :
547 : /// Stream to read tokens from.
548 : Stream &stream;
549 :
550 : /// Used to allocate nodes to. All are destroyed without calling their
551 : /// destructor when the document is destroyed.
552 8659 : BumpPtrAllocator NodeAllocator;
553 0 :
554 8654 : /// The root node. Used to support skipping a partially parsed
555 : /// document.
556 : Node *Root;
557 :
558 : /// Maps tag prefixes to their expansion.
559 : std::map<StringRef, StringRef> TagMap;
560 :
561 : Token &peekNext();
562 : Token getNext();
563 : void setError(const Twine &Message, Token &Location) const;
564 : bool failed() const;
565 :
566 : /// Parse %BLAH directives and return true if any were encountered.
567 : bool parseDirectives();
568 580 :
569 : /// Parse %YAML
570 0 : void parseYAMLDirective();
571 :
572 : /// Parse %TAG
573 : void parseTAGDirective();
574 :
575 : /// Consume the next token and error if it is not \a TK.
576 : bool expectToken(int TK);
577 : };
578 :
579 : /// Iterator abstraction for Documents over a Stream.
580 : class document_iterator {
581 : public:
582 : document_iterator() = default;
583 : document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
584 :
585 4 : bool operator==(const document_iterator &Other) const {
586 6 : if (isAtEnd() || Other.isAtEnd())
587 2 : return isAtEnd() && Other.isAtEnd();
588 :
589 0 : return Doc == Other.Doc;
590 : }
591 355 : bool operator!=(const document_iterator &Other) const {
592 550 : return !(*this == Other);
593 164 : }
594 :
595 2 : document_iterator operator++() {
596 : assert(Doc && "incrementing iterator past the end.");
597 4 : if (!(*Doc)->skip()) {
598 4369 : Doc->reset(nullptr);
599 : } else {
600 0 : Stream &S = (*Doc)->stream;
601 17999 : Doc->reset(new Document(S));
602 31423 : }
603 4577 : return *this;
604 164 : }
605 0 :
606 22 : Document &operator*() { return *Doc->get(); }
607 22 :
608 10737 : std::unique_ptr<Document> &operator->() { return *Doc; }
609 186 :
610 : private:
611 7407 : bool isAtEnd() const { return !Doc || !*Doc; }
612 :
613 14806 : std::unique_ptr<Document> *Doc = nullptr;
614 2357 : };
615 107 :
616 5207 : } // end namespace yaml
617 5454 :
618 0 : } // end namespace llvm
619 7403 :
620 : #endif // LLVM_SUPPORT_YAMLPARSER_H
|