LLVM API Documentation

YAMLParser.h
Go to the documentation of this file.
00001 //===--- YAMLParser.h - Simple YAML parser --------------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 //  This is a YAML 1.2 parser.
00011 //
00012 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
00013 //
00014 //  This currently does not implement the following:
00015 //    * Multi-line literal folding.
00016 //    * Tag resolution.
00017 //    * UTF-16.
00018 //    * BOMs anywhere other than the first Unicode scalar value in the file.
00019 //
00020 //  The most important class here is Stream. This represents a YAML stream with
00021 //  0, 1, or many documents.
00022 //
00023 //  SourceMgr sm;
00024 //  StringRef input = getInput();
00025 //  yaml::Stream stream(input, sm);
00026 //
00027 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
00028 //       di != de; ++di) {
00029 //    yaml::Node *n = di->getRoot();
00030 //    if (n) {
00031 //      // Do something with n...
00032 //    } else
00033 //      break;
00034 //  }
00035 //
00036 //===----------------------------------------------------------------------===//
00037 
00038 #ifndef LLVM_SUPPORT_YAMLPARSER_H
00039 #define LLVM_SUPPORT_YAMLPARSER_H
00040 
00041 #include "llvm/ADT/OwningPtr.h"
00042 #include "llvm/ADT/SmallString.h"
00043 #include "llvm/ADT/StringRef.h"
00044 #include "llvm/Support/Allocator.h"
00045 #include "llvm/Support/SMLoc.h"
00046 #include <limits>
00047 #include <utility>
00048 
00049 namespace llvm {
00050 class MemoryBuffer;
00051 class SourceMgr;
00052 class raw_ostream;
00053 class Twine;
00054 
00055 namespace yaml {
00056 
00057 class document_iterator;
00058 class Document;
00059 class Node;
00060 class Scanner;
00061 struct Token;
00062 
/// @brief Dump all the tokens in this stream to OS.
///
/// @param Input the text whose tokens are dumped.
///
/// The second, unnamed raw_ostream parameter is the output stream that the
/// brief refers to as "OS".
///
/// @returns true if there was an error, false otherwise.
bool dumpTokens(StringRef Input, raw_ostream &);
00066 
/// @brief Scans all tokens in input without outputting anything. This is used
///        for benchmarking the tokenizer.
///
/// @param Input the text to scan.
///
/// @returns true if there was an error, false otherwise.
bool scanTokens(StringRef Input);
00071 
/// @brief Escape \a Input for a double quoted scalar.
///
/// @param Input the text to escape.
///
/// @returns the escaped text as a std::string returned by value. The exact
///          set of escape sequences produced is decided by the out-of-line
///          definition and is not visible from this header.
std::string escape(StringRef Input);
00074 
00075 /// @brief This class represents a YAML stream potentially containing multiple
00076 ///        documents.
00077 class Stream {
00078 public:
00079   /// @brief This keeps a reference to the string referenced by \p Input.
00080   Stream(StringRef Input, SourceMgr &);
00081 
00082   /// @brief This takes ownership of \p InputBuffer.
00083   Stream(MemoryBuffer *InputBuffer, SourceMgr &);
00084   ~Stream();
00085 
00086   document_iterator begin();
00087   document_iterator end();
00088   void skip();
00089   bool failed();
00090   bool validate() {
00091     skip();
00092     return !failed();
00093   }
00094 
00095   void printError(Node *N, const Twine &Msg);
00096 
00097 private:
00098   OwningPtr<Scanner> scanner;
00099   OwningPtr<Document> CurrentDoc;
00100 
00101   friend class Document;
00102 
00103   /// @brief Validate a %YAML x.x directive.
00104   void handleYAMLDirective(const Token &);
00105 };
00106 
/// @brief Abstract base class for all Nodes.
///
/// A Node records which kind it is (see NodeKind and getType()), the anchor
/// attached to it (possibly empty), the source range it covers, and a
/// reference to the OwningPtr<Document> it was created for. Storage for nodes
/// comes from a BumpPtrAllocator via the class-specific operator new below.
class Node {
public:
  /// Kind discriminator. The value handed to the constructor is returned by
  /// getType(); each subclass's classof() compares against one of these.
  enum NodeKind {
    NK_Null,
    NK_Scalar,
    NK_KeyValue,
    NK_Mapping,
    NK_Sequence,
    NK_Alias
  };

  /// @param Type   the kind of this node; reported back by getType().
  /// @param Anchor the anchor attached to this node; may be empty.
  ///
  /// The unnamed OwningPtr<Document> reference parameter identifies the
  /// document the node belongs to (see the Doc member).
  Node(unsigned int Type, OwningPtr<Document>&, StringRef Anchor);

  /// @brief Get the value of the anchor attached to this node. If it does not
  ///        have one, getAnchor().size() will be 0.
  StringRef getAnchor() const { return Anchor; }

  /// @brief The range of source text this node covers.
  SMRange getSourceRange() const { return SourceRange; }
  /// @brief Replace the recorded source range with \p SR.
  void setSourceRange(SMRange SR) { SourceRange = SR; }

  // These functions forward to Document and Scanner.
  Token &peekNext();
  Token getNext();
  Node *parseBlockNode();
  BumpPtrAllocator &getAllocator();
  void setError(const Twine &Message, Token &Location) const;
  bool failed() const;

  /// @brief Skip over this node. The base implementation does nothing;
  ///        subclasses that contain other nodes override it.
  virtual void skip() {}

  /// @brief The kind value passed to the constructor.
  unsigned int getType() const { return TypeID; }

  /// @brief Allocate storage for a node from \p Alloc.
  ///
  /// @param Size      number of bytes requested.
  /// @param Alloc     allocator that provides the storage.
  /// @param Alignment alignment forwarded to the allocator; defaults to 16.
  void *operator new ( size_t Size
                     , BumpPtrAllocator &Alloc
                     , size_t Alignment = 16) throw() {
    return Alloc.Allocate(Size, Alignment);
  }

  /// @brief Deallocation counterpart of the operator new above; forwards
  ///        \p Ptr to Alloc.Deallocate().
  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t) throw() {
    Alloc.Deallocate(Ptr);
  }

protected:
  /// The document this node belongs to, held by reference to its owner.
  OwningPtr<Document> &Doc;
  /// The source range reported by getSourceRange().
  SMRange SourceRange;

  // Plain delete is deliberately a no-op and not publicly accessible: node
  // storage comes from a BumpPtrAllocator, not from the global heap.
  void operator delete(void *) throw() {}

  // Protected, so code outside the hierarchy cannot destroy a node through a
  // Node pointer.
  virtual ~Node() {}

private:
  /// Kind of this node; see getType().
  unsigned int TypeID;
  /// Anchor attached to this node; empty when there is none.
  StringRef Anchor;
};
00162 
/// @brief A null value.
///
/// Example:
///   !!null null
class NullNode : public Node {
public:
  /// @brief Create a null node for document \p D. The node has kind NK_Null
  ///        and an empty anchor.
  NullNode(OwningPtr<Document> &D) : Node(NK_Null, D, StringRef()) {}

  /// @brief Kind test: true iff \p N has kind NK_Null.
  ///
  /// \p N is dereferenced unconditionally and therefore must not be null.
  static inline bool classof(const Node *N) {
    return N->getType() == NK_Null;
  }
};
00175 
00176 /// @brief A scalar node is an opaque datum that can be presented as a
00177 ///        series of zero or more Unicode scalar values.
00178 ///
00179 /// Example:
00180 ///   Adena
00181 class ScalarNode : public Node {
00182 public:
00183   ScalarNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Val)
00184     : Node(NK_Scalar, D, Anchor)
00185     , Value(Val) {
00186     SMLoc Start = SMLoc::getFromPointer(Val.begin());
00187     SMLoc End = SMLoc::getFromPointer(Val.end());
00188     SourceRange = SMRange(Start, End);
00189   }
00190 
00191   // Return Value without any escaping or folding or other fun YAML stuff. This
00192   // is the exact bytes that are contained in the file (after conversion to
00193   // utf8).
00194   StringRef getRawValue() const { return Value; }
00195 
00196   /// @brief Gets the value of this node as a StringRef.
00197   ///
00198   /// @param Storage is used to store the content of the returned StringRef iff
00199   ///        it requires any modification from how it appeared in the source.
00200   ///        This happens with escaped characters and multi-line literals.
00201   StringRef getValue(SmallVectorImpl<char> &Storage) const;
00202 
00203   static inline bool classof(const Node *N) {
00204     return N->getType() == NK_Scalar;
00205   }
00206 
00207 private:
00208   StringRef Value;
00209 
00210   StringRef unescapeDoubleQuoted( StringRef UnquotedValue
00211                                 , StringRef::size_type Start
00212                                 , SmallVectorImpl<char> &Storage) const;
00213 };
00214 
00215 /// @brief A key and value pair. While not technically a Node under the YAML
00216 ///        representation graph, it is easier to treat them this way.
00217 ///
00218 /// TODO: Consider making this not a child of Node.
00219 ///
00220 /// Example:
00221 ///   Section: .text
00222 class KeyValueNode : public Node {
00223 public:
00224   KeyValueNode(OwningPtr<Document> &D)
00225     : Node(NK_KeyValue, D, StringRef())
00226     , Key(0)
00227     , Value(0)
00228   {}
00229 
00230   /// @brief Parse and return the key.
00231   ///
00232   /// This may be called multiple times.
00233   ///
00234   /// @returns The key, or nullptr if failed() == true.
00235   Node *getKey();
00236 
00237   /// @brief Parse and return the value.
00238   ///
00239   /// This may be called multiple times.
00240   ///
00241   /// @returns The value, or nullptr if failed() == true.
00242   Node *getValue();
00243 
00244   virtual void skip() LLVM_OVERRIDE {
00245     getKey()->skip();
00246     getValue()->skip();
00247   }
00248 
00249   static inline bool classof(const Node *N) {
00250     return N->getType() == NK_KeyValue;
00251   }
00252 
00253 private:
00254   Node *Key;
00255   Node *Value;
00256 };
00257 
/// @brief This is an iterator abstraction over YAML collections shared by both
///        sequences and maps.
///
/// BaseT must have a ValueT* member named CurrentEntry and a member function
/// increment() which must set CurrentEntry to 0 to create an end iterator.
///
/// The iterator holds only a pointer to the collection, so every iterator
/// over the same collection observes the same current entry. An end iterator
/// is represented by a null collection pointer.
template <class BaseT, class ValueT>
class basic_collection_iterator
  : public std::iterator<std::forward_iterator_tag, ValueT> {
public:
  /// @brief Construct an end iterator.
  basic_collection_iterator() : Base(0) {}
  /// @brief Construct an iterator over the collection \p B.
  basic_collection_iterator(BaseT *B) : Base(B) {}

  /// @brief Access the current entry. Must not be used on an end iterator.
  ValueT *operator ->() const {
    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    return Base->CurrentEntry;
  }

  /// @brief Dereference the current entry. Must not be used on an end
  ///        iterator.
  ValueT &operator *() const {
    assert(Base && Base->CurrentEntry &&
           "Attempted to dereference end iterator!");
    return *Base->CurrentEntry;
  }

  /// @brief Implicit conversion to a pointer to the current entry. Must not
  ///        be used on an end iterator.
  operator ValueT*() const {
    assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    return Base->CurrentEntry;
  }

  /// @brief Equality, defined as the exact negation of operator!=.
  ///
  /// Without this member, `a == b` would be resolved through the implicit
  /// conversion to ValueT* above, which asserts (or reads through a null
  /// collection pointer) as soon as either side is an end iterator.
  bool operator ==(const basic_collection_iterator &Other) const {
    return !(*this != Other);
  }

  /// @brief Inequality. Iterators over different collections (or an end and a
  ///        non-end iterator) always differ; iterators over the same
  ///        collection are compared by their current entry.
  bool operator !=(const basic_collection_iterator &Other) const {
    if(Base != Other.Base)
      return true;
    return (Base && Other.Base) && Base->CurrentEntry
                                   != Other.Base->CurrentEntry;
  }

  /// @brief Advance to the next entry by calling increment() on the
  ///        collection. Must not be used on an end iterator.
  basic_collection_iterator &operator++() {
    assert(Base && "Attempted to advance iterator past end!");
    Base->increment();
    // Create an end iterator.
    if (Base->CurrentEntry == 0)
      Base = 0;
    return *this;
  }

private:
  /// The collection being iterated; null for an end iterator.
  BaseT *Base;
};
00305 
// The two function templates below are shared by MappingNode and SequenceNode.

// Start iterating over the collection C and return an iterator to its first
// entry. A collection may only be iterated once: the IsAtBeginning flag is
// checked and then cleared. The iterator is advanced once before it is
// returned.
template <class CollectionType>
typename CollectionType::iterator begin(CollectionType &C) {
  typedef typename CollectionType::iterator IterT;
  assert(C.IsAtBeginning && "You may only iterate over a collection once!");
  C.IsAtBeginning = false;
  IterT First(&C);
  ++First;
  return First;
}
00315 
// Skip every entry of the collection C. Only valid when iteration over C has
// either not started yet or has already finished; in the latter case there is
// nothing left to do.
template <class CollectionType>
void skip(CollectionType &C) {
  // TODO: support skipping from the middle of a parsed collection ;/
  assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
  if (!C.IsAtBeginning)
    return;

  typedef typename CollectionType::iterator IterT;
  IterT Cur = begin(C);
  IterT Stop = C.end();
  while (Cur != Stop) {
    Cur->skip();
    ++Cur;
  }
}
00325 
/// @brief Represents a YAML map created from either a block map or a flow map.
///
/// This parses the YAML stream as increment() is called.
///
/// Example:
///   Name: _main
///   Scope: Global
class MappingNode : public Node {
public:
  /// The syntactic form the mapping was written in.
  enum MappingType {
    MT_Block,
    MT_Flow,
    MT_Inline ///< An inline mapping node is used for "[key: value]".
  };

  /// @brief Create a mapping node for document \p D.
  ///
  /// @param Anchor the anchor attached to the node; may be empty.
  /// @param MT     the form of the mapping.
  ///
  /// The node starts "at the beginning", not at the end, and without a
  /// current entry.
  MappingNode(OwningPtr<Document> &D, StringRef Anchor, MappingType MT)
    : Node(NK_Mapping, D, Anchor)
    , Type(MT)
    , IsAtBeginning(true)
    , IsAtEnd(false)
    , CurrentEntry(0)
  {}

  // The iterator and the shared yaml::begin()/yaml::skip() helpers read and
  // write the private iteration state below, hence the friend declarations.
  friend class basic_collection_iterator<MappingNode, KeyValueNode>;
  typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
  template <class T> friend typename T::iterator yaml::begin(T &);
  template <class T> friend void yaml::skip(T &);

  /// @brief Start iterating over the key/value pairs.
  ///
  /// May only be called once per node: yaml::begin() asserts that the node is
  /// still at the beginning and then clears that flag.
  iterator begin() {
    return yaml::begin(*this);
  }

  /// @brief The end iterator (a default-constructed iterator).
  iterator end() { return iterator(); }

  /// @brief Skip all entries of this mapping via yaml::skip(), which asserts
  ///        that iteration has either not started or has already finished.
  virtual void skip() LLVM_OVERRIDE {
    yaml::skip(*this);
  }

  /// @brief Kind test: true iff \p N has kind NK_Mapping. \p N must not be
  ///        null.
  static inline bool classof(const Node *N) {
    return N->getType() == NK_Mapping;
  }

private:
  /// The form of this mapping, as passed to the constructor.
  MappingType Type;
  /// True until iteration is started by yaml::begin().
  bool IsAtBeginning;
  /// Initially false; maintained by the out-of-line increment().
  bool IsAtEnd;
  /// The entry the iterators currently refer to; 0 when there is none.
  KeyValueNode *CurrentEntry;

  /// Advance CurrentEntry to the next key/value pair; defined out of line.
  void increment();
};
00376 
/// @brief Represents a YAML sequence created from either a block sequence or a
///        flow sequence.
///
/// This parses the YAML stream as increment() is called.
///
/// Example:
///   - Hello
///   - World
class SequenceNode : public Node {
public:
  /// The syntactic form the sequence was written in.
  enum SequenceType {
    ST_Block,
    ST_Flow,
    // Use for:
    //
    // key:
    // - val1
    // - val2
    //
    // As a BlockMappingEntry and BlockEnd are not created in this case.
    ST_Indentless
  };

  /// @brief Create a sequence node for document \p D.
  ///
  /// @param Anchor the anchor attached to the node; may be empty.
  /// @param ST     the form of the sequence.
  ///
  /// The node starts "at the beginning", not at the end, and without a
  /// current entry.
  SequenceNode(OwningPtr<Document> &D, StringRef Anchor, SequenceType ST)
    : Node(NK_Sequence, D, Anchor)
    , SeqType(ST)
    , IsAtBeginning(true)
    , IsAtEnd(false)
    , WasPreviousTokenFlowEntry(true) // Start with an imaginary ','.
    , CurrentEntry(0)
  {}

  // The iterator and the shared yaml::begin()/yaml::skip() helpers read and
  // write the private iteration state below, hence the friend declarations.
  friend class basic_collection_iterator<SequenceNode, Node>;
  typedef basic_collection_iterator<SequenceNode, Node> iterator;
  template <class T> friend typename T::iterator yaml::begin(T &);
  template <class T> friend void yaml::skip(T &);

  /// Advance CurrentEntry to the next element; defined out of line.
  void increment();

  /// @brief Start iterating over the elements.
  ///
  /// May only be called once per node: yaml::begin() asserts that the node is
  /// still at the beginning and then clears that flag.
  iterator begin() {
    return yaml::begin(*this);
  }

  /// @brief The end iterator (a default-constructed iterator).
  iterator end() { return iterator(); }

  /// @brief Skip all elements of this sequence via yaml::skip(), which
  ///        asserts that iteration has either not started or has already
  ///        finished.
  virtual void skip() LLVM_OVERRIDE {
    yaml::skip(*this);
  }

  /// @brief Kind test: true iff \p N has kind NK_Sequence. \p N must not be
  ///        null.
  static inline bool classof(const Node *N) {
    return N->getType() == NK_Sequence;
  }

private:
  /// The form of this sequence, as passed to the constructor.
  SequenceType SeqType;
  /// True until iteration is started by yaml::begin().
  bool IsAtBeginning;
  /// Initially false; maintained by the out-of-line increment().
  bool IsAtEnd;
  /// Initially true, standing in for an imaginary ',' before the first
  /// element.
  bool WasPreviousTokenFlowEntry;
  /// The element the iterators currently refer to; 0 when there is none.
  Node *CurrentEntry;
};
00437 
/// @brief Represents an alias to a Node with an anchor.
///
/// Example:
///   *AnchorName
class AliasNode : public Node {
public:
  /// @brief Create an alias node for document \p D.
  ///
  /// @param Val the text stored as the alias name and returned by getName().
  ///
  /// The alias node itself has kind NK_Alias and an empty anchor.
  AliasNode(OwningPtr<Document> &D, StringRef Val)
    : Node(NK_Alias, D, StringRef()), Name(Val) {}

  /// @brief The name stored at construction.
  StringRef getName() const { return Name; }

  // NOTE(review): presumably returns the anchored node this alias names; the
  // definition is out of line, so confirm its behaviour for unknown names.
  Node *getTarget();

  /// @brief Kind test: true iff \p N has kind NK_Alias. \p N must not be
  ///        null.
  static inline bool classof(const Node *N) {
    return N->getType() == NK_Alias;
  }

private:
  /// The alias name, as passed to the constructor.
  StringRef Name;
};
00457 
00458 /// @brief A YAML Stream is a sequence of Documents. A document contains a root
00459 ///        node.
00460 class Document {
00461 public:
00462   /// @brief Root for parsing a node. Returns a single node.
00463   Node *parseBlockNode();
00464 
00465   Document(Stream &ParentStream);
00466 
00467   /// @brief Finish parsing the current document and return true if there are
00468   ///        more. Return false otherwise.
00469   bool skip();
00470 
00471   /// @brief Parse and return the root level node.
00472   Node *getRoot() {
00473     if (Root)
00474       return Root;
00475     return Root = parseBlockNode();
00476   }
00477 
00478 private:
00479   friend class Node;
00480   friend class document_iterator;
00481 
00482   /// @brief Stream to read tokens from.
00483   Stream &stream;
00484 
00485   /// @brief Used to allocate nodes to. All are destroyed without calling their
00486   ///        destructor when the document is destroyed.
00487   BumpPtrAllocator NodeAllocator;
00488 
00489   /// @brief The root node. Used to support skipping a partially parsed
00490   ///        document.
00491   Node *Root;
00492 
00493   Token &peekNext();
00494   Token getNext();
00495   void setError(const Twine &Message, Token &Location) const;
00496   bool failed() const;
00497 
00498   void handleTagDirective(const Token &Tag) {
00499     // TODO: Track tags.
00500   }
00501 
00502   /// @brief Parse %BLAH directives and return true if any were encountered.
00503   bool parseDirectives();
00504 
00505   /// @brief Consume the next token and error if it is not \a TK.
00506   bool expectToken(int TK);
00507 };
00508 
00509 /// @brief Iterator abstraction for Documents over a Stream.
00510 class document_iterator {
00511 public:
00512   document_iterator() : Doc(0) {}
00513   document_iterator(OwningPtr<Document> &D) : Doc(&D) {}
00514 
00515   bool operator ==(const document_iterator &Other) {
00516     if (isAtEnd() || Other.isAtEnd())
00517       return isAtEnd() && Other.isAtEnd();
00518 
00519     return Doc == Other.Doc;
00520   }
00521   bool operator !=(const document_iterator &Other) {
00522     return !(*this == Other);
00523   }
00524 
00525   document_iterator operator ++() {
00526     assert(Doc != 0 && "incrementing iterator past the end.");
00527     if (!(*Doc)->skip()) {
00528       Doc->reset(0);
00529     } else {
00530       Stream &S = (*Doc)->stream;
00531       Doc->reset(new Document(S));
00532     }
00533     return *this;
00534   }
00535 
00536   Document &operator *() {
00537     return *Doc->get();
00538   }
00539 
00540   OwningPtr<Document> &operator ->() {
00541     return *Doc;
00542   }
00543 
00544 private:
00545   bool isAtEnd() const {
00546     return !Doc || !*Doc;
00547   }
00548 
00549   OwningPtr<Document> *Doc;
00550 };
00551 
00552 }
00553 }
00554 
00555 #endif