Blame - llvm/lib/Support/YAMLParser.cpp - toolchain/llvm-project

blob: 3be02ee9fb98639b4ddf24e626b408d2fd7220c0 [file] [log] [blame]

Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1	//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements a YAML parser.
				11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#include "llvm/Support/YAMLParser.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	15	#include "llvm/ADT/SmallVector.h"
				16	#include "llvm/ADT/StringExtras.h"
				17	#include "llvm/ADT/Twine.h"
Chandler Carruth	ed0881b	2012-12-03 16:50:05 +0000	[diff] [blame]	18	#include "llvm/ADT/ilist.h"
				19	#include "llvm/ADT/ilist_node.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	20	#include "llvm/Support/ErrorHandling.h"
				21	#include "llvm/Support/MemoryBuffer.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	22	#include "llvm/Support/SourceMgr.h"
Chandler Carruth	ed0881b	2012-12-03 16:50:05 +0000	[diff] [blame]	23	#include "llvm/Support/raw_ostream.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	24
				25	using namespace llvm;
				26	using namespace yaml;
				27
				28	enum UnicodeEncodingForm {
Dmitri Gribenko	dbeafa7	2012-06-09 00:01:45 +0000	[diff] [blame]	29	UEF_UTF32_LE, ///< UTF-32 Little Endian
				30	UEF_UTF32_BE, ///< UTF-32 Big Endian
				31	UEF_UTF16_LE, ///< UTF-16 Little Endian
				32	UEF_UTF16_BE, ///< UTF-16 Big Endian
				33	UEF_UTF8, ///< UTF-8 or ascii.
				34	UEF_Unknown ///< Not a valid Unicode encoding.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	35	};
				36
				37	/// EncodingInfo - Holds the encoding type and length of the byte order mark if
				38	/// it exists. Length is in {0, 2, 3, 4}.
				39	typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
				40
				41	/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
				42	/// encoding form of \a Input.
				43	///
				44	/// @param Input A string of length 0 or more.
				45	/// @returns An EncodingInfo indicating the Unicode encoding form of the input
				46	/// and how long the byte order mark is if one exists.
				47	static EncodingInfo getUnicodeEncoding(StringRef Input) {
				48	if (Input.size() == 0)
				49	return std::make_pair(UEF_Unknown, 0);
				50
				51	switch (uint8_t(Input[0])) {
				52	case 0x00:
				53	if (Input.size() >= 4) {
				54	if ( Input[1] == 0
				55	&& uint8_t(Input[2]) == 0xFE
				56	&& uint8_t(Input[3]) == 0xFF)
				57	return std::make_pair(UEF_UTF32_BE, 4);
				58	if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
				59	return std::make_pair(UEF_UTF32_BE, 0);
				60	}
				61
				62	if (Input.size() >= 2 && Input[1] != 0)
				63	return std::make_pair(UEF_UTF16_BE, 0);
				64	return std::make_pair(UEF_Unknown, 0);
				65	case 0xFF:
				66	if ( Input.size() >= 4
				67	&& uint8_t(Input[1]) == 0xFE
				68	&& Input[2] == 0
				69	&& Input[3] == 0)
				70	return std::make_pair(UEF_UTF32_LE, 4);
				71
				72	if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
				73	return std::make_pair(UEF_UTF16_LE, 2);
				74	return std::make_pair(UEF_Unknown, 0);
				75	case 0xFE:
				76	if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
				77	return std::make_pair(UEF_UTF16_BE, 2);
				78	return std::make_pair(UEF_Unknown, 0);
				79	case 0xEF:
				80	if ( Input.size() >= 3
				81	&& uint8_t(Input[1]) == 0xBB
				82	&& uint8_t(Input[2]) == 0xBF)
				83	return std::make_pair(UEF_UTF8, 3);
				84	return std::make_pair(UEF_Unknown, 0);
				85	}
				86
				87	// It could still be utf-32 or utf-16.
				88	if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
				89	return std::make_pair(UEF_UTF32_LE, 0);
				90
				91	if (Input.size() >= 2 && Input[1] == 0)
				92	return std::make_pair(UEF_UTF16_LE, 0);
				93
				94	return std::make_pair(UEF_UTF8, 0);
				95	}
				96
				97	namespace llvm {
				98	namespace yaml {
Juergen Ributzka	d12ccbd	2013-11-19 00:57:56 +0000	[diff] [blame]	99	/// Pin the vtables to this file.
				100	void Node::anchor() {}
				101	void NullNode::anchor() {}
				102	void ScalarNode::anchor() {}
				103	void KeyValueNode::anchor() {}
				104	void MappingNode::anchor() {}
				105	void SequenceNode::anchor() {}
				106	void AliasNode::anchor() {}
				107
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	108	/// Token - A single YAML token.
				109	struct Token : ilist_node<Token> {
				110	enum TokenKind {
				111	TK_Error, // Uninitialized token.
				112	TK_StreamStart,
				113	TK_StreamEnd,
				114	TK_VersionDirective,
				115	TK_TagDirective,
				116	TK_DocumentStart,
				117	TK_DocumentEnd,
				118	TK_BlockEntry,
				119	TK_BlockEnd,
				120	TK_BlockSequenceStart,
				121	TK_BlockMappingStart,
				122	TK_FlowEntry,
				123	TK_FlowSequenceStart,
				124	TK_FlowSequenceEnd,
				125	TK_FlowMappingStart,
				126	TK_FlowMappingEnd,
				127	TK_Key,
				128	TK_Value,
				129	TK_Scalar,
				130	TK_Alias,
				131	TK_Anchor,
				132	TK_Tag
				133	} Kind;
				134
				135	/// A string of length 0 or more whose begin() points to the logical location
				136	/// of the token in the input.
				137	StringRef Range;
				138
				139	Token() : Kind(TK_Error) {}
				140	};
				141	}
				142	}
				143
Michael J. Spencer	afc0d6a	2012-04-03 23:36:44 +0000	[diff] [blame]	144	namespace llvm {
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	145	template<>
				146	struct ilist_sentinel_traits<Token> {
				147	Token *createSentinel() const {
				148	return &Sentinel;
				149	}
				150	static void destroySentinel(Token*) {}
				151
				152	Token *provideInitialHead() const { return createSentinel(); }
				153	Token ensureHead(Token) const { return createSentinel(); }
				154	static void noteHead(Token, Token) {}
				155
				156	private:
				157	mutable Token Sentinel;
				158	};
				159
				160	template<>
				161	struct ilist_node_traits<Token> {
				162	Token *createNode(const Token &V) {
				163	return new (Alloc.Allocate<Token>()) Token(V);
				164	}
				165	static void deleteNode(Token *V) {}
				166
				167	void addNodeToList(Token *) {}
				168	void removeNodeFromList(Token *) {}
				169	void transferNodesFromList(ilist_node_traits & /SrcTraits/,
				170	ilist_iterator<Token> /first/,
				171	ilist_iterator<Token> /last/) {}
				172
				173	BumpPtrAllocator Alloc;
				174	};
Michael J. Spencer	afc0d6a	2012-04-03 23:36:44 +0000	[diff] [blame]	175	}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	176
				177	typedef ilist<Token> TokenQueueT;
				178
				179	namespace {
				180	/// @brief This struct is used to track simple keys.
				181	///
				182	/// Simple keys are handled by creating an entry in SimpleKeys for each Token
				183	/// which could legally be the start of a simple key. When peekNext is called,
				184	/// if the Token To be returned is referenced by a SimpleKey, we continue
				185	/// tokenizing until that potential simple key has either been found to not be
				186	/// a simple key (we moved on to the next line or went further than 1024 chars).
				187	/// Or when we run into a Value, and then insert a Key token (and possibly
				188	/// others) before the SimpleKey's Tok.
				189	struct SimpleKey {
				190	TokenQueueT::iterator Tok;
				191	unsigned Column;
				192	unsigned Line;
				193	unsigned FlowLevel;
				194	bool IsRequired;
				195
				196	bool operator ==(const SimpleKey &Other) {
				197	return Tok == Other.Tok;
				198	}
				199	};
				200	}
				201
				202	/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
				203	/// subsequence and the subsequence's length in code units (uint8_t).
				204	/// A length of 0 represents an error.
				205	typedef std::pair<uint32_t, unsigned> UTF8Decoded;
				206
				207	static UTF8Decoded decodeUTF8(StringRef Range) {
				208	StringRef::iterator Position= Range.begin();
				209	StringRef::iterator End = Range.end();
				210	// 1 byte: [0x00, 0x7f]
				211	// Bit pattern: 0xxxxxxx
				212	if ((*Position & 0x80) == 0) {
				213	return std::make_pair(*Position, 1);
				214	}
				215	// 2 bytes: [0x80, 0x7ff]
				216	// Bit pattern: 110xxxxx 10xxxxxx
				217	if (Position + 1 != End &&
				218	((*Position & 0xE0) == 0xC0) &&
				219	((*(Position + 1) & 0xC0) == 0x80)) {
				220	uint32_t codepoint = ((*Position & 0x1F) << 6) \|
				221	(*(Position + 1) & 0x3F);
				222	if (codepoint >= 0x80)
				223	return std::make_pair(codepoint, 2);
				224	}
				225	// 3 bytes: [0x8000, 0xffff]
				226	// Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
				227	if (Position + 2 != End &&
				228	((*Position & 0xF0) == 0xE0) &&
				229	((*(Position + 1) & 0xC0) == 0x80) &&
				230	((*(Position + 2) & 0xC0) == 0x80)) {
				231	uint32_t codepoint = ((*Position & 0x0F) << 12) \|
				232	((*(Position + 1) & 0x3F) << 6) \|
				233	(*(Position + 2) & 0x3F);
				234	// Codepoints between 0xD800 and 0xDFFF are invalid, as
				235	// they are high / low surrogate halves used by UTF-16.
				236	if (codepoint >= 0x800 &&
				237	(codepoint < 0xD800 \|\| codepoint > 0xDFFF))
				238	return std::make_pair(codepoint, 3);
				239	}
				240	// 4 bytes: [0x10000, 0x10FFFF]
				241	// Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				242	if (Position + 3 != End &&
				243	((*Position & 0xF8) == 0xF0) &&
				244	((*(Position + 1) & 0xC0) == 0x80) &&
				245	((*(Position + 2) & 0xC0) == 0x80) &&
				246	((*(Position + 3) & 0xC0) == 0x80)) {
				247	uint32_t codepoint = ((*Position & 0x07) << 18) \|
				248	((*(Position + 1) & 0x3F) << 12) \|
				249	((*(Position + 2) & 0x3F) << 6) \|
				250	(*(Position + 3) & 0x3F);
				251	if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
				252	return std::make_pair(codepoint, 4);
				253	}
				254	return std::make_pair(0, 0);
				255	}
				256
				257	namespace llvm {
				258	namespace yaml {
				259	/// @brief Scans YAML tokens from a MemoryBuffer.
				260	class Scanner {
				261	public:
				262	Scanner(const StringRef Input, SourceMgr &SM);
Sean Silva	aba8270	2012-11-19 23:21:47 +0000	[diff] [blame]	263	Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	264
				265	/// @brief Parse the next token and return it without popping it.
				266	Token &peekNext();
				267
				268	/// @brief Parse the next token and pop it from the queue.
				269	Token getNext();
				270
				271	void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
Dmitri Gribenko	3238fb7	2013-05-05 00:40:33 +0000	[diff] [blame]	272	ArrayRef<SMRange> Ranges = None) {
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	273	SM.PrintMessage(Loc, Kind, Message, Ranges);
				274	}
				275
				276	void setError(const Twine &Message, StringRef::iterator Position) {
				277	if (Current >= End)
				278	Current = End - 1;
				279
				280	// Don't print out more errors after the first one we encounter. The rest
				281	// are just the result of the first, and have no meaning.
				282	if (!Failed)
				283	printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
				284	Failed = true;
				285	}
				286
				287	void setError(const Twine &Message) {
				288	setError(Message, Current);
				289	}
				290
				291	/// @brief Returns true if an error occurred while parsing.
				292	bool failed() {
				293	return Failed;
				294	}
				295
				296	private:
				297	StringRef currentInput() {
				298	return StringRef(Current, End - Current);
				299	}
				300
				301	/// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
				302	/// at \a Position.
				303	///
				304	/// If the UTF-8 code units starting at Position do not form a well-formed
				305	/// code unit subsequence, then the Unicode scalar value is 0, and the length
				306	/// is 0.
				307	UTF8Decoded decodeUTF8(StringRef::iterator Position) {
				308	return ::decodeUTF8(StringRef(Position, End - Position));
				309	}
				310
				311	// The following functions are based on the gramar rules in the YAML spec. The
				312	// style of the function names it meant to closely match how they are written
				313	// in the spec. The number within the [] is the number of the grammar rule in
				314	// the spec.
				315	//
				316	// See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
				317	//
				318	// c-
				319	// A production starting and ending with a special character.
				320	// b-
				321	// A production matching a single line break.
				322	// nb-
				323	// A production starting and ending with a non-break character.
				324	// s-
				325	// A production starting and ending with a white space character.
				326	// ns-
				327	// A production starting and ending with a non-space character.
				328	// l-
				329	// A production matching complete line(s).
				330
				331	/// @brief Skip a single nb-char[27] starting at Position.
				332	///
				333	/// A nb-char is 0x9 \| [0x20-0x7E] \| 0x85 \| [0xA0-0xD7FF] \| [0xE000-0xFEFE]
				334	/// \| [0xFF00-0xFFFD] \| [0x10000-0x10FFFF]
				335	///
				336	/// @returns The code unit after the nb-char, or Position if it's not an
				337	/// nb-char.
				338	StringRef::iterator skip_nb_char(StringRef::iterator Position);
				339
				340	/// @brief Skip a single b-break[28] starting at Position.
				341	///
				342	/// A b-break is 0xD 0xA \| 0xD \| 0xA
				343	///
				344	/// @returns The code unit after the b-break, or Position if it's not a
				345	/// b-break.
				346	StringRef::iterator skip_b_break(StringRef::iterator Position);
				347
				348	/// @brief Skip a single s-white[33] starting at Position.
				349	///
				350	/// A s-white is 0x20 \| 0x9
				351	///
				352	/// @returns The code unit after the s-white, or Position if it's not a
				353	/// s-white.
				354	StringRef::iterator skip_s_white(StringRef::iterator Position);
				355
				356	/// @brief Skip a single ns-char[34] starting at Position.
				357	///
				358	/// A ns-char is nb-char - s-white
				359	///
				360	/// @returns The code unit after the ns-char, or Position if it's not a
				361	/// ns-char.
				362	StringRef::iterator skip_ns_char(StringRef::iterator Position);
				363
				364	typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
				365	/// @brief Skip minimal well-formed code unit subsequences until Func
				366	/// returns its input.
				367	///
				368	/// @returns The code unit after the last minimal well-formed code unit
				369	/// subsequence that Func accepted.
				370	StringRef::iterator skip_while( SkipWhileFunc Func
				371	, StringRef::iterator Position);
				372
				373	/// @brief Scan ns-uri-char[39]s starting at Cur.
				374	///
				375	/// This updates Cur and Column while scanning.
				376	///
				377	/// @returns A StringRef starting at Cur which covers the longest contiguous
				378	/// sequence of ns-uri-char.
				379	StringRef scan_ns_uri_char();
				380
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	381	/// @brief Consume a minimal well-formed code unit subsequence starting at
				382	/// \a Cur. Return false if it is not the same Unicode scalar value as
				383	/// \a Expected. This updates \a Column.
				384	bool consume(uint32_t Expected);
				385
				386	/// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
				387	void skip(uint32_t Distance);
				388
				389	/// @brief Return true if the minimal well-formed code unit subsequence at
				390	/// Pos is whitespace or a new line
				391	bool isBlankOrBreak(StringRef::iterator Position);
				392
				393	/// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
				394	void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
				395	, unsigned AtColumn
				396	, bool IsRequired);
				397
				398	/// @brief Remove simple keys that can no longer be valid simple keys.
				399	///
				400	/// Invalid simple keys are not on the current line or are further than 1024
				401	/// columns back.
				402	void removeStaleSimpleKeyCandidates();
				403
				404	/// @brief Remove all simple keys on FlowLevel \a Level.
				405	void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
				406
				407	/// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
				408	/// tokens if needed.
				409	bool unrollIndent(int ToColumn);
				410
				411	/// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
				412	/// if needed.
				413	bool rollIndent( int ToColumn
				414	, Token::TokenKind Kind
				415	, TokenQueueT::iterator InsertPoint);
				416
				417	/// @brief Skip whitespace and comments until the start of the next token.
				418	void scanToNextToken();
				419
				420	/// @brief Must be the first token generated.
				421	bool scanStreamStart();
				422
				423	/// @brief Generate tokens needed to close out the stream.
				424	bool scanStreamEnd();
				425
				426	/// @brief Scan a %BLAH directive.
				427	bool scanDirective();
				428
				429	/// @brief Scan a ... or ---.
				430	bool scanDocumentIndicator(bool IsStart);
				431
				432	/// @brief Scan a [ or { and generate the proper flow collection start token.
				433	bool scanFlowCollectionStart(bool IsSequence);
				434
				435	/// @brief Scan a ] or } and generate the proper flow collection end token.
				436	bool scanFlowCollectionEnd(bool IsSequence);
				437
				438	/// @brief Scan the , that separates entries in a flow collection.
				439	bool scanFlowEntry();
				440
				441	/// @brief Scan the - that starts block sequence entries.
				442	bool scanBlockEntry();
				443
				444	/// @brief Scan an explicit ? indicating a key.
				445	bool scanKey();
				446
				447	/// @brief Scan an explicit : indicating a value.
				448	bool scanValue();
				449
				450	/// @brief Scan a quoted scalar.
				451	bool scanFlowScalar(bool IsDoubleQuoted);
				452
				453	/// @brief Scan an unquoted scalar.
				454	bool scanPlainScalar();
				455
				456	/// @brief Scan an Alias or Anchor starting with * or &.
				457	bool scanAliasOrAnchor(bool IsAlias);
				458
				459	/// @brief Scan a block scalar starting with \| or >.
				460	bool scanBlockScalar(bool IsLiteral);
				461
				462	/// @brief Scan a tag of the form !stuff.
				463	bool scanTag();
				464
				465	/// @brief Dispatch to the next scanning function based on \a *Cur.
				466	bool fetchMoreTokens();
				467
				468	/// @brief The SourceMgr used for diagnostics and buffer management.
				469	SourceMgr &SM;
				470
				471	/// @brief The original input.
				472	MemoryBuffer *InputBuffer;
				473
				474	/// @brief The current position of the scanner.
				475	StringRef::iterator Current;
				476
				477	/// @brief The end of the input (one past the last character).
				478	StringRef::iterator End;
				479
				480	/// @brief Current YAML indentation level in spaces.
				481	int Indent;
				482
				483	/// @brief Current column number in Unicode code points.
				484	unsigned Column;
				485
				486	/// @brief Current line number.
				487	unsigned Line;
				488
				489	/// @brief How deep we are in flow style containers. 0 Means at block level.
				490	unsigned FlowLevel;
				491
				492	/// @brief Are we at the start of the stream?
				493	bool IsStartOfStream;
				494
				495	/// @brief Can the next token be the start of a simple key?
				496	bool IsSimpleKeyAllowed;
				497
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	498	/// @brief True if an error has occurred.
				499	bool Failed;
				500
				501	/// @brief Queue of tokens. This is required to queue up tokens while looking
				502	/// for the end of a simple key. And for cases where a single character
				503	/// can produce multiple tokens (e.g. BlockEnd).
				504	TokenQueueT TokenQueue;
				505
				506	/// @brief Indentation levels.
				507	SmallVector<int, 4> Indents;
				508
				509	/// @brief Potential simple keys.
				510	SmallVector<SimpleKey, 4> SimpleKeys;
				511	};
				512
				513	} // end namespace yaml
				514	} // end namespace llvm
				515
				516	/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
				517	static void encodeUTF8( uint32_t UnicodeScalarValue
				518	, SmallVectorImpl<char> &Result) {
				519	if (UnicodeScalarValue <= 0x7F) {
				520	Result.push_back(UnicodeScalarValue & 0x7F);
				521	} else if (UnicodeScalarValue <= 0x7FF) {
				522	uint8_t FirstByte = 0xC0 \| ((UnicodeScalarValue & 0x7C0) >> 6);
				523	uint8_t SecondByte = 0x80 \| (UnicodeScalarValue & 0x3F);
				524	Result.push_back(FirstByte);
				525	Result.push_back(SecondByte);
				526	} else if (UnicodeScalarValue <= 0xFFFF) {
				527	uint8_t FirstByte = 0xE0 \| ((UnicodeScalarValue & 0xF000) >> 12);
				528	uint8_t SecondByte = 0x80 \| ((UnicodeScalarValue & 0xFC0) >> 6);
				529	uint8_t ThirdByte = 0x80 \| (UnicodeScalarValue & 0x3F);
				530	Result.push_back(FirstByte);
				531	Result.push_back(SecondByte);
				532	Result.push_back(ThirdByte);
				533	} else if (UnicodeScalarValue <= 0x10FFFF) {
				534	uint8_t FirstByte = 0xF0 \| ((UnicodeScalarValue & 0x1F0000) >> 18);
				535	uint8_t SecondByte = 0x80 \| ((UnicodeScalarValue & 0x3F000) >> 12);
				536	uint8_t ThirdByte = 0x80 \| ((UnicodeScalarValue & 0xFC0) >> 6);
				537	uint8_t FourthByte = 0x80 \| (UnicodeScalarValue & 0x3F);
				538	Result.push_back(FirstByte);
				539	Result.push_back(SecondByte);
				540	Result.push_back(ThirdByte);
				541	Result.push_back(FourthByte);
				542	}
				543	}
				544
				545	bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
				546	SourceMgr SM;
				547	Scanner scanner(Input, SM);
				548	while (true) {
				549	Token T = scanner.getNext();
				550	switch (T.Kind) {
				551	case Token::TK_StreamStart:
				552	OS << "Stream-Start: ";
				553	break;
				554	case Token::TK_StreamEnd:
				555	OS << "Stream-End: ";
				556	break;
				557	case Token::TK_VersionDirective:
				558	OS << "Version-Directive: ";
				559	break;
				560	case Token::TK_TagDirective:
				561	OS << "Tag-Directive: ";
				562	break;
				563	case Token::TK_DocumentStart:
				564	OS << "Document-Start: ";
				565	break;
				566	case Token::TK_DocumentEnd:
				567	OS << "Document-End: ";
				568	break;
				569	case Token::TK_BlockEntry:
				570	OS << "Block-Entry: ";
				571	break;
				572	case Token::TK_BlockEnd:
				573	OS << "Block-End: ";
				574	break;
				575	case Token::TK_BlockSequenceStart:
				576	OS << "Block-Sequence-Start: ";
				577	break;
				578	case Token::TK_BlockMappingStart:
				579	OS << "Block-Mapping-Start: ";
				580	break;
				581	case Token::TK_FlowEntry:
				582	OS << "Flow-Entry: ";
				583	break;
				584	case Token::TK_FlowSequenceStart:
				585	OS << "Flow-Sequence-Start: ";
				586	break;
				587	case Token::TK_FlowSequenceEnd:
				588	OS << "Flow-Sequence-End: ";
				589	break;
				590	case Token::TK_FlowMappingStart:
				591	OS << "Flow-Mapping-Start: ";
				592	break;
				593	case Token::TK_FlowMappingEnd:
				594	OS << "Flow-Mapping-End: ";
				595	break;
				596	case Token::TK_Key:
				597	OS << "Key: ";
				598	break;
				599	case Token::TK_Value:
				600	OS << "Value: ";
				601	break;
				602	case Token::TK_Scalar:
				603	OS << "Scalar: ";
				604	break;
				605	case Token::TK_Alias:
				606	OS << "Alias: ";
				607	break;
				608	case Token::TK_Anchor:
				609	OS << "Anchor: ";
				610	break;
				611	case Token::TK_Tag:
				612	OS << "Tag: ";
				613	break;
				614	case Token::TK_Error:
				615	break;
				616	}
				617	OS << T.Range << "\n";
				618	if (T.Kind == Token::TK_StreamEnd)
				619	break;
				620	else if (T.Kind == Token::TK_Error)
				621	return false;
				622	}
				623	return true;
				624	}
				625
				626	bool yaml::scanTokens(StringRef Input) {
				627	llvm::SourceMgr SM;
				628	llvm::yaml::Scanner scanner(Input, SM);
				629	for (;;) {
				630	llvm::yaml::Token T = scanner.getNext();
				631	if (T.Kind == Token::TK_StreamEnd)
				632	break;
				633	else if (T.Kind == Token::TK_Error)
				634	return false;
				635	}
				636	return true;
				637	}
				638
				639	std::string yaml::escape(StringRef Input) {
				640	std::string EscapedInput;
				641	for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
				642	if (*i == '\\')
				643	EscapedInput += "\\\\";
				644	else if (*i == '"')
				645	EscapedInput += "\\\"";
				646	else if (*i == 0)
				647	EscapedInput += "\\0";
				648	else if (*i == 0x07)
				649	EscapedInput += "\\a";
				650	else if (*i == 0x08)
				651	EscapedInput += "\\b";
				652	else if (*i == 0x09)
				653	EscapedInput += "\\t";
				654	else if (*i == 0x0A)
				655	EscapedInput += "\\n";
				656	else if (*i == 0x0B)
				657	EscapedInput += "\\v";
				658	else if (*i == 0x0C)
				659	EscapedInput += "\\f";
				660	else if (*i == 0x0D)
				661	EscapedInput += "\\r";
				662	else if (*i == 0x1B)
				663	EscapedInput += "\\e";
Benjamin Kramer	0aa0d3d	2012-04-21 10:51:42 +0000	[diff] [blame]	664	else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	665	std::string HexStr = utohexstr(*i);
				666	EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
				667	} else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
				668	UTF8Decoded UnicodeScalarValue
				669	= decodeUTF8(StringRef(i, Input.end() - i));
				670	if (UnicodeScalarValue.second == 0) {
				671	// Found invalid char.
				672	SmallString<4> Val;
				673	encodeUTF8(0xFFFD, Val);
				674	EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
				675	// FIXME: Error reporting.
				676	return EscapedInput;
				677	}
				678	if (UnicodeScalarValue.first == 0x85)
				679	EscapedInput += "\\N";
				680	else if (UnicodeScalarValue.first == 0xA0)
				681	EscapedInput += "\\_";
				682	else if (UnicodeScalarValue.first == 0x2028)
				683	EscapedInput += "\\L";
				684	else if (UnicodeScalarValue.first == 0x2029)
				685	EscapedInput += "\\P";
				686	else {
				687	std::string HexStr = utohexstr(UnicodeScalarValue.first);
				688	if (HexStr.size() <= 2)
				689	EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
				690	else if (HexStr.size() <= 4)
				691	EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
				692	else if (HexStr.size() <= 8)
				693	EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
				694	}
				695	i += UnicodeScalarValue.second - 1;
				696	} else
				697	EscapedInput.push_back(*i);
				698	}
				699	return EscapedInput;
				700	}
				701
				702	Scanner::Scanner(StringRef Input, SourceMgr &sm)
				703	: SM(sm)
				704	, Indent(-1)
				705	, Column(0)
				706	, Line(0)
				707	, FlowLevel(0)
				708	, IsStartOfStream(true)
				709	, IsSimpleKeyAllowed(true)
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	710	, Failed(false) {
				711	InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
				712	SM.AddNewSourceBuffer(InputBuffer, SMLoc());
				713	Current = InputBuffer->getBufferStart();
				714	End = InputBuffer->getBufferEnd();
				715	}
				716
Sean Silva	aba8270	2012-11-19 23:21:47 +0000	[diff] [blame]	717	Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_)
				718	: SM(SM_)
				719	, InputBuffer(Buffer)
				720	, Current(InputBuffer->getBufferStart())
				721	, End(InputBuffer->getBufferEnd())
				722	, Indent(-1)
				723	, Column(0)
				724	, Line(0)
				725	, FlowLevel(0)
				726	, IsStartOfStream(true)
				727	, IsSimpleKeyAllowed(true)
				728	, Failed(false) {
				729	SM.AddNewSourceBuffer(InputBuffer, SMLoc());
				730	}
				731
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	732	Token &Scanner::peekNext() {
				733	// If the current token is a possible simple key, keep parsing until we
				734	// can confirm.
				735	bool NeedMore = false;
				736	while (true) {
				737	if (TokenQueue.empty() \|\| NeedMore) {
				738	if (!fetchMoreTokens()) {
				739	TokenQueue.clear();
				740	TokenQueue.push_back(Token());
				741	return TokenQueue.front();
				742	}
				743	}
				744	assert(!TokenQueue.empty() &&
				745	"fetchMoreTokens lied about getting tokens!");
				746
				747	removeStaleSimpleKeyCandidates();
				748	SimpleKey SK;
				749	SK.Tok = TokenQueue.front();
				750	if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
				751	== SimpleKeys.end())
				752	break;
				753	else
				754	NeedMore = true;
				755	}
				756	return TokenQueue.front();
				757	}
				758
				759	Token Scanner::getNext() {
				760	Token Ret = peekNext();
				761	// TokenQueue can be empty if there was an error getting the next token.
				762	if (!TokenQueue.empty())
				763	TokenQueue.pop_front();
				764
				765	// There cannot be any referenced Token's if the TokenQueue is empty. So do a
				766	// quick deallocation of them all.
				767	if (TokenQueue.empty()) {
				768	TokenQueue.Alloc.Reset();
				769	}
				770
				771	return Ret;
				772	}
				773
				774	StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
Michael J. Spencer	6033113	2012-04-27 21:12:20 +0000	[diff] [blame]	775	if (Position == End)
				776	return Position;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	777	// Check 7 bit c-printable - b-char.
				778	if ( *Position == 0x09
				779	\|\| (Position >= 0x20 && Position <= 0x7E))
				780	return Position + 1;
				781
				782	// Check for valid UTF-8.
				783	if (uint8_t(*Position) & 0x80) {
				784	UTF8Decoded u8d = decodeUTF8(Position);
				785	if ( u8d.second != 0
				786	&& u8d.first != 0xFEFF
				787	&& ( u8d.first == 0x85
				788	\|\| ( u8d.first >= 0xA0
				789	&& u8d.first <= 0xD7FF)
				790	\|\| ( u8d.first >= 0xE000
				791	&& u8d.first <= 0xFFFD)
				792	\|\| ( u8d.first >= 0x10000
				793	&& u8d.first <= 0x10FFFF)))
				794	return Position + u8d.second;
				795	}
				796	return Position;
				797	}
				798
				799	StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
Michael J. Spencer	6033113	2012-04-27 21:12:20 +0000	[diff] [blame]	800	if (Position == End)
				801	return Position;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	802	if (*Position == 0x0D) {
				803	if (Position + 1 != End && *(Position + 1) == 0x0A)
				804	return Position + 2;
				805	return Position + 1;
				806	}
				807
				808	if (*Position == 0x0A)
				809	return Position + 1;
				810	return Position;
				811	}
				812
				813
				814	StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
				815	if (Position == End)
				816	return Position;
				817	if (Position == ' ' \|\| Position == '\t')
				818	return Position + 1;
				819	return Position;
				820	}
				821
				822	StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
				823	if (Position == End)
				824	return Position;
				825	if (Position == ' ' \|\| Position == '\t')
				826	return Position;
				827	return skip_nb_char(Position);
				828	}
				829
				830	StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
				831	, StringRef::iterator Position) {
				832	while (true) {
				833	StringRef::iterator i = (this->*Func)(Position);
				834	if (i == Position)
				835	break;
				836	Position = i;
				837	}
				838	return Position;
				839	}
				840
				841	static bool is_ns_hex_digit(const char C) {
				842	return (C >= '0' && C <= '9')
				843	\|\| (C >= 'a' && C <= 'z')
				844	\|\| (C >= 'A' && C <= 'Z');
				845	}
				846
				847	static bool is_ns_word_char(const char C) {
				848	return C == '-'
				849	\|\| (C >= 'a' && C <= 'z')
				850	\|\| (C >= 'A' && C <= 'Z');
				851	}
				852
				853	StringRef Scanner::scan_ns_uri_char() {
				854	StringRef::iterator Start = Current;
				855	while (true) {
				856	if (Current == End)
				857	break;
				858	if (( *Current == '%'
				859	&& Current + 2 < End
				860	&& is_ns_hex_digit(*(Current + 1))
				861	&& is_ns_hex_digit(*(Current + 2)))
				862	\|\| is_ns_word_char(*Current)
				863	\|\| StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
				864	!= StringRef::npos) {
				865	++Current;
				866	++Column;
				867	} else
				868	break;
				869	}
				870	return StringRef(Start, Current - Start);
				871	}
				872
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	873	bool Scanner::consume(uint32_t Expected) {
				874	if (Expected >= 0x80)
				875	report_fatal_error("Not dealing with this yet");
				876	if (Current == End)
				877	return false;
				878	if (uint8_t(*Current) >= 0x80)
				879	report_fatal_error("Not dealing with this yet");
				880	if (uint8_t(*Current) == Expected) {
				881	++Current;
				882	++Column;
				883	return true;
				884	}
				885	return false;
				886	}
				887
				888	void Scanner::skip(uint32_t Distance) {
				889	Current += Distance;
				890	Column += Distance;
Benjamin Kramer	8fb58f6	2012-09-26 15:52:15 +0000	[diff] [blame]	891	assert(Current <= End && "Skipped past the end");
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	892	}
				893
				894	bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
				895	if (Position == End)
				896	return false;
				897	if ( Position == ' ' \|\| Position == '\t'
				898	\|\| Position == '\r' \|\| Position == '\n')
				899	return true;
				900	return false;
				901	}
				902
				903	void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
				904	, unsigned AtColumn
				905	, bool IsRequired) {
				906	if (IsSimpleKeyAllowed) {
				907	SimpleKey SK;
				908	SK.Tok = Tok;
				909	SK.Line = Line;
				910	SK.Column = AtColumn;
				911	SK.IsRequired = IsRequired;
				912	SK.FlowLevel = FlowLevel;
				913	SimpleKeys.push_back(SK);
				914	}
				915	}
				916
				917	void Scanner::removeStaleSimpleKeyCandidates() {
				918	for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
				919	i != SimpleKeys.end();) {
				920	if (i->Line != Line \|\| i->Column + 1024 < Column) {
				921	if (i->IsRequired)
				922	setError( "Could not find expected : for simple key"
				923	, i->Tok->Range.begin());
				924	i = SimpleKeys.erase(i);
				925	} else
				926	++i;
				927	}
				928	}
				929
				930	void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
				931	if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
				932	SimpleKeys.pop_back();
				933	}
				934
				935	bool Scanner::unrollIndent(int ToColumn) {
				936	Token T;
				937	// Indentation is ignored in flow.
				938	if (FlowLevel != 0)
				939	return true;
				940
				941	while (Indent > ToColumn) {
				942	T.Kind = Token::TK_BlockEnd;
				943	T.Range = StringRef(Current, 1);
				944	TokenQueue.push_back(T);
				945	Indent = Indents.pop_back_val();
				946	}
				947
				948	return true;
				949	}
				950
				951	bool Scanner::rollIndent( int ToColumn
				952	, Token::TokenKind Kind
				953	, TokenQueueT::iterator InsertPoint) {
				954	if (FlowLevel)
				955	return true;
				956	if (Indent < ToColumn) {
				957	Indents.push_back(Indent);
				958	Indent = ToColumn;
				959
				960	Token T;
				961	T.Kind = Kind;
				962	T.Range = StringRef(Current, 0);
				963	TokenQueue.insert(InsertPoint, T);
				964	}
				965	return true;
				966	}
				967
				968	void Scanner::scanToNextToken() {
				969	while (true) {
				970	while (Current == ' ' \|\| Current == '\t') {
				971	skip(1);
				972	}
				973
				974	// Skip comment.
				975	if (*Current == '#') {
				976	while (true) {
				977	// This may skip more than one byte, thus Column is only incremented
				978	// for code points.
				979	StringRef::iterator i = skip_nb_char(Current);
				980	if (i == Current)
				981	break;
				982	Current = i;
				983	++Column;
				984	}
				985	}
				986
				987	// Skip EOL.
				988	StringRef::iterator i = skip_b_break(Current);
				989	if (i == Current)
				990	break;
				991	Current = i;
				992	++Line;
				993	Column = 0;
				994	// New lines may start a simple key.
				995	if (!FlowLevel)
				996	IsSimpleKeyAllowed = true;
				997	}
				998	}
				999
				1000	bool Scanner::scanStreamStart() {
				1001	IsStartOfStream = false;
				1002
				1003	EncodingInfo EI = getUnicodeEncoding(currentInput());
				1004
				1005	Token T;
				1006	T.Kind = Token::TK_StreamStart;
				1007	T.Range = StringRef(Current, EI.second);
				1008	TokenQueue.push_back(T);
				1009	Current += EI.second;
				1010	return true;
				1011	}
				1012
				1013	bool Scanner::scanStreamEnd() {
				1014	// Force an ending new line if one isn't present.
				1015	if (Column != 0) {
				1016	Column = 0;
				1017	++Line;
				1018	}
				1019
				1020	unrollIndent(-1);
				1021	SimpleKeys.clear();
				1022	IsSimpleKeyAllowed = false;
				1023
				1024	Token T;
				1025	T.Kind = Token::TK_StreamEnd;
				1026	T.Range = StringRef(Current, 0);
				1027	TokenQueue.push_back(T);
				1028	return true;
				1029	}
				1030
				1031	bool Scanner::scanDirective() {
				1032	// Reset the indentation level.
				1033	unrollIndent(-1);
				1034	SimpleKeys.clear();
				1035	IsSimpleKeyAllowed = false;
				1036
				1037	StringRef::iterator Start = Current;
				1038	consume('%');
				1039	StringRef::iterator NameStart = Current;
				1040	Current = skip_while(&Scanner::skip_ns_char, Current);
				1041	StringRef Name(NameStart, Current - NameStart);
				1042	Current = skip_while(&Scanner::skip_s_white, Current);
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1043
				1044	Token T;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1045	if (Name == "YAML") {
				1046	Current = skip_while(&Scanner::skip_ns_char, Current);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1047	T.Kind = Token::TK_VersionDirective;
				1048	T.Range = StringRef(Start, Current - Start);
				1049	TokenQueue.push_back(T);
				1050	return true;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1051	} else if(Name == "TAG") {
				1052	Current = skip_while(&Scanner::skip_ns_char, Current);
				1053	Current = skip_while(&Scanner::skip_s_white, Current);
				1054	Current = skip_while(&Scanner::skip_ns_char, Current);
				1055	T.Kind = Token::TK_TagDirective;
				1056	T.Range = StringRef(Start, Current - Start);
				1057	TokenQueue.push_back(T);
				1058	return true;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1059	}
				1060	return false;
				1061	}
				1062
				1063	bool Scanner::scanDocumentIndicator(bool IsStart) {
				1064	unrollIndent(-1);
				1065	SimpleKeys.clear();
				1066	IsSimpleKeyAllowed = false;
				1067
				1068	Token T;
				1069	T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
				1070	T.Range = StringRef(Current, 3);
				1071	skip(3);
				1072	TokenQueue.push_back(T);
				1073	return true;
				1074	}
				1075
				1076	bool Scanner::scanFlowCollectionStart(bool IsSequence) {
				1077	Token T;
				1078	T.Kind = IsSequence ? Token::TK_FlowSequenceStart
				1079	: Token::TK_FlowMappingStart;
				1080	T.Range = StringRef(Current, 1);
				1081	skip(1);
				1082	TokenQueue.push_back(T);
				1083
				1084	// [ and { may begin a simple key.
				1085	saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
				1086
				1087	// And may also be followed by a simple key.
				1088	IsSimpleKeyAllowed = true;
				1089	++FlowLevel;
				1090	return true;
				1091	}
				1092
				1093	bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
				1094	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1095	IsSimpleKeyAllowed = false;
				1096	Token T;
				1097	T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
				1098	: Token::TK_FlowMappingEnd;
				1099	T.Range = StringRef(Current, 1);
				1100	skip(1);
				1101	TokenQueue.push_back(T);
				1102	if (FlowLevel)
				1103	--FlowLevel;
				1104	return true;
				1105	}
				1106
				1107	bool Scanner::scanFlowEntry() {
				1108	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1109	IsSimpleKeyAllowed = true;
				1110	Token T;
				1111	T.Kind = Token::TK_FlowEntry;
				1112	T.Range = StringRef(Current, 1);
				1113	skip(1);
				1114	TokenQueue.push_back(T);
				1115	return true;
				1116	}
				1117
				1118	bool Scanner::scanBlockEntry() {
				1119	rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
				1120	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1121	IsSimpleKeyAllowed = true;
				1122	Token T;
				1123	T.Kind = Token::TK_BlockEntry;
				1124	T.Range = StringRef(Current, 1);
				1125	skip(1);
				1126	TokenQueue.push_back(T);
				1127	return true;
				1128	}
				1129
				1130	bool Scanner::scanKey() {
				1131	if (!FlowLevel)
				1132	rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
				1133
				1134	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1135	IsSimpleKeyAllowed = !FlowLevel;
				1136
				1137	Token T;
				1138	T.Kind = Token::TK_Key;
				1139	T.Range = StringRef(Current, 1);
				1140	skip(1);
				1141	TokenQueue.push_back(T);
				1142	return true;
				1143	}
				1144
				1145	bool Scanner::scanValue() {
				1146	// If the previous token could have been a simple key, insert the key token
				1147	// into the token queue.
				1148	if (!SimpleKeys.empty()) {
				1149	SimpleKey SK = SimpleKeys.pop_back_val();
				1150	Token T;
				1151	T.Kind = Token::TK_Key;
				1152	T.Range = SK.Tok->Range;
				1153	TokenQueueT::iterator i, e;
				1154	for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
				1155	if (i == SK.Tok)
				1156	break;
				1157	}
				1158	assert(i != e && "SimpleKey not in token queue!");
				1159	i = TokenQueue.insert(i, T);
				1160
				1161	// We may also need to add a Block-Mapping-Start token.
				1162	rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
				1163
				1164	IsSimpleKeyAllowed = false;
				1165	} else {
				1166	if (!FlowLevel)
				1167	rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
				1168	IsSimpleKeyAllowed = !FlowLevel;
				1169	}
				1170
				1171	Token T;
				1172	T.Kind = Token::TK_Value;
				1173	T.Range = StringRef(Current, 1);
				1174	skip(1);
				1175	TokenQueue.push_back(T);
				1176	return true;
				1177	}
				1178
				1179	// Forbidding inlining improves performance by roughly 20%.
				1180	// FIXME: Remove once llvm optimizes this to the faster version without hints.
				1181	LLVM_ATTRIBUTE_NOINLINE static bool
				1182	wasEscaped(StringRef::iterator First, StringRef::iterator Position);
				1183
				1184	// Returns whether a character at 'Position' was escaped with a leading '\'.
				1185	// 'First' specifies the position of the first character in the string.
				1186	static bool wasEscaped(StringRef::iterator First,
				1187	StringRef::iterator Position) {
				1188	assert(Position - 1 >= First);
				1189	StringRef::iterator I = Position - 1;
				1190	// We calculate the number of consecutive '\'s before the current position
				1191	// by iterating backwards through our string.
				1192	while (I >= First && *I == '\\') --I;
				1193	// (Position - 1 - I) now contains the number of '\'s before the current
				1194	// position. If it is odd, the character at 'Position' was escaped.
				1195	return (Position - 1 - I) % 2 == 1;
				1196	}
				1197
				1198	bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
				1199	StringRef::iterator Start = Current;
				1200	unsigned ColStart = Column;
				1201	if (IsDoubleQuoted) {
				1202	do {
				1203	++Current;
				1204	while (Current != End && *Current != '"')
				1205	++Current;
				1206	// Repeat until the previous character was not a '\' or was an escaped
				1207	// backslash.
Michael J. Spencer	6033113	2012-04-27 21:12:20 +0000	[diff] [blame]	1208	} while ( Current != End
				1209	&& *(Current - 1) == '\\'
				1210	&& wasEscaped(Start + 1, Current));
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1211	} else {
				1212	skip(1);
				1213	while (true) {
				1214	// Skip a ' followed by another '.
				1215	if (Current + 1 < End && Current == '\'' && (Current + 1) == '\'') {
				1216	skip(2);
				1217	continue;
				1218	} else if (*Current == '\'')
				1219	break;
				1220	StringRef::iterator i = skip_nb_char(Current);
				1221	if (i == Current) {
				1222	i = skip_b_break(Current);
				1223	if (i == Current)
				1224	break;
				1225	Current = i;
				1226	Column = 0;
				1227	++Line;
				1228	} else {
				1229	if (i == End)
				1230	break;
				1231	Current = i;
				1232	++Column;
				1233	}
				1234	}
				1235	}
Benjamin Kramer	8fb58f6	2012-09-26 15:52:15 +0000	[diff] [blame]	1236
				1237	if (Current == End) {
				1238	setError("Expected quote at end of scalar", Current);
				1239	return false;
				1240	}
				1241
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1242	skip(1); // Skip ending quote.
				1243	Token T;
				1244	T.Kind = Token::TK_Scalar;
				1245	T.Range = StringRef(Start, Current - Start);
				1246	TokenQueue.push_back(T);
				1247
				1248	saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
				1249
				1250	IsSimpleKeyAllowed = false;
				1251
				1252	return true;
				1253	}
				1254
				1255	bool Scanner::scanPlainScalar() {
				1256	StringRef::iterator Start = Current;
				1257	unsigned ColStart = Column;
				1258	unsigned LeadingBlanks = 0;
				1259	assert(Indent >= -1 && "Indent must be >= -1 !");
				1260	unsigned indent = static_cast<unsigned>(Indent + 1);
				1261	while (true) {
				1262	if (*Current == '#')
				1263	break;
				1264
				1265	while (!isBlankOrBreak(Current)) {
				1266	if ( FlowLevel && *Current == ':'
				1267	&& !(isBlankOrBreak(Current + 1) \|\| *(Current + 1) == ',')) {
				1268	setError("Found unexpected ':' while scanning a plain scalar", Current);
				1269	return false;
				1270	}
				1271
				1272	// Check for the end of the plain scalar.
				1273	if ( (*Current == ':' && isBlankOrBreak(Current + 1))
				1274	\|\| ( FlowLevel
				1275	&& (StringRef(Current, 1).find_first_of(",:?[]{}")
				1276	!= StringRef::npos)))
				1277	break;
				1278
				1279	StringRef::iterator i = skip_nb_char(Current);
				1280	if (i == Current)
				1281	break;
				1282	Current = i;
				1283	++Column;
				1284	}
				1285
				1286	// Are we at the end?
				1287	if (!isBlankOrBreak(Current))
				1288	break;
				1289
				1290	// Eat blanks.
				1291	StringRef::iterator Tmp = Current;
				1292	while (isBlankOrBreak(Tmp)) {
				1293	StringRef::iterator i = skip_s_white(Tmp);
				1294	if (i != Tmp) {
				1295	if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
				1296	setError("Found invalid tab character in indentation", Tmp);
				1297	return false;
				1298	}
				1299	Tmp = i;
				1300	++Column;
				1301	} else {
				1302	i = skip_b_break(Tmp);
				1303	if (!LeadingBlanks)
				1304	LeadingBlanks = 1;
				1305	Tmp = i;
				1306	Column = 0;
				1307	++Line;
				1308	}
				1309	}
				1310
				1311	if (!FlowLevel && Column < indent)
				1312	break;
				1313
				1314	Current = Tmp;
				1315	}
				1316	if (Start == Current) {
				1317	setError("Got empty plain scalar", Start);
				1318	return false;
				1319	}
				1320	Token T;
				1321	T.Kind = Token::TK_Scalar;
				1322	T.Range = StringRef(Start, Current - Start);
				1323	TokenQueue.push_back(T);
				1324
				1325	// Plain scalars can be simple keys.
				1326	saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
				1327
				1328	IsSimpleKeyAllowed = false;
				1329
				1330	return true;
				1331	}
				1332
				1333	bool Scanner::scanAliasOrAnchor(bool IsAlias) {
				1334	StringRef::iterator Start = Current;
				1335	unsigned ColStart = Column;
				1336	skip(1);
				1337	while(true) {
				1338	if ( Current == '[' \|\| Current == ']'
				1339	\|\| Current == '{' \|\| Current == '}'
				1340	\|\| *Current == ','
				1341	\|\| *Current == ':')
				1342	break;
				1343	StringRef::iterator i = skip_ns_char(Current);
				1344	if (i == Current)
				1345	break;
				1346	Current = i;
				1347	++Column;
				1348	}
				1349
				1350	if (Start == Current) {
				1351	setError("Got empty alias or anchor", Start);
				1352	return false;
				1353	}
				1354
				1355	Token T;
				1356	T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
				1357	T.Range = StringRef(Start, Current - Start);
				1358	TokenQueue.push_back(T);
				1359
				1360	// Alias and anchors can be simple keys.
				1361	saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
				1362
				1363	IsSimpleKeyAllowed = false;
				1364
				1365	return true;
				1366	}
				1367
				1368	bool Scanner::scanBlockScalar(bool IsLiteral) {
				1369	StringRef::iterator Start = Current;
				1370	skip(1); // Eat \| or >
				1371	while(true) {
				1372	StringRef::iterator i = skip_nb_char(Current);
				1373	if (i == Current) {
				1374	if (Column == 0)
				1375	break;
				1376	i = skip_b_break(Current);
				1377	if (i != Current) {
				1378	// We got a line break.
				1379	Column = 0;
				1380	++Line;
				1381	Current = i;
				1382	continue;
				1383	} else {
				1384	// There was an error, which should already have been printed out.
				1385	return false;
				1386	}
				1387	}
				1388	Current = i;
				1389	++Column;
				1390	}
				1391
				1392	if (Start == Current) {
				1393	setError("Got empty block scalar", Start);
				1394	return false;
				1395	}
				1396
				1397	Token T;
				1398	T.Kind = Token::TK_Scalar;
				1399	T.Range = StringRef(Start, Current - Start);
				1400	TokenQueue.push_back(T);
				1401	return true;
				1402	}
				1403
				1404	bool Scanner::scanTag() {
				1405	StringRef::iterator Start = Current;
				1406	unsigned ColStart = Column;
				1407	skip(1); // Eat !.
				1408	if (Current == End \|\| isBlankOrBreak(Current)); // An empty tag.
				1409	else if (*Current == '<') {
				1410	skip(1);
				1411	scan_ns_uri_char();
				1412	if (!consume('>'))
				1413	return false;
				1414	} else {
				1415	// FIXME: Actually parse the c-ns-shorthand-tag rule.
				1416	Current = skip_while(&Scanner::skip_ns_char, Current);
				1417	}
				1418
				1419	Token T;
				1420	T.Kind = Token::TK_Tag;
				1421	T.Range = StringRef(Start, Current - Start);
				1422	TokenQueue.push_back(T);
				1423
				1424	// Tags can be simple keys.
				1425	saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
				1426
				1427	IsSimpleKeyAllowed = false;
				1428
				1429	return true;
				1430	}
				1431
				1432	bool Scanner::fetchMoreTokens() {
				1433	if (IsStartOfStream)
				1434	return scanStreamStart();
				1435
				1436	scanToNextToken();
				1437
				1438	if (Current == End)
				1439	return scanStreamEnd();
				1440
				1441	removeStaleSimpleKeyCandidates();
				1442
				1443	unrollIndent(Column);
				1444
				1445	if (Column == 0 && *Current == '%')
				1446	return scanDirective();
				1447
				1448	if (Column == 0 && Current + 4 <= End
				1449	&& *Current == '-'
				1450	&& *(Current + 1) == '-'
				1451	&& *(Current + 2) == '-'
				1452	&& (Current + 3 == End \|\| isBlankOrBreak(Current + 3)))
				1453	return scanDocumentIndicator(true);
				1454
				1455	if (Column == 0 && Current + 4 <= End
				1456	&& *Current == '.'
				1457	&& *(Current + 1) == '.'
				1458	&& *(Current + 2) == '.'
				1459	&& (Current + 3 == End \|\| isBlankOrBreak(Current + 3)))
				1460	return scanDocumentIndicator(false);
				1461
				1462	if (*Current == '[')
				1463	return scanFlowCollectionStart(true);
				1464
				1465	if (*Current == '{')
				1466	return scanFlowCollectionStart(false);
				1467
				1468	if (*Current == ']')
				1469	return scanFlowCollectionEnd(true);
				1470
				1471	if (*Current == '}')
				1472	return scanFlowCollectionEnd(false);
				1473
				1474	if (*Current == ',')
				1475	return scanFlowEntry();
				1476
				1477	if (*Current == '-' && isBlankOrBreak(Current + 1))
				1478	return scanBlockEntry();
				1479
				1480	if (*Current == '?' && (FlowLevel \|\| isBlankOrBreak(Current + 1)))
				1481	return scanKey();
				1482
				1483	if (*Current == ':' && (FlowLevel \|\| isBlankOrBreak(Current + 1)))
				1484	return scanValue();
				1485
				1486	if (Current == '')
				1487	return scanAliasOrAnchor(true);
				1488
				1489	if (*Current == '&')
				1490	return scanAliasOrAnchor(false);
				1491
				1492	if (*Current == '!')
				1493	return scanTag();
				1494
				1495	if (*Current == '\|' && !FlowLevel)
				1496	return scanBlockScalar(true);
				1497
				1498	if (*Current == '>' && !FlowLevel)
				1499	return scanBlockScalar(false);
				1500
				1501	if (*Current == '\'')
				1502	return scanFlowScalar(false);
				1503
				1504	if (*Current == '"')
				1505	return scanFlowScalar(true);
				1506
				1507	// Get a plain scalar.
				1508	StringRef FirstChar(Current, 1);
				1509	if (!(isBlankOrBreak(Current)
				1510	\|\| FirstChar.find_first_of("-?:,[]{}#&*!\|>'\"%@`") != StringRef::npos)
				1511	\|\| (*Current == '-' && !isBlankOrBreak(Current + 1))
				1512	\|\| (!FlowLevel && (Current == '?' \|\| Current == ':')
				1513	&& isBlankOrBreak(Current + 1))
				1514	\|\| (!FlowLevel && *Current == ':'
				1515	&& Current + 2 < End
				1516	&& *(Current + 1) == ':'
				1517	&& !isBlankOrBreak(Current + 2)))
				1518	return scanPlainScalar();
				1519
				1520	setError("Unrecognized character while tokenizing.");
				1521	return false;
				1522	}
				1523
				1524	Stream::Stream(StringRef Input, SourceMgr &SM)
Ahmed Charles	56440fd	2014-03-06 05:51:42 +0000	[diff] [blame]	1525	: scanner(new Scanner(Input, SM)), CurrentDoc() {}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1526
Sean Silva	aba8270	2012-11-19 23:21:47 +0000	[diff] [blame]	1527	Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM)
Ahmed Charles	56440fd	2014-03-06 05:51:42 +0000	[diff] [blame]	1528	: scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {}
Sean Silva	aba8270	2012-11-19 23:21:47 +0000	[diff] [blame]	1529
Benjamin Kramer	a1355d1	2012-04-04 08:53:34 +0000	[diff] [blame]	1530	Stream::~Stream() {}
				1531
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1532	bool Stream::failed() { return scanner->failed(); }
				1533
				1534	void Stream::printError(Node *N, const Twine &Msg) {
				1535	SmallVector<SMRange, 1> Ranges;
				1536	Ranges.push_back(N->getSourceRange());
				1537	scanner->printError( N->getSourceRange().Start
				1538	, SourceMgr::DK_Error
				1539	, Msg
				1540	, Ranges);
				1541	}
				1542
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1543	document_iterator Stream::begin() {
				1544	if (CurrentDoc)
				1545	report_fatal_error("Can only iterate over the stream once");
				1546
				1547	// Skip Stream-Start.
				1548	scanner->getNext();
				1549
				1550	CurrentDoc.reset(new Document(*this));
				1551	return document_iterator(CurrentDoc);
				1552	}
				1553
				1554	document_iterator Stream::end() {
				1555	return document_iterator();
				1556	}
				1557
				1558	void Stream::skip() {
				1559	for (document_iterator i = begin(), e = end(); i != e; ++i)
				1560	i->skip();
				1561	}
				1562
Ahmed Charles	56440fd	2014-03-06 05:51:42 +0000	[diff] [blame]	1563	Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
				1564	StringRef T)
				1565	: Doc(D), TypeID(Type), Anchor(A), Tag(T) {
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1566	SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
				1567	SourceRange = SMRange(Start, Start);
				1568	}
				1569
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1570	std::string Node::getVerbatimTag() const {
				1571	StringRef Raw = getRawTag();
				1572	if (!Raw.empty() && Raw != "!") {
				1573	std::string Ret;
				1574	if (Raw.find_last_of('!') == 0) {
				1575	Ret = Doc->getTagMap().find("!")->second;
				1576	Ret += Raw.substr(1);
Chandler Carruth	002da5d	2014-03-02 04:08:41 +0000	[diff] [blame]	1577	return std::move(Ret);
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1578	} else if (Raw.startswith("!!")) {
				1579	Ret = Doc->getTagMap().find("!!")->second;
				1580	Ret += Raw.substr(2);
Chandler Carruth	002da5d	2014-03-02 04:08:41 +0000	[diff] [blame]	1581	return std::move(Ret);
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1582	} else {
				1583	StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
				1584	std::map<StringRef, StringRef>::const_iterator It =
				1585	Doc->getTagMap().find(TagHandle);
				1586	if (It != Doc->getTagMap().end())
				1587	Ret = It->second;
				1588	else {
				1589	Token T;
				1590	T.Kind = Token::TK_Tag;
				1591	T.Range = TagHandle;
				1592	setError(Twine("Unknown tag handle ") + TagHandle, T);
				1593	}
				1594	Ret += Raw.substr(Raw.find_last_of('!') + 1);
Chandler Carruth	002da5d	2014-03-02 04:08:41 +0000	[diff] [blame]	1595	return std::move(Ret);
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1596	}
				1597	}
				1598
				1599	switch (getType()) {
				1600	case NK_Null:
				1601	return "tag:yaml.org,2002:null";
				1602	case NK_Scalar:
				1603	// TODO: Tag resolution.
				1604	return "tag:yaml.org,2002:str";
				1605	case NK_Mapping:
				1606	return "tag:yaml.org,2002:map";
				1607	case NK_Sequence:
				1608	return "tag:yaml.org,2002:seq";
				1609	}
				1610
				1611	return "";
				1612	}
				1613
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1614	Token &Node::peekNext() {
				1615	return Doc->peekNext();
				1616	}
				1617
				1618	Token Node::getNext() {
				1619	return Doc->getNext();
				1620	}
				1621
				1622	Node *Node::parseBlockNode() {
				1623	return Doc->parseBlockNode();
				1624	}
				1625
				1626	BumpPtrAllocator &Node::getAllocator() {
				1627	return Doc->NodeAllocator;
				1628	}
				1629
				1630	void Node::setError(const Twine &Msg, Token &Tok) const {
				1631	Doc->setError(Msg, Tok);
				1632	}
				1633
				1634	bool Node::failed() const {
				1635	return Doc->failed();
				1636	}
				1637
				1638
				1639
				1640	StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
				1641	// TODO: Handle newlines properly. We need to remove leading whitespace.
				1642	if (Value[0] == '"') { // Double quoted.
				1643	// Pull off the leading and trailing "s.
				1644	StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
				1645	// Search for characters that would require unescaping the value.
				1646	StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
				1647	if (i != StringRef::npos)
				1648	return unescapeDoubleQuoted(UnquotedValue, i, Storage);
				1649	return UnquotedValue;
				1650	} else if (Value[0] == '\'') { // Single quoted.
				1651	// Pull off the leading and trailing 's.
				1652	StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
				1653	StringRef::size_type i = UnquotedValue.find('\'');
				1654	if (i != StringRef::npos) {
				1655	// We're going to need Storage.
				1656	Storage.clear();
				1657	Storage.reserve(UnquotedValue.size());
				1658	for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
				1659	StringRef Valid(UnquotedValue.begin(), i);
				1660	Storage.insert(Storage.end(), Valid.begin(), Valid.end());
				1661	Storage.push_back('\'');
				1662	UnquotedValue = UnquotedValue.substr(i + 2);
				1663	}
				1664	Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
				1665	return StringRef(Storage.begin(), Storage.size());
				1666	}
				1667	return UnquotedValue;
				1668	}
				1669	// Plain or block.
Michael J. Spencer	c10948d	2012-05-14 22:43:34 +0000	[diff] [blame]	1670	return Value.rtrim(" ");
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1671	}
				1672
				1673	StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
				1674	, StringRef::size_type i
				1675	, SmallVectorImpl<char> &Storage)
				1676	const {
				1677	// Use Storage to build proper value.
				1678	Storage.clear();
				1679	Storage.reserve(UnquotedValue.size());
				1680	for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
				1681	// Insert all previous chars into Storage.
				1682	StringRef Valid(UnquotedValue.begin(), i);
				1683	Storage.insert(Storage.end(), Valid.begin(), Valid.end());
				1684	// Chop off inserted chars.
				1685	UnquotedValue = UnquotedValue.substr(i);
				1686
				1687	assert(!UnquotedValue.empty() && "Can't be empty!");
				1688
				1689	// Parse escape or line break.
				1690	switch (UnquotedValue[0]) {
				1691	case '\r':
				1692	case '\n':
				1693	Storage.push_back('\n');
				1694	if ( UnquotedValue.size() > 1
				1695	&& (UnquotedValue[1] == '\r' \|\| UnquotedValue[1] == '\n'))
				1696	UnquotedValue = UnquotedValue.substr(1);
				1697	UnquotedValue = UnquotedValue.substr(1);
				1698	break;
				1699	default:
				1700	if (UnquotedValue.size() == 1)
				1701	// TODO: Report error.
				1702	break;
				1703	UnquotedValue = UnquotedValue.substr(1);
				1704	switch (UnquotedValue[0]) {
				1705	default: {
				1706	Token T;
				1707	T.Range = StringRef(UnquotedValue.begin(), 1);
				1708	setError("Unrecognized escape code!", T);
				1709	return "";
				1710	}
				1711	case '\r':
				1712	case '\n':
				1713	// Remove the new line.
				1714	if ( UnquotedValue.size() > 1
				1715	&& (UnquotedValue[1] == '\r' \|\| UnquotedValue[1] == '\n'))
				1716	UnquotedValue = UnquotedValue.substr(1);
				1717	// If this was just a single byte newline, it will get skipped
				1718	// below.
				1719	break;
				1720	case '0':
				1721	Storage.push_back(0x00);
				1722	break;
				1723	case 'a':
				1724	Storage.push_back(0x07);
				1725	break;
				1726	case 'b':
				1727	Storage.push_back(0x08);
				1728	break;
				1729	case 't':
				1730	case 0x09:
				1731	Storage.push_back(0x09);
				1732	break;
				1733	case 'n':
				1734	Storage.push_back(0x0A);
				1735	break;
				1736	case 'v':
				1737	Storage.push_back(0x0B);
				1738	break;
				1739	case 'f':
				1740	Storage.push_back(0x0C);
				1741	break;
				1742	case 'r':
				1743	Storage.push_back(0x0D);
				1744	break;
				1745	case 'e':
				1746	Storage.push_back(0x1B);
				1747	break;
				1748	case ' ':
				1749	Storage.push_back(0x20);
				1750	break;
				1751	case '"':
				1752	Storage.push_back(0x22);
				1753	break;
				1754	case '/':
				1755	Storage.push_back(0x2F);
				1756	break;
				1757	case '\\':
				1758	Storage.push_back(0x5C);
				1759	break;
				1760	case 'N':
				1761	encodeUTF8(0x85, Storage);
				1762	break;
				1763	case '_':
				1764	encodeUTF8(0xA0, Storage);
				1765	break;
				1766	case 'L':
				1767	encodeUTF8(0x2028, Storage);
				1768	break;
				1769	case 'P':
				1770	encodeUTF8(0x2029, Storage);
				1771	break;
				1772	case 'x': {
				1773	if (UnquotedValue.size() < 3)
				1774	// TODO: Report error.
				1775	break;
Michael J. Spencer	a6c2c29	2012-04-26 19:27:11 +0000	[diff] [blame]	1776	unsigned int UnicodeScalarValue;
				1777	if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
				1778	// TODO: Report error.
				1779	UnicodeScalarValue = 0xFFFD;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1780	encodeUTF8(UnicodeScalarValue, Storage);
				1781	UnquotedValue = UnquotedValue.substr(2);
				1782	break;
				1783	}
				1784	case 'u': {
				1785	if (UnquotedValue.size() < 5)
				1786	// TODO: Report error.
				1787	break;
Michael J. Spencer	a6c2c29	2012-04-26 19:27:11 +0000	[diff] [blame]	1788	unsigned int UnicodeScalarValue;
				1789	if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
				1790	// TODO: Report error.
				1791	UnicodeScalarValue = 0xFFFD;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1792	encodeUTF8(UnicodeScalarValue, Storage);
				1793	UnquotedValue = UnquotedValue.substr(4);
				1794	break;
				1795	}
				1796	case 'U': {
				1797	if (UnquotedValue.size() < 9)
				1798	// TODO: Report error.
				1799	break;
Michael J. Spencer	a6c2c29	2012-04-26 19:27:11 +0000	[diff] [blame]	1800	unsigned int UnicodeScalarValue;
				1801	if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
				1802	// TODO: Report error.
				1803	UnicodeScalarValue = 0xFFFD;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1804	encodeUTF8(UnicodeScalarValue, Storage);
				1805	UnquotedValue = UnquotedValue.substr(8);
				1806	break;
				1807	}
				1808	}
				1809	UnquotedValue = UnquotedValue.substr(1);
				1810	}
				1811	}
				1812	Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
				1813	return StringRef(Storage.begin(), Storage.size());
				1814	}
				1815
				1816	Node *KeyValueNode::getKey() {
				1817	if (Key)
				1818	return Key;
				1819	// Handle implicit null keys.
				1820	{
				1821	Token &t = peekNext();
				1822	if ( t.Kind == Token::TK_BlockEnd
				1823	\|\| t.Kind == Token::TK_Value
				1824	\|\| t.Kind == Token::TK_Error) {
				1825	return Key = new (getAllocator()) NullNode(Doc);
				1826	}
				1827	if (t.Kind == Token::TK_Key)
				1828	getNext(); // skip TK_Key.
				1829	}
				1830
				1831	// Handle explicit null keys.
				1832	Token &t = peekNext();
				1833	if (t.Kind == Token::TK_BlockEnd \|\| t.Kind == Token::TK_Value) {
				1834	return Key = new (getAllocator()) NullNode(Doc);
				1835	}
				1836
				1837	// We've got a normal key.
				1838	return Key = parseBlockNode();
				1839	}
				1840
				1841	Node *KeyValueNode::getValue() {
				1842	if (Value)
				1843	return Value;
				1844	getKey()->skip();
				1845	if (failed())
				1846	return Value = new (getAllocator()) NullNode(Doc);
				1847
				1848	// Handle implicit null values.
				1849	{
				1850	Token &t = peekNext();
				1851	if ( t.Kind == Token::TK_BlockEnd
				1852	\|\| t.Kind == Token::TK_FlowMappingEnd
				1853	\|\| t.Kind == Token::TK_Key
				1854	\|\| t.Kind == Token::TK_FlowEntry
				1855	\|\| t.Kind == Token::TK_Error) {
				1856	return Value = new (getAllocator()) NullNode(Doc);
				1857	}
				1858
				1859	if (t.Kind != Token::TK_Value) {
				1860	setError("Unexpected token in Key Value.", t);
				1861	return Value = new (getAllocator()) NullNode(Doc);
				1862	}
				1863	getNext(); // skip TK_Value.
				1864	}
				1865
				1866	// Handle explicit null values.
				1867	Token &t = peekNext();
				1868	if (t.Kind == Token::TK_BlockEnd \|\| t.Kind == Token::TK_Key) {
				1869	return Value = new (getAllocator()) NullNode(Doc);
				1870	}
				1871
				1872	// We got a normal value.
				1873	return Value = parseBlockNode();
				1874	}
				1875
				1876	void MappingNode::increment() {
				1877	if (failed()) {
				1878	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1879	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1880	return;
				1881	}
				1882	if (CurrentEntry) {
				1883	CurrentEntry->skip();
				1884	if (Type == MT_Inline) {
				1885	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1886	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1887	return;
				1888	}
				1889	}
				1890	Token T = peekNext();
				1891	if (T.Kind == Token::TK_Key \|\| T.Kind == Token::TK_Scalar) {
				1892	// KeyValueNode eats the TK_Key. That way it can detect null keys.
				1893	CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
				1894	} else if (Type == MT_Block) {
				1895	switch (T.Kind) {
				1896	case Token::TK_BlockEnd:
				1897	getNext();
				1898	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1899	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1900	break;
				1901	default:
				1902	setError("Unexpected token. Expected Key or Block End", T);
				1903	case Token::TK_Error:
				1904	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1905	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1906	}
				1907	} else {
				1908	switch (T.Kind) {
				1909	case Token::TK_FlowEntry:
				1910	// Eat the flow entry and recurse.
				1911	getNext();
				1912	return increment();
				1913	case Token::TK_FlowMappingEnd:
				1914	getNext();
				1915	case Token::TK_Error:
				1916	// Set this to end iterator.
				1917	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1918	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1919	break;
				1920	default:
				1921	setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
				1922	"Mapping End."
				1923	, T);
				1924	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1925	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1926	}
				1927	}
				1928	}
				1929
				1930	void SequenceNode::increment() {
				1931	if (failed()) {
				1932	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1933	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1934	return;
				1935	}
				1936	if (CurrentEntry)
				1937	CurrentEntry->skip();
				1938	Token T = peekNext();
				1939	if (SeqType == ST_Block) {
				1940	switch (T.Kind) {
				1941	case Token::TK_BlockEntry:
				1942	getNext();
				1943	CurrentEntry = parseBlockNode();
Craig Topper	8d399f8	2014-04-09 04:20:00 +0000	[diff] [blame^]	1944	if (!CurrentEntry) { // An error occurred.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1945	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1946	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1947	}
				1948	break;
				1949	case Token::TK_BlockEnd:
				1950	getNext();
				1951	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1952	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1953	break;
				1954	default:
				1955	setError( "Unexpected token. Expected Block Entry or Block End."
				1956	, T);
				1957	case Token::TK_Error:
				1958	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1959	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1960	}
				1961	} else if (SeqType == ST_Indentless) {
				1962	switch (T.Kind) {
				1963	case Token::TK_BlockEntry:
				1964	getNext();
				1965	CurrentEntry = parseBlockNode();
Craig Topper	8d399f8	2014-04-09 04:20:00 +0000	[diff] [blame^]	1966	if (!CurrentEntry) { // An error occurred.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1967	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1968	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1969	}
				1970	break;
				1971	default:
				1972	case Token::TK_Error:
				1973	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1974	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1975	}
				1976	} else if (SeqType == ST_Flow) {
				1977	switch (T.Kind) {
				1978	case Token::TK_FlowEntry:
				1979	// Eat the flow entry and recurse.
				1980	getNext();
				1981	WasPreviousTokenFlowEntry = true;
				1982	return increment();
				1983	case Token::TK_FlowSequenceEnd:
				1984	getNext();
				1985	case Token::TK_Error:
				1986	// Set this to end iterator.
				1987	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1988	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1989	break;
				1990	case Token::TK_StreamEnd:
				1991	case Token::TK_DocumentEnd:
				1992	case Token::TK_DocumentStart:
				1993	setError("Could not find closing ]!", T);
				1994	// Set this to end iterator.
				1995	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	1996	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1997	break;
				1998	default:
				1999	if (!WasPreviousTokenFlowEntry) {
				2000	setError("Expected , between entries!", T);
				2001	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2002	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2003	break;
				2004	}
				2005	// Otherwise it must be a flow entry.
				2006	CurrentEntry = parseBlockNode();
				2007	if (!CurrentEntry) {
				2008	IsAtEnd = true;
				2009	}
				2010	WasPreviousTokenFlowEntry = false;
				2011	break;
				2012	}
				2013	}
				2014	}
				2015
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2016	Document::Document(Stream &S) : stream(S), Root(nullptr) {
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2017	// Tag maps starts with two default mappings.
				2018	TagMap["!"] = "!";
				2019	TagMap["!!"] = "tag:yaml.org,2002:";
				2020
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2021	if (parseDirectives())
				2022	expectToken(Token::TK_DocumentStart);
				2023	Token &T = peekNext();
				2024	if (T.Kind == Token::TK_DocumentStart)
				2025	getNext();
				2026	}
				2027
				2028	bool Document::skip() {
				2029	if (stream.scanner->failed())
				2030	return false;
				2031	if (!Root)
				2032	getRoot();
				2033	Root->skip();
				2034	Token &T = peekNext();
				2035	if (T.Kind == Token::TK_StreamEnd)
				2036	return false;
				2037	if (T.Kind == Token::TK_DocumentEnd) {
				2038	getNext();
				2039	return skip();
				2040	}
				2041	return true;
				2042	}
				2043
				2044	Token &Document::peekNext() {
				2045	return stream.scanner->peekNext();
				2046	}
				2047
				2048	Token Document::getNext() {
				2049	return stream.scanner->getNext();
				2050	}
				2051
				2052	void Document::setError(const Twine &Message, Token &Location) const {
				2053	stream.scanner->setError(Message, Location.Range.begin());
				2054	}
				2055
				2056	bool Document::failed() const {
				2057	return stream.scanner->failed();
				2058	}
				2059
				2060	Node *Document::parseBlockNode() {
				2061	Token T = peekNext();
				2062	// Handle properties.
				2063	Token AnchorInfo;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2064	Token TagInfo;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2065	parse_property:
				2066	switch (T.Kind) {
				2067	case Token::TK_Alias:
				2068	getNext();
				2069	return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
				2070	case Token::TK_Anchor:
				2071	if (AnchorInfo.Kind == Token::TK_Anchor) {
				2072	setError("Already encountered an anchor for this node!", T);
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2073	return nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2074	}
				2075	AnchorInfo = getNext(); // Consume TK_Anchor.
				2076	T = peekNext();
				2077	goto parse_property;
				2078	case Token::TK_Tag:
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2079	if (TagInfo.Kind == Token::TK_Tag) {
				2080	setError("Already encountered a tag for this node!", T);
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2081	return nullptr;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2082	}
				2083	TagInfo = getNext(); // Consume TK_Tag.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2084	T = peekNext();
				2085	goto parse_property;
				2086	default:
				2087	break;
				2088	}
				2089
				2090	switch (T.Kind) {
				2091	case Token::TK_BlockEntry:
				2092	// We got an unindented BlockEntry sequence. This is not terminated with
				2093	// a BlockEnd.
				2094	// Don't eat the TK_BlockEntry, SequenceNode needs it.
				2095	return new (NodeAllocator) SequenceNode( stream.CurrentDoc
				2096	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2097	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2098	, SequenceNode::ST_Indentless);
				2099	case Token::TK_BlockSequenceStart:
				2100	getNext();
				2101	return new (NodeAllocator)
				2102	SequenceNode( stream.CurrentDoc
				2103	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2104	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2105	, SequenceNode::ST_Block);
				2106	case Token::TK_BlockMappingStart:
				2107	getNext();
				2108	return new (NodeAllocator)
				2109	MappingNode( stream.CurrentDoc
				2110	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2111	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2112	, MappingNode::MT_Block);
				2113	case Token::TK_FlowSequenceStart:
				2114	getNext();
				2115	return new (NodeAllocator)
				2116	SequenceNode( stream.CurrentDoc
				2117	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2118	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2119	, SequenceNode::ST_Flow);
				2120	case Token::TK_FlowMappingStart:
				2121	getNext();
				2122	return new (NodeAllocator)
				2123	MappingNode( stream.CurrentDoc
				2124	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2125	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2126	, MappingNode::MT_Flow);
				2127	case Token::TK_Scalar:
				2128	getNext();
				2129	return new (NodeAllocator)
				2130	ScalarNode( stream.CurrentDoc
				2131	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2132	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2133	, T.Range);
				2134	case Token::TK_Key:
				2135	// Don't eat the TK_Key, KeyValueNode expects it.
				2136	return new (NodeAllocator)
				2137	MappingNode( stream.CurrentDoc
				2138	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2139	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2140	, MappingNode::MT_Inline);
				2141	case Token::TK_DocumentStart:
				2142	case Token::TK_DocumentEnd:
				2143	case Token::TK_StreamEnd:
				2144	default:
				2145	// TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
				2146	// !!null null.
				2147	return new (NodeAllocator) NullNode(stream.CurrentDoc);
				2148	case Token::TK_Error:
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2149	return nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2150	}
				2151	llvm_unreachable("Control flow shouldn't reach here.");
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2152	return nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2153	}
				2154
				2155	bool Document::parseDirectives() {
				2156	bool isDirective = false;
				2157	while (true) {
				2158	Token T = peekNext();
				2159	if (T.Kind == Token::TK_TagDirective) {
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2160	parseTAGDirective();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2161	isDirective = true;
				2162	} else if (T.Kind == Token::TK_VersionDirective) {
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2163	parseYAMLDirective();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2164	isDirective = true;
				2165	} else
				2166	break;
				2167	}
				2168	return isDirective;
				2169	}
				2170
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2171	void Document::parseYAMLDirective() {
				2172	getNext(); // Eat %YAML <version>
				2173	}
				2174
				2175	void Document::parseTAGDirective() {
				2176	Token Tag = getNext(); // %TAG <handle> <prefix>
				2177	StringRef T = Tag.Range;
				2178	// Strip %TAG
				2179	T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
				2180	std::size_t HandleEnd = T.find_first_of(" \t");
				2181	StringRef TagHandle = T.substr(0, HandleEnd);
				2182	StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
				2183	TagMap[TagHandle] = TagPrefix;
				2184	}
				2185
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2186	bool Document::expectToken(int TK) {
				2187	Token T = getNext();
				2188	if (T.Kind != TK) {
				2189	setError("Unexpected token", T);
				2190	return false;
				2191	}
				2192	return true;
				2193	}