Blame - llvm/lib/Support/YAMLParser.cpp - toolchain/llvm-project

blob: 0d169af26be8632af95bcdc44a5405d5331bba85 [file] [log] [blame]

Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1	//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements a YAML parser.
				11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#include "llvm/Support/YAMLParser.h"
David Majnemer	0d955d0	2016-08-11 22:21:41 +0000	[diff] [blame]	15	#include "llvm/ADT/STLExtras.h"
Benjamin Kramer	16132e6	2015-03-23 18:07:13 +0000	[diff] [blame]	16	#include "llvm/ADT/SmallString.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	17	#include "llvm/ADT/SmallVector.h"
				18	#include "llvm/ADT/StringExtras.h"
				19	#include "llvm/ADT/Twine.h"
Duncan P. N. Exon Smith	23d8306	2016-09-11 22:40:40 +0000	[diff] [blame]	20	#include "llvm/ADT/AllocatorList.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	21	#include "llvm/Support/ErrorHandling.h"
				22	#include "llvm/Support/MemoryBuffer.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	23	#include "llvm/Support/SourceMgr.h"
Chandler Carruth	ed0881b	2012-12-03 16:50:05 +0000	[diff] [blame]	24	#include "llvm/Support/raw_ostream.h"
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	25
				26	using namespace llvm;
				27	using namespace yaml;
				28
				29	enum UnicodeEncodingForm {
Dmitri Gribenko	dbeafa7	2012-06-09 00:01:45 +0000	[diff] [blame]	30	UEF_UTF32_LE, ///< UTF-32 Little Endian
				31	UEF_UTF32_BE, ///< UTF-32 Big Endian
				32	UEF_UTF16_LE, ///< UTF-16 Little Endian
				33	UEF_UTF16_BE, ///< UTF-16 Big Endian
				34	UEF_UTF8, ///< UTF-8 or ascii.
				35	UEF_Unknown ///< Not a valid Unicode encoding.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	36	};
				37
				38	/// EncodingInfo - Holds the encoding type and length of the byte order mark if
				39	/// it exists. Length is in {0, 2, 3, 4}.
				40	typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
				41
				42	/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
				43	/// encoding form of \a Input.
				44	///
				45	/// @param Input A string of length 0 or more.
				46	/// @returns An EncodingInfo indicating the Unicode encoding form of the input
				47	/// and how long the byte order mark is if one exists.
				48	static EncodingInfo getUnicodeEncoding(StringRef Input) {
				49	if (Input.size() == 0)
				50	return std::make_pair(UEF_Unknown, 0);
				51
				52	switch (uint8_t(Input[0])) {
				53	case 0x00:
				54	if (Input.size() >= 4) {
				55	if ( Input[1] == 0
				56	&& uint8_t(Input[2]) == 0xFE
				57	&& uint8_t(Input[3]) == 0xFF)
				58	return std::make_pair(UEF_UTF32_BE, 4);
				59	if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
				60	return std::make_pair(UEF_UTF32_BE, 0);
				61	}
				62
				63	if (Input.size() >= 2 && Input[1] != 0)
				64	return std::make_pair(UEF_UTF16_BE, 0);
				65	return std::make_pair(UEF_Unknown, 0);
				66	case 0xFF:
				67	if ( Input.size() >= 4
				68	&& uint8_t(Input[1]) == 0xFE
				69	&& Input[2] == 0
				70	&& Input[3] == 0)
				71	return std::make_pair(UEF_UTF32_LE, 4);
				72
				73	if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
				74	return std::make_pair(UEF_UTF16_LE, 2);
				75	return std::make_pair(UEF_Unknown, 0);
				76	case 0xFE:
				77	if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
				78	return std::make_pair(UEF_UTF16_BE, 2);
				79	return std::make_pair(UEF_Unknown, 0);
				80	case 0xEF:
				81	if ( Input.size() >= 3
				82	&& uint8_t(Input[1]) == 0xBB
				83	&& uint8_t(Input[2]) == 0xBF)
				84	return std::make_pair(UEF_UTF8, 3);
				85	return std::make_pair(UEF_Unknown, 0);
				86	}
				87
				88	// It could still be utf-32 or utf-16.
				89	if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
				90	return std::make_pair(UEF_UTF32_LE, 0);
				91
				92	if (Input.size() >= 2 && Input[1] == 0)
				93	return std::make_pair(UEF_UTF16_LE, 0);
				94
				95	return std::make_pair(UEF_UTF8, 0);
				96	}
				97
				98	namespace llvm {
				99	namespace yaml {
Juergen Ributzka	d12ccbd	2013-11-19 00:57:56 +0000	[diff] [blame]	100	/// Pin the vtables to this file.
				101	void Node::anchor() {}
				102	void NullNode::anchor() {}
				103	void ScalarNode::anchor() {}
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	104	void BlockScalarNode::anchor() {}
Juergen Ributzka	d12ccbd	2013-11-19 00:57:56 +0000	[diff] [blame]	105	void KeyValueNode::anchor() {}
				106	void MappingNode::anchor() {}
				107	void SequenceNode::anchor() {}
				108	void AliasNode::anchor() {}
				109
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	110	/// Token - A single YAML token.
Duncan P. N. Exon Smith	23d8306	2016-09-11 22:40:40 +0000	[diff] [blame]	111	struct Token {
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	112	enum TokenKind {
				113	TK_Error, // Uninitialized token.
				114	TK_StreamStart,
				115	TK_StreamEnd,
				116	TK_VersionDirective,
				117	TK_TagDirective,
				118	TK_DocumentStart,
				119	TK_DocumentEnd,
				120	TK_BlockEntry,
				121	TK_BlockEnd,
				122	TK_BlockSequenceStart,
				123	TK_BlockMappingStart,
				124	TK_FlowEntry,
				125	TK_FlowSequenceStart,
				126	TK_FlowSequenceEnd,
				127	TK_FlowMappingStart,
				128	TK_FlowMappingEnd,
				129	TK_Key,
				130	TK_Value,
				131	TK_Scalar,
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	132	TK_BlockScalar,
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	133	TK_Alias,
				134	TK_Anchor,
				135	TK_Tag
				136	} Kind;
				137
				138	/// A string of length 0 or more whose begin() points to the logical location
				139	/// of the token in the input.
				140	StringRef Range;
				141
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	142	/// The value of a block scalar node.
				143	std::string Value;
				144
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	145	Token() : Kind(TK_Error) {}
				146	};
Alexander Kornienko	f00654e	2015-06-23 09:49:53 +0000	[diff] [blame]	147	}
				148	}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	149
Duncan P. N. Exon Smith	23d8306	2016-09-11 22:40:40 +0000	[diff] [blame]	150	typedef llvm::BumpPtrList<Token> TokenQueueT;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	151
				152	namespace {
				153	/// @brief This struct is used to track simple keys.
				154	///
				155	/// Simple keys are handled by creating an entry in SimpleKeys for each Token
				156	/// which could legally be the start of a simple key. When peekNext is called,
				157	/// if the Token To be returned is referenced by a SimpleKey, we continue
				158	/// tokenizing until that potential simple key has either been found to not be
				159	/// a simple key (we moved on to the next line or went further than 1024 chars).
				160	/// Or when we run into a Value, and then insert a Key token (and possibly
				161	/// others) before the SimpleKey's Tok.
				162	struct SimpleKey {
				163	TokenQueueT::iterator Tok;
				164	unsigned Column;
				165	unsigned Line;
				166	unsigned FlowLevel;
				167	bool IsRequired;
				168
				169	bool operator ==(const SimpleKey &Other) {
				170	return Tok == Other.Tok;
				171	}
				172	};
Alexander Kornienko	f00654e	2015-06-23 09:49:53 +0000	[diff] [blame]	173	}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	174
				175	/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
				176	/// subsequence and the subsequence's length in code units (uint8_t).
				177	/// A length of 0 represents an error.
				178	typedef std::pair<uint32_t, unsigned> UTF8Decoded;
				179
				180	static UTF8Decoded decodeUTF8(StringRef Range) {
				181	StringRef::iterator Position= Range.begin();
				182	StringRef::iterator End = Range.end();
				183	// 1 byte: [0x00, 0x7f]
				184	// Bit pattern: 0xxxxxxx
				185	if ((*Position & 0x80) == 0) {
				186	return std::make_pair(*Position, 1);
				187	}
				188	// 2 bytes: [0x80, 0x7ff]
				189	// Bit pattern: 110xxxxx 10xxxxxx
				190	if (Position + 1 != End &&
				191	((*Position & 0xE0) == 0xC0) &&
				192	((*(Position + 1) & 0xC0) == 0x80)) {
				193	uint32_t codepoint = ((*Position & 0x1F) << 6) \|
				194	(*(Position + 1) & 0x3F);
				195	if (codepoint >= 0x80)
				196	return std::make_pair(codepoint, 2);
				197	}
				198	// 3 bytes: [0x8000, 0xffff]
				199	// Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
				200	if (Position + 2 != End &&
				201	((*Position & 0xF0) == 0xE0) &&
				202	((*(Position + 1) & 0xC0) == 0x80) &&
				203	((*(Position + 2) & 0xC0) == 0x80)) {
				204	uint32_t codepoint = ((*Position & 0x0F) << 12) \|
				205	((*(Position + 1) & 0x3F) << 6) \|
				206	(*(Position + 2) & 0x3F);
				207	// Codepoints between 0xD800 and 0xDFFF are invalid, as
				208	// they are high / low surrogate halves used by UTF-16.
				209	if (codepoint >= 0x800 &&
				210	(codepoint < 0xD800 \|\| codepoint > 0xDFFF))
				211	return std::make_pair(codepoint, 3);
				212	}
				213	// 4 bytes: [0x10000, 0x10FFFF]
				214	// Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				215	if (Position + 3 != End &&
				216	((*Position & 0xF8) == 0xF0) &&
				217	((*(Position + 1) & 0xC0) == 0x80) &&
				218	((*(Position + 2) & 0xC0) == 0x80) &&
				219	((*(Position + 3) & 0xC0) == 0x80)) {
				220	uint32_t codepoint = ((*Position & 0x07) << 18) \|
				221	((*(Position + 1) & 0x3F) << 12) \|
				222	((*(Position + 2) & 0x3F) << 6) \|
				223	(*(Position + 3) & 0x3F);
				224	if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
				225	return std::make_pair(codepoint, 4);
				226	}
				227	return std::make_pair(0, 0);
				228	}
				229
				230	namespace llvm {
				231	namespace yaml {
				232	/// @brief Scans YAML tokens from a MemoryBuffer.
				233	class Scanner {
				234	public:
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	235	Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true);
				236	Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	237
				238	/// @brief Parse the next token and return it without popping it.
				239	Token &peekNext();
				240
				241	/// @brief Parse the next token and pop it from the queue.
				242	Token getNext();
				243
				244	void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
Dmitri Gribenko	3238fb7	2013-05-05 00:40:33 +0000	[diff] [blame]	245	ArrayRef<SMRange> Ranges = None) {
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	246	SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	247	}
				248
				249	void setError(const Twine &Message, StringRef::iterator Position) {
				250	if (Current >= End)
				251	Current = End - 1;
				252
				253	// Don't print out more errors after the first one we encounter. The rest
				254	// are just the result of the first, and have no meaning.
				255	if (!Failed)
				256	printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
				257	Failed = true;
				258	}
				259
				260	void setError(const Twine &Message) {
				261	setError(Message, Current);
				262	}
				263
				264	/// @brief Returns true if an error occurred while parsing.
				265	bool failed() {
				266	return Failed;
				267	}
				268
				269	private:
Rafael Espindola	68669e3	2014-08-27 19:03:22 +0000	[diff] [blame]	270	void init(MemoryBufferRef Buffer);
				271
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	272	StringRef currentInput() {
				273	return StringRef(Current, End - Current);
				274	}
				275
				276	/// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
				277	/// at \a Position.
				278	///
				279	/// If the UTF-8 code units starting at Position do not form a well-formed
				280	/// code unit subsequence, then the Unicode scalar value is 0, and the length
				281	/// is 0.
				282	UTF8Decoded decodeUTF8(StringRef::iterator Position) {
				283	return ::decodeUTF8(StringRef(Position, End - Position));
				284	}
				285
				286	// The following functions are based on the gramar rules in the YAML spec. The
				287	// style of the function names it meant to closely match how they are written
				288	// in the spec. The number within the [] is the number of the grammar rule in
				289	// the spec.
				290	//
				291	// See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
				292	//
				293	// c-
				294	// A production starting and ending with a special character.
				295	// b-
				296	// A production matching a single line break.
				297	// nb-
				298	// A production starting and ending with a non-break character.
				299	// s-
				300	// A production starting and ending with a white space character.
				301	// ns-
				302	// A production starting and ending with a non-space character.
				303	// l-
				304	// A production matching complete line(s).
				305
				306	/// @brief Skip a single nb-char[27] starting at Position.
				307	///
				308	/// A nb-char is 0x9 \| [0x20-0x7E] \| 0x85 \| [0xA0-0xD7FF] \| [0xE000-0xFEFE]
				309	/// \| [0xFF00-0xFFFD] \| [0x10000-0x10FFFF]
				310	///
				311	/// @returns The code unit after the nb-char, or Position if it's not an
				312	/// nb-char.
				313	StringRef::iterator skip_nb_char(StringRef::iterator Position);
				314
				315	/// @brief Skip a single b-break[28] starting at Position.
				316	///
				317	/// A b-break is 0xD 0xA \| 0xD \| 0xA
				318	///
				319	/// @returns The code unit after the b-break, or Position if it's not a
				320	/// b-break.
				321	StringRef::iterator skip_b_break(StringRef::iterator Position);
				322
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	323	/// Skip a single s-space[31] starting at Position.
				324	///
				325	/// An s-space is 0x20
				326	///
				327	/// @returns The code unit after the s-space, or Position if it's not a
				328	/// s-space.
				329	StringRef::iterator skip_s_space(StringRef::iterator Position);
				330
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	331	/// @brief Skip a single s-white[33] starting at Position.
				332	///
				333	/// A s-white is 0x20 \| 0x9
				334	///
				335	/// @returns The code unit after the s-white, or Position if it's not a
				336	/// s-white.
				337	StringRef::iterator skip_s_white(StringRef::iterator Position);
				338
				339	/// @brief Skip a single ns-char[34] starting at Position.
				340	///
				341	/// A ns-char is nb-char - s-white
				342	///
				343	/// @returns The code unit after the ns-char, or Position if it's not a
				344	/// ns-char.
				345	StringRef::iterator skip_ns_char(StringRef::iterator Position);
				346
				347	typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
				348	/// @brief Skip minimal well-formed code unit subsequences until Func
				349	/// returns its input.
				350	///
				351	/// @returns The code unit after the last minimal well-formed code unit
				352	/// subsequence that Func accepted.
				353	StringRef::iterator skip_while( SkipWhileFunc Func
				354	, StringRef::iterator Position);
				355
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	356	/// Skip minimal well-formed code unit subsequences until Func returns its
				357	/// input.
				358	void advanceWhile(SkipWhileFunc Func);
				359
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	360	/// @brief Scan ns-uri-char[39]s starting at Cur.
				361	///
				362	/// This updates Cur and Column while scanning.
				363	///
				364	/// @returns A StringRef starting at Cur which covers the longest contiguous
				365	/// sequence of ns-uri-char.
				366	StringRef scan_ns_uri_char();
				367
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	368	/// @brief Consume a minimal well-formed code unit subsequence starting at
				369	/// \a Cur. Return false if it is not the same Unicode scalar value as
				370	/// \a Expected. This updates \a Column.
				371	bool consume(uint32_t Expected);
				372
				373	/// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
				374	void skip(uint32_t Distance);
				375
				376	/// @brief Return true if the minimal well-formed code unit subsequence at
				377	/// Pos is whitespace or a new line
				378	bool isBlankOrBreak(StringRef::iterator Position);
				379
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	380	/// Consume a single b-break[28] if it's present at the current position.
				381	///
				382	/// Return false if the code unit at the current position isn't a line break.
				383	bool consumeLineBreakIfPresent();
				384
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	385	/// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
				386	void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
				387	, unsigned AtColumn
				388	, bool IsRequired);
				389
				390	/// @brief Remove simple keys that can no longer be valid simple keys.
				391	///
				392	/// Invalid simple keys are not on the current line or are further than 1024
				393	/// columns back.
				394	void removeStaleSimpleKeyCandidates();
				395
				396	/// @brief Remove all simple keys on FlowLevel \a Level.
				397	void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
				398
				399	/// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
				400	/// tokens if needed.
				401	bool unrollIndent(int ToColumn);
				402
				403	/// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
				404	/// if needed.
				405	bool rollIndent( int ToColumn
				406	, Token::TokenKind Kind
				407	, TokenQueueT::iterator InsertPoint);
				408
Alex Lorenz	fe6f186	2015-05-06 23:00:45 +0000	[diff] [blame]	409	/// @brief Skip a single-line comment when the comment starts at the current
				410	/// position of the scanner.
				411	void skipComment();
				412
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	413	/// @brief Skip whitespace and comments until the start of the next token.
				414	void scanToNextToken();
				415
				416	/// @brief Must be the first token generated.
				417	bool scanStreamStart();
				418
				419	/// @brief Generate tokens needed to close out the stream.
				420	bool scanStreamEnd();
				421
				422	/// @brief Scan a %BLAH directive.
				423	bool scanDirective();
				424
				425	/// @brief Scan a ... or ---.
				426	bool scanDocumentIndicator(bool IsStart);
				427
				428	/// @brief Scan a [ or { and generate the proper flow collection start token.
				429	bool scanFlowCollectionStart(bool IsSequence);
				430
				431	/// @brief Scan a ] or } and generate the proper flow collection end token.
				432	bool scanFlowCollectionEnd(bool IsSequence);
				433
				434	/// @brief Scan the , that separates entries in a flow collection.
				435	bool scanFlowEntry();
				436
				437	/// @brief Scan the - that starts block sequence entries.
				438	bool scanBlockEntry();
				439
				440	/// @brief Scan an explicit ? indicating a key.
				441	bool scanKey();
				442
				443	/// @brief Scan an explicit : indicating a value.
				444	bool scanValue();
				445
				446	/// @brief Scan a quoted scalar.
				447	bool scanFlowScalar(bool IsDoubleQuoted);
				448
				449	/// @brief Scan an unquoted scalar.
				450	bool scanPlainScalar();
				451
				452	/// @brief Scan an Alias or Anchor starting with * or &.
				453	bool scanAliasOrAnchor(bool IsAlias);
				454
				455	/// @brief Scan a block scalar starting with \| or >.
				456	bool scanBlockScalar(bool IsLiteral);
				457
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	458	/// Scan a chomping indicator in a block scalar header.
				459	char scanBlockChompingIndicator();
				460
				461	/// Scan an indentation indicator in a block scalar header.
				462	unsigned scanBlockIndentationIndicator();
				463
				464	/// Scan a block scalar header.
				465	///
				466	/// Return false if an error occurred.
				467	bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
				468	bool &IsDone);
				469
				470	/// Look for the indentation level of a block scalar.
				471	///
				472	/// Return false if an error occurred.
				473	bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
				474	unsigned &LineBreaks, bool &IsDone);
				475
				476	/// Scan the indentation of a text line in a block scalar.
				477	///
				478	/// Return false if an error occurred.
				479	bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
				480	bool &IsDone);
				481
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	482	/// @brief Scan a tag of the form !stuff.
				483	bool scanTag();
				484
				485	/// @brief Dispatch to the next scanning function based on \a *Cur.
				486	bool fetchMoreTokens();
				487
				488	/// @brief The SourceMgr used for diagnostics and buffer management.
				489	SourceMgr &SM;
				490
				491	/// @brief The original input.
Rafael Espindola	68669e3	2014-08-27 19:03:22 +0000	[diff] [blame]	492	MemoryBufferRef InputBuffer;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	493
				494	/// @brief The current position of the scanner.
				495	StringRef::iterator Current;
				496
				497	/// @brief The end of the input (one past the last character).
				498	StringRef::iterator End;
				499
				500	/// @brief Current YAML indentation level in spaces.
				501	int Indent;
				502
				503	/// @brief Current column number in Unicode code points.
				504	unsigned Column;
				505
				506	/// @brief Current line number.
				507	unsigned Line;
				508
				509	/// @brief How deep we are in flow style containers. 0 Means at block level.
				510	unsigned FlowLevel;
				511
				512	/// @brief Are we at the start of the stream?
				513	bool IsStartOfStream;
				514
				515	/// @brief Can the next token be the start of a simple key?
				516	bool IsSimpleKeyAllowed;
				517
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	518	/// @brief True if an error has occurred.
				519	bool Failed;
				520
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	521	/// @brief Should colors be used when printing out the diagnostic messages?
				522	bool ShowColors;
				523
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	524	/// @brief Queue of tokens. This is required to queue up tokens while looking
				525	/// for the end of a simple key. And for cases where a single character
				526	/// can produce multiple tokens (e.g. BlockEnd).
				527	TokenQueueT TokenQueue;
				528
				529	/// @brief Indentation levels.
				530	SmallVector<int, 4> Indents;
				531
				532	/// @brief Potential simple keys.
				533	SmallVector<SimpleKey, 4> SimpleKeys;
				534	};
				535
				536	} // end namespace yaml
				537	} // end namespace llvm
				538
				539	/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
				540	static void encodeUTF8( uint32_t UnicodeScalarValue
				541	, SmallVectorImpl<char> &Result) {
				542	if (UnicodeScalarValue <= 0x7F) {
				543	Result.push_back(UnicodeScalarValue & 0x7F);
				544	} else if (UnicodeScalarValue <= 0x7FF) {
				545	uint8_t FirstByte = 0xC0 \| ((UnicodeScalarValue & 0x7C0) >> 6);
				546	uint8_t SecondByte = 0x80 \| (UnicodeScalarValue & 0x3F);
				547	Result.push_back(FirstByte);
				548	Result.push_back(SecondByte);
				549	} else if (UnicodeScalarValue <= 0xFFFF) {
				550	uint8_t FirstByte = 0xE0 \| ((UnicodeScalarValue & 0xF000) >> 12);
				551	uint8_t SecondByte = 0x80 \| ((UnicodeScalarValue & 0xFC0) >> 6);
				552	uint8_t ThirdByte = 0x80 \| (UnicodeScalarValue & 0x3F);
				553	Result.push_back(FirstByte);
				554	Result.push_back(SecondByte);
				555	Result.push_back(ThirdByte);
				556	} else if (UnicodeScalarValue <= 0x10FFFF) {
				557	uint8_t FirstByte = 0xF0 \| ((UnicodeScalarValue & 0x1F0000) >> 18);
				558	uint8_t SecondByte = 0x80 \| ((UnicodeScalarValue & 0x3F000) >> 12);
				559	uint8_t ThirdByte = 0x80 \| ((UnicodeScalarValue & 0xFC0) >> 6);
				560	uint8_t FourthByte = 0x80 \| (UnicodeScalarValue & 0x3F);
				561	Result.push_back(FirstByte);
				562	Result.push_back(SecondByte);
				563	Result.push_back(ThirdByte);
				564	Result.push_back(FourthByte);
				565	}
				566	}
				567
				568	bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
				569	SourceMgr SM;
				570	Scanner scanner(Input, SM);
				571	while (true) {
				572	Token T = scanner.getNext();
				573	switch (T.Kind) {
				574	case Token::TK_StreamStart:
				575	OS << "Stream-Start: ";
				576	break;
				577	case Token::TK_StreamEnd:
				578	OS << "Stream-End: ";
				579	break;
				580	case Token::TK_VersionDirective:
				581	OS << "Version-Directive: ";
				582	break;
				583	case Token::TK_TagDirective:
				584	OS << "Tag-Directive: ";
				585	break;
				586	case Token::TK_DocumentStart:
				587	OS << "Document-Start: ";
				588	break;
				589	case Token::TK_DocumentEnd:
				590	OS << "Document-End: ";
				591	break;
				592	case Token::TK_BlockEntry:
				593	OS << "Block-Entry: ";
				594	break;
				595	case Token::TK_BlockEnd:
				596	OS << "Block-End: ";
				597	break;
				598	case Token::TK_BlockSequenceStart:
				599	OS << "Block-Sequence-Start: ";
				600	break;
				601	case Token::TK_BlockMappingStart:
				602	OS << "Block-Mapping-Start: ";
				603	break;
				604	case Token::TK_FlowEntry:
				605	OS << "Flow-Entry: ";
				606	break;
				607	case Token::TK_FlowSequenceStart:
				608	OS << "Flow-Sequence-Start: ";
				609	break;
				610	case Token::TK_FlowSequenceEnd:
				611	OS << "Flow-Sequence-End: ";
				612	break;
				613	case Token::TK_FlowMappingStart:
				614	OS << "Flow-Mapping-Start: ";
				615	break;
				616	case Token::TK_FlowMappingEnd:
				617	OS << "Flow-Mapping-End: ";
				618	break;
				619	case Token::TK_Key:
				620	OS << "Key: ";
				621	break;
				622	case Token::TK_Value:
				623	OS << "Value: ";
				624	break;
				625	case Token::TK_Scalar:
				626	OS << "Scalar: ";
				627	break;
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	628	case Token::TK_BlockScalar:
				629	OS << "Block Scalar: ";
				630	break;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	631	case Token::TK_Alias:
				632	OS << "Alias: ";
				633	break;
				634	case Token::TK_Anchor:
				635	OS << "Anchor: ";
				636	break;
				637	case Token::TK_Tag:
				638	OS << "Tag: ";
				639	break;
				640	case Token::TK_Error:
				641	break;
				642	}
				643	OS << T.Range << "\n";
				644	if (T.Kind == Token::TK_StreamEnd)
				645	break;
				646	else if (T.Kind == Token::TK_Error)
				647	return false;
				648	}
				649	return true;
				650	}
				651
				652	bool yaml::scanTokens(StringRef Input) {
				653	llvm::SourceMgr SM;
				654	llvm::yaml::Scanner scanner(Input, SM);
				655	for (;;) {
				656	llvm::yaml::Token T = scanner.getNext();
				657	if (T.Kind == Token::TK_StreamEnd)
				658	break;
				659	else if (T.Kind == Token::TK_Error)
				660	return false;
				661	}
				662	return true;
				663	}
				664
				665	std::string yaml::escape(StringRef Input) {
				666	std::string EscapedInput;
				667	for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
				668	if (*i == '\\')
				669	EscapedInput += "\\\\";
				670	else if (*i == '"')
				671	EscapedInput += "\\\"";
				672	else if (*i == 0)
				673	EscapedInput += "\\0";
				674	else if (*i == 0x07)
				675	EscapedInput += "\\a";
				676	else if (*i == 0x08)
				677	EscapedInput += "\\b";
				678	else if (*i == 0x09)
				679	EscapedInput += "\\t";
				680	else if (*i == 0x0A)
				681	EscapedInput += "\\n";
				682	else if (*i == 0x0B)
				683	EscapedInput += "\\v";
				684	else if (*i == 0x0C)
				685	EscapedInput += "\\f";
				686	else if (*i == 0x0D)
				687	EscapedInput += "\\r";
				688	else if (*i == 0x1B)
				689	EscapedInput += "\\e";
Benjamin Kramer	0aa0d3d	2012-04-21 10:51:42 +0000	[diff] [blame]	690	else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	691	std::string HexStr = utohexstr(*i);
				692	EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
				693	} else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
				694	UTF8Decoded UnicodeScalarValue
				695	= decodeUTF8(StringRef(i, Input.end() - i));
				696	if (UnicodeScalarValue.second == 0) {
				697	// Found invalid char.
				698	SmallString<4> Val;
				699	encodeUTF8(0xFFFD, Val);
				700	EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
				701	// FIXME: Error reporting.
				702	return EscapedInput;
				703	}
				704	if (UnicodeScalarValue.first == 0x85)
				705	EscapedInput += "\\N";
				706	else if (UnicodeScalarValue.first == 0xA0)
				707	EscapedInput += "\\_";
				708	else if (UnicodeScalarValue.first == 0x2028)
				709	EscapedInput += "\\L";
				710	else if (UnicodeScalarValue.first == 0x2029)
				711	EscapedInput += "\\P";
				712	else {
				713	std::string HexStr = utohexstr(UnicodeScalarValue.first);
				714	if (HexStr.size() <= 2)
				715	EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
				716	else if (HexStr.size() <= 4)
				717	EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
				718	else if (HexStr.size() <= 8)
				719	EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
				720	}
				721	i += UnicodeScalarValue.second - 1;
				722	} else
				723	EscapedInput.push_back(*i);
				724	}
				725	return EscapedInput;
				726	}
				727
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	728	Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors)
				729	: SM(sm), ShowColors(ShowColors) {
Rafael Espindola	68669e3	2014-08-27 19:03:22 +0000	[diff] [blame]	730	init(MemoryBufferRef(Input, "YAML"));
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	731	}
				732
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	733	Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors)
				734	: SM(SM_), ShowColors(ShowColors) {
Rafael Espindola	68669e3	2014-08-27 19:03:22 +0000	[diff] [blame]	735	init(Buffer);
				736	}
				737
				738	void Scanner::init(MemoryBufferRef Buffer) {
				739	InputBuffer = Buffer;
				740	Current = InputBuffer.getBufferStart();
				741	End = InputBuffer.getBufferEnd();
				742	Indent = -1;
				743	Column = 0;
				744	Line = 0;
				745	FlowLevel = 0;
				746	IsStartOfStream = true;
				747	IsSimpleKeyAllowed = true;
				748	Failed = false;
				749	std::unique_ptr<MemoryBuffer> InputBufferOwner =
				750	MemoryBuffer::getMemBuffer(Buffer);
				751	SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
Sean Silva	aba8270	2012-11-19 23:21:47 +0000	[diff] [blame]	752	}
				753
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	754	Token &Scanner::peekNext() {
				755	// If the current token is a possible simple key, keep parsing until we
				756	// can confirm.
				757	bool NeedMore = false;
				758	while (true) {
				759	if (TokenQueue.empty() \|\| NeedMore) {
				760	if (!fetchMoreTokens()) {
				761	TokenQueue.clear();
				762	TokenQueue.push_back(Token());
				763	return TokenQueue.front();
				764	}
				765	}
				766	assert(!TokenQueue.empty() &&
				767	"fetchMoreTokens lied about getting tokens!");
				768
				769	removeStaleSimpleKeyCandidates();
				770	SimpleKey SK;
Duncan P. N. Exon Smith	6eeaff1	2015-10-08 22:47:55 +0000	[diff] [blame]	771	SK.Tok = TokenQueue.begin();
David Majnemer	0d955d0	2016-08-11 22:21:41 +0000	[diff] [blame]	772	if (!is_contained(SimpleKeys, SK))
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	773	break;
				774	else
				775	NeedMore = true;
				776	}
				777	return TokenQueue.front();
				778	}
				779
				780	Token Scanner::getNext() {
				781	Token Ret = peekNext();
				782	// TokenQueue can be empty if there was an error getting the next token.
				783	if (!TokenQueue.empty())
				784	TokenQueue.pop_front();
				785
				786	// There cannot be any referenced Token's if the TokenQueue is empty. So do a
				787	// quick deallocation of them all.
Duncan P. N. Exon Smith	23d8306	2016-09-11 22:40:40 +0000	[diff] [blame]	788	if (TokenQueue.empty())
				789	TokenQueue.resetAlloc();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	790
				791	return Ret;
				792	}
				793
				794	StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
Michael J. Spencer	6033113	2012-04-27 21:12:20 +0000	[diff] [blame]	795	if (Position == End)
				796	return Position;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	797	// Check 7 bit c-printable - b-char.
				798	if ( *Position == 0x09
				799	\|\| (Position >= 0x20 && Position <= 0x7E))
				800	return Position + 1;
				801
				802	// Check for valid UTF-8.
				803	if (uint8_t(*Position) & 0x80) {
				804	UTF8Decoded u8d = decodeUTF8(Position);
				805	if ( u8d.second != 0
				806	&& u8d.first != 0xFEFF
				807	&& ( u8d.first == 0x85
				808	\|\| ( u8d.first >= 0xA0
				809	&& u8d.first <= 0xD7FF)
				810	\|\| ( u8d.first >= 0xE000
				811	&& u8d.first <= 0xFFFD)
				812	\|\| ( u8d.first >= 0x10000
				813	&& u8d.first <= 0x10FFFF)))
				814	return Position + u8d.second;
				815	}
				816	return Position;
				817	}
				818
				819	StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
Michael J. Spencer	6033113	2012-04-27 21:12:20 +0000	[diff] [blame]	820	if (Position == End)
				821	return Position;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	822	if (*Position == 0x0D) {
				823	if (Position + 1 != End && *(Position + 1) == 0x0A)
				824	return Position + 2;
				825	return Position + 1;
				826	}
				827
				828	if (*Position == 0x0A)
				829	return Position + 1;
				830	return Position;
				831	}
				832
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	833	StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
				834	if (Position == End)
				835	return Position;
				836	if (*Position == ' ')
				837	return Position + 1;
				838	return Position;
				839	}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	840
				841	StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
				842	if (Position == End)
				843	return Position;
				844	if (Position == ' ' \|\| Position == '\t')
				845	return Position + 1;
				846	return Position;
				847	}
				848
				849	StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
				850	if (Position == End)
				851	return Position;
				852	if (Position == ' ' \|\| Position == '\t')
				853	return Position;
				854	return skip_nb_char(Position);
				855	}
				856
				857	StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
				858	, StringRef::iterator Position) {
				859	while (true) {
				860	StringRef::iterator i = (this->*Func)(Position);
				861	if (i == Position)
				862	break;
				863	Position = i;
				864	}
				865	return Position;
				866	}
				867
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	868	void Scanner::advanceWhile(SkipWhileFunc Func) {
				869	auto Final = skip_while(Func, Current);
				870	Column += Final - Current;
				871	Current = Final;
				872	}
				873
/// Return true if \p C is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
///
/// The letter ranges stop at 'f'/'F'; the previous 'z'/'Z' bounds accepted
/// letters that are not hexadecimal digits.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'f')
      || (C >= 'A' && C <= 'F');
}
				879
				880	static bool is_ns_word_char(const char C) {
				881	return C == '-'
				882	\|\| (C >= 'a' && C <= 'z')
				883	\|\| (C >= 'A' && C <= 'Z');
				884	}
				885
				886	StringRef Scanner::scan_ns_uri_char() {
				887	StringRef::iterator Start = Current;
				888	while (true) {
				889	if (Current == End)
				890	break;
				891	if (( *Current == '%'
				892	&& Current + 2 < End
				893	&& is_ns_hex_digit(*(Current + 1))
				894	&& is_ns_hex_digit(*(Current + 2)))
				895	\|\| is_ns_word_char(*Current)
				896	\|\| StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
				897	!= StringRef::npos) {
				898	++Current;
				899	++Column;
				900	} else
				901	break;
				902	}
				903	return StringRef(Start, Current - Start);
				904	}
				905
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	906	bool Scanner::consume(uint32_t Expected) {
				907	if (Expected >= 0x80)
				908	report_fatal_error("Not dealing with this yet");
				909	if (Current == End)
				910	return false;
				911	if (uint8_t(*Current) >= 0x80)
				912	report_fatal_error("Not dealing with this yet");
				913	if (uint8_t(*Current) == Expected) {
				914	++Current;
				915	++Column;
				916	return true;
				917	}
				918	return false;
				919	}
				920
				921	void Scanner::skip(uint32_t Distance) {
				922	Current += Distance;
				923	Column += Distance;
Benjamin Kramer	8fb58f6	2012-09-26 15:52:15 +0000	[diff] [blame]	924	assert(Current <= End && "Skipped past the end");
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	925	}
				926
				927	bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
				928	if (Position == End)
				929	return false;
Alexander Kornienko	66da20a	2015-12-28 15:46:15 +0000	[diff] [blame]	930	return Position == ' ' \|\| Position == '\t' \|\| *Position == '\r' \|\|
				931	*Position == '\n';
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	932	}
				933
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	934	bool Scanner::consumeLineBreakIfPresent() {
				935	auto Next = skip_b_break(Current);
				936	if (Next == Current)
				937	return false;
				938	Column = 0;
				939	++Line;
				940	Current = Next;
				941	return true;
				942	}
				943
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	944	void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
				945	, unsigned AtColumn
				946	, bool IsRequired) {
				947	if (IsSimpleKeyAllowed) {
				948	SimpleKey SK;
				949	SK.Tok = Tok;
				950	SK.Line = Line;
				951	SK.Column = AtColumn;
				952	SK.IsRequired = IsRequired;
				953	SK.FlowLevel = FlowLevel;
				954	SimpleKeys.push_back(SK);
				955	}
				956	}
				957
				958	void Scanner::removeStaleSimpleKeyCandidates() {
				959	for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
				960	i != SimpleKeys.end();) {
				961	if (i->Line != Line \|\| i->Column + 1024 < Column) {
				962	if (i->IsRequired)
				963	setError( "Could not find expected : for simple key"
				964	, i->Tok->Range.begin());
				965	i = SimpleKeys.erase(i);
				966	} else
				967	++i;
				968	}
				969	}
				970
				971	void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
				972	if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
				973	SimpleKeys.pop_back();
				974	}
				975
				976	bool Scanner::unrollIndent(int ToColumn) {
				977	Token T;
				978	// Indentation is ignored in flow.
				979	if (FlowLevel != 0)
				980	return true;
				981
				982	while (Indent > ToColumn) {
				983	T.Kind = Token::TK_BlockEnd;
				984	T.Range = StringRef(Current, 1);
				985	TokenQueue.push_back(T);
				986	Indent = Indents.pop_back_val();
				987	}
				988
				989	return true;
				990	}
				991
				992	bool Scanner::rollIndent( int ToColumn
				993	, Token::TokenKind Kind
				994	, TokenQueueT::iterator InsertPoint) {
				995	if (FlowLevel)
				996	return true;
				997	if (Indent < ToColumn) {
				998	Indents.push_back(Indent);
				999	Indent = ToColumn;
				1000
				1001	Token T;
				1002	T.Kind = Kind;
				1003	T.Range = StringRef(Current, 0);
				1004	TokenQueue.insert(InsertPoint, T);
				1005	}
				1006	return true;
				1007	}
				1008
Alex Lorenz	fe6f186	2015-05-06 23:00:45 +0000	[diff] [blame]	1009	void Scanner::skipComment() {
				1010	if (*Current != '#')
				1011	return;
				1012	while (true) {
				1013	// This may skip more than one byte, thus Column is only incremented
				1014	// for code points.
				1015	StringRef::iterator I = skip_nb_char(Current);
				1016	if (I == Current)
				1017	break;
				1018	Current = I;
				1019	++Column;
				1020	}
				1021	}
				1022
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1023	void Scanner::scanToNextToken() {
				1024	while (true) {
				1025	while (Current == ' ' \|\| Current == '\t') {
				1026	skip(1);
				1027	}
				1028
Alex Lorenz	fe6f186	2015-05-06 23:00:45 +0000	[diff] [blame]	1029	skipComment();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1030
				1031	// Skip EOL.
				1032	StringRef::iterator i = skip_b_break(Current);
				1033	if (i == Current)
				1034	break;
				1035	Current = i;
				1036	++Line;
				1037	Column = 0;
				1038	// New lines may start a simple key.
				1039	if (!FlowLevel)
				1040	IsSimpleKeyAllowed = true;
				1041	}
				1042	}
				1043
				1044	bool Scanner::scanStreamStart() {
				1045	IsStartOfStream = false;
				1046
				1047	EncodingInfo EI = getUnicodeEncoding(currentInput());
				1048
				1049	Token T;
				1050	T.Kind = Token::TK_StreamStart;
				1051	T.Range = StringRef(Current, EI.second);
				1052	TokenQueue.push_back(T);
				1053	Current += EI.second;
				1054	return true;
				1055	}
				1056
				1057	bool Scanner::scanStreamEnd() {
				1058	// Force an ending new line if one isn't present.
				1059	if (Column != 0) {
				1060	Column = 0;
				1061	++Line;
				1062	}
				1063
				1064	unrollIndent(-1);
				1065	SimpleKeys.clear();
				1066	IsSimpleKeyAllowed = false;
				1067
				1068	Token T;
				1069	T.Kind = Token::TK_StreamEnd;
				1070	T.Range = StringRef(Current, 0);
				1071	TokenQueue.push_back(T);
				1072	return true;
				1073	}
				1074
				1075	bool Scanner::scanDirective() {
				1076	// Reset the indentation level.
				1077	unrollIndent(-1);
				1078	SimpleKeys.clear();
				1079	IsSimpleKeyAllowed = false;
				1080
				1081	StringRef::iterator Start = Current;
				1082	consume('%');
				1083	StringRef::iterator NameStart = Current;
				1084	Current = skip_while(&Scanner::skip_ns_char, Current);
				1085	StringRef Name(NameStart, Current - NameStart);
				1086	Current = skip_while(&Scanner::skip_s_white, Current);
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1087
				1088	Token T;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1089	if (Name == "YAML") {
				1090	Current = skip_while(&Scanner::skip_ns_char, Current);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1091	T.Kind = Token::TK_VersionDirective;
				1092	T.Range = StringRef(Start, Current - Start);
				1093	TokenQueue.push_back(T);
				1094	return true;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1095	} else if(Name == "TAG") {
				1096	Current = skip_while(&Scanner::skip_ns_char, Current);
				1097	Current = skip_while(&Scanner::skip_s_white, Current);
				1098	Current = skip_while(&Scanner::skip_ns_char, Current);
				1099	T.Kind = Token::TK_TagDirective;
				1100	T.Range = StringRef(Start, Current - Start);
				1101	TokenQueue.push_back(T);
				1102	return true;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1103	}
				1104	return false;
				1105	}
				1106
				1107	bool Scanner::scanDocumentIndicator(bool IsStart) {
				1108	unrollIndent(-1);
				1109	SimpleKeys.clear();
				1110	IsSimpleKeyAllowed = false;
				1111
				1112	Token T;
				1113	T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
				1114	T.Range = StringRef(Current, 3);
				1115	skip(3);
				1116	TokenQueue.push_back(T);
				1117	return true;
				1118	}
				1119
				1120	bool Scanner::scanFlowCollectionStart(bool IsSequence) {
				1121	Token T;
				1122	T.Kind = IsSequence ? Token::TK_FlowSequenceStart
				1123	: Token::TK_FlowMappingStart;
				1124	T.Range = StringRef(Current, 1);
				1125	skip(1);
				1126	TokenQueue.push_back(T);
				1127
				1128	// [ and { may begin a simple key.
Duncan P. N. Exon Smith	6eeaff1	2015-10-08 22:47:55 +0000	[diff] [blame]	1129	saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1130
				1131	// And may also be followed by a simple key.
				1132	IsSimpleKeyAllowed = true;
				1133	++FlowLevel;
				1134	return true;
				1135	}
				1136
				1137	bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
				1138	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1139	IsSimpleKeyAllowed = false;
				1140	Token T;
				1141	T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
				1142	: Token::TK_FlowMappingEnd;
				1143	T.Range = StringRef(Current, 1);
				1144	skip(1);
				1145	TokenQueue.push_back(T);
				1146	if (FlowLevel)
				1147	--FlowLevel;
				1148	return true;
				1149	}
				1150
				1151	bool Scanner::scanFlowEntry() {
				1152	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1153	IsSimpleKeyAllowed = true;
				1154	Token T;
				1155	T.Kind = Token::TK_FlowEntry;
				1156	T.Range = StringRef(Current, 1);
				1157	skip(1);
				1158	TokenQueue.push_back(T);
				1159	return true;
				1160	}
				1161
				1162	bool Scanner::scanBlockEntry() {
				1163	rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
				1164	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1165	IsSimpleKeyAllowed = true;
				1166	Token T;
				1167	T.Kind = Token::TK_BlockEntry;
				1168	T.Range = StringRef(Current, 1);
				1169	skip(1);
				1170	TokenQueue.push_back(T);
				1171	return true;
				1172	}
				1173
				1174	bool Scanner::scanKey() {
				1175	if (!FlowLevel)
				1176	rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
				1177
				1178	removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
				1179	IsSimpleKeyAllowed = !FlowLevel;
				1180
				1181	Token T;
				1182	T.Kind = Token::TK_Key;
				1183	T.Range = StringRef(Current, 1);
				1184	skip(1);
				1185	TokenQueue.push_back(T);
				1186	return true;
				1187	}
				1188
				1189	bool Scanner::scanValue() {
				1190	// If the previous token could have been a simple key, insert the key token
				1191	// into the token queue.
				1192	if (!SimpleKeys.empty()) {
				1193	SimpleKey SK = SimpleKeys.pop_back_val();
				1194	Token T;
				1195	T.Kind = Token::TK_Key;
				1196	T.Range = SK.Tok->Range;
				1197	TokenQueueT::iterator i, e;
				1198	for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
				1199	if (i == SK.Tok)
				1200	break;
				1201	}
				1202	assert(i != e && "SimpleKey not in token queue!");
				1203	i = TokenQueue.insert(i, T);
				1204
				1205	// We may also need to add a Block-Mapping-Start token.
				1206	rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
				1207
				1208	IsSimpleKeyAllowed = false;
				1209	} else {
				1210	if (!FlowLevel)
				1211	rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
				1212	IsSimpleKeyAllowed = !FlowLevel;
				1213	}
				1214
				1215	Token T;
				1216	T.Kind = Token::TK_Value;
				1217	T.Range = StringRef(Current, 1);
				1218	skip(1);
				1219	TokenQueue.push_back(T);
				1220	return true;
				1221	}
				1222
				1223	// Forbidding inlining improves performance by roughly 20%.
				1224	// FIXME: Remove once llvm optimizes this to the faster version without hints.
				1225	LLVM_ATTRIBUTE_NOINLINE static bool
				1226	wasEscaped(StringRef::iterator First, StringRef::iterator Position);
				1227
				1228	// Returns whether a character at 'Position' was escaped with a leading '\'.
				1229	// 'First' specifies the position of the first character in the string.
				1230	static bool wasEscaped(StringRef::iterator First,
				1231	StringRef::iterator Position) {
				1232	assert(Position - 1 >= First);
				1233	StringRef::iterator I = Position - 1;
				1234	// We calculate the number of consecutive '\'s before the current position
				1235	// by iterating backwards through our string.
				1236	while (I >= First && *I == '\\') --I;
				1237	// (Position - 1 - I) now contains the number of '\'s before the current
				1238	// position. If it is odd, the character at 'Position' was escaped.
				1239	return (Position - 1 - I) % 2 == 1;
				1240	}
				1241
				1242	bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
				1243	StringRef::iterator Start = Current;
				1244	unsigned ColStart = Column;
				1245	if (IsDoubleQuoted) {
				1246	do {
				1247	++Current;
				1248	while (Current != End && *Current != '"')
				1249	++Current;
				1250	// Repeat until the previous character was not a '\' or was an escaped
				1251	// backslash.
Michael J. Spencer	6033113	2012-04-27 21:12:20 +0000	[diff] [blame]	1252	} while ( Current != End
				1253	&& *(Current - 1) == '\\'
				1254	&& wasEscaped(Start + 1, Current));
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1255	} else {
				1256	skip(1);
				1257	while (true) {
				1258	// Skip a ' followed by another '.
				1259	if (Current + 1 < End && Current == '\'' && (Current + 1) == '\'') {
				1260	skip(2);
				1261	continue;
				1262	} else if (*Current == '\'')
				1263	break;
				1264	StringRef::iterator i = skip_nb_char(Current);
				1265	if (i == Current) {
				1266	i = skip_b_break(Current);
				1267	if (i == Current)
				1268	break;
				1269	Current = i;
				1270	Column = 0;
				1271	++Line;
				1272	} else {
				1273	if (i == End)
				1274	break;
				1275	Current = i;
				1276	++Column;
				1277	}
				1278	}
				1279	}
Benjamin Kramer	8fb58f6	2012-09-26 15:52:15 +0000	[diff] [blame]	1280
				1281	if (Current == End) {
				1282	setError("Expected quote at end of scalar", Current);
				1283	return false;
				1284	}
				1285
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1286	skip(1); // Skip ending quote.
				1287	Token T;
				1288	T.Kind = Token::TK_Scalar;
				1289	T.Range = StringRef(Start, Current - Start);
				1290	TokenQueue.push_back(T);
				1291
Duncan P. N. Exon Smith	6eeaff1	2015-10-08 22:47:55 +0000	[diff] [blame]	1292	saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1293
				1294	IsSimpleKeyAllowed = false;
				1295
				1296	return true;
				1297	}
				1298
				1299	bool Scanner::scanPlainScalar() {
				1300	StringRef::iterator Start = Current;
				1301	unsigned ColStart = Column;
				1302	unsigned LeadingBlanks = 0;
				1303	assert(Indent >= -1 && "Indent must be >= -1 !");
				1304	unsigned indent = static_cast<unsigned>(Indent + 1);
				1305	while (true) {
				1306	if (*Current == '#')
				1307	break;
				1308
				1309	while (!isBlankOrBreak(Current)) {
				1310	if ( FlowLevel && *Current == ':'
				1311	&& !(isBlankOrBreak(Current + 1) \|\| *(Current + 1) == ',')) {
				1312	setError("Found unexpected ':' while scanning a plain scalar", Current);
				1313	return false;
				1314	}
				1315
				1316	// Check for the end of the plain scalar.
				1317	if ( (*Current == ':' && isBlankOrBreak(Current + 1))
				1318	\|\| ( FlowLevel
				1319	&& (StringRef(Current, 1).find_first_of(",:?[]{}")
				1320	!= StringRef::npos)))
				1321	break;
				1322
				1323	StringRef::iterator i = skip_nb_char(Current);
				1324	if (i == Current)
				1325	break;
				1326	Current = i;
				1327	++Column;
				1328	}
				1329
				1330	// Are we at the end?
				1331	if (!isBlankOrBreak(Current))
				1332	break;
				1333
				1334	// Eat blanks.
				1335	StringRef::iterator Tmp = Current;
				1336	while (isBlankOrBreak(Tmp)) {
				1337	StringRef::iterator i = skip_s_white(Tmp);
				1338	if (i != Tmp) {
				1339	if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
				1340	setError("Found invalid tab character in indentation", Tmp);
				1341	return false;
				1342	}
				1343	Tmp = i;
				1344	++Column;
				1345	} else {
				1346	i = skip_b_break(Tmp);
				1347	if (!LeadingBlanks)
				1348	LeadingBlanks = 1;
				1349	Tmp = i;
				1350	Column = 0;
				1351	++Line;
				1352	}
				1353	}
				1354
				1355	if (!FlowLevel && Column < indent)
				1356	break;
				1357
				1358	Current = Tmp;
				1359	}
				1360	if (Start == Current) {
				1361	setError("Got empty plain scalar", Start);
				1362	return false;
				1363	}
				1364	Token T;
				1365	T.Kind = Token::TK_Scalar;
				1366	T.Range = StringRef(Start, Current - Start);
				1367	TokenQueue.push_back(T);
				1368
				1369	// Plain scalars can be simple keys.
Duncan P. N. Exon Smith	6eeaff1	2015-10-08 22:47:55 +0000	[diff] [blame]	1370	saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1371
				1372	IsSimpleKeyAllowed = false;
				1373
				1374	return true;
				1375	}
				1376
				1377	bool Scanner::scanAliasOrAnchor(bool IsAlias) {
				1378	StringRef::iterator Start = Current;
				1379	unsigned ColStart = Column;
				1380	skip(1);
				1381	while(true) {
				1382	if ( Current == '[' \|\| Current == ']'
				1383	\|\| Current == '{' \|\| Current == '}'
				1384	\|\| *Current == ','
				1385	\|\| *Current == ':')
				1386	break;
				1387	StringRef::iterator i = skip_ns_char(Current);
				1388	if (i == Current)
				1389	break;
				1390	Current = i;
				1391	++Column;
				1392	}
				1393
				1394	if (Start == Current) {
				1395	setError("Got empty alias or anchor", Start);
				1396	return false;
				1397	}
				1398
				1399	Token T;
				1400	T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
				1401	T.Range = StringRef(Start, Current - Start);
				1402	TokenQueue.push_back(T);
				1403
				1404	// Alias and anchors can be simple keys.
Duncan P. N. Exon Smith	6eeaff1	2015-10-08 22:47:55 +0000	[diff] [blame]	1405	saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1406
				1407	IsSimpleKeyAllowed = false;
				1408
				1409	return true;
				1410	}
				1411
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1412	char Scanner::scanBlockChompingIndicator() {
				1413	char Indicator = ' ';
				1414	if (Current != End && (Current == '+' \|\| Current == '-')) {
				1415	Indicator = *Current;
				1416	skip(1);
				1417	}
				1418	return Indicator;
				1419	}
				1420
				1421	/// Get the number of line breaks after chomping.
				1422	///
				1423	/// Return the number of trailing line breaks to emit, depending on
				1424	/// \p ChompingIndicator.
				1425	static unsigned getChompedLineBreaks(char ChompingIndicator,
				1426	unsigned LineBreaks, StringRef Str) {
				1427	if (ChompingIndicator == '-') // Strip all line breaks.
				1428	return 0;
				1429	if (ChompingIndicator == '+') // Keep all line breaks.
				1430	return LineBreaks;
				1431	// Clip trailing lines.
				1432	return Str.empty() ? 0 : 1;
				1433	}
				1434
				1435	unsigned Scanner::scanBlockIndentationIndicator() {
				1436	unsigned Indent = 0;
				1437	if (Current != End && (Current >= '1' && Current <= '9')) {
				1438	Indent = unsigned(*Current - '0');
				1439	skip(1);
				1440	}
				1441	return Indent;
				1442	}
				1443
				1444	bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
				1445	unsigned &IndentIndicator, bool &IsDone) {
				1446	auto Start = Current;
				1447
				1448	ChompingIndicator = scanBlockChompingIndicator();
				1449	IndentIndicator = scanBlockIndentationIndicator();
				1450	// Check for the chomping indicator once again.
				1451	if (ChompingIndicator == ' ')
				1452	ChompingIndicator = scanBlockChompingIndicator();
				1453	Current = skip_while(&Scanner::skip_s_white, Current);
				1454	skipComment();
				1455
				1456	if (Current == End) { // EOF, we have an empty scalar.
				1457	Token T;
				1458	T.Kind = Token::TK_BlockScalar;
				1459	T.Range = StringRef(Start, Current - Start);
				1460	TokenQueue.push_back(T);
				1461	IsDone = true;
				1462	return true;
				1463	}
				1464
				1465	if (!consumeLineBreakIfPresent()) {
				1466	setError("Expected a line break after block scalar header", Current);
				1467	return false;
				1468	}
				1469	return true;
				1470	}
				1471
				1472	bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
				1473	unsigned BlockExitIndent,
				1474	unsigned &LineBreaks, bool &IsDone) {
				1475	unsigned MaxAllSpaceLineCharacters = 0;
				1476	StringRef::iterator LongestAllSpaceLine;
				1477
				1478	while (true) {
				1479	advanceWhile(&Scanner::skip_s_space);
				1480	if (skip_nb_char(Current) != Current) {
				1481	// This line isn't empty, so try and find the indentation.
				1482	if (Column <= BlockExitIndent) { // End of the block literal.
				1483	IsDone = true;
				1484	return true;
				1485	}
				1486	// We found the block's indentation.
				1487	BlockIndent = Column;
				1488	if (MaxAllSpaceLineCharacters > BlockIndent) {
				1489	setError(
				1490	"Leading all-spaces line must be smaller than the block indent",
				1491	LongestAllSpaceLine);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1492	return false;
				1493	}
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1494	return true;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1495	}
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1496	if (skip_b_break(Current) != Current &&
				1497	Column > MaxAllSpaceLineCharacters) {
				1498	// Record the longest all-space line in case it's longer than the
				1499	// discovered block indent.
				1500	MaxAllSpaceLineCharacters = Column;
				1501	LongestAllSpaceLine = Current;
				1502	}
				1503
				1504	// Check for EOF.
				1505	if (Current == End) {
				1506	IsDone = true;
				1507	return true;
				1508	}
				1509
				1510	if (!consumeLineBreakIfPresent()) {
				1511	IsDone = true;
				1512	return true;
				1513	}
				1514	++LineBreaks;
				1515	}
				1516	return true;
				1517	}
				1518
				1519	bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
				1520	unsigned BlockExitIndent, bool &IsDone) {
				1521	// Skip the indentation.
				1522	while (Column < BlockIndent) {
				1523	auto I = skip_s_space(Current);
				1524	if (I == Current)
				1525	break;
				1526	Current = I;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1527	++Column;
				1528	}
				1529
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1530	if (skip_nb_char(Current) == Current)
				1531	return true;
				1532
				1533	if (Column <= BlockExitIndent) { // End of the block literal.
				1534	IsDone = true;
				1535	return true;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1536	}
				1537
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1538	if (Column < BlockIndent) {
				1539	if (Current != End && *Current == '#') { // Trailing comment.
				1540	IsDone = true;
				1541	return true;
				1542	}
				1543	setError("A text line is less indented than the block scalar", Current);
				1544	return false;
				1545	}
				1546	return true; // A normal text line.
				1547	}
				1548
				1549	bool Scanner::scanBlockScalar(bool IsLiteral) {
				1550	// Eat '\|' or '>'
				1551	assert(Current == '\|' \|\| Current == '>');
				1552	skip(1);
				1553
				1554	char ChompingIndicator;
				1555	unsigned BlockIndent;
				1556	bool IsDone = false;
				1557	if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
				1558	return false;
				1559	if (IsDone)
				1560	return true;
				1561
				1562	auto Start = Current;
				1563	unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
				1564	unsigned LineBreaks = 0;
				1565	if (BlockIndent == 0) {
				1566	if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
				1567	IsDone))
				1568	return false;
				1569	}
				1570
				1571	// Scan the block's scalars body.
				1572	SmallString<256> Str;
				1573	while (!IsDone) {
				1574	if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
				1575	return false;
				1576	if (IsDone)
				1577	break;
				1578
				1579	// Parse the current line.
				1580	auto LineStart = Current;
				1581	advanceWhile(&Scanner::skip_nb_char);
				1582	if (LineStart != Current) {
				1583	Str.append(LineBreaks, '\n');
				1584	Str.append(StringRef(LineStart, Current - LineStart));
				1585	LineBreaks = 0;
				1586	}
				1587
				1588	// Check for EOF.
				1589	if (Current == End)
				1590	break;
				1591
				1592	if (!consumeLineBreakIfPresent())
				1593	break;
				1594	++LineBreaks;
				1595	}
				1596
				1597	if (Current == End && !LineBreaks)
				1598	// Ensure that there is at least one line break before the end of file.
				1599	LineBreaks = 1;
				1600	Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
				1601
				1602	// New lines may start a simple key.
				1603	if (!FlowLevel)
				1604	IsSimpleKeyAllowed = true;
				1605
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1606	Token T;
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1607	T.Kind = Token::TK_BlockScalar;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1608	T.Range = StringRef(Start, Current - Start);
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1609	T.Value = Str.str().str();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1610	TokenQueue.push_back(T);
				1611	return true;
				1612	}
				1613
				1614	bool Scanner::scanTag() {
				1615	StringRef::iterator Start = Current;
				1616	unsigned ColStart = Column;
				1617	skip(1); // Eat !.
				1618	if (Current == End \|\| isBlankOrBreak(Current)); // An empty tag.
				1619	else if (*Current == '<') {
				1620	skip(1);
				1621	scan_ns_uri_char();
				1622	if (!consume('>'))
				1623	return false;
				1624	} else {
				1625	// FIXME: Actually parse the c-ns-shorthand-tag rule.
				1626	Current = skip_while(&Scanner::skip_ns_char, Current);
				1627	}
				1628
				1629	Token T;
				1630	T.Kind = Token::TK_Tag;
				1631	T.Range = StringRef(Start, Current - Start);
				1632	TokenQueue.push_back(T);
				1633
				1634	// Tags can be simple keys.
Duncan P. N. Exon Smith	6eeaff1	2015-10-08 22:47:55 +0000	[diff] [blame]	1635	saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1636
				1637	IsSimpleKeyAllowed = false;
				1638
				1639	return true;
				1640	}
				1641
				1642	bool Scanner::fetchMoreTokens() {
				1643	if (IsStartOfStream)
				1644	return scanStreamStart();
				1645
				1646	scanToNextToken();
				1647
				1648	if (Current == End)
				1649	return scanStreamEnd();
				1650
				1651	removeStaleSimpleKeyCandidates();
				1652
				1653	unrollIndent(Column);
				1654
				1655	if (Column == 0 && *Current == '%')
				1656	return scanDirective();
				1657
				1658	if (Column == 0 && Current + 4 <= End
				1659	&& *Current == '-'
				1660	&& *(Current + 1) == '-'
				1661	&& *(Current + 2) == '-'
				1662	&& (Current + 3 == End \|\| isBlankOrBreak(Current + 3)))
				1663	return scanDocumentIndicator(true);
				1664
				1665	if (Column == 0 && Current + 4 <= End
				1666	&& *Current == '.'
				1667	&& *(Current + 1) == '.'
				1668	&& *(Current + 2) == '.'
				1669	&& (Current + 3 == End \|\| isBlankOrBreak(Current + 3)))
				1670	return scanDocumentIndicator(false);
				1671
				1672	if (*Current == '[')
				1673	return scanFlowCollectionStart(true);
				1674
				1675	if (*Current == '{')
				1676	return scanFlowCollectionStart(false);
				1677
				1678	if (*Current == ']')
				1679	return scanFlowCollectionEnd(true);
				1680
				1681	if (*Current == '}')
				1682	return scanFlowCollectionEnd(false);
				1683
				1684	if (*Current == ',')
				1685	return scanFlowEntry();
				1686
				1687	if (*Current == '-' && isBlankOrBreak(Current + 1))
				1688	return scanBlockEntry();
				1689
				1690	if (*Current == '?' && (FlowLevel \|\| isBlankOrBreak(Current + 1)))
				1691	return scanKey();
				1692
				1693	if (*Current == ':' && (FlowLevel \|\| isBlankOrBreak(Current + 1)))
				1694	return scanValue();
				1695
				1696	if (Current == '')
				1697	return scanAliasOrAnchor(true);
				1698
				1699	if (*Current == '&')
				1700	return scanAliasOrAnchor(false);
				1701
				1702	if (*Current == '!')
				1703	return scanTag();
				1704
				1705	if (*Current == '\|' && !FlowLevel)
				1706	return scanBlockScalar(true);
				1707
				1708	if (*Current == '>' && !FlowLevel)
				1709	return scanBlockScalar(false);
				1710
				1711	if (*Current == '\'')
				1712	return scanFlowScalar(false);
				1713
				1714	if (*Current == '"')
				1715	return scanFlowScalar(true);
				1716
				1717	// Get a plain scalar.
				1718	StringRef FirstChar(Current, 1);
				1719	if (!(isBlankOrBreak(Current)
				1720	\|\| FirstChar.find_first_of("-?:,[]{}#&*!\|>'\"%@`") != StringRef::npos)
				1721	\|\| (*Current == '-' && !isBlankOrBreak(Current + 1))
				1722	\|\| (!FlowLevel && (Current == '?' \|\| Current == ':')
				1723	&& isBlankOrBreak(Current + 1))
				1724	\|\| (!FlowLevel && *Current == ':'
				1725	&& Current + 2 < End
				1726	&& *(Current + 1) == ':'
				1727	&& !isBlankOrBreak(Current + 2)))
				1728	return scanPlainScalar();
				1729
				1730	setError("Unrecognized character while tokenizing.");
				1731	return false;
				1732	}
				1733
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	1734	Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors)
				1735	: scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1736
Alex Lorenz	e4bcfbf	2015-05-07 18:08:46 +0000	[diff] [blame]	1737	Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors)
				1738	: scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {}
Sean Silva	aba8270	2012-11-19 23:21:47 +0000	[diff] [blame]	1739
Benjamin Kramer	a1355d1	2012-04-04 08:53:34 +0000	[diff] [blame]	1740	Stream::~Stream() {}
				1741
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1742	bool Stream::failed() { return scanner->failed(); }
				1743
				1744	void Stream::printError(Node *N, const Twine &Msg) {
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1745	scanner->printError( N->getSourceRange().Start
				1746	, SourceMgr::DK_Error
				1747	, Msg
Benjamin Kramer	ea68a94	2015-02-19 15:26:17 +0000	[diff] [blame]	1748	, N->getSourceRange());
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1749	}
				1750
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1751	document_iterator Stream::begin() {
				1752	if (CurrentDoc)
				1753	report_fatal_error("Can only iterate over the stream once");
				1754
				1755	// Skip Stream-Start.
				1756	scanner->getNext();
				1757
				1758	CurrentDoc.reset(new Document(*this));
				1759	return document_iterator(CurrentDoc);
				1760	}
				1761
				1762	document_iterator Stream::end() {
				1763	return document_iterator();
				1764	}
				1765
				1766	void Stream::skip() {
				1767	for (document_iterator i = begin(), e = end(); i != e; ++i)
				1768	i->skip();
				1769	}
				1770
Ahmed Charles	56440fd	2014-03-06 05:51:42 +0000	[diff] [blame]	1771	Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
				1772	StringRef T)
				1773	: Doc(D), TypeID(Type), Anchor(A), Tag(T) {
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1774	SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
				1775	SourceRange = SMRange(Start, Start);
				1776	}
				1777
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1778	std::string Node::getVerbatimTag() const {
				1779	StringRef Raw = getRawTag();
				1780	if (!Raw.empty() && Raw != "!") {
				1781	std::string Ret;
				1782	if (Raw.find_last_of('!') == 0) {
				1783	Ret = Doc->getTagMap().find("!")->second;
				1784	Ret += Raw.substr(1);
Richard Trieu	73d0652	2015-01-17 00:46:44 +0000	[diff] [blame]	1785	return Ret;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1786	} else if (Raw.startswith("!!")) {
				1787	Ret = Doc->getTagMap().find("!!")->second;
				1788	Ret += Raw.substr(2);
Richard Trieu	73d0652	2015-01-17 00:46:44 +0000	[diff] [blame]	1789	return Ret;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1790	} else {
				1791	StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
				1792	std::map<StringRef, StringRef>::const_iterator It =
				1793	Doc->getTagMap().find(TagHandle);
				1794	if (It != Doc->getTagMap().end())
				1795	Ret = It->second;
				1796	else {
				1797	Token T;
				1798	T.Kind = Token::TK_Tag;
				1799	T.Range = TagHandle;
				1800	setError(Twine("Unknown tag handle ") + TagHandle, T);
				1801	}
				1802	Ret += Raw.substr(Raw.find_last_of('!') + 1);
Richard Trieu	73d0652	2015-01-17 00:46:44 +0000	[diff] [blame]	1803	return Ret;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1804	}
				1805	}
				1806
				1807	switch (getType()) {
				1808	case NK_Null:
				1809	return "tag:yaml.org,2002:null";
				1810	case NK_Scalar:
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	1811	case NK_BlockScalar:
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	1812	// TODO: Tag resolution.
				1813	return "tag:yaml.org,2002:str";
				1814	case NK_Mapping:
				1815	return "tag:yaml.org,2002:map";
				1816	case NK_Sequence:
				1817	return "tag:yaml.org,2002:seq";
				1818	}
				1819
				1820	return "";
				1821	}
				1822
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1823	Token &Node::peekNext() {
				1824	return Doc->peekNext();
				1825	}
				1826
				1827	Token Node::getNext() {
				1828	return Doc->getNext();
				1829	}
				1830
				1831	Node *Node::parseBlockNode() {
				1832	return Doc->parseBlockNode();
				1833	}
				1834
				1835	BumpPtrAllocator &Node::getAllocator() {
				1836	return Doc->NodeAllocator;
				1837	}
				1838
				1839	void Node::setError(const Twine &Msg, Token &Tok) const {
				1840	Doc->setError(Msg, Tok);
				1841	}
				1842
				1843	bool Node::failed() const {
				1844	return Doc->failed();
				1845	}
				1846
				1847
				1848
				1849	StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
				1850	// TODO: Handle newlines properly. We need to remove leading whitespace.
				1851	if (Value[0] == '"') { // Double quoted.
				1852	// Pull off the leading and trailing "s.
				1853	StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
				1854	// Search for characters that would require unescaping the value.
				1855	StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
				1856	if (i != StringRef::npos)
				1857	return unescapeDoubleQuoted(UnquotedValue, i, Storage);
				1858	return UnquotedValue;
				1859	} else if (Value[0] == '\'') { // Single quoted.
				1860	// Pull off the leading and trailing 's.
				1861	StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
				1862	StringRef::size_type i = UnquotedValue.find('\'');
				1863	if (i != StringRef::npos) {
				1864	// We're going to need Storage.
				1865	Storage.clear();
				1866	Storage.reserve(UnquotedValue.size());
				1867	for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
				1868	StringRef Valid(UnquotedValue.begin(), i);
				1869	Storage.insert(Storage.end(), Valid.begin(), Valid.end());
				1870	Storage.push_back('\'');
				1871	UnquotedValue = UnquotedValue.substr(i + 2);
				1872	}
				1873	Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
				1874	return StringRef(Storage.begin(), Storage.size());
				1875	}
				1876	return UnquotedValue;
				1877	}
				1878	// Plain or block.
Vedant Kumar	98372e3	2016-02-16 02:06:01 +0000	[diff] [blame]	1879	return Value.rtrim(' ');
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1880	}
				1881
				1882	StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
				1883	, StringRef::size_type i
				1884	, SmallVectorImpl<char> &Storage)
				1885	const {
				1886	// Use Storage to build proper value.
				1887	Storage.clear();
				1888	Storage.reserve(UnquotedValue.size());
				1889	for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
				1890	// Insert all previous chars into Storage.
				1891	StringRef Valid(UnquotedValue.begin(), i);
				1892	Storage.insert(Storage.end(), Valid.begin(), Valid.end());
				1893	// Chop off inserted chars.
				1894	UnquotedValue = UnquotedValue.substr(i);
				1895
				1896	assert(!UnquotedValue.empty() && "Can't be empty!");
				1897
				1898	// Parse escape or line break.
				1899	switch (UnquotedValue[0]) {
				1900	case '\r':
				1901	case '\n':
				1902	Storage.push_back('\n');
				1903	if ( UnquotedValue.size() > 1
				1904	&& (UnquotedValue[1] == '\r' \|\| UnquotedValue[1] == '\n'))
				1905	UnquotedValue = UnquotedValue.substr(1);
				1906	UnquotedValue = UnquotedValue.substr(1);
				1907	break;
				1908	default:
				1909	if (UnquotedValue.size() == 1)
				1910	// TODO: Report error.
				1911	break;
				1912	UnquotedValue = UnquotedValue.substr(1);
				1913	switch (UnquotedValue[0]) {
				1914	default: {
				1915	Token T;
				1916	T.Range = StringRef(UnquotedValue.begin(), 1);
				1917	setError("Unrecognized escape code!", T);
				1918	return "";
				1919	}
				1920	case '\r':
				1921	case '\n':
				1922	// Remove the new line.
				1923	if ( UnquotedValue.size() > 1
				1924	&& (UnquotedValue[1] == '\r' \|\| UnquotedValue[1] == '\n'))
				1925	UnquotedValue = UnquotedValue.substr(1);
				1926	// If this was just a single byte newline, it will get skipped
				1927	// below.
				1928	break;
				1929	case '0':
				1930	Storage.push_back(0x00);
				1931	break;
				1932	case 'a':
				1933	Storage.push_back(0x07);
				1934	break;
				1935	case 'b':
				1936	Storage.push_back(0x08);
				1937	break;
				1938	case 't':
				1939	case 0x09:
				1940	Storage.push_back(0x09);
				1941	break;
				1942	case 'n':
				1943	Storage.push_back(0x0A);
				1944	break;
				1945	case 'v':
				1946	Storage.push_back(0x0B);
				1947	break;
				1948	case 'f':
				1949	Storage.push_back(0x0C);
				1950	break;
				1951	case 'r':
				1952	Storage.push_back(0x0D);
				1953	break;
				1954	case 'e':
				1955	Storage.push_back(0x1B);
				1956	break;
				1957	case ' ':
				1958	Storage.push_back(0x20);
				1959	break;
				1960	case '"':
				1961	Storage.push_back(0x22);
				1962	break;
				1963	case '/':
				1964	Storage.push_back(0x2F);
				1965	break;
				1966	case '\\':
				1967	Storage.push_back(0x5C);
				1968	break;
				1969	case 'N':
				1970	encodeUTF8(0x85, Storage);
				1971	break;
				1972	case '_':
				1973	encodeUTF8(0xA0, Storage);
				1974	break;
				1975	case 'L':
				1976	encodeUTF8(0x2028, Storage);
				1977	break;
				1978	case 'P':
				1979	encodeUTF8(0x2029, Storage);
				1980	break;
				1981	case 'x': {
				1982	if (UnquotedValue.size() < 3)
				1983	// TODO: Report error.
				1984	break;
Michael J. Spencer	a6c2c29	2012-04-26 19:27:11 +0000	[diff] [blame]	1985	unsigned int UnicodeScalarValue;
				1986	if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
				1987	// TODO: Report error.
				1988	UnicodeScalarValue = 0xFFFD;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	1989	encodeUTF8(UnicodeScalarValue, Storage);
				1990	UnquotedValue = UnquotedValue.substr(2);
				1991	break;
				1992	}
				1993	case 'u': {
				1994	if (UnquotedValue.size() < 5)
				1995	// TODO: Report error.
				1996	break;
Michael J. Spencer	a6c2c29	2012-04-26 19:27:11 +0000	[diff] [blame]	1997	unsigned int UnicodeScalarValue;
				1998	if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
				1999	// TODO: Report error.
				2000	UnicodeScalarValue = 0xFFFD;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2001	encodeUTF8(UnicodeScalarValue, Storage);
				2002	UnquotedValue = UnquotedValue.substr(4);
				2003	break;
				2004	}
				2005	case 'U': {
				2006	if (UnquotedValue.size() < 9)
				2007	// TODO: Report error.
				2008	break;
Michael J. Spencer	a6c2c29	2012-04-26 19:27:11 +0000	[diff] [blame]	2009	unsigned int UnicodeScalarValue;
				2010	if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
				2011	// TODO: Report error.
				2012	UnicodeScalarValue = 0xFFFD;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2013	encodeUTF8(UnicodeScalarValue, Storage);
				2014	UnquotedValue = UnquotedValue.substr(8);
				2015	break;
				2016	}
				2017	}
				2018	UnquotedValue = UnquotedValue.substr(1);
				2019	}
				2020	}
				2021	Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
				2022	return StringRef(Storage.begin(), Storage.size());
				2023	}
				2024
				2025	Node *KeyValueNode::getKey() {
				2026	if (Key)
				2027	return Key;
				2028	// Handle implicit null keys.
				2029	{
				2030	Token &t = peekNext();
				2031	if ( t.Kind == Token::TK_BlockEnd
				2032	\|\| t.Kind == Token::TK_Value
				2033	\|\| t.Kind == Token::TK_Error) {
				2034	return Key = new (getAllocator()) NullNode(Doc);
				2035	}
				2036	if (t.Kind == Token::TK_Key)
				2037	getNext(); // skip TK_Key.
				2038	}
				2039
				2040	// Handle explicit null keys.
				2041	Token &t = peekNext();
				2042	if (t.Kind == Token::TK_BlockEnd \|\| t.Kind == Token::TK_Value) {
				2043	return Key = new (getAllocator()) NullNode(Doc);
				2044	}
				2045
				2046	// We've got a normal key.
				2047	return Key = parseBlockNode();
				2048	}
				2049
				2050	Node *KeyValueNode::getValue() {
				2051	if (Value)
				2052	return Value;
				2053	getKey()->skip();
				2054	if (failed())
				2055	return Value = new (getAllocator()) NullNode(Doc);
				2056
				2057	// Handle implicit null values.
				2058	{
				2059	Token &t = peekNext();
				2060	if ( t.Kind == Token::TK_BlockEnd
				2061	\|\| t.Kind == Token::TK_FlowMappingEnd
				2062	\|\| t.Kind == Token::TK_Key
				2063	\|\| t.Kind == Token::TK_FlowEntry
				2064	\|\| t.Kind == Token::TK_Error) {
				2065	return Value = new (getAllocator()) NullNode(Doc);
				2066	}
				2067
				2068	if (t.Kind != Token::TK_Value) {
				2069	setError("Unexpected token in Key Value.", t);
				2070	return Value = new (getAllocator()) NullNode(Doc);
				2071	}
				2072	getNext(); // skip TK_Value.
				2073	}
				2074
				2075	// Handle explicit null values.
				2076	Token &t = peekNext();
				2077	if (t.Kind == Token::TK_BlockEnd \|\| t.Kind == Token::TK_Key) {
				2078	return Value = new (getAllocator()) NullNode(Doc);
				2079	}
				2080
				2081	// We got a normal value.
				2082	return Value = parseBlockNode();
				2083	}
				2084
				2085	void MappingNode::increment() {
				2086	if (failed()) {
				2087	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2088	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2089	return;
				2090	}
				2091	if (CurrentEntry) {
				2092	CurrentEntry->skip();
				2093	if (Type == MT_Inline) {
				2094	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2095	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2096	return;
				2097	}
				2098	}
				2099	Token T = peekNext();
				2100	if (T.Kind == Token::TK_Key \|\| T.Kind == Token::TK_Scalar) {
				2101	// KeyValueNode eats the TK_Key. That way it can detect null keys.
				2102	CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
				2103	} else if (Type == MT_Block) {
				2104	switch (T.Kind) {
				2105	case Token::TK_BlockEnd:
				2106	getNext();
				2107	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2108	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2109	break;
				2110	default:
				2111	setError("Unexpected token. Expected Key or Block End", T);
				2112	case Token::TK_Error:
				2113	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2114	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2115	}
				2116	} else {
				2117	switch (T.Kind) {
				2118	case Token::TK_FlowEntry:
				2119	// Eat the flow entry and recurse.
				2120	getNext();
				2121	return increment();
				2122	case Token::TK_FlowMappingEnd:
				2123	getNext();
				2124	case Token::TK_Error:
				2125	// Set this to end iterator.
				2126	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2127	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2128	break;
				2129	default:
				2130	setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
				2131	"Mapping End."
				2132	, T);
				2133	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2134	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2135	}
				2136	}
				2137	}
				2138
				2139	void SequenceNode::increment() {
				2140	if (failed()) {
				2141	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2142	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2143	return;
				2144	}
				2145	if (CurrentEntry)
				2146	CurrentEntry->skip();
				2147	Token T = peekNext();
				2148	if (SeqType == ST_Block) {
				2149	switch (T.Kind) {
				2150	case Token::TK_BlockEntry:
				2151	getNext();
				2152	CurrentEntry = parseBlockNode();
Craig Topper	8d399f8	2014-04-09 04:20:00 +0000	[diff] [blame]	2153	if (!CurrentEntry) { // An error occurred.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2154	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2155	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2156	}
				2157	break;
				2158	case Token::TK_BlockEnd:
				2159	getNext();
				2160	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2161	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2162	break;
				2163	default:
				2164	setError( "Unexpected token. Expected Block Entry or Block End."
				2165	, T);
				2166	case Token::TK_Error:
				2167	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2168	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2169	}
				2170	} else if (SeqType == ST_Indentless) {
				2171	switch (T.Kind) {
				2172	case Token::TK_BlockEntry:
				2173	getNext();
				2174	CurrentEntry = parseBlockNode();
Craig Topper	8d399f8	2014-04-09 04:20:00 +0000	[diff] [blame]	2175	if (!CurrentEntry) { // An error occurred.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2176	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2177	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2178	}
				2179	break;
				2180	default:
				2181	case Token::TK_Error:
				2182	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2183	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2184	}
				2185	} else if (SeqType == ST_Flow) {
				2186	switch (T.Kind) {
				2187	case Token::TK_FlowEntry:
				2188	// Eat the flow entry and recurse.
				2189	getNext();
				2190	WasPreviousTokenFlowEntry = true;
				2191	return increment();
				2192	case Token::TK_FlowSequenceEnd:
				2193	getNext();
				2194	case Token::TK_Error:
				2195	// Set this to end iterator.
				2196	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2197	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2198	break;
				2199	case Token::TK_StreamEnd:
				2200	case Token::TK_DocumentEnd:
				2201	case Token::TK_DocumentStart:
				2202	setError("Could not find closing ]!", T);
				2203	// Set this to end iterator.
				2204	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2205	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2206	break;
				2207	default:
				2208	if (!WasPreviousTokenFlowEntry) {
				2209	setError("Expected , between entries!", T);
				2210	IsAtEnd = true;
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2211	CurrentEntry = nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2212	break;
				2213	}
				2214	// Otherwise it must be a flow entry.
				2215	CurrentEntry = parseBlockNode();
				2216	if (!CurrentEntry) {
				2217	IsAtEnd = true;
				2218	}
				2219	WasPreviousTokenFlowEntry = false;
				2220	break;
				2221	}
				2222	}
				2223	}
				2224
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2225	Document::Document(Stream &S) : stream(S), Root(nullptr) {
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2226	// Tag maps starts with two default mappings.
				2227	TagMap["!"] = "!";
				2228	TagMap["!!"] = "tag:yaml.org,2002:";
				2229
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2230	if (parseDirectives())
				2231	expectToken(Token::TK_DocumentStart);
				2232	Token &T = peekNext();
				2233	if (T.Kind == Token::TK_DocumentStart)
				2234	getNext();
				2235	}
				2236
				2237	bool Document::skip() {
				2238	if (stream.scanner->failed())
				2239	return false;
				2240	if (!Root)
				2241	getRoot();
				2242	Root->skip();
				2243	Token &T = peekNext();
				2244	if (T.Kind == Token::TK_StreamEnd)
				2245	return false;
				2246	if (T.Kind == Token::TK_DocumentEnd) {
				2247	getNext();
				2248	return skip();
				2249	}
				2250	return true;
				2251	}
				2252
				2253	Token &Document::peekNext() {
				2254	return stream.scanner->peekNext();
				2255	}
				2256
				2257	Token Document::getNext() {
				2258	return stream.scanner->getNext();
				2259	}
				2260
				2261	void Document::setError(const Twine &Message, Token &Location) const {
				2262	stream.scanner->setError(Message, Location.Range.begin());
				2263	}
				2264
				2265	bool Document::failed() const {
				2266	return stream.scanner->failed();
				2267	}
				2268
				2269	Node *Document::parseBlockNode() {
				2270	Token T = peekNext();
				2271	// Handle properties.
				2272	Token AnchorInfo;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2273	Token TagInfo;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2274	parse_property:
				2275	switch (T.Kind) {
				2276	case Token::TK_Alias:
				2277	getNext();
				2278	return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
				2279	case Token::TK_Anchor:
				2280	if (AnchorInfo.Kind == Token::TK_Anchor) {
				2281	setError("Already encountered an anchor for this node!", T);
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2282	return nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2283	}
				2284	AnchorInfo = getNext(); // Consume TK_Anchor.
				2285	T = peekNext();
				2286	goto parse_property;
				2287	case Token::TK_Tag:
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2288	if (TagInfo.Kind == Token::TK_Tag) {
				2289	setError("Already encountered a tag for this node!", T);
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2290	return nullptr;
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2291	}
				2292	TagInfo = getNext(); // Consume TK_Tag.
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2293	T = peekNext();
				2294	goto parse_property;
				2295	default:
				2296	break;
				2297	}
				2298
				2299	switch (T.Kind) {
				2300	case Token::TK_BlockEntry:
				2301	// We got an unindented BlockEntry sequence. This is not terminated with
				2302	// a BlockEnd.
				2303	// Don't eat the TK_BlockEntry, SequenceNode needs it.
				2304	return new (NodeAllocator) SequenceNode( stream.CurrentDoc
				2305	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2306	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2307	, SequenceNode::ST_Indentless);
				2308	case Token::TK_BlockSequenceStart:
				2309	getNext();
				2310	return new (NodeAllocator)
				2311	SequenceNode( stream.CurrentDoc
				2312	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2313	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2314	, SequenceNode::ST_Block);
				2315	case Token::TK_BlockMappingStart:
				2316	getNext();
				2317	return new (NodeAllocator)
				2318	MappingNode( stream.CurrentDoc
				2319	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2320	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2321	, MappingNode::MT_Block);
				2322	case Token::TK_FlowSequenceStart:
				2323	getNext();
				2324	return new (NodeAllocator)
				2325	SequenceNode( stream.CurrentDoc
				2326	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2327	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2328	, SequenceNode::ST_Flow);
				2329	case Token::TK_FlowMappingStart:
				2330	getNext();
				2331	return new (NodeAllocator)
				2332	MappingNode( stream.CurrentDoc
				2333	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2334	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2335	, MappingNode::MT_Flow);
				2336	case Token::TK_Scalar:
				2337	getNext();
				2338	return new (NodeAllocator)
				2339	ScalarNode( stream.CurrentDoc
				2340	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2341	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2342	, T.Range);
Benjamin Kramer	7236733	2015-05-18 21:11:27 +0000	[diff] [blame]	2343	case Token::TK_BlockScalar: {
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	2344	getNext();
Alex Lorenz	481dca2	2015-05-21 19:45:02 +0000	[diff] [blame]	2345	StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
				2346	StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
Alex Lorenz	a22b250c	2015-05-13 23:10:51 +0000	[diff] [blame]	2347	return new (NodeAllocator)
				2348	BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
Benjamin Kramer	7236733	2015-05-18 21:11:27 +0000	[diff] [blame]	2349	TagInfo.Range, StrCopy, T.Range);
				2350	}
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2351	case Token::TK_Key:
				2352	// Don't eat the TK_Key, KeyValueNode expects it.
				2353	return new (NodeAllocator)
				2354	MappingNode( stream.CurrentDoc
				2355	, AnchorInfo.Range.substr(1)
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2356	, TagInfo.Range
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2357	, MappingNode::MT_Inline);
				2358	case Token::TK_DocumentStart:
				2359	case Token::TK_DocumentEnd:
				2360	case Token::TK_StreamEnd:
				2361	default:
				2362	// TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
				2363	// !!null null.
				2364	return new (NodeAllocator) NullNode(stream.CurrentDoc);
				2365	case Token::TK_Error:
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2366	return nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2367	}
				2368	llvm_unreachable("Control flow shouldn't reach here.");
Craig Topper	c10719f	2014-04-07 04:17:22 +0000	[diff] [blame]	2369	return nullptr;
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2370	}
				2371
				2372	bool Document::parseDirectives() {
				2373	bool isDirective = false;
				2374	while (true) {
				2375	Token T = peekNext();
				2376	if (T.Kind == Token::TK_TagDirective) {
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2377	parseTAGDirective();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2378	isDirective = true;
				2379	} else if (T.Kind == Token::TK_VersionDirective) {
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2380	parseYAMLDirective();
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2381	isDirective = true;
				2382	} else
				2383	break;
				2384	}
				2385	return isDirective;
				2386	}
				2387
Michael J. Spencer	c064a9a	2013-10-18 22:38:04 +0000	[diff] [blame]	2388	void Document::parseYAMLDirective() {
				2389	getNext(); // Eat %YAML <version>
				2390	}
				2391
				2392	void Document::parseTAGDirective() {
				2393	Token Tag = getNext(); // %TAG <handle> <prefix>
				2394	StringRef T = Tag.Range;
				2395	// Strip %TAG
				2396	T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
				2397	std::size_t HandleEnd = T.find_first_of(" \t");
				2398	StringRef TagHandle = T.substr(0, HandleEnd);
				2399	StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
				2400	TagMap[TagHandle] = TagPrefix;
				2401	}
				2402
Michael J. Spencer	22120c4	2012-04-03 23:09:22 +0000	[diff] [blame]	2403	bool Document::expectToken(int TK) {
				2404	Token T = getNext();
				2405	if (T.Kind != TK) {
				2406	setError("Unexpected token", T);
				2407	return false;
				2408	}
				2409	return true;
				2410	}