blob: 4b67536ce4c1a97b4a28f3b695c1ef5b9cae08e5 [file] [log] [blame]
Sam McCall6be38242018-07-09 10:05:41 +00001//=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
2//
Chandler Carruth2946cd72019-01-19 08:50:56 +00003// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Sam McCall6be38242018-07-09 10:05:41 +00006//
7//===---------------------------------------------------------------------===//
8
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
13
14namespace llvm {
15namespace json {
16
17Value &Object::operator[](const ObjectKey &K) {
18 return try_emplace(K, nullptr).first->getSecond();
19}
20Value &Object::operator[](ObjectKey &&K) {
21 return try_emplace(std::move(K), nullptr).first->getSecond();
22}
23Value *Object::get(StringRef K) {
24 auto I = find(K);
25 if (I == end())
26 return nullptr;
27 return &I->second;
28}
29const Value *Object::get(StringRef K) const {
30 auto I = find(K);
31 if (I == end())
32 return nullptr;
33 return &I->second;
34}
35llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
36 if (auto *V = get(K))
37 return V->getAsNull();
38 return llvm::None;
39}
40llvm::Optional<bool> Object::getBoolean(StringRef K) const {
41 if (auto *V = get(K))
42 return V->getAsBoolean();
43 return llvm::None;
44}
45llvm::Optional<double> Object::getNumber(StringRef K) const {
46 if (auto *V = get(K))
47 return V->getAsNumber();
48 return llvm::None;
49}
50llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
51 if (auto *V = get(K))
52 return V->getAsInteger();
53 return llvm::None;
54}
55llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
56 if (auto *V = get(K))
57 return V->getAsString();
58 return llvm::None;
59}
60const json::Object *Object::getObject(StringRef K) const {
61 if (auto *V = get(K))
62 return V->getAsObject();
63 return nullptr;
64}
65json::Object *Object::getObject(StringRef K) {
66 if (auto *V = get(K))
67 return V->getAsObject();
68 return nullptr;
69}
70const json::Array *Object::getArray(StringRef K) const {
71 if (auto *V = get(K))
72 return V->getAsArray();
73 return nullptr;
74}
75json::Array *Object::getArray(StringRef K) {
76 if (auto *V = get(K))
77 return V->getAsArray();
78 return nullptr;
79}
80bool operator==(const Object &LHS, const Object &RHS) {
81 if (LHS.size() != RHS.size())
82 return false;
83 for (const auto &L : LHS) {
84 auto R = RHS.find(L.first);
85 if (R == RHS.end() || L.second != R->second)
86 return false;
87 }
88 return true;
89}
90
91Array::Array(std::initializer_list<Value> Elements) {
92 V.reserve(Elements.size());
93 for (const Value &V : Elements) {
94 emplace_back(nullptr);
95 back().moveFrom(std::move(V));
96 }
97}
98
99Value::Value(std::initializer_list<Value> Elements)
100 : Value(json::Array(Elements)) {}
101
102void Value::copyFrom(const Value &M) {
103 Type = M.Type;
104 switch (Type) {
105 case T_Null:
106 case T_Boolean:
Sam McCalld93eaeb2018-07-09 12:16:40 +0000107 case T_Double:
108 case T_Integer:
Sam McCall6be38242018-07-09 10:05:41 +0000109 memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
110 break;
111 case T_StringRef:
112 create<StringRef>(M.as<StringRef>());
113 break;
114 case T_String:
115 create<std::string>(M.as<std::string>());
116 break;
117 case T_Object:
118 create<json::Object>(M.as<json::Object>());
119 break;
120 case T_Array:
121 create<json::Array>(M.as<json::Array>());
122 break;
123 }
124}
125
126void Value::moveFrom(const Value &&M) {
127 Type = M.Type;
128 switch (Type) {
129 case T_Null:
130 case T_Boolean:
Sam McCalld93eaeb2018-07-09 12:16:40 +0000131 case T_Double:
132 case T_Integer:
Sam McCall6be38242018-07-09 10:05:41 +0000133 memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
134 break;
135 case T_StringRef:
136 create<StringRef>(M.as<StringRef>());
137 break;
138 case T_String:
139 create<std::string>(std::move(M.as<std::string>()));
140 M.Type = T_Null;
141 break;
142 case T_Object:
143 create<json::Object>(std::move(M.as<json::Object>()));
144 M.Type = T_Null;
145 break;
146 case T_Array:
147 create<json::Array>(std::move(M.as<json::Array>()));
148 M.Type = T_Null;
149 break;
150 }
151}
152
153void Value::destroy() {
154 switch (Type) {
155 case T_Null:
156 case T_Boolean:
Sam McCalld93eaeb2018-07-09 12:16:40 +0000157 case T_Double:
158 case T_Integer:
Sam McCall6be38242018-07-09 10:05:41 +0000159 break;
160 case T_StringRef:
161 as<StringRef>().~StringRef();
162 break;
163 case T_String:
164 as<std::string>().~basic_string();
165 break;
166 case T_Object:
167 as<json::Object>().~Object();
168 break;
169 case T_Array:
170 as<json::Array>().~Array();
171 break;
172 }
173}
174
175bool operator==(const Value &L, const Value &R) {
176 if (L.kind() != R.kind())
177 return false;
178 switch (L.kind()) {
179 case Value::Null:
180 return *L.getAsNull() == *R.getAsNull();
181 case Value::Boolean:
182 return *L.getAsBoolean() == *R.getAsBoolean();
183 case Value::Number:
184 return *L.getAsNumber() == *R.getAsNumber();
185 case Value::String:
186 return *L.getAsString() == *R.getAsString();
187 case Value::Array:
188 return *L.getAsArray() == *R.getAsArray();
189 case Value::Object:
190 return *L.getAsObject() == *R.getAsObject();
191 }
192 llvm_unreachable("Unknown value kind");
193}
194
195namespace {
196// Simple recursive-descent JSON parser.
197class Parser {
198public:
199 Parser(StringRef JSON)
200 : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
201
Sam McCalle6057bc2018-07-10 11:51:26 +0000202 bool checkUTF8() {
203 size_t ErrOffset;
204 if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
205 return true;
206 P = Start + ErrOffset; // For line/column calculation.
207 return parseError("Invalid UTF-8 sequence");
208 }
209
Sam McCall6be38242018-07-09 10:05:41 +0000210 bool parseValue(Value &Out);
211
212 bool assertEnd() {
213 eatWhitespace();
214 if (P == End)
215 return true;
216 return parseError("Text after end of document");
217 }
218
219 Error takeError() {
220 assert(Err);
221 return std::move(*Err);
222 }
223
224private:
225 void eatWhitespace() {
226 while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
227 ++P;
228 }
229
230 // On invalid syntax, parseX() functions return false and set Err.
Sam McCalld93eaeb2018-07-09 12:16:40 +0000231 bool parseNumber(char First, Value &Out);
Sam McCall6be38242018-07-09 10:05:41 +0000232 bool parseString(std::string &Out);
233 bool parseUnicode(std::string &Out);
234 bool parseError(const char *Msg); // always returns false
235
236 char next() { return P == End ? 0 : *P++; }
237 char peek() { return P == End ? 0 : *P; }
238 static bool isNumber(char C) {
239 return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
240 C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
241 C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
242 }
243
244 Optional<Error> Err;
245 const char *Start, *P, *End;
246};
247
248bool Parser::parseValue(Value &Out) {
249 eatWhitespace();
250 if (P == End)
251 return parseError("Unexpected EOF");
252 switch (char C = next()) {
253 // Bare null/true/false are easy - first char identifies them.
254 case 'n':
255 Out = nullptr;
256 return (next() == 'u' && next() == 'l' && next() == 'l') ||
257 parseError("Invalid JSON value (null?)");
258 case 't':
259 Out = true;
260 return (next() == 'r' && next() == 'u' && next() == 'e') ||
261 parseError("Invalid JSON value (true?)");
262 case 'f':
263 Out = false;
264 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
265 parseError("Invalid JSON value (false?)");
266 case '"': {
267 std::string S;
268 if (parseString(S)) {
269 Out = std::move(S);
270 return true;
271 }
272 return false;
273 }
274 case '[': {
275 Out = Array{};
276 Array &A = *Out.getAsArray();
277 eatWhitespace();
278 if (peek() == ']') {
279 ++P;
280 return true;
281 }
282 for (;;) {
283 A.emplace_back(nullptr);
284 if (!parseValue(A.back()))
285 return false;
286 eatWhitespace();
287 switch (next()) {
288 case ',':
289 eatWhitespace();
290 continue;
291 case ']':
292 return true;
293 default:
294 return parseError("Expected , or ] after array element");
295 }
296 }
297 }
298 case '{': {
299 Out = Object{};
300 Object &O = *Out.getAsObject();
301 eatWhitespace();
302 if (peek() == '}') {
303 ++P;
304 return true;
305 }
306 for (;;) {
307 if (next() != '"')
308 return parseError("Expected object key");
309 std::string K;
310 if (!parseString(K))
311 return false;
312 eatWhitespace();
313 if (next() != ':')
314 return parseError("Expected : after object key");
315 eatWhitespace();
316 if (!parseValue(O[std::move(K)]))
317 return false;
318 eatWhitespace();
319 switch (next()) {
320 case ',':
321 eatWhitespace();
322 continue;
323 case '}':
324 return true;
325 default:
326 return parseError("Expected , or } after object property");
327 }
328 }
329 }
330 default:
Sam McCalld93eaeb2018-07-09 12:16:40 +0000331 if (isNumber(C))
332 return parseNumber(C, Out);
Sam McCall6be38242018-07-09 10:05:41 +0000333 return parseError("Invalid JSON value");
334 }
335}
336
Sam McCalld93eaeb2018-07-09 12:16:40 +0000337bool Parser::parseNumber(char First, Value &Out) {
338 // Read the number into a string. (Must be null-terminated for strto*).
Sam McCall6be38242018-07-09 10:05:41 +0000339 SmallString<24> S;
340 S.push_back(First);
341 while (isNumber(peek()))
342 S.push_back(next());
343 char *End;
Sam McCalld93eaeb2018-07-09 12:16:40 +0000344 // Try first to parse as integer, and if so preserve full 64 bits.
345 // strtoll returns long long >= 64 bits, so check it's in range too.
346 auto I = std::strtoll(S.c_str(), &End, 10);
347 if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
348 I <= std::numeric_limits<int64_t>::max()) {
349 Out = int64_t(I);
350 return true;
351 }
352 // If it's not an integer
Sam McCall6be38242018-07-09 10:05:41 +0000353 Out = std::strtod(S.c_str(), &End);
354 return End == S.end() || parseError("Invalid JSON value (number?)");
355}
356
357bool Parser::parseString(std::string &Out) {
358 // leading quote was already consumed.
359 for (char C = next(); C != '"'; C = next()) {
360 if (LLVM_UNLIKELY(P == End))
361 return parseError("Unterminated string");
362 if (LLVM_UNLIKELY((C & 0x1f) == C))
363 return parseError("Control character in string");
364 if (LLVM_LIKELY(C != '\\')) {
365 Out.push_back(C);
366 continue;
367 }
368 // Handle escape sequence.
369 switch (C = next()) {
370 case '"':
371 case '\\':
372 case '/':
373 Out.push_back(C);
374 break;
375 case 'b':
376 Out.push_back('\b');
377 break;
378 case 'f':
379 Out.push_back('\f');
380 break;
381 case 'n':
382 Out.push_back('\n');
383 break;
384 case 'r':
385 Out.push_back('\r');
386 break;
387 case 't':
388 Out.push_back('\t');
389 break;
390 case 'u':
391 if (!parseUnicode(Out))
392 return false;
393 break;
394 default:
395 return parseError("Invalid escape sequence");
396 }
397 }
398 return true;
399}
400
401static void encodeUtf8(uint32_t Rune, std::string &Out) {
402 if (Rune < 0x80) {
403 Out.push_back(Rune & 0x7F);
404 } else if (Rune < 0x800) {
405 uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
406 uint8_t SecondByte = 0x80 | (Rune & 0x3F);
407 Out.push_back(FirstByte);
408 Out.push_back(SecondByte);
409 } else if (Rune < 0x10000) {
410 uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
411 uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
412 uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
413 Out.push_back(FirstByte);
414 Out.push_back(SecondByte);
415 Out.push_back(ThirdByte);
416 } else if (Rune < 0x110000) {
417 uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
418 uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
419 uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
420 uint8_t FourthByte = 0x80 | (Rune & 0x3F);
421 Out.push_back(FirstByte);
422 Out.push_back(SecondByte);
423 Out.push_back(ThirdByte);
424 Out.push_back(FourthByte);
425 } else {
426 llvm_unreachable("Invalid codepoint");
427 }
428}
429
430// Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
431// May parse several sequential escapes to ensure proper surrogate handling.
432// We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
433// These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
434bool Parser::parseUnicode(std::string &Out) {
435 // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
436 auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
437 // Decodes 4 hex digits from the stream into Out, returns false on error.
438 auto Parse4Hex = [this](uint16_t &Out) -> bool {
439 Out = 0;
440 char Bytes[] = {next(), next(), next(), next()};
441 for (unsigned char C : Bytes) {
442 if (!std::isxdigit(C))
443 return parseError("Invalid \\u escape sequence");
444 Out <<= 4;
445 Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
446 }
447 return true;
448 };
449 uint16_t First; // UTF-16 code unit from the first \u escape.
450 if (!Parse4Hex(First))
451 return false;
452
453 // We loop to allow proper surrogate-pair error handling.
454 while (true) {
455 // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
456 if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
457 encodeUtf8(First, Out);
458 return true;
459 }
460
461 // Case 2: it's an (unpaired) trailing surrogate.
462 if (LLVM_UNLIKELY(First >= 0xDC00)) {
463 Invalid();
464 return true;
465 }
466
467 // Case 3: it's a leading surrogate. We expect a trailing one next.
468 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
Sam McCalle6057bc2018-07-10 11:51:26 +0000469 if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
Sam McCall6be38242018-07-09 10:05:41 +0000470 Invalid(); // Leading surrogate was unpaired.
471 return true;
472 }
473 P += 2;
474 uint16_t Second;
475 if (!Parse4Hex(Second))
476 return false;
477 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
478 if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
479 Invalid(); // Leading surrogate was unpaired.
480 First = Second; // Second escape still needs to be processed.
481 continue;
482 }
483 // Case 3c: a valid surrogate pair encoding an astral codepoint.
484 encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
485 return true;
486 }
487}
488
489bool Parser::parseError(const char *Msg) {
490 int Line = 1;
491 const char *StartOfLine = Start;
492 for (const char *X = Start; X < P; ++X) {
493 if (*X == 0x0A) {
494 ++Line;
495 StartOfLine = X + 1;
496 }
497 }
498 Err.emplace(
499 llvm::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
500 return false;
501}
502} // namespace
503
504Expected<Value> parse(StringRef JSON) {
505 Parser P(JSON);
506 Value E = nullptr;
Sam McCalle6057bc2018-07-10 11:51:26 +0000507 if (P.checkUTF8())
508 if (P.parseValue(E))
509 if (P.assertEnd())
510 return std::move(E);
Sam McCall6be38242018-07-09 10:05:41 +0000511 return P.takeError();
512}
513char ParseError::ID = 0;
514
515static std::vector<const Object::value_type *> sortedElements(const Object &O) {
516 std::vector<const Object::value_type *> Elements;
517 for (const auto &E : O)
518 Elements.push_back(&E);
Fangrui Song0cac7262018-09-27 02:13:45 +0000519 llvm::sort(Elements,
Sam McCall6be38242018-07-09 10:05:41 +0000520 [](const Object::value_type *L, const Object::value_type *R) {
521 return L->first < R->first;
522 });
523 return Elements;
524}
525
Sam McCalle6057bc2018-07-10 11:51:26 +0000526bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
527 // Fast-path for ASCII, which is valid UTF-8.
528 if (LLVM_LIKELY(isASCII(S)))
529 return true;
530
531 const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
532 if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
533 return true;
534
535 if (ErrOffset)
536 *ErrOffset = Rest - Data;
537 return false;
538}
539
540std::string fixUTF8(llvm::StringRef S) {
541 // This isn't particularly efficient, but is only for error-recovery.
542 std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
543 const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
544 UTF32 *Out32 = Codepoints.data();
545 ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
546 lenientConversion);
547 Codepoints.resize(Out32 - Codepoints.data());
548 std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
549 const UTF32 *In32 = Codepoints.data();
550 UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
551 ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
552 strictConversion);
553 Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
554 return Res;
555}
556
Sam McCall6be38242018-07-09 10:05:41 +0000557} // namespace json
558} // namespace llvm
559
560static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
561 OS << '\"';
562 for (unsigned char C : S) {
563 if (C == 0x22 || C == 0x5C)
564 OS << '\\';
565 if (C >= 0x20) {
566 OS << C;
567 continue;
568 }
569 OS << '\\';
570 switch (C) {
571 // A few characters are common enough to make short escapes worthwhile.
572 case '\t':
573 OS << 't';
574 break;
575 case '\n':
576 OS << 'n';
577 break;
578 case '\r':
579 OS << 'r';
580 break;
581 default:
582 OS << 'u';
583 llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
584 break;
585 }
586 }
587 OS << '\"';
588}
589
// Formatting events that Value::print reports to its indenter callback.
enum IndenterAction {
  Indent,  // Entering an object or array.
  Outdent, // Leaving an object or array.
  Newline, // Before each element, and before a non-empty closing bracket.
  Space,   // After the ':' that follows an object key.
};
596
597// Prints JSON. The indenter can be used to control formatting.
598template <typename Indenter>
599void llvm::json::Value::print(raw_ostream &OS, const Indenter &I) const {
600 switch (Type) {
601 case T_Null:
602 OS << "null";
603 break;
604 case T_Boolean:
605 OS << (as<bool>() ? "true" : "false");
606 break;
Sam McCalld93eaeb2018-07-09 12:16:40 +0000607 case T_Double:
608 OS << format("%.*g", std::numeric_limits<double>::max_digits10,
609 as<double>());
610 break;
611 case T_Integer:
612 OS << as<int64_t>();
Sam McCall6be38242018-07-09 10:05:41 +0000613 break;
614 case T_StringRef:
615 quote(OS, as<StringRef>());
616 break;
617 case T_String:
618 quote(OS, as<std::string>());
619 break;
620 case T_Object: {
621 bool Comma = false;
622 OS << '{';
623 I(Indent);
624 for (const auto *P : sortedElements(as<json::Object>())) {
625 if (Comma)
626 OS << ',';
627 Comma = true;
628 I(Newline);
629 quote(OS, P->first);
630 OS << ':';
631 I(Space);
632 P->second.print(OS, I);
633 }
634 I(Outdent);
635 if (Comma)
636 I(Newline);
637 OS << '}';
638 break;
639 }
640 case T_Array: {
641 bool Comma = false;
642 OS << '[';
643 I(Indent);
644 for (const auto &E : as<json::Array>()) {
645 if (Comma)
646 OS << ',';
647 Comma = true;
648 I(Newline);
649 E.print(OS, I);
650 }
651 I(Outdent);
652 if (Comma)
653 I(Newline);
654 OS << ']';
655 break;
656 }
657 }
658}
659
660void llvm::format_provider<llvm::json::Value>::format(
661 const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
662 if (Options.empty()) {
663 OS << E;
664 return;
665 }
666 unsigned IndentAmount = 0;
667 if (Options.getAsInteger(/*Radix=*/10, IndentAmount))
668 llvm_unreachable("json::Value format options should be an integer");
669 unsigned IndentLevel = 0;
670 E.print(OS, [&](IndenterAction A) {
671 switch (A) {
672 case Newline:
673 OS << '\n';
674 OS.indent(IndentLevel);
675 break;
676 case Space:
677 OS << ' ';
678 break;
679 case Indent:
680 IndentLevel += IndentAmount;
681 break;
682 case Outdent:
683 IndentLevel -= IndentAmount;
684 break;
685 };
686 });
687}
688
689llvm::raw_ostream &llvm::json::operator<<(raw_ostream &OS, const Value &E) {
690 E.print(OS, [](IndenterAction A) { /*ignore*/ });
691 return OS;
692}