blob: cfe4eb230f9cde7613ea97bc03b1c175a163567f [file] [log] [blame]
/*
www.sourceforge.net/projects/tinyxml
Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.

Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.

2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source
distribution.
*/
24
25#include "tinyxml.h"
26#include <ctype.h>
27#include <stddef.h>
28
// Uncomment the next line to trace which node type the parser picks
// for every tag it identifies.
//#define DEBUG_PARSER
#ifdef DEBUG_PARSER
    // Debug builds under MSVC log to the debugger output window;
    // everything else logs through printf.
#   if defined( DEBUG ) && defined( _MSC_VER )
#       include <windows.h>
#       define TIXML_LOG OutputDebugString
#   else
#       define TIXML_LOG printf
#   endif
#endif
Upstreambc0ee9a1970-01-12 13:46:40 +000038
39// Note tha "PutString" hardcodes the same list. This
40// is less flexible than it appears. Changing the entries
41// or order will break putstring.
42TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
43{
44 { "&amp;", 5, '&' },
45 { "&lt;", 4, '<' },
46 { "&gt;", 4, '>' },
47 { "&quot;", 6, '\"' },
48 { "&apos;", 6, '\'' }
49};
50
// Bunch of unicode info at:
//     http://www.unicode.org/faq/utf_bom.html
// Including the basics of this table, which determines the number of bytes
// in the sequence from the lead byte. 1 is placed for invalid sequences --
// although the result will be junk, pass it through as much as possible.
// Beware of the non-characters in UTF-8:
//     ef bb bf (Microsoft "lead bytes")
//     ef bf be
//     ef bf bf
60
// The three bytes of the UTF-8 encoded byte order mark (EF BB BF).
// They are integral constants so they can be used as case labels.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;   // first byte
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;   // second byte
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;   // third byte
64
65const int TiXmlBase::utf8ByteTable[256] =
66{
67 // 0 1 2 3 4 5 6 7 8 9 a b c d e f
68 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
72 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
77 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
78 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
79 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
80 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
82 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
83 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
84};
85
86
87void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
88{
89 const unsigned long BYTE_MASK = 0xBF;
90 const unsigned long BYTE_MARK = 0x80;
91 const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
92
93 if (input < 0x80)
94 *length = 1;
95 else if ( input < 0x800 )
96 *length = 2;
97 else if ( input < 0x10000 )
98 *length = 3;
99 else if ( input < 0x200000 )
100 *length = 4;
101 else
102 { *length = 0; return; } // This code won't covert this correctly anyway.
103
104 output += *length;
105
106 // Scary scary fall throughs.
107 switch (*length)
108 {
109 case 4:
110 --output;
111 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
112 input >>= 6;
113 case 3:
114 --output;
115 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
116 input >>= 6;
117 case 2:
118 --output;
119 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
120 input >>= 6;
121 case 1:
122 --output;
123 *output = (char)(input | FIRST_BYTE_MARK[*length]);
124 }
125}
126
127
128/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
129{
130 // This will only work for low-ascii, everything else is assumed to be a valid
131 // letter. I'm not sure this is the best approach, but it is quite tricky trying
132 // to figure out alhabetical vs. not across encoding. So take a very
133 // conservative approach.
134
135// if ( encoding == TIXML_ENCODING_UTF8 )
136// {
137 if ( anyByte < 127 )
138 return isalpha( anyByte );
139 else
140 return 1; // What else to do? The unicode set is huge...get the english ones right.
141// }
142// else
143// {
144// return isalpha( anyByte );
145// }
146}
147
148
149/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
150{
151 // This will only work for low-ascii, everything else is assumed to be a valid
152 // letter. I'm not sure this is the best approach, but it is quite tricky trying
153 // to figure out alhabetical vs. not across encoding. So take a very
154 // conservative approach.
155
156// if ( encoding == TIXML_ENCODING_UTF8 )
157// {
158 if ( anyByte < 127 )
159 return isalnum( anyByte );
160 else
161 return 1; // What else to do? The unicode set is huge...get the english ones right.
162// }
163// else
164// {
165// return isalnum( anyByte );
166// }
167}
168
169
170class TiXmlParsingData
171{
172 friend class TiXmlDocument;
173 public:
174 void Stamp( const char* now, TiXmlEncoding encoding );
175
176 const TiXmlCursor& Cursor() { return cursor; }
177
178 private:
179 // Only used by the document!
180 TiXmlParsingData( const char* start, int _tabsize, int row, int col )
181 {
182 assert( start );
183 stamp = start;
184 tabsize = _tabsize;
185 cursor.row = row;
186 cursor.col = col;
187 }
188
189 TiXmlCursor cursor;
190 const char* stamp;
191 int tabsize;
192};
193
194
195void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
196{
197 assert( now );
198
199 // Do nothing if the tabsize is 0.
200 if ( tabsize < 1 )
201 {
202 return;
203 }
204
205 // Get the current row, column.
206 int row = cursor.row;
207 int col = cursor.col;
208 const char* p = stamp;
209 assert( p );
210
211 while ( p < now )
212 {
213 // Treat p as unsigned, so we have a happy compiler.
214 const unsigned char* pU = (const unsigned char*)p;
215
216 // Code contributed by Fletcher Dunn: (modified by lee)
217 switch (*pU) {
218 case 0:
219 // We *should* never get here, but in case we do, don't
220 // advance past the terminating null character, ever
221 return;
222
223 case '\r':
224 // bump down to the next line
225 ++row;
226 col = 0;
227 // Eat the character
228 ++p;
229
230 // Check for \r\n sequence, and treat this as a single character
231 if (*p == '\n') {
232 ++p;
233 }
234 break;
235
236 case '\n':
237 // bump down to the next line
238 ++row;
239 col = 0;
240
241 // Eat the character
242 ++p;
243
244 // Check for \n\r sequence, and treat this as a single
245 // character. (Yes, this bizarre thing does occur still
246 // on some arcane platforms...)
247 if (*p == '\r') {
248 ++p;
249 }
250 break;
251
252 case '\t':
253 // Eat the character
254 ++p;
255
256 // Skip to next tab stop
257 col = (col / tabsize + 1) * tabsize;
258 break;
259
260 case TIXML_UTF_LEAD_0:
261 if ( encoding == TIXML_ENCODING_UTF8 )
262 {
263 if ( *(p+1) && *(p+2) )
264 {
265 // In these cases, don't advance the column. These are
266 // 0-width spaces.
267 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
268 p += 3;
269 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
270 p += 3;
271 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
272 p += 3;
273 else
274 { p +=3; ++col; } // A normal character.
275 }
Christian Voegla9fb1072021-10-27 11:25:18 +0200276 else
277 {
278 // TIXML_UTF_LEAD_0 (239) is the start character of a 3 byte sequence, so
279 // there is something wrong here. Just advance the pointer to evade infinite loops
280 ++p;
281 }
Upstreambc0ee9a1970-01-12 13:46:40 +0000282 }
283 else
284 {
285 ++p;
286 ++col;
287 }
288 break;
289
290 default:
291 if ( encoding == TIXML_ENCODING_UTF8 )
292 {
293 // Eat the 1 to 4 byte utf8 character.
294 int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];
295 if ( step == 0 )
296 step = 1; // Error case from bad encoding, but handle gracefully.
297 p += step;
298
299 // Just advance one column, of course.
300 ++col;
301 }
302 else
303 {
304 ++p;
305 ++col;
306 }
307 break;
308 }
309 }
310 cursor.row = row;
311 cursor.col = col;
312 assert( cursor.row >= -1 );
313 assert( cursor.col >= -1 );
314 stamp = p;
315 assert( stamp );
316}
317
318
319const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
320{
321 if ( !p || !*p )
322 {
323 return 0;
324 }
325 if ( encoding == TIXML_ENCODING_UTF8 )
326 {
327 while ( *p )
328 {
329 const unsigned char* pU = (const unsigned char*)p;
330
331 // Skip the stupid Microsoft UTF-8 Byte order marks
332 if ( *(pU+0)==TIXML_UTF_LEAD_0
333 && *(pU+1)==TIXML_UTF_LEAD_1
334 && *(pU+2)==TIXML_UTF_LEAD_2 )
335 {
336 p += 3;
337 continue;
338 }
339 else if(*(pU+0)==TIXML_UTF_LEAD_0
340 && *(pU+1)==0xbfU
341 && *(pU+2)==0xbeU )
342 {
343 p += 3;
344 continue;
345 }
346 else if(*(pU+0)==TIXML_UTF_LEAD_0
347 && *(pU+1)==0xbfU
348 && *(pU+2)==0xbfU )
349 {
350 p += 3;
351 continue;
352 }
353
354 if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space.
355 ++p;
356 else
357 break;
358 }
359 }
360 else
361 {
362 while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
363 ++p;
364 }
365
366 return p;
367}
368
369#ifdef TIXML_USE_STL
370/*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
371{
372 for( ;; )
373 {
374 if ( !in->good() ) return false;
375
376 int c = in->peek();
377 // At this scope, we can't get to a document. So fail silently.
378 if ( !IsWhiteSpace( c ) || c <= 0 )
379 return true;
380
381 *tag += (char) in->get();
382 }
383}
384
385/*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
386{
387 //assert( character > 0 && character < 128 ); // else it won't work in utf-8
388 while ( in->good() )
389 {
390 int c = in->peek();
391 if ( c == character )
392 return true;
393 if ( c <= 0 ) // Silent failure: can't get document at this scope
394 return false;
395
396 in->get();
397 *tag += (char) c;
398 }
399 return false;
400}
401#endif
402
403const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
404{
405 *name = "";
406 assert( p );
407
408 // Names start with letters or underscores.
409 // Of course, in unicode, tinyxml has no idea what a letter *is*. The
410 // algorithm is generous.
411 //
412 // After that, they can be letters, underscores, numbers,
413 // hyphens, or colons. (Colons are valid ony for namespaces,
414 // but tinyxml can't tell namespaces from names.)
415 if ( p && *p
416 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
417 {
418 while( p && *p
419 && ( IsAlphaNum( (unsigned char ) *p, encoding )
420 || *p == '_'
421 || *p == '-'
422 || *p == '.'
423 || *p == ':' ) )
424 {
425 (*name) += *p;
426 ++p;
427 }
428 return p;
429 }
430 return 0;
431}
432
433const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
434{
435 // Presume an entity, and pull it out.
436 TIXML_STRING ent;
437 int i;
438 *length = 0;
439
440 if ( *(p+1) && *(p+1) == '#' && *(p+2) )
441 {
442 unsigned long ucs = 0;
443 ptrdiff_t delta = 0;
444 unsigned mult = 1;
445
446 if ( *(p+2) == 'x' )
447 {
448 // Hexadecimal.
449 if ( !*(p+3) ) return 0;
450
451 const char* q = p+3;
452 q = strchr( q, ';' );
453
454 if ( !q || !*q ) return 0;
455
456 delta = q-p;
457 --q;
458
459 while ( *q != 'x' )
460 {
461 if ( *q >= '0' && *q <= '9' )
462 ucs += mult * (*q - '0');
463 else if ( *q >= 'a' && *q <= 'f' )
464 ucs += mult * (*q - 'a' + 10);
465 else if ( *q >= 'A' && *q <= 'F' )
466 ucs += mult * (*q - 'A' + 10 );
467 else
468 return 0;
469 mult *= 16;
470 --q;
471 }
472 }
473 else
474 {
475 // Decimal.
476 if ( !*(p+2) ) return 0;
477
478 const char* q = p+2;
479 q = strchr( q, ';' );
480
481 if ( !q || !*q ) return 0;
482
483 delta = q-p;
484 --q;
485
486 while ( *q != '#' )
487 {
488 if ( *q >= '0' && *q <= '9' )
489 ucs += mult * (*q - '0');
490 else
491 return 0;
492 mult *= 10;
493 --q;
494 }
495 }
496 if ( encoding == TIXML_ENCODING_UTF8 )
497 {
498 // convert the UCS to UTF-8
499 ConvertUTF32ToUTF8( ucs, value, length );
500 }
501 else
502 {
503 *value = (char)ucs;
504 *length = 1;
505 }
506 return p + delta + 1;
507 }
508
509 // Now try to match it.
510 for( i=0; i<NUM_ENTITY; ++i )
511 {
512 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
513 {
514 assert( strlen( entity[i].str ) == entity[i].strLength );
515 *value = entity[i].chr;
516 *length = 1;
517 return ( p + entity[i].strLength );
518 }
519 }
520
521 // So it wasn't an entity, its unrecognized, or something like that.
522 *value = *p; // Don't put back the last one, since we return it!
523 return p+1;
524}
525
526
527bool TiXmlBase::StringEqual( const char* p,
528 const char* tag,
529 bool ignoreCase,
530 TiXmlEncoding encoding )
531{
532 assert( p );
533 assert( tag );
534 if ( !p || !*p )
535 {
536 assert( 0 );
537 return false;
538 }
539
540 const char* q = p;
541
542 if ( ignoreCase )
543 {
544 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
545 {
546 ++q;
547 ++tag;
548 }
549
550 if ( *tag == 0 )
551 return true;
552 }
553 else
554 {
555 while ( *q && *tag && *q == *tag )
556 {
557 ++q;
558 ++tag;
559 }
560
561 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
562 return true;
563 }
564 return false;
565}
566
567const char* TiXmlBase::ReadText( const char* p,
568 TIXML_STRING * text,
569 bool trimWhiteSpace,
570 const char* endTag,
571 bool caseInsensitive,
572 TiXmlEncoding encoding )
573{
574 *text = "";
575 if ( !trimWhiteSpace // certain tags always keep whitespace
576 || !condenseWhiteSpace ) // if true, whitespace is always kept
577 {
578 // Keep all the white space.
579 while ( p && *p
580 && !StringEqual( p, endTag, caseInsensitive, encoding )
581 )
582 {
583 int len;
584 char cArr[4] = { 0, 0, 0, 0 };
585 p = GetChar( p, cArr, &len, encoding );
586 text->append( cArr, len );
587 }
588 }
589 else
590 {
591 bool whitespace = false;
592
593 // Remove leading white space:
594 p = SkipWhiteSpace( p, encoding );
595 while ( p && *p
596 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
597 {
598 if ( *p == '\r' || *p == '\n' )
599 {
600 whitespace = true;
601 ++p;
602 }
603 else if ( IsWhiteSpace( *p ) )
604 {
605 whitespace = true;
606 ++p;
607 }
608 else
609 {
610 // If we've found whitespace, add it before the
611 // new character. Any whitespace just becomes a space.
612 if ( whitespace )
613 {
614 (*text) += ' ';
615 whitespace = false;
616 }
617 int len;
618 char cArr[4] = { 0, 0, 0, 0 };
619 p = GetChar( p, cArr, &len, encoding );
620 if ( len == 1 )
621 (*text) += cArr[0]; // more efficient
622 else
623 text->append( cArr, len );
624 }
625 }
626 }
627 return p + strlen( endTag );
628}
629
630#ifdef TIXML_USE_STL
631
632void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
633{
634 // The basic issue with a document is that we don't know what we're
635 // streaming. Read something presumed to be a tag (and hope), then
636 // identify it, and call the appropriate stream method on the tag.
637 //
638 // This "pre-streaming" will never read the closing ">" so the
639 // sub-tag can orient itself.
640
641 if ( !StreamTo( in, '<', tag ) )
642 {
643 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
644 return;
645 }
646
647 while ( in->good() )
648 {
649 int tagIndex = (int) tag->length();
650 while ( in->good() && in->peek() != '>' )
651 {
652 int c = in->get();
653 if ( c <= 0 )
654 {
655 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
656 break;
657 }
658 (*tag) += (char) c;
659 }
660
661 if ( in->good() )
662 {
663 // We now have something we presume to be a node of
664 // some sort. Identify it, and call the node to
665 // continue streaming.
666 TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
667
668 if ( node )
669 {
670 node->StreamIn( in, tag );
671 bool isElement = node->ToElement() != 0;
672 delete node;
673 node = 0;
674
675 // If this is the root element, we're done. Parsing will be
676 // done by the >> operator.
677 if ( isElement )
678 {
679 return;
680 }
681 }
682 else
683 {
684 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
685 return;
686 }
687 }
688 }
689 // We should have returned sooner.
690 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
691}
692
693#endif
694
695const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
696{
697 ClearError();
698
699 // Parse away, at the document level. Since a document
700 // contains nothing but other tags, most of what happens
701 // here is skipping white space.
702 if ( !p || !*p )
703 {
704 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
705 return 0;
706 }
707
708 // Note that, for a document, this needs to come
709 // before the while space skip, so that parsing
710 // starts from the pointer we are given.
711 location.Clear();
712 if ( prevData )
713 {
714 location.row = prevData->cursor.row;
715 location.col = prevData->cursor.col;
716 }
717 else
718 {
719 location.row = 0;
720 location.col = 0;
721 }
722 TiXmlParsingData data( p, TabSize(), location.row, location.col );
723 location = data.Cursor();
724
725 if ( encoding == TIXML_ENCODING_UNKNOWN )
726 {
727 // Check for the Microsoft UTF-8 lead bytes.
728 const unsigned char* pU = (const unsigned char*)p;
729 if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
730 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
731 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
732 {
733 encoding = TIXML_ENCODING_UTF8;
734 useMicrosoftBOM = true;
735 }
736 }
737
738 p = SkipWhiteSpace( p, encoding );
739 if ( !p )
740 {
741 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
742 return 0;
743 }
744
745 while ( p && *p )
746 {
747 TiXmlNode* node = Identify( p, encoding );
748 if ( node )
749 {
750 p = node->Parse( p, &data, encoding );
751 LinkEndChild( node );
752 }
753 else
754 {
755 break;
756 }
757
758 // Did we get encoding info?
759 if ( encoding == TIXML_ENCODING_UNKNOWN
760 && node->ToDeclaration() )
761 {
762 TiXmlDeclaration* dec = node->ToDeclaration();
763 const char* enc = dec->Encoding();
764 assert( enc );
765
766 if ( *enc == 0 )
767 encoding = TIXML_ENCODING_UTF8;
768 else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
769 encoding = TIXML_ENCODING_UTF8;
770 else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
771 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
772 else
773 encoding = TIXML_ENCODING_LEGACY;
774 }
775
776 p = SkipWhiteSpace( p, encoding );
777 }
778
779 // Was this empty?
780 if ( !firstChild ) {
781 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
782 return 0;
783 }
784
785 // All is well.
786 return p;
787}
788
789void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
790{
791 // The first error in a chain is more accurate - don't set again!
792 if ( error )
793 return;
794
795 assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
796 error = true;
797 errorId = err;
798 errorDesc = errorString[ errorId ];
799
800 errorLocation.Clear();
801 if ( pError && data )
802 {
803 data->Stamp( pError, encoding );
804 errorLocation = data->Cursor();
805 }
806}
807
808
809TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
810{
811 TiXmlNode* returnNode = 0;
812
813 p = SkipWhiteSpace( p, encoding );
814 if( !p || !*p || *p != '<' )
815 {
816 return 0;
817 }
818
819 TiXmlDocument* doc = GetDocument();
820 p = SkipWhiteSpace( p, encoding );
821
822 if ( !p || !*p )
823 {
824 return 0;
825 }
826
827 // What is this thing?
828 // - Elements start with a letter or underscore, but xml is reserved.
829 // - Comments: <!--
830 // - Decleration: <?xml
831 // - Everthing else is unknown to tinyxml.
832 //
833
834 const char* xmlHeader = { "<?xml" };
835 const char* commentHeader = { "<!--" };
836 const char* dtdHeader = { "<!" };
837 const char* cdataHeader = { "<![CDATA[" };
838
839 if ( StringEqual( p, xmlHeader, true, encoding ) )
840 {
841 #ifdef DEBUG_PARSER
842 TIXML_LOG( "XML parsing Declaration\n" );
843 #endif
844 returnNode = new TiXmlDeclaration();
845 }
846 else if ( StringEqual( p, commentHeader, false, encoding ) )
847 {
848 #ifdef DEBUG_PARSER
849 TIXML_LOG( "XML parsing Comment\n" );
850 #endif
851 returnNode = new TiXmlComment();
852 }
853 else if ( StringEqual( p, cdataHeader, false, encoding ) )
854 {
855 #ifdef DEBUG_PARSER
856 TIXML_LOG( "XML parsing CDATA\n" );
857 #endif
858 TiXmlText* text = new TiXmlText( "" );
859 text->SetCDATA( true );
860 returnNode = text;
861 }
862 else if ( StringEqual( p, dtdHeader, false, encoding ) )
863 {
864 #ifdef DEBUG_PARSER
865 TIXML_LOG( "XML parsing Unknown(1)\n" );
866 #endif
867 returnNode = new TiXmlUnknown();
868 }
869 else if ( IsAlpha( *(p+1), encoding )
870 || *(p+1) == '_' )
871 {
872 #ifdef DEBUG_PARSER
873 TIXML_LOG( "XML parsing Element\n" );
874 #endif
875 returnNode = new TiXmlElement( "" );
876 }
877 else
878 {
879 #ifdef DEBUG_PARSER
880 TIXML_LOG( "XML parsing Unknown(2)\n" );
881 #endif
882 returnNode = new TiXmlUnknown();
883 }
884
885 if ( returnNode )
886 {
887 // Set the parent, so it can report errors
888 returnNode->parent = this;
889 }
890 else
891 {
892 if ( doc )
893 doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
894 }
895 return returnNode;
896}
897
898#ifdef TIXML_USE_STL
899
900void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
901{
902 // We're called with some amount of pre-parsing. That is, some of "this"
903 // element is in "tag". Go ahead and stream to the closing ">"
904 while( in->good() )
905 {
906 int c = in->get();
907 if ( c <= 0 )
908 {
909 TiXmlDocument* document = GetDocument();
910 if ( document )
911 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
912 return;
913 }
914 (*tag) += (char) c ;
915
916 if ( c == '>' )
917 break;
918 }
919
920 if ( tag->length() < 3 ) return;
921
922 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
923 // If not, identify and stream.
924
925 if ( tag->at( tag->length() - 1 ) == '>'
926 && tag->at( tag->length() - 2 ) == '/' )
927 {
928 // All good!
929 return;
930 }
931 else if ( tag->at( tag->length() - 1 ) == '>' )
932 {
933 // There is more. Could be:
934 // text
935 // closing tag
936 // another node.
937 for ( ;; )
938 {
939 StreamWhiteSpace( in, tag );
940
941 // Do we have text?
942 if ( in->good() && in->peek() != '<' )
943 {
944 // Yep, text.
945 TiXmlText text( "" );
946 text.StreamIn( in, tag );
947
948 // What follows text is a closing tag or another node.
949 // Go around again and figure it out.
950 continue;
951 }
952
953 // We now have either a closing tag...or another node.
954 // We should be at a "<", regardless.
955 if ( !in->good() ) return;
956 assert( in->peek() == '<' );
957 int tagIndex = (int) tag->length();
958
959 bool closingTag = false;
960 bool firstCharFound = false;
961
962 for( ;; )
963 {
964 if ( !in->good() )
965 return;
966
967 int c = in->peek();
968 if ( c <= 0 )
969 {
970 TiXmlDocument* document = GetDocument();
971 if ( document )
972 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
973 return;
974 }
975
976 if ( c == '>' )
977 break;
978
979 *tag += (char) c;
980 in->get();
981
982 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
983 {
984 firstCharFound = true;
985 if ( c == '/' )
986 closingTag = true;
987 }
988 }
989 // If it was a closing tag, then read in the closing '>' to clean up the input stream.
990 // If it was not, the streaming will be done by the tag.
991 if ( closingTag )
992 {
993 if ( !in->good() )
994 return;
995
996 int c = in->get();
997 if ( c <= 0 )
998 {
999 TiXmlDocument* document = GetDocument();
1000 if ( document )
1001 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1002 return;
1003 }
1004 assert( c == '>' );
1005 *tag += (char) c;
1006
1007 // We are done, once we've found our closing tag.
1008 return;
1009 }
1010 else
1011 {
1012 // If not a closing tag, id it, and stream.
1013 const char* tagloc = tag->c_str() + tagIndex;
1014 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1015 if ( !node )
1016 return;
1017 node->StreamIn( in, tag );
1018 delete node;
1019 node = 0;
1020
1021 // No return: go around from the beginning: text, closing tag, or node.
1022 }
1023 }
1024 }
1025}
1026#endif
1027
1028const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1029{
1030 p = SkipWhiteSpace( p, encoding );
1031 TiXmlDocument* document = GetDocument();
1032
1033 if ( !p || !*p )
1034 {
1035 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1036 return 0;
1037 }
1038
1039 if ( data )
1040 {
1041 data->Stamp( p, encoding );
1042 location = data->Cursor();
1043 }
1044
1045 if ( *p != '<' )
1046 {
1047 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1048 return 0;
1049 }
1050
1051 p = SkipWhiteSpace( p+1, encoding );
1052
1053 // Read the name.
1054 const char* pErr = p;
1055
1056 p = ReadName( p, &value, encoding );
1057 if ( !p || !*p )
1058 {
1059 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1060 return 0;
1061 }
1062
1063 TIXML_STRING endTag ("</");
1064 endTag += value;
1065 endTag += ">";
1066
1067 // Check for and read attributes. Also look for an empty
1068 // tag or an end tag.
1069 while ( p && *p )
1070 {
1071 pErr = p;
1072 p = SkipWhiteSpace( p, encoding );
1073 if ( !p || !*p )
1074 {
1075 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1076 return 0;
1077 }
1078 if ( *p == '/' )
1079 {
1080 ++p;
1081 // Empty tag.
1082 if ( *p != '>' )
1083 {
1084 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1085 return 0;
1086 }
1087 return (p+1);
1088 }
1089 else if ( *p == '>' )
1090 {
1091 // Done with attributes (if there were any.)
1092 // Read the value -- which can include other
1093 // elements -- read the end tag, and return.
1094 ++p;
1095 p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
1096 if ( !p || !*p )
1097 return 0;
1098
1099 // We should find the end tag now
1100 if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1101 {
1102 p += endTag.length();
1103 return p;
1104 }
1105 else
1106 {
1107 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1108 return 0;
1109 }
1110 }
1111 else
1112 {
1113 // Try to read an attribute:
1114 TiXmlAttribute* attrib = new TiXmlAttribute();
1115 if ( !attrib )
1116 {
1117 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1118 return 0;
1119 }
1120
1121 attrib->SetDocument( document );
1122 const char* pErr = p;
1123 p = attrib->Parse( p, data, encoding );
1124
1125 if ( !p || !*p )
1126 {
1127 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1128 delete attrib;
1129 return 0;
1130 }
1131
1132 // Handle the strange case of double attributes:
Karsten Tausched1438212022-10-06 15:20:50 +02001133 TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
Upstreambc0ee9a1970-01-12 13:46:40 +00001134 if ( node )
1135 {
1136 node->SetValue( attrib->Value() );
1137 delete attrib;
1138 return 0;
1139 }
1140
1141 attributeSet.Add( attrib );
1142 }
1143 }
1144 return p;
1145}
1146
1147
1148const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1149{
1150 TiXmlDocument* document = GetDocument();
1151
1152 // Read in text and elements in any order.
1153 const char* pWithWhiteSpace = p;
1154 p = SkipWhiteSpace( p, encoding );
1155
1156 while ( p && *p )
1157 {
1158 if ( *p != '<' )
1159 {
1160 // Take what we have, make a text element.
1161 TiXmlText* textNode = new TiXmlText( "" );
1162
1163 if ( !textNode )
1164 {
1165 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1166 return 0;
1167 }
1168
1169 if ( TiXmlBase::IsWhiteSpaceCondensed() )
1170 {
1171 p = textNode->Parse( p, data, encoding );
1172 }
1173 else
1174 {
1175 // Special case: we want to keep the white space
1176 // so that leading spaces aren't removed.
1177 p = textNode->Parse( pWithWhiteSpace, data, encoding );
1178 }
1179
1180 if ( !textNode->Blank() )
1181 LinkEndChild( textNode );
1182 else
1183 delete textNode;
1184 }
1185 else
1186 {
1187 // We hit a '<'
1188 // Have we hit a new element or an end tag? This could also be
1189 // a TiXmlText in the "CDATA" style.
1190 if ( StringEqual( p, "</", false, encoding ) )
1191 {
1192 return p;
1193 }
1194 else
1195 {
1196 TiXmlNode* node = Identify( p, encoding );
1197 if ( node )
1198 {
1199 p = node->Parse( p, data, encoding );
1200 LinkEndChild( node );
1201 }
1202 else
1203 {
1204 return 0;
1205 }
1206 }
1207 }
1208 pWithWhiteSpace = p;
1209 p = SkipWhiteSpace( p, encoding );
1210 }
1211
1212 if ( !p )
1213 {
1214 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1215 }
1216 return p;
1217}
1218
1219
1220#ifdef TIXML_USE_STL
1221void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1222{
1223 while ( in->good() )
1224 {
1225 int c = in->get();
1226 if ( c <= 0 )
1227 {
1228 TiXmlDocument* document = GetDocument();
1229 if ( document )
1230 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1231 return;
1232 }
1233 (*tag) += (char) c;
1234
1235 if ( c == '>' )
1236 {
1237 // All is well.
1238 return;
1239 }
1240 }
1241}
1242#endif
1243
1244
1245const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1246{
1247 TiXmlDocument* document = GetDocument();
1248 p = SkipWhiteSpace( p, encoding );
1249
1250 if ( data )
1251 {
1252 data->Stamp( p, encoding );
1253 location = data->Cursor();
1254 }
1255 if ( !p || !*p || *p != '<' )
1256 {
1257 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1258 return 0;
1259 }
1260 ++p;
1261 value = "";
1262
1263 while ( p && *p && *p != '>' )
1264 {
1265 value += *p;
1266 ++p;
1267 }
1268
1269 if ( !p )
1270 {
1271 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1272 }
1273 if ( *p == '>' )
1274 return p+1;
1275 return p;
1276}
1277
1278#ifdef TIXML_USE_STL
1279void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1280{
1281 while ( in->good() )
1282 {
1283 int c = in->get();
1284 if ( c <= 0 )
1285 {
1286 TiXmlDocument* document = GetDocument();
1287 if ( document )
1288 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1289 return;
1290 }
1291
1292 (*tag) += (char) c;
1293
1294 if ( c == '>'
1295 && tag->at( tag->length() - 2 ) == '-'
1296 && tag->at( tag->length() - 3 ) == '-' )
1297 {
1298 // All is well.
1299 return;
1300 }
1301 }
1302}
1303#endif
1304
1305
1306const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1307{
1308 TiXmlDocument* document = GetDocument();
1309 value = "";
1310
1311 p = SkipWhiteSpace( p, encoding );
1312
1313 if ( data )
1314 {
1315 data->Stamp( p, encoding );
1316 location = data->Cursor();
1317 }
1318 const char* startTag = "<!--";
1319 const char* endTag = "-->";
1320
1321 if ( !StringEqual( p, startTag, false, encoding ) )
1322 {
1323 document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1324 return 0;
1325 }
1326 p += strlen( startTag );
1327 p = ReadText( p, &value, false, endTag, false, encoding );
1328 return p;
1329}
1330
1331
1332const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1333{
1334 p = SkipWhiteSpace( p, encoding );
1335 if ( !p || !*p ) return 0;
1336
1337 int tabsize = 4;
1338 if ( document )
1339 tabsize = document->TabSize();
1340
1341 if ( data )
1342 {
1343 data->Stamp( p, encoding );
1344 location = data->Cursor();
1345 }
1346 // Read the name, the '=' and the value.
1347 const char* pErr = p;
1348 p = ReadName( p, &name, encoding );
1349 if ( !p || !*p )
1350 {
1351 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1352 return 0;
1353 }
1354 p = SkipWhiteSpace( p, encoding );
1355 if ( !p || !*p || *p != '=' )
1356 {
1357 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1358 return 0;
1359 }
1360
1361 ++p; // skip '='
1362 p = SkipWhiteSpace( p, encoding );
1363 if ( !p || !*p )
1364 {
1365 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1366 return 0;
1367 }
1368
1369 const char* end;
1370
1371 if ( *p == '\'' )
1372 {
1373 ++p;
1374 end = "\'";
1375 p = ReadText( p, &value, false, end, false, encoding );
1376 }
1377 else if ( *p == '"' )
1378 {
1379 ++p;
1380 end = "\"";
1381 p = ReadText( p, &value, false, end, false, encoding );
1382 }
1383 else
1384 {
1385 // All attribute values should be in single or double quotes.
1386 // But this is such a common error that the parser will try
1387 // its best, even without them.
1388 value = "";
1389 while ( p && *p // existence
1390 && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r' // whitespace
1391 && *p != '/' && *p != '>' ) // tag end
1392 {
1393 value += *p;
1394 ++p;
1395 }
1396 }
1397 return p;
1398}
1399
1400#ifdef TIXML_USE_STL
1401void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1402{
1403 if ( cdata )
1404 {
1405 int c = in->get();
1406 if ( c <= 0 )
1407 {
1408 TiXmlDocument* document = GetDocument();
1409 if ( document )
1410 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1411 return;
1412 }
1413
1414 (*tag) += (char) c;
1415
1416 if ( c == '>'
1417 && tag->at( tag->length() - 2 ) == ']'
1418 && tag->at( tag->length() - 3 ) == ']' )
1419 {
1420 // All is well.
1421 return;
1422 }
1423 }
1424 else
1425 {
1426 while ( in->good() )
1427 {
1428 int c = in->peek();
1429 if ( c == '<' )
1430 return;
1431 if ( c <= 0 )
1432 {
1433 TiXmlDocument* document = GetDocument();
1434 if ( document )
1435 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1436 return;
1437 }
1438
1439 (*tag) += (char) c;
1440 in->get();
1441 }
1442 }
1443}
1444#endif
1445
1446const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1447{
1448 value = "";
1449 TiXmlDocument* document = GetDocument();
1450
1451 if ( data )
1452 {
1453 data->Stamp( p, encoding );
1454 location = data->Cursor();
1455 }
1456
1457 const char* const startTag = "<![CDATA[";
1458 const char* const endTag = "]]>";
1459
1460 if ( cdata || StringEqual( p, startTag, false, encoding ) )
1461 {
1462 cdata = true;
1463
1464 if ( !StringEqual( p, startTag, false, encoding ) )
1465 {
1466 document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1467 return 0;
1468 }
1469 p += strlen( startTag );
1470
1471 // Keep all the white space, ignore the encoding, etc.
1472 while ( p && *p
1473 && !StringEqual( p, endTag, false, encoding )
1474 )
1475 {
1476 value += *p;
1477 ++p;
1478 }
1479
1480 TIXML_STRING dummy;
1481 p = ReadText( p, &dummy, false, endTag, false, encoding );
1482 return p;
1483 }
1484 else
1485 {
1486 bool ignoreWhite = true;
1487
1488 const char* end = "<";
1489 p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1490 if ( p )
1491 return p-1; // don't truncate the '<'
1492 return 0;
1493 }
1494}
1495
1496#ifdef TIXML_USE_STL
1497void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1498{
1499 while ( in->good() )
1500 {
1501 int c = in->get();
1502 if ( c <= 0 )
1503 {
1504 TiXmlDocument* document = GetDocument();
1505 if ( document )
1506 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1507 return;
1508 }
1509 (*tag) += (char) c;
1510
1511 if ( c == '>' )
1512 {
1513 // All is well.
1514 return;
1515 }
1516 }
1517}
1518#endif
1519
1520const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1521{
1522 p = SkipWhiteSpace( p, _encoding );
1523 // Find the beginning, find the end, and look for
1524 // the stuff in-between.
1525 TiXmlDocument* document = GetDocument();
1526 if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1527 {
1528 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1529 return 0;
1530 }
1531 if ( data )
1532 {
1533 data->Stamp( p, _encoding );
1534 location = data->Cursor();
1535 }
1536 p += 5;
1537
1538 version = "";
1539 encoding = "";
1540 standalone = "";
1541
1542 while ( p && *p )
1543 {
1544 if ( *p == '>' )
1545 {
1546 ++p;
1547 return p;
1548 }
1549
1550 p = SkipWhiteSpace( p, _encoding );
1551 if ( StringEqual( p, "version", true, _encoding ) )
1552 {
1553 TiXmlAttribute attrib;
1554 p = attrib.Parse( p, data, _encoding );
1555 version = attrib.Value();
1556 }
1557 else if ( StringEqual( p, "encoding", true, _encoding ) )
1558 {
1559 TiXmlAttribute attrib;
1560 p = attrib.Parse( p, data, _encoding );
1561 encoding = attrib.Value();
1562 }
1563 else if ( StringEqual( p, "standalone", true, _encoding ) )
1564 {
1565 TiXmlAttribute attrib;
1566 p = attrib.Parse( p, data, _encoding );
1567 standalone = attrib.Value();
1568 }
1569 else
1570 {
1571 // Read over whatever it is.
1572 while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1573 ++p;
1574 }
1575 }
1576 return 0;
1577}
1578
1579bool TiXmlText::Blank() const
1580{
1581 for ( unsigned i=0; i<value.length(); i++ )
1582 if ( !IsWhiteSpace( value[i] ) )
1583 return false;
1584 return true;
1585}
1586