thymian: 3rdparty/tinyxml/tinyxmlparser.cpp comparison

comparison 3rdparty/tinyxml/tinyxmlparser.cpp @ 0:a4671277546c tip

created the repository for the thymian project

author	ferencd
date	Tue, 17 Aug 2021 11:19:54 +0200
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:a4671277546c
+/*
+www.sourceforge.net/projects/tinyxml
+Original code by Lee Thomason (www.grinninglizard.com)
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any
+damages arising from the use of this software.
+Permission is granted to anyone to use this software for any
+purpose, including commercial applications, and to alter it and
+redistribute it freely, subject to the following restrictions:
+1. The origin of this software must not be misrepresented; you must
+not claim that you wrote the original software. If you use this
+software in a product, an acknowledgment in the product documentation
+would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and
+must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source
+distribution.
+*/
+#include <ctype.h>
+#include <stddef.h>
+#include "tinyxml.h"
+//#define DEBUG_PARSER
+#if defined( DEBUG_PARSER )
+#	if defined( DEBUG ) && defined( _MSC_VER )
+#		include <windows.h>
+#		define TIXML_LOG OutputDebugString
+#	else
+#		define TIXML_LOG printf
+#	endif
+#endif
+// Note tha "PutString" hardcodes the same list. This
+// is less flexible than it appears. Changing the entries
+// or order will break putstring.
+TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] =
+{
+	{ "&amp;",  5, '&' },
+	{ "&lt;",   4, '<' },
+	{ "&gt;",   4, '>' },
+	{ "&quot;", 6, '\"' },
+	{ "&apos;", 6, '\'' }
+};
+// Bunch of unicode info at:
+//		http://www.unicode.org/faq/utf_bom.html
+// Including the basic of this table, which determines the #bytes in the
+// sequence from the lead byte. 1 placed for invalid sequences --
+// although the result will be junk, pass it through as much as possible.
+// Beware of the non-characters in UTF-8:
+//				ef bb bf (Microsoft "lead bytes")
+//				ef bf be
+//				ef bf bf
+const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
+const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
+const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
+const int TiXmlBase::utf8ByteTable[256] =
+{
+	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0
+		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0
+		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
+		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
+		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
+		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
+};
+void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
+{
+	const unsigned long BYTE_MASK = 0xBF;
+	const unsigned long BYTE_MARK = 0x80;
+	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+	if (input < 0x80)
+		*length = 1;
+	else if ( input < 0x800 )
+		*length = 2;
+	else if ( input < 0x10000 )
+		*length = 3;
+	else if ( input < 0x200000 )
+		*length = 4;
+	else
+		{ *length = 0; return; }	// This code won't covert this correctly anyway.
+	output += *length;
+	// Scary scary fall throughs.
+	switch (*length)
+	{
+		case 4:
+			--output;
+	                *output = static_cast<char>((input | BYTE_MARK) & BYTE_MASK);
+input >>= 6;
+		case 3:
+			--output;
+	                *output = static_cast<char>((input | BYTE_MARK) & BYTE_MASK);
+input >>= 6;
+		case 2:
+			--output;
+	                *output = static_cast<char>((input | BYTE_MARK) & BYTE_MASK);
+input >>= 6;
+		case 1:
+			--output;
+	                *output = static_cast<char>(input | FIRST_BYTE_MARK[*length]);
+	        default:
+	                break;
+	}
+}
+/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
+{
+	// This will only work for low-ascii, everything else is assumed to be a valid
+	// letter. I'm not sure this is the best approach, but it is quite tricky trying
+	// to figure out alhabetical vs. not across encoding. So take a very
+	// conservative approach.
+//	if ( encoding == TIXML_ENCODING_UTF8 )
+//	{
+		if ( anyByte < 127 )
+			return isalpha( anyByte );
+		else
+			return 1;	// What else to do? The unicode set is huge...get the english ones right.
+//	}
+//	else
+//	{
+//		return isalpha( anyByte );
+//	}
+}
+/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
+{
+	// This will only work for low-ascii, everything else is assumed to be a valid
+	// letter. I'm not sure this is the best approach, but it is quite tricky trying
+	// to figure out alhabetical vs. not across encoding. So take a very
+	// conservative approach.
+//	if ( encoding == TIXML_ENCODING_UTF8 )
+//	{
+		if ( anyByte < 127 )
+			return isalnum( anyByte );
+		else
+			return 1;	// What else to do? The unicode set is huge...get the english ones right.
+//	}
+//	else
+//	{
+//		return isalnum( anyByte );
+//	}
+}
+class TiXmlParsingData
+{
+	friend class TiXmlDocument;
+public:
+	void Stamp( const char* now, TiXmlEncoding encoding );
+	const TiXmlCursor& Cursor() const	{ return cursor; }
+private:
+	// Only used by the document!
+	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
+	{
+		assert( start );
+		stamp = start;
+		tabsize = _tabsize;
+		cursor.row = row;
+		cursor.col = col;
+	}
+	TiXmlCursor		cursor;
+	const char*		stamp;
+	int				tabsize;
+};
+void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
+{
+	assert( now );
+	// Do nothing if the tabsize is 0.
+	if ( tabsize < 1 )
+	{
+		return;
+	}
+	// Get the current row, column.
+	int row = cursor.row;
+	int col = cursor.col;
+	const char* p = stamp;
+	assert( p );
+	while ( p < now )
+	{
+		// Treat p as unsigned, so we have a happy compiler.
+	        const unsigned char* pU = reinterpret_cast<const unsigned char*>(p);
+		// Code contributed by Fletcher Dunn: (modified by lee)
+		switch (*pU) {
+			case 0:
+				// We *should* never get here, but in case we do, don't
+				// advance past the terminating null character, ever
+				return;
+			case '\r':
+				// bump down to the next line
+				++row;
+				col = 0;
+				// Eat the character
+				++p;
+				// Check for \r\n sequence, and treat this as a single character
+				if (*p == '\n') {
+					++p;
+				}
+				break;
+			case '\n':
+				// bump down to the next line
+				++row;
+				col = 0;
+				// Eat the character
+				++p;
+				// Check for \n\r sequence, and treat this as a single
+				// character.  (Yes, this bizarre thing does occur still
+				// on some arcane platforms...)
+				if (*p == '\r') {
+					++p;
+				}
+				break;
+			case '\t':
+				// Eat the character
+				++p;
+				// Skip to next tab stop
+				col = (col / tabsize + 1) * tabsize;
+				break;
+			case TIXML_UTF_LEAD_0:
+				if ( encoding == TIXML_ENCODING_UTF8 )
+				{
+					if ( *(p+1) && *(p+2) )
+					{
+						// In these cases, don't advance the column. These are
+						// 0-width spaces.
+						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
+							p += 3;
+						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
+							p += 3;
+						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
+							p += 3;
+						else
+							{ p +=3; ++col; }	// A normal character.
+					}
+				}
+				else
+				{
+					++p;
+					++col;
+				}
+				break;
+			default:
+				if ( encoding == TIXML_ENCODING_UTF8 )
+				{
+					// Eat the 1 to 4 byte utf8 character.
+					int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
+					if ( step == 0 )
+						step = 1;		// Error case from bad encoding, but handle gracefully.
+					p += step;
+					// Just advance one column, of course.
+					++col;
+				}
+				else
+				{
+					++p;
+					++col;
+				}
+				break;
+		}
+	}
+	cursor.row = row;
+	cursor.col = col;
+	assert( cursor.row >= -1 );
+	assert( cursor.col >= -1 );
+	stamp = p;
+	assert( stamp );
+}
+const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
+{
+	if ( !p || !*p )
+	{
+		return 0;
+	}
+	if ( encoding == TIXML_ENCODING_UTF8 )
+	{
+		while ( *p )
+		{
+			const unsigned char* pU = (const unsigned char*)p;
+			// Skip the stupid Microsoft UTF-8 Byte order marks
+			if (	*(pU+0)==TIXML_UTF_LEAD_0
+				 && *(pU+1)==TIXML_UTF_LEAD_1
+				 && *(pU+2)==TIXML_UTF_LEAD_2 )
+			{
+				p += 3;
+				continue;
+			}
+			else if(*(pU+0)==TIXML_UTF_LEAD_0
+				 && *(pU+1)==0xbfU
+				 && *(pU+2)==0xbeU )
+			{
+				p += 3;
+				continue;
+			}
+			else if(*(pU+0)==TIXML_UTF_LEAD_0
+				 && *(pU+1)==0xbfU
+				 && *(pU+2)==0xbfU )
+			{
+				p += 3;
+				continue;
+			}
+			if ( IsWhiteSpace( *p ) )		// Still using old rules for white space.
+				++p;
+			else
+				break;
+		}
+	}
+	else
+	{
+		while ( *p && IsWhiteSpace( *p ) )
+			++p;
+	}
+	return p;
+}
+#ifdef TIXML_USE_STL
+/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
+{
+	for( ;; )
+	{
+		if ( !in->good() ) return false;
+		int c = in->peek();
+		// At this scope, we can't get to a document. So fail silently.
+		if ( !IsWhiteSpace( c ) || c <= 0 )
+			return true;
+		*tag += (char) in->get();
+	}
+}
+/*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
+{
+	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
+	while ( in->good() )
+	{
+		int c = in->peek();
+		if ( c == character )
+			return true;
+		if ( c <= 0 )		// Silent failure: can't get document at this scope
+			return false;
+		in->get();
+		*tag += (char) c;
+	}
+	return false;
+}
+#endif
+// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
+// "assign" optimization removes over 10% of the execution time.
+//
+const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
+{
+	// Oddly, not supported on some comilers,
+	//name->clear();
+	// So use this:
+	*name = "";
+	assert( p );
+	// Names start with letters or underscores.
+	// Of course, in unicode, tinyxml has no idea what a letter *is*. The
+	// algorithm is generous.
+	//
+	// After that, they can be letters, underscores, numbers,
+	// hyphens, or colons. (Colons are valid ony for namespaces,
+	// but tinyxml can't tell namespaces from names.)
+	if (    p && *p
+		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
+	{
+		const char* start = p;
+		while(		p && *p
+				&&	(		IsAlphaNum( (unsigned char ) *p, encoding )
+						 || *p == '_'
+						 || *p == '-'
+						 || *p == '.'
+						 || *p == ':' ) )
+		{
+			//(*name) += *p; // expensive
+			++p;
+		}
+		if ( p-start > 0 ) {
+			name->assign( start, p-start );
+		}
+		return p;
+	}
+	return 0;
+}
+const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
+{
+	// Presume an entity, and pull it out.
+TIXML_STRING ent;
+	int i;
+	*length = 0;
+	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
+	{
+		unsigned long ucs = 0;
+		ptrdiff_t delta = 0;
+		unsigned mult = 1;
+		if ( *(p+2) == 'x' )
+		{
+			// Hexadecimal.
+			if ( !*(p+3) ) return 0;
+			const char* q = p+3;
+			q = strchr( q, ';' );
+			if ( !q || !*q ) return 0;
+			delta = q-p;
+			--q;
+			while ( *q != 'x' )
+			{
+				if ( *q >= '0' && *q <= '9' )
+					ucs += mult * (*q - '0');
+				else if ( *q >= 'a' && *q <= 'f' )
+					ucs += mult * (*q - 'a' + 10);
+				else if ( *q >= 'A' && *q <= 'F' )
+					ucs += mult * (*q - 'A' + 10 );
+				else
+					return 0;
+				mult *= 16;
+				--q;
+			}
+		}
+		else
+		{
+			// Decimal.
+			if ( !*(p+2) ) return 0;
+			const char* q = p+2;
+			q = strchr( q, ';' );
+			if ( !q || !*q ) return 0;
+			delta = q-p;
+			--q;
+			while ( *q != '#' )
+			{
+				if ( *q >= '0' && *q <= '9' )
+					ucs += mult * (*q - '0');
+				else
+					return 0;
+				mult *= 10;
+				--q;
+			}
+		}
+		if ( encoding == TIXML_ENCODING_UTF8 )
+		{
+			// convert the UCS to UTF-8
+			ConvertUTF32ToUTF8( ucs, value, length );
+		}
+		else
+		{
+			*value = (char)ucs;
+			*length = 1;
+		}
+		return p + delta + 1;
+	}
+	// Now try to match it.
+	for( i=0; i<NUM_ENTITY; ++i )
+	{
+		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
+		{
+			assert( strlen( entity[i].str ) == entity[i].strLength );
+			*value = entity[i].chr;
+			*length = 1;
+			return ( p + entity[i].strLength );
+		}
+	}
+	// So it wasn't an entity, its unrecognized, or something like that.
+	*value = *p;	// Don't put back the last one, since we return it!
+	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
+					// Just writes strange XML.
+	return p+1;
+}
+bool TiXmlBase::StringEqual( const char* p,
+							 const char* tag,
+							 bool ignoreCase,
+							 TiXmlEncoding encoding )
+{
+	assert( p );
+	assert( tag );
+	if ( !p || !*p )
+	{
+		assert( 0 );
+		return false;
+	}
+	const char* q = p;
+	if ( ignoreCase )
+	{
+		while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
+		{
+			++q;
+			++tag;
+		}
+		if ( *tag == 0 )
+			return true;
+	}
+	else
+	{
+		while ( *q && *tag && *q == *tag )
+		{
+			++q;
+			++tag;
+		}
+		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
+			return true;
+	}
+	return false;
+}
+const char* TiXmlBase::ReadText(	const char* p,
+									TIXML_STRING * text,
+									bool trimWhiteSpace,
+									const char* endTag,
+									bool caseInsensitive,
+									TiXmlEncoding encoding )
+{
+*text = "";
+	if (    !trimWhiteSpace			// certain tags always keep whitespace
+		 || !condenseWhiteSpace )	// if true, whitespace is always kept
+	{
+		// Keep all the white space.
+		while (	   p && *p
+				&& !StringEqual( p, endTag, caseInsensitive, encoding )
+			  )
+		{
+			int len;
+			char cArr[4] = { 0, 0, 0, 0 };
+			p = GetChar( p, cArr, &len, encoding );
+			text->append( cArr, len );
+		}
+	}
+	else
+	{
+		bool whitespace = false;
+		// Remove leading white space:
+		p = SkipWhiteSpace( p, encoding );
+		while (	   p && *p
+				&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
+		{
+			if ( *p == '\r' || *p == '\n' )
+			{
+				whitespace = true;
+				++p;
+			}
+			else if ( IsWhiteSpace( *p ) )
+			{
+				whitespace = true;
+				++p;
+			}
+			else
+			{
+				// If we've found whitespace, add it before the
+				// new character. Any whitespace just becomes a space.
+				if ( whitespace )
+				{
+					(*text) += ' ';
+					whitespace = false;
+				}
+				int len;
+				char cArr[4] = { 0, 0, 0, 0 };
+				p = GetChar( p, cArr, &len, encoding );
+				if ( len == 1 )
+					(*text) += cArr[0];	// more efficient
+				else
+					text->append( cArr, len );
+			}
+		}
+	}
+	if ( p && *p )
+		p += strlen( endTag );
+	return ( p && *p ) ? p : 0;
+}
+#ifdef TIXML_USE_STL
+void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
+{
+	// The basic issue with a document is that we don't know what we're
+	// streaming. Read something presumed to be a tag (and hope), then
+	// identify it, and call the appropriate stream method on the tag.
+	//
+	// This "pre-streaming" will never read the closing ">" so the
+	// sub-tag can orient itself.
+	if ( !StreamTo( in, '<', tag ) )
+	{
+		SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
+		return;
+	}
+	while ( in->good() )
+	{
+		int tagIndex = (int) tag->length();
+		while ( in->good() && in->peek() != '>' )
+		{
+			int c = in->get();
+			if ( c <= 0 )
+			{
+				SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+				break;
+			}
+			(*tag) += (char) c;
+		}
+		if ( in->good() )
+		{
+			// We now have something we presume to be a node of
+			// some sort. Identify it, and call the node to
+			// continue streaming.
+			TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
+			if ( node )
+			{
+				node->StreamIn( in, tag );
+				bool isElement = node->ToElement() != 0;
+				delete node;
+				node = 0;
+				// If this is the root element, we're done. Parsing will be
+				// done by the >> operator.
+				if ( isElement )
+				{
+					return;
+				}
+			}
+			else
+			{
+				SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
+				return;
+			}
+		}
+	}
+	// We should have returned sooner.
+	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
+}
+#endif
+const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
+{
+	ClearError();
+	// Parse away, at the document level. Since a document
+	// contains nothing but other tags, most of what happens
+	// here is skipping white space.
+	if ( !p || !*p )
+	{
+		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
+		return 0;
+	}
+	// Note that, for a document, this needs to come
+	// before the while space skip, so that parsing
+	// starts from the pointer we are given.
+	location.Clear();
+	if ( prevData )
+	{
+		location.row = prevData->cursor.row;
+		location.col = prevData->cursor.col;
+	}
+	else
+	{
+		location.row = 0;
+		location.col = 0;
+	}
+	TiXmlParsingData data( p, TabSize(), location.row, location.col );
+	location = data.Cursor();
+	if ( encoding == TIXML_ENCODING_UNKNOWN )
+	{
+		// Check for the Microsoft UTF-8 lead bytes.
+		const unsigned char* pU = (const unsigned char*)p;
+		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
+			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
+			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
+		{
+			encoding = TIXML_ENCODING_UTF8;
+			useMicrosoftBOM = true;
+		}
+	}
+p = SkipWhiteSpace( p, encoding );
+	if ( !p )
+	{
+		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
+		return 0;
+	}
+	while ( p && *p )
+	{
+		TiXmlNode* node = Identify( p, encoding );
+		if ( node )
+		{
+			p = node->Parse( p, &data, encoding );
+			LinkEndChild( node );
+		}
+		else
+		{
+			break;
+		}
+		// Did we get encoding info?
+		if (    encoding == TIXML_ENCODING_UNKNOWN
+			 && node->ToDeclaration() )
+		{
+			TiXmlDeclaration* dec = node->ToDeclaration();
+			const char* enc = dec->Encoding();
+			assert( enc );
+			if ( *enc == 0 )
+				encoding = TIXML_ENCODING_UTF8;
+			else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
+				encoding = TIXML_ENCODING_UTF8;
+			else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
+				encoding = TIXML_ENCODING_UTF8;	// incorrect, but be nice
+			else
+				encoding = TIXML_ENCODING_LEGACY;
+		}
+		p = SkipWhiteSpace( p, encoding );
+	}
+	// Was this empty?
+	if ( !firstChild ) {
+		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
+		return 0;
+	}
+	// All is well.
+	return p;
+}
+void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	// The first error in a chain is more accurate - don't set again!
+	if ( error )
+		return;
+	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
+	error   = true;
+	errorId = err;
+	errorDesc = errorString[ errorId ];
+	errorLocation.Clear();
+	if ( pError && data )
+	{
+		data->Stamp( pError, encoding );
+		errorLocation = data->Cursor();
+	}
+}
+TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
+{
+	TiXmlNode* returnNode = 0;
+	p = SkipWhiteSpace( p, encoding );
+	if( !p || !*p || *p != '<' )
+	{
+		return 0;
+	}
+	p = SkipWhiteSpace( p, encoding );
+	if ( !p || !*p )
+	{
+		return 0;
+	}
+	// What is this thing?
+	// - Elements start with a letter or underscore, but xml is reserved.
+	// - Comments: <!--
+	// - Decleration: <?xml
+	// - Everthing else is unknown to tinyxml.
+	//
+	const char* xmlHeader = { "<?xml" };
+	const char* commentHeader = { "<!--" };
+	const char* dtdHeader = { "<!" };
+	const char* cdataHeader = { "<![CDATA[" };
+	if ( StringEqual( p, xmlHeader, true, encoding ) )
+	{
+		#ifdef DEBUG_PARSER
+			TIXML_LOG( "XML parsing Declaration\n" );
+		#endif
+		returnNode = new TiXmlDeclaration();
+	}
+	else if ( StringEqual( p, commentHeader, false, encoding ) )
+	{
+		#ifdef DEBUG_PARSER
+			TIXML_LOG( "XML parsing Comment\n" );
+		#endif
+		returnNode = new TiXmlComment();
+	}
+	else if ( StringEqual( p, cdataHeader, false, encoding ) )
+	{
+		#ifdef DEBUG_PARSER
+			TIXML_LOG( "XML parsing CDATA\n" );
+		#endif
+		TiXmlText* text = new TiXmlText( "" );
+		text->SetCDATA( true );
+		returnNode = text;
+	}
+	else if ( StringEqual( p, dtdHeader, false, encoding ) )
+	{
+		#ifdef DEBUG_PARSER
+			TIXML_LOG( "XML parsing Unknown(1)\n" );
+		#endif
+		returnNode = new TiXmlUnknown();
+	}
+	else if (    IsAlpha( *(p+1), encoding )
+			  || *(p+1) == '_' )
+	{
+		#ifdef DEBUG_PARSER
+			TIXML_LOG( "XML parsing Element\n" );
+		#endif
+		returnNode = new TiXmlElement( "" );
+	}
+	else
+	{
+		#ifdef DEBUG_PARSER
+			TIXML_LOG( "XML parsing Unknown(2)\n" );
+		#endif
+		returnNode = new TiXmlUnknown();
+	}
+	if ( returnNode )
+	{
+		// Set the parent, so it can report errors
+		returnNode->parent = this;
+	}
+	return returnNode;
+}
+#ifdef TIXML_USE_STL
+void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
+{
+	// We're called with some amount of pre-parsing. That is, some of "this"
+	// element is in "tag". Go ahead and stream to the closing ">"
+	while( in->good() )
+	{
+		int c = in->get();
+		if ( c <= 0 )
+		{
+			TiXmlDocument* document = GetDocument();
+			if ( document )
+				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+			return;
+		}
+		(*tag) += (char) c ;
+		if ( c == '>' )
+			break;
+	}
+	if ( tag->length() < 3 ) return;
+	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
+	// If not, identify and stream.
+	if (    tag->at( tag->length() - 1 ) == '>'
+		 && tag->at( tag->length() - 2 ) == '/' )
+	{
+		// All good!
+		return;
+	}
+	else if ( tag->at( tag->length() - 1 ) == '>' )
+	{
+		// There is more. Could be:
+		//		text
+		//		cdata text (which looks like another node)
+		//		closing tag
+		//		another node.
+		for ( ;; )
+		{
+			StreamWhiteSpace( in, tag );
+			// Do we have text?
+			if ( in->good() && in->peek() != '<' )
+			{
+				// Yep, text.
+				TiXmlText text( "" );
+				text.StreamIn( in, tag );
+				// What follows text is a closing tag or another node.
+				// Go around again and figure it out.
+				continue;
+			}
+			// We now have either a closing tag...or another node.
+			// We should be at a "<", regardless.
+			if ( !in->good() ) return;
+			assert( in->peek() == '<' );
+			int tagIndex = (int) tag->length();
+			bool closingTag = false;
+			bool firstCharFound = false;
+			for( ;; )
+			{
+				if ( !in->good() )
+					return;
+				int c = in->peek();
+				if ( c <= 0 )
+				{
+					TiXmlDocument* document = GetDocument();
+					if ( document )
+						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+					return;
+				}
+				if ( c == '>' )
+					break;
+				*tag += (char) c;
+				in->get();
+				// Early out if we find the CDATA id.
+				if ( c == '[' && tag->size() >= 9 )
+				{
+					size_t len = tag->size();
+					const char* start = tag->c_str() + len - 9;
+					if ( strcmp( start, "<![CDATA[" ) == 0 ) {
+						assert( !closingTag );
+						break;
+					}
+				}
+				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
+				{
+					firstCharFound = true;
+					if ( c == '/' )
+						closingTag = true;
+				}
+			}
+			// If it was a closing tag, then read in the closing '>' to clean up the input stream.
+			// If it was not, the streaming will be done by the tag.
+			if ( closingTag )
+			{
+				if ( !in->good() )
+					return;
+				int c = in->get();
+				if ( c <= 0 )
+				{
+					TiXmlDocument* document = GetDocument();
+					if ( document )
+						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+					return;
+				}
+				assert( c == '>' );
+				*tag += (char) c;
+				// We are done, once we've found our closing tag.
+				return;
+			}
+			else
+			{
+				// If not a closing tag, id it, and stream.
+				const char* tagloc = tag->c_str() + tagIndex;
+				TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
+				if ( !node )
+					return;
+				node->StreamIn( in, tag );
+				delete node;
+				node = 0;
+				// No return: go around from the beginning: text, closing tag, or node.
+			}
+		}
+	}
+}
+#endif
+const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	p = SkipWhiteSpace( p, encoding );
+	TiXmlDocument* document = GetDocument();
+	if ( !p || !*p )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
+		return 0;
+	}
+	if ( data )
+	{
+		data->Stamp( p, encoding );
+		location = data->Cursor();
+	}
+	if ( *p != '<' )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
+		return 0;
+	}
+	p = SkipWhiteSpace( p+1, encoding );
+	// Read the name.
+	const char* pErr = p;
+p = ReadName( p, &value, encoding );
+	if ( !p || !*p )
+	{
+		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
+		return 0;
+	}
+TIXML_STRING endTag ("</");
+	endTag += value;
+	// Check for and read attributes. Also look for an empty
+	// tag or an end tag.
+	while ( p && *p )
+	{
+		pErr = p;
+		p = SkipWhiteSpace( p, encoding );
+		if ( !p || !*p )
+		{
+			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
+			return 0;
+		}
+		if ( *p == '/' )
+		{
+			++p;
+			// Empty tag.
+			if ( *p  != '>' )
+			{
+				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
+				return 0;
+			}
+			return (p+1);
+		}
+		else if ( *p == '>' )
+		{
+			// Done with attributes (if there were any.)
+			// Read the value -- which can include other
+			// elements -- read the end tag, and return.
+			++p;
+			p = ReadValue( p, data, encoding );		// Note this is an Element method, and will set the error if one happens.
+			if ( !p || !*p ) {
+				// We were looking for the end tag, but found nothing.
+				// Fix for [ 1663758 ] Failure to report error on bad XML
+				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
+				return 0;
+			}
+			// We should find the end tag now
+			// note that:
+			// </foo > and
+			// </foo>
+			// are both valid end tags.
+			if ( StringEqual( p, endTag.c_str(), false, encoding ) )
+			{
+				p += endTag.length();
+				p = SkipWhiteSpace( p, encoding );
+				if ( p && *p && *p == '>' ) {
+					++p;
+					return p;
+				}
+				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
+				return 0;
+			}
+			else
+			{
+				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
+				return 0;
+			}
+		}
+		else
+		{
+			// Try to read an attribute:
+			TiXmlAttribute* attrib = new TiXmlAttribute();
+			if ( !attrib )
+			{
+				return 0;
+			}
+			attrib->SetDocument( document );
+			pErr = p;
+			p = attrib->Parse( p, data, encoding );
+			if ( !p || !*p )
+			{
+				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
+				delete attrib;
+				return 0;
+			}
+			// Handle the strange case of double attributes:
+			#ifdef TIXML_USE_STL
+			TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
+			#else
+			TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
+			#endif
+			if ( node )
+			{
+				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
+				delete attrib;
+				return 0;
+			}
+			attributeSet.Add( attrib );
+		}
+	}
+	return p;
+}
+const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	TiXmlDocument* document = GetDocument();
+	// Read in text and elements in any order.
+	const char* pWithWhiteSpace = p;
+	p = SkipWhiteSpace( p, encoding );
+	while ( p && *p )
+	{
+		if ( *p != '<' )
+		{
+			// Take what we have, make a text element.
+			TiXmlText* textNode = new TiXmlText( "" );
+			if ( !textNode )
+			{
+			    return 0;
+			}
+			if ( TiXmlBase::IsWhiteSpaceCondensed() )
+			{
+				p = textNode->Parse( p, data, encoding );
+			}
+			else
+			{
+				// Special case: we want to keep the white space
+				// so that leading spaces aren't removed.
+				p = textNode->Parse( pWithWhiteSpace, data, encoding );
+			}
+			if ( !textNode->Blank() )
+				LinkEndChild( textNode );
+			else
+				delete textNode;
+		}
+		else
+		{
+			// We hit a '<'
+			// Have we hit a new element or an end tag? This could also be
+			// a TiXmlText in the "CDATA" style.
+			if ( StringEqual( p, "</", false, encoding ) )
+			{
+				return p;
+			}
+			else
+			{
+				TiXmlNode* node = Identify( p, encoding );
+				if ( node )
+				{
+					p = node->Parse( p, data, encoding );
+					LinkEndChild( node );
+				}
+				else
+				{
+					return 0;
+				}
+			}
+		}
+		pWithWhiteSpace = p;
+		p = SkipWhiteSpace( p, encoding );
+	}
+	if ( !p )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
+	}
+	return p;
+}
+#ifdef TIXML_USE_STL
+void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
+{
+	while ( in->good() )
+	{
+		int c = in->get();
+		if ( c <= 0 )
+		{
+			TiXmlDocument* document = GetDocument();
+			if ( document )
+				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+			return;
+		}
+		(*tag) += (char) c;
+		if ( c == '>' )
+		{
+			// All is well.
+			return;
+		}
+	}
+}
+#endif
+const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	TiXmlDocument* document = GetDocument();
+	p = SkipWhiteSpace( p, encoding );
+	if ( data )
+	{
+		data->Stamp( p, encoding );
+		location = data->Cursor();
+	}
+	if ( !p || !*p || *p != '<' )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
+		return 0;
+	}
+	++p;
+value = "";
+	while ( p && *p && *p != '>' )
+	{
+		value += *p;
+		++p;
+	}
+	if ( !p )
+	{
+		if ( document )
+			document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
+	}
+	if ( p && *p == '>' )
+		return p+1;
+	return p;
+}
+#ifdef TIXML_USE_STL
+void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
+{
+	while ( in->good() )
+	{
+		int c = in->get();
+		if ( c <= 0 )
+		{
+			TiXmlDocument* document = GetDocument();
+			if ( document )
+				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+			return;
+		}
+		(*tag) += (char) c;
+		if ( c == '>'
+			 && tag->at( tag->length() - 2 ) == '-'
+			 && tag->at( tag->length() - 3 ) == '-' )
+		{
+			// All is well.
+			return;
+		}
+	}
+}
+#endif
+const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	TiXmlDocument* document = GetDocument();
+	value = "";
+	p = SkipWhiteSpace( p, encoding );
+	if ( data )
+	{
+		data->Stamp( p, encoding );
+		location = data->Cursor();
+	}
+	const char* startTag = "<!--";
+	const char* endTag   = "-->";
+	if ( !StringEqual( p, startTag, false, encoding ) )
+	{
+		if ( document )
+			document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
+		return 0;
+	}
+	p += strlen( startTag );
+	// [ 1475201 ] TinyXML parses entities in comments
+	// Oops - ReadText doesn't work, because we don't want to parse the entities.
+	// p = ReadText( p, &value, false, endTag, false, encoding );
+	//
+	// from the XML spec:
+	/*
+	 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
+	              they may appear within the document type declaration at places allowed by the grammar.
+				  They are not part of the document's character data; an XML processor MAY, but need not,
+				  make it possible for an application to retrieve the text of comments. For compatibility,
+				  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
+				  references MUST NOT be recognized within comments.
+				  An example of a comment:
+				  <!-- declarations for <head> & <body> -->
+	*/
+value = "";
+	// Keep all the white space.
+	while (	p && *p && !StringEqual( p, endTag, false, encoding ) )
+	{
+		value.append( p, 1 );
+		++p;
+	}
+	if ( p && *p )
+		p += strlen( endTag );
+	return p;
+}
+const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	p = SkipWhiteSpace( p, encoding );
+	if ( !p || !*p ) return 0;
+	if ( data )
+	{
+		data->Stamp( p, encoding );
+		location = data->Cursor();
+	}
+	// Read the name, the '=' and the value.
+	const char* pErr = p;
+	p = ReadName( p, &name, encoding );
+	if ( !p || !*p )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
+		return 0;
+	}
+	p = SkipWhiteSpace( p, encoding );
+	if ( !p || !*p || *p != '=' )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
+		return 0;
+	}
+	++p;	// skip '='
+	p = SkipWhiteSpace( p, encoding );
+	if ( !p || !*p )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
+		return 0;
+	}
+	const char* end;
+	const char SINGLE_QUOTE = '\'';
+	const char DOUBLE_QUOTE = '\"';
+	if ( *p == SINGLE_QUOTE )
+	{
+		++p;
+		end = "\'";		// single quote in string
+		p = ReadText( p, &value, false, end, false, encoding );
+	}
+	else if ( *p == DOUBLE_QUOTE )
+	{
+		++p;
+		end = "\"";		// double quote in string
+		p = ReadText( p, &value, false, end, false, encoding );
+	}
+	else
+	{
+		// All attribute values should be in single or double quotes.
+		// But this is such a common error that the parser will try
+		// its best, even without them.
+		value = "";
+		while (    p && *p											// existence
+				&& !IsWhiteSpace( *p )								// whitespace
+				&& *p != '/' && *p != '>' )							// tag end
+		{
+			if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
+				// [ 1451649 ] Attribute values with trailing quotes not handled correctly
+				// We did not have an opening quote but seem to have a
+				// closing one. Give up and throw an error.
+				if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
+				return 0;
+			}
+			value += *p;
+			++p;
+		}
+	}
+	return p;
+}
+#ifdef TIXML_USE_STL
+void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
+{
+	while ( in->good() )
+	{
+		int c = in->peek();
+		if ( !cdata && (c == '<' ) )
+		{
+			return;
+		}
+		if ( c <= 0 )
+		{
+			TiXmlDocument* document = GetDocument();
+			if ( document )
+				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+			return;
+		}
+		(*tag) += (char) c;
+		in->get();	// "commits" the peek made above
+		if ( cdata && c == '>' && tag->size() >= 3 ) {
+			size_t len = tag->size();
+			if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
+				// terminator of cdata.
+				return;
+			}
+		}
+	}
+}
+#endif
+const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
+{
+	value = "";
+	TiXmlDocument* document = GetDocument();
+	if ( data )
+	{
+		data->Stamp( p, encoding );
+		location = data->Cursor();
+	}
+	const char* const startTag = "<![CDATA[";
+	const char* const endTag   = "]]>";
+	if ( cdata || StringEqual( p, startTag, false, encoding ) )
+	{
+		cdata = true;
+		if ( !StringEqual( p, startTag, false, encoding ) )
+		{
+			if ( document )
+				document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
+			return 0;
+		}
+		p += strlen( startTag );
+		// Keep all the white space, ignore the encoding, etc.
+		while (	   p && *p
+				&& !StringEqual( p, endTag, false, encoding )
+			  )
+		{
+			value += *p;
+			++p;
+		}
+		TIXML_STRING dummy;
+		p = ReadText( p, &dummy, false, endTag, false, encoding );
+		return p;
+	}
+	else
+	{
+		bool ignoreWhite = true;
+		const char* end = "<";
+		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
+		if ( p && *p )
+			return p-1;	// don't truncate the '<'
+		return 0;
+	}
+}
+#ifdef TIXML_USE_STL
+void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
+{
+	while ( in->good() )
+	{
+		int c = in->get();
+		if ( c <= 0 )
+		{
+			TiXmlDocument* document = GetDocument();
+			if ( document )
+				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
+			return;
+		}
+		(*tag) += (char) c;
+		if ( c == '>' )
+		{
+			// All is well.
+			return;
+		}
+	}
+}
+#endif
+const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
+{
+	p = SkipWhiteSpace( p, _encoding );
+	// Find the beginning, find the end, and look for
+	// the stuff in-between.
+	TiXmlDocument* document = GetDocument();
+	if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
+	{
+		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
+		return 0;
+	}
+	if ( data )
+	{
+		data->Stamp( p, _encoding );
+		location = data->Cursor();
+	}
+	p += 5;
+	version = "";
+	encoding = "";
+	standalone = "";
+	while ( p && *p )
+	{
+		if ( *p == '>' )
+		{
+			++p;
+			return p;
+		}
+		p = SkipWhiteSpace( p, _encoding );
+		if ( StringEqual( p, "version", true, _encoding ) )
+		{
+			TiXmlAttribute attrib;
+			p = attrib.Parse( p, data, _encoding );
+			version = attrib.Value();
+		}
+		else if ( StringEqual( p, "encoding", true, _encoding ) )
+		{
+			TiXmlAttribute attrib;
+			p = attrib.Parse( p, data, _encoding );
+			encoding = attrib.Value();
+		}
+		else if ( StringEqual( p, "standalone", true, _encoding ) )
+		{
+			TiXmlAttribute attrib;
+			p = attrib.Parse( p, data, _encoding );
+			standalone = attrib.Value();
+		}
+		else
+		{
+			// Read over whatever it is.
+			while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
+				++p;
+		}
+	}
+	return 0;
+}
+bool TiXmlText::Blank() const
+{
+	for ( unsigned i=0; i<value.length(); i++ )
+		if ( !IsWhiteSpace( value[i] ) )
+			return false;
+	return true;
+}

Mercurial > thymian

comparison 3rdparty/tinyxml/tinyxmlparser.cpp @ 0:a4671277546c tip