ferencd@0: /* ferencd@0: www.sourceforge.net/projects/tinyxml ferencd@0: Original code by Lee Thomason (www.grinninglizard.com) ferencd@0: ferencd@0: This software is provided 'as-is', without any express or implied ferencd@0: warranty. In no event will the authors be held liable for any ferencd@0: damages arising from the use of this software. ferencd@0: ferencd@0: Permission is granted to anyone to use this software for any ferencd@0: purpose, including commercial applications, and to alter it and ferencd@0: redistribute it freely, subject to the following restrictions: ferencd@0: ferencd@0: 1. The origin of this software must not be misrepresented; you must ferencd@0: not claim that you wrote the original software. If you use this ferencd@0: software in a product, an acknowledgment in the product documentation ferencd@0: would be appreciated but is not required. ferencd@0: ferencd@0: 2. Altered source versions must be plainly marked as such, and ferencd@0: must not be misrepresented as being the original software. ferencd@0: ferencd@0: 3. This notice may not be removed or altered from any source ferencd@0: distribution. ferencd@0: */ ferencd@0: ferencd@0: #include ferencd@0: #include ferencd@0: ferencd@0: #include "tinyxml.h" ferencd@0: ferencd@0: //#define DEBUG_PARSER ferencd@0: #if defined( DEBUG_PARSER ) ferencd@0: # if defined( DEBUG ) && defined( _MSC_VER ) ferencd@0: # include ferencd@0: # define TIXML_LOG OutputDebugString ferencd@0: # else ferencd@0: # define TIXML_LOG printf ferencd@0: # endif ferencd@0: #endif ferencd@0: ferencd@0: // Note tha "PutString" hardcodes the same list. This ferencd@0: // is less flexible than it appears. Changing the entries ferencd@0: // or order will break putstring. ferencd@0: TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] = ferencd@0: { ferencd@0: { "&", 5, '&' }, ferencd@0: { "<", 4, '<' }, ferencd@0: { ">", 4, '>' }, ferencd@0: { """, 6, '\"' }, ferencd@0: { "'", 6, '\'' } ferencd@0: }; ferencd@0: ferencd@0: // Bunch of unicode info at: ferencd@0: // http://www.unicode.org/faq/utf_bom.html ferencd@0: // Including the basic of this table, which determines the #bytes in the ferencd@0: // sequence from the lead byte. 1 placed for invalid sequences -- ferencd@0: // although the result will be junk, pass it through as much as possible. ferencd@0: // Beware of the non-characters in UTF-8: ferencd@0: // ef bb bf (Microsoft "lead bytes") ferencd@0: // ef bf be ferencd@0: // ef bf bf ferencd@0: ferencd@0: const unsigned char TIXML_UTF_LEAD_0 = 0xefU; ferencd@0: const unsigned char TIXML_UTF_LEAD_1 = 0xbbU; ferencd@0: const unsigned char TIXML_UTF_LEAD_2 = 0xbfU; ferencd@0: ferencd@0: const int TiXmlBase::utf8ByteTable[256] = ferencd@0: { ferencd@0: // 0 1 2 3 4 5 6 7 8 9 a b c d e f ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 ferencd@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 ferencd@0: 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte ferencd@0: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 ferencd@0: 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte ferencd@0: 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid ferencd@0: }; ferencd@0: ferencd@0: ferencd@0: void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length ) ferencd@0: { ferencd@0: const unsigned long BYTE_MASK = 0xBF; ferencd@0: const unsigned long BYTE_MARK = 0x80; ferencd@0: const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; ferencd@0: ferencd@0: if (input < 0x80) ferencd@0: *length = 1; ferencd@0: else if ( input < 0x800 ) ferencd@0: *length = 2; ferencd@0: else if ( input < 0x10000 ) ferencd@0: *length = 3; ferencd@0: else if ( input < 0x200000 ) ferencd@0: *length = 4; ferencd@0: else ferencd@0: { *length = 0; return; } // This code won't covert this correctly anyway. ferencd@0: ferencd@0: output += *length; ferencd@0: ferencd@0: // Scary scary fall throughs. ferencd@0: switch (*length) ferencd@0: { ferencd@0: case 4: ferencd@0: --output; ferencd@0: *output = static_cast((input | BYTE_MARK) & BYTE_MASK); ferencd@0: input >>= 6; ferencd@0: case 3: ferencd@0: --output; ferencd@0: *output = static_cast((input | BYTE_MARK) & BYTE_MASK); ferencd@0: input >>= 6; ferencd@0: case 2: ferencd@0: --output; ferencd@0: *output = static_cast((input | BYTE_MARK) & BYTE_MASK); ferencd@0: input >>= 6; ferencd@0: case 1: ferencd@0: --output; ferencd@0: *output = static_cast(input | FIRST_BYTE_MARK[*length]); ferencd@0: default: ferencd@0: break; ferencd@0: } ferencd@0: } ferencd@0: ferencd@0: ferencd@0: /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) ferencd@0: { ferencd@0: // This will only work for low-ascii, everything else is assumed to be a valid ferencd@0: // letter. I'm not sure this is the best approach, but it is quite tricky trying ferencd@0: // to figure out alhabetical vs. not across encoding. So take a very ferencd@0: // conservative approach. ferencd@0: ferencd@0: // if ( encoding == TIXML_ENCODING_UTF8 ) ferencd@0: // { ferencd@0: if ( anyByte < 127 ) ferencd@0: return isalpha( anyByte ); ferencd@0: else ferencd@0: return 1; // What else to do? The unicode set is huge...get the english ones right. ferencd@0: // } ferencd@0: // else ferencd@0: // { ferencd@0: // return isalpha( anyByte ); ferencd@0: // } ferencd@0: } ferencd@0: ferencd@0: ferencd@0: /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) ferencd@0: { ferencd@0: // This will only work for low-ascii, everything else is assumed to be a valid ferencd@0: // letter. I'm not sure this is the best approach, but it is quite tricky trying ferencd@0: // to figure out alhabetical vs. not across encoding. So take a very ferencd@0: // conservative approach. ferencd@0: ferencd@0: // if ( encoding == TIXML_ENCODING_UTF8 ) ferencd@0: // { ferencd@0: if ( anyByte < 127 ) ferencd@0: return isalnum( anyByte ); ferencd@0: else ferencd@0: return 1; // What else to do? The unicode set is huge...get the english ones right. ferencd@0: // } ferencd@0: // else ferencd@0: // { ferencd@0: // return isalnum( anyByte ); ferencd@0: // } ferencd@0: } ferencd@0: ferencd@0: ferencd@0: class TiXmlParsingData ferencd@0: { ferencd@0: friend class TiXmlDocument; ferencd@0: public: ferencd@0: void Stamp( const char* now, TiXmlEncoding encoding ); ferencd@0: ferencd@0: const TiXmlCursor& Cursor() const { return cursor; } ferencd@0: ferencd@0: private: ferencd@0: // Only used by the document! ferencd@0: TiXmlParsingData( const char* start, int _tabsize, int row, int col ) ferencd@0: { ferencd@0: assert( start ); ferencd@0: stamp = start; ferencd@0: tabsize = _tabsize; ferencd@0: cursor.row = row; ferencd@0: cursor.col = col; ferencd@0: } ferencd@0: ferencd@0: TiXmlCursor cursor; ferencd@0: const char* stamp; ferencd@0: int tabsize; ferencd@0: }; ferencd@0: ferencd@0: ferencd@0: void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: assert( now ); ferencd@0: ferencd@0: // Do nothing if the tabsize is 0. ferencd@0: if ( tabsize < 1 ) ferencd@0: { ferencd@0: return; ferencd@0: } ferencd@0: ferencd@0: // Get the current row, column. ferencd@0: int row = cursor.row; ferencd@0: int col = cursor.col; ferencd@0: const char* p = stamp; ferencd@0: assert( p ); ferencd@0: ferencd@0: while ( p < now ) ferencd@0: { ferencd@0: // Treat p as unsigned, so we have a happy compiler. ferencd@0: const unsigned char* pU = reinterpret_cast(p); ferencd@0: ferencd@0: // Code contributed by Fletcher Dunn: (modified by lee) ferencd@0: switch (*pU) { ferencd@0: case 0: ferencd@0: // We *should* never get here, but in case we do, don't ferencd@0: // advance past the terminating null character, ever ferencd@0: return; ferencd@0: ferencd@0: case '\r': ferencd@0: // bump down to the next line ferencd@0: ++row; ferencd@0: col = 0; ferencd@0: // Eat the character ferencd@0: ++p; ferencd@0: ferencd@0: // Check for \r\n sequence, and treat this as a single character ferencd@0: if (*p == '\n') { ferencd@0: ++p; ferencd@0: } ferencd@0: break; ferencd@0: ferencd@0: case '\n': ferencd@0: // bump down to the next line ferencd@0: ++row; ferencd@0: col = 0; ferencd@0: ferencd@0: // Eat the character ferencd@0: ++p; ferencd@0: ferencd@0: // Check for \n\r sequence, and treat this as a single ferencd@0: // character. (Yes, this bizarre thing does occur still ferencd@0: // on some arcane platforms...) ferencd@0: if (*p == '\r') { ferencd@0: ++p; ferencd@0: } ferencd@0: break; ferencd@0: ferencd@0: case '\t': ferencd@0: // Eat the character ferencd@0: ++p; ferencd@0: ferencd@0: // Skip to next tab stop ferencd@0: col = (col / tabsize + 1) * tabsize; ferencd@0: break; ferencd@0: ferencd@0: case TIXML_UTF_LEAD_0: ferencd@0: if ( encoding == TIXML_ENCODING_UTF8 ) ferencd@0: { ferencd@0: if ( *(p+1) && *(p+2) ) ferencd@0: { ferencd@0: // In these cases, don't advance the column. These are ferencd@0: // 0-width spaces. ferencd@0: if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 ) ferencd@0: p += 3; ferencd@0: else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU ) ferencd@0: p += 3; ferencd@0: else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU ) ferencd@0: p += 3; ferencd@0: else ferencd@0: { p +=3; ++col; } // A normal character. ferencd@0: } ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: ++p; ferencd@0: ++col; ferencd@0: } ferencd@0: break; ferencd@0: ferencd@0: default: ferencd@0: if ( encoding == TIXML_ENCODING_UTF8 ) ferencd@0: { ferencd@0: // Eat the 1 to 4 byte utf8 character. ferencd@0: int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)]; ferencd@0: if ( step == 0 ) ferencd@0: step = 1; // Error case from bad encoding, but handle gracefully. ferencd@0: p += step; ferencd@0: ferencd@0: // Just advance one column, of course. ferencd@0: ++col; ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: ++p; ferencd@0: ++col; ferencd@0: } ferencd@0: break; ferencd@0: } ferencd@0: } ferencd@0: cursor.row = row; ferencd@0: cursor.col = col; ferencd@0: assert( cursor.row >= -1 ); ferencd@0: assert( cursor.col >= -1 ); ferencd@0: stamp = p; ferencd@0: assert( stamp ); ferencd@0: } ferencd@0: ferencd@0: ferencd@0: const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: if ( !p || !*p ) ferencd@0: { ferencd@0: return 0; ferencd@0: } ferencd@0: if ( encoding == TIXML_ENCODING_UTF8 ) ferencd@0: { ferencd@0: while ( *p ) ferencd@0: { ferencd@0: const unsigned char* pU = (const unsigned char*)p; ferencd@0: ferencd@0: // Skip the stupid Microsoft UTF-8 Byte order marks ferencd@0: if ( *(pU+0)==TIXML_UTF_LEAD_0 ferencd@0: && *(pU+1)==TIXML_UTF_LEAD_1 ferencd@0: && *(pU+2)==TIXML_UTF_LEAD_2 ) ferencd@0: { ferencd@0: p += 3; ferencd@0: continue; ferencd@0: } ferencd@0: else if(*(pU+0)==TIXML_UTF_LEAD_0 ferencd@0: && *(pU+1)==0xbfU ferencd@0: && *(pU+2)==0xbeU ) ferencd@0: { ferencd@0: p += 3; ferencd@0: continue; ferencd@0: } ferencd@0: else if(*(pU+0)==TIXML_UTF_LEAD_0 ferencd@0: && *(pU+1)==0xbfU ferencd@0: && *(pU+2)==0xbfU ) ferencd@0: { ferencd@0: p += 3; ferencd@0: continue; ferencd@0: } ferencd@0: ferencd@0: if ( IsWhiteSpace( *p ) ) // Still using old rules for white space. ferencd@0: ++p; ferencd@0: else ferencd@0: break; ferencd@0: } ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: while ( *p && IsWhiteSpace( *p ) ) ferencd@0: ++p; ferencd@0: } ferencd@0: ferencd@0: return p; ferencd@0: } ferencd@0: ferencd@0: #ifdef TIXML_USE_STL ferencd@0: /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag ) ferencd@0: { ferencd@0: for( ;; ) ferencd@0: { ferencd@0: if ( !in->good() ) return false; ferencd@0: ferencd@0: int c = in->peek(); ferencd@0: // At this scope, we can't get to a document. So fail silently. ferencd@0: if ( !IsWhiteSpace( c ) || c <= 0 ) ferencd@0: return true; ferencd@0: ferencd@0: *tag += (char) in->get(); ferencd@0: } ferencd@0: } ferencd@0: ferencd@0: /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag ) ferencd@0: { ferencd@0: //assert( character > 0 && character < 128 ); // else it won't work in utf-8 ferencd@0: while ( in->good() ) ferencd@0: { ferencd@0: int c = in->peek(); ferencd@0: if ( c == character ) ferencd@0: return true; ferencd@0: if ( c <= 0 ) // Silent failure: can't get document at this scope ferencd@0: return false; ferencd@0: ferencd@0: in->get(); ferencd@0: *tag += (char) c; ferencd@0: } ferencd@0: return false; ferencd@0: } ferencd@0: #endif ferencd@0: ferencd@0: // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The ferencd@0: // "assign" optimization removes over 10% of the execution time. ferencd@0: // ferencd@0: const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: // Oddly, not supported on some comilers, ferencd@0: //name->clear(); ferencd@0: // So use this: ferencd@0: *name = ""; ferencd@0: assert( p ); ferencd@0: ferencd@0: // Names start with letters or underscores. ferencd@0: // Of course, in unicode, tinyxml has no idea what a letter *is*. The ferencd@0: // algorithm is generous. ferencd@0: // ferencd@0: // After that, they can be letters, underscores, numbers, ferencd@0: // hyphens, or colons. (Colons are valid ony for namespaces, ferencd@0: // but tinyxml can't tell namespaces from names.) ferencd@0: if ( p && *p ferencd@0: && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) ) ferencd@0: { ferencd@0: const char* start = p; ferencd@0: while( p && *p ferencd@0: && ( IsAlphaNum( (unsigned char ) *p, encoding ) ferencd@0: || *p == '_' ferencd@0: || *p == '-' ferencd@0: || *p == '.' ferencd@0: || *p == ':' ) ) ferencd@0: { ferencd@0: //(*name) += *p; // expensive ferencd@0: ++p; ferencd@0: } ferencd@0: if ( p-start > 0 ) { ferencd@0: name->assign( start, p-start ); ferencd@0: } ferencd@0: return p; ferencd@0: } ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: // Presume an entity, and pull it out. ferencd@0: TIXML_STRING ent; ferencd@0: int i; ferencd@0: *length = 0; ferencd@0: ferencd@0: if ( *(p+1) && *(p+1) == '#' && *(p+2) ) ferencd@0: { ferencd@0: unsigned long ucs = 0; ferencd@0: ptrdiff_t delta = 0; ferencd@0: unsigned mult = 1; ferencd@0: ferencd@0: if ( *(p+2) == 'x' ) ferencd@0: { ferencd@0: // Hexadecimal. ferencd@0: if ( !*(p+3) ) return 0; ferencd@0: ferencd@0: const char* q = p+3; ferencd@0: q = strchr( q, ';' ); ferencd@0: ferencd@0: if ( !q || !*q ) return 0; ferencd@0: ferencd@0: delta = q-p; ferencd@0: --q; ferencd@0: ferencd@0: while ( *q != 'x' ) ferencd@0: { ferencd@0: if ( *q >= '0' && *q <= '9' ) ferencd@0: ucs += mult * (*q - '0'); ferencd@0: else if ( *q >= 'a' && *q <= 'f' ) ferencd@0: ucs += mult * (*q - 'a' + 10); ferencd@0: else if ( *q >= 'A' && *q <= 'F' ) ferencd@0: ucs += mult * (*q - 'A' + 10 ); ferencd@0: else ferencd@0: return 0; ferencd@0: mult *= 16; ferencd@0: --q; ferencd@0: } ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: // Decimal. ferencd@0: if ( !*(p+2) ) return 0; ferencd@0: ferencd@0: const char* q = p+2; ferencd@0: q = strchr( q, ';' ); ferencd@0: ferencd@0: if ( !q || !*q ) return 0; ferencd@0: ferencd@0: delta = q-p; ferencd@0: --q; ferencd@0: ferencd@0: while ( *q != '#' ) ferencd@0: { ferencd@0: if ( *q >= '0' && *q <= '9' ) ferencd@0: ucs += mult * (*q - '0'); ferencd@0: else ferencd@0: return 0; ferencd@0: mult *= 10; ferencd@0: --q; ferencd@0: } ferencd@0: } ferencd@0: if ( encoding == TIXML_ENCODING_UTF8 ) ferencd@0: { ferencd@0: // convert the UCS to UTF-8 ferencd@0: ConvertUTF32ToUTF8( ucs, value, length ); ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: *value = (char)ucs; ferencd@0: *length = 1; ferencd@0: } ferencd@0: return p + delta + 1; ferencd@0: } ferencd@0: ferencd@0: // Now try to match it. ferencd@0: for( i=0; iappend( cArr, len ); ferencd@0: } ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: bool whitespace = false; ferencd@0: ferencd@0: // Remove leading white space: ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: while ( p && *p ferencd@0: && !StringEqual( p, endTag, caseInsensitive, encoding ) ) ferencd@0: { ferencd@0: if ( *p == '\r' || *p == '\n' ) ferencd@0: { ferencd@0: whitespace = true; ferencd@0: ++p; ferencd@0: } ferencd@0: else if ( IsWhiteSpace( *p ) ) ferencd@0: { ferencd@0: whitespace = true; ferencd@0: ++p; ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: // If we've found whitespace, add it before the ferencd@0: // new character. Any whitespace just becomes a space. ferencd@0: if ( whitespace ) ferencd@0: { ferencd@0: (*text) += ' '; ferencd@0: whitespace = false; ferencd@0: } ferencd@0: int len; ferencd@0: char cArr[4] = { 0, 0, 0, 0 }; ferencd@0: p = GetChar( p, cArr, &len, encoding ); ferencd@0: if ( len == 1 ) ferencd@0: (*text) += cArr[0]; // more efficient ferencd@0: else ferencd@0: text->append( cArr, len ); ferencd@0: } ferencd@0: } ferencd@0: } ferencd@0: if ( p && *p ) ferencd@0: p += strlen( endTag ); ferencd@0: return ( p && *p ) ? p : 0; ferencd@0: } ferencd@0: ferencd@0: #ifdef TIXML_USE_STL ferencd@0: ferencd@0: void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag ) ferencd@0: { ferencd@0: // The basic issue with a document is that we don't know what we're ferencd@0: // streaming. Read something presumed to be a tag (and hope), then ferencd@0: // identify it, and call the appropriate stream method on the tag. ferencd@0: // ferencd@0: // This "pre-streaming" will never read the closing ">" so the ferencd@0: // sub-tag can orient itself. ferencd@0: ferencd@0: if ( !StreamTo( in, '<', tag ) ) ferencd@0: { ferencd@0: SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: return; ferencd@0: } ferencd@0: ferencd@0: while ( in->good() ) ferencd@0: { ferencd@0: int tagIndex = (int) tag->length(); ferencd@0: while ( in->good() && in->peek() != '>' ) ferencd@0: { ferencd@0: int c = in->get(); ferencd@0: if ( c <= 0 ) ferencd@0: { ferencd@0: SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: break; ferencd@0: } ferencd@0: (*tag) += (char) c; ferencd@0: } ferencd@0: ferencd@0: if ( in->good() ) ferencd@0: { ferencd@0: // We now have something we presume to be a node of ferencd@0: // some sort. Identify it, and call the node to ferencd@0: // continue streaming. ferencd@0: TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING ); ferencd@0: ferencd@0: if ( node ) ferencd@0: { ferencd@0: node->StreamIn( in, tag ); ferencd@0: bool isElement = node->ToElement() != 0; ferencd@0: delete node; ferencd@0: node = 0; ferencd@0: ferencd@0: // If this is the root element, we're done. Parsing will be ferencd@0: // done by the >> operator. ferencd@0: if ( isElement ) ferencd@0: { ferencd@0: return; ferencd@0: } ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: return; ferencd@0: } ferencd@0: } ferencd@0: } ferencd@0: // We should have returned sooner. ferencd@0: SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: } ferencd@0: ferencd@0: #endif ferencd@0: ferencd@0: const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: ClearError(); ferencd@0: ferencd@0: // Parse away, at the document level. Since a document ferencd@0: // contains nothing but other tags, most of what happens ferencd@0: // here is skipping white space. ferencd@0: if ( !p || !*p ) ferencd@0: { ferencd@0: SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: // Note that, for a document, this needs to come ferencd@0: // before the while space skip, so that parsing ferencd@0: // starts from the pointer we are given. ferencd@0: location.Clear(); ferencd@0: if ( prevData ) ferencd@0: { ferencd@0: location.row = prevData->cursor.row; ferencd@0: location.col = prevData->cursor.col; ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: location.row = 0; ferencd@0: location.col = 0; ferencd@0: } ferencd@0: TiXmlParsingData data( p, TabSize(), location.row, location.col ); ferencd@0: location = data.Cursor(); ferencd@0: ferencd@0: if ( encoding == TIXML_ENCODING_UNKNOWN ) ferencd@0: { ferencd@0: // Check for the Microsoft UTF-8 lead bytes. ferencd@0: const unsigned char* pU = (const unsigned char*)p; ferencd@0: if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 ferencd@0: && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 ferencd@0: && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 ) ferencd@0: { ferencd@0: encoding = TIXML_ENCODING_UTF8; ferencd@0: useMicrosoftBOM = true; ferencd@0: } ferencd@0: } ferencd@0: ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: if ( !p ) ferencd@0: { ferencd@0: SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: while ( p && *p ) ferencd@0: { ferencd@0: TiXmlNode* node = Identify( p, encoding ); ferencd@0: if ( node ) ferencd@0: { ferencd@0: p = node->Parse( p, &data, encoding ); ferencd@0: LinkEndChild( node ); ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: break; ferencd@0: } ferencd@0: ferencd@0: // Did we get encoding info? ferencd@0: if ( encoding == TIXML_ENCODING_UNKNOWN ferencd@0: && node->ToDeclaration() ) ferencd@0: { ferencd@0: TiXmlDeclaration* dec = node->ToDeclaration(); ferencd@0: const char* enc = dec->Encoding(); ferencd@0: assert( enc ); ferencd@0: ferencd@0: if ( *enc == 0 ) ferencd@0: encoding = TIXML_ENCODING_UTF8; ferencd@0: else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) ) ferencd@0: encoding = TIXML_ENCODING_UTF8; ferencd@0: else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) ferencd@0: encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice ferencd@0: else ferencd@0: encoding = TIXML_ENCODING_LEGACY; ferencd@0: } ferencd@0: ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: } ferencd@0: ferencd@0: // Was this empty? ferencd@0: if ( !firstChild ) { ferencd@0: SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: // All is well. ferencd@0: return p; ferencd@0: } ferencd@0: ferencd@0: void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: // The first error in a chain is more accurate - don't set again! ferencd@0: if ( error ) ferencd@0: return; ferencd@0: ferencd@0: assert( err > 0 && err < TIXML_ERROR_STRING_COUNT ); ferencd@0: error = true; ferencd@0: errorId = err; ferencd@0: errorDesc = errorString[ errorId ]; ferencd@0: ferencd@0: errorLocation.Clear(); ferencd@0: if ( pError && data ) ferencd@0: { ferencd@0: data->Stamp( pError, encoding ); ferencd@0: errorLocation = data->Cursor(); ferencd@0: } ferencd@0: } ferencd@0: ferencd@0: ferencd@0: TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: TiXmlNode* returnNode = 0; ferencd@0: ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: if( !p || !*p || *p != '<' ) ferencd@0: { ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: ferencd@0: if ( !p || !*p ) ferencd@0: { ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: // What is this thing? ferencd@0: // - Elements start with a letter or underscore, but xml is reserved. ferencd@0: // - Comments: "; ferencd@0: ferencd@0: if ( !StringEqual( p, startTag, false, encoding ) ) ferencd@0: { ferencd@0: if ( document ) ferencd@0: document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: p += strlen( startTag ); ferencd@0: ferencd@0: // [ 1475201 ] TinyXML parses entities in comments ferencd@0: // Oops - ReadText doesn't work, because we don't want to parse the entities. ferencd@0: // p = ReadText( p, &value, false, endTag, false, encoding ); ferencd@0: // ferencd@0: // from the XML spec: ferencd@0: /* ferencd@0: [Definition: Comments may appear anywhere in a document outside other markup; in addition, ferencd@0: they may appear within the document type declaration at places allowed by the grammar. ferencd@0: They are not part of the document's character data; an XML processor MAY, but need not, ferencd@0: make it possible for an application to retrieve the text of comments. For compatibility, ferencd@0: the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity ferencd@0: references MUST NOT be recognized within comments. ferencd@0: ferencd@0: An example of a comment: ferencd@0: ferencd@0: ferencd@0: */ ferencd@0: ferencd@0: value = ""; ferencd@0: // Keep all the white space. ferencd@0: while ( p && *p && !StringEqual( p, endTag, false, encoding ) ) ferencd@0: { ferencd@0: value.append( p, 1 ); ferencd@0: ++p; ferencd@0: } ferencd@0: if ( p && *p ) ferencd@0: p += strlen( endTag ); ferencd@0: ferencd@0: return p; ferencd@0: } ferencd@0: ferencd@0: ferencd@0: const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: if ( !p || !*p ) return 0; ferencd@0: ferencd@0: if ( data ) ferencd@0: { ferencd@0: data->Stamp( p, encoding ); ferencd@0: location = data->Cursor(); ferencd@0: } ferencd@0: // Read the name, the '=' and the value. ferencd@0: const char* pErr = p; ferencd@0: p = ReadName( p, &name, encoding ); ferencd@0: if ( !p || !*p ) ferencd@0: { ferencd@0: if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: if ( !p || !*p || *p != '=' ) ferencd@0: { ferencd@0: if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: ++p; // skip '=' ferencd@0: p = SkipWhiteSpace( p, encoding ); ferencd@0: if ( !p || !*p ) ferencd@0: { ferencd@0: if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: const char* end; ferencd@0: const char SINGLE_QUOTE = '\''; ferencd@0: const char DOUBLE_QUOTE = '\"'; ferencd@0: ferencd@0: if ( *p == SINGLE_QUOTE ) ferencd@0: { ferencd@0: ++p; ferencd@0: end = "\'"; // single quote in string ferencd@0: p = ReadText( p, &value, false, end, false, encoding ); ferencd@0: } ferencd@0: else if ( *p == DOUBLE_QUOTE ) ferencd@0: { ferencd@0: ++p; ferencd@0: end = "\""; // double quote in string ferencd@0: p = ReadText( p, &value, false, end, false, encoding ); ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: // All attribute values should be in single or double quotes. ferencd@0: // But this is such a common error that the parser will try ferencd@0: // its best, even without them. ferencd@0: value = ""; ferencd@0: while ( p && *p // existence ferencd@0: && !IsWhiteSpace( *p ) // whitespace ferencd@0: && *p != '/' && *p != '>' ) // tag end ferencd@0: { ferencd@0: if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) { ferencd@0: // [ 1451649 ] Attribute values with trailing quotes not handled correctly ferencd@0: // We did not have an opening quote but seem to have a ferencd@0: // closing one. Give up and throw an error. ferencd@0: if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: value += *p; ferencd@0: ++p; ferencd@0: } ferencd@0: } ferencd@0: return p; ferencd@0: } ferencd@0: ferencd@0: #ifdef TIXML_USE_STL ferencd@0: void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag ) ferencd@0: { ferencd@0: while ( in->good() ) ferencd@0: { ferencd@0: int c = in->peek(); ferencd@0: if ( !cdata && (c == '<' ) ) ferencd@0: { ferencd@0: return; ferencd@0: } ferencd@0: if ( c <= 0 ) ferencd@0: { ferencd@0: TiXmlDocument* document = GetDocument(); ferencd@0: if ( document ) ferencd@0: document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: return; ferencd@0: } ferencd@0: ferencd@0: (*tag) += (char) c; ferencd@0: in->get(); // "commits" the peek made above ferencd@0: ferencd@0: if ( cdata && c == '>' && tag->size() >= 3 ) { ferencd@0: size_t len = tag->size(); ferencd@0: if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) { ferencd@0: // terminator of cdata. ferencd@0: return; ferencd@0: } ferencd@0: } ferencd@0: } ferencd@0: } ferencd@0: #endif ferencd@0: ferencd@0: const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) ferencd@0: { ferencd@0: value = ""; ferencd@0: TiXmlDocument* document = GetDocument(); ferencd@0: ferencd@0: if ( data ) ferencd@0: { ferencd@0: data->Stamp( p, encoding ); ferencd@0: location = data->Cursor(); ferencd@0: } ferencd@0: ferencd@0: const char* const startTag = ""; ferencd@0: ferencd@0: if ( cdata || StringEqual( p, startTag, false, encoding ) ) ferencd@0: { ferencd@0: cdata = true; ferencd@0: ferencd@0: if ( !StringEqual( p, startTag, false, encoding ) ) ferencd@0: { ferencd@0: if ( document ) ferencd@0: document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: p += strlen( startTag ); ferencd@0: ferencd@0: // Keep all the white space, ignore the encoding, etc. ferencd@0: while ( p && *p ferencd@0: && !StringEqual( p, endTag, false, encoding ) ferencd@0: ) ferencd@0: { ferencd@0: value += *p; ferencd@0: ++p; ferencd@0: } ferencd@0: ferencd@0: TIXML_STRING dummy; ferencd@0: p = ReadText( p, &dummy, false, endTag, false, encoding ); ferencd@0: return p; ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: bool ignoreWhite = true; ferencd@0: ferencd@0: const char* end = "<"; ferencd@0: p = ReadText( p, &value, ignoreWhite, end, false, encoding ); ferencd@0: if ( p && *p ) ferencd@0: return p-1; // don't truncate the '<' ferencd@0: return 0; ferencd@0: } ferencd@0: } ferencd@0: ferencd@0: #ifdef TIXML_USE_STL ferencd@0: void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag ) ferencd@0: { ferencd@0: while ( in->good() ) ferencd@0: { ferencd@0: int c = in->get(); ferencd@0: if ( c <= 0 ) ferencd@0: { ferencd@0: TiXmlDocument* document = GetDocument(); ferencd@0: if ( document ) ferencd@0: document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); ferencd@0: return; ferencd@0: } ferencd@0: (*tag) += (char) c; ferencd@0: ferencd@0: if ( c == '>' ) ferencd@0: { ferencd@0: // All is well. ferencd@0: return; ferencd@0: } ferencd@0: } ferencd@0: } ferencd@0: #endif ferencd@0: ferencd@0: const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding ) ferencd@0: { ferencd@0: p = SkipWhiteSpace( p, _encoding ); ferencd@0: // Find the beginning, find the end, and look for ferencd@0: // the stuff in-between. ferencd@0: TiXmlDocument* document = GetDocument(); ferencd@0: if ( !p || !*p || !StringEqual( p, "SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding ); ferencd@0: return 0; ferencd@0: } ferencd@0: if ( data ) ferencd@0: { ferencd@0: data->Stamp( p, _encoding ); ferencd@0: location = data->Cursor(); ferencd@0: } ferencd@0: p += 5; ferencd@0: ferencd@0: version = ""; ferencd@0: encoding = ""; ferencd@0: standalone = ""; ferencd@0: ferencd@0: while ( p && *p ) ferencd@0: { ferencd@0: if ( *p == '>' ) ferencd@0: { ferencd@0: ++p; ferencd@0: return p; ferencd@0: } ferencd@0: ferencd@0: p = SkipWhiteSpace( p, _encoding ); ferencd@0: if ( StringEqual( p, "version", true, _encoding ) ) ferencd@0: { ferencd@0: TiXmlAttribute attrib; ferencd@0: p = attrib.Parse( p, data, _encoding ); ferencd@0: version = attrib.Value(); ferencd@0: } ferencd@0: else if ( StringEqual( p, "encoding", true, _encoding ) ) ferencd@0: { ferencd@0: TiXmlAttribute attrib; ferencd@0: p = attrib.Parse( p, data, _encoding ); ferencd@0: encoding = attrib.Value(); ferencd@0: } ferencd@0: else if ( StringEqual( p, "standalone", true, _encoding ) ) ferencd@0: { ferencd@0: TiXmlAttribute attrib; ferencd@0: p = attrib.Parse( p, data, _encoding ); ferencd@0: standalone = attrib.Value(); ferencd@0: } ferencd@0: else ferencd@0: { ferencd@0: // Read over whatever it is. ferencd@0: while( p && *p && *p != '>' && !IsWhiteSpace( *p ) ) ferencd@0: ++p; ferencd@0: } ferencd@0: } ferencd@0: return 0; ferencd@0: } ferencd@0: ferencd@0: bool TiXmlText::Blank() const ferencd@0: { ferencd@0: for ( unsigned i=0; i