Mercurial > thymian
comparison 3rdparty/tinyxml/tinyxmlparser.cpp @ 0:a4671277546c tip
created the repository for the thymian project
| author | ferencd |
|---|---|
| date | Tue, 17 Aug 2021 11:19:54 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a4671277546c |
|---|---|
| 1 /* | |
| 2 www.sourceforge.net/projects/tinyxml | |
| 3 Original code by Lee Thomason (www.grinninglizard.com) | |
| 4 | |
| 5 This software is provided 'as-is', without any express or implied | |
| 6 warranty. In no event will the authors be held liable for any | |
| 7 damages arising from the use of this software. | |
| 8 | |
| 9 Permission is granted to anyone to use this software for any | |
| 10 purpose, including commercial applications, and to alter it and | |
| 11 redistribute it freely, subject to the following restrictions: | |
| 12 | |
| 13 1. The origin of this software must not be misrepresented; you must | |
| 14 not claim that you wrote the original software. If you use this | |
| 15 software in a product, an acknowledgment in the product documentation | |
| 16 would be appreciated but is not required. | |
| 17 | |
| 18 2. Altered source versions must be plainly marked as such, and | |
| 19 must not be misrepresented as being the original software. | |
| 20 | |
| 21 3. This notice may not be removed or altered from any source | |
| 22 distribution. | |
| 23 */ | |
| 24 | |
| 25 #include <ctype.h> | |
| 26 #include <stddef.h> | |
| 27 | |
| 28 #include "tinyxml.h" | |
| 29 | |
| 30 //#define DEBUG_PARSER | |
| 31 #if defined( DEBUG_PARSER ) | |
| 32 # if defined( DEBUG ) && defined( _MSC_VER ) | |
| 33 # include <windows.h> | |
| 34 # define TIXML_LOG OutputDebugString | |
| 35 # else | |
| 36 # define TIXML_LOG printf | |
| 37 # endif | |
| 38 #endif | |
| 39 | |
| 40 // Note tha "PutString" hardcodes the same list. This | |
| 41 // is less flexible than it appears. Changing the entries | |
| 42 // or order will break putstring. | |
| 43 TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] = | |
| 44 { | |
| 45 { "&", 5, '&' }, | |
| 46 { "<", 4, '<' }, | |
| 47 { ">", 4, '>' }, | |
| 48 { """, 6, '\"' }, | |
| 49 { "'", 6, '\'' } | |
| 50 }; | |
| 51 | |
| 52 // Bunch of unicode info at: | |
| 53 // http://www.unicode.org/faq/utf_bom.html | |
| 54 // Including the basic of this table, which determines the #bytes in the | |
| 55 // sequence from the lead byte. 1 placed for invalid sequences -- | |
| 56 // although the result will be junk, pass it through as much as possible. | |
| 57 // Beware of the non-characters in UTF-8: | |
| 58 // ef bb bf (Microsoft "lead bytes") | |
| 59 // ef bf be | |
| 60 // ef bf bf | |
| 61 | |
| 62 const unsigned char TIXML_UTF_LEAD_0 = 0xefU; | |
| 63 const unsigned char TIXML_UTF_LEAD_1 = 0xbbU; | |
| 64 const unsigned char TIXML_UTF_LEAD_2 = 0xbfU; | |
| 65 | |
| 66 const int TiXmlBase::utf8ByteTable[256] = | |
| 67 { | |
| 68 // 0 1 2 3 4 5 6 7 8 9 a b c d e f | |
| 69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 | |
| 70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 | |
| 71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 | |
| 72 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 | |
| 73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 | |
| 74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 | |
| 75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 | |
| 76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range | |
| 77 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid | |
| 78 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 | |
| 79 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 | |
| 80 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 | |
| 81 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte | |
| 82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 | |
| 83 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte | |
| 84 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid | |
| 85 }; | |
| 86 | |
| 87 | |
| 88 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length ) | |
| 89 { | |
| 90 const unsigned long BYTE_MASK = 0xBF; | |
| 91 const unsigned long BYTE_MARK = 0x80; | |
| 92 const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; | |
| 93 | |
| 94 if (input < 0x80) | |
| 95 *length = 1; | |
| 96 else if ( input < 0x800 ) | |
| 97 *length = 2; | |
| 98 else if ( input < 0x10000 ) | |
| 99 *length = 3; | |
| 100 else if ( input < 0x200000 ) | |
| 101 *length = 4; | |
| 102 else | |
| 103 { *length = 0; return; } // This code won't covert this correctly anyway. | |
| 104 | |
| 105 output += *length; | |
| 106 | |
| 107 // Scary scary fall throughs. | |
| 108 switch (*length) | |
| 109 { | |
| 110 case 4: | |
| 111 --output; | |
| 112 *output = static_cast<char>((input | BYTE_MARK) & BYTE_MASK); | |
| 113 input >>= 6; | |
| 114 case 3: | |
| 115 --output; | |
| 116 *output = static_cast<char>((input | BYTE_MARK) & BYTE_MASK); | |
| 117 input >>= 6; | |
| 118 case 2: | |
| 119 --output; | |
| 120 *output = static_cast<char>((input | BYTE_MARK) & BYTE_MASK); | |
| 121 input >>= 6; | |
| 122 case 1: | |
| 123 --output; | |
| 124 *output = static_cast<char>(input | FIRST_BYTE_MARK[*length]); | |
| 125 default: | |
| 126 break; | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 | |
| 131 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) | |
| 132 { | |
| 133 // This will only work for low-ascii, everything else is assumed to be a valid | |
| 134 // letter. I'm not sure this is the best approach, but it is quite tricky trying | |
| 135 // to figure out alhabetical vs. not across encoding. So take a very | |
| 136 // conservative approach. | |
| 137 | |
| 138 // if ( encoding == TIXML_ENCODING_UTF8 ) | |
| 139 // { | |
| 140 if ( anyByte < 127 ) | |
| 141 return isalpha( anyByte ); | |
| 142 else | |
| 143 return 1; // What else to do? The unicode set is huge...get the english ones right. | |
| 144 // } | |
| 145 // else | |
| 146 // { | |
| 147 // return isalpha( anyByte ); | |
| 148 // } | |
| 149 } | |
| 150 | |
| 151 | |
| 152 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) | |
| 153 { | |
| 154 // This will only work for low-ascii, everything else is assumed to be a valid | |
| 155 // letter. I'm not sure this is the best approach, but it is quite tricky trying | |
| 156 // to figure out alhabetical vs. not across encoding. So take a very | |
| 157 // conservative approach. | |
| 158 | |
| 159 // if ( encoding == TIXML_ENCODING_UTF8 ) | |
| 160 // { | |
| 161 if ( anyByte < 127 ) | |
| 162 return isalnum( anyByte ); | |
| 163 else | |
| 164 return 1; // What else to do? The unicode set is huge...get the english ones right. | |
| 165 // } | |
| 166 // else | |
| 167 // { | |
| 168 // return isalnum( anyByte ); | |
| 169 // } | |
| 170 } | |
| 171 | |
| 172 | |
| 173 class TiXmlParsingData | |
| 174 { | |
| 175 friend class TiXmlDocument; | |
| 176 public: | |
| 177 void Stamp( const char* now, TiXmlEncoding encoding ); | |
| 178 | |
| 179 const TiXmlCursor& Cursor() const { return cursor; } | |
| 180 | |
| 181 private: | |
| 182 // Only used by the document! | |
| 183 TiXmlParsingData( const char* start, int _tabsize, int row, int col ) | |
| 184 { | |
| 185 assert( start ); | |
| 186 stamp = start; | |
| 187 tabsize = _tabsize; | |
| 188 cursor.row = row; | |
| 189 cursor.col = col; | |
| 190 } | |
| 191 | |
| 192 TiXmlCursor cursor; | |
| 193 const char* stamp; | |
| 194 int tabsize; | |
| 195 }; | |
| 196 | |
| 197 | |
| 198 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding ) | |
| 199 { | |
| 200 assert( now ); | |
| 201 | |
| 202 // Do nothing if the tabsize is 0. | |
| 203 if ( tabsize < 1 ) | |
| 204 { | |
| 205 return; | |
| 206 } | |
| 207 | |
| 208 // Get the current row, column. | |
| 209 int row = cursor.row; | |
| 210 int col = cursor.col; | |
| 211 const char* p = stamp; | |
| 212 assert( p ); | |
| 213 | |
| 214 while ( p < now ) | |
| 215 { | |
| 216 // Treat p as unsigned, so we have a happy compiler. | |
| 217 const unsigned char* pU = reinterpret_cast<const unsigned char*>(p); | |
| 218 | |
| 219 // Code contributed by Fletcher Dunn: (modified by lee) | |
| 220 switch (*pU) { | |
| 221 case 0: | |
| 222 // We *should* never get here, but in case we do, don't | |
| 223 // advance past the terminating null character, ever | |
| 224 return; | |
| 225 | |
| 226 case '\r': | |
| 227 // bump down to the next line | |
| 228 ++row; | |
| 229 col = 0; | |
| 230 // Eat the character | |
| 231 ++p; | |
| 232 | |
| 233 // Check for \r\n sequence, and treat this as a single character | |
| 234 if (*p == '\n') { | |
| 235 ++p; | |
| 236 } | |
| 237 break; | |
| 238 | |
| 239 case '\n': | |
| 240 // bump down to the next line | |
| 241 ++row; | |
| 242 col = 0; | |
| 243 | |
| 244 // Eat the character | |
| 245 ++p; | |
| 246 | |
| 247 // Check for \n\r sequence, and treat this as a single | |
| 248 // character. (Yes, this bizarre thing does occur still | |
| 249 // on some arcane platforms...) | |
| 250 if (*p == '\r') { | |
| 251 ++p; | |
| 252 } | |
| 253 break; | |
| 254 | |
| 255 case '\t': | |
| 256 // Eat the character | |
| 257 ++p; | |
| 258 | |
| 259 // Skip to next tab stop | |
| 260 col = (col / tabsize + 1) * tabsize; | |
| 261 break; | |
| 262 | |
| 263 case TIXML_UTF_LEAD_0: | |
| 264 if ( encoding == TIXML_ENCODING_UTF8 ) | |
| 265 { | |
| 266 if ( *(p+1) && *(p+2) ) | |
| 267 { | |
| 268 // In these cases, don't advance the column. These are | |
| 269 // 0-width spaces. | |
| 270 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 ) | |
| 271 p += 3; | |
| 272 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU ) | |
| 273 p += 3; | |
| 274 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU ) | |
| 275 p += 3; | |
| 276 else | |
| 277 { p +=3; ++col; } // A normal character. | |
| 278 } | |
| 279 } | |
| 280 else | |
| 281 { | |
| 282 ++p; | |
| 283 ++col; | |
| 284 } | |
| 285 break; | |
| 286 | |
| 287 default: | |
| 288 if ( encoding == TIXML_ENCODING_UTF8 ) | |
| 289 { | |
| 290 // Eat the 1 to 4 byte utf8 character. | |
| 291 int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)]; | |
| 292 if ( step == 0 ) | |
| 293 step = 1; // Error case from bad encoding, but handle gracefully. | |
| 294 p += step; | |
| 295 | |
| 296 // Just advance one column, of course. | |
| 297 ++col; | |
| 298 } | |
| 299 else | |
| 300 { | |
| 301 ++p; | |
| 302 ++col; | |
| 303 } | |
| 304 break; | |
| 305 } | |
| 306 } | |
| 307 cursor.row = row; | |
| 308 cursor.col = col; | |
| 309 assert( cursor.row >= -1 ); | |
| 310 assert( cursor.col >= -1 ); | |
| 311 stamp = p; | |
| 312 assert( stamp ); | |
| 313 } | |
| 314 | |
| 315 | |
| 316 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) | |
| 317 { | |
| 318 if ( !p || !*p ) | |
| 319 { | |
| 320 return 0; | |
| 321 } | |
| 322 if ( encoding == TIXML_ENCODING_UTF8 ) | |
| 323 { | |
| 324 while ( *p ) | |
| 325 { | |
| 326 const unsigned char* pU = (const unsigned char*)p; | |
| 327 | |
| 328 // Skip the stupid Microsoft UTF-8 Byte order marks | |
| 329 if ( *(pU+0)==TIXML_UTF_LEAD_0 | |
| 330 && *(pU+1)==TIXML_UTF_LEAD_1 | |
| 331 && *(pU+2)==TIXML_UTF_LEAD_2 ) | |
| 332 { | |
| 333 p += 3; | |
| 334 continue; | |
| 335 } | |
| 336 else if(*(pU+0)==TIXML_UTF_LEAD_0 | |
| 337 && *(pU+1)==0xbfU | |
| 338 && *(pU+2)==0xbeU ) | |
| 339 { | |
| 340 p += 3; | |
| 341 continue; | |
| 342 } | |
| 343 else if(*(pU+0)==TIXML_UTF_LEAD_0 | |
| 344 && *(pU+1)==0xbfU | |
| 345 && *(pU+2)==0xbfU ) | |
| 346 { | |
| 347 p += 3; | |
| 348 continue; | |
| 349 } | |
| 350 | |
| 351 if ( IsWhiteSpace( *p ) ) // Still using old rules for white space. | |
| 352 ++p; | |
| 353 else | |
| 354 break; | |
| 355 } | |
| 356 } | |
| 357 else | |
| 358 { | |
| 359 while ( *p && IsWhiteSpace( *p ) ) | |
| 360 ++p; | |
| 361 } | |
| 362 | |
| 363 return p; | |
| 364 } | |
| 365 | |
| 366 #ifdef TIXML_USE_STL | |
| 367 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag ) | |
| 368 { | |
| 369 for( ;; ) | |
| 370 { | |
| 371 if ( !in->good() ) return false; | |
| 372 | |
| 373 int c = in->peek(); | |
| 374 // At this scope, we can't get to a document. So fail silently. | |
| 375 if ( !IsWhiteSpace( c ) || c <= 0 ) | |
| 376 return true; | |
| 377 | |
| 378 *tag += (char) in->get(); | |
| 379 } | |
| 380 } | |
| 381 | |
| 382 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag ) | |
| 383 { | |
| 384 //assert( character > 0 && character < 128 ); // else it won't work in utf-8 | |
| 385 while ( in->good() ) | |
| 386 { | |
| 387 int c = in->peek(); | |
| 388 if ( c == character ) | |
| 389 return true; | |
| 390 if ( c <= 0 ) // Silent failure: can't get document at this scope | |
| 391 return false; | |
| 392 | |
| 393 in->get(); | |
| 394 *tag += (char) c; | |
| 395 } | |
| 396 return false; | |
| 397 } | |
| 398 #endif | |
| 399 | |
| 400 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The | |
| 401 // "assign" optimization removes over 10% of the execution time. | |
| 402 // | |
| 403 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ) | |
| 404 { | |
| 405 // Oddly, not supported on some comilers, | |
| 406 //name->clear(); | |
| 407 // So use this: | |
| 408 *name = ""; | |
| 409 assert( p ); | |
| 410 | |
| 411 // Names start with letters or underscores. | |
| 412 // Of course, in unicode, tinyxml has no idea what a letter *is*. The | |
| 413 // algorithm is generous. | |
| 414 // | |
| 415 // After that, they can be letters, underscores, numbers, | |
| 416 // hyphens, or colons. (Colons are valid ony for namespaces, | |
| 417 // but tinyxml can't tell namespaces from names.) | |
| 418 if ( p && *p | |
| 419 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) ) | |
| 420 { | |
| 421 const char* start = p; | |
| 422 while( p && *p | |
| 423 && ( IsAlphaNum( (unsigned char ) *p, encoding ) | |
| 424 || *p == '_' | |
| 425 || *p == '-' | |
| 426 || *p == '.' | |
| 427 || *p == ':' ) ) | |
| 428 { | |
| 429 //(*name) += *p; // expensive | |
| 430 ++p; | |
| 431 } | |
| 432 if ( p-start > 0 ) { | |
| 433 name->assign( start, p-start ); | |
| 434 } | |
| 435 return p; | |
| 436 } | |
| 437 return 0; | |
| 438 } | |
| 439 | |
| 440 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding ) | |
| 441 { | |
| 442 // Presume an entity, and pull it out. | |
| 443 TIXML_STRING ent; | |
| 444 int i; | |
| 445 *length = 0; | |
| 446 | |
| 447 if ( *(p+1) && *(p+1) == '#' && *(p+2) ) | |
| 448 { | |
| 449 unsigned long ucs = 0; | |
| 450 ptrdiff_t delta = 0; | |
| 451 unsigned mult = 1; | |
| 452 | |
| 453 if ( *(p+2) == 'x' ) | |
| 454 { | |
| 455 // Hexadecimal. | |
| 456 if ( !*(p+3) ) return 0; | |
| 457 | |
| 458 const char* q = p+3; | |
| 459 q = strchr( q, ';' ); | |
| 460 | |
| 461 if ( !q || !*q ) return 0; | |
| 462 | |
| 463 delta = q-p; | |
| 464 --q; | |
| 465 | |
| 466 while ( *q != 'x' ) | |
| 467 { | |
| 468 if ( *q >= '0' && *q <= '9' ) | |
| 469 ucs += mult * (*q - '0'); | |
| 470 else if ( *q >= 'a' && *q <= 'f' ) | |
| 471 ucs += mult * (*q - 'a' + 10); | |
| 472 else if ( *q >= 'A' && *q <= 'F' ) | |
| 473 ucs += mult * (*q - 'A' + 10 ); | |
| 474 else | |
| 475 return 0; | |
| 476 mult *= 16; | |
| 477 --q; | |
| 478 } | |
| 479 } | |
| 480 else | |
| 481 { | |
| 482 // Decimal. | |
| 483 if ( !*(p+2) ) return 0; | |
| 484 | |
| 485 const char* q = p+2; | |
| 486 q = strchr( q, ';' ); | |
| 487 | |
| 488 if ( !q || !*q ) return 0; | |
| 489 | |
| 490 delta = q-p; | |
| 491 --q; | |
| 492 | |
| 493 while ( *q != '#' ) | |
| 494 { | |
| 495 if ( *q >= '0' && *q <= '9' ) | |
| 496 ucs += mult * (*q - '0'); | |
| 497 else | |
| 498 return 0; | |
| 499 mult *= 10; | |
| 500 --q; | |
| 501 } | |
| 502 } | |
| 503 if ( encoding == TIXML_ENCODING_UTF8 ) | |
| 504 { | |
| 505 // convert the UCS to UTF-8 | |
| 506 ConvertUTF32ToUTF8( ucs, value, length ); | |
| 507 } | |
| 508 else | |
| 509 { | |
| 510 *value = (char)ucs; | |
| 511 *length = 1; | |
| 512 } | |
| 513 return p + delta + 1; | |
| 514 } | |
| 515 | |
| 516 // Now try to match it. | |
| 517 for( i=0; i<NUM_ENTITY; ++i ) | |
| 518 { | |
| 519 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 ) | |
| 520 { | |
| 521 assert( strlen( entity[i].str ) == entity[i].strLength ); | |
| 522 *value = entity[i].chr; | |
| 523 *length = 1; | |
| 524 return ( p + entity[i].strLength ); | |
| 525 } | |
| 526 } | |
| 527 | |
| 528 // So it wasn't an entity, its unrecognized, or something like that. | |
| 529 *value = *p; // Don't put back the last one, since we return it! | |
| 530 //*length = 1; // Leave unrecognized entities - this doesn't really work. | |
| 531 // Just writes strange XML. | |
| 532 return p+1; | |
| 533 } | |
| 534 | |
| 535 | |
| 536 bool TiXmlBase::StringEqual( const char* p, | |
| 537 const char* tag, | |
| 538 bool ignoreCase, | |
| 539 TiXmlEncoding encoding ) | |
| 540 { | |
| 541 assert( p ); | |
| 542 assert( tag ); | |
| 543 if ( !p || !*p ) | |
| 544 { | |
| 545 assert( 0 ); | |
| 546 return false; | |
| 547 } | |
| 548 | |
| 549 const char* q = p; | |
| 550 | |
| 551 if ( ignoreCase ) | |
| 552 { | |
| 553 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) ) | |
| 554 { | |
| 555 ++q; | |
| 556 ++tag; | |
| 557 } | |
| 558 | |
| 559 if ( *tag == 0 ) | |
| 560 return true; | |
| 561 } | |
| 562 else | |
| 563 { | |
| 564 while ( *q && *tag && *q == *tag ) | |
| 565 { | |
| 566 ++q; | |
| 567 ++tag; | |
| 568 } | |
| 569 | |
| 570 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal? | |
| 571 return true; | |
| 572 } | |
| 573 return false; | |
| 574 } | |
| 575 | |
| 576 const char* TiXmlBase::ReadText( const char* p, | |
| 577 TIXML_STRING * text, | |
| 578 bool trimWhiteSpace, | |
| 579 const char* endTag, | |
| 580 bool caseInsensitive, | |
| 581 TiXmlEncoding encoding ) | |
| 582 { | |
| 583 *text = ""; | |
| 584 if ( !trimWhiteSpace // certain tags always keep whitespace | |
| 585 || !condenseWhiteSpace ) // if true, whitespace is always kept | |
| 586 { | |
| 587 // Keep all the white space. | |
| 588 while ( p && *p | |
| 589 && !StringEqual( p, endTag, caseInsensitive, encoding ) | |
| 590 ) | |
| 591 { | |
| 592 int len; | |
| 593 char cArr[4] = { 0, 0, 0, 0 }; | |
| 594 p = GetChar( p, cArr, &len, encoding ); | |
| 595 text->append( cArr, len ); | |
| 596 } | |
| 597 } | |
| 598 else | |
| 599 { | |
| 600 bool whitespace = false; | |
| 601 | |
| 602 // Remove leading white space: | |
| 603 p = SkipWhiteSpace( p, encoding ); | |
| 604 while ( p && *p | |
| 605 && !StringEqual( p, endTag, caseInsensitive, encoding ) ) | |
| 606 { | |
| 607 if ( *p == '\r' || *p == '\n' ) | |
| 608 { | |
| 609 whitespace = true; | |
| 610 ++p; | |
| 611 } | |
| 612 else if ( IsWhiteSpace( *p ) ) | |
| 613 { | |
| 614 whitespace = true; | |
| 615 ++p; | |
| 616 } | |
| 617 else | |
| 618 { | |
| 619 // If we've found whitespace, add it before the | |
| 620 // new character. Any whitespace just becomes a space. | |
| 621 if ( whitespace ) | |
| 622 { | |
| 623 (*text) += ' '; | |
| 624 whitespace = false; | |
| 625 } | |
| 626 int len; | |
| 627 char cArr[4] = { 0, 0, 0, 0 }; | |
| 628 p = GetChar( p, cArr, &len, encoding ); | |
| 629 if ( len == 1 ) | |
| 630 (*text) += cArr[0]; // more efficient | |
| 631 else | |
| 632 text->append( cArr, len ); | |
| 633 } | |
| 634 } | |
| 635 } | |
| 636 if ( p && *p ) | |
| 637 p += strlen( endTag ); | |
| 638 return ( p && *p ) ? p : 0; | |
| 639 } | |
| 640 | |
| 641 #ifdef TIXML_USE_STL | |
| 642 | |
| 643 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag ) | |
| 644 { | |
| 645 // The basic issue with a document is that we don't know what we're | |
| 646 // streaming. Read something presumed to be a tag (and hope), then | |
| 647 // identify it, and call the appropriate stream method on the tag. | |
| 648 // | |
| 649 // This "pre-streaming" will never read the closing ">" so the | |
| 650 // sub-tag can orient itself. | |
| 651 | |
| 652 if ( !StreamTo( in, '<', tag ) ) | |
| 653 { | |
| 654 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 655 return; | |
| 656 } | |
| 657 | |
| 658 while ( in->good() ) | |
| 659 { | |
| 660 int tagIndex = (int) tag->length(); | |
| 661 while ( in->good() && in->peek() != '>' ) | |
| 662 { | |
| 663 int c = in->get(); | |
| 664 if ( c <= 0 ) | |
| 665 { | |
| 666 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 667 break; | |
| 668 } | |
| 669 (*tag) += (char) c; | |
| 670 } | |
| 671 | |
| 672 if ( in->good() ) | |
| 673 { | |
| 674 // We now have something we presume to be a node of | |
| 675 // some sort. Identify it, and call the node to | |
| 676 // continue streaming. | |
| 677 TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING ); | |
| 678 | |
| 679 if ( node ) | |
| 680 { | |
| 681 node->StreamIn( in, tag ); | |
| 682 bool isElement = node->ToElement() != 0; | |
| 683 delete node; | |
| 684 node = 0; | |
| 685 | |
| 686 // If this is the root element, we're done. Parsing will be | |
| 687 // done by the >> operator. | |
| 688 if ( isElement ) | |
| 689 { | |
| 690 return; | |
| 691 } | |
| 692 } | |
| 693 else | |
| 694 { | |
| 695 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 696 return; | |
| 697 } | |
| 698 } | |
| 699 } | |
| 700 // We should have returned sooner. | |
| 701 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 702 } | |
| 703 | |
| 704 #endif | |
| 705 | |
| 706 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding ) | |
| 707 { | |
| 708 ClearError(); | |
| 709 | |
| 710 // Parse away, at the document level. Since a document | |
| 711 // contains nothing but other tags, most of what happens | |
| 712 // here is skipping white space. | |
| 713 if ( !p || !*p ) | |
| 714 { | |
| 715 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 716 return 0; | |
| 717 } | |
| 718 | |
| 719 // Note that, for a document, this needs to come | |
| 720 // before the while space skip, so that parsing | |
| 721 // starts from the pointer we are given. | |
| 722 location.Clear(); | |
| 723 if ( prevData ) | |
| 724 { | |
| 725 location.row = prevData->cursor.row; | |
| 726 location.col = prevData->cursor.col; | |
| 727 } | |
| 728 else | |
| 729 { | |
| 730 location.row = 0; | |
| 731 location.col = 0; | |
| 732 } | |
| 733 TiXmlParsingData data( p, TabSize(), location.row, location.col ); | |
| 734 location = data.Cursor(); | |
| 735 | |
| 736 if ( encoding == TIXML_ENCODING_UNKNOWN ) | |
| 737 { | |
| 738 // Check for the Microsoft UTF-8 lead bytes. | |
| 739 const unsigned char* pU = (const unsigned char*)p; | |
| 740 if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 | |
| 741 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 | |
| 742 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 ) | |
| 743 { | |
| 744 encoding = TIXML_ENCODING_UTF8; | |
| 745 useMicrosoftBOM = true; | |
| 746 } | |
| 747 } | |
| 748 | |
| 749 p = SkipWhiteSpace( p, encoding ); | |
| 750 if ( !p ) | |
| 751 { | |
| 752 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 753 return 0; | |
| 754 } | |
| 755 | |
| 756 while ( p && *p ) | |
| 757 { | |
| 758 TiXmlNode* node = Identify( p, encoding ); | |
| 759 if ( node ) | |
| 760 { | |
| 761 p = node->Parse( p, &data, encoding ); | |
| 762 LinkEndChild( node ); | |
| 763 } | |
| 764 else | |
| 765 { | |
| 766 break; | |
| 767 } | |
| 768 | |
| 769 // Did we get encoding info? | |
| 770 if ( encoding == TIXML_ENCODING_UNKNOWN | |
| 771 && node->ToDeclaration() ) | |
| 772 { | |
| 773 TiXmlDeclaration* dec = node->ToDeclaration(); | |
| 774 const char* enc = dec->Encoding(); | |
| 775 assert( enc ); | |
| 776 | |
| 777 if ( *enc == 0 ) | |
| 778 encoding = TIXML_ENCODING_UTF8; | |
| 779 else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) ) | |
| 780 encoding = TIXML_ENCODING_UTF8; | |
| 781 else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) | |
| 782 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice | |
| 783 else | |
| 784 encoding = TIXML_ENCODING_LEGACY; | |
| 785 } | |
| 786 | |
| 787 p = SkipWhiteSpace( p, encoding ); | |
| 788 } | |
| 789 | |
| 790 // Was this empty? | |
| 791 if ( !firstChild ) { | |
| 792 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); | |
| 793 return 0; | |
| 794 } | |
| 795 | |
| 796 // All is well. | |
| 797 return p; | |
| 798 } | |
| 799 | |
| 800 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 801 { | |
| 802 // The first error in a chain is more accurate - don't set again! | |
| 803 if ( error ) | |
| 804 return; | |
| 805 | |
| 806 assert( err > 0 && err < TIXML_ERROR_STRING_COUNT ); | |
| 807 error = true; | |
| 808 errorId = err; | |
| 809 errorDesc = errorString[ errorId ]; | |
| 810 | |
| 811 errorLocation.Clear(); | |
| 812 if ( pError && data ) | |
| 813 { | |
| 814 data->Stamp( pError, encoding ); | |
| 815 errorLocation = data->Cursor(); | |
| 816 } | |
| 817 } | |
| 818 | |
| 819 | |
| 820 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding ) | |
| 821 { | |
| 822 TiXmlNode* returnNode = 0; | |
| 823 | |
| 824 p = SkipWhiteSpace( p, encoding ); | |
| 825 if( !p || !*p || *p != '<' ) | |
| 826 { | |
| 827 return 0; | |
| 828 } | |
| 829 | |
| 830 p = SkipWhiteSpace( p, encoding ); | |
| 831 | |
| 832 if ( !p || !*p ) | |
| 833 { | |
| 834 return 0; | |
| 835 } | |
| 836 | |
| 837 // What is this thing? | |
| 838 // - Elements start with a letter or underscore, but xml is reserved. | |
| 839 // - Comments: <!-- | |
| 840 // - Decleration: <?xml | |
| 841 // - Everthing else is unknown to tinyxml. | |
| 842 // | |
| 843 | |
| 844 const char* xmlHeader = { "<?xml" }; | |
| 845 const char* commentHeader = { "<!--" }; | |
| 846 const char* dtdHeader = { "<!" }; | |
| 847 const char* cdataHeader = { "<![CDATA[" }; | |
| 848 | |
| 849 if ( StringEqual( p, xmlHeader, true, encoding ) ) | |
| 850 { | |
| 851 #ifdef DEBUG_PARSER | |
| 852 TIXML_LOG( "XML parsing Declaration\n" ); | |
| 853 #endif | |
| 854 returnNode = new TiXmlDeclaration(); | |
| 855 } | |
| 856 else if ( StringEqual( p, commentHeader, false, encoding ) ) | |
| 857 { | |
| 858 #ifdef DEBUG_PARSER | |
| 859 TIXML_LOG( "XML parsing Comment\n" ); | |
| 860 #endif | |
| 861 returnNode = new TiXmlComment(); | |
| 862 } | |
| 863 else if ( StringEqual( p, cdataHeader, false, encoding ) ) | |
| 864 { | |
| 865 #ifdef DEBUG_PARSER | |
| 866 TIXML_LOG( "XML parsing CDATA\n" ); | |
| 867 #endif | |
| 868 TiXmlText* text = new TiXmlText( "" ); | |
| 869 text->SetCDATA( true ); | |
| 870 returnNode = text; | |
| 871 } | |
| 872 else if ( StringEqual( p, dtdHeader, false, encoding ) ) | |
| 873 { | |
| 874 #ifdef DEBUG_PARSER | |
| 875 TIXML_LOG( "XML parsing Unknown(1)\n" ); | |
| 876 #endif | |
| 877 returnNode = new TiXmlUnknown(); | |
| 878 } | |
| 879 else if ( IsAlpha( *(p+1), encoding ) | |
| 880 || *(p+1) == '_' ) | |
| 881 { | |
| 882 #ifdef DEBUG_PARSER | |
| 883 TIXML_LOG( "XML parsing Element\n" ); | |
| 884 #endif | |
| 885 returnNode = new TiXmlElement( "" ); | |
| 886 } | |
| 887 else | |
| 888 { | |
| 889 #ifdef DEBUG_PARSER | |
| 890 TIXML_LOG( "XML parsing Unknown(2)\n" ); | |
| 891 #endif | |
| 892 returnNode = new TiXmlUnknown(); | |
| 893 } | |
| 894 | |
| 895 if ( returnNode ) | |
| 896 { | |
| 897 // Set the parent, so it can report errors | |
| 898 returnNode->parent = this; | |
| 899 } | |
| 900 return returnNode; | |
| 901 } | |
| 902 | |
| 903 #ifdef TIXML_USE_STL | |
| 904 | |
| 905 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag) | |
| 906 { | |
| 907 // We're called with some amount of pre-parsing. That is, some of "this" | |
| 908 // element is in "tag". Go ahead and stream to the closing ">" | |
| 909 while( in->good() ) | |
| 910 { | |
| 911 int c = in->get(); | |
| 912 if ( c <= 0 ) | |
| 913 { | |
| 914 TiXmlDocument* document = GetDocument(); | |
| 915 if ( document ) | |
| 916 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 917 return; | |
| 918 } | |
| 919 (*tag) += (char) c ; | |
| 920 | |
| 921 if ( c == '>' ) | |
| 922 break; | |
| 923 } | |
| 924 | |
| 925 if ( tag->length() < 3 ) return; | |
| 926 | |
| 927 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag. | |
| 928 // If not, identify and stream. | |
| 929 | |
| 930 if ( tag->at( tag->length() - 1 ) == '>' | |
| 931 && tag->at( tag->length() - 2 ) == '/' ) | |
| 932 { | |
| 933 // All good! | |
| 934 return; | |
| 935 } | |
| 936 else if ( tag->at( tag->length() - 1 ) == '>' ) | |
| 937 { | |
| 938 // There is more. Could be: | |
| 939 // text | |
| 940 // cdata text (which looks like another node) | |
| 941 // closing tag | |
| 942 // another node. | |
| 943 for ( ;; ) | |
| 944 { | |
| 945 StreamWhiteSpace( in, tag ); | |
| 946 | |
| 947 // Do we have text? | |
| 948 if ( in->good() && in->peek() != '<' ) | |
| 949 { | |
| 950 // Yep, text. | |
| 951 TiXmlText text( "" ); | |
| 952 text.StreamIn( in, tag ); | |
| 953 | |
| 954 // What follows text is a closing tag or another node. | |
| 955 // Go around again and figure it out. | |
| 956 continue; | |
| 957 } | |
| 958 | |
| 959 // We now have either a closing tag...or another node. | |
| 960 // We should be at a "<", regardless. | |
| 961 if ( !in->good() ) return; | |
| 962 assert( in->peek() == '<' ); | |
| 963 int tagIndex = (int) tag->length(); | |
| 964 | |
| 965 bool closingTag = false; | |
| 966 bool firstCharFound = false; | |
| 967 | |
| 968 for( ;; ) | |
| 969 { | |
| 970 if ( !in->good() ) | |
| 971 return; | |
| 972 | |
| 973 int c = in->peek(); | |
| 974 if ( c <= 0 ) | |
| 975 { | |
| 976 TiXmlDocument* document = GetDocument(); | |
| 977 if ( document ) | |
| 978 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 979 return; | |
| 980 } | |
| 981 | |
| 982 if ( c == '>' ) | |
| 983 break; | |
| 984 | |
| 985 *tag += (char) c; | |
| 986 in->get(); | |
| 987 | |
| 988 // Early out if we find the CDATA id. | |
| 989 if ( c == '[' && tag->size() >= 9 ) | |
| 990 { | |
| 991 size_t len = tag->size(); | |
| 992 const char* start = tag->c_str() + len - 9; | |
| 993 if ( strcmp( start, "<![CDATA[" ) == 0 ) { | |
| 994 assert( !closingTag ); | |
| 995 break; | |
| 996 } | |
| 997 } | |
| 998 | |
| 999 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) ) | |
| 1000 { | |
| 1001 firstCharFound = true; | |
| 1002 if ( c == '/' ) | |
| 1003 closingTag = true; | |
| 1004 } | |
| 1005 } | |
| 1006 // If it was a closing tag, then read in the closing '>' to clean up the input stream. | |
| 1007 // If it was not, the streaming will be done by the tag. | |
| 1008 if ( closingTag ) | |
| 1009 { | |
| 1010 if ( !in->good() ) | |
| 1011 return; | |
| 1012 | |
| 1013 int c = in->get(); | |
| 1014 if ( c <= 0 ) | |
| 1015 { | |
| 1016 TiXmlDocument* document = GetDocument(); | |
| 1017 if ( document ) | |
| 1018 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 1019 return; | |
| 1020 } | |
| 1021 assert( c == '>' ); | |
| 1022 *tag += (char) c; | |
| 1023 | |
| 1024 // We are done, once we've found our closing tag. | |
| 1025 return; | |
| 1026 } | |
| 1027 else | |
| 1028 { | |
| 1029 // If not a closing tag, id it, and stream. | |
| 1030 const char* tagloc = tag->c_str() + tagIndex; | |
| 1031 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING ); | |
| 1032 if ( !node ) | |
| 1033 return; | |
| 1034 node->StreamIn( in, tag ); | |
| 1035 delete node; | |
| 1036 node = 0; | |
| 1037 | |
| 1038 // No return: go around from the beginning: text, closing tag, or node. | |
| 1039 } | |
| 1040 } | |
| 1041 } | |
| 1042 } | |
| 1043 #endif | |
| 1044 | |
| 1045 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 1046 { | |
| 1047 p = SkipWhiteSpace( p, encoding ); | |
| 1048 TiXmlDocument* document = GetDocument(); | |
| 1049 | |
| 1050 if ( !p || !*p ) | |
| 1051 { | |
| 1052 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding ); | |
| 1053 return 0; | |
| 1054 } | |
| 1055 | |
| 1056 if ( data ) | |
| 1057 { | |
| 1058 data->Stamp( p, encoding ); | |
| 1059 location = data->Cursor(); | |
| 1060 } | |
| 1061 | |
| 1062 if ( *p != '<' ) | |
| 1063 { | |
| 1064 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding ); | |
| 1065 return 0; | |
| 1066 } | |
| 1067 | |
| 1068 p = SkipWhiteSpace( p+1, encoding ); | |
| 1069 | |
| 1070 // Read the name. | |
| 1071 const char* pErr = p; | |
| 1072 | |
| 1073 p = ReadName( p, &value, encoding ); | |
| 1074 if ( !p || !*p ) | |
| 1075 { | |
| 1076 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding ); | |
| 1077 return 0; | |
| 1078 } | |
| 1079 | |
| 1080 TIXML_STRING endTag ("</"); | |
| 1081 endTag += value; | |
| 1082 | |
| 1083 // Check for and read attributes. Also look for an empty | |
| 1084 // tag or an end tag. | |
| 1085 while ( p && *p ) | |
| 1086 { | |
| 1087 pErr = p; | |
| 1088 p = SkipWhiteSpace( p, encoding ); | |
| 1089 if ( !p || !*p ) | |
| 1090 { | |
| 1091 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); | |
| 1092 return 0; | |
| 1093 } | |
| 1094 if ( *p == '/' ) | |
| 1095 { | |
| 1096 ++p; | |
| 1097 // Empty tag. | |
| 1098 if ( *p != '>' ) | |
| 1099 { | |
| 1100 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding ); | |
| 1101 return 0; | |
| 1102 } | |
| 1103 return (p+1); | |
| 1104 } | |
| 1105 else if ( *p == '>' ) | |
| 1106 { | |
| 1107 // Done with attributes (if there were any.) | |
| 1108 // Read the value -- which can include other | |
| 1109 // elements -- read the end tag, and return. | |
| 1110 ++p; | |
| 1111 p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens. | |
| 1112 if ( !p || !*p ) { | |
| 1113 // We were looking for the end tag, but found nothing. | |
| 1114 // Fix for [ 1663758 ] Failure to report error on bad XML | |
| 1115 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); | |
| 1116 return 0; | |
| 1117 } | |
| 1118 | |
| 1119 // We should find the end tag now | |
| 1120 // note that: | |
| 1121 // </foo > and | |
| 1122 // </foo> | |
| 1123 // are both valid end tags. | |
| 1124 if ( StringEqual( p, endTag.c_str(), false, encoding ) ) | |
| 1125 { | |
| 1126 p += endTag.length(); | |
| 1127 p = SkipWhiteSpace( p, encoding ); | |
| 1128 if ( p && *p && *p == '>' ) { | |
| 1129 ++p; | |
| 1130 return p; | |
| 1131 } | |
| 1132 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); | |
| 1133 return 0; | |
| 1134 } | |
| 1135 else | |
| 1136 { | |
| 1137 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); | |
| 1138 return 0; | |
| 1139 } | |
| 1140 } | |
| 1141 else | |
| 1142 { | |
| 1143 // Try to read an attribute: | |
| 1144 TiXmlAttribute* attrib = new TiXmlAttribute(); | |
| 1145 if ( !attrib ) | |
| 1146 { | |
| 1147 return 0; | |
| 1148 } | |
| 1149 | |
| 1150 attrib->SetDocument( document ); | |
| 1151 pErr = p; | |
| 1152 p = attrib->Parse( p, data, encoding ); | |
| 1153 | |
| 1154 if ( !p || !*p ) | |
| 1155 { | |
| 1156 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding ); | |
| 1157 delete attrib; | |
| 1158 return 0; | |
| 1159 } | |
| 1160 | |
| 1161 // Handle the strange case of double attributes: | |
| 1162 #ifdef TIXML_USE_STL | |
| 1163 TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() ); | |
| 1164 #else | |
| 1165 TiXmlAttribute* node = attributeSet.Find( attrib->Name() ); | |
| 1166 #endif | |
| 1167 if ( node ) | |
| 1168 { | |
| 1169 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding ); | |
| 1170 delete attrib; | |
| 1171 return 0; | |
| 1172 } | |
| 1173 | |
| 1174 attributeSet.Add( attrib ); | |
| 1175 } | |
| 1176 } | |
| 1177 return p; | |
| 1178 } | |
| 1179 | |
| 1180 | |
| 1181 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 1182 { | |
| 1183 TiXmlDocument* document = GetDocument(); | |
| 1184 | |
| 1185 // Read in text and elements in any order. | |
| 1186 const char* pWithWhiteSpace = p; | |
| 1187 p = SkipWhiteSpace( p, encoding ); | |
| 1188 | |
| 1189 while ( p && *p ) | |
| 1190 { | |
| 1191 if ( *p != '<' ) | |
| 1192 { | |
| 1193 // Take what we have, make a text element. | |
| 1194 TiXmlText* textNode = new TiXmlText( "" ); | |
| 1195 | |
| 1196 if ( !textNode ) | |
| 1197 { | |
| 1198 return 0; | |
| 1199 } | |
| 1200 | |
| 1201 if ( TiXmlBase::IsWhiteSpaceCondensed() ) | |
| 1202 { | |
| 1203 p = textNode->Parse( p, data, encoding ); | |
| 1204 } | |
| 1205 else | |
| 1206 { | |
| 1207 // Special case: we want to keep the white space | |
| 1208 // so that leading spaces aren't removed. | |
| 1209 p = textNode->Parse( pWithWhiteSpace, data, encoding ); | |
| 1210 } | |
| 1211 | |
| 1212 if ( !textNode->Blank() ) | |
| 1213 LinkEndChild( textNode ); | |
| 1214 else | |
| 1215 delete textNode; | |
| 1216 } | |
| 1217 else | |
| 1218 { | |
| 1219 // We hit a '<' | |
| 1220 // Have we hit a new element or an end tag? This could also be | |
| 1221 // a TiXmlText in the "CDATA" style. | |
| 1222 if ( StringEqual( p, "</", false, encoding ) ) | |
| 1223 { | |
| 1224 return p; | |
| 1225 } | |
| 1226 else | |
| 1227 { | |
| 1228 TiXmlNode* node = Identify( p, encoding ); | |
| 1229 if ( node ) | |
| 1230 { | |
| 1231 p = node->Parse( p, data, encoding ); | |
| 1232 LinkEndChild( node ); | |
| 1233 } | |
| 1234 else | |
| 1235 { | |
| 1236 return 0; | |
| 1237 } | |
| 1238 } | |
| 1239 } | |
| 1240 pWithWhiteSpace = p; | |
| 1241 p = SkipWhiteSpace( p, encoding ); | |
| 1242 } | |
| 1243 | |
| 1244 if ( !p ) | |
| 1245 { | |
| 1246 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding ); | |
| 1247 } | |
| 1248 return p; | |
| 1249 } | |
| 1250 | |
| 1251 | |
| 1252 #ifdef TIXML_USE_STL | |
| 1253 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag ) | |
| 1254 { | |
| 1255 while ( in->good() ) | |
| 1256 { | |
| 1257 int c = in->get(); | |
| 1258 if ( c <= 0 ) | |
| 1259 { | |
| 1260 TiXmlDocument* document = GetDocument(); | |
| 1261 if ( document ) | |
| 1262 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 1263 return; | |
| 1264 } | |
| 1265 (*tag) += (char) c; | |
| 1266 | |
| 1267 if ( c == '>' ) | |
| 1268 { | |
| 1269 // All is well. | |
| 1270 return; | |
| 1271 } | |
| 1272 } | |
| 1273 } | |
| 1274 #endif | |
| 1275 | |
| 1276 | |
| 1277 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 1278 { | |
| 1279 TiXmlDocument* document = GetDocument(); | |
| 1280 p = SkipWhiteSpace( p, encoding ); | |
| 1281 | |
| 1282 if ( data ) | |
| 1283 { | |
| 1284 data->Stamp( p, encoding ); | |
| 1285 location = data->Cursor(); | |
| 1286 } | |
| 1287 if ( !p || !*p || *p != '<' ) | |
| 1288 { | |
| 1289 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding ); | |
| 1290 return 0; | |
| 1291 } | |
| 1292 ++p; | |
| 1293 value = ""; | |
| 1294 | |
| 1295 while ( p && *p && *p != '>' ) | |
| 1296 { | |
| 1297 value += *p; | |
| 1298 ++p; | |
| 1299 } | |
| 1300 | |
| 1301 if ( !p ) | |
| 1302 { | |
| 1303 if ( document ) | |
| 1304 document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding ); | |
| 1305 } | |
| 1306 if ( p && *p == '>' ) | |
| 1307 return p+1; | |
| 1308 return p; | |
| 1309 } | |
| 1310 | |
| 1311 #ifdef TIXML_USE_STL | |
| 1312 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag ) | |
| 1313 { | |
| 1314 while ( in->good() ) | |
| 1315 { | |
| 1316 int c = in->get(); | |
| 1317 if ( c <= 0 ) | |
| 1318 { | |
| 1319 TiXmlDocument* document = GetDocument(); | |
| 1320 if ( document ) | |
| 1321 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 1322 return; | |
| 1323 } | |
| 1324 | |
| 1325 (*tag) += (char) c; | |
| 1326 | |
| 1327 if ( c == '>' | |
| 1328 && tag->at( tag->length() - 2 ) == '-' | |
| 1329 && tag->at( tag->length() - 3 ) == '-' ) | |
| 1330 { | |
| 1331 // All is well. | |
| 1332 return; | |
| 1333 } | |
| 1334 } | |
| 1335 } | |
| 1336 #endif | |
| 1337 | |
| 1338 | |
| 1339 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 1340 { | |
| 1341 TiXmlDocument* document = GetDocument(); | |
| 1342 value = ""; | |
| 1343 | |
| 1344 p = SkipWhiteSpace( p, encoding ); | |
| 1345 | |
| 1346 if ( data ) | |
| 1347 { | |
| 1348 data->Stamp( p, encoding ); | |
| 1349 location = data->Cursor(); | |
| 1350 } | |
| 1351 const char* startTag = "<!--"; | |
| 1352 const char* endTag = "-->"; | |
| 1353 | |
| 1354 if ( !StringEqual( p, startTag, false, encoding ) ) | |
| 1355 { | |
| 1356 if ( document ) | |
| 1357 document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); | |
| 1358 return 0; | |
| 1359 } | |
| 1360 p += strlen( startTag ); | |
| 1361 | |
| 1362 // [ 1475201 ] TinyXML parses entities in comments | |
| 1363 // Oops - ReadText doesn't work, because we don't want to parse the entities. | |
| 1364 // p = ReadText( p, &value, false, endTag, false, encoding ); | |
| 1365 // | |
| 1366 // from the XML spec: | |
| 1367 /* | |
| 1368 [Definition: Comments may appear anywhere in a document outside other markup; in addition, | |
| 1369 they may appear within the document type declaration at places allowed by the grammar. | |
| 1370 They are not part of the document's character data; an XML processor MAY, but need not, | |
| 1371 make it possible for an application to retrieve the text of comments. For compatibility, | |
| 1372 the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity | |
| 1373 references MUST NOT be recognized within comments. | |
| 1374 | |
| 1375 An example of a comment: | |
| 1376 | |
| 1377 <!-- declarations for <head> & <body> --> | |
| 1378 */ | |
| 1379 | |
| 1380 value = ""; | |
| 1381 // Keep all the white space. | |
| 1382 while ( p && *p && !StringEqual( p, endTag, false, encoding ) ) | |
| 1383 { | |
| 1384 value.append( p, 1 ); | |
| 1385 ++p; | |
| 1386 } | |
| 1387 if ( p && *p ) | |
| 1388 p += strlen( endTag ); | |
| 1389 | |
| 1390 return p; | |
| 1391 } | |
| 1392 | |
| 1393 | |
| 1394 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 1395 { | |
| 1396 p = SkipWhiteSpace( p, encoding ); | |
| 1397 if ( !p || !*p ) return 0; | |
| 1398 | |
| 1399 if ( data ) | |
| 1400 { | |
| 1401 data->Stamp( p, encoding ); | |
| 1402 location = data->Cursor(); | |
| 1403 } | |
| 1404 // Read the name, the '=' and the value. | |
| 1405 const char* pErr = p; | |
| 1406 p = ReadName( p, &name, encoding ); | |
| 1407 if ( !p || !*p ) | |
| 1408 { | |
| 1409 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); | |
| 1410 return 0; | |
| 1411 } | |
| 1412 p = SkipWhiteSpace( p, encoding ); | |
| 1413 if ( !p || !*p || *p != '=' ) | |
| 1414 { | |
| 1415 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); | |
| 1416 return 0; | |
| 1417 } | |
| 1418 | |
| 1419 ++p; // skip '=' | |
| 1420 p = SkipWhiteSpace( p, encoding ); | |
| 1421 if ( !p || !*p ) | |
| 1422 { | |
| 1423 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); | |
| 1424 return 0; | |
| 1425 } | |
| 1426 | |
| 1427 const char* end; | |
| 1428 const char SINGLE_QUOTE = '\''; | |
| 1429 const char DOUBLE_QUOTE = '\"'; | |
| 1430 | |
| 1431 if ( *p == SINGLE_QUOTE ) | |
| 1432 { | |
| 1433 ++p; | |
| 1434 end = "\'"; // single quote in string | |
| 1435 p = ReadText( p, &value, false, end, false, encoding ); | |
| 1436 } | |
| 1437 else if ( *p == DOUBLE_QUOTE ) | |
| 1438 { | |
| 1439 ++p; | |
| 1440 end = "\""; // double quote in string | |
| 1441 p = ReadText( p, &value, false, end, false, encoding ); | |
| 1442 } | |
| 1443 else | |
| 1444 { | |
| 1445 // All attribute values should be in single or double quotes. | |
| 1446 // But this is such a common error that the parser will try | |
| 1447 // its best, even without them. | |
| 1448 value = ""; | |
| 1449 while ( p && *p // existence | |
| 1450 && !IsWhiteSpace( *p ) // whitespace | |
| 1451 && *p != '/' && *p != '>' ) // tag end | |
| 1452 { | |
| 1453 if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) { | |
| 1454 // [ 1451649 ] Attribute values with trailing quotes not handled correctly | |
| 1455 // We did not have an opening quote but seem to have a | |
| 1456 // closing one. Give up and throw an error. | |
| 1457 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); | |
| 1458 return 0; | |
| 1459 } | |
| 1460 value += *p; | |
| 1461 ++p; | |
| 1462 } | |
| 1463 } | |
| 1464 return p; | |
| 1465 } | |
| 1466 | |
| 1467 #ifdef TIXML_USE_STL | |
| 1468 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag ) | |
| 1469 { | |
| 1470 while ( in->good() ) | |
| 1471 { | |
| 1472 int c = in->peek(); | |
| 1473 if ( !cdata && (c == '<' ) ) | |
| 1474 { | |
| 1475 return; | |
| 1476 } | |
| 1477 if ( c <= 0 ) | |
| 1478 { | |
| 1479 TiXmlDocument* document = GetDocument(); | |
| 1480 if ( document ) | |
| 1481 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 1482 return; | |
| 1483 } | |
| 1484 | |
| 1485 (*tag) += (char) c; | |
| 1486 in->get(); // "commits" the peek made above | |
| 1487 | |
| 1488 if ( cdata && c == '>' && tag->size() >= 3 ) { | |
| 1489 size_t len = tag->size(); | |
| 1490 if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) { | |
| 1491 // terminator of cdata. | |
| 1492 return; | |
| 1493 } | |
| 1494 } | |
| 1495 } | |
| 1496 } | |
| 1497 #endif | |
| 1498 | |
| 1499 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) | |
| 1500 { | |
| 1501 value = ""; | |
| 1502 TiXmlDocument* document = GetDocument(); | |
| 1503 | |
| 1504 if ( data ) | |
| 1505 { | |
| 1506 data->Stamp( p, encoding ); | |
| 1507 location = data->Cursor(); | |
| 1508 } | |
| 1509 | |
| 1510 const char* const startTag = "<![CDATA["; | |
| 1511 const char* const endTag = "]]>"; | |
| 1512 | |
| 1513 if ( cdata || StringEqual( p, startTag, false, encoding ) ) | |
| 1514 { | |
| 1515 cdata = true; | |
| 1516 | |
| 1517 if ( !StringEqual( p, startTag, false, encoding ) ) | |
| 1518 { | |
| 1519 if ( document ) | |
| 1520 document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); | |
| 1521 return 0; | |
| 1522 } | |
| 1523 p += strlen( startTag ); | |
| 1524 | |
| 1525 // Keep all the white space, ignore the encoding, etc. | |
| 1526 while ( p && *p | |
| 1527 && !StringEqual( p, endTag, false, encoding ) | |
| 1528 ) | |
| 1529 { | |
| 1530 value += *p; | |
| 1531 ++p; | |
| 1532 } | |
| 1533 | |
| 1534 TIXML_STRING dummy; | |
| 1535 p = ReadText( p, &dummy, false, endTag, false, encoding ); | |
| 1536 return p; | |
| 1537 } | |
| 1538 else | |
| 1539 { | |
| 1540 bool ignoreWhite = true; | |
| 1541 | |
| 1542 const char* end = "<"; | |
| 1543 p = ReadText( p, &value, ignoreWhite, end, false, encoding ); | |
| 1544 if ( p && *p ) | |
| 1545 return p-1; // don't truncate the '<' | |
| 1546 return 0; | |
| 1547 } | |
| 1548 } | |
| 1549 | |
| 1550 #ifdef TIXML_USE_STL | |
| 1551 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag ) | |
| 1552 { | |
| 1553 while ( in->good() ) | |
| 1554 { | |
| 1555 int c = in->get(); | |
| 1556 if ( c <= 0 ) | |
| 1557 { | |
| 1558 TiXmlDocument* document = GetDocument(); | |
| 1559 if ( document ) | |
| 1560 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); | |
| 1561 return; | |
| 1562 } | |
| 1563 (*tag) += (char) c; | |
| 1564 | |
| 1565 if ( c == '>' ) | |
| 1566 { | |
| 1567 // All is well. | |
| 1568 return; | |
| 1569 } | |
| 1570 } | |
| 1571 } | |
| 1572 #endif | |
| 1573 | |
| 1574 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding ) | |
| 1575 { | |
| 1576 p = SkipWhiteSpace( p, _encoding ); | |
| 1577 // Find the beginning, find the end, and look for | |
| 1578 // the stuff in-between. | |
| 1579 TiXmlDocument* document = GetDocument(); | |
| 1580 if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) ) | |
| 1581 { | |
| 1582 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding ); | |
| 1583 return 0; | |
| 1584 } | |
| 1585 if ( data ) | |
| 1586 { | |
| 1587 data->Stamp( p, _encoding ); | |
| 1588 location = data->Cursor(); | |
| 1589 } | |
| 1590 p += 5; | |
| 1591 | |
| 1592 version = ""; | |
| 1593 encoding = ""; | |
| 1594 standalone = ""; | |
| 1595 | |
| 1596 while ( p && *p ) | |
| 1597 { | |
| 1598 if ( *p == '>' ) | |
| 1599 { | |
| 1600 ++p; | |
| 1601 return p; | |
| 1602 } | |
| 1603 | |
| 1604 p = SkipWhiteSpace( p, _encoding ); | |
| 1605 if ( StringEqual( p, "version", true, _encoding ) ) | |
| 1606 { | |
| 1607 TiXmlAttribute attrib; | |
| 1608 p = attrib.Parse( p, data, _encoding ); | |
| 1609 version = attrib.Value(); | |
| 1610 } | |
| 1611 else if ( StringEqual( p, "encoding", true, _encoding ) ) | |
| 1612 { | |
| 1613 TiXmlAttribute attrib; | |
| 1614 p = attrib.Parse( p, data, _encoding ); | |
| 1615 encoding = attrib.Value(); | |
| 1616 } | |
| 1617 else if ( StringEqual( p, "standalone", true, _encoding ) ) | |
| 1618 { | |
| 1619 TiXmlAttribute attrib; | |
| 1620 p = attrib.Parse( p, data, _encoding ); | |
| 1621 standalone = attrib.Value(); | |
| 1622 } | |
| 1623 else | |
| 1624 { | |
| 1625 // Read over whatever it is. | |
| 1626 while( p && *p && *p != '>' && !IsWhiteSpace( *p ) ) | |
| 1627 ++p; | |
| 1628 } | |
| 1629 } | |
| 1630 return 0; | |
| 1631 } | |
| 1632 | |
| 1633 bool TiXmlText::Blank() const | |
| 1634 { | |
| 1635 for ( unsigned i=0; i<value.length(); i++ ) | |
| 1636 if ( !IsWhiteSpace( value[i] ) ) | |
| 1637 return false; | |
| 1638 return true; | |
| 1639 } | |
| 1640 |
