|
Teuchos Package Browser (Single Doxygen Collection) Version of the Day
|
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Teuchos: Common Tools Package 00005 // Copyright (2004) Sandia Corporation 00006 // 00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00008 // license for use of this work by or on behalf of the U.S. Government. 00009 // 00010 // This library is free software; you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as 00012 // published by the Free Software Foundation; either version 2.1 of the 00013 // License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, but 00016 // WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00023 // USA 00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00025 // 00026 // *********************************************************************** 00027 // @HEADER 00028 00029 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82 00030 // when printing attribute values, one must check if the value contains quote 00031 // or apost; 00032 // a quot'd attval cannot contain literal quot 00033 // a apos'd attval cannot contain literal apos 00034 // either they have to be matched appropriately or (easier) all quot and apos must 00035 // be replaced by " and ' 00036 00037 #include "Teuchos_XMLParser.hpp" 00038 #include "Teuchos_TreeBuildingXMLHandler.hpp" 00039 #include "Teuchos_TestForException.hpp" 00040 00041 using namespace Teuchos; 00042 00043 // this parser currently does not support: 00044 // * XML declaration 00045 // * processing instructions 00046 // * XML schemas 00047 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection 00048 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF) 00049 00050 // it currently does support: 00051 // * comments 00052 // * empty element tags, e.g. <hello /> 00053 // * entity references: & < > ' " 00054 // * numeric character references:   00055 // * std::exception/error handling on parse errors 00056 00057 00058 /* From the W3C XML 1.0 Third Edition 00059 http://www.w3.org/TR/2004/REC-xml-20040204/ 00060 00061 The following productions specify well-formed XML documents. 00062 These have been reduced to the support anticipated for support by this parser. 00063 00064 element ::= EmptyElemTag 00065 | STag content ETag 00066 STag ::= '<' Name (S Attribute)* S? '>' 00067 Attribute ::= Name Eq AttValue 00068 ETag ::= '</' Name S? '>' 00069 content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)* 00070 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 00071 00072 AttValue ::= '"' ([^<&"] | Reference)* '"' 00073 | "'" ([^<&'] | Reference)* "'" 00074 00075 CharRef ::= '&#' [0-9]+ ';' 00076 EntityRef ::= '&' Name ';' 00077 Reference ::= EntityRef | CharRef 00078 00079 #x20 (space) 00080 #x9 (horizontal tab) 00081 #xD (carriage return) 00082 #xA (new line, new line line feed) 00083 00084 S ::= (#x20 | #x9 | #xD | #xA)+ 00085 Eq ::= S? '=' S? 00086 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7 00087 Name ::= (Letter | '_' | ':') (NameChar)* 00088 00089 Letter ::= [#x0041-#x005A] | [#x0061-#x007A] 00090 | [#x00C0-#x00D6] | [#x00D8-#x00F6] 00091 | [#x00F8-#x00FF] 00092 Digit ::= [#x0030-#x0039] 00093 00094 Char ::= #x9 | #xA | #xD | [#x20-#xFF] 00095 CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 00096 that is, some std::string of characters not containing '<' or '&' or ']]>' 00097 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 00098 that is, '<!--' txt '-->', where txt does not contain '--' 00099 00100 CDSect ::= CDStart CData CDEnd 00101 CDStart ::= '<![CDATA[' 00102 CData ::= (Char* - (Char* ']]>' Char*)) 00103 CDEnd ::= ']]>' 00104 00105 document ::= prolog element Misc* 00106 prolog ::= XMLDecl? Misc* 00107 XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 00108 Misc ::= Comment | S 00109 00110 */ 00111 00112 XMLObject XMLParser::parse() 00113 { 00114 00115 RCP<TreeBuildingXMLHandler> handler = rcp(new TreeBuildingXMLHandler()); 00116 00117 _entities.clear(); 00118 _entities["apos"] = "'"; 00119 _entities["quot"] = "\""; 00120 _entities["lt"] = "<"; 00121 _entities["gt"] = ">"; 00122 _entities["amp"] = "&"; 00123 00124 bool done = false; 00125 int curopen = 0; // number of currently open tags, or "do we process character data?" 00126 bool gotRoot = false; 00127 00128 while (!done) { 00129 00130 std::string tag, cdata; 00131 unsigned char c1, c2; 00132 Teuchos::map<std::string,string> attrs; 00133 00134 // Consume any whitespace 00135 if (curopen == 0) { 00136 // this will leave a lookahead in c1 00137 if ( getSpace(c1) ) { 00138 done = true; 00139 break; 00140 } 00141 } 00142 else { 00143 // need to manually lookahead 00144 if (_is->readBytes(&c1,1) < 1) { 00145 done = true; 00146 break; 00147 } 00148 } 00149 00150 if (c1 == '<') { 00151 // determine if it is a STag/EmptyElemTag or ETag or Comment 00152 // get lookahead 00153 TEST_FOR_EXCEPTION( _is->readBytes(&c2,1) < 1 , std::runtime_error, "XMLParser::parse(): stream ended in tag begin/end"); 00154 00155 if (c2 == '/') { 00156 // we have: </ 00157 // try to get an ETag 00158 getETag(tag); 00159 TEST_FOR_EXCEPTION( handler->endElement(tag)!=0, std::runtime_error, 00160 "XMLParser::getETag(): document not well-formed: end element" 00161 " tag = '"<<tag<<"' did not match start element"); 00162 curopen--; 00163 } 00164 else if (isLetter(c2) || c2==':' || c2=='_') { 00165 // it looks like a STag or an EmptyElemTag 00166 bool emptytag; 00167 getSTag(c2, tag, attrs, emptytag); 00168 handler->startElement(tag,attrs); 00169 if (curopen == 0) { 00170 TEST_FOR_EXCEPTION(gotRoot == true, std::runtime_error, 00171 "XMLParser::getETag(): document not well-formed: more than one root element specified"); 00172 gotRoot = true; 00173 } 00174 curopen++; 00175 if (emptytag) { 00176 TEST_FOR_EXCEPTION( handler->endElement(tag)!=0, std::runtime_error, 00177 "XMLParser::getETag(): document not well-formed: end element tag did not match start element"); 00178 curopen--; 00179 } 00180 } 00181 else if (c2 == '!') { 00182 // it is starting to look like a comment; we need '--' 00183 // if we don't get this, it means 00184 // * the document is not well-formed 00185 // * the document employs a feature not supported by this parser, 00186 // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[... 00187 TEST_FOR_EXCEPTION( assertChar('-')!=0, std::runtime_error, 00188 "XMLParser::parse(): element not well-formed or exploits unsupported feature" ); 00189 TEST_FOR_EXCEPTION( assertChar('-')!=0 , std::runtime_error, 00190 "XMLParser::parse(): element not well-formed or exploits unsupported feature" ); 00191 getComment(); 00192 } 00193 else { 00194 TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::parse(): element not well-formed or exploits unsupported feature" ); 00195 } 00196 } 00197 else if ( (curopen > 0) && (c1 == '&') ) { 00198 std::string chars = ""; 00199 getReference(chars); 00200 handler->characters(chars); 00201 } 00202 else if ( (curopen > 0) ) { 00203 std::string chars = ""; 00204 chars.push_back(c1); 00205 handler->characters(chars); 00206 } 00207 else { 00208 TEST_FOR_EXCEPTION(1,std::runtime_error,"XMLParser::parse(): document not well-formed"); 00209 } 00210 } 00211 00212 TEST_FOR_EXCEPTION( curopen != 0 , std::runtime_error, "XMLParser::parse(): document not well-formed: elements not matched" ); 00213 00214 return handler->getObject(); 00215 00216 } 00217 00218 00219 void XMLParser::getETag(std::string &tag) 00220 { 00221 /* Recall from the specification: 00222 ETag ::= '</' Name S? '>' 00223 Name ::= (Letter | '_' | ':') (NameChar)* 00224 00225 We have already consumed: </ 00226 */ 00227 00228 bool tagover = false; 00229 unsigned char c; 00230 // clear tag 00231 tag = ""; 00232 TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated"); 00233 TEST_FOR_EXCEPTION( !isLetter(c) && c!='_' && c!=':' , std::runtime_error , "XMLParser::getETag(): tag not well-formed"); 00234 tag.push_back(c); 00235 while (1) { 00236 TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated"); 00237 if ( isNameChar(c) ) { 00238 if (tagover) { 00239 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed: expected '>'"); 00240 } 00241 tag.push_back(c); 00242 } 00243 else if (isSpace(c)) { 00244 // mark the end of the tag and consume the whitespace 00245 tagover = true; 00246 } 00247 else if (c == '>') { 00248 break; 00249 } 00250 else { 00251 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed"); 00252 } 00253 } 00254 } 00255 00256 00257 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag) 00258 { 00259 00260 /* Recall from the specification: 00261 00262 STag ::= '<' Name (S Attribute)* S? '>' 00263 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 00264 Name ::= (Letter | '_' | ':') (NameChar)* 00265 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7 00266 00267 S ::= (#x20 | #x9 | #xD | #xA)+ 00268 Attribute ::= Name Eq AttValue 00269 Eq ::= S? '=' S? 00270 AttValue ::= '"' ([^<&"] | Reference)* '"' 00271 | "'" ([^<&'] | Reference)* "'" 00272 Reference ::= EntityRef | CharRef 00273 CharRef ::= '&#' [0-9]+ ';' 00274 EntityRef ::= '&' Name ';' 00275 00276 We have already consumed: <lookahead 00277 */ 00278 00279 unsigned char c; 00280 attrs.clear(); 00281 00282 tag = lookahead; 00283 // get the rest of the tag: (NameChar)* 00284 while (1) { 00285 TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated"); 00286 if (isNameChar(c)) { 00287 tag.push_back(c); 00288 } 00289 else { 00290 break; 00291 } 00292 } 00293 00294 // after the name: should be one of the following 00295 // (S Attribute) | S? '>' | S? '/>' 00296 do { 00297 00298 bool hadspace = false; 00299 00300 // if space, consume the whitespace 00301 if ( isSpace(c) ) { 00302 hadspace = true; 00303 TEST_FOR_EXCEPTION( getSpace(c)!=0, std::runtime_error, 00304 "XMLParser::getSTag(): EOF before start element was terminated"); 00305 } 00306 00307 // now, either Attribute | '>' | '/>' 00308 if ( (isLetter(c) || c=='_' || c==':') && hadspace ) { 00309 00310 // Attribute 00311 // get attribute name, starting with contents of c 00312 std::string attname, attval; 00313 attname = c; 00314 do { 00315 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated"); 00316 if ( isNameChar(c) ) { 00317 attname.push_back(c); 00318 } 00319 else if ( isSpace(c) || c=='=' ) { 00320 break; 00321 } 00322 else { 00323 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected whitespace or '='"); 00324 } 00325 } while (1); 00326 00327 // if whitespace, consume it 00328 if (isSpace(c)) { 00329 getSpace(c); 00330 } 00331 // should be on '=' 00332 if (c != '=') { 00333 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected '='"); 00334 } 00335 00336 // get any whitespace following the '=' 00337 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated"); 00338 if (isSpace(c)) { 00339 getSpace(c); 00340 } 00341 00342 // now get the quoted attribute value 00343 bool apost; 00344 attval = ""; 00345 if (c == '\'') { 00346 apost = true; 00347 } 00348 else if (c == '\"') { 00349 apost = false; 00350 } 00351 else { 00352 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute value must be quoted with either ''' or '\"'"); 00353 } 00354 do { 00355 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated"); 00356 if (apost && c=='\'') { 00357 // end of attval 00358 break; 00359 } 00360 else if (!apost && c=='\"') { 00361 // end of attval 00362 break; 00363 } 00364 else if ( c == '&' ) { 00365 // finish: need to add support for Reference 00366 std::string refstr; 00367 getReference(refstr); 00368 attval += refstr; 00369 } 00370 else if ( c!='<' ) { 00371 // valid character for attval 00372 attval.push_back(c); 00373 } 00374 else { 00375 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): invalid character in attribute value"); 00376 } 00377 } while(1); 00378 00379 // add attribute to list 00380 TEST_FOR_EXCEPTION( attrs.find(attname) != attrs.end() , std::runtime_error , "XMLParser::getSTag(): cannot have two attributes with the same name"); 00381 attrs[attname] = attval; 00382 } 00383 else if (c == '>') { 00384 emptytag = false; 00385 break; 00386 } 00387 else if (c == '/') { 00388 TEST_FOR_EXCEPTION(assertChar('>')!=0, std::runtime_error, 00389 "XMLParser::getSTag(): empty element tag not well-formed: expected '>'"); 00390 emptytag = true; 00391 break; 00392 } 00393 else { 00394 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): start element not well-formed: invalid character"); 00395 } 00396 00397 // get next char 00398 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated"); 00399 00400 } while(1); 00401 } 00402 00403 00404 void XMLParser::getComment() 00405 { 00406 /* Recall from the specification: 00407 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 00408 that is, '<!--' txt '-->', where txt does not contain '--' 00409 We have already consumed: <!-- 00410 00411 Be wary here of the fact that c=='-' implies isChar(c) 00412 */ 00413 unsigned char c; 00414 while (1) { 00415 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated"); 00416 // if we have a - 00417 if (c=='-') { 00418 // then it must be the end of the comment or be a Char 00419 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated"); 00420 if (c=='-') { 00421 // this had better be leading to the end of the comment 00422 TEST_FOR_EXCEPTION( assertChar('>')!=0, std::runtime_error, 00423 "XMLParser::getComment(): comment not well-formed: expected '>'"); 00424 break; 00425 } 00426 else if (!isChar(c)) { 00427 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character"); 00428 } 00429 } 00430 else if (!isChar(c)) { 00431 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character"); 00432 } 00433 } 00434 } 00435 00436 00437 void XMLParser::getReference(std::string &refstr) { 00438 // finish: does CharRef support only dec, or hex as well? 00439 unsigned char c; 00440 unsigned int num, base; 00441 refstr = ""; 00442 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated"); 00443 if (c == '#') { 00444 // get a CharRef 00445 // CharRef ::= '&#' [0-9]+ ';' 00446 // | '&#x' [0-9]+ ';' 00447 // get first number 00448 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated"); 00449 if (c == 'x') { 00450 base = 16; 00451 num = 0; 00452 } 00453 else if ('0' <= c && c <= '9') { 00454 base = 10; 00455 num = c - '0'; 00456 } 00457 else { 00458 TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::getReference(): invalid character in character reference: expected 'x' or [0-9]"); 00459 } 00460 00461 do { 00462 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated"); 00463 TEST_FOR_EXCEPTION( c != ';' && !('0' <= c && c <= '9') , std::runtime_error , "XMLParser::getReference(): invalid character in character reference: expected [0-9] or ';'"); 00464 if (c == ';') { 00465 break; 00466 } 00467 num = num*base + (c-'0'); 00468 } while (1); 00469 TEST_FOR_EXCEPTION(num > 0xFF, std::runtime_error , "XMLParser::getReference(): character reference value out of range"); 00470 refstr.push_back( (unsigned char)num ); 00471 } 00472 else if (isLetter(c) || c=='_' || c==':') { 00473 // get an EntityRef 00474 // EntityRef ::= '&' Name ';' 00475 std::string entname = ""; 00476 entname.push_back(c); 00477 do { 00478 TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated"); 00479 if (c==';') { 00480 break; 00481 } 00482 else if ( isLetter(c) || ('0' <= c && c <= '9') 00483 || c=='.' || c=='-' || c=='_' || c==':' 00484 || c==0xB7 ) { 00485 entname.push_back(c); 00486 } 00487 else { 00488 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: invalid character"); 00489 } 00490 } while (1); 00491 TEST_FOR_EXCEPTION( _entities.find(entname) == _entities.end(), std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: undefined entity"); 00492 refstr = _entities[entname]; 00493 } 00494 else { 00495 TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): reference not well-formed: expected name or '#'"); 00496 } 00497 } 00498 00499 00500 int XMLParser::getSpace(unsigned char &lookahead) { 00501 // if space, consume the whitespace 00502 do { 00503 if (_is->readBytes(&lookahead,1) < 1) { 00504 return 1; // inform caller that we reached the end 00505 } 00506 } 00507 while (isSpace(lookahead)); 00508 return 0; 00509 } 00510 00511 00512 bool XMLParser::isLetter(unsigned char c) { 00513 if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || 00514 (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) || 00515 (0xF8 <= c) /* unsigned char must be <= 0xFF */ ) 00516 { 00517 return true; 00518 } 00519 return false; 00520 } 00521 00522 00523 bool XMLParser::isNameChar(unsigned char c) { 00524 if ( isLetter(c) || ('0' <= c && c <= '9') || 00525 c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 ) 00526 { 00527 return true; 00528 } 00529 return false; 00530 } 00531 00532 00533 bool XMLParser::isSpace(unsigned char c) { 00534 if ( c==0x20 || c==0x9 || c==0xD || c==0xA ) 00535 { 00536 return true; 00537 } 00538 return false; 00539 } 00540 00541 00542 bool XMLParser::isChar(unsigned char c) { 00543 if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF 00544 return true; 00545 } 00546 return false; 00547 } 00548 00549 00550 int XMLParser::assertChar(unsigned char cexp) 00551 { 00552 // pull the next character off the stream and verify that it is what is expected 00553 // if not, return an error to the caller 00554 unsigned char c; 00555 if (_is->readBytes(&c,1) < 1) { 00556 return 1; 00557 } 00558 if (c != cexp) { 00559 return 2; 00560 } 00561 return 0; 00562 } 00563
1.7.4