Teuchos_XMLParser.cpp

Go to the documentation of this file.
00001 // @HEADER
00002 // ***********************************************************************
00003 // 
00004 //                    Teuchos: Common Tools Package
00005 //                 Copyright (2004) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82:
00030 // when printing attribute values, one must check whether the value contains
00031 // quot or apos;
00032 // a quot'd attval cannot contain a literal quot
00033 // an apos'd attval cannot contain a literal apos
00034 // either they have to be matched appropriately or (easier) all quot and apos must
00035 // be replaced by &quot; and &apos;
00036 
00037 #include "Teuchos_XMLParser.hpp"
00038 #include "Teuchos_TreeBuildingXMLHandler.hpp"
00039 #include "Teuchos_TestForException.hpp"
00040 
00041 using namespace Teuchos;
00042 
00043 // this parser currently does not support:
00044 // * XML declaration
00045 // * processing instructions
00046 // * XML schemas
00047 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
00048 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
00049 
00050 // it currently does support:
00051 // * comments
00052 // * empty element tags, e.g.   <hello />
00053 // * entity references: &amp; &lt; &gt; &apos; &quot;
00054 // * numeric character references: &#32;
00055 // * exception/error handling on parse errors
00056 
00057 
00058 /* From the W3C XML 1.0 Third Edition
00059    http://www.w3.org/TR/2004/REC-xml-20040204/
00060   
00061    The following productions specify well-formed XML documents.
00062    These have been reduced to the subset that this parser is expected to support.
00063         
00064      element      ::=  EmptyElemTag
00065                        | STag content ETag 
00066      STag         ::=  '<' Name (S Attribute)* S? '>' 
00067      Attribute    ::=  Name Eq AttValue 
00068      ETag         ::=  '</' Name S? '>'
00069      content      ::=  CharData? ((element | Reference | CDSect | Comment) CharData?)*
00070      EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00071      
00072      AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00073                        | "'" ([^<&'] | Reference)* "'"
00074      
00075      CharRef   ::= '&#' [0-9]+ ';'
00076      EntityRef ::= '&' Name ';'
00077      Reference ::= EntityRef | CharRef
00078      
00079      #x20 (space)
00080      #x9  (horizontal tab)
00081      #xD  (carriage return)
00082      #xA  (line feed, i.e. new line)
00083      
00084      S        ::=  (#x20 | #x9 | #xD | #xA)+
00085      Eq       ::=   S? '=' S?
00086      NameChar ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00087      Name     ::=  (Letter | '_' | ':') (NameChar)*
00088      
00089      Letter   ::= [#x0041-#x005A] | [#x0061-#x007A] 
00090                   | [#x00C0-#x00D6] | [#x00D8-#x00F6] 
00091                   | [#x00F8-#x00FF]
00092      Digit    ::= [#x0030-#x0039]
00093      
00094      Char      ::=  #x9 | #xA | #xD | [#x20-#xFF]   
00095      CharData  ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
00096                    that is, some string of characters not containing '<' or '&' or ']]>'
00097      Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00098                    that is, '<!--' txt '-->', where txt does not contain '--' 
00099      
00100      CDSect    ::= CDStart CData CDEnd
00101      CDStart   ::= '<![CDATA['
00102      CData     ::= (Char* - (Char* ']]>' Char*))
00103      CDEnd     ::= ']]>'
00104      
00105      document  ::=   prolog element Misc*
00106      prolog    ::=   XMLDecl? Misc*
00107      XMLDecl   ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
00108      Misc      ::=   Comment | S
00109         
00110 */
00111 
00112 XMLObject XMLParser::parse() 
00113 {
00114   
00115   RCP<TreeBuildingXMLHandler> handler = rcp(new TreeBuildingXMLHandler());
00116   
00117   _entities.clear();
00118   _entities["apos"] = "'";
00119   _entities["quot"] = "\"";
00120   _entities["lt"]   = "<";
00121   _entities["gt"]   = ">";
00122   _entities["amp"]  = "&";
00123   
00124   bool done = false;
00125   int curopen = 0;  // number of currently open tags, or "do we process character data?"
00126   bool gotRoot = false;
00127 
00128   while (!done) {
00129     
00130     std::string tag, cdata;
00131     unsigned char c1, c2;
00132     Teuchos::map<std::string,string> attrs;
00133     
00134     // Consume any whitespace
00135     if (curopen == 0) {
00136       // this will leave a lookahead in c1
00137       if ( getSpace(c1) ) {
00138         done = true;
00139         break;
00140       }
00141     }
00142     else {
00143       // need to manually lookahead
00144       if (_is->readBytes(&c1,1) < 1) {
00145         done = true;
00146         break;
00147       }
00148     }
00149 
00150     if (c1 == '<') {
00151       // determine if it is a STag/EmptyElemTag or ETag or Comment
00152       // get lookahead
00153       TEST_FOR_EXCEPTION( _is->readBytes(&c2,1) < 1 , std::runtime_error, "XMLParser::parse(): stream ended in tag begin/end");
00154 
00155       if (c2 == '/') {
00156         // we have: </
00157         // try to get an ETag
00158         getETag(tag);
00159         TEST_FOR_EXCEPTION( handler->endElement(tag)!=0, std::runtime_error,
00160           "XMLParser::getETag(): document not well-formed: end element"
00161           " tag = '"<<tag<<"' did not match start element");
00162         curopen--;
00163       }
00164       else if (isLetter(c2) || c2==':' || c2=='_') {
00165         // it looks like a STag or an EmptyElemTag
00166         bool emptytag;
00167         getSTag(c2, tag, attrs, emptytag);
00168         handler->startElement(tag,attrs);
00169         if (curopen == 0) {
00170           TEST_FOR_EXCEPTION(gotRoot == true, std::runtime_error,
00171             "XMLParser::getETag(): document not well-formed: more than one root element specified");
00172           gotRoot = true;
00173         }
00174         curopen++;
00175         if (emptytag) {
00176           TEST_FOR_EXCEPTION( handler->endElement(tag)!=0, std::runtime_error,
00177             "XMLParser::getETag(): document not well-formed: end element tag did not match start element");
00178           curopen--;
00179         }
00180       }
00181       else if (c2 == '!') {
00182         // it is starting to look like a comment; we need '--'
00183         // if we don't get this, it means
00184         // * the document is not well-formed
00185         // * the document employs a feature not supported by this parser, 
00186         //   e.g. <!ELEMENT...  <!ATTLIST...  <!DOCTYPE...  <![CDATA[...
00187         TEST_FOR_EXCEPTION( assertChar('-')!=0, std::runtime_error,
00188             "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00189         TEST_FOR_EXCEPTION( assertChar('-')!=0 , std::runtime_error,
00190             "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00191         getComment();
00192       }
00193       else {
00194         TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00195       }
00196     }
00197     else if ( (curopen > 0) && (c1 == '&') ) {
00198       std::string chars = "";
00199       getReference(chars);
00200       handler->characters(chars);
00201     }
00202     else if ( (curopen > 0) ) {
00203       std::string chars = "";
00204       chars.push_back(c1);
00205       handler->characters(chars);
00206     }
00207     else {
00208       TEST_FOR_EXCEPTION(1,std::runtime_error,"XMLParser::parse(): document not well-formed");
00209     }
00210   }
00211 
00212   TEST_FOR_EXCEPTION( curopen != 0 , std::runtime_error, "XMLParser::parse(): document not well-formed: elements not matched" );
00213 
00214   return handler->getObject();
00215 
00216 }
00217 
00218 
00219 void XMLParser::getETag(std::string &tag)
00220 {
00221   /* Recall from the specification:
00222         ETag  ::=  '</' Name S? '>'
00223         Name  ::=  (Letter | '_' | ':') (NameChar)*
00224     
00225      We have already consumed: </
00226   */
00227   
00228   bool tagover = false;
00229   unsigned char c;
00230   // clear tag
00231   tag = "";
00232   TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated");
00233   TEST_FOR_EXCEPTION( !isLetter(c) && c!='_' && c!=':' , std::runtime_error , "XMLParser::getETag(): tag not well-formed");
00234   tag.push_back(c);
00235   while (1) {
00236     TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated");
00237     if ( isNameChar(c) ) {
00238       if (tagover) {
00239         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed: expected '>'");
00240       }
00241       tag.push_back(c);
00242     }
00243     else if (isSpace(c)) {
00244       // mark the end of the tag and consume the whitespace
00245       tagover = true;
00246     }
00247     else if (c == '>') {
00248       break; 
00249     }
00250     else {
00251       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed");
00252     }
00253   }
00254 }
00255 
00256 
00257 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag) 
00258 {
00259   
00260   /* Recall from the specification:
00261         
00262         STag         ::=  '<' Name (S Attribute)* S? '>' 
00263         EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00264         Name         ::=  (Letter | '_' | ':') (NameChar)*
00265         NameChar     ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00266         
00267         S            ::=  (#x20 | #x9 | #xD | #xA)+
00268         Attribute    ::=  Name Eq AttValue 
00269         Eq           ::=   S? '=' S?
00270         AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00271                           | "'" ([^<&'] | Reference)* "'"
00272         Reference ::= EntityRef | CharRef
00273         CharRef   ::= '&#' [0-9]+ ';'
00274         EntityRef ::= '&' Name ';'
00275         
00276      We have already consumed: <lookahead
00277   */
00278   
00279   unsigned char c;
00280   attrs.clear();
00281   
00282   tag = lookahead;
00283   // get the rest of the tag: (NameChar)*
00284   while (1) {
00285     TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00286     if (isNameChar(c)) {
00287       tag.push_back(c);
00288     }
00289     else {
00290       break; 
00291     }
00292   }
00293   
00294   // after the name: should be one of the following
00295   // (S Attribute) | S? '>' | S? '/>'
00296   do {
00297     
00298     bool hadspace = false;
00299     
00300     // if space, consume the whitespace
00301     if ( isSpace(c) ) {
00302       hadspace = true;
00303       TEST_FOR_EXCEPTION( getSpace(c)!=0, std::runtime_error,
00304         "XMLParser::getSTag(): EOF before start element was terminated");
00305     }
00306     
00307     // now, either Attribute | '>' | '/>'
00308     if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
00309       
00310       // Attribute
00311       // get attribute name, starting with contents of c
00312       std::string attname, attval;
00313       attname = c;
00314       do {
00315         TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00316         if ( isNameChar(c) ) {
00317           attname.push_back(c);
00318         }
00319         else if ( isSpace(c) || c=='=' ) {
00320           break; 
00321         }
00322         else {
00323           TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected whitespace or '='");
00324         }
00325       } while (1);
00326       
00327       // if whitespace, consume it
00328       if (isSpace(c)) {
00329         getSpace(c);  
00330       }
00331       // should be on '='
00332       if (c != '=') {
00333         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected '='");
00334       }
00335       
00336       // get any whitespace following the '='
00337       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00338       if (isSpace(c)) {
00339         getSpace(c);
00340       }
00341       
00342       // now get the quoted attribute value
00343       bool apost;
00344       attval = "";
00345       if (c == '\'') {
00346         apost = true;
00347       }
00348       else if (c == '\"') {
00349         apost = false;
00350       }
00351       else {
00352         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute value must be quoted with either ''' or '\"'");
00353       }
00354       do {
00355         TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00356         if (apost && c=='\'') {
00357           // end of attval
00358           break;
00359         }
00360         else if (!apost && c=='\"') {
00361           // end of attval
00362           break;
00363         }
00364         else if ( c == '&' ) {
00365           // finish: need to add support for Reference
00366           std::string refstr;
00367           getReference(refstr);
00368           attval += refstr;
00369         }
00370         else if ( c!='<' ) {
00371           // valid character for attval
00372           attval.push_back(c);
00373         }
00374         else {
00375           TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): invalid character in attribute value");
00376         }
00377       } while(1);
00378       
00379       // add attribute to list
00380       TEST_FOR_EXCEPTION( attrs.find(attname) != attrs.end() , std::runtime_error , "XMLParser::getSTag(): cannot have two attributes with the same name");
00381       attrs[attname] = attval;
00382     }
00383     else if (c == '>') {
00384       emptytag = false;
00385       break;
00386     }
00387     else if (c == '/') {
00388       TEST_FOR_EXCEPTION(assertChar('>')!=0, std::runtime_error,
00389         "XMLParser::getSTag(): empty element tag not well-formed: expected '>'");
00390       emptytag = true;
00391       break;
00392     }
00393     else {
00394       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): start element not well-formed: invalid character");
00395     }
00396   
00397     // get next char
00398     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00399   
00400   } while(1);
00401 }
00402 
00403 
00404 void XMLParser::getComment() 
00405 {
00406   /* Recall from the specification:
00407         Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00408                       that is, '<!--' txt '-->', where txt does not contain '--' 
00409      We have already consumed: <!--
00410      
00411      Be wary here of the fact that c=='-' implies isChar(c)
00412   */
00413   unsigned char c;
00414   while (1) {
00415     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated");
00416     // if we have a -
00417     if (c=='-') {
00418       // then it must be the end of the comment or be a Char
00419       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated");
00420       if (c=='-') {
00421         // this had better be leading to the end of the comment
00422         TEST_FOR_EXCEPTION( assertChar('>')!=0, std::runtime_error,
00423             "XMLParser::getComment(): comment not well-formed: expected '>'");
00424         break;
00425       }
00426       else if (!isChar(c)) {
00427         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character");
00428       }
00429     }
00430     else if (!isChar(c)) {
00431       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character");
00432     }
00433   } 
00434 }
00435 
00436 
00437 void XMLParser::getReference(std::string &refstr) {
00438   // finish: does CharRef support only dec, or hex as well?
00439   unsigned char c;
00440   unsigned int num, base;
00441   refstr = "";
00442   TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00443   if (c == '#') {
00444     // get a CharRef
00445     // CharRef   ::= '&#' [0-9]+ ';'
00446     //               | '&#x' [0-9]+ ';'
00447     // get first number
00448     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00449     if (c == 'x') {
00450       base = 16;
00451       num = 0;
00452     }
00453     else if ('0' <= c && c <= '9') {
00454       base = 10;
00455       num = c - '0';
00456     }
00457     else {
00458       TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::getReference(): invalid character in character reference: expected 'x' or [0-9]");
00459     }
00460 
00461     do {
00462       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00463       TEST_FOR_EXCEPTION( c != ';' && !('0' <= c && c <= '9') , std::runtime_error , "XMLParser::getReference(): invalid character in character reference: expected [0-9] or ';'");
00464       if (c == ';') {
00465         break;
00466       }
00467       num = num*base + (c-'0');
00468     } while (1);
00469     TEST_FOR_EXCEPTION(num > 0xFF, std::runtime_error , "XMLParser::getReference(): character reference value out of range");
00470     refstr.push_back( (unsigned char)num );
00471   }
00472   else if (isLetter(c) || c=='_' || c==':') {
00473     // get an EntityRef
00474     // EntityRef ::= '&' Name ';'
00475     std::string entname = "";
00476     entname.push_back(c);
00477     do {
00478       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00479       if (c==';') {
00480         break;
00481       }
00482       else if ( isLetter(c) || ('0' <= c && c <= '9')
00483                 || c=='.' || c=='-' || c=='_' || c==':' 
00484                 || c==0xB7 ) {
00485         entname.push_back(c);
00486       }
00487       else {
00488         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: invalid character");
00489       }
00490     } while (1);
00491     TEST_FOR_EXCEPTION( _entities.find(entname) == _entities.end(), std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: undefined entity");
00492     refstr = _entities[entname];  
00493   }
00494   else {
00495     TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): reference not well-formed: expected name or '#'");
00496   }
00497 }
00498 
00499 
00500 int XMLParser::getSpace(unsigned char &lookahead) {
00501   // if space, consume the whitespace
00502   do {
00503     if (_is->readBytes(&lookahead,1) < 1) {
00504       return 1; // inform caller that we reached the end
00505     }
00506   }
00507   while (isSpace(lookahead));
00508   return 0;
00509 }
00510 
00511 
00512 bool XMLParser::isLetter(unsigned char c) {
00513   if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
00514        (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
00515        (0xF8 <= c) /* unsigned char must be <= 0xFF */         )
00516   {
00517     return true;
00518   }
00519   return false;
00520 }
00521 
00522 
00523 bool XMLParser::isNameChar(unsigned char c) {
00524   if ( isLetter(c) || ('0' <= c && c <= '9') ||
00525        c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 ) 
00526   {
00527     return true;
00528   }
00529   return false;
00530 }
00531 
00532 
00533 bool XMLParser::isSpace(unsigned char c) {
00534   if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
00535   {
00536     return true;
00537   }
00538   return false;
00539 }
00540 
00541 
00542 bool XMLParser::isChar(unsigned char c) {
00543   if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) {  // unsigned char must be <= 0xFF
00544     return true;
00545   }
00546   return false;
00547 }
00548 
00549 
00550 int XMLParser::assertChar(unsigned char cexp) 
00551 {
00552   // pull the next character off the stream and verify that it is what is expected
00553   // if not, return an error to the caller
00554   unsigned char c;
00555   if (_is->readBytes(&c,1) < 1) {
00556     return 1;
00557   }
00558   if (c != cexp) {
00559     return 2;
00560   }
00561   return 0; 
00562 }
00563 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

Generated on Tue Oct 20 10:14:01 2009 for Teuchos Package Browser (Single Doxygen Collection) by  doxygen 1.6.1