Teuchos - Trilinos Tools Package Version of the Day
Teuchos_XMLParser.cpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                    Teuchos: Common Tools Package
00005 //                 Copyright (2004) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ***********************************************************************
00040 // @HEADER
00041 
00042 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
00043 // when printing attribute values, one must check if the value contains quote
00044 // or apost; 
00045 // a quot'd attval cannot contain literal quot
00046 // a apos'd attval cannot contain literal apos
00047 // either they have to be matched appropriately or (easier) all quot and apos must
00048 // be replaced by &quot; and &apos;
00049 
00050 #include "Teuchos_XMLParser.hpp"
00051 #include "Teuchos_TreeBuildingXMLHandler.hpp"
00052 #include "Teuchos_TestForException.hpp"
00053 
00054 using namespace Teuchos;
00055 
00056 // this parser currently does not support:
00057 // * XML declaration
00058 // * processing instructions
00059 // * XML schemas
00060 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
00061 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
00062 
00063 // it currently does support:
00064 // * comments
00065 // * empty element tags, e.g.   <hello />
00066 // * entity references: &amp; &lt; &gt; &apos; &quot;
00067 // * numeric character references: &#32;
00068 // * exception/error handling on parse errors
00069 
00070 
00071 /* From the W3C XML 1.0 Third Edition
00072    http://www.w3.org/TR/2004/REC-xml-20040204/
00073   
00074    The following productions specify well-formed XML documents.
00075    These have been reduced to the subset that this parser is anticipated to support.
00076         
00077      element      ::=  EmptyElemTag
00078                        | STag content ETag 
00079      STag         ::=  '<' Name (S Attribute)* S? '>' 
00080      Attribute    ::=  Name Eq AttValue 
00081      ETag         ::=  '</' Name S? '>'
00082      content      ::=  CharData? ((element | Reference | CDSect | Comment) CharData?)*
00083      EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00084      
00085      AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00086                        | "'" ([^<&'] | Reference)* "'"
00087      
00088      CharRef   ::= '&#' [0-9]+ ';'
00089      EntityRef ::= '&' Name ';'
00090      Reference ::= EntityRef | CharRef
00091      
00092      #x20 (space)
00093      #x9  (horizontal tab)
00094      #xD  (carriage return)
00095      #xA  (new line, new line line feed)
00096      
00097      S        ::=  (#x20 | #x9 | #xD | #xA)+
00098      Eq       ::=   S? '=' S?
00099      NameChar ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00100      Name     ::=  (Letter | '_' | ':') (NameChar)*
00101      
00102      Letter   ::= [#x0041-#x005A] | [#x0061-#x007A] 
00103                   | [#x00C0-#x00D6] | [#x00D8-#x00F6] 
00104                   | [#x00F8-#x00FF]
00105      Digit    ::= [#x0030-#x0039]
00106      
00107      Char      ::=  #x9 | #xA | #xD | [#x20-#xFF]   
00108      CharData  ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
00109                    that is, some string of characters not containing '<' or '&' or ']]>'
00110      Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00111                    that is, '<!--' txt '-->', where txt does not contain '--' 
00112      
00113      CDSect    ::= CDStart CData CDEnd
00114      CDStart   ::= '<![CDATA['
00115      CData     ::= (Char* - (Char* ']]>' Char*))
00116      CDEnd     ::= ']]>'
00117      
00118      document  ::=   prolog element Misc*
00119      prolog    ::=   XMLDecl? Misc*
00120      XMLDecl   ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
00121      Misc      ::=   Comment | S
00122         
00123 */
00124 
00125 XMLObject XMLParser::parse() 
00126 {
00127   
00128   RCP<TreeBuildingXMLHandler> handler = rcp(new TreeBuildingXMLHandler());
00129   
00130   _entities.clear();
00131   _entities["apos"] = "'";
00132   _entities["quot"] = "\"";
00133   _entities["lt"]   = "<";
00134   _entities["gt"]   = ">";
00135   _entities["amp"]  = "&";
00136   
00137   bool done = false;
00138   int curopen = 0;  // number of currently open tags, or "do we process character data?"
00139   bool gotRoot = false;
00140 
00141   while (!done) {
00142     
00143     std::string tag, cdata;
00144     unsigned char c1, c2;
00145     Teuchos::map<std::string,string> attrs;
00146     
00147     // Consume any whitespace
00148     if (curopen == 0) {
00149       // this will leave a lookahead in c1
00150       if ( getSpace(c1) ) {
00151         done = true;
00152         break;
00153       }
00154     }
00155     else {
00156       // need to manually lookahead
00157       if (_is->readBytes(&c1,1) < 1) {
00158         done = true;
00159         break;
00160       }
00161     }
00162 
00163     if (c1 == '<') {
00164       // determine if it is a STag/EmptyElemTag or ETag or Comment
00165       // get lookahead
00166       TEST_FOR_EXCEPTION( _is->readBytes(&c2,1) < 1 , std::runtime_error, "XMLParser::parse(): stream ended in tag begin/end");
00167 
00168       if (c2 == '/') {
00169         // we have: </
00170         // try to get an ETag
00171         getETag(tag);
00172         TEST_FOR_EXCEPTION( handler->endElement(tag)!=0, std::runtime_error,
00173           "XMLParser::getETag(): document not well-formed: end element"
00174           " tag = '"<<tag<<"' did not match start element");
00175         curopen--;
00176       }
00177       else if (isLetter(c2) || c2==':' || c2=='_') {
00178         // it looks like a STag or an EmptyElemTag
00179         bool emptytag;
00180         getSTag(c2, tag, attrs, emptytag);
00181         handler->startElement(tag,attrs);
00182         if (curopen == 0) {
00183           TEST_FOR_EXCEPTION(gotRoot == true, std::runtime_error,
00184             "XMLParser::getETag(): document not well-formed: more than one root element specified");
00185           gotRoot = true;
00186         }
00187         curopen++;
00188         if (emptytag) {
00189           TEST_FOR_EXCEPTION( handler->endElement(tag)!=0, std::runtime_error,
00190             "XMLParser::getETag(): document not well-formed: end element tag did not match start element");
00191           curopen--;
00192         }
00193       }
00194       else if (c2 == '!') {
00195         // it is starting to look like a comment; we need '--'
00196         // if we don't get this, it means
00197         // * the document is not well-formed
00198         // * the document employs a feature not supported by this parser, 
00199         //   e.g. <!ELEMENT...  <!ATTLIST...  <!DOCTYPE...  <![CDATA[...
00200         TEST_FOR_EXCEPTION( assertChar('-')!=0, std::runtime_error,
00201             "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00202         TEST_FOR_EXCEPTION( assertChar('-')!=0 , std::runtime_error,
00203             "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00204         getComment();
00205       }
00206       else {
00207         TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00208       }
00209     }
00210     else if ( (curopen > 0) && (c1 == '&') ) {
00211       std::string chars = "";
00212       getReference(chars);
00213       handler->characters(chars);
00214     }
00215     else if ( (curopen > 0) ) {
00216       std::string chars = "";
00217       chars.push_back(c1);
00218       handler->characters(chars);
00219     }
00220     else {
00221       TEST_FOR_EXCEPTION(1,std::runtime_error,"XMLParser::parse(): document not well-formed");
00222     }
00223   }
00224 
00225   TEST_FOR_EXCEPTION( curopen != 0 , std::runtime_error, "XMLParser::parse(): document not well-formed: elements not matched" );
00226 
00227   return handler->getObject();
00228 
00229 }
00230 
00231 
00232 void XMLParser::getETag(std::string &tag)
00233 {
00234   /* Recall from the specification:
00235         ETag  ::=  '</' Name S? '>'
00236         Name  ::=  (Letter | '_' | ':') (NameChar)*
00237     
00238      We have already consumed: </
00239   */
00240   
00241   bool tagover = false;
00242   unsigned char c;
00243   // clear tag
00244   tag = "";
00245   TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated");
00246   TEST_FOR_EXCEPTION( !isLetter(c) && c!='_' && c!=':' , std::runtime_error , "XMLParser::getETag(): tag not well-formed");
00247   tag.push_back(c);
00248   while (1) {
00249     TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated");
00250     if ( isNameChar(c) ) {
00251       if (tagover) {
00252         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed: expected '>'");
00253       }
00254       tag.push_back(c);
00255     }
00256     else if (isSpace(c)) {
00257       // mark the end of the tag and consume the whitespace
00258       tagover = true;
00259     }
00260     else if (c == '>') {
00261       break; 
00262     }
00263     else {
00264       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed");
00265     }
00266   }
00267 }
00268 
00269 
00270 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag) 
00271 {
00272   
00273   /* Recall from the specification:
00274         
00275         STag         ::=  '<' Name (S Attribute)* S? '>' 
00276         EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00277         Name         ::=  (Letter | '_' | ':') (NameChar)*
00278         NameChar     ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00279         
00280         S            ::=  (#x20 | #x9 | #xD | #xA)+
00281         Attribute    ::=  Name Eq AttValue 
00282         Eq           ::=   S? '=' S?
00283         AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00284                           | "'" ([^<&'] | Reference)* "'"
00285         Reference ::= EntityRef | CharRef
00286         CharRef   ::= '&#' [0-9]+ ';'
00287         EntityRef ::= '&' Name ';'
00288         
00289      We have already consumed: <lookahead
00290   */
00291   
00292   unsigned char c;
00293   attrs.clear();
00294   
00295   tag = lookahead;
00296   // get the rest of the tag: (NameChar)*
00297   while (1) {
00298     TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00299     if (isNameChar(c)) {
00300       tag.push_back(c);
00301     }
00302     else {
00303       break; 
00304     }
00305   }
00306   
00307   // after the name: should be one of the following
00308   // (S Attribute) | S? '>' | S? '/>'
00309   do {
00310     
00311     bool hadspace = false;
00312     
00313     // if space, consume the whitespace
00314     if ( isSpace(c) ) {
00315       hadspace = true;
00316       TEST_FOR_EXCEPTION( getSpace(c)!=0, std::runtime_error,
00317         "XMLParser::getSTag(): EOF before start element was terminated");
00318     }
00319     
00320     // now, either Attribute | '>' | '/>'
00321     if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
00322       
00323       // Attribute
00324       // get attribute name, starting with contents of c
00325       std::string attname, attval;
00326       attname = c;
00327       do {
00328         TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00329         if ( isNameChar(c) ) {
00330           attname.push_back(c);
00331         }
00332         else if ( isSpace(c) || c=='=' ) {
00333           break; 
00334         }
00335         else {
00336           TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected whitespace or '='");
00337         }
00338       } while (1);
00339       
00340       // if whitespace, consume it
00341       if (isSpace(c)) {
00342         getSpace(c);  
00343       }
00344       // should be on '='
00345       if (c != '=') {
00346         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected '='");
00347       }
00348       
00349       // get any whitespace following the '='
00350       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00351       if (isSpace(c)) {
00352         getSpace(c);
00353       }
00354       
00355       // now get the quoted attribute value
00356       bool apost;
00357       attval = "";
00358       if (c == '\'') {
00359         apost = true;
00360       }
00361       else if (c == '\"') {
00362         apost = false;
00363       }
00364       else {
00365         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute value must be quoted with either ''' or '\"'");
00366       }
00367       do {
00368         TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00369         if (apost && c=='\'') {
00370           // end of attval
00371           break;
00372         }
00373         else if (!apost && c=='\"') {
00374           // end of attval
00375           break;
00376         }
00377         else if ( c == '&' ) {
00378           // finish: need to add support for Reference
00379           std::string refstr;
00380           getReference(refstr);
00381           attval += refstr;
00382         }
00383         else if ( c!='<' ) {
00384           // valid character for attval
00385           attval.push_back(c);
00386         }
00387         else {
00388           TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): invalid character in attribute value");
00389         }
00390       } while(1);
00391       
00392       // add attribute to list
00393       TEST_FOR_EXCEPTION( attrs.find(attname) != attrs.end() , std::runtime_error , "XMLParser::getSTag(): cannot have two attributes with the same name");
00394       attrs[attname] = attval;
00395     }
00396     else if (c == '>') {
00397       emptytag = false;
00398       break;
00399     }
00400     else if (c == '/') {
00401       TEST_FOR_EXCEPTION(assertChar('>')!=0, std::runtime_error,
00402         "XMLParser::getSTag(): empty element tag not well-formed: expected '>'");
00403       emptytag = true;
00404       break;
00405     }
00406     else {
00407       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): start element not well-formed: invalid character");
00408     }
00409   
00410     // get next char
00411     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00412   
00413   } while(1);
00414 }
00415 
00416 
00417 void XMLParser::getComment() 
00418 {
00419   /* Recall from the specification:
00420         Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00421                       that is, '<!--' txt '-->', where txt does not contain '--' 
00422      We have already consumed: <!--
00423      
00424      Be wary here of the fact that c=='-' implies isChar(c)
00425   */
00426   unsigned char c;
00427   while (1) {
00428     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated");
00429     // if we have a -
00430     if (c=='-') {
00431       // then it must be the end of the comment or be a Char
00432       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated");
00433       if (c=='-') {
00434         // this had better be leading to the end of the comment
00435         TEST_FOR_EXCEPTION( assertChar('>')!=0, std::runtime_error,
00436             "XMLParser::getComment(): comment not well-formed: expected '>'");
00437         break;
00438       }
00439       else if (!isChar(c)) {
00440         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character");
00441       }
00442     }
00443     else if (!isChar(c)) {
00444       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character");
00445     }
00446   } 
00447 }
00448 
00449 
00450 void XMLParser::getReference(std::string &refstr) {
00451   // finish: does CharRef support only dec, or hex as well?
00452   unsigned char c;
00453   unsigned int num, base;
00454   refstr = "";
00455   TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00456   if (c == '#') {
00457     // get a CharRef
00458     // CharRef   ::= '&#' [0-9]+ ';'
00459     //               | '&#x' [0-9]+ ';'
00460     // get first number
00461     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00462     if (c == 'x') {
00463       base = 16;
00464       num = 0;
00465     }
00466     else if ('0' <= c && c <= '9') {
00467       base = 10;
00468       num = c - '0';
00469     }
00470     else {
00471       TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::getReference(): invalid character in character reference: expected 'x' or [0-9]");
00472     }
00473 
00474     do {
00475       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00476       TEST_FOR_EXCEPTION( c != ';' && !('0' <= c && c <= '9') , std::runtime_error , "XMLParser::getReference(): invalid character in character reference: expected [0-9] or ';'");
00477       if (c == ';') {
00478         break;
00479       }
00480       num = num*base + (c-'0');
00481     } while (1);
00482     TEST_FOR_EXCEPTION(num > 0xFF, std::runtime_error , "XMLParser::getReference(): character reference value out of range");
00483     refstr.push_back( (unsigned char)num );
00484   }
00485   else if (isLetter(c) || c=='_' || c==':') {
00486     // get an EntityRef
00487     // EntityRef ::= '&' Name ';'
00488     std::string entname = "";
00489     entname.push_back(c);
00490     do {
00491       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00492       if (c==';') {
00493         break;
00494       }
00495       else if ( isLetter(c) || ('0' <= c && c <= '9')
00496                 || c=='.' || c=='-' || c=='_' || c==':' 
00497                 || c==0xB7 ) {
00498         entname.push_back(c);
00499       }
00500       else {
00501         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: invalid character");
00502       }
00503     } while (1);
00504     TEST_FOR_EXCEPTION( _entities.find(entname) == _entities.end(), std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: undefined entity");
00505     refstr = _entities[entname];  
00506   }
00507   else {
00508     TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): reference not well-formed: expected name or '#'");
00509   }
00510 }
00511 
00512 
00513 int XMLParser::getSpace(unsigned char &lookahead) {
00514   // if space, consume the whitespace
00515   do {
00516     if (_is->readBytes(&lookahead,1) < 1) {
00517       return 1; // inform caller that we reached the end
00518     }
00519   }
00520   while (isSpace(lookahead));
00521   return 0;
00522 }
00523 
00524 
00525 bool XMLParser::isLetter(unsigned char c) {
00526   if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
00527        (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
00528        (0xF8 <= c) /* unsigned char must be <= 0xFF */         )
00529   {
00530     return true;
00531   }
00532   return false;
00533 }
00534 
00535 
00536 bool XMLParser::isNameChar(unsigned char c) {
00537   if ( isLetter(c) || ('0' <= c && c <= '9') ||
00538        c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 ) 
00539   {
00540     return true;
00541   }
00542   return false;
00543 }
00544 
00545 
00546 bool XMLParser::isSpace(unsigned char c) {
00547   if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
00548   {
00549     return true;
00550   }
00551   return false;
00552 }
00553 
00554 
00555 bool XMLParser::isChar(unsigned char c) {
00556   if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) {  // unsigned char must be <= 0xFF
00557     return true;
00558   }
00559   return false;
00560 }
00561 
00562 
00563 int XMLParser::assertChar(unsigned char cexp) 
00564 {
00565   // pull the next character off the stream and verify that it is what is expected
00566   // if not, return an error to the caller
00567   unsigned char c;
00568   if (_is->readBytes(&c,1) < 1) {
00569     return 1;
00570   }
00571   if (c != cexp) {
00572     return 2;
00573   }
00574   return 0; 
00575 }
00576 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines