Teuchos_XMLParser.cpp

00001 // @HEADER
00002 // ***********************************************************************
00003 // 
00004 //                    Teuchos: Common Tools Package
00005 //                 Copyright (2004) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82:
00030 // when printing attribute values, one must check whether the value contains
00031 // a quot (") or an apos (') character;
00032 // a quot'd attval cannot contain a literal quot
00033 // an apos'd attval cannot contain a literal apos
00034 // either they have to be matched appropriately or (easier) all quot and apos must
00035 // be replaced by &quot; and &apos;
00036 
00037 #include "Teuchos_XMLParser.hpp"
00038 #include "Teuchos_TreeBuildingXMLHandler.hpp"
00039 #include "Teuchos_TestForException.hpp"
00040 
00041 using namespace Teuchos;
00042 
00043 // this parser currently does not support:
00044 // * XML declaration
00045 // * processing instructions
00046 // * XML schemas
00047 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
00048 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
00049 
00050 // it currently does support:
00051 // * comments
00052 // * empty element tags, e.g.   <hello />
00053 // * entity references: &amp; &lt; &gt; &apos; &quot;
00054 // * numeric character references: &#32;
00055 // * exception/error handling on parse errors
00056 
00057 
00058 /* From the W3C XML 1.0 Third Edition
00059    http://www.w3.org/TR/2004/REC-xml-20040204/
00060   
00061    The following productions specify well-formed XML documents.
00062    These have been reduced to the subset that this parser is expected to support.
00063         
00064      element      ::=  EmptyElemTag
00065                        | STag content ETag 
00066      STag         ::=  '<' Name (S Attribute)* S? '>' 
00067      Attribute    ::=  Name Eq AttValue 
00068      ETag         ::=  '</' Name S? '>'
00069      content      ::=  CharData? ((element | Reference | CDSect | Comment) CharData?)*
00070      EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00071      
00072      AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00073                        | "'" ([^<&'] | Reference)* "'"
00074      
00075      CharRef   ::= '&#' [0-9]+ ';'
00076      EntityRef ::= '&' Name ';'
00077      Reference ::= EntityRef | CharRef
00078      
00079      #x20 (space)
00080      #x9  (horizontal tab)
00081      #xD  (carriage return)
00082      #xA  (line feed, i.e. new line)
00083      
00084      S        ::=  (#x20 | #x9 | #xD | #xA)+
00085      Eq       ::=   S? '=' S?
00086      NameChar ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00087      Name     ::=  (Letter | '_' | ':') (NameChar)*
00088      
00089      Letter   ::= [#x0041-#x005A] | [#x0061-#x007A] 
00090                   | [#x00C0-#x00D6] | [#x00D8-#x00F6] 
00091                   | [#x00F8-#x00FF]
00092      Digit    ::= [#x0030-#x0039]
00093      
00094      Char      ::=  #x9 | #xA | #xD | [#x20-#xFF]   
00095      CharData  ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
00096                    that is, some string of characters not containing '<' or '&' or ']]>'
00097      Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00098                    that is, '<!--' txt '-->', where txt does not contain '--' 
00099      
00100      CDSect    ::= CDStart CData CDEnd
00101      CDStart   ::= '<![CDATA['
00102      CData     ::= (Char* - (Char* ']]>' Char*))
00103      CDEnd     ::= ']]>'
00104      
00105      document  ::=   prolog element Misc*
00106      prolog    ::=   XMLDecl? Misc*
00107      XMLDecl   ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
00108      Misc      ::=   Comment | S
00109         
00110 */
00111 
00112 XMLObject XMLParser::parse() 
00113 {
00114   
00115   RCP<TreeBuildingXMLHandler> handler = rcp(new TreeBuildingXMLHandler());
00116   
00117   _entities.clear();
00118   _entities["apos"] = "'";
00119   _entities["quot"] = "\"";
00120   _entities["lt"]   = "<";
00121   _entities["gt"]   = ">";
00122   _entities["amp"]  = "&";
00123   
00124   bool done = false;
00125   int curopen = 0;  // number of currently open tags, or "do we process character data?"
00126   bool gotRoot = false;
00127 
00128   while (!done) {
00129     
00130     std::string tag, cdata;
00131     unsigned char c1, c2;
00132     Teuchos::map<std::string,string> attrs;
00133     
00134     // Consume any whitespace
00135     if (curopen == 0) {
00136       // this will leave a lookahead in c1
00137       if ( getSpace(c1) ) {
00138         done = true;
00139         break;
00140       }
00141     }
00142     else {
00143       // need to manually lookahead
00144       if (_is->readBytes(&c1,1) < 1) {
00145         done = true;
00146         break;
00147       }
00148     }
00149 
00150     if (c1 == '<') {
00151       // determine if it is a STag/EmptyElemTag or ETag or Comment
00152       // get lookahead
00153       TEST_FOR_EXCEPTION( _is->readBytes(&c2,1) < 1 , std::runtime_error, "XMLParser::parse(): stream ended in tag begin/end");
00154 
00155       if (c2 == '/') {
00156         // we have: </
00157         // try to get an ETag
00158         getETag(tag);
00159         TEST_FOR_EXCEPTION( handler->endElement(tag) , std::runtime_error , "XMLParser::getETag(): document not well-formed: end element tag did not match start element");
00160         curopen--;
00161       }
00162       else if (isLetter(c2) || c2==':' || c2=='_') {
00163         // it looks like a STag or an EmptyElemTag
00164         bool emptytag;
00165         getSTag(c2, tag, attrs, emptytag);
00166         handler->startElement(tag,attrs);
00167         if (curopen == 0) {
00168           TEST_FOR_EXCEPTION(gotRoot == true, std::runtime_error , "XMLParser::getETag(): document not well-formed: more than one root element specified");
00169           gotRoot = true;
00170         }
00171         curopen++;
00172         if (emptytag) {
00173           TEST_FOR_EXCEPTION( handler->endElement(tag) , std::runtime_error , "XMLParser::getETag(): document not well-formed: end element tag did not match start element");
00174           curopen--;
00175         }
00176       }
00177       else if (c2 == '!') {
00178         // it is starting to look like a comment; we need '--'
00179         // if we don't get this, it means
00180         // * the document is not well-formed
00181         // * the document employs a feature not supported by this parser, 
00182         //   e.g. <!ELEMENT...  <!ATTLIST...  <!DOCTYPE...  <![CDATA[...
00183         TEST_FOR_EXCEPTION( assertChar('-') , std::runtime_error , "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00184         TEST_FOR_EXCEPTION( assertChar('-') , std::runtime_error , "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00185         getComment();
00186       }
00187       else {
00188         TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::parse(): element not well-formed or exploits unsupported feature" );
00189       }
00190     }
00191     else if ( (curopen > 0) && (c1 == '&') ) {
00192       std::string chars = "";
00193       getReference(chars);
00194       handler->characters(chars);
00195     }
00196     else if ( (curopen > 0) ) {
00197       std::string chars = "";
00198       chars.push_back(c1);
00199       handler->characters(chars);
00200     }
00201     else {
00202       TEST_FOR_EXCEPTION(1,std::runtime_error,"XMLParser::parse(): document not well-formed");
00203     }
00204   }
00205 
00206   TEST_FOR_EXCEPTION( curopen != 0 , std::runtime_error, "XMLParser::parse(): document not well-formed: elements not matched" );
00207 
00208   return handler->getObject();
00209 
00210 }
00211 
00212 
00213 void XMLParser::getETag(std::string &tag)
00214 {
00215   /* Recall from the specification:
00216         ETag  ::=  '</' Name S? '>'
00217         Name  ::=  (Letter | '_' | ':') (NameChar)*
00218     
00219      We have already consumed: </
00220   */
00221   
00222   bool tagover = false;
00223   unsigned char c;
00224   // clear tag
00225   tag = "";
00226   TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated");
00227   TEST_FOR_EXCEPTION( !isLetter(c) && c!='_' && c!=':' , std::runtime_error , "XMLParser::getETag(): tag not well-formed");
00228   tag.push_back(c);
00229   while (1) {
00230     TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getETag(): EOF before end element was terminated");
00231     if ( isNameChar(c) ) {
00232       if (tagover) {
00233         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed: expected '>'");
00234       }
00235       tag.push_back(c);
00236     }
00237     else if (isSpace(c)) {
00238       // mark the end of the tag and consume the whitespace
00239       tagover = true;
00240     }
00241     else if (c == '>') {
00242       break; 
00243     }
00244     else {
00245       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getETag(): end element not well-formed");
00246     }
00247   }
00248 }
00249 
00250 
00251 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag) 
00252 {
00253   
00254   /* Recall from the specification:
00255         
00256         STag         ::=  '<' Name (S Attribute)* S? '>' 
00257         EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00258         Name         ::=  (Letter | '_' | ':') (NameChar)*
00259         NameChar     ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00260         
00261         S            ::=  (#x20 | #x9 | #xD | #xA)+
00262         Attribute    ::=  Name Eq AttValue 
00263         Eq           ::=   S? '=' S?
00264         AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00265                           | "'" ([^<&'] | Reference)* "'"
00266         Reference ::= EntityRef | CharRef
00267         CharRef   ::= '&#' [0-9]+ ';'
00268         EntityRef ::= '&' Name ';'
00269         
00270      We have already consumed: <lookahead
00271   */
00272   
00273   unsigned char c;
00274   attrs.clear();
00275   
00276   tag = lookahead;
00277   // get the rest of the tag: (NameChar)*
00278   while (1) {
00279     TEST_FOR_EXCEPTION( _is->readBytes(&c,1) < 1 , std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00280     if (isNameChar(c)) {
00281       tag.push_back(c);
00282     }
00283     else {
00284       break; 
00285     }
00286   }
00287   
00288   // after the name: should be one of the following
00289   // (S Attribute) | S? '>' | S? '/>'
00290   do {
00291     
00292     bool hadspace = false;
00293     
00294     // if space, consume the whitespace
00295     if ( isSpace(c) ) {
00296       hadspace = true;
00297       TEST_FOR_EXCEPTION( getSpace(c) , std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00298     }
00299     
00300     // now, either Attribute | '>' | '/>'
00301     if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
00302       
00303       // Attribute
00304       // get attribute name, starting with contents of c
00305       std::string attname, attval;
00306       attname = c;
00307       do {
00308         TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00309         if ( isNameChar(c) ) {
00310           attname.push_back(c);
00311         }
00312         else if ( isSpace(c) || c=='=' ) {
00313           break; 
00314         }
00315         else {
00316           TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected whitespace or '='");
00317         }
00318       } while (1);
00319       
00320       // if whitespace, consume it
00321       if (isSpace(c)) {
00322         getSpace(c);  
00323       }
00324       // should be on '='
00325       if (c != '=') {
00326         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute not well-formed: expected '='");
00327       }
00328       
00329       // get any whitespace following the '='
00330       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00331       if (isSpace(c)) {
00332         getSpace(c);
00333       }
00334       
00335       // now get the quoted attribute value
00336       bool apost;
00337       attval = "";
00338       if (c == '\'') {
00339         apost = true;
00340       }
00341       else if (c == '\"') {
00342         apost = false;
00343       }
00344       else {
00345         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): attribute value must be quoted with either ''' or '\"'");
00346       }
00347       do {
00348         TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00349         if (apost && c=='\'') {
00350           // end of attval
00351           break;
00352         }
00353         else if (!apost && c=='\"') {
00354           // end of attval
00355           break;
00356         }
00357         else if ( c == '&' ) {
00358           // finish: need to add support for Reference
00359           std::string refstr;
00360           getReference(refstr);
00361           attval += refstr;
00362         }
00363         else if ( c!='<' ) {
00364           // valid character for attval
00365           attval.push_back(c);
00366         }
00367         else {
00368           TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): invalid character in attribute value");
00369         }
00370       } while(1);
00371       
00372       // add attribute to list
00373       TEST_FOR_EXCEPTION( attrs.find(attname) != attrs.end() , std::runtime_error , "XMLParser::getSTag(): cannot have two attributes with the same name");
00374       attrs[attname] = attval;
00375     }
00376     else if (c == '>') {
00377       emptytag = false;
00378       break;
00379     }
00380     else if (c == '/') {
00381       TEST_FOR_EXCEPTION(assertChar('>'), std::runtime_error , "XMLParser::getSTag(): empty element tag not well-formed: expected '>'");
00382       emptytag = true;
00383       break;
00384     }
00385     else {
00386       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getSTag(): start element not well-formed: invalid character");
00387     }
00388   
00389     // get next char
00390     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getSTag(): EOF before start element was terminated");
00391   
00392   } while(1);
00393 }
00394 
00395 
00396 void XMLParser::getComment() 
00397 {
00398   /* Recall from the specification:
00399         Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00400                       that is, '<!--' txt '-->', where txt does not contain '--' 
00401      We have already consumed: <!--
00402      
00403      Be wary here of the fact that c=='-' implies isChar(c)
00404   */
00405   unsigned char c;
00406   while (1) {
00407     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated");
00408     // if we have a -
00409     if (c=='-') {
00410       // then it must be the end of the comment or be a Char
00411       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getComment(): EOF before comment was terminated");
00412       if (c=='-') {
00413         // this had better be leading to the end of the comment
00414         TEST_FOR_EXCEPTION( assertChar('>') , std::runtime_error , "XMLParser::getComment(): comment not well-formed: expected '>'");
00415         break;
00416       }
00417       else if (!isChar(c)) {
00418         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character");
00419       }
00420     }
00421     else if (!isChar(c)) {
00422       TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getComment(): comment not well-formed: invalid character");
00423     }
00424   } 
00425 }
00426 
00427 
00428 void XMLParser::getReference(std::string &refstr) {
00429   // finish: does CharRef support only dec, or hex as well?
00430   unsigned char c;
00431   unsigned int num, base;
00432   refstr = "";
00433   TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00434   if (c == '#') {
00435     // get a CharRef
00436     // CharRef   ::= '&#' [0-9]+ ';'
00437     //               | '&#x' [0-9]+ ';'
00438     // get first number
00439     TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00440     if (c == 'x') {
00441       base = 16;
00442       num = 0;
00443     }
00444     else if ('0' <= c && c <= '9') {
00445       base = 10;
00446       num = c - '0';
00447     }
00448     else {
00449       TEST_FOR_EXCEPTION(1, std::runtime_error, "XMLParser::getReference(): invalid character in character reference: expected 'x' or [0-9]");
00450     }
00451 
00452     do {
00453       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00454       TEST_FOR_EXCEPTION( c != ';' && !('0' <= c && c <= '9') , std::runtime_error , "XMLParser::getReference(): invalid character in character reference: expected [0-9] or ';'");
00455       if (c == ';') {
00456         break;
00457       }
00458       num = num*base + (c-'0');
00459     } while (1);
00460     TEST_FOR_EXCEPTION(num > 0xFF, std::runtime_error , "XMLParser::getReference(): character reference value out of range");
00461     refstr.push_back( (unsigned char)num );
00462   }
00463   else if (isLetter(c) || c=='_' || c==':') {
00464     // get an EntityRef
00465     // EntityRef ::= '&' Name ';'
00466     std::string entname = "";
00467     entname.push_back(c);
00468     do {
00469       TEST_FOR_EXCEPTION(_is->readBytes(&c,1) < 1, std::runtime_error , "XMLParser::getReference(): EOF before reference was terminated");
00470       if (c==';') {
00471         break;
00472       }
00473       else if ( isLetter(c) || ('0' <= c && c <= '9')
00474                 || c=='.' || c=='-' || c=='_' || c==':' 
00475                 || c==0xB7 ) {
00476         entname.push_back(c);
00477       }
00478       else {
00479         TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: invalid character");
00480       }
00481     } while (1);
00482     TEST_FOR_EXCEPTION( _entities.find(entname) == _entities.end(), std::runtime_error , "XMLParser::getReference(): entity reference not well-formed: undefined entity");
00483     refstr = _entities[entname];  
00484   }
00485   else {
00486     TEST_FOR_EXCEPTION(1, std::runtime_error , "XMLParser::getReference(): reference not well-formed: expected name or '#'");
00487   }
00488 }
00489 
00490 
00491 int XMLParser::getSpace(unsigned char &lookahead) {
00492   // if space, consume the whitespace
00493   do {
00494     if (_is->readBytes(&lookahead,1) < 1) {
00495       return 1; // inform caller that we reached the end
00496     }
00497   }
00498   while (isSpace(lookahead));
00499   return 0;
00500 }
00501 
00502 
00503 bool XMLParser::isLetter(unsigned char c) {
00504   if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
00505        (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
00506        (0xF8 <= c) /* unsigned char must be <= 0xFF */         )
00507   {
00508     return true;
00509   }
00510   return false;
00511 }
00512 
00513 
00514 bool XMLParser::isNameChar(unsigned char c) {
00515   if ( isLetter(c) || ('0' <= c && c <= '9') ||
00516        c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 ) 
00517   {
00518     return true;
00519   }
00520   return false;
00521 }
00522 
00523 
00524 bool XMLParser::isSpace(unsigned char c) {
00525   if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
00526   {
00527     return true;
00528   }
00529   return false;
00530 }
00531 
00532 
00533 bool XMLParser::isChar(unsigned char c) {
00534   if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) {  // unsigned char must be <= 0xFF
00535     return true;
00536   }
00537   return false;
00538 }
00539 
00540 
00541 int XMLParser::assertChar(unsigned char cexp) 
00542 {
00543   // pull the next character off the stream and verify that it is what is expected
00544   // if not, return an error to the caller
00545   unsigned char c;
00546   if (_is->readBytes(&c,1) < 1) {
00547     return 1;
00548   }
00549   if (c != cexp) {
00550     return 2;
00551   }
00552   return 0; 
00553 }
00554 

Generated on Wed May 12 21:40:33 2010 for Teuchos - Trilinos Tools Package by  doxygen 1.4.7