Teuchos - Trilinos Tools Package Version of the Day
Teuchos_XMLParser.cpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                    Teuchos: Common Tools Package
00005 //                 Copyright (2004) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ***********************************************************************
00040 // @HEADER
00041 
00042 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
00043 // when printing attribute values, one must check if the value contains quote
00044 // or apost; 
00045 // a quot'd attval cannot contain literal quot
00046 // a apos'd attval cannot contain literal apos
00047 // either they have to be matched appropriately or (easier) all quot and apos must
00048 // be replaced by &quot; and &apos;
00049 
00050 #include "Teuchos_XMLParser.hpp"
00051 #include "Teuchos_TreeBuildingXMLHandler.hpp"
00052 #include "Teuchos_Assert.hpp"
00053 #include <stack>
00054 
00055 using namespace Teuchos;
00056 
00057 // this parser currently does not support:
00058 // * XML declaration
00059 // * processing instructions
00060 // * XML schemas
00061 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
00062 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
00063 
00064 // it currently does support:
00065 // * comments
00066 // * empty element tags, e.g.   <hello />
00067 // * entity references: &amp; &lt; &gt; &apos; &quot;
00068 // * numeric character references: &#32;
00069 // * std::exception/error handling on parse errors
00070 
00071 
00072 /* From the W3C XML 1.0 Third Edition
00073    http://www.w3.org/TR/2004/REC-xml-20040204/
00074   
00075    The following productions specify well-formed XML documents.
00076    These have been reduced to the subset that this parser is expected to support.
00077         
00078      element      ::=  EmptyElemTag
00079                        | STag content ETag 
00080      STag         ::=  '<' Name (S Attribute)* S? '>' 
00081      Attribute    ::=  Name Eq AttValue 
00082      ETag         ::=  '</' Name S? '>'
00083      content      ::=  CharData? ((element | Reference | CDSect | Comment) CharData?)*
00084      EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00085      
00086      AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00087                        | "'" ([^<&'] | Reference)* "'"
00088      
00089      CharRef   ::= '&#' [0-9]+ ';'
00090      EntityRef ::= '&' Name ';'
00091      Reference ::= EntityRef | CharRef
00092      
00093      #x20 (space)
00094      #x9  (horizontal tab)
00095      #xD  (carriage return)
00096      #xA  (new line / line feed)
00097      
00098      S        ::=  (#x20 | #x9 | #xD | #xA)+
00099      Eq       ::=   S? '=' S?
00100      NameChar ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00101      Name     ::=  (Letter | '_' | ':') (NameChar)*
00102      
00103      Letter   ::= [#x0041-#x005A] | [#x0061-#x007A] 
00104                   | [#x00C0-#x00D6] | [#x00D8-#x00F6] 
00105                   | [#x00F8-#x00FF]
00106      Digit    ::= [#x0030-#x0039]
00107      
00108      Char      ::=  #x9 | #xA | #xD | [#x20-#xFF]   
00109      CharData  ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
00110                    that is, some std::string of characters not containing '<' or '&' or ']]>'
00111      Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00112                    that is, '<!--' txt '-->', where txt does not contain '--' 
00113      
00114      CDSect    ::= CDStart CData CDEnd
00115      CDStart   ::= '<![CDATA['
00116      CData     ::= (Char* - (Char* ']]>' Char*))
00117      CDEnd     ::= ']]>'
00118      
00119      document  ::=   prolog element Misc*
00120      prolog    ::=   XMLDecl? Misc*
00121      XMLDecl   ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
00122      Misc      ::=   Comment | S
00123         
00124 */
00125 
// Parse-error helper: raises std::runtime_error through
// TEUCHOS_TEST_FOR_EXCEPTION when COND holds. The message is prefixed with
// the parser's current line counter (_lineNo), so the macro is only usable
// inside XMLParser member functions. MSG may be a chain of stream insertions
// (e.g. "text" << value << "more text").
#define XMLPARSER_TFE( COND , MSG ) \
  TEUCHOS_TEST_FOR_EXCEPTION( COND, std::runtime_error, "XML parse error at line " << _lineNo << ": " << MSG )
00128 
00129 XMLObject XMLParser::parse() 
00130 {
00131   
00132   RCP<TreeBuildingXMLHandler> handler = rcp(new TreeBuildingXMLHandler());
00133   
00134   _entities.clear();
00135   _entities["apos"] = "'";
00136   _entities["quot"] = "\"";
00137   _entities["lt"]   = "<";
00138   _entities["gt"]   = ">";
00139   _entities["amp"]  = "&";
00140   
00141   bool done = false;
00142   int curopen = 0;  // number of currently open tags, or "do we process character data?"
00143   bool gotRoot = false;
00144   std::stack<long> tagLineStarts;
00145   std::stack<string> tags;
00146 
00147   while (!done) {
00148     
00149     std::string tag, cdata;
00150     unsigned char c1, c2;
00151     Teuchos::map<std::string,string> attrs;
00152     
00153     // Consume any whitespace
00154     if (curopen == 0) {
00155       // this will leave a lookahead in c1
00156       c1 = '\0';
00157       if ( getSpace(c1) ) {
00158         done = true;
00159         break;
00160       }
00161     }
00162     else {
00163       // need to manually lookahead
00164       if (_is->readBytes(&c1,1) < 1) {
00165         done = true;
00166         break;
00167       }
00168       if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
00169     }
00170 
00171     if (c1 == '<') {
00172       // determine if it is a STag/EmptyElemTag or ETag or Comment
00173       // get lookahead
00174       XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
00175 
00176       if (c2 == '/') {
00177         // we have: </
00178         // try to get an ETag
00179         getETag(tag);
00180         // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
00181         XMLPARSER_TFE( curopen == 0,  "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
00182         XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
00183                                                     << " did not match start element '" << tags.top() 
00184                                                     << "' from line " << tagLineStarts.top() );
00185         curopen--;
00186         tagLineStarts.pop();
00187         tags.pop();
00188       }
00189       else if (isLetter(c2) || c2==':' || c2=='_') {
00190         // it looks like a STag or an EmptyElemTag
00191         bool emptytag;
00192         tagLineStarts.push(_lineNo);
00193         getSTag(c2, tag, attrs, emptytag);
00194         tags.push(tag);
00195         handler->startElement(tag,attrs);
00196         if (curopen == 0) {
00197           XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
00198           gotRoot = true;
00199         }
00200         curopen++;
00201         if (emptytag) {
00202           // we just open this tag, so we should have any trouble closing it
00203           XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
00204           curopen--;
00205           tagLineStarts.pop();
00206           tags.pop();
00207         }
00208       }
00209       else if (c2 == '!') {
00210         // it is starting to look like a comment; we need '--'
00211         // if we don't get this, it means
00212         // * the document is not well-formed
00213         // * the document employs a feature not supported by this parser, 
00214         //   e.g. <!ELEMENT...  <!ATTLIST...  <!DOCTYPE...  <![CDATA[...
00215         XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
00216         XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
00217         getComment(_lineNo);
00218       }
00219       else {
00220         XMLPARSER_TFE(true,  "element not well-formed or exploits unsupported feature" );
00221       }
00222     }
00223     else if ( (curopen > 0) && (c1 == '&') ) {
00224       std::string chars = "";
00225       getReference(chars);
00226       handler->characters(chars);
00227     }
00228     else if ( (curopen > 0) ) {
00229       std::string chars = "";
00230       chars.push_back(c1);
00231       handler->characters(chars);
00232     }
00233     else {
00234       XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
00235     }
00236   }
00237 
00238   XMLPARSER_TFE( curopen != 0 ,  "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
00239 
00240   return handler->getObject();
00241 
00242 }
00243 
00244 
00245 void XMLParser::getETag(std::string &tag)
00246 {
00247   /* Recall from the specification:
00248         ETag  ::=  '</' Name S? '>'
00249         Name  ::=  (Letter | '_' | ':') (NameChar)*
00250     
00251      We have already consumed: </
00252   */
00253   
00254   bool tagover = false;
00255   unsigned char c;
00256   // clear tag
00257   tag = "";
00258   XMLPARSER_TFE( _is->readBytes(&c,1) < 1 ,  "EOF before end element was terminated");
00259   XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' ,  "tag not well-formed");
00260   tag.push_back(c);
00261   while (1) {
00262     XMLPARSER_TFE( _is->readBytes(&c,1) < 1 ,  "EOF before end element was terminated");
00263     if ( isNameChar(c) ) {
00264       if (tagover) {
00265         XMLPARSER_TFE(1,  "end element not well-formed: expected '>'");
00266       }
00267       tag.push_back(c);
00268     }
00269     else if (isSpace(c)) {
00270       // mark the end of the tag and consume the whitespace
00271       // if it is ia newline, it isn't an error
00272       if (c == '\n') ++_lineNo;
00273       tagover = true;
00274     }
00275     else if (c == '>') {
00276       break; 
00277     }
00278     else {
00279       XMLPARSER_TFE(1,  "end element not well-formed");
00280     }
00281   }
00282 }
00283 
00284 
00285 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag) 
00286 {
00287   
00288   /* Recall from the specification:
00289         
00290         STag         ::=  '<' Name (S Attribute)* S? '>' 
00291         EmptyElemTag ::=  '<' Name (S Attribute)* S? '/>'
00292         Name         ::=  (Letter | '_' | ':') (NameChar)*
00293         NameChar     ::=  Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
00294         
00295         S            ::=  (#x20 | #x9 | #xD | #xA)+
00296         Attribute    ::=  Name Eq AttValue 
00297         Eq           ::=   S? '=' S?
00298         AttValue     ::=  '"' ([^<&"] | Reference)* '"'
00299                           | "'" ([^<&'] | Reference)* "'"
00300         Reference ::= EntityRef | CharRef
00301         CharRef   ::= '&#' [0-9]+ ';'
00302         EntityRef ::= '&' Name ';'
00303         
00304      We have already consumed: <lookahead
00305   */
00306   
00307   unsigned char c;
00308   attrs.clear();
00309   
00310   tag = lookahead;
00311   // get the rest of the tag: (NameChar)*
00312   while (1) {
00313     XMLPARSER_TFE( _is->readBytes(&c,1) < 1 ,  "EOF before start element was terminated");
00314     if (isNameChar(c)) {
00315       tag.push_back(c);
00316     }
00317     else {
00318       break; 
00319     }
00320   }
00321   
00322   // after the name: should be one of the following
00323   // (S Attribute) | S? '>' | S? '/>'
00324   do {
00325     
00326     bool hadspace = false;
00327     
00328     // if space, consume the whitespace
00329     if ( isSpace(c) ) {
00330       hadspace = true;
00331       XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
00332     }
00333     
00334     // now, either Attribute | '>' | '/>'
00335     if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
00336       
00337       // Attribute
00338       // get attribute name, starting with contents of c
00339       std::string attname, attval;
00340       attname = c;
00341       do {
00342         XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before start element was terminated");
00343         if ( isNameChar(c) ) {
00344           attname.push_back(c);
00345         }
00346         else if ( isSpace(c) || c=='=' ) {
00347           break; 
00348         }
00349         else {
00350           XMLPARSER_TFE(1,  "attribute not well-formed: expected whitespace or '='");
00351         }
00352       } while (1);
00353       
00354       // if whitespace, consume it
00355       if (isSpace(c)) {
00356         getSpace(c);  
00357       }
00358       // should be on '='
00359       if (c != '=') {
00360         XMLPARSER_TFE(1,  "attribute not well-formed: expected '='");
00361       }
00362       
00363       // get any whitespace following the '='
00364       XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before start element was terminated");
00365       if (isSpace(c)) {
00366         getSpace(c);
00367       }
00368       
00369       // now get the quoted attribute value
00370       bool apost;
00371       attval = "";
00372       if (c == '\'') {
00373         apost = true;
00374       }
00375       else if (c == '\"') {
00376         apost = false;
00377       }
00378       else {
00379         XMLPARSER_TFE(1,  "attribute value must be quoted with either ''' or '\"'");
00380       }
00381       do {
00382         XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before start element was terminated");
00383         if (apost && c=='\'') {
00384           // end of attval
00385           break;
00386         }
00387         else if (!apost && c=='\"') {
00388           // end of attval
00389           break;
00390         }
00391         else if ( c == '&' ) {
00392           // finish: need to add support for Reference
00393           std::string refstr;
00394           getReference(refstr);
00395           attval += refstr;
00396         }
00397         else if ( c!='<' ) {
00398           // valid character for attval
00399           attval.push_back(c);
00400         }
00401         else {
00402           XMLPARSER_TFE(1,  "invalid character in attribute value");
00403         }
00404       } while(1);
00405       
00406       // add attribute to list
00407       XMLPARSER_TFE( attrs.find(attname) != attrs.end() ,  "cannot have two attributes with the same name");
00408       attrs[attname] = attval;
00409     }
00410     else if (c == '>') {
00411       emptytag = false;
00412       break;
00413     }
00414     else if (c == '/') {
00415       XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
00416       emptytag = true;
00417       break;
00418     }
00419     else {
00420       XMLPARSER_TFE(1,  "start element not well-formed: invalid character");
00421     }
00422   
00423     // get next char
00424     XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before start element was terminated");
00425   
00426   } while(1);
00427 }
00428 
00429 
00430 void XMLParser::getComment(long startLine) 
00431 {
00432   /* Recall from the specification:
00433         Comment   ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
00434                       that is, '<!--' txt '-->', where txt does not contain '--' 
00435      We have already consumed: <!--
00436      
00437      Be wary here of the fact that c=='-' implies isChar(c)
00438   */
00439   unsigned char c;
00440   while (1) {
00441     XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before terminating comment begun at line " << _lineNo );
00442     if (c == '\n') ++_lineNo;
00443     // if we have a -
00444     if (c=='-') {
00445       // then it must be the end of the comment or be a Char
00446       XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before terminating comment begun at line " << _lineNo );
00447       if (c == '\n') ++_lineNo;
00448       if (c=='-') {
00449         // this had better be leading to the end of the comment
00450         XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
00451         break;
00452       }
00453       else if (!isChar(c)) {
00454         XMLPARSER_TFE(1,  "comment not well-formed: invalid character at line " << _lineNo );
00455       }
00456     }
00457     else if (!isChar(c)) {
00458       XMLPARSER_TFE(1,  "comment not well-formed: invalid character at line " << _lineNo );
00459     }
00460   } 
00461 }
00462 
00463 
00464 void XMLParser::getReference(std::string &refstr) {
00465   // finish: does CharRef support only dec, or hex as well?
00466   unsigned char c;
00467   unsigned int num, base;
00468   refstr = "";
00469   // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
00470   XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before reference was terminated");
00471   if (c == '#') {
00472     // get a CharRef
00473     // CharRef   ::= '&#' [0-9]+ ';'
00474     //               | '&#x' [0-9]+ ';'
00475     // get first number
00476     XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before reference was terminated");
00477     if (c == 'x') {
00478       base = 16;
00479       num = 0;
00480     }
00481     else if ('0' <= c && c <= '9') {
00482       base = 10;
00483       num = c - '0';
00484     }
00485     else {
00486       XMLPARSER_TFE(1,  "invalid character in character reference: expected 'x' or [0-9]");
00487     }
00488 
00489     do {
00490       XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before reference was terminated");
00491       XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') ,  "invalid character in character reference: expected [0-9] or ';'");
00492       if (c == ';') {
00493         break;
00494       }
00495       num = num*base + (c-'0');
00496     } while (1);
00497     XMLPARSER_TFE(num > 0xFF,  "character reference value out of range");
00498     refstr.push_back( (unsigned char)num );
00499   }
00500   else if (isLetter(c) || c=='_' || c==':') {
00501     // get an EntityRef
00502     // EntityRef ::= '&' Name ';'
00503     std::string entname = "";
00504     entname.push_back(c);
00505     do {
00506       XMLPARSER_TFE(_is->readBytes(&c,1) < 1,  "EOF before reference was terminated");
00507       if (c==';') {
00508         break;
00509       }
00510       else if ( isLetter(c) || ('0' <= c && c <= '9')
00511                 || c=='.' || c=='-' || c=='_' || c==':' 
00512                 || c==0xB7 ) {
00513         entname.push_back(c);
00514       }
00515       else {
00516         XMLPARSER_TFE(1,  "entity reference not well-formed: invalid character");
00517       }
00518     } while (1);
00519     XMLPARSER_TFE( _entities.find(entname) == _entities.end(),  "entity reference not well-formed: undefined entity");
00520     refstr = _entities[entname];  
00521   }
00522   else {
00523     XMLPARSER_TFE(1,  "reference not well-formed: expected name or '#'");
00524   }
00525 }
00526 
00527 
00528 int XMLParser::getSpace(unsigned char &lookahead) {
00529   // if space, consume the whitespace
00530   do {
00531     if (lookahead == '\n') ++_lineNo;
00532     if (_is->readBytes(&lookahead,1) < 1) {
00533       return 1; // inform caller that we reached the end
00534     }
00535   }
00536   while (isSpace(lookahead));
00537   return 0;
00538 }
00539 
00540 
00541 bool XMLParser::isLetter(unsigned char c) {
00542   if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
00543        (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
00544        (0xF8 <= c) /* unsigned char must be <= 0xFF */         )
00545   {
00546     return true;
00547   }
00548   return false;
00549 }
00550 
00551 
00552 bool XMLParser::isNameChar(unsigned char c) {
00553   if ( isLetter(c) || ('0' <= c && c <= '9') ||
00554        c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 ) 
00555   {
00556     return true;
00557   }
00558   return false;
00559 }
00560 
00561 
00562 bool XMLParser::isSpace(unsigned char c) {
00563   if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
00564   {
00565     return true;
00566   }
00567   return false;
00568 }
00569 
00570 
00571 bool XMLParser::isChar(unsigned char c) {
00572   if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) {  // unsigned char must be <= 0xFF
00573     return true;
00574   }
00575   return false;
00576 }
00577 
00578 
00579 int XMLParser::assertChar(unsigned char cexp) 
00580 {
00581   // pull the next character off the stream and verify that it is what is expected
00582   // if not, return an error to the caller
00583   unsigned char c;
00584   // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
00585   if (_is->readBytes(&c,1) < 1) {
00586     return 1;
00587   }
00588   if (c != cexp) {
00589     return 2;
00590   }
00591   return 0; 
00592 }
00593 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines