Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

XMLParse.hpp

Go to the documentation of this file.
00001 /*
00002 This product contains certain software code or other information
00003 ("AT&T Software") proprietary to AT&T Corp. ("AT&T").  The AT&T
00004 Software is provided to you "AS IS".  YOU ASSUME TOTAL RESPONSIBILITY
00005 AND RISK FOR USE OF THE AT&T SOFTWARE.  AT&T DOES NOT MAKE, AND
00006 EXPRESSLY DISCLAIMS, ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND
00007 WHATSOEVER, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
00008 MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, WARRANTIES OF
00009 TITLE OR NON-INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS, ANY
00010 WARRANTIES ARISING BY USAGE OF TRADE, COURSE OF DEALING OR COURSE OF
00011 PERFORMANCE, OR ANY WARRANTY THAT THE AT&T SOFTWARE IS "ERROR FREE" OR
00012 WILL MEET YOUR REQUIREMENTS.
00013 
00014 Unless you accept a license to use the AT&T Software, you shall not
00015 reverse compile, disassemble or otherwise reverse engineer this
00016 product to ascertain the source code for any AT&T Software.
00017 
00018 (c) AT&T Corp. All rights reserved.  AT&T is a registered trademark of AT&T Corp.
00019 
00020 ***********************************************************************
00021 
00022 History:
00023 
00024       24/11/99  - initial release by Hartmut Liefke, liefke@seas.upenn.edu
00025                                      Dan Suciu,      suciu@research.att.com
00026 */
00027 
00028 //**************************************************************************
00029 //**************************************************************************
00030 
00031 // This module contains the XML parser. Based on class 'FileParser',
00032 // the XML parser implements functions for finding and parsing elements and attributes.
00033 // The events are handled through a SAX-like interface called SAXClient.
00034 
00035 #include "Error.hpp"
00036 #include "FileParser.hpp"
00037 #include "SAXClient.hpp"
00038 
00039 extern unsigned long memory_cutoff;
00040    // The memory cutoff is the maximum amount of memory that should be used.
00041    // If the current memory allocation exceeds the limit, then the parser stops
00042    // and the current data is written to the compressed output file.
00043    // Then, the parser resumes.
00044 
inline char *TraverseWhiteSpaces(char *ptr,char *endptr)
   // Walks forward from 'ptr' over blanks, tabs, carriage returns and
   // line feeds, never moving to or beyond 'endptr'.
   // Returns the position of the first other character, or the position
   // where the walk had to stop because 'endptr' was reached.
{
   for(;ptr<endptr;ptr++)
   {
      char cur=*ptr;

      if((cur!=' ')&&(cur!='\t')&&(cur!='\r')&&(cur!='\n'))
         break;   // first non-white-space character
   }
   return ptr;
}
00052 
00053 class XMLParse : public FileParser
00054 { 
00055    SAXClient *saxclient;   // the event-receiving client
00056 
00057 public:
00058    void XMLParseError(char *errmsg)
00059       // Writes a parser error and exits
00060    {
00061       char tmpstr[50];
00062       sprintf(tmpstr,"Parse error in line %lu:\n",GetCurLineNo());
00063       Error(tmpstr);
00064       ErrorCont(errmsg);
00065       Exit();
00066    }
00067 
00068    void XMLParseError(char *errmsg,int savelineno)
00069       // Writes a parser error and exits
00070    {
00071       char tmpstr[50];
00072       sprintf(tmpstr,errmsg,savelineno);
00073       Error(tmpstr);
00074       Exit();
00075    }
00076 
00077 private:
00078    char SkipWhiteSpaces()
00079    {
00080       char c;
00081       do
00082       {
00083          PeekChar(&c);
00084          if((c!=' ')&&(c!='\t')&&(c!='\r')&&(c!='\n'))
00085             break;
00086          SkipChar();
00087       }
00088       while(1);
00089 
00090       return c;
00091    }
00092 
00093    char ParseAttribs()
00094    // This function scans the attributes in a given start label
00095    // The returns as soon as the trailing '>' is reached
00096    {
00097       char c;
00098       char *strptr;
00099       int  len;
00100 
00101       do
00102       {
00103          while(ReadWhiteSpaces(&strptr,&len)==0)
00104             // We read all white-spaces
00105             saxclient->HandleAttribWhiteSpaces(strptr,len,1);
00106 
00107          saxclient->HandleAttribWhiteSpaces(strptr,len,0);
00108 
00109          // Now we don't have any more white-spaces and we search
00110          // for '=' (if there is an attribute) or '>' (for the end of the element)
00111          PeekChar(&c);
00112          if((c=='>')||(c=='/'))  // End of label?
00113          {
00114             SkipChar();
00115             return c;
00116          }
00117          // Let's find '=' or some white-space
00118          while(ReadStringUntil(&strptr,&len,1,'=',0)==0)
00119             // We scan until we reach '='
00120             saxclient->HandleAttribName(strptr,len,1);
00121 
00122          // We found '='
00123          saxclient->HandleAttribName(strptr,len-1,0);
00124 
00125          if(strptr[len-1]!='=')
00126             // We found white-spaces instead?
00127          {
00128             c=SkipWhiteSpaces();
00129             if(c!='=')
00130                XMLParseError("Symbol '=' expected !");
00131             SkipChar();
00132          }
00133          // We skip all white spaces after '='
00134          c=SkipWhiteSpaces();
00135 
00136          // The next character should be a '"'
00137          // If not, then we assume that the value only goes until the
00138          // next white-space (or '>' or '/')!
00139 
00140          if(c!='"')
00141          {
00142             while(ReadStringUntil(&strptr,&len,1,'>','/')==0)
00143                saxclient->HandleAttribValue(strptr,len,0);
00144 
00145             saxclient->HandleAttribValue(strptr,len-1,1);
00146 
00147             c=strptr[len-1];
00148             if((c=='/')||(c=='>'))
00149                return c;
00150          }
00151          else
00152          {
00153             SkipChar();
00154 
00155             while(ReadStringUntil(&strptr,&len,0,'"','>')==0)
00156                saxclient->HandleAttribValue(strptr,len,0);
00157 
00158             if(strptr[len-1]=='>')
00159             {
00160                char tmpstr[100];
00161                sprintf(tmpstr,"Line %lu: Missing '\"' at the end of attribute value '",GetCurLineNo());
00162                Error(tmpstr);
00163                ErrorCont(strptr,len-1);
00164                ErrorCont("'!");
00165                PrintErrorMsg();
00166                UndoReadChar();
00167                len--;
00168             }
00169 
00170             saxclient->HandleAttribValue(strptr,len-1,1);
00171 
00172             PeekChar(&c);
00173             if((c!='>')&&(c!=' ')&&(c!='\t')&&(c!='\n')&&(c!='\r')&&(c!='/'))
00174             {
00175                char tmpstr[50];
00176                sprintf(tmpstr,"Skip invalid character '%c' in line %lu",c,GetCurLineNo());
00177                Error(tmpstr);
00178                PrintErrorMsg();
00179                SkipChar();
00180             }
00181          }
00182       }
00183       while(1);
00184    }
00185 
00186    void ParseLabel()
00187       // Scans a label after the '<' has already been parsed.
00188    {
00189       char c,*ptr;
00190       int  len;
00191 
00192       PeekChar(&c);
00193 
00194       if(c=='/') // An ending label ?
00195       {
00196          GetChar(&c);
00197 
00198          while(ReadStringUntil(&ptr,&len,0,'>','<')==0)
00199 //         while(ReadStringUntil(&ptr,&len,'>')==0)
00200             // We didn't find '>'  ?
00201             saxclient->HandleEndLabel(ptr,len,1);
00202 
00203          if(ptr[len-1]=='<')
00204          {
00205             Error("Unfinished end label!");
00206             PrintErrorMsg();
00207             UndoReadChar();
00208          }
00209 
00210          saxclient->HandleEndLabel(ptr,len-1,0);
00211          return;
00212       }
00213 
00214       while(ReadStringUntil(&ptr,&len,1,'>','/')==0)
00215          // We didn't find '>' or '/' or a white-space ?
00216          saxclient->HandleStartLabel(ptr,len,1);
00217 
00218       switch(ptr[len-1])
00219       {
00220       case '>':
00221          saxclient->HandleStartLabel(ptr,len-1,0);
00222          return;
00223 
00224       case '/':
00225          saxclient->HandleStartLabel(ptr,len-1,0);
00226          GetChar(&c);
00227          if(c!='>')
00228             XMLParseError("Symbol '/' in label must be followed by '>' !");
00229 
00230          saxclient->HandleEndLabel(NULL,0,0);
00231          return;
00232 
00233       default: // Did we find some white space ??
00234          saxclient->HandleStartLabel(ptr,len,0);
00235          c=ParseAttribs();
00236          if(c=='/')
00237          {
00238             // I.e. we received an empty label
00239             saxclient->HandleEndLabel(NULL,0,0);
00240             GetChar(&c);
00241          }
00242          if(c!='>')
00243             XMLParseError("Symbol '>' expected after '/' in tag!");
00244       }
00245    }
00246 
00247    void ParsePI()
00248       // Parses a processing instruction
00249    {
00250       int len,savelineno=GetCurLineNo();
00251       char *ptr;
00252 
00253       do
00254       {
00255          if(ReadStringUntil(&ptr,&len,"?>"))
00256             break;
00257 
00258          if(len==0)
00259             XMLParseError("Could not find closing '?>' for processing instruction in line %lu !",savelineno);
00260 
00261          saxclient->HandlePI(ptr,len,1);
00262       }
00263       while(1);
00264 
00265       saxclient->HandlePI(ptr,len,0);
00266    }
00267 
00268    void ParseCDATA()
00269       // Parses a CDATA section
00270    {
00271       int len,savelineno=GetCurLineNo();
00272       char *ptr;
00273 
00274       while(ReadStringUntil(&ptr,&len,"]]>")==0)
00275       {
00276          if(len==0)
00277             XMLParseError("Could not find closing ']]>' for CDATA section starting in line %lu !",savelineno);
00278 
00279          saxclient->HandleCDATA(ptr,len,1);
00280       }
00281     
00282       saxclient->HandleCDATA(ptr,len,0);
00283    }
00284 
00285    void ParseComment()
00286       // Parses a comment section
00287    {
00288       int len,savelineno=GetCurLineNo();
00289       char *ptr;
00290 
00291       while(ReadStringUntil(&ptr,&len,"-->")==0)
00292       {
00293          if(len==0)
00294             XMLParseError("Could not find closing '-->' for comment starting in line %lu !",savelineno);
00295 
00296          saxclient->HandleComment(ptr,len,1);
00297       }
00298 
00299       saxclient->HandleComment(ptr,len,0);
00300    }
00301 
00302    void ParseText()
00303       // Parses some text data
00304    {
00305       char err;
00306       int len;
00307       char *ptr,*leftwsptr,*rightwsptr,*endptr;
00308 
00309       // We look for the end '<'
00310       err=ReadStringUntil(&ptr,&len,'<');
00311 
00312       if((err==0)&&(len==0))
00313          return;
00314 
00315       endptr=ptr+len;
00316 
00317       // Let's traverse over all white spaces at the beginning
00318       leftwsptr=ptr;
00319 
00320       while((leftwsptr<endptr)&&
00321             (*leftwsptr==' ')||(*leftwsptr=='\t')||
00322             (*leftwsptr=='\r')||(*leftwsptr=='\n'))
00323          leftwsptr++;
00324 
00325       while(err==0)  // We didn't find '<' yet ?
00326                      // No? => We must handle that text before we can continue
00327       {
00328          if(len>0)
00329          {
00330             if(IsEndOfFile()&&(len==leftwsptr-ptr))
00331                // If all remaining characters are white spaces,
00332                // we send one single sequence
00333                saxclient->HandleText(ptr,len,0,len,len);
00334             else
00335                saxclient->HandleText(ptr,len,1,leftwsptr-ptr,0);
00336          }
00337 
00338          if(leftwsptr==endptr)   // Everything until now was just white spaces ?
00339                                  // ==> We compute again the number of left white-spaces
00340          {
00341             err=ReadStringUntil(&ptr,&len,'<');
00342 
00343             if((err==0)&&(len==0))  // No more characters? ==> We are done
00344                return;
00345 
00346             leftwsptr=ptr;
00347 
00348             while((leftwsptr<endptr)&&
00349                   (*leftwsptr==' ')||(*leftwsptr=='\t')||
00350                   (*leftwsptr=='\r')||(*leftwsptr=='\n'))
00351                leftwsptr++;
00352          }
00353          else
00354          {
00355             err=ReadStringUntil(&ptr,&len,'<');
00356             if((err==0)&&(len==0))
00357                return;
00358          
00359             leftwsptr=ptr; // i.e. the number of left-white spaces is set to zero
00360          }
00361       }
00362 
00363       // We found the character '<'
00364 
00365       // We take the '<' back
00366       UndoReadChar();
00367       len--;
00368 
00369       endptr=ptr+len;
00370 
00371       // Let's find the number of white spaces at the end of the string
00372       rightwsptr=endptr-1;
00373 
00374       while((rightwsptr>=ptr)&&
00375             (*rightwsptr==' ')||(*rightwsptr=='\t')||
00376             (*rightwsptr=='\r')||(*rightwsptr=='\n'))
00377          rightwsptr--;
00378 
00379       if(len>0)
00380          saxclient->HandleText(ptr,len,0,leftwsptr-ptr,endptr-rightwsptr-1);
00381    }
00382 
00383    void ParseDOCTYPE()
00384       // Parses a DOCTYPE section.
00385       // A DOCTYPE has format <!DOCTYPE ... >  or  <!DOCTTYPE ... [ ... ] >
00386    {
00387       int   len,savelineno=GetCurLineNo(); // We save the line
00388       char  *ptr;
00389       char  *myendptr,*curptr;
00390 
00391       // Let's get the current piece of buffer
00392       len=GetCurBlockPtr(&ptr);
00393       if(len==0)
00394          RefillAndGetCurBlockPtr(&ptr,&len);
00395 
00396       myendptr=ptr+len;
00397       curptr=ptr;
00398 
00399       do
00400       {
00401          if(*curptr=='[')
00402          {
00403             do
00404             {
00405                curptr++;
00406                if(curptr==myendptr)
00407                {
00408                   saxclient->HandleDOCTYPE(ptr,len,1);
00409                   FastSkipData(len);
00410                   RefillAndGetCurBlockPtr(&ptr,&len);
00411                   if(len==0)
00412                      XMLParseError("Could not find closing ']>' for DOCTYPE section starting in line %lu !",savelineno);
00413 
00414                   myendptr=ptr+len;
00415                   curptr=ptr;
00416                }
00417             }
00418             while(*curptr!=']');
00419          }
00420          if(*curptr=='>')
00421             break;
00422 
00423          curptr++;
00424          if(curptr==myendptr)
00425          {
00426             saxclient->HandleDOCTYPE(ptr,len,1);
00427             FastSkipData(len);
00428             RefillAndGetCurBlockPtr(&ptr,&len);
00429             if(len==0)
00430                XMLParseError("Could not find closing ']>' for DOCTYPE section starting in line %lu !",savelineno);
00431             myendptr=ptr+len;
00432             curptr=ptr;
00433          }
00434       }
00435       while(1);
00436 
00437       saxclient->HandleDOCTYPE(ptr,curptr+1-ptr,0);
00438       FastSkipData(curptr+1-ptr);
00439    }
00440 
00441 //******************************************************************************
00442 
00443 public:
00444 
00445    char DoParsing(SAXClient *myclient)
00446       // This is the main parse function
00447    {
00448       saxclient=myclient;
00449 
00450       xmlparser=this;
00451 
00452       char c[9];
00453 
00454       do
00455       {
00456          // Let's start parsing text
00457          ParseText();
00458 
00459          // If have reached the end of the file, we exit
00460          if(IsEndOfFile())
00461             return 1;
00462 
00463          // The next character must be an '<' character
00464          PeekChar(c);
00465          if(*c!='<') // This should actually be never true
00466          {
00467             Error("Character '<' expected !");
00468             XMLParseError("");
00469          }
00470 
00471          // let's look at the next three characters
00472          PeekData(c,3);
00473 
00474          switch(c[1])
00475          {
00476             case '?': // Processing Instruction ?
00477                if(c[2]=='>')
00478                {
00479                   SkipChar();
00480                   ParseLabel();
00481                }
00482                else
00483                   ParsePI();
00484                break;
00485 
00486             case '!':
00487                switch(c[2])
00488                {
00489                case '[': // We have <![CDATA[... ]]>
00490                   PeekData(c,9);
00491                   if(memcmp(c,"<![CDATA[",9)!=0)
00492                   {
00493                      Error("Invalid tag '");
00494                      ErrorCont(c,9);
00495                      ErrorCont("...' should probably be '<![CDATA ...' !");
00496                      XMLParseError("");
00497                   }
00498                   ParseCDATA();
00499                   break;
00500       
00501                case 'D': // We must have <!DOCTYPE ... [ ... ] >
00502                {
00503                   PeekData(c,9);
00504                   if(memcmp(c,"<!DOCTYPE",9)!=0)
00505                   {
00506                      Error("Invalid tag '");
00507                      ErrorCont(c,9);
00508                      ErrorCont("...' should probably be '<!DOCTYPE ...' !");
00509                      XMLParseError("");
00510                   }
00511                   ParseDOCTYPE();
00512                }
00513                break;
00514 
00515                case '-': // We (probably) have a comment <!-- ... -->
00516                   PeekData(c,4);
00517 
00518                   if(c[3]!='-')
00519                   {
00520                      Error("Invalid tag '");
00521                      ErrorCont(c,4);
00522                      ErrorCont("...' should probably be '<!-- ...' !");
00523                      XMLParseError("");
00524                   }
00525                   ParseComment();
00526                   break;
00527 
00528                default:
00529                   Error("Invalid tag '");
00530                   ErrorCont(c,3);
00531                   ErrorCont("...' !");
00532                   XMLParseError("");
00533                }
00534                break;
00535 
00536          case '=':
00537             Error("Invalid label '<=...'!");
00538             PrintErrorMsg();
00539             SkipChar();
00540             saxclient->HandleText("<",1,0,0,0);
00541             break;
00542 
00543          default: // If we only have a simple '<', we skip the character and
00544                   // parse the following label
00545             SkipChar();
00546             ParseLabel();
00547          }
00548       }
00549       while(allocatedmemory<memory_cutoff);
00550          // We perform the parsing as long as the allocated memory is smaller than the
00551          // memory cut off
00552 
00553       return 0;
00554    }
00555 };

Generated on Sat Oct 13 16:08:42 2001 for XMILL by doxygen1.2.11.1 written by Dimitri van Heesch, © 1997-2001