SAXClient.cpp Source File

00001 /*
00002 This product contains certain software code or other information
00003 ("AT&T Software") proprietary to AT&T Corp. ("AT&T").  The AT&T
00004 Software is provided to you "AS IS".  YOU ASSUME TOTAL RESPONSIBILITY
00005 AND RISK FOR USE OF THE AT&T SOFTWARE.  AT&T DOES NOT MAKE, AND
00006 EXPRESSLY DISCLAIMS, ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND
00007 WHATSOEVER, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
00008 MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, WARRANTIES OF
00009 TITLE OR NON-INFRINGEMENT OF ANY INTELLECTUAL PROPERTY RIGHTS, ANY
00010 WARRANTIES ARISING BY USAGE OF TRADE, COURSE OF DEALING OR COURSE OF
00011 PERFORMANCE, OR ANY WARRANTY THAT THE AT&T SOFTWARE IS "ERROR FREE" OR
00012 WILL MEET YOUR REQUIREMENTS.
00013 
00014 Unless you accept a license to use the AT&T Software, you shall not
00015 reverse compile, disassemble or otherwise reverse engineer this
00016 product to ascertain the source code for any AT&T Software.
00017 
00018 (c) AT&T Corp. All rights reserved.  AT&T is a registered trademark of AT&T Corp.
00019 
00020 ***********************************************************************
00021 
00022 History:
00023 
00024       24/11/99  - initial release by Hartmut Liefke, liefke@seas.upenn.edu
00025                                      Dan Suciu,      suciu@research.att.com
00026 */
00027 
00028 //***********************************************************************
00029 //***********************************************************************
00030 
00031 // This module contains the SAX-Client
00032 // The interface used is very similar to SAX.
00033 
00034 #include <stdio.h>
00035 
00036 #include "Error.hpp"
00037 #include "Output.hpp"
00038 #include "SAXClient.hpp"
00039 #include "ContMan.hpp"
00040 #include "CurPath.hpp"
00041 #include "XMLParse.hpp"
00042 
00043 extern CurPath  curpath;
00044 
00045 extern char globalfullwhitespacescompress;
00046 extern char globalattribwhitespacescompress;
00047 
00048 extern CompressContainer *globalwhitespacecont;
00049 
00050 // These flags tell us whether to ignore comments, cdata, etc. or not.
00051 extern char ignore_comment;
00052 extern char ignore_cdata;
00053 extern char ignore_doctype;
00054 extern char ignore_pi;
00055 
00056 // The XML Parser
00057 XMLParse *xmlparser;
00058 
00059 
#if defined(USE_FORWARD_DATAGUIDE)
// Node of the forward path tree that corresponds to the current path
PathTreeNode   *curpathtreenode;

void InitForwardDataGuide()
   // Resets the forward data guide cursor to the root node of the path tree
{
   curpathtreenode=pathtree.GetRootNode();
}
#endif
00068 
00069 //**************************************************************************
00070 //**************************************************************************
00071 
00072 // First some auxiliary functions for storing start/end labels
00073 
00074 inline void StoreEndLabel()
00075    // We store an end label by simply storing the TREETOKEN_ENDLABEL token
00076 {
00077    globaltreecont->StoreCompressedSInt(0,TREETOKEN_ENDLABEL);
00078 
00079 #ifdef USE_FORWARD_DATAGUIDE
00080    curpathtreenode=curpathtreenode->parent;
00081 
00082 #ifdef USE_NO_DATAGUIDE
00083    pathtreemem.RemoveLastMemBlock();
00084 #endif
00085 #endif
00086 }
00087 
00088 inline void StoreEmptyEndLabel()
00089    // We store an end label by simply storing the TREETOKEN_ENDLABEL token
00090 {
00091    globaltreecont->StoreCompressedSInt(0,TREETOKEN_EMPTYENDLABEL);
00092 
00093 #ifdef USE_FORWARD_DATAGUIDE
00094    curpathtreenode=curpathtreenode->parent;
00095 
00096 #ifdef USE_NO_DATAGUIDE
00097    pathtreemem.RemoveLastMemBlock();
00098 #endif
00099 #endif
00100 }
00101 
00102 inline void StoreStartLabel(TLabelID labelid)
00103    // We store the start label by simply storing the label id and an LABELIDX_TOKENOFFS
00104    // The LABELIDX_TOKENOFFS is used since the first labels (0 and 1) are used
00105    // to denote white spaces and special strings (DOCTYPE, ...)
00106 {
00107    globaltreecont->StoreCompressedSInt(0,GET_LABELID(labelid)+LABELIDX_TOKENOFFS);
00108 
00109 #ifdef USE_FORWARD_DATAGUIDE
00110 #ifdef USE_NO_DATAGUIDE
00111    pathtreemem.StartNewMemBlock();
00112 #endif
00113 
00114    curpathtreenode=pathtree.ExtendCurPath(curpathtreenode,labelid);
00115 #endif
00116 }
00117 
00118 inline void StoreTextToken(unsigned blockid)
00119    // A text token is stored by simply storing the block ID
00120 {
00121    globaltreecont->StoreCompressedSInt(1,blockid);
00122 /*
00123 #ifdef USE_FORWARD_DATAGUIDE
00124    CurPathIterator it;
00125    TLabelID labelid;
00126    PathTreeNode *mycurnode=reversedataguide.GetRootNode();
00127 
00128    curpath.InitIterator(&it);
00129    while((labelid=it.GotoPrev())!=LABEL_UNDEFINED)
00130       mycurnode=reversedataguide.ExtendCurPath(mycurnode,labelid);
00131 #endif
00132 */
00133 }
00134 
00135 //**************************************************************************
00136 //**************************************************************************
00137 
// Forward declaration: the path-based text dispatcher is defined at the end
// of this file, but the SAX handlers below already need it
void CompressTextItem(char *str, int len, int leftwslen, int rightwslen);
00139 
00140 void SAXClient::HandleAttribName(char *str,int len,char iscont)
00141    // Handles a single attribute name
00142 {
00143    // We simply create a new attribute, if it does not already exist
00144    TLabelID labelid=globallabeldict.FindLabelOrAttrib(str,len,1);
00145 
00146    if(labelid==LABEL_UNDEFINED)
00147       labelid=globallabeldict.CreateLabelOrAttrib(str,len,1);
00148 
00149    // We add it to the current path
00150    curpath.AddLabel(labelid);
00151 
00152    // We store the label ID
00153    StoreStartLabel(labelid);
00154 }
00155 
00156 void SAXClient::HandleAttribValue(char *str,int len,char iscont)
00157    // Handles the attribute value
00158 {
00159    // We simply compress and store the text
00160    CompressTextItem(str,len,0,0);
00161 
00162    // We remove the attribute label from the path stack
00163    curpath.RemoveLabel();
00164 
00165    // We store the end label token
00166    StoreEndLabel();
00167 }
00168 
00169 void SAXClient::HandleAttribWhiteSpaces(char *str,int len,char iscont)
00170 {
00171    if(globalattribwhitespacescompress!=WHITESPACE_IGNORE)
00172    {
00173       if(len>0)
00174       {
00175          globalwhitespacecont->StoreUInt32(len);
00176          globalwhitespacecont->StoreData(str,len);
00177          globaltreecont->StoreCompressedSInt(0,TREETOKEN_ATTRIBWHITESPACE);
00178       }
00179    }
00180 }
00181 
00182 void SAXClient::HandleStartLabel(char *str,int len,char iscont)
00183    // Handles a start element tag
00184 {
00185    // Find or create the attribute
00186    TLabelID labelid=globallabeldict.FindLabelOrAttrib(str,len,0);
00187 
00188    if((len==9)&&(memcmp(str,"CARBOHYD ",9)==0))
00189       labelid=labelid;
00190 
00191    if(labelid==LABEL_UNDEFINED)
00192    {
00193       labelid=globallabeldict.CreateLabelOrAttrib(str,len,0);
00194       if(labelid==LABEL_UNDEFINED)
00195          labelid=LABEL_UNDEFINED;
00196    }
00197 
00198    // Add the label to the path
00199    curpath.AddLabel(labelid);
00200 
00201    // Store the start label in the schema container
00202    StoreStartLabel(labelid);
00203 }
00204 
00205 void SAXClient::HandleEndLabel(char *str,int len,char iscont)
00206    // Stores the end label
00207 {
00208    TLabelID labelid=curpath.RemoveLabel();
00209    TLabelID endlabelid;
00210 
00211    // Let's check that the end label doesn't have any trailing white spaces
00212    while((len>0)&&
00213          ((str[len-1]=='\n')||(str[len-1]=='\r')||(str[len-1]=='\t')||(str[len-1]==' ')))
00214    {
00215       Error("End label has trailing white spaces!");
00216       PrintErrorMsg();
00217       len--;
00218    }
00219 
00220    // Was the current path empty? I.e. we didn't have any corresponding starting label?
00221    // ==> Exit
00222    if(labelid==LABEL_UNDEFINED)
00223    {
00224       Error("Unexpected end label '");
00225       ErrorCont(str,len);
00226       ErrorCont("' !");
00227       xmlparser->XMLParseError("");
00228    }
00229 
00230    if(str==NULL)  // Did we have an empty element of the form <label/> ?
00231       StoreEmptyEndLabel();
00232    else
00233    {
00234       // Otherwise, let's check whether the end label is the same as the start label
00235       endlabelid=globallabeldict.FindLabelOrAttrib(str,len,0);
00236 
00237       if(endlabelid!=labelid) // Not the same?
00238                               // We look at the previous label in the path
00239                               // If this is not equal either, then we exit
00240       {
00241          char *ptr;
00242          unsigned long startlen=globallabeldict.LookupCompressLabel(labelid,&ptr);
00243 
00244          TLabelID prevlabelid=curpath.RemoveLabel();
00245          if(prevlabelid!=endlabelid)
00246          {
00247             Error("End label '");
00248             ErrorCont(str,len);
00249             ErrorCont("' does not match start label '");
00250             ErrorCont(ptr,startlen);
00251             ErrorCont("' !");
00252             xmlparser->XMLParseError("");
00253          }
00254 
00255          // The previous label was equal,
00256          char tmpstr[100];
00257 
00258          Error("Warning: End label '");
00259          ErrorCont(str,len);
00260          sprintf(tmpstr,"' in line %lu does not match start label '",xmlparser->GetCurLineNo());
00261          ErrorCont(tmpstr);
00262          ErrorCont(ptr,startlen);
00263          ErrorCont("'!\n => Additional end label inserted!");
00264          PrintErrorMsg();
00265 
00266          // We store one additional end tag token
00267          StoreEndLabel();
00268       }
00269       StoreEndLabel();
00270    }
00271 }
00272 
00273 void SAXClient::HandleText(char *str,int len,char iscont,int leftwslen,int rightwslen)
00274    // This function handles text
00275 {
00276    if((leftwslen==len)&&(rightwslen==len))
00277       // Is the entire text block only containing white spaces ?
00278    {
00279       // Depending on the flag for full white space text, we either
00280       // ignore, store them in a global container or treat them as text.
00281       switch(globalfullwhitespacescompress)
00282       {
00283       case WHITESPACE_IGNORE:
00284          return;
00285 
00286       case WHITESPACE_STOREGLOBAL:
00287          globaltreecont->StoreCompressedSInt(0,TREETOKEN_WHITESPACE);
00288          globalwhitespacecont->StoreUInt32(len);
00289          globalwhitespacecont->StoreData(str,len);
00290          return;
00291 
00292       case WHITESPACE_STORETEXT:
00293          CompressTextItem(str,len,0,0);
00294          return;
00295       }
00296    }
00297    // If there is some real text in the text block, then we use
00298    // the following function, which distributes the text to the
00299    // appropriate user compressor
00300    CompressTextItem(str,len,leftwslen,rightwslen);
00301 }
00302 
00303 void SAXClient::HandleComment(char *str,int len,char iscont)
00304    // Handles comment sections
00305 {
00306    if(!ignore_comment)
00307    {
00308       globaltreecont->StoreCompressedSInt(0,TREETOKEN_SPECIAL);
00309       globalspecialcont->StoreUInt32(len);
00310       globalspecialcont->StoreData(str,len);
00311    }
00312 }
00313 
00314 void SAXClient::HandlePI(char *str,int len,char iscont)
00315    // Handles processing instruction sections
00316 {
00317    if(!ignore_pi)
00318    {
00319       globaltreecont->StoreCompressedSInt(0,TREETOKEN_SPECIAL);
00320       globalspecialcont->StoreUInt32(len);
00321       globalspecialcont->StoreData(str,len);
00322    }
00323 }
00324 
00325 void SAXClient::HandleDOCTYPE(char *str,int len,char iscont)
00326    // Handles DOCTYPE sections
00327 {
00328    if(!ignore_doctype)
00329    {
00330       globaltreecont->StoreCompressedSInt(0,TREETOKEN_SPECIAL);
00331       globalspecialcont->StoreUInt32(len);
00332       globalspecialcont->StoreData(str,len);
00333    }
00334 }
00335 
00336 void SAXClient::HandleCDATA(char *str,int len,char iscont)
00337    // Handles CDATA sections
00338 {
00339    if(!ignore_cdata)
00340    {
00341       globaltreecont->StoreCompressedSInt(0,TREETOKEN_SPECIAL);
00342       globalspecialcont->StoreUInt32(len);
00343       globalspecialcont->StoreData(str,len);
00344    }
00345 }
00346 
00347 //**************************************************************************************
00348 //**************************************************************************************
00349 
00350 inline char VPathExpr::CompressTextItem(char *str,int len,PathDictNode *pathdictnode,int wsleftlen,int wsrightlen)
00351    // Attempts to compress the string (str,len) by the user compressor of
00352    // this path expression
00353    // wsleftlen and wsrightlen describe the length of left and right white spaces
00354    // at the beginning/end of the string
00355 {
00356    CompressContainerBlock  *contblock=pathdictnode->GetCompressContainerBlock();
00357    CompressContainer       *cont;
00358    char                    *dataptr;
00359 
00360    // If we haven't created container block yet, then let's do that now
00361    if(contblock==NULL)
00362    {
00363       contblock=pathdictnode->AssignCompressContainerBlock(
00364          GetUserContNum(),
00365          GetUserDataSize(),this);
00366 
00367       cont=contblock->GetContainer(0);
00368       dataptr=contblock->GetUserDataPtr();
00369 
00370       // We initialize the state of the container block
00371       // by invoking InitCompress on the user compressor
00372       InitCompress(cont,dataptr);
00373    }
00374    else
00375    {
00376       cont=contblock->GetContainer(0);
00377       dataptr=contblock->GetUserDataPtr();
00378    }
00379 
00380    if(wsleftlen>0)   // Do we have left white spaces?
00381    {
00382       // If the left white spaces should be  part of the text, then we simply
00383       // set wsleftlen=0. Otherwise, 'str' and 'len' are adjusted to exclude
00384       // those white spaces
00385       switch(leftwhitespacescompress)
00386       {
00387       case WHITESPACE_IGNORE:
00388       case WHITESPACE_STOREGLOBAL:
00389          str+=wsleftlen;
00390          len-=wsleftlen;
00391          break;
00392       case WHITESPACE_STORETEXT:
00393          wsleftlen=0;
00394       }
00395    }
00396    if(wsrightlen>0)
00397    {
00398       // If the right white spaces should be part of the text, then we simply
00399       // set wsrightlen=0. Otherwise, 'str' and 'len' are adjusted to exclude
00400       // those white spaces
00401       switch(rightwhitespacescompress)
00402       {
00403       case WHITESPACE_IGNORE:
00404       case WHITESPACE_STOREGLOBAL:
00405          len-=wsrightlen;
00406          break;
00407       case WHITESPACE_STORETEXT:
00408          wsrightlen=0;
00409       }
00410    }
00411 
00412    char *savedataptr=dataptr;
00413 
00414    // Let's try to parse and compress the string using the user compressor
00415    // If it doesn't work, then we exit
00416    if(len>0)
00417    {
00418       if(usercompressor->ParseString(str,len,dataptr)==0)
00419          return 0;
00420 
00421       usercompressor->CompressString(str,len,cont,savedataptr);
00422    }
00423 
00424    // The compression of the text was successful !
00425 
00426    // Let's globally store the left white spaces (if there are some)
00427    if((wsleftlen>0)&&(leftwhitespacescompress==WHITESPACE_STOREGLOBAL))
00428    {
00429       globaltreecont->StoreCompressedSInt(0,TREETOKEN_WHITESPACE);
00430       globalwhitespacecont->StoreUInt32(wsleftlen);
00431       globalwhitespacecont->StoreData(str-wsleftlen,wsleftlen);
00432    }
00433 
00434    // We store the text token. Note that this happens *after* storing the
00435    // left white space token, but *before* storing the right white space token!
00436 
00437    if(len>0)
00438       StoreTextToken(contblock->GetID());
00439 
00440    // Let's globally store the right white spaces (if there are some)
00441    if((wsrightlen>0)&&(rightwhitespacescompress==WHITESPACE_STOREGLOBAL))
00442    {
00443       globaltreecont->StoreCompressedSInt(0,TREETOKEN_WHITESPACE);
00444       globalwhitespacecont->StoreUInt32(wsrightlen);
00445       globalwhitespacecont->StoreData(str+len,wsrightlen);
00446    }
00447    return 1;
00448 }
00449 
00450 //**************************************************************************************
00451 
00452 #ifndef USE_FORWARD_DATAGUIDE
00453 
00454 void CompressTextItem(char *str,int len,int leftwslen,int rightwslen)
00455    // Compresses a given piece of text where 'leftwslen' and 'rightwslen'
00456    // are the number of white space on the left and right and of
00457    // the string
00458 
00459    // This function distributes the text pieces depending on the current
00460    // path.
00461 {
00462    CurPathIterator         it,saveit;
00463    FSMManStateItem         *fsmstate;
00464    TLabelID                labelid;
00465    char                    overpoundedge;
00466    FSMState                *curstate;
00467    PathDictNode            *pathdictnode;
00468 
00469    // We iterate over the current path
00470    curpath.InitIterator(&it);
00471 
00472    PathTreeNode *curpathtreenode=pathtree.GetRootNode();
00473 
00474    // We start at the root-node of the reverse data guide
00475    // and traverse the path backward as long as no accepting state
00476    // has been reached
00477    while(curpathtreenode->IsAccepting()==0)
00478    {
00479       labelid=it.GotoPrev();
00480       if(labelid==LABEL_UNDEFINED)
00481          break;
00482 
00483       curpathtreenode=pathtree.ExtendCurPath(curpathtreenode,labelid);
00484    }
00485 
00486    // After we reached an accepting state, we look at each
00487    // single regular expression and the corresponding state of the FSM
00488    // Note that each of the states already accepted the word!
00489    // Therefore, we only check whether there are additional pound-signs that
00490    // come afterwards
00491 
00492    fsmstate=curpathtreenode->GetFSMStates();
00493 
00494    // Did we reach the end of the path?
00495    if(labelid==LABEL_UNDEFINED)
00496    {
00497       // We look for an FSM whose state is final for that path
00498       while(fsmstate!=NULL)
00499       {
00500          if(fsmstate->curstate->IsFinal())
00501             // Did we find a final state => We send the text to the
00502             // corresponding path expression
00503          {
00504             if(fsmstate->pathexpr->CompressTextItem(
00505                   str,len,fsmstate->GetPathDictNode(),
00506                   leftwslen,rightwslen))
00507                return;
00508          }
00509          fsmstate=fsmstate->next;
00510       }
00511    }
00512    else
00513    {
00514       // We haven't reached the end of the path, but we found
00515       // an accepting state?
00516 
00517       // Let's save the iterator
00518       saveit=it;
00519 
00520       // Let's find the FSMs whose states are accepting
00521       while(fsmstate!=NULL)
00522       {
00523          if(fsmstate->curstate->IsAccepting()==0)
00524          {
00525             fsmstate=fsmstate->next;
00526             continue;
00527          }
00528 
00529          // For each state, go over the rest of the path and
00530          // traverse the rest of the FSM and we instantiate
00531          // the # symbols.
00532 
00533          pathdictnode=fsmstate->GetPathDictNode();
00534 
00535          curstate=fsmstate->curstate;
00536 
00537          // Let's go the starting point in the iterator
00538          saveit=it;
00539 
00540          // Let's instantiate the #'s as long as we still have
00541          // #'s ahead
00542          while(curstate->HasPoundsAhead())
00543          {
00544             labelid=it.GotoPrev();
00545             if(labelid==LABEL_UNDEFINED)  // We reached the beginning of the path?
00546                break;
00547             curstate=curstate->GetNextState(labelid,&overpoundedge);
00548 
00549             // Did we jump over a pound-edge ?
00550             // ==> We must advance the 'pathdictnode' item
00551             if(overpoundedge)
00552                pathdictnode=pathdict.FindOrCreatePath(pathdictnode,labelid);
00553          }
00554 
00555          // Let's now try to compress the text with the compressor
00556          if(fsmstate->pathexpr->CompressTextItem(str,len,pathdictnode,leftwslen,rightwslen))
00557             return;
00558       
00559          fsmstate=fsmstate->next;
00560       }
00561    }
00562    // No FSM accepts the path? ==> Something is wrong
00563    Error("Fatal error: no automaton accepts current path !\n");
00564    Exit();
00565 }
00566 
00567 //****************************************************************************************
00568 //****************************************************************************************
00569 
00570 // The forward guide implementation shouldn't be used
00571 
00572 #else // Should we use a forward data guide ?
00573 
00574 void CompressTextItem(char *str,int len,int leftwslen,int rightwslen)
00575 {
00576    FSMManStateItem         *fsmstate;
00577 
00578    // We look at each single regular expression and the
00579    // corresponding state of the FSM
00580 
00581    fsmstate=curpathtreenode->GetFSMStates();
00582 
00583    while(fsmstate!=NULL)
00584    {
00585       if(fsmstate->curstate->IsFinal())
00586       {
00587          if(fsmstate->pathexpr->CompressTextItem(
00588                str,len,fsmstate->GetPathDictNode(),
00589                leftwslen,rightwslen))
00590             return;
00591       }
00592       fsmstate=fsmstate->next;
00593    }
00594    // No FSM accepts the path? ==> Something is wrong
00595    Error("No automaton accepts string !\n");
00596    Exit();
00597 }
00598 
00599 #endif