Annotation of libwww/Library/src/SGML.html, revision 2.4

2.4     ! timbl       1: <HTML>
        !             2: <HEAD>
        !             3: <TITLE>SGML parse and stream definition for libwww</TITLE></HEAD>
2.1       timbl       4: <BODY>
                      5: <H1>SGML and Structured streams</H1>The SGML parser is a state machine.
                      6: It is called for every character
                      7: of the input stream. The DTD data
                      8: structure contains pointers
                      9: to functions which are called to
                     10: implement the actual effect of the
                     11: text read. When these functions are
                     12: called, the attribute structures
                     13: pointed to by the DTD are valid,
                     14: and the function is passed a pointer
                     15: to the current tag structure, and
                     16: an "element stack" which represents
                     17: the state of nesting within SGML
                     18: elements.<P>
                     19: The following aspects are from Dan
                     20: Connolly's suggestions:  Binary search,
                     21: Structured object scheme basically,
                     22: SGML content enum type.<P>
                     23: (c) Copyright CERN 1991 - See Copyright.html
                     24: <PRE>#ifndef SGML_H
                     25: #define SGML_H
                     26: 
                     27: #include "HTUtils.h"
                     28: #include "HTStream.h"
                     29: 
                     30: </PRE>
                     31: <H2>SGML content types</H2>
                     32: <PRE>typedef enum _SGMLContent{
                     33:   SGML_EMPTY,    /* no content */
                     34:   SGML_LITTERAL, /* character data. Recognised exact close tag only. litteral
                     35:                    Old www server compatibility only! Not SGML */
                     36:   SGML_CDATA,    /* character data. recognize &lt;/ only */
                     37:   SGML_RCDATA,   /* replaceable character data. recognize &lt;/ and &amp;ref; */
                     38:   SGML_MIXED,    /* elements and parsed character data. recognize all markup */
                     39:   SGML_ELEMENT   /* any data found will be returned as an error*/
                     40:   } SGMLContent;
                     41: 
                     42: 
                     43: typedef struct {
                     44:     char *     name;           /* The (constant) name of the attribute */
                     45:                                /* Could put type info in here */
                     46: } attr;
                     47: 
                     48: 
                     49: /*             A tag structure describes an SGML element.
                     50: **             -----------------------------------------
                     51: **
                     52: **
                     53: **     name            is the string which comes after the tag opener "&lt;".
                     54: **
                     55: **     attributes      points to a zero-terminated array
                     56: **                     of attribute names.
                     57: **
                     58: **     litteral        determines how the SGML engine parses the characters
                     59: **                     within the element. If set, tag openers are ignored
                     60: **                     except for that which opens a matching closing tag.
                     61: **
                     62: */
                     63: typedef struct _tag HTTag;
                     64: struct _tag{
                     65:     char *     name;                   /* The name of the tag */
                     66:     attr *     attributes;             /* The list of acceptable attributes */
                     67:     int                number_of_attributes;   /* Number of possible attributes */
                     68:     SGMLContent contents;              /* End only on end tag @@ */            
                     69: };
                     70: 
                     71: 
                     72: 
                     73: 
                     74: /*             DTD Information
                     75: **             ---------------
                     76: **
                     77: ** Not the whole DTD, but all this parser uses of it.
                     78: */
                     79: typedef struct {
2.2       timbl      80:     HTTag *            tags;           /* Must be in strcmp order by name */ 
                     81:     int                        number_of_tags;
                     82:     CONST char **      entity_names;   /* Must be in strcmp order by name */
                     83:     int                        number_of_entities;
2.1       timbl      84: } SGML_dtd;
                     85: 
                     86: 
                     87: /*     SGML context passed to parsers
                     88: */
                     89: typedef struct _HTSGMLContext *HTSGMLContext;  /* Hidden */
                     90: 
                     91: 
                     92: /*__________________________________________________________________________
                     93: */
                     94: /*             Structured Object definition
                     95: **
                     96: **     A structured object is something which can reasonably be
                     97: **     represented in SGML.  I'll rephrase that.  A structured
                     98: **     object is an ordered tree-structured arrangement of data
                     99: **     which is representable as text.
                    100: **
                    101: **     The SGML parser outputs to a Structured object. 
                    102: **     A Structured object can output its contents
                    103: **     to another Structured Object. 
                    104: **     It's a kind of typed stream.  The architecture
                    105: **     is largely Dan Connolly's.
                    106: **     Elements and entities are passed to the sob by number, implying
                    107: **     a knowledge of the DTD.
                    108: **     Knowledge of the SGML syntax is not here, though.
                    109: **
                    110: **     Superclass: HTStream
                    111: */
                    112: 
                    113: 
                    114: /*     The creation methods will vary on the type of Structured Object.
                    115: **     Maybe the callerData is enough info to pass along.
                    116: */
                    117: 
                    118: typedef struct _HTStructured HTStructured;
                    119: 
                    120: typedef struct _HTStructuredClass{
                    121: 
                    122:        char*  name;                            /* Just for diagnostics */
                    123: 
                    124:        void (*free) PARAMS((
                    125:                HTStructured*   me));
                    126: 
2.4     ! timbl     127:        void (*abort) PARAMS((
2.1       timbl     128:                HTStructured*   me));
                    129:                
                    130:        void (*put_character) PARAMS((
                    131:                HTStructured*   me,
                    132:                char            ch));
                    133:                                
                    134:        void (*put_string) PARAMS((
                    135:                HTStructured*   me,
                    136:                CONST char *    str));
                    137:                
                    138:        void (*write) PARAMS((
2.2       timbl     139:                HTStructured*   me,
2.1       timbl     140:                CONST char *    str,
                    141:                int             len));
                    142:                
                    143:        void (*start_element) PARAMS((
                    144:                HTStructured*   me,
                    145:                int             element_number,
2.2       timbl     146:                CONST BOOL*             attribute_present,
                    147:                CONST char**            attribute_value));
2.1       timbl     148:                
                    149:        void (*end_element) PARAMS((
                    150:                HTStructured*   me,
                    151:                int             element_number));
                    152: 
                    153:        void (*put_entity) PARAMS((
                    154:                HTStructured*   me,
                    155:                int             entity_number));
                    156:                
                    157: }HTStructuredClass;
                    158: 
                    159: 
                    160: 
                    161: /*     Create an SGML parser
                    162: **
                    163: ** On entry,
                    164: **     dtd             must point to a DTD structure as defined above
                    165: **     target          must point to the Structured object which
                    166: **                     receives the parser's output.
                    167: ** On exit,
                    168: **             The default tag starter has been processed.
                    169: */
                    170: 
                    171: 
                    172: extern HTStream* SGML_new PARAMS((
                    173:        CONST SGML_dtd *                dtd,
                    174:        HTStructured *          target));
                    175: 
2.2       timbl     176: extern CONST HTStreamClass SGMLParser;
2.1       timbl     177: 
                    178: 
                    179: #endif /* SGML_H */
                    180: 
2.3       timbl     181: 
                    182: 
2.4     ! timbl     183: 
        !           184: </BODY>
        !           185: </HTML>

Webmaster