Annotation of libwww/Library/src/SGML.html, revision 2.2

2.1       timbl       1: <HEADER>
                      2: <TITLE>/Net/dxcern/userd/timbl/hypertext/WWW/Library/Implementation/SGML.html</TITLE></HEADER>
                      3: <BODY>
                      4: <H1>SGML and Structured streams</H1>The SGML parser is a state machine.
                      5: It is called for every character
                      6: of the input stream. The DTD data
                      7: structure contains pointers
                      8: to functions which are called to
                      9: implement the actual effect of the
                     10: text read. When these functions are
                     11: called, the attribute structures
                     12: pointed to by the DTD are valid,
                     13: and the function is passed a pointer
                     14: to the current tag structure, and
                     15: an "element stack" which represents
                     16: the state of nesting within SGML
                     17: elements.<P>
                     18: The following aspects are from Dan
                     19: Connolly's suggestions:  Binary search,
                     20: Structured object scheme basically,
                     21: SGML content enum type.<P>
                     22: (c) Copyright CERN 1991 - See Copyright.html
                     23: <PRE>#ifndef SGML_H
                     24: #define SGML_H
                     25: 
                     26: #include "HTUtils.h"
                     27: #include "HTStream.h"
                     28: 
                     29: </PRE>
                     30: <H2>SGML content types</H2>
                     31: <PRE>typedef enum _SGMLContent{
                     32:   SGML_EMPTY,    /* no content */
                     33:   SGML_LITTERAL, /* literal character data. Recognises exact close tag only.
                     34:                    Old www server compatibility only! Not SGML */
                     35:   SGML_CDATA,    /* character data. recognize &lt;/ only */
                     36:   SGML_RCDATA,   /* replaceable character data. recognize &lt;/ and &amp;ref; */
                     37:   SGML_MIXED,    /* elements and parsed character data. recognize all markup */
                     38:   SGML_ELEMENT   /* any data found will be returned as an error*/
                     39:   } SGMLContent;
                     40: 
                     41: /* An attribute descriptor; at present it holds only the name. */
                     42: typedef struct {
                     43:     char *     name;           /* The (constant) name of the attribute */
                     44:                                /* Could put type info in here */
                     45: } attr;
                     46: 
                     47: 
                     48: /*             A tag structure describes an SGML element.
                     49: **             -----------------------------------------
                     50: **
                     51: **     name            is the string which comes after the tag opener "&lt;".
                     52: **
                     53: **     attributes      points to a zero-terminated array
                     54: **                     of attribute names.
                     55: **
                     56: **     number_of_attributes    is the number of possible attributes.
                     57: **
                     58: **     contents        determines how the SGML engine parses the characters
                     59: **                     within the element; see the SGMLContent values above
                     60: **                     for which markup each content type recognises.
                     61: */
                     62: typedef struct _tag HTTag;
                     63: struct _tag{
                     64:     char *     name;                   /* The name of the tag */
                     65:     attr *     attributes;             /* The list of acceptable attributes */
                     66:     int                number_of_attributes;   /* Number of possible attributes */
                     67:     SGMLContent contents;              /* Content type; see SGMLContent */            
                     68: };
                     69: 
                     70: 
                     71: 
                     72: 
                     73: /*             DTD Information
                     74: **             ---------------
                     75: **
                     76: ** Not the whole DTD, but all this parser uses of it.
                     77: */
                     78: typedef struct {
2.2     ! timbl      79:     HTTag *            tags;           /* Must be in strcmp order by name */ 
        !            80:     int                        number_of_tags;
        !            81:     CONST char **      entity_names;   /* Must be in strcmp order by name */
        !            82:     int                        number_of_entities;
2.1       timbl      83: } SGML_dtd;
                     84: 
                     85: 
                     86: /*     SGML context passed to parsers.  Opaque here: the structure
                     87: **     behind the pointer is not defined in this header. */
                     88: typedef struct _HTSGMLContext *HTSGMLContext;  /* Hidden */
                     89: 
                     90: 
                     91: /*__________________________________________________________________________
                     92: */
                     93: /*             Structured Object definition
                     94: **
                     95: **     A structured object is something which can reasonably be
                     96: **     represented in SGML.  I'll rephrase that.  A structured
                     97: **     object is an ordered tree-structured arrangement of data
                     98: **     which is representable as text.
                     99: **
                    100: **     The SGML parser outputs to a Structured object. 
                    101: **     A Structured object can output its contents
                    102: **     to another Structured Object. 
                    103: **     It's a kind of typed stream.  The architecture
                    104: **     is largely Dan Connolly's.
                    105: **     Elements and entities are passed to the structured object by
                    106: **     number, implying a knowledge of the DTD.
                    107: **     Knowledge of the SGML syntax is not here, though.
                    108: **
                    109: **     Superclass: HTStream
                    110: */
                    111: 
                    112: 
                    113: /*     The creation methods will vary on the type of Structured Object.
                    114: **     Maybe the callerData is enough info to pass along.
                    115: */
                    116: 
                    117: typedef struct _HTStructured HTStructured;
                    118: /* Table of methods for a structured object; each takes the object as "me". */
                    119: typedef struct _HTStructuredClass{
                    120: 
                    121:        char*  name;                            /* Just for diagnostics */
                    122: 
                    123:        void (*free) PARAMS((
                    124:                HTStructured*   me));
                    125: 
                    126:        void (*end_document) PARAMS((
                    127:                HTStructured*   me));
                    128:                
                    129:        void (*put_character) PARAMS((
                    130:                HTStructured*   me,
                    131:                char            ch));
                    132:                                
                    133:        void (*put_string) PARAMS((
                    134:                HTStructured*   me,
                    135:                CONST char *    str));
                    136:        /* NOTE(review): write presumably takes len bytes of str -- confirm. */
                    137:        void (*write) PARAMS((
2.2     ! timbl     138:                HTStructured*   me,
2.1       timbl     139:                CONST char *    str,
                    140:                int             len));
                    141:        /* Elements and entities are passed by number, implying knowledge of the DTD. */
                    142:        void (*start_element) PARAMS((
                    143:                HTStructured*   me,
                    144:                int             element_number,
2.2     ! timbl     145:                CONST BOOL*             attribute_present,
        !           146:                CONST char**            attribute_value));
2.1       timbl     147:        /* NOTE(review): attribute_present/attribute_value look like parallel arrays -- confirm. */
                    148:        void (*end_element) PARAMS((
                    149:                HTStructured*   me,
                    150:                int             element_number));
                    151: 
                    152:        void (*put_entity) PARAMS((
                    153:                HTStructured*   me,
                    154:                int             entity_number));
                    155:                
                    156: }HTStructuredClass;
                    157: 
                    158: 
                    159: 
                    160: /*     Create an SGML parser
                    161: **
                    162: ** On entry,
                    163: **     dtd             must point to a DTD structure as defined above
                    164: **     target          is the structured object handed to the parser;
                    165: **                     presumably it receives the parsed output (confirm).
                    166: ** On exit,
                    167: **             The default tag starter has been processed.
                    168: */
                    169: 
                    170: 
                    171: extern HTStream* SGML_new PARAMS((
                    172:        CONST SGML_dtd *                dtd,
                    173:        HTStructured *          target));
                    174: /* Stream class object for the SGML parser; defined outside this header. */
2.2     ! timbl     175: extern CONST HTStreamClass SGMLParser;
2.1       timbl     176: 
                    177: 
                    178: #endif /* SGML_H */
                    179: 
2.2     ! timbl     180: </PRE>
        !           181: <p>
2.1       timbl     182: </BODY>

Webmaster