Annotation of libwww/Library/src/SGML.html, revision 2.9

2.4       timbl       1: <HTML>
                      2: <HEAD>
                      3: <TITLE>SGML parse and stream definition for libwww</TITLE></HEAD>
2.1       timbl       4: <BODY>
                      5: <H1>SGML and Structured streams</H1>The SGML parser is a state machine.
                      6: It is called for every character
                      7: of the input stream. The DTD data
                      8: structure contains pointers
                      9: to functions which are called to
                     10: implement the actual effect of the
                     11: text read. When these functions are
                     12: called, the attribute structures
                     13: pointed to by the DTD are valid,
                     14: and the function is passed a pointer
                     15: to the current tag structure, and
                     16: an "element stack" which represents
                     17: the state of nesting within SGML
                     18: elements.<P>
                     19: The following aspects are from Dan
                     20: Connolly's suggestions:  Binary search,
                     21: Structured object scheme basically,
                     22: SGML content enum type.<P>
                     23: (c) Copyright CERN 1991 - See Copyright.html
                     24: <PRE>#ifndef SGML_H
                     25: #define SGML_H
                     26: 
                     27: #include "HTUtils.h"
                     28: #include "HTStream.h"
                     29: 
                     30: </PRE>
                     31: <H2>SGML content types</H2>
                     32: <PRE>typedef enum _SGMLContent{
                     33:   SGML_EMPTY,    /* no content */
2.8       timbl      34:   SGML_LITERAL, /* character data. Recognizes the exact close tag only.
2.1       timbl      35:                    For old WWW server compatibility only! Not SGML */
                     36:   SGML_CDATA,    /* character data. Recognizes &lt;/ only */
                     37:   SGML_RCDATA,   /* replaceable character data. Recognizes &lt;/ and &amp;ref; */
                     38:   SGML_MIXED,    /* elements and parsed character data. Recognizes all markup */
                     39:   SGML_ELEMENT   /* any data found will be returned as an error */
                     40:   } SGMLContent;
                     41: 
                     42: 
                     43: typedef struct {
                     44:     char *     name;           /* The (constant) name of the attribute */
                     45:                                /* Type information could be added here */
                     46: } attr;
                     47: 
                     48: 
                     49: /*             A tag structure describes an SGML element.
                     50: **             -----------------------------------------
                     51: **
                     52: **
                     53: **     name            is the string which comes after the tag opener "&lt;".
                     54: **
                     55: **     attributes      points to a zero-terminated array
                     56: **                     of attribute names.
                     57: **
                     58: **     contents        determines how the SGML engine parses the characters
                     59: **                     within the element. For SGML_LITERAL, tag openers are
                     60: **                     ignored except for one opening a matching closing tag.
                     61: **
                     62: */
                     63: typedef struct _tag HTTag;
                     64: struct _tag{
                     65:     char *     name;                   /* The name of the tag */
                     66:     attr *     attributes;             /* The list of acceptable attributes */
                     67:     int                number_of_attributes;   /* Number of possible attributes */
                     68:     SGMLContent contents;              /* Content type: how content is parsed @@ */            
                     69: };
                     70: 
                     71: 
                     72: 
                     73: 
                     74: /*             DTD Information
                     75: **             ---------------
                     76: **
                     77: ** Not the whole DTD, but all this parser uses of it.
                     78: */
                     79: typedef struct {
2.2       timbl      80:     HTTag *            tags;           /* Must be in strcmp order by name */ 
                     81:     int                        number_of_tags;
                     82:     CONST char **      entity_names;   /* Must be in strcmp order by name */
                     83:     int                        number_of_entities;
2.1       timbl      84: } SGML_dtd;
                     85: 
2.9     ! frystyk    86: #define MAX_ATTRIBUTES 20      /* Max number of attributes per element */
2.1       timbl      87: 
                     88: /*     SGML context passed to parsers
                     89: */
                     90: typedef struct _HTSGMLContext *HTSGMLContext;  /* Hidden: struct is not defined in this header */
                     91: 
                     92: 
                     93: /*__________________________________________________________________________
                     94: */
                     95: 
2.6       timbl      96: </PRE>
                     97: <H2>Structured Object definition</H2>A structured object is something
                     98: which can reasonably be represented
                     99: in SGML.  I'll rephrase that.  A
                    100: structured object is an ordered tree-structured
                    101: arrangement of data which is representable
                    102: as text. The SGML parser outputs to
                    103: a Structured object. A Structured
                    104: object can output its contents to
                    105: another Structured Object. It's a
                    106: kind of typed stream. The architecture
                    107: is largely Dan Connolly's. Elements
                    108: and entities are passed to the sob
                    109: by number, implying a knowledge of
                    110: the DTD. Knowledge of the SGML syntax
                    111: is not here, though.<P>
                    112: Superclass: HTStream<P>
                    113: The creation methods will vary on
                    114: the type of Structured Object. Maybe
                    115: the callerData is enough info to
                    116: pass along.
                    117: <PRE>typedef struct _HTStructured HTStructured;
2.1       timbl     118: 
                    119: typedef struct _HTStructuredClass{      /* Method table for a Structured object */
                    120: 
                    121:        char*  name;                            /* Just for diagnostics */
                    122: 
                    123:        void (*free) PARAMS((
                    124:                HTStructured*   me));
                    125: 
2.4       timbl     126:        void (*abort) PARAMS((
2.5       timbl     127:                HTStructured*   me,
                    128:                HTError         e));
2.1       timbl     129:                
                    130:        void (*put_character) PARAMS((
                    131:                HTStructured*   me,
                    132:                char            ch));
                    133:                                
                    134:        void (*put_string) PARAMS((
                    135:                HTStructured*   me,
                    136:                CONST char *    str));
                    137:                
                    138:        void (*write) PARAMS((
2.2       timbl     139:                HTStructured*   me,
2.1       timbl     140:                CONST char *    str,
                    141:                int             len));
                    142:                
                    143:        void (*start_element) PARAMS((
                    144:                HTStructured*   me,
                    145:                int             element_number, /* elements are passed by number (implies knowledge of the DTD) */
2.2       timbl     146:                CONST BOOL*             attribute_present, /* NOTE(review): presumably indexed by attribute number -- confirm */
                    147:                CONST char**            attribute_value));
2.1       timbl     148:                
                    149:        void (*end_element) PARAMS((
                    150:                HTStructured*   me,
                    151:                int             element_number));
                    152: 
                    153:        void (*put_entity) PARAMS((
                    154:                HTStructured*   me,
                    155:                int             entity_number)); /* entities are passed by number (implies knowledge of the DTD) */
                    156:                
                    157: }HTStructuredClass;
                    158: 
2.6       timbl     159: </PRE>
                    160: <H2>Find a Tag by Name</H2>Returns a pointer to the tag within
                    161: the DTD.
2.7       timbl     162: <PRE>extern HTTag * SGMLFindTag PARAMS((CONST SGML_dtd* dtd, CONST char * string)); /* Returns a pointer to the tag within dtd */
2.1       timbl     163: 
                    164: 
2.6       timbl     165: </PRE>
                    166: <H2>Create an SGML parser</H2>
                    167: <PRE>/*
2.1       timbl     168: ** On entry,
                    169: **     dtd             must point to a DTD structure as defined above
                    170: **     target          must point to the Structured object the parser outputs to.
                    171: **                     NOTE(review): this comment used to list callbacks/callData; SGML_new takes neither.
                    172: ** On exit,
                    173: **             The default tag starter has been processed.
                    174: */
                    175: 
                    176: 
                    177: extern HTStream* SGML_new PARAMS((
                    178:        CONST SGML_dtd *                dtd,
                    179:        HTStructured *          target));
                    180: 
2.2       timbl     181: extern CONST HTStreamClass SGMLParser;
2.1       timbl     182: 
                    183: 
                    184: #endif /* SGML_H */
2.7       timbl     185: 
2.1       timbl     186: 
2.3       timbl     187: 
                    188: 
2.4       timbl     189: 
2.6       timbl     190: 
                    191: 
                    192: 
                    193: 
2.8       timbl     194: 
                    195: </PRE></BODY>
2.4       timbl     196: </HTML>

Webmaster