Annotation of libwww/Library/src/SGML.html, revision 2.11

2.4       timbl       1: <HTML>
                      2: <HEAD>
                      3: <TITLE>SGML parse and stream definition for libwww</TITLE></HEAD>
2.1       timbl       4: <BODY>
                      5: <H1>SGML and Structured streams</H1>The SGML parser is a state machine.
2.11    ! frystyk     6: It is called for every character of the input stream. The DTD data
        !             7: structure contains pointers to functions which are called to
        !             8: implement the actual effect of the text read. When these functions are
        !             9: called, the attribute structures pointed to by the DTD are valid,
        !            10: and the function is passed a pointer to the current tag structure, and
        !            11: an "element stack" which represents the state of nesting within SGML
2.1       timbl      12: elements.<P>
2.11    ! frystyk    13: 
2.1       timbl      14: The following aspects are from Dan
                     15: Connolly's suggestions:  Binary search,
                     16: Structured object scheme basically,
                     17: SGML content enum type.<P>
2.11    ! frystyk    18: 
        !            19: The module is a part of the <A HREF="Overview">CERN Common WWW Library</A>
2.1       timbl      20: (c) Copyright CERN 1991 - See Copyright.html
2.11    ! frystyk    21: 
        !            22: <PRE>
        !            23: #ifndef SGML_H
2.1       timbl      24: #define SGML_H
                     25: 
                     26: #include "HTUtils.h"
                     27: #include "HTStream.h"
                     28: 
                     29: </PRE>
                     30: <H2>SGML content types</H2>
                     31: <PRE>typedef enum _SGMLContent{
                     32:   SGML_EMPTY,    /* no content */
2.8       timbl      33:   SGML_LITERAL, /* character data. Recognized exact close tag only.
2.1       timbl      34:                    Old www server compatibility only! Not SGML */
                     35:   SGML_CDATA,    /* character data. recognize &lt;/ only */
                     36:   SGML_RCDATA,   /* replaceable character data. recognize &lt;/ and &amp;ref; */
                     37:   SGML_MIXED,    /* elements and parsed character data. recognize all markup */
                     38:   SGML_ELEMENT   /* any data found will be returned as an error*/
                     39:   } SGMLContent;
                     40: 
                     41: 
                     42: typedef struct {
                     43:     char *     name;           /* The (constant) name of the attribute */
                     44:                                /* Could put type info in here */
                     45: } attr;
                     46: 
                     47: 
                     48: /*             A tag structure describes an SGML element.
                     49: **             -----------------------------------------
                     50: **
                     51: **
                     52: **     name            is the string which comes after the tag opener "&lt;".
                     53: **
                     54: **     attributes      points to a zero-terminated array
                     55: **                     of attribute names.
                     56: **
                     57: **     contents        determines how the SGML engine parses the characters
                     58: **                     within the element. For SGML_LITERAL, tag openers are
                     59: **                     ignored except for that which opens a matching closing tag.
                     60: **
                     61: */
                     62: typedef struct _tag HTTag;
                     63: struct _tag{
                     64:     char *     name;                   /* The name of the tag */
                     65:     attr *     attributes;             /* The list of acceptable attributes */
                     66:     int                number_of_attributes;   /* Number of possible attributes */
                     67:     SGMLContent contents;              /* End only on end tag @@ */            
                     68: };
                     69: 
                     70: 
                     71: 
                     72: 
                     73: /*             DTD Information
                     74: **             ---------------
                     75: **
                     76: ** Not the whole DTD, but all this parser uses of it.
                     77: */
                     78: typedef struct {
2.2       timbl      79:     HTTag *            tags;           /* Must be in strcmp order by name */ 
                     80:     int                        number_of_tags;
                     81:     CONST char **      entity_names;   /* Must be in strcmp order by name */
                     82:     int                        number_of_entities;
2.1       timbl      83: } SGML_dtd;
                     84: 
2.9       frystyk    85: #define MAX_ATTRIBUTES 20      /* Max number of attributes per element */
2.1       timbl      86: 
                     87: /*     SGML context passed to parsers
                     88: */
                     89: typedef struct _HTSGMLContext *HTSGMLContext;  /* Hidden */
                     90: 
                     91: 
                     92: /*__________________________________________________________________________
                     93: */
                     94: 
2.6       timbl      95: </PRE>
                     96: <H2>Structured Object definition</H2>A structured object is something
                     97: which can reasonably be represented
                     98: in SGML.  I'll rephrase that.  A
                     99: structured object is an ordered tree-structured
                    100: arrangement of data which is representable
                    101: as text. The SGML parser outputs to
                    102: a Structured object. A Structured
                    103: object can output its contents to
                    104: another Structured Object. It's a
                    105: kind of typed stream. The architecture
                    106: is largely Dan Connolly's. Elements
                    107: and entities are passed to the sob
                    108: by number, implying a knowledge of
                    109: the DTD. Knowledge of the SGML syntax
                    110: is not here, though.<P>
                    111: Superclass: HTStream<P>
                    112: The creation methods will vary on
                    113: the type of Structured Object. Maybe
                    114: the callerData is enough info to
                    115: pass along.
                    116: <PRE>typedef struct _HTStructured HTStructured;
2.1       timbl     117: 
                    118: typedef struct _HTStructuredClass{
                    119: 
                    120:        char*  name;                            /* Just for diagnostics */
                    121: 
                    122:        void (*free) PARAMS((
                    123:                HTStructured*   me));
                    124: 
2.4       timbl     125:        void (*abort) PARAMS((
2.5       timbl     126:                HTStructured*   me,
                    127:                HTError         e));
2.1       timbl     128:                
                    129:        void (*put_character) PARAMS((
                    130:                HTStructured*   me,
                    131:                char            ch));
                    132:                                
                    133:        void (*put_string) PARAMS((
                    134:                HTStructured*   me,
                    135:                CONST char *    str));
                    136:                
                    137:        void (*write) PARAMS((
2.2       timbl     138:                HTStructured*   me,
2.1       timbl     139:                CONST char *    str,
                    140:                int             len));
                    141:                
                    142:        void (*start_element) PARAMS((
                    143:                HTStructured*   me,
                    144:                int             element_number,
2.2       timbl     145:                CONST BOOL*             attribute_present,
                    146:                CONST char**            attribute_value));
2.1       timbl     147:                
                    148:        void (*end_element) PARAMS((
                    149:                HTStructured*   me,
                    150:                int             element_number));
                    151: 
                    152:        void (*put_entity) PARAMS((
                    153:                HTStructured*   me,
                    154:                int             entity_number));
                    155:                
                    156: }HTStructuredClass;
                    157: 
2.6       timbl     158: </PRE>
                    159: <H2>Find a Tag by Name</H2>Returns a pointer to the tag within
                    160: the DTD.
2.7       timbl     161: <PRE>extern HTTag * SGMLFindTag PARAMS((CONST SGML_dtd* dtd, CONST char * string));
2.1       timbl     162: 
                    163: 
2.6       timbl     164: </PRE>
2.10      timbl     165: <H2>Find an Attribute by Name</H2>Returns the number of the
                    166: attribute or -1 on failure.
                    167: <PRE>extern int SGMLFindAttribute PARAMS((HTTag* tag, CONST char * string));
                    168: 
                    169: 
                    170: </PRE>
2.6       timbl     171: <H2>Create an SGML parser</H2>
                    172: <PRE>/*
2.1       timbl     173: ** On entry,
                    174: **     dtd             must point to a DTD structure as defined above
                    175: **     target          must point to the structured object that
                    176: **                     receives the parser output.
                    177: ** On exit,
                    178: **             The default tag starter has been processed.
                    179: */
                    180: 
                    181: 
                    182: extern HTStream* SGML_new PARAMS((
                    183:        CONST SGML_dtd *                dtd,
                    184:        HTStructured *          target));
                    185: 
2.2       timbl     186: extern CONST HTStreamClass SGMLParser;
2.1       timbl     187: 
                    188: 
                    189: #endif /* SGML_H */
2.7       timbl     190: 
2.1       timbl     191: 
2.3       timbl     192: 
                    193: 
2.4       timbl     194: 
2.6       timbl     195: 
                    196: 
                    197: 
                    198: 
2.8       timbl     199: 
                    200: </PRE></BODY>
2.4       timbl     201: </HTML>

Webmaster