Annotation of libwww/Library/src/SGML.html, revision 2.13

2.4       timbl       1: <HTML>
                      2: <HEAD>
                      3: <TITLE>SGML parse and stream definition for libwww</TITLE></HEAD>
2.1       timbl       4: <BODY>
                      5: <H1>SGML and Structured streams</H1>The SGML parser is a state machine.
2.11      frystyk     6: It is called for every character of the input stream. The DTD data
                      7: structure contains pointers to functions which are called to
                      8: implement the actual effect of the text read. When these functions are
                      9: called, the attribute structures pointed to by the DTD are valid,
                     10: and the function is passed a pointer to the current tag structure, and
                     11: an "element stack" which represents the state of nesting within SGML
2.13    ! frystyk    12: elements. See also <A HREF="HTStream.html">the generic Stream
        !            13: definition</A><P>
2.11      frystyk    14: 
2.1       timbl      15: The following aspects are from Dan
                     16: Connolly's suggestions:  Binary search,
                     17: Structured object scheme basically,
                     18: SGML content enum type.<P>
2.11      frystyk    19: 
                     20: The module is a part of the <A HREF="Overview">CERN Common WWW Library</A>
2.1       timbl      21: (c) Copyright CERN 1991 - See Copyright.html
2.11      frystyk    22: 
                     23: <PRE>
                     24: #ifndef SGML_H
2.1       timbl      25: #define SGML_H
                     26: 
                     27: #include "HTUtils.h"
                     28: #include "HTStream.h"
                     29: 
                     30: </PRE>
                     31: <H2>SGML content types</H2>
                     32: <PRE>typedef enum _SGMLContent{
                     33:   SGML_EMPTY,    /* no content */
2.8       timbl      34:   SGML_LITERAL, /* character data. Recognized exact close tag only.
2.1       timbl      35:                    Old www server compatibility only! Not SGML */
                     36:   SGML_CDATA,    /* character data. recognize &lt;/ only */
                     37:   SGML_RCDATA,   /* replaceable character data. recognize &lt;/ and &amp;ref; */
                     38:   SGML_MIXED,    /* elements and parsed character data. recognize all markup */
                     39:   SGML_ELEMENT   /* any data found will be returned as an error*/
                     40:   } SGMLContent;
                     41: 
                     42: 
                     43: typedef struct {
                     44:     char *     name;           /* The (constant) name of the attribute */
                     45:                                /* Could put type info in here */
                     46: } attr;
                     47: 
                     48: 
                     49: /*             A tag structure describes an SGML element.
                     50: **             -----------------------------------------
                     51: **
                     52: **
                     53: **     name            is the string which comes after the tag opener "&lt;".
                     54: **
                     55: **     attributes      points to a zero-terminated array
                     56: **                     of attribute names.
                     57: **
                     58: **     litteral        determines how the SGML engine parses the characters
                     59: **                     within the element. If set, tag openers are ignored
                     60: **                     except for that which opens a matching closing tag.
                     61: **
                     62: */
                     63: typedef struct _tag HTTag;
                     64: struct _tag{
                     65:     char *     name;                   /* The name of the tag */
                     66:     attr *     attributes;             /* The list of acceptable attributes */
                     67:     int                number_of_attributes;   /* Number of possible attributes */
                     68:     SGMLContent contents;              /* End only on end tag @@ */            
                     69: };
                     70: 
                     71: 
                     72: 
                     73: 
                     74: /*             DTD Information
                     75: **             ---------------
                     76: **
                     77: ** Not the whole DTD, but all this parser uses of it.
                     78: */
                     79: typedef struct {
2.2       timbl      80:     HTTag *            tags;           /* Must be in strcmp order by name */ 
                     81:     int                        number_of_tags;
                     82:     CONST char **      entity_names;   /* Must be in strcmp order by name */
                     83:     int                        number_of_entities;
2.1       timbl      84: } SGML_dtd;
                     85: 
2.9       frystyk    86: #define MAX_ATTRIBUTES 20      /* Max number of attributes per element */
2.1       timbl      87: 
                     88: /*     SGML context passed to parsers
                     89: */
                     90: typedef struct _HTSGMLContext *HTSGMLContext;  /* Hidden */
                     91: 
                     92: 
                     93: /*__________________________________________________________________________
                     94: */
                     95: 
2.6       timbl      96: </PRE>
                     97: <H2>Structured Object definition</H2>A structured object is something
                     98: which can reasonably be represented
                     99: in SGML.  I'll rephrase that.  A
                    100: structured object is an ordered tree-structured
                    101: arrangement of data which is representable
                    102: as text. The SGML parser outputs to
                    103: a Structured object. A Structured
                    104: object can output its contents to
                    105: another Structured Object. It's a
                    106: kind of typed stream. The architecture
                    107: is largely Dan Connolly's. Elements
                    108: and entities are passed to the sob
                    109: by number, implying a knowledge of
                    110: the DTD. Knowledge of the SGML syntax
                    111: is not here, though.<P>
                    112: Superclass: HTStream<P>
                    113: The creation methods will vary on
                    114: the type of Structured Object. Maybe
                    115: the callerData is enough info to
2.13    ! frystyk   116: pass along. <P>
        !           117: 
        !           118: <B>NOTE: </B>The <CODE>put_block</CODE> method was <CODE>write</CODE>,
        !           119: but this upset systems which had macros for <CODE>write()</CODE>. See
        !           120: <A HREF="HTStream.html">the generic stream definition</A> for valid
        !           121: return codes.<P>
        !           122: 
2.6       timbl     123: <PRE>typedef struct _HTStructured HTStructured;
2.1       timbl     124: 
                    125: typedef struct _HTStructuredClass{
                    126: 
                    127:        char*  name;                            /* Just for diagnostics */
                    128: 
2.13    ! frystyk   129:        int (*_free) PARAMS((
2.1       timbl     130:                HTStructured*   me));
                    131: 
2.13    ! frystyk   132:        int (*abort) PARAMS((
2.5       timbl     133:                HTStructured*   me,
                    134:                HTError         e));
2.1       timbl     135:                
                    136:        void (*put_character) PARAMS((
                    137:                HTStructured*   me,
                    138:                char            ch));
                    139:                                
                    140:        void (*put_string) PARAMS((
                    141:                HTStructured*   me,
                    142:                CONST char *    str));
                    143:                
2.13    ! frystyk   144:        void (*put_block) PARAMS((
2.2       timbl     145:                HTStructured*   me,
2.1       timbl     146:                CONST char *    str,
                    147:                int             len));
                    148:                
                    149:        void (*start_element) PARAMS((
                    150:                HTStructured*   me,
                    151:                int             element_number,
2.2       timbl     152:                CONST BOOL*             attribute_present,
                    153:                CONST char**            attribute_value));
2.1       timbl     154:                
                    155:        void (*end_element) PARAMS((
                    156:                HTStructured*   me,
                    157:                int             element_number));
                    158: 
                    159:        void (*put_entity) PARAMS((
                    160:                HTStructured*   me,
                    161:                int             entity_number));
                    162:                
2.13    ! frystyk   163: } HTStructuredClass;
2.1       timbl     164: 
2.6       timbl     165: </PRE>
                    166: <H2>Find a Tag by Name</H2>Returns a pointer to the tag within
                    167: the DTD.
2.7       timbl     168: <PRE>extern HTTag * SGMLFindTag PARAMS((CONST SGML_dtd* dtd, CONST char * string));
2.1       timbl     169: 
                    170: 
2.6       timbl     171: </PRE>
2.10      timbl     172: <H2>Find an Attribute by Name</H2>Returns the number of the
                    173: attribute or -1 if failure.
                    174: <PRE>extern int SGMLFindAttribute PARAMS((HTTag* tag, CONST char * string));
                    175: 
                    176: 
                    177: </PRE>
2.6       timbl     178: <H2>Create an SGML parser</H2>
                    179: <PRE>/*
2.1       timbl     180: ** On entry,
                    181: **     dtd             must point to a DTD structure as defined above
                    182: **     callbacks       must point to user routines.
                    183: **     callData        is returned in callbacks transparently.
                    184: ** On exit,
                    185: **             The default tag starter has been processed.
                    186: */
                    187: 
                    188: 
                    189: extern HTStream* SGML_new PARAMS((
                    190:        CONST SGML_dtd *                dtd,
                    191:        HTStructured *          target));
                    192: 
2.2       timbl     193: extern CONST HTStreamClass SGMLParser;
2.1       timbl     194: 
                    195: 
                    196: #endif /* SGML_H */
2.7       timbl     197: 
2.1       timbl     198: 
2.3       timbl     199: 
                    200: 
2.4       timbl     201: 
2.6       timbl     202: 
                    203: 
                    204: 
                    205: 
2.8       timbl     206: 
                    207: </PRE></BODY>
2.4       timbl     208: </HTML>

Webmaster