Annotation of libwww/Library/src/SGML.html, revision 2.15

2.4       timbl       1: <HTML>
                      2: <HEAD>
2.14      frystyk     3: <TITLE>SGML parse and stream definition for libwww</TITLE>
                      4: </HEAD>
2.1       timbl       5: <BODY>
2.11      frystyk     6: 
2.14      frystyk     7: <H1>SGML and Structured streams</H1>
2.11      frystyk     8: 
2.14      frystyk     9: <PRE>
                     10: /*
                     11: **     (c) COPYRIGHT CERN 1994.
                     12: **     Please first read the full copyright statement in the file COPYRIGH.
                     13: */
                     14: </PRE>
                     15: 
                     16: The SGML parser is a state machine.  It is called for every character
                     17: of the input stream. The DTD data structure contains pointers to
                     18: functions which are called to implement the actual effect of the text
                     19: read. When these functions are called, the attribute structures
                     20: pointed to by the DTD are valid, and the function is passed a pointer
                     21: to the current tag structure, and an "element stack" which represents
                     22: the state of nesting within SGML elements. See also <A
                     23: HREF="HTStream.html">the generic Stream definition</A>.<P>
                     24: 
                     25: The following aspects are from Dan Connolly's suggestions: Binary
                     26: search, Structured object scheme basically, SGML content enum type.<P>
                     27: 
                     28: This module is implemented by <A HREF="SGML.c">SGML.c</A>, and it is a
                     29: part of the <A
                     30: HREF="http://info.cern.ch/hypertext/WWW/Library/User/Guide/Guide.html">
                     31: Library of Common Code</A>.
2.11      frystyk    32: 
                     33: <PRE>
                     34: #ifndef SGML_H
2.1       timbl      35: #define SGML_H
                     36: 
2.15    ! roeber     37: #include "sysdep.h"
2.1       timbl      38: #include "HTUtils.h"
                     39: #include "HTStream.h"
                     40: 
                     41: </PRE>
                     42: <H2>SGML content types</H2>
                     43: <PRE>typedef enum _SGMLContent{
                     44:   SGML_EMPTY,    /* No content */
2.8       timbl      45:   SGML_LITERAL, /* Character data. Recognizes the exact close tag only.
2.1       timbl      46:                    Old www server compatibility only! Not SGML */
                     47:   SGML_CDATA,    /* Character data. Recognizes &lt;/ only */
                     48:   SGML_RCDATA,   /* Replaceable character data. Recognizes &lt;/ and &amp;ref; */
                     49:   SGML_MIXED,    /* Elements and parsed character data. Recognizes all markup */
                     50:   SGML_ELEMENT   /* Any data found will be returned as an error */
                     51:   } SGMLContent;
                     52: 
                     53: 
                     54: typedef struct {
                     55:     char *     name;           /* The (constant) name of the attribute */
                     56:                                /* Type info could be added here */
                     57: } attr;
                     58: 
                     59: 
                     60: /*             A tag structure describes an SGML element.
                     61: **             -----------------------------------------
                     62: **
                     63: **
                     64: **     name            is the string which comes after the tag opener "&lt;".
                     65: **
                     66: **     attributes      points to a zero-terminated array
                     67: **                     of attribute names.
                     68: **
                     69: **     contents        determines how the SGML engine parses the characters
                     70: **                     within the element. If SGML_LITERAL, tag openers are
                     71: **                     ignored except for one opening a matching closing tag.
                     72: **
                     73: */
                     74: typedef struct _tag HTTag;
                     75: struct _tag{
                     76:     char *     name;                   /* The name of the tag */
                     77:     attr *     attributes;             /* The list of acceptable attributes */
                     78:     int                number_of_attributes;   /* Number of possible attributes */
                     79:     SGMLContent contents;              /* Content type; see SGMLContent */            
                     80: };
                     81: 
                     82: 
                     83: 
                     84: 
                     85: /*             DTD Information
                     86: **             ---------------
                     87: **
                     88: ** Not the whole DTD, but all that this parser uses of it.
                     89: */
                     90: typedef struct {
2.2       timbl      91:     HTTag *            tags;           /* Must be in strcmp order by name */ 
                     92:     int                        number_of_tags;
                     93:     CONST char **      entity_names;   /* Must be in strcmp order by name */
                     94:     int                        number_of_entities;
2.1       timbl      95: } SGML_dtd;
                     96: 
2.9       frystyk    97: #define MAX_ATTRIBUTES 20      /* Max number of attributes per element */
2.1       timbl      98: 
                     99: /*     SGML context passed to parsers
                    100: */
                    101: typedef struct _HTSGMLContext *HTSGMLContext;  /* Hidden */
                    102: 
                    103: 
                    104: /*__________________________________________________________________________
                    105: */
                    106: 
2.6       timbl     107: </PRE>
                    108: <H2>Structured Object definition</H2>A structured object is something
                    109: which can reasonably be represented
                    110: in SGML.  I'll rephrase that.  A
                    111: structured object is an ordered tree-structured
                    112: arrangement of data which is representable
                    113: as text. The SGML parser outputs to
                    114: a Structured object. A Structured
                    115: object can output its contents to
                    116: another Structured Object. It's a
                    117: kind of typed stream. The architecture
                    118: is largely Dan Connolly's. Elements
                    119: and entities are passed to the structured object
                    120: by number, implying a knowledge of
                    121: the DTD. Knowledge of the SGML syntax
                    122: is not here, though.<P>
                    123: Superclass: HTStream<P>
                    124: The creation methods will vary on
                    125: the type of Structured Object. Maybe
                    126: the callerData is enough info to
2.13      frystyk   127: pass along. <P>
                    128: 
                    129: <B>NOTE: </B>The <CODE>put_block</CODE> method was <CODE>write</CODE>,
                    130: but this upset systems which had macros for <CODE>write()</CODE>. See
                    131: <A HREF="HTStream.html">the generic stream definition</A> for valid
                    132: return codes.<P>
                    133: 
2.6       timbl     134: <PRE>typedef struct _HTStructured HTStructured;
2.1       timbl     135: 
                    136: typedef struct _HTStructuredClass{     /* Methods of a Structured Object class */
                    137: 
                    138:        char*  name;                            /* Class name, just for diagnostics */
                    139: 
2.13      frystyk   140:        int (*_free) PARAMS((
2.1       timbl     141:                HTStructured*   me));
                    142: 
2.13      frystyk   143:        int (*abort) PARAMS((
2.5       timbl     144:                HTStructured*   me,
                    145:                HTError         e));
2.1       timbl     146:                
                    147:        void (*put_character) PARAMS((
                    148:                HTStructured*   me,
                    149:                char            ch));
                    150:                                
                    151:        void (*put_string) PARAMS((
                    152:                HTStructured*   me,
                    153:                CONST char *    str));
                    154:                
2.13      frystyk   155:        void (*put_block) PARAMS((
2.2       timbl     156:                HTStructured*   me,
2.1       timbl     157:                CONST char *    str,
                    158:                int             len));
                    159:                
                    160:        void (*start_element) PARAMS((
                    161:                HTStructured*   me,
                    162:                int             element_number,
2.2       timbl     163:                CONST BOOL*             attribute_present,
                    164:                CONST char**            attribute_value));
2.1       timbl     165:                
                    166:        void (*end_element) PARAMS((
                    167:                HTStructured*   me,
                    168:                int             element_number));
                    169: 
                    170:        void (*put_entity) PARAMS((
                    171:                HTStructured*   me,
                    172:                int             entity_number));
                    173:                
2.13      frystyk   174: } HTStructuredClass;
2.1       timbl     175: 
2.6       timbl     176: </PRE>
                    177: <H2>Find a Tag by Name</H2>Returns a pointer to the tag within
                    178: the DTD.
2.7       timbl     179: <PRE>extern HTTag * SGMLFindTag PARAMS((CONST SGML_dtd* dtd, CONST char * string));
2.1       timbl     180: 
                    181: 
2.6       timbl     182: </PRE>
2.10      timbl     183: <H2>Find an Attribute by Name</H2>Returns the number of the
                    184: attribute, or -1 on failure.
                    185: <PRE>extern int SGMLFindAttribute PARAMS((HTTag* tag, CONST char * string));
                    186: 
                    187: 
                    188: </PRE>
2.6       timbl     189: <H2>Create an SGML parser</H2>
                    190: <PRE>/*
2.1       timbl     191: ** On entry,
                    192: **     dtd             must point to a DTD structure as defined above
                    193: **     target          must point to the Structured Object which receives
                    194: **                     the parser's output.
                    195: ** On exit,
                    196: **             The default tag starter has been processed.
                    197: */
                    198: 
                    199: 
                    200: extern HTStream* SGML_new PARAMS((
                    201:        CONST SGML_dtd *                dtd,
                    202:        HTStructured *          target));
                    203: 
2.2       timbl     204: extern CONST HTStreamClass SGMLParser;
2.1       timbl     205: 
                    206: 
                    207: #endif /* SGML_H */
2.7       timbl     208: 
2.1       timbl     209: 
2.3       timbl     210: 
                    211: 
2.4       timbl     212: 
2.6       timbl     213: 
                    214: 
                    215: 
                    216: 
2.8       timbl     217: 
                    218: </PRE></BODY>
2.4       timbl     219: </HTML>

Webmaster