Annotation of libwww/Library/src/SGML.c, revision 1.7
/*	General SGML Parser code		SGML.c
**	========================
**
**	This module implements an HTStream object. To parse an
**	SGML file, create this object, which is a parser. The object
**	is (currently) created by being passed a DTD structure
**	and a target HTStructured object at which to throw the parsed stuff.
**
**	 6 Feb 93	Binary searches used. Interface modified.
*/
11: #include "SGML.h"
12:
13: #include <ctype.h>
14: #include <stdio.h>
15: #include "HTUtils.h"
16: #include "HTChunk.h"
17: #include "tcp.h" /* For FROMASCII */
18:
1.2 timbl 19: #define INVALID (-1) /* Stored in current_attribute_number when no known attribute is waiting for a value */
20:
1.1 timbl 21: /* The State (context) of the parser
22: **
1.2 timbl 23: ** This is passed with each call to make the parser reentrant
1.1 timbl 24: **
25: */
26:
1.2 timbl 27: #define MAX_ATTRIBUTES 20 /* Max number of attributes per element; sizes the present[] and value[] arrays of the parser context */
28:
29:
30: /* Element Stack
31: ** -------------
32: ** This allows us to return down the stack reselcting styles.
33: ** As we return, attribute values will be garbage in general.
34: */
35: typedef struct _HTElement HTElement;
36: struct _HTElement {
37: HTElement * next; /* Previously nested element or 0 */
38: HTTag* tag; /* The tag at this level */
39: };
40:
41:
42: /* Internal Context Data Structure
43: ** -------------------------------
44: */
45: struct _HTStream {
46:
47: CONST HTStreamClass * isa; /* inherited from HTStream */
48:
49: CONST SGML_dtd *dtd;
50: HTStructuredClass *actions; /* target class */
51: HTStructured *target; /* target object */
52:
1.1 timbl 53: HTTag *current_tag;
1.2 timbl 54: int current_attribute_number;
1.1 timbl 55: HTChunk *string;
56: HTElement *element_stack;
57: enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
58: S_attr, S_attr_gap, S_equals, S_value,
59: S_ero, S_cro,
60: S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
1.2 timbl 61: #ifdef CALLERDATA
1.1 timbl 62: void * callerData;
1.2 timbl 63: #endif
64: BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
65: char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
66: } ;
67:
68:
69: #define PUTC(ch) ((*context->actions->put_character)(context->target, ch)) /* Pass one character to the target; expands to code that uses a variable named `context`, which must be in scope at the call site */
70:
1.1 timbl 71:
72:
73: /* Handle Attribute
74: ** ----------------
75: */
76: /* PUBLIC CONST char * SGML_default = ""; ?? */
77:
78: #ifdef __STDC__
1.2 timbl 79: PRIVATE void handle_attribute_name(HTStream * context, const char * s)
1.1 timbl 80: #else
81: PRIVATE void handle_attribute_name(context, s)
1.2 timbl 82: HTStream * context;
1.1 timbl 83: char *s;
84: #endif
85: {
1.2 timbl 86:
87: HTTag * tag = context->current_tag;
88: attr* attributes = tag->attributes;
89:
90: int high, low, i, diff; /* Binary search for attribute name */
91: for(low=0, high=tag->number_of_attributes;
92: high > low ;
93: diff < 0 ? (low = i+1) : (high = i) ) {
94: i = (low + (high-low)/2);
95: diff = strcasecomp(attributes[i].name, s);
96: if (diff==0) { /* success: found it */
97: context->current_attribute_number = i;
98: context->present[i] = YES;
99: if (context->value[i]) {
100: free(context->value[i]);
101: context->value[i] = NULL;
102: }
103: return;
104: } /* if */
105:
106: } /* for */
107:
108: if (TRACE)
109: fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
110: s, context->current_tag->name);
111: context->current_attribute_number = INVALID; /* Invalid */
1.1 timbl 112: }
113:
114:
115: /* Handle attribute value
116: ** ----------------------
117: */
118: #ifdef __STDC__
1.2 timbl 119: PRIVATE void handle_attribute_value(HTStream * context, const char * s)
1.1 timbl 120: #else
121: PRIVATE void handle_attribute_value(context, s)
1.2 timbl 122: HTStream * context;
1.1 timbl 123: char *s;
124: #endif
125: {
1.2 timbl 126: if (context->current_attribute_number != INVALID) {
127: StrAllocCopy(context->value[context->current_attribute_number], s);
1.1 timbl 128: } else {
129: if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
130: }
1.2 timbl 131: context->current_attribute_number = INVALID; /* can't have two assignments! */
1.1 timbl 132: }
133:
1.2 timbl 134:
1.1 timbl 135: /* Handle entity
136: ** -------------
137: **
138: ** On entry,
139: ** s contains the entity name zero terminated
140: ** Bugs:
141: ** If the entity name is unknown, the terminator is treated as
142: ** a printable non-special character in all cases, even if it is '<'
143: */
144: #ifdef __STDC__
1.2 timbl 145: PRIVATE void handle_entity(HTStream * context, char term)
1.1 timbl 146: #else
147: PRIVATE void handle_entity(context, term)
1.2 timbl 148: HTStream * context;
1.1 timbl 149: char term;
150: #endif
151: {
1.2 timbl 152:
1.3 timbl 153: CONST char ** entities = context->dtd->entity_names;
1.1 timbl 154: CONST char *s = context->string->data;
1.2 timbl 155:
156: int high, low, i, diff;
157: for(low=0, high = context->dtd->number_of_entities;
158: high > low ;
159: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
160: i = (low + (high-low)/2);
161: diff = strcmp(entities[i], s); /* Csse sensitive! */
162: if (diff==0) { /* success: found it */
163: (*context->actions->put_entity)(context->target, i);
164: return;
1.1 timbl 165: }
166: }
167: /* If entity string not found, display as text */
168: if (TRACE)
169: fprintf(stderr, "SGML: Unknown entity %s\n", s);
1.2 timbl 170: PUTC('&');
1.1 timbl 171: {
172: CONST char *p;
173: for (p=s; *p; p++) {
1.2 timbl 174: PUTC(*p);
1.1 timbl 175: }
176: }
1.2 timbl 177: PUTC(term);
1.1 timbl 178: }
179:
1.2 timbl 180:
1.1 timbl 181: /* End element
1.2 timbl 182: ** -----------
1.1 timbl 183: */
184: #ifdef __STDC__
1.2 timbl 185: PRIVATE void end_element(HTStream * context, HTTag * old_tag)
1.1 timbl 186: #else
187: PRIVATE void end_element(context, old_tag)
188: HTTag * old_tag;
1.2 timbl 189: HTStream * context;
1.1 timbl 190: #endif
191: {
192: if (TRACE) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
1.2 timbl 193: if (old_tag->contents == SGML_EMPTY) {
1.1 timbl 194: if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
195: old_tag->name);
196: return;
197: }
198: while (context->element_stack) {/* Loop is error path only */
199: HTElement * N = context->element_stack;
200: HTTag * t = N->tag;
201:
202: if (old_tag != t) { /* Mismatch: syntax error */
203: if (context->element_stack->next) { /* This is not the last level */
204: if (TRACE) fprintf(stderr,
205: "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
206: old_tag->name, t->name, t->name);
207: } else { /* last level */
208: if (TRACE) fprintf(stderr,
209: "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
210: old_tag->name, t->name, old_tag->name);
211: return; /* Ignore */
212: }
213: }
214:
215: context->element_stack = N->next; /* Remove from stack */
216: free(N);
1.2 timbl 217: (*context->actions->end_element)(context->target,
218: t - context->dtd->tags);
1.1 timbl 219: if (old_tag == t) return; /* Correct sequence */
220:
221: /* Syntax error path only */
222:
223: }
1.5 timbl 224: if (TRACE) fprintf(stderr,
1.1 timbl 225: "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
226: }
227:
228:
229: /* Start a element
230: */
231: #ifdef __STDC__
1.2 timbl 232: PRIVATE void start_element(HTStream * context)
1.1 timbl 233: #else
234: PRIVATE void start_element(context)
1.2 timbl 235: HTStream * context;
1.1 timbl 236: #endif
237: {
238: HTTag * new_tag = context->current_tag;
239:
240: if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
1.2 timbl 241: (*context->actions->start_element)(
242: context->target,
243: new_tag - context->dtd->tags,
244: context->present,
1.3 timbl 245: (CONST char**) context->value); /* coerce type for think c */
1.2 timbl 246: if (new_tag->contents != SGML_EMPTY) { /* i.e. tag not empty */
1.1 timbl 247: HTElement * N = (HTElement *)malloc(sizeof(HTElement));
248: if (N == NULL) outofmem(__FILE__, "start_element");
249: N->next = context->element_stack;
250: N->tag = new_tag;
251: context->element_stack = N;
252: }
253: }
254:
255:
1.2 timbl 256: /* Find Tag in DTD tag list
257: ** ------------------------
1.1 timbl 258: **
259: ** On entry,
1.2 timbl 260: ** dtd points to dtd structire including valid tag list
261: ** string points to name of tag in question
1.1 timbl 262: **
1.2 timbl 263: ** On exit,
264: ** returns:
1.7 ! timbl 265: ** NULL tag not found
! 266: ** else address of tag structure in dtd
1.2 timbl 267: */
1.7 ! timbl 268: PRIVATE HTTag * find_tag ARGS2(CONST SGML_dtd*, dtd, char *, string)
1.2 timbl 269: {
270: int high, low, i, diff;
271: for(low=0, high=dtd->number_of_tags;
272: high > low ;
273: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
274: i = (low + (high-low)/2);
1.3 timbl 275: diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
1.2 timbl 276: if (diff==0) { /* success: found it */
1.7 ! timbl 277: return &dtd->tags[i];
1.2 timbl 278: }
279: }
1.7 ! timbl 280: return NULL;
1.2 timbl 281: }
282:
283: /*________________________________________________________________________
284: ** Public Methods
1.1 timbl 285: */
286:
1.2 timbl 287:
288: PUBLIC void SGML_end ARGS1(HTStream *, context)
1.1 timbl 289: {
1.2 timbl 290: /* Could check that we are back to bottom of stack! @@ */
1.1 timbl 291:
1.2 timbl 292: (*context->actions->end_document)(context->target);
1.1 timbl 293: }
294:
295:
1.2 timbl 296: PUBLIC void SGML_free ARGS1(HTStream *, context)
1.1 timbl 297: {
1.2 timbl 298: (*context->actions->free)(context->target);
1.1 timbl 299: HTChunkFree(context->string);
300: free(context);
301: }
302:
1.2 timbl 303:
/*	Read and write user callback handle
**	-----------------------------------
**
**	The callbacks from the SGML parser have an SGML context parameter.
**	These two calls let the caller attach a pointer of his own to a
**	particular SGML context and get it back later.  The parser only
**	stores the pointer.  They exist only when CALLERDATA is defined.
*/

#ifdef CALLERDATA
/*	Return the pointer last stored with SGML_setCallerData()
*/
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
{
    void * stored = context->callerData;
    return stored;
}

/*	Remember the caller's pointer in this context
*/
PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
{
    context->callerData = data;
}
#endif
1.1 timbl 323:
1.2 timbl 324: PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
1.1 timbl 325:
326: {
1.2 timbl 327: CONST SGML_dtd *dtd = context->dtd;
1.1 timbl 328: HTChunk *string = context->string;
329:
330: switch(context->state) {
331: case S_text:
1.6 timbl 332: if (c=='&' && (!context->element_stack || (
333: context->element_stack->tag &&
334: ( context->element_stack->tag->contents == SGML_MIXED
335: || context->element_stack->tag->contents ==
336: SGML_RCDATA)
337: ))) {
1.1 timbl 338: string->size = 0;
339: context->state = S_ero;
340:
341: } else if (c=='<') {
342: string->size = 0;
343: context->state = (context->element_stack &&
344: context->element_stack->tag &&
1.2 timbl 345: context->element_stack->tag->contents == SGML_LITTERAL) ?
1.1 timbl 346: S_litteral : S_tag;
1.2 timbl 347: } else PUTC(c);
1.1 timbl 348: break;
349:
350: /* In litteral mode, waits only for specific end tag!
1.2 timbl 351: ** Only foir compatibility with old servers.
1.1 timbl 352: */
353: case S_litteral :
354: HTChunkPutc(string, c);
355: if ( TOUPPER(c) != ((string->size ==1) ? '/'
356: : context->element_stack->tag->name[string->size-2])) {
357: int i;
358:
359: /* If complete match, end litteral */
360: if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
361: end_element(context, context->element_stack->tag);
362: string->size = 0;
1.2 timbl 363: context->current_attribute_number = INVALID;
1.1 timbl 364: context->state = S_text;
365: break;
366: } /* If Mismatch: recover string. */
1.2 timbl 367: PUTC( '<');
1.1 timbl 368: for (i=0; i<string->size; i++) /* recover */
1.2 timbl 369: PUTC(
1.1 timbl 370: string->data[i]);
371: context->state = S_text;
372: }
373:
374: break;
375:
376: /* Character reference or Entity
377: */
378: case S_ero:
379: if (c=='#') {
380: context->state = S_cro; /* &# is Char Ref Open */
381: break;
382: }
383: context->state = S_entity; /* Fall through! */
384:
385: /* Handle Entities
386: */
387: case S_entity:
388: if (isalnum(c))
389: HTChunkPutc(string, c);
390: else {
391: HTChunkTerminate(string);
392: handle_entity(context, c);
393: context->state = S_text;
394: }
395: break;
396:
397: /* Character reference
398: */
399: case S_cro:
400: if (isalnum(c))
401: HTChunkPutc(string, c); /* accumulate a character NUMBER */
402: else {
403: int value;
404: HTChunkTerminate(string);
405: if (sscanf(string->data, "%d", &value)==1)
1.2 timbl 406: PUTC(FROMASCII((char)value));
1.1 timbl 407: context->state = S_text;
408: }
409: break;
410:
411: /* Tag
412: */
413: case S_tag: /* new tag */
414: if (isalnum(c))
415: HTChunkPutc(string, c);
416: else { /* End of tag name */
1.7 ! timbl 417: HTTag * t;
1.1 timbl 418: if (c=='/') {
419: if (TRACE) if (string->size!=0)
420: fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
421: context->state = S_end;
422: break;
423: }
424: HTChunkTerminate(string) ;
1.2 timbl 425:
426: t = find_tag(dtd, string->data);
1.7 ! timbl 427: if (!t) {
1.2 timbl 428: if(TRACE) fprintf(stderr, "SGML: *** Unknown element %s\n",
1.1 timbl 429: string->data);
430: context->state = (c=='>') ? S_text : S_junk_tag;
431: break;
432: }
1.7 ! timbl 433: context->current_tag = t;
1.2 timbl 434:
435: /* Clear out attributes
436: */
1.1 timbl 437:
1.2 timbl 438: {
439: int i;
440: for (i=0; i< context->current_tag->number_of_attributes; i++)
441: context->present[i] = NO;
1.1 timbl 442: }
443: string->size = 0;
1.2 timbl 444: context->current_attribute_number = INVALID;
1.1 timbl 445:
446: if (c=='>') {
447: if (context->current_tag->name) start_element(context);
448: context->state = S_text;
449: } else {
450: context->state = S_tag_gap;
451: }
452: }
453: break;
454:
455:
456: case S_tag_gap: /* Expecting attribute or > */
457: if (WHITE(c)) break; /* Gap between attributes */
458: if (c=='>') { /* End of tag */
459: if (context->current_tag->name) start_element(context);
460: context->state = S_text;
461: break;
462: }
463: HTChunkPutc(string, c);
464: context->state = S_attr; /* Get attribute */
465: break;
466:
467: /* accumulating value */
468: case S_attr:
469: if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
470: HTChunkTerminate(string) ;
471: handle_attribute_name(context, string->data);
472: string->size = 0;
473: if (c=='>') { /* End of tag */
474: if (context->current_tag->name) start_element(context);
475: context->state = S_text;
476: break;
477: }
478: context->state = (c=='=' ? S_equals: S_attr_gap);
479: } else {
480: HTChunkPutc(string, c);
481: }
482: break;
483:
484: case S_attr_gap: /* Expecting attribute or = or > */
485: if (WHITE(c)) break; /* Gap after attribute */
486: if (c=='>') { /* End of tag */
487: if (context->current_tag->name) start_element(context);
488: context->state = S_text;
489: break;
490: } else if (c=='=') {
491: context->state = S_equals;
492: break;
493: }
494: HTChunkPutc(string, c);
495: context->state = S_attr; /* Get next attribute */
496: break;
497:
498: case S_equals: /* After attr = */
499: if (WHITE(c)) break; /* Before attribute value */
500: if (c=='>') { /* End of tag */
1.5 timbl 501: if (TRACE) fprintf(stderr, "SGML: found = but no value\n");
1.1 timbl 502: if (context->current_tag->name) start_element(context);
503: context->state = S_text;
504: break;
505:
506: } else if (c=='\'') {
507: context->state = S_squoted;
508: break;
509:
510: } else if (c=='"') {
511: context->state = S_dquoted;
512: break;
513: }
514: HTChunkPutc(string, c);
515: context->state = S_value;
516: break;
517:
518: case S_value:
519: if (WHITE(c) || (c=='>')) { /* End of word */
520: HTChunkTerminate(string) ;
521: handle_attribute_value(context, string->data);
522: string->size = 0;
523: if (c=='>') { /* End of tag */
524: if (context->current_tag->name) start_element(context);
525: context->state = S_text;
526: break;
527: }
528: else context->state = S_tag_gap;
529: } else {
530: HTChunkPutc(string, c);
531: }
532: break;
533:
534: case S_squoted: /* Quoted attribute value */
535: if (c=='\'') { /* End of attribute value */
536: HTChunkTerminate(string) ;
537: handle_attribute_value(context, string->data);
538: string->size = 0;
539: context->state = S_tag_gap;
540: } else {
541: HTChunkPutc(string, c);
542: }
543: break;
544:
545: case S_dquoted: /* Quoted attribute value */
546: if (c=='"') { /* End of attribute value */
547: HTChunkTerminate(string) ;
548: handle_attribute_value(context, string->data);
549: string->size = 0;
550: context->state = S_tag_gap;
551: } else {
552: HTChunkPutc(string, c);
553: }
554: break;
555:
556: case S_end: /* </ */
557: if (isalnum(c))
558: HTChunkPutc(string, c);
559: else { /* End of end tag name */
1.7 ! timbl 560: HTTag * t;
1.1 timbl 561: HTChunkTerminate(string) ;
1.7 ! timbl 562: if (!*string->data) { /* Empty end tag */
! 563: t = context->element_stack->tag;
! 564: } else {
! 565: t = find_tag(dtd, string->data);
1.1 timbl 566: }
1.7 ! timbl 567: if (!t) {
1.1 timbl 568: if(TRACE) fprintf(stderr,
569: "Unknown end tag </%s>\n", string->data);
1.2 timbl 570: } else {
1.7 ! timbl 571: context->current_tag = t;
1.2 timbl 572: end_element( context, context->current_tag);
1.1 timbl 573: }
1.2 timbl 574:
1.1 timbl 575: string->size = 0;
1.2 timbl 576: context->current_attribute_number = INVALID;
1.7 ! timbl 577: if (c!='>') {
! 578: if (TRACE && !WHITE(c))
! 579: fprintf(stderr,"SGML: `</%s%c' found!\n",
! 580: string->data, c);
! 581: context->state = S_junk_tag;
! 582: } else {
! 583: context->state = S_text;
! 584: }
1.1 timbl 585: }
586: break;
587:
588:
589: case S_junk_tag:
590: if (c=='>') {
591: context->state = S_text;
592: }
593:
594: } /* switch on context->state */
595:
596: } /* SGML_character */
1.2 timbl 597:
598:
599: PUBLIC void SGML_string ARGS2(HTStream *, context, CONST char*, str)
600: {
601: CONST char *p;
602: for(p=str; *p; p++)
603: SGML_character(context, *p);
604: }
605:
606:
607: PUBLIC void SGML_write ARGS3(HTStream *, context, CONST char*, str, int, l)
608: {
609: CONST char *p;
610: CONST char *e = str+l;
611: for(p=str; p<e; p++)
612: SGML_character(context, *p);
613: }
614:
615: /*_______________________________________________________________________
616: */
617:
618: /* Structured Object Class
619: ** -----------------------
620: */
621: PUBLIC CONST HTStreamClass SGMLParser =
622: {
623: "SGMLParser",
624: SGML_free,
625: SGML_end,
626: SGML_character, SGML_string, SGML_write,
627: };
628:
629: /* Create SGML Engine
630: ** ------------------
631: **
632: ** On entry,
633: ** dtd represents the DTD, along with
634: ** actions is the sink for the data as a set of routines.
635: **
636: */
637:
638: PUBLIC HTStream* SGML_new ARGS2(
639: CONST SGML_dtd *, dtd,
640: HTStructured *, target)
641: {
642: int i;
643: HTStream* context = (HTStream *) malloc(sizeof(*context));
644: if (!context) outofmem(__FILE__, "SGML_begin");
645:
646: context->isa = &SGMLParser;
647: context->string = HTChunkCreate(128); /* Grow by this much */
648: context->dtd = dtd;
649: context->target = target;
650: context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
651: /* Ugh: no OO */
652: context->state = S_text;
653: context->element_stack = 0; /* empty */
654: #ifdef CALLERDATA
655: context->callerData = (void*) callerData;
656: #endif
657: for(i=0; i<MAX_ATTRIBUTES; i++) context->value[i] = 0;
658:
659: return context;
660: }
661:
Webmaster