Annotation of libwww/Library/src/SGML.c, revision 1.13
1.1 timbl 1: /* General SGML Parser code SGML.c
2: ** ========================
3: **
1.2 timbl 4: ** This module implements an HTStream object. To parse an
1.1 timbl 5: ** SGML file, create this object which is a parser. The object
1.2 timbl 6: ** is (currently) created by being passed a DTD structure,
7: ** and a target HTStructured object at which to throw the parsed stuff.
1.1 timbl 8: **
1.2 timbl 9: ** 6 Feb 93 Binary searches used. Interface modified.
1.1 timbl 10: */
11: #include "SGML.h"
12:
13: #include <ctype.h>
14: #include <stdio.h>
15: #include "HTUtils.h"
16: #include "HTChunk.h"
17: #include "tcp.h" /* For FROMASCII */
18:
1.2 timbl 19: #define INVALID (-1)
20:
1.1 timbl 21: /* The State (context) of the parser
22: **
1.2 timbl 23: ** This is passed with each call to make the parser reentrant
1.1 timbl 24: **
25: */
26:
1.2 timbl 27: #define MAX_ATTRIBUTES 20 /* Max number of attributes per element */
28:
29:
30: /* Element Stack
31: ** -------------
32: ** This allows us to return down the stack reselcting styles.
33: ** As we return, attribute values will be garbage in general.
34: */
35: typedef struct _HTElement HTElement;
36: struct _HTElement {
37: HTElement * next; /* Previously nested element or 0 */
38: HTTag* tag; /* The tag at this level */
39: };
40:
41:
42: /* Internal Context Data Structure
43: ** -------------------------------
44: */
45: struct _HTStream {
46:
47: CONST HTStreamClass * isa; /* inherited from HTStream */
48:
49: CONST SGML_dtd *dtd;
50: HTStructuredClass *actions; /* target class */
51: HTStructured *target; /* target object */
52:
1.1 timbl 53: HTTag *current_tag;
1.2 timbl 54: int current_attribute_number;
1.1 timbl 55: HTChunk *string;
56: HTElement *element_stack;
1.12 timbl 57: enum sgml_state { S_text, S_literal, S_tag, S_tag_gap,
1.1 timbl 58: S_attr, S_attr_gap, S_equals, S_value,
59: S_ero, S_cro,
1.13 ! timbl 60: #ifdef ISO_2022_JP
! 61: S_esc, S_dollar, S_paren, S_nonascii_text,
! 62: #endif
1.1 timbl 63: S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
1.2 timbl 64: #ifdef CALLERDATA
1.1 timbl 65: void * callerData;
1.2 timbl 66: #endif
67: BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
68: char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
69: } ;
70:
71:
/* Send one character to the target; relies on a variable named `context'
** being in scope at the point of use. */
#define PUTC(ch) ((*context->actions->put_character)(context->target, (ch)))
73:
1.1 timbl 74:
75:
76: /* Handle Attribute
77: ** ----------------
78: */
79: /* PUBLIC CONST char * SGML_default = ""; ?? */
80:
81: #ifdef __STDC__
1.2 timbl 82: PRIVATE void handle_attribute_name(HTStream * context, const char * s)
1.1 timbl 83: #else
84: PRIVATE void handle_attribute_name(context, s)
1.2 timbl 85: HTStream * context;
1.1 timbl 86: char *s;
87: #endif
88: {
1.2 timbl 89:
90: HTTag * tag = context->current_tag;
91: attr* attributes = tag->attributes;
92:
93: int high, low, i, diff; /* Binary search for attribute name */
94: for(low=0, high=tag->number_of_attributes;
95: high > low ;
96: diff < 0 ? (low = i+1) : (high = i) ) {
97: i = (low + (high-low)/2);
98: diff = strcasecomp(attributes[i].name, s);
99: if (diff==0) { /* success: found it */
100: context->current_attribute_number = i;
101: context->present[i] = YES;
102: if (context->value[i]) {
103: free(context->value[i]);
104: context->value[i] = NULL;
105: }
106: return;
107: } /* if */
108:
109: } /* for */
110:
111: if (TRACE)
112: fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
113: s, context->current_tag->name);
114: context->current_attribute_number = INVALID; /* Invalid */
1.1 timbl 115: }
116:
117:
118: /* Handle attribute value
119: ** ----------------------
120: */
121: #ifdef __STDC__
1.2 timbl 122: PRIVATE void handle_attribute_value(HTStream * context, const char * s)
1.1 timbl 123: #else
124: PRIVATE void handle_attribute_value(context, s)
1.2 timbl 125: HTStream * context;
1.1 timbl 126: char *s;
127: #endif
128: {
1.2 timbl 129: if (context->current_attribute_number != INVALID) {
130: StrAllocCopy(context->value[context->current_attribute_number], s);
1.1 timbl 131: } else {
132: if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
133: }
1.2 timbl 134: context->current_attribute_number = INVALID; /* can't have two assignments! */
1.1 timbl 135: }
136:
1.2 timbl 137:
1.1 timbl 138: /* Handle entity
139: ** -------------
140: **
141: ** On entry,
142: ** s contains the entity name zero terminated
143: ** Bugs:
144: ** If the entity name is unknown, the terminator is treated as
145: ** a printable non-special character in all cases, even if it is '<'
146: */
147: #ifdef __STDC__
1.2 timbl 148: PRIVATE void handle_entity(HTStream * context, char term)
1.1 timbl 149: #else
150: PRIVATE void handle_entity(context, term)
1.2 timbl 151: HTStream * context;
1.1 timbl 152: char term;
153: #endif
154: {
1.2 timbl 155:
1.3 timbl 156: CONST char ** entities = context->dtd->entity_names;
1.1 timbl 157: CONST char *s = context->string->data;
1.2 timbl 158:
159: int high, low, i, diff;
160: for(low=0, high = context->dtd->number_of_entities;
161: high > low ;
162: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
163: i = (low + (high-low)/2);
164: diff = strcmp(entities[i], s); /* Csse sensitive! */
165: if (diff==0) { /* success: found it */
166: (*context->actions->put_entity)(context->target, i);
167: return;
1.1 timbl 168: }
169: }
170: /* If entity string not found, display as text */
171: if (TRACE)
172: fprintf(stderr, "SGML: Unknown entity %s\n", s);
1.2 timbl 173: PUTC('&');
1.1 timbl 174: {
175: CONST char *p;
176: for (p=s; *p; p++) {
1.2 timbl 177: PUTC(*p);
1.1 timbl 178: }
179: }
1.2 timbl 180: PUTC(term);
1.1 timbl 181: }
182:
1.2 timbl 183:
1.1 timbl 184: /* End element
1.2 timbl 185: ** -----------
1.1 timbl 186: */
187: #ifdef __STDC__
1.2 timbl 188: PRIVATE void end_element(HTStream * context, HTTag * old_tag)
1.1 timbl 189: #else
190: PRIVATE void end_element(context, old_tag)
191: HTTag * old_tag;
1.2 timbl 192: HTStream * context;
1.1 timbl 193: #endif
194: {
195: if (TRACE) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
1.2 timbl 196: if (old_tag->contents == SGML_EMPTY) {
1.1 timbl 197: if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
198: old_tag->name);
199: return;
200: }
201: while (context->element_stack) {/* Loop is error path only */
202: HTElement * N = context->element_stack;
203: HTTag * t = N->tag;
204:
205: if (old_tag != t) { /* Mismatch: syntax error */
206: if (context->element_stack->next) { /* This is not the last level */
207: if (TRACE) fprintf(stderr,
208: "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
209: old_tag->name, t->name, t->name);
210: } else { /* last level */
211: if (TRACE) fprintf(stderr,
212: "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
213: old_tag->name, t->name, old_tag->name);
214: return; /* Ignore */
215: }
216: }
217:
218: context->element_stack = N->next; /* Remove from stack */
219: free(N);
1.2 timbl 220: (*context->actions->end_element)(context->target,
221: t - context->dtd->tags);
1.1 timbl 222: if (old_tag == t) return; /* Correct sequence */
223:
224: /* Syntax error path only */
225:
226: }
1.5 timbl 227: if (TRACE) fprintf(stderr,
1.1 timbl 228: "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
229: }
230:
231:
232: /* Start a element
233: */
234: #ifdef __STDC__
1.2 timbl 235: PRIVATE void start_element(HTStream * context)
1.1 timbl 236: #else
237: PRIVATE void start_element(context)
1.2 timbl 238: HTStream * context;
1.1 timbl 239: #endif
240: {
241: HTTag * new_tag = context->current_tag;
242:
243: if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
1.2 timbl 244: (*context->actions->start_element)(
245: context->target,
246: new_tag - context->dtd->tags,
247: context->present,
1.3 timbl 248: (CONST char**) context->value); /* coerce type for think c */
1.2 timbl 249: if (new_tag->contents != SGML_EMPTY) { /* i.e. tag not empty */
1.1 timbl 250: HTElement * N = (HTElement *)malloc(sizeof(HTElement));
251: if (N == NULL) outofmem(__FILE__, "start_element");
252: N->next = context->element_stack;
253: N->tag = new_tag;
254: context->element_stack = N;
255: }
256: }
257:
258:
1.2 timbl 259: /* Find Tag in DTD tag list
260: ** ------------------------
1.1 timbl 261: **
262: ** On entry,
1.2 timbl 263: ** dtd points to dtd structire including valid tag list
264: ** string points to name of tag in question
1.1 timbl 265: **
1.2 timbl 266: ** On exit,
267: ** returns:
1.7 timbl 268: ** NULL tag not found
269: ** else address of tag structure in dtd
1.2 timbl 270: */
1.11 timbl 271: PUBLIC HTTag * SGMLFindTag ARGS2(CONST SGML_dtd*, dtd, CONST char *, string)
1.2 timbl 272: {
273: int high, low, i, diff;
274: for(low=0, high=dtd->number_of_tags;
275: high > low ;
276: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
277: i = (low + (high-low)/2);
1.3 timbl 278: diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
1.2 timbl 279: if (diff==0) { /* success: found it */
1.7 timbl 280: return &dtd->tags[i];
1.2 timbl 281: }
282: }
1.7 timbl 283: return NULL;
1.2 timbl 284: }
285:
286: /*________________________________________________________________________
287: ** Public Methods
1.1 timbl 288: */
289:
1.2 timbl 290:
291: /* Could check that we are back to bottom of stack! @@ */
1.1 timbl 292:
1.8 timbl 293: PUBLIC void SGML_free ARGS1(HTStream *, context)
294: {
295: (*context->actions->free)(context->target);
296: HTChunkFree(context->string);
297: free(context);
1.1 timbl 298: }
299:
1.8 timbl 300: PUBLIC void SGML_abort ARGS2(HTStream *, context, HTError, e)
1.1 timbl 301: {
1.8 timbl 302: (*context->actions->abort)(context->target, e);
1.1 timbl 303: HTChunkFree(context->string);
304: free(context);
305: }
306:
1.2 timbl 307:
/*	Read and write user callback handle
**	-----------------------------------
**
**	The callbacks from the SGML parser have an SGML context parameter.
**	These calls allow the caller to associate his own context with a
**	particular SGML context.  Only compiled when CALLERDATA is defined.
*/

#ifdef CALLERDATA
/* Return the handle stored in this context */
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
{
    void * handle = context->callerData;
    return handle;
}

/* Store data as the handle of this context, replacing any previous one */
PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
{
    context->callerData = data;
}
#endif /* CALLERDATA */
1.1 timbl 327:
1.2 timbl 328: PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
1.1 timbl 329:
330: {
1.2 timbl 331: CONST SGML_dtd *dtd = context->dtd;
1.1 timbl 332: HTChunk *string = context->string;
333:
334: switch(context->state) {
335: case S_text:
1.13 ! timbl 336: #ifdef ISO_2022_JP
! 337: if (c=='\033') {
! 338: context->state = S_esc;
! 339: PUTC(c);
! 340: break;
! 341: }
! 342: #endif /* ISO_2022_JP */
1.6 timbl 343: if (c=='&' && (!context->element_stack || (
344: context->element_stack->tag &&
345: ( context->element_stack->tag->contents == SGML_MIXED
346: || context->element_stack->tag->contents ==
347: SGML_RCDATA)
348: ))) {
1.1 timbl 349: string->size = 0;
350: context->state = S_ero;
351:
352: } else if (c=='<') {
353: string->size = 0;
354: context->state = (context->element_stack &&
1.13 ! timbl 355: context->element_stack->tag &&
! 356: context->element_stack->tag->contents == SGML_LITERAL) ?
1.12 timbl 357: S_literal : S_tag;
1.2 timbl 358: } else PUTC(c);
1.1 timbl 359: break;
1.13 ! timbl 360:
! 361: #ifdef ISO_2022_JP
! 362: case S_esc:
! 363: if (c=='$') {
! 364: context->state = S_dollar;
! 365: } else if (c=='(') {
! 366: context->state = S_paren;
! 367: } else {
! 368: context->state = S_text;
! 369: }
! 370: PUTC(c);
! 371: break;
! 372: case S_dollar:
! 373: if (c=='@' || c=='B') {
! 374: context->state = S_nonascii_text;
! 375: } else {
! 376: context->state = S_text;
! 377: }
! 378: PUTC(c);
! 379: break;
! 380: case S_paren:
! 381: if (c=='B' || c=='J') {
! 382: context->state = S_text;
! 383: } else {
! 384: context->state = S_text;
! 385: }
! 386: PUTC(c);
! 387: break;
! 388: case S_nonascii_text:
! 389: if (c=='\033') {
! 390: context->state = S_esc;
! 391: PUTC(c);
! 392: } else {
! 393: PUTC(c);
! 394: }
! 395: break;
! 396: #endif /* ISO_2022_JP */
1.1 timbl 397:
1.12 timbl 398: /* In literal mode, waits only for specific end tag!
1.2 timbl 399: ** Only foir compatibility with old servers.
1.1 timbl 400: */
1.12 timbl 401: case S_literal :
1.1 timbl 402: HTChunkPutc(string, c);
403: if ( TOUPPER(c) != ((string->size ==1) ? '/'
404: : context->element_stack->tag->name[string->size-2])) {
405: int i;
406:
1.12 timbl 407: /* If complete match, end literal */
1.1 timbl 408: if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
409: end_element(context, context->element_stack->tag);
410: string->size = 0;
1.2 timbl 411: context->current_attribute_number = INVALID;
1.1 timbl 412: context->state = S_text;
413: break;
414: } /* If Mismatch: recover string. */
1.2 timbl 415: PUTC( '<');
1.1 timbl 416: for (i=0; i<string->size; i++) /* recover */
1.2 timbl 417: PUTC(
1.1 timbl 418: string->data[i]);
419: context->state = S_text;
420: }
421:
422: break;
423:
424: /* Character reference or Entity
425: */
426: case S_ero:
427: if (c=='#') {
428: context->state = S_cro; /* &# is Char Ref Open */
429: break;
430: }
431: context->state = S_entity; /* Fall through! */
432:
433: /* Handle Entities
434: */
435: case S_entity:
436: if (isalnum(c))
437: HTChunkPutc(string, c);
438: else {
439: HTChunkTerminate(string);
440: handle_entity(context, c);
441: context->state = S_text;
442: }
443: break;
444:
445: /* Character reference
446: */
447: case S_cro:
448: if (isalnum(c))
449: HTChunkPutc(string, c); /* accumulate a character NUMBER */
450: else {
451: int value;
452: HTChunkTerminate(string);
453: if (sscanf(string->data, "%d", &value)==1)
1.2 timbl 454: PUTC(FROMASCII((char)value));
1.1 timbl 455: context->state = S_text;
456: }
457: break;
458:
459: /* Tag
460: */
461: case S_tag: /* new tag */
462: if (isalnum(c))
463: HTChunkPutc(string, c);
464: else { /* End of tag name */
1.7 timbl 465: HTTag * t;
1.1 timbl 466: if (c=='/') {
467: if (TRACE) if (string->size!=0)
468: fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
469: context->state = S_end;
470: break;
471: }
472: HTChunkTerminate(string) ;
1.2 timbl 473:
1.10 timbl 474: t = SGMLFindTag(dtd, string->data);
1.7 timbl 475: if (!t) {
1.2 timbl 476: if(TRACE) fprintf(stderr, "SGML: *** Unknown element %s\n",
1.1 timbl 477: string->data);
478: context->state = (c=='>') ? S_text : S_junk_tag;
479: break;
480: }
1.7 timbl 481: context->current_tag = t;
1.2 timbl 482:
483: /* Clear out attributes
484: */
1.1 timbl 485:
1.2 timbl 486: {
487: int i;
488: for (i=0; i< context->current_tag->number_of_attributes; i++)
489: context->present[i] = NO;
1.1 timbl 490: }
491: string->size = 0;
1.2 timbl 492: context->current_attribute_number = INVALID;
1.1 timbl 493:
494: if (c=='>') {
495: if (context->current_tag->name) start_element(context);
496: context->state = S_text;
497: } else {
498: context->state = S_tag_gap;
499: }
500: }
501: break;
502:
503:
504: case S_tag_gap: /* Expecting attribute or > */
505: if (WHITE(c)) break; /* Gap between attributes */
506: if (c=='>') { /* End of tag */
507: if (context->current_tag->name) start_element(context);
508: context->state = S_text;
509: break;
510: }
511: HTChunkPutc(string, c);
512: context->state = S_attr; /* Get attribute */
513: break;
514:
515: /* accumulating value */
516: case S_attr:
517: if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
518: HTChunkTerminate(string) ;
519: handle_attribute_name(context, string->data);
520: string->size = 0;
521: if (c=='>') { /* End of tag */
522: if (context->current_tag->name) start_element(context);
523: context->state = S_text;
524: break;
525: }
526: context->state = (c=='=' ? S_equals: S_attr_gap);
527: } else {
528: HTChunkPutc(string, c);
529: }
530: break;
531:
532: case S_attr_gap: /* Expecting attribute or = or > */
533: if (WHITE(c)) break; /* Gap after attribute */
534: if (c=='>') { /* End of tag */
535: if (context->current_tag->name) start_element(context);
536: context->state = S_text;
537: break;
538: } else if (c=='=') {
539: context->state = S_equals;
540: break;
541: }
542: HTChunkPutc(string, c);
543: context->state = S_attr; /* Get next attribute */
544: break;
545:
546: case S_equals: /* After attr = */
547: if (WHITE(c)) break; /* Before attribute value */
548: if (c=='>') { /* End of tag */
1.5 timbl 549: if (TRACE) fprintf(stderr, "SGML: found = but no value\n");
1.1 timbl 550: if (context->current_tag->name) start_element(context);
551: context->state = S_text;
552: break;
553:
554: } else if (c=='\'') {
555: context->state = S_squoted;
556: break;
557:
558: } else if (c=='"') {
559: context->state = S_dquoted;
560: break;
561: }
562: HTChunkPutc(string, c);
563: context->state = S_value;
564: break;
565:
566: case S_value:
567: if (WHITE(c) || (c=='>')) { /* End of word */
568: HTChunkTerminate(string) ;
569: handle_attribute_value(context, string->data);
570: string->size = 0;
571: if (c=='>') { /* End of tag */
572: if (context->current_tag->name) start_element(context);
573: context->state = S_text;
574: break;
575: }
576: else context->state = S_tag_gap;
577: } else {
578: HTChunkPutc(string, c);
579: }
580: break;
581:
582: case S_squoted: /* Quoted attribute value */
583: if (c=='\'') { /* End of attribute value */
584: HTChunkTerminate(string) ;
585: handle_attribute_value(context, string->data);
586: string->size = 0;
587: context->state = S_tag_gap;
588: } else {
589: HTChunkPutc(string, c);
590: }
591: break;
592:
593: case S_dquoted: /* Quoted attribute value */
594: if (c=='"') { /* End of attribute value */
595: HTChunkTerminate(string) ;
596: handle_attribute_value(context, string->data);
597: string->size = 0;
598: context->state = S_tag_gap;
599: } else {
600: HTChunkPutc(string, c);
601: }
602: break;
603:
604: case S_end: /* </ */
605: if (isalnum(c))
606: HTChunkPutc(string, c);
607: else { /* End of end tag name */
1.7 timbl 608: HTTag * t;
1.1 timbl 609: HTChunkTerminate(string) ;
1.7 timbl 610: if (!*string->data) { /* Empty end tag */
611: t = context->element_stack->tag;
612: } else {
1.10 timbl 613: t = SGMLFindTag(dtd, string->data);
1.1 timbl 614: }
1.7 timbl 615: if (!t) {
1.1 timbl 616: if(TRACE) fprintf(stderr,
617: "Unknown end tag </%s>\n", string->data);
1.2 timbl 618: } else {
1.7 timbl 619: context->current_tag = t;
1.2 timbl 620: end_element( context, context->current_tag);
1.1 timbl 621: }
1.2 timbl 622:
1.1 timbl 623: string->size = 0;
1.2 timbl 624: context->current_attribute_number = INVALID;
1.7 timbl 625: if (c!='>') {
626: if (TRACE && !WHITE(c))
627: fprintf(stderr,"SGML: `</%s%c' found!\n",
628: string->data, c);
629: context->state = S_junk_tag;
630: } else {
631: context->state = S_text;
632: }
1.1 timbl 633: }
634: break;
635:
636:
637: case S_junk_tag:
638: if (c=='>') {
639: context->state = S_text;
640: }
641:
642: } /* switch on context->state */
643:
644: } /* SGML_character */
1.2 timbl 645:
646:
647: PUBLIC void SGML_string ARGS2(HTStream *, context, CONST char*, str)
648: {
649: CONST char *p;
650: for(p=str; *p; p++)
651: SGML_character(context, *p);
652: }
653:
654:
655: PUBLIC void SGML_write ARGS3(HTStream *, context, CONST char*, str, int, l)
656: {
657: CONST char *p;
658: CONST char *e = str+l;
659: for(p=str; p<e; p++)
660: SGML_character(context, *p);
661: }
662:
663: /*_______________________________________________________________________
664: */
665:
666: /* Structured Object Class
667: ** -----------------------
668: */
669: PUBLIC CONST HTStreamClass SGMLParser =
670: {
671: "SGMLParser",
672: SGML_free,
1.8 timbl 673: SGML_abort,
1.9 timbl 674: SGML_character,
675: SGML_string,
676: SGML_write,
1.2 timbl 677: };
678:
679: /* Create SGML Engine
680: ** ------------------
681: **
682: ** On entry,
683: ** dtd represents the DTD, along with
684: ** actions is the sink for the data as a set of routines.
685: **
686: */
687:
688: PUBLIC HTStream* SGML_new ARGS2(
689: CONST SGML_dtd *, dtd,
690: HTStructured *, target)
691: {
692: int i;
693: HTStream* context = (HTStream *) malloc(sizeof(*context));
694: if (!context) outofmem(__FILE__, "SGML_begin");
695:
696: context->isa = &SGMLParser;
697: context->string = HTChunkCreate(128); /* Grow by this much */
698: context->dtd = dtd;
699: context->target = target;
700: context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
701: /* Ugh: no OO */
702: context->state = S_text;
703: context->element_stack = 0; /* empty */
704: #ifdef CALLERDATA
705: context->callerData = (void*) callerData;
706: #endif
707: for(i=0; i<MAX_ATTRIBUTES; i++) context->value[i] = 0;
708:
709: return context;
710: }
711:
Webmaster