Annotation of libwww/Library/src/SGML.c, revision 1.8
1.1 timbl 1: /* General SGML Parser code SGML.c
2: ** ========================
3: **
1.2 timbl 4: ** This module implements an HTStream object. To parse an
1.1 timbl 5: ** SGML file, create this object which is a parser. The object
1.2 timbl 6: ** is (currently) created by being passed a DTD structure,
7: ** and a target HTStructured object at which to throw the parsed stuff.
1.1 timbl 8: **
1.2 timbl 9: ** 6 Feb 93 Binary searches used. Interface modified.
1.1 timbl 10: */
11: #include "SGML.h"
12:
13: #include <ctype.h>
14: #include <stdio.h>
15: #include "HTUtils.h"
16: #include "HTChunk.h"
17: #include "tcp.h" /* For FROMASCII */
18:
1.2 timbl 19: #define INVALID (-1)
20:
1.1 timbl 21: /* The State (context) of the parser
22: **
1.2 timbl 23: ** This is passed with each call to make the parser reentrant
1.1 timbl 24: **
25: */
26:
1.2 timbl 27: #define MAX_ATTRIBUTES 20 /* Max number of attributes per element */
28:
29:
30: /* Element Stack
31: ** -------------
32: ** This allows us to return down the stack reselcting styles.
33: ** As we return, attribute values will be garbage in general.
34: */
35: typedef struct _HTElement HTElement;
36: struct _HTElement {
37: HTElement * next; /* Previously nested element or 0 */
38: HTTag* tag; /* The tag at this level */
39: };
40:
41:
42: /* Internal Context Data Structure
43: ** -------------------------------
44: */
45: struct _HTStream {
46:
47: CONST HTStreamClass * isa; /* inherited from HTStream */
48:
49: CONST SGML_dtd *dtd;
50: HTStructuredClass *actions; /* target class */
51: HTStructured *target; /* target object */
52:
1.1 timbl 53: HTTag *current_tag;
1.2 timbl 54: int current_attribute_number;
1.1 timbl 55: HTChunk *string;
56: HTElement *element_stack;
57: enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
58: S_attr, S_attr_gap, S_equals, S_value,
59: S_ero, S_cro,
60: S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
1.2 timbl 61: #ifdef CALLERDATA
1.1 timbl 62: void * callerData;
1.2 timbl 63: #endif
64: BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
65: char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
66: } ;
67:
68:
/* Send one character to the target.  Relies on a variable named
** `context' (the HTStream being driven) being in scope at the call site.
*/
#define PUTC(ch) ((*context->actions->put_character)(context->target, (ch)))
70:
1.1 timbl 71:
72:
73: /* Handle Attribute
74: ** ----------------
75: */
76: /* PUBLIC CONST char * SGML_default = ""; ?? */
77:
78: #ifdef __STDC__
1.2 timbl 79: PRIVATE void handle_attribute_name(HTStream * context, const char * s)
1.1 timbl 80: #else
81: PRIVATE void handle_attribute_name(context, s)
1.2 timbl 82: HTStream * context;
1.1 timbl 83: char *s;
84: #endif
85: {
1.2 timbl 86:
87: HTTag * tag = context->current_tag;
88: attr* attributes = tag->attributes;
89:
90: int high, low, i, diff; /* Binary search for attribute name */
91: for(low=0, high=tag->number_of_attributes;
92: high > low ;
93: diff < 0 ? (low = i+1) : (high = i) ) {
94: i = (low + (high-low)/2);
95: diff = strcasecomp(attributes[i].name, s);
96: if (diff==0) { /* success: found it */
97: context->current_attribute_number = i;
98: context->present[i] = YES;
99: if (context->value[i]) {
100: free(context->value[i]);
101: context->value[i] = NULL;
102: }
103: return;
104: } /* if */
105:
106: } /* for */
107:
108: if (TRACE)
109: fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
110: s, context->current_tag->name);
111: context->current_attribute_number = INVALID; /* Invalid */
1.1 timbl 112: }
113:
114:
115: /* Handle attribute value
116: ** ----------------------
117: */
118: #ifdef __STDC__
1.2 timbl 119: PRIVATE void handle_attribute_value(HTStream * context, const char * s)
1.1 timbl 120: #else
121: PRIVATE void handle_attribute_value(context, s)
1.2 timbl 122: HTStream * context;
1.1 timbl 123: char *s;
124: #endif
125: {
1.2 timbl 126: if (context->current_attribute_number != INVALID) {
127: StrAllocCopy(context->value[context->current_attribute_number], s);
1.1 timbl 128: } else {
129: if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
130: }
1.2 timbl 131: context->current_attribute_number = INVALID; /* can't have two assignments! */
1.1 timbl 132: }
133:
1.2 timbl 134:
1.1 timbl 135: /* Handle entity
136: ** -------------
137: **
138: ** On entry,
139: ** s contains the entity name zero terminated
140: ** Bugs:
141: ** If the entity name is unknown, the terminator is treated as
142: ** a printable non-special character in all cases, even if it is '<'
143: */
144: #ifdef __STDC__
1.2 timbl 145: PRIVATE void handle_entity(HTStream * context, char term)
1.1 timbl 146: #else
147: PRIVATE void handle_entity(context, term)
1.2 timbl 148: HTStream * context;
1.1 timbl 149: char term;
150: #endif
151: {
1.2 timbl 152:
1.3 timbl 153: CONST char ** entities = context->dtd->entity_names;
1.1 timbl 154: CONST char *s = context->string->data;
1.2 timbl 155:
156: int high, low, i, diff;
157: for(low=0, high = context->dtd->number_of_entities;
158: high > low ;
159: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
160: i = (low + (high-low)/2);
161: diff = strcmp(entities[i], s); /* Csse sensitive! */
162: if (diff==0) { /* success: found it */
163: (*context->actions->put_entity)(context->target, i);
164: return;
1.1 timbl 165: }
166: }
167: /* If entity string not found, display as text */
168: if (TRACE)
169: fprintf(stderr, "SGML: Unknown entity %s\n", s);
1.2 timbl 170: PUTC('&');
1.1 timbl 171: {
172: CONST char *p;
173: for (p=s; *p; p++) {
1.2 timbl 174: PUTC(*p);
1.1 timbl 175: }
176: }
1.2 timbl 177: PUTC(term);
1.1 timbl 178: }
179:
1.2 timbl 180:
1.1 timbl 181: /* End element
1.2 timbl 182: ** -----------
1.1 timbl 183: */
184: #ifdef __STDC__
1.2 timbl 185: PRIVATE void end_element(HTStream * context, HTTag * old_tag)
1.1 timbl 186: #else
187: PRIVATE void end_element(context, old_tag)
188: HTTag * old_tag;
1.2 timbl 189: HTStream * context;
1.1 timbl 190: #endif
191: {
192: if (TRACE) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
1.2 timbl 193: if (old_tag->contents == SGML_EMPTY) {
1.1 timbl 194: if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
195: old_tag->name);
196: return;
197: }
198: while (context->element_stack) {/* Loop is error path only */
199: HTElement * N = context->element_stack;
200: HTTag * t = N->tag;
201:
202: if (old_tag != t) { /* Mismatch: syntax error */
203: if (context->element_stack->next) { /* This is not the last level */
204: if (TRACE) fprintf(stderr,
205: "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
206: old_tag->name, t->name, t->name);
207: } else { /* last level */
208: if (TRACE) fprintf(stderr,
209: "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
210: old_tag->name, t->name, old_tag->name);
211: return; /* Ignore */
212: }
213: }
214:
215: context->element_stack = N->next; /* Remove from stack */
216: free(N);
1.2 timbl 217: (*context->actions->end_element)(context->target,
218: t - context->dtd->tags);
1.1 timbl 219: if (old_tag == t) return; /* Correct sequence */
220:
221: /* Syntax error path only */
222:
223: }
1.5 timbl 224: if (TRACE) fprintf(stderr,
1.1 timbl 225: "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
226: }
227:
228:
229: /* Start a element
230: */
231: #ifdef __STDC__
1.2 timbl 232: PRIVATE void start_element(HTStream * context)
1.1 timbl 233: #else
234: PRIVATE void start_element(context)
1.2 timbl 235: HTStream * context;
1.1 timbl 236: #endif
237: {
238: HTTag * new_tag = context->current_tag;
239:
240: if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
1.2 timbl 241: (*context->actions->start_element)(
242: context->target,
243: new_tag - context->dtd->tags,
244: context->present,
1.3 timbl 245: (CONST char**) context->value); /* coerce type for think c */
1.2 timbl 246: if (new_tag->contents != SGML_EMPTY) { /* i.e. tag not empty */
1.1 timbl 247: HTElement * N = (HTElement *)malloc(sizeof(HTElement));
248: if (N == NULL) outofmem(__FILE__, "start_element");
249: N->next = context->element_stack;
250: N->tag = new_tag;
251: context->element_stack = N;
252: }
253: }
254:
255:
1.2 timbl 256: /* Find Tag in DTD tag list
257: ** ------------------------
1.1 timbl 258: **
259: ** On entry,
1.2 timbl 260: ** dtd points to dtd structire including valid tag list
261: ** string points to name of tag in question
1.1 timbl 262: **
1.2 timbl 263: ** On exit,
264: ** returns:
1.7 timbl 265: ** NULL tag not found
266: ** else address of tag structure in dtd
1.2 timbl 267: */
1.7 timbl 268: PRIVATE HTTag * find_tag ARGS2(CONST SGML_dtd*, dtd, char *, string)
1.2 timbl 269: {
270: int high, low, i, diff;
271: for(low=0, high=dtd->number_of_tags;
272: high > low ;
273: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
274: i = (low + (high-low)/2);
1.3 timbl 275: diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
1.2 timbl 276: if (diff==0) { /* success: found it */
1.7 timbl 277: return &dtd->tags[i];
1.2 timbl 278: }
279: }
1.7 timbl 280: return NULL;
1.2 timbl 281: }
282:
283: /*________________________________________________________________________
284: ** Public Methods
1.1 timbl 285: */
286:
1.2 timbl 287:
288: /* Could check that we are back to bottom of stack! @@ */
1.1 timbl 289:
1.8 ! timbl 290: PUBLIC void SGML_free ARGS1(HTStream *, context)
! 291: {
! 292: (*context->actions->free)(context->target);
! 293: HTChunkFree(context->string);
! 294: free(context);
1.1 timbl 295: }
296:
1.8 ! timbl 297: PUBLIC void SGML_abort ARGS2(HTStream *, context, HTError, e)
1.1 timbl 298: {
1.8 ! timbl 299: (*context->actions->abort)(context->target, e);
1.1 timbl 300: HTChunkFree(context->string);
301: free(context);
302: }
303:
1.2 timbl 304:
/*	Read and write user callback handle
**	-----------------------------------
**
**	The callbacks from the SGML parser have an SGML context parameter.
**	These two calls let the caller attach a pointer of his own to a
**	particular SGML context and get it back later.  They are compiled
**	only when CALLERDATA is defined.
*/

#ifdef CALLERDATA
/* Return the pointer last stored with SGML_setCallerData() */
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
{
    return context->callerData;
}

/* Remember data in this context; the parser never looks at it */
PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
{
    context->callerData = data;
}
#endif
1.1 timbl 324:
1.2 timbl 325: PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
1.1 timbl 326:
327: {
1.2 timbl 328: CONST SGML_dtd *dtd = context->dtd;
1.1 timbl 329: HTChunk *string = context->string;
330:
331: switch(context->state) {
332: case S_text:
1.6 timbl 333: if (c=='&' && (!context->element_stack || (
334: context->element_stack->tag &&
335: ( context->element_stack->tag->contents == SGML_MIXED
336: || context->element_stack->tag->contents ==
337: SGML_RCDATA)
338: ))) {
1.1 timbl 339: string->size = 0;
340: context->state = S_ero;
341:
342: } else if (c=='<') {
343: string->size = 0;
344: context->state = (context->element_stack &&
345: context->element_stack->tag &&
1.2 timbl 346: context->element_stack->tag->contents == SGML_LITTERAL) ?
1.1 timbl 347: S_litteral : S_tag;
1.2 timbl 348: } else PUTC(c);
1.1 timbl 349: break;
350:
351: /* In litteral mode, waits only for specific end tag!
1.2 timbl 352: ** Only foir compatibility with old servers.
1.1 timbl 353: */
354: case S_litteral :
355: HTChunkPutc(string, c);
356: if ( TOUPPER(c) != ((string->size ==1) ? '/'
357: : context->element_stack->tag->name[string->size-2])) {
358: int i;
359:
360: /* If complete match, end litteral */
361: if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
362: end_element(context, context->element_stack->tag);
363: string->size = 0;
1.2 timbl 364: context->current_attribute_number = INVALID;
1.1 timbl 365: context->state = S_text;
366: break;
367: } /* If Mismatch: recover string. */
1.2 timbl 368: PUTC( '<');
1.1 timbl 369: for (i=0; i<string->size; i++) /* recover */
1.2 timbl 370: PUTC(
1.1 timbl 371: string->data[i]);
372: context->state = S_text;
373: }
374:
375: break;
376:
377: /* Character reference or Entity
378: */
379: case S_ero:
380: if (c=='#') {
381: context->state = S_cro; /* &# is Char Ref Open */
382: break;
383: }
384: context->state = S_entity; /* Fall through! */
385:
386: /* Handle Entities
387: */
388: case S_entity:
389: if (isalnum(c))
390: HTChunkPutc(string, c);
391: else {
392: HTChunkTerminate(string);
393: handle_entity(context, c);
394: context->state = S_text;
395: }
396: break;
397:
398: /* Character reference
399: */
400: case S_cro:
401: if (isalnum(c))
402: HTChunkPutc(string, c); /* accumulate a character NUMBER */
403: else {
404: int value;
405: HTChunkTerminate(string);
406: if (sscanf(string->data, "%d", &value)==1)
1.2 timbl 407: PUTC(FROMASCII((char)value));
1.1 timbl 408: context->state = S_text;
409: }
410: break;
411:
412: /* Tag
413: */
414: case S_tag: /* new tag */
415: if (isalnum(c))
416: HTChunkPutc(string, c);
417: else { /* End of tag name */
1.7 timbl 418: HTTag * t;
1.1 timbl 419: if (c=='/') {
420: if (TRACE) if (string->size!=0)
421: fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
422: context->state = S_end;
423: break;
424: }
425: HTChunkTerminate(string) ;
1.2 timbl 426:
427: t = find_tag(dtd, string->data);
1.7 timbl 428: if (!t) {
1.2 timbl 429: if(TRACE) fprintf(stderr, "SGML: *** Unknown element %s\n",
1.1 timbl 430: string->data);
431: context->state = (c=='>') ? S_text : S_junk_tag;
432: break;
433: }
1.7 timbl 434: context->current_tag = t;
1.2 timbl 435:
436: /* Clear out attributes
437: */
1.1 timbl 438:
1.2 timbl 439: {
440: int i;
441: for (i=0; i< context->current_tag->number_of_attributes; i++)
442: context->present[i] = NO;
1.1 timbl 443: }
444: string->size = 0;
1.2 timbl 445: context->current_attribute_number = INVALID;
1.1 timbl 446:
447: if (c=='>') {
448: if (context->current_tag->name) start_element(context);
449: context->state = S_text;
450: } else {
451: context->state = S_tag_gap;
452: }
453: }
454: break;
455:
456:
457: case S_tag_gap: /* Expecting attribute or > */
458: if (WHITE(c)) break; /* Gap between attributes */
459: if (c=='>') { /* End of tag */
460: if (context->current_tag->name) start_element(context);
461: context->state = S_text;
462: break;
463: }
464: HTChunkPutc(string, c);
465: context->state = S_attr; /* Get attribute */
466: break;
467:
468: /* accumulating value */
469: case S_attr:
470: if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
471: HTChunkTerminate(string) ;
472: handle_attribute_name(context, string->data);
473: string->size = 0;
474: if (c=='>') { /* End of tag */
475: if (context->current_tag->name) start_element(context);
476: context->state = S_text;
477: break;
478: }
479: context->state = (c=='=' ? S_equals: S_attr_gap);
480: } else {
481: HTChunkPutc(string, c);
482: }
483: break;
484:
485: case S_attr_gap: /* Expecting attribute or = or > */
486: if (WHITE(c)) break; /* Gap after attribute */
487: if (c=='>') { /* End of tag */
488: if (context->current_tag->name) start_element(context);
489: context->state = S_text;
490: break;
491: } else if (c=='=') {
492: context->state = S_equals;
493: break;
494: }
495: HTChunkPutc(string, c);
496: context->state = S_attr; /* Get next attribute */
497: break;
498:
499: case S_equals: /* After attr = */
500: if (WHITE(c)) break; /* Before attribute value */
501: if (c=='>') { /* End of tag */
1.5 timbl 502: if (TRACE) fprintf(stderr, "SGML: found = but no value\n");
1.1 timbl 503: if (context->current_tag->name) start_element(context);
504: context->state = S_text;
505: break;
506:
507: } else if (c=='\'') {
508: context->state = S_squoted;
509: break;
510:
511: } else if (c=='"') {
512: context->state = S_dquoted;
513: break;
514: }
515: HTChunkPutc(string, c);
516: context->state = S_value;
517: break;
518:
519: case S_value:
520: if (WHITE(c) || (c=='>')) { /* End of word */
521: HTChunkTerminate(string) ;
522: handle_attribute_value(context, string->data);
523: string->size = 0;
524: if (c=='>') { /* End of tag */
525: if (context->current_tag->name) start_element(context);
526: context->state = S_text;
527: break;
528: }
529: else context->state = S_tag_gap;
530: } else {
531: HTChunkPutc(string, c);
532: }
533: break;
534:
535: case S_squoted: /* Quoted attribute value */
536: if (c=='\'') { /* End of attribute value */
537: HTChunkTerminate(string) ;
538: handle_attribute_value(context, string->data);
539: string->size = 0;
540: context->state = S_tag_gap;
541: } else {
542: HTChunkPutc(string, c);
543: }
544: break;
545:
546: case S_dquoted: /* Quoted attribute value */
547: if (c=='"') { /* End of attribute value */
548: HTChunkTerminate(string) ;
549: handle_attribute_value(context, string->data);
550: string->size = 0;
551: context->state = S_tag_gap;
552: } else {
553: HTChunkPutc(string, c);
554: }
555: break;
556:
557: case S_end: /* </ */
558: if (isalnum(c))
559: HTChunkPutc(string, c);
560: else { /* End of end tag name */
1.7 timbl 561: HTTag * t;
1.1 timbl 562: HTChunkTerminate(string) ;
1.7 timbl 563: if (!*string->data) { /* Empty end tag */
564: t = context->element_stack->tag;
565: } else {
566: t = find_tag(dtd, string->data);
1.1 timbl 567: }
1.7 timbl 568: if (!t) {
1.1 timbl 569: if(TRACE) fprintf(stderr,
570: "Unknown end tag </%s>\n", string->data);
1.2 timbl 571: } else {
1.7 timbl 572: context->current_tag = t;
1.2 timbl 573: end_element( context, context->current_tag);
1.1 timbl 574: }
1.2 timbl 575:
1.1 timbl 576: string->size = 0;
1.2 timbl 577: context->current_attribute_number = INVALID;
1.7 timbl 578: if (c!='>') {
579: if (TRACE && !WHITE(c))
580: fprintf(stderr,"SGML: `</%s%c' found!\n",
581: string->data, c);
582: context->state = S_junk_tag;
583: } else {
584: context->state = S_text;
585: }
1.1 timbl 586: }
587: break;
588:
589:
590: case S_junk_tag:
591: if (c=='>') {
592: context->state = S_text;
593: }
594:
595: } /* switch on context->state */
596:
597: } /* SGML_character */
1.2 timbl 598:
599:
600: PUBLIC void SGML_string ARGS2(HTStream *, context, CONST char*, str)
601: {
602: CONST char *p;
603: for(p=str; *p; p++)
604: SGML_character(context, *p);
605: }
606:
607:
608: PUBLIC void SGML_write ARGS3(HTStream *, context, CONST char*, str, int, l)
609: {
610: CONST char *p;
611: CONST char *e = str+l;
612: for(p=str; p<e; p++)
613: SGML_character(context, *p);
614: }
615:
616: /*_______________________________________________________________________
617: */
618:
619: /* Structured Object Class
620: ** -----------------------
621: */
622: PUBLIC CONST HTStreamClass SGMLParser =
623: {
624: "SGMLParser",
625: SGML_free,
1.8 ! timbl 626: SGML_abort,
1.2 timbl 627: SGML_character, SGML_string, SGML_write,
628: };
629:
630: /* Create SGML Engine
631: ** ------------------
632: **
633: ** On entry,
634: ** dtd represents the DTD, along with
635: ** actions is the sink for the data as a set of routines.
636: **
637: */
638:
639: PUBLIC HTStream* SGML_new ARGS2(
640: CONST SGML_dtd *, dtd,
641: HTStructured *, target)
642: {
643: int i;
644: HTStream* context = (HTStream *) malloc(sizeof(*context));
645: if (!context) outofmem(__FILE__, "SGML_begin");
646:
647: context->isa = &SGMLParser;
648: context->string = HTChunkCreate(128); /* Grow by this much */
649: context->dtd = dtd;
650: context->target = target;
651: context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
652: /* Ugh: no OO */
653: context->state = S_text;
654: context->element_stack = 0; /* empty */
655: #ifdef CALLERDATA
656: context->callerData = (void*) callerData;
657: #endif
658: for(i=0; i<MAX_ATTRIBUTES; i++) context->value[i] = 0;
659:
660: return context;
661: }
662:
Webmaster