Annotation of libwww/Library/src/SGML.c, revision 1.4
1.1 timbl 1: /* General SGML Parser code SGML.c
2: ** ========================
3: **
1.2 timbl 4: ** This module implements an HTStream object. To parse an
1.1 timbl 5: ** SGML file, create this object which is a parser. The object
1.2 timbl 6: ** is (currently) created by being passed a DTD structure,
7: ** and a target HTStructured object at which to throw the parsed stuff.
1.1 timbl 8: **
1.2 timbl 9: ** 6 Feb 93 Binary searches used. Interface modified.
1.1 timbl 10: */
11: #include "SGML.h"
12:
13: #include <ctype.h>
14: #include <stdio.h>
15: #include "HTUtils.h"
16: #include "HTChunk.h"
17: #include "tcp.h" /* For FROMASCII */
18:
1.2 timbl 19: #define INVALID (-1)
20:
1.1 timbl 21: /* The State (context) of the parser
22: **
1.2 timbl 23: ** This is passed with each call to make the parser reentrant
1.1 timbl 24: **
25: */
26:
1.2 timbl 27: #define MAX_ATTRIBUTES 20 /* Max number of attributes per element */
28:
29:
30: /* Element Stack
31: ** -------------
32: ** This allows us to return down the stack reselcting styles.
33: ** As we return, attribute values will be garbage in general.
34: */
35: typedef struct _HTElement HTElement;
36: struct _HTElement {
37: HTElement * next; /* Previously nested element or 0 */
38: HTTag* tag; /* The tag at this level */
39: };
40:
41:
42: /* Internal Context Data Structure
43: ** -------------------------------
44: */
45: struct _HTStream {
46:
47: CONST HTStreamClass * isa; /* inherited from HTStream */
48:
49: CONST SGML_dtd *dtd;
50: HTStructuredClass *actions; /* target class */
51: HTStructured *target; /* target object */
52:
1.1 timbl 53: HTTag *current_tag;
1.2 timbl 54: int current_attribute_number;
1.1 timbl 55: HTChunk *string;
56: HTElement *element_stack;
57: enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
58: S_attr, S_attr_gap, S_equals, S_value,
59: S_ero, S_cro,
60: S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
1.2 timbl 61: #ifdef CALLERDATA
1.1 timbl 62: void * callerData;
1.2 timbl 63: #endif
64: BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
65: char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
66: } ;
67:
68:
/*	Pass one character on to the target through its put_character method.
**	Note: expands to a reference to a variable named `context', which
**	must therefore be in scope wherever the macro is used.
*/
#define PUTC(ch) \
	((*context->actions->put_character)(context->target, ch))
70:
1.1 timbl 71:
72:
73: /* Handle Attribute
74: ** ----------------
75: */
76: /* PUBLIC CONST char * SGML_default = ""; ?? */
77:
78: #ifdef __STDC__
1.2 timbl 79: PRIVATE void handle_attribute_name(HTStream * context, const char * s)
1.1 timbl 80: #else
81: PRIVATE void handle_attribute_name(context, s)
1.2 timbl 82: HTStream * context;
1.1 timbl 83: char *s;
84: #endif
85: {
1.2 timbl 86:
87: HTTag * tag = context->current_tag;
88: attr* attributes = tag->attributes;
89:
90: int high, low, i, diff; /* Binary search for attribute name */
91: for(low=0, high=tag->number_of_attributes;
92: high > low ;
93: diff < 0 ? (low = i+1) : (high = i) ) {
94: i = (low + (high-low)/2);
95: diff = strcasecomp(attributes[i].name, s);
96: if (diff==0) { /* success: found it */
97: context->current_attribute_number = i;
98: context->present[i] = YES;
99: if (context->value[i]) {
100: free(context->value[i]);
101: context->value[i] = NULL;
102: }
103: return;
104: } /* if */
105:
106: } /* for */
107:
108: if (TRACE)
109: fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
110: s, context->current_tag->name);
111: context->current_attribute_number = INVALID; /* Invalid */
1.1 timbl 112: }
113:
114:
115: /* Handle attribute value
116: ** ----------------------
117: */
118: #ifdef __STDC__
1.2 timbl 119: PRIVATE void handle_attribute_value(HTStream * context, const char * s)
1.1 timbl 120: #else
121: PRIVATE void handle_attribute_value(context, s)
1.2 timbl 122: HTStream * context;
1.1 timbl 123: char *s;
124: #endif
125: {
1.2 timbl 126: if (context->current_attribute_number != INVALID) {
127: StrAllocCopy(context->value[context->current_attribute_number], s);
1.1 timbl 128: } else {
129: if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
130: }
1.2 timbl 131: context->current_attribute_number = INVALID; /* can't have two assignments! */
1.1 timbl 132: }
133:
1.2 timbl 134:
1.1 timbl 135: /* Handle entity
136: ** -------------
137: **
138: ** On entry,
139: ** s contains the entity name zero terminated
140: ** Bugs:
141: ** If the entity name is unknown, the terminator is treated as
142: ** a printable non-special character in all cases, even if it is '<'
143: */
144: #ifdef __STDC__
1.2 timbl 145: PRIVATE void handle_entity(HTStream * context, char term)
1.1 timbl 146: #else
147: PRIVATE void handle_entity(context, term)
1.2 timbl 148: HTStream * context;
1.1 timbl 149: char term;
150: #endif
151: {
1.2 timbl 152:
1.3 timbl 153: CONST char ** entities = context->dtd->entity_names;
1.1 timbl 154: CONST char *s = context->string->data;
1.2 timbl 155:
156: int high, low, i, diff;
157: for(low=0, high = context->dtd->number_of_entities;
158: high > low ;
159: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
160: i = (low + (high-low)/2);
161: diff = strcmp(entities[i], s); /* Csse sensitive! */
162: if (diff==0) { /* success: found it */
163: (*context->actions->put_entity)(context->target, i);
164: return;
1.1 timbl 165: }
166: }
167: /* If entity string not found, display as text */
168: if (TRACE)
169: fprintf(stderr, "SGML: Unknown entity %s\n", s);
1.2 timbl 170: PUTC('&');
1.1 timbl 171: {
172: CONST char *p;
173: for (p=s; *p; p++) {
1.2 timbl 174: PUTC(*p);
1.1 timbl 175: }
176: }
1.2 timbl 177: PUTC(term);
1.1 timbl 178: }
179:
1.2 timbl 180:
1.1 timbl 181: /* End element
1.2 timbl 182: ** -----------
1.1 timbl 183: */
184: #ifdef __STDC__
1.2 timbl 185: PRIVATE void end_element(HTStream * context, HTTag * old_tag)
1.1 timbl 186: #else
187: PRIVATE void end_element(context, old_tag)
188: HTTag * old_tag;
1.2 timbl 189: HTStream * context;
1.1 timbl 190: #endif
191: {
192: if (TRACE) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
1.2 timbl 193: if (old_tag->contents == SGML_EMPTY) {
1.1 timbl 194: if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
195: old_tag->name);
196: return;
197: }
198: while (context->element_stack) {/* Loop is error path only */
199: HTElement * N = context->element_stack;
200: HTTag * t = N->tag;
201:
202: if (old_tag != t) { /* Mismatch: syntax error */
203: if (context->element_stack->next) { /* This is not the last level */
204: if (TRACE) fprintf(stderr,
205: "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
206: old_tag->name, t->name, t->name);
207: } else { /* last level */
208: if (TRACE) fprintf(stderr,
209: "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
210: old_tag->name, t->name, old_tag->name);
211: return; /* Ignore */
212: }
213: }
214:
215: context->element_stack = N->next; /* Remove from stack */
216: free(N);
1.2 timbl 217: (*context->actions->end_element)(context->target,
218: t - context->dtd->tags);
1.1 timbl 219: if (old_tag == t) return; /* Correct sequence */
220:
221: /* Syntax error path only */
222:
223: }
224: fprintf(stderr,
225: "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
226: }
227:
228:
229: /* Start a element
230: */
231: #ifdef __STDC__
1.2 timbl 232: PRIVATE void start_element(HTStream * context)
1.1 timbl 233: #else
234: PRIVATE void start_element(context)
1.2 timbl 235: HTStream * context;
1.1 timbl 236: #endif
237: {
238: HTTag * new_tag = context->current_tag;
239:
240: if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
1.2 timbl 241: (*context->actions->start_element)(
242: context->target,
243: new_tag - context->dtd->tags,
244: context->present,
1.3 timbl 245: (CONST char**) context->value); /* coerce type for think c */
1.2 timbl 246: if (new_tag->contents != SGML_EMPTY) { /* i.e. tag not empty */
1.1 timbl 247: HTElement * N = (HTElement *)malloc(sizeof(HTElement));
248: if (N == NULL) outofmem(__FILE__, "start_element");
249: N->next = context->element_stack;
250: N->tag = new_tag;
251: context->element_stack = N;
252: }
253: }
254:
255:
1.2 timbl 256: /* Find Tag in DTD tag list
257: ** ------------------------
1.1 timbl 258: **
259: ** On entry,
1.2 timbl 260: ** dtd points to dtd structire including valid tag list
261: ** string points to name of tag in question
1.1 timbl 262: **
1.2 timbl 263: ** On exit,
264: ** returns:
265: ** INVALID tag not found
266: ** >=0 tag number in dtd
267: */
268: PRIVATE int find_tag ARGS2(CONST SGML_dtd*, dtd, char *, string)
269: {
270: int high, low, i, diff;
271: for(low=0, high=dtd->number_of_tags;
272: high > low ;
273: diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
274: i = (low + (high-low)/2);
1.3 timbl 275: diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
1.2 timbl 276: if (diff==0) { /* success: found it */
277: return i;
278: }
279: }
280: return INVALID;
281: }
282:
283: /*________________________________________________________________________
284: ** Public Methods
1.1 timbl 285: */
286:
1.2 timbl 287:
288: PUBLIC void SGML_end ARGS1(HTStream *, context)
1.1 timbl 289: {
1.2 timbl 290: /* Could check that we are back to bottom of stack! @@ */
1.1 timbl 291:
1.2 timbl 292: (*context->actions->end_document)(context->target);
1.1 timbl 293: }
294:
295:
1.2 timbl 296: PUBLIC void SGML_free ARGS1(HTStream *, context)
1.1 timbl 297: {
1.2 timbl 298: (*context->actions->free)(context->target);
1.1 timbl 299: HTChunkFree(context->string);
300: free(context);
301: }
302:
1.2 timbl 303:
/*	Read and write user callback handle
**	-----------------------------------
**
**	Only compiled when CALLERDATA is defined.  These two calls let
**	the caller attach an opaque pointer of his own to a particular
**	SGML context and read it back later; the parser itself never
**	looks at it.
*/

#ifdef CALLERDATA
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
{
    void * handle = context->callerData;
    return handle;
}

PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
{
    context->callerData = data;
}
#endif
1.1 timbl 323:
1.2 timbl 324: PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
1.1 timbl 325:
326: {
1.2 timbl 327: CONST SGML_dtd *dtd = context->dtd;
1.1 timbl 328: HTChunk *string = context->string;
329:
330: switch(context->state) {
331: case S_text:
1.4 ! timbl 332: if (c=='&' && (context->element_stack &&
1.1 timbl 333: context->element_stack->tag &&
1.2 timbl 334: ( context->element_stack->tag->contents == SGML_MIXED
335: || context->element_stack->tag->contents == SGML_RCDATA)
336: )) {
1.1 timbl 337: string->size = 0;
338: context->state = S_ero;
339:
340: } else if (c=='<') {
341: string->size = 0;
342: context->state = (context->element_stack &&
343: context->element_stack->tag &&
1.2 timbl 344: context->element_stack->tag->contents == SGML_LITTERAL) ?
1.1 timbl 345: S_litteral : S_tag;
1.2 timbl 346: } else PUTC(c);
1.1 timbl 347: break;
348:
349: /* In litteral mode, waits only for specific end tag!
1.2 timbl 350: ** Only foir compatibility with old servers.
1.1 timbl 351: */
352: case S_litteral :
353: HTChunkPutc(string, c);
354: if ( TOUPPER(c) != ((string->size ==1) ? '/'
355: : context->element_stack->tag->name[string->size-2])) {
356: int i;
357:
358: /* If complete match, end litteral */
359: if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
360: end_element(context, context->element_stack->tag);
361: string->size = 0;
1.2 timbl 362: context->current_attribute_number = INVALID;
1.1 timbl 363: context->state = S_text;
364: break;
365: } /* If Mismatch: recover string. */
1.2 timbl 366: PUTC( '<');
1.1 timbl 367: for (i=0; i<string->size; i++) /* recover */
1.2 timbl 368: PUTC(
1.1 timbl 369: string->data[i]);
370: context->state = S_text;
371: }
372:
373: break;
374:
375: /* Character reference or Entity
376: */
377: case S_ero:
378: if (c=='#') {
379: context->state = S_cro; /* &# is Char Ref Open */
380: break;
381: }
382: context->state = S_entity; /* Fall through! */
383:
384: /* Handle Entities
385: */
386: case S_entity:
387: if (isalnum(c))
388: HTChunkPutc(string, c);
389: else {
390: HTChunkTerminate(string);
391: handle_entity(context, c);
392: context->state = S_text;
393: }
394: break;
395:
396: /* Character reference
397: */
398: case S_cro:
399: if (isalnum(c))
400: HTChunkPutc(string, c); /* accumulate a character NUMBER */
401: else {
402: int value;
403: HTChunkTerminate(string);
404: if (sscanf(string->data, "%d", &value)==1)
1.2 timbl 405: PUTC(FROMASCII((char)value));
1.1 timbl 406: context->state = S_text;
407: }
408: break;
409:
410: /* Tag
411: */
412: case S_tag: /* new tag */
413: if (isalnum(c))
414: HTChunkPutc(string, c);
415: else { /* End of tag name */
1.2 timbl 416: int t;
1.1 timbl 417: if (c=='/') {
418: if (TRACE) if (string->size!=0)
419: fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
420: context->state = S_end;
421: break;
422: }
423: HTChunkTerminate(string) ;
1.2 timbl 424:
425: t = find_tag(dtd, string->data);
426: if (t == INVALID) {
427: if(TRACE) fprintf(stderr, "SGML: *** Unknown element %s\n",
1.1 timbl 428: string->data);
429: context->state = (c=='>') ? S_text : S_junk_tag;
430: break;
431: }
1.2 timbl 432: context->current_tag = &dtd->tags[t];
433:
434: /* Clear out attributes
435: */
1.1 timbl 436:
1.2 timbl 437: {
438: int i;
439: for (i=0; i< context->current_tag->number_of_attributes; i++)
440: context->present[i] = NO;
1.1 timbl 441: }
442: string->size = 0;
1.2 timbl 443: context->current_attribute_number = INVALID;
1.1 timbl 444:
445: if (c=='>') {
446: if (context->current_tag->name) start_element(context);
447: context->state = S_text;
448: } else {
449: context->state = S_tag_gap;
450: }
451: }
452: break;
453:
454:
455: case S_tag_gap: /* Expecting attribute or > */
456: if (WHITE(c)) break; /* Gap between attributes */
457: if (c=='>') { /* End of tag */
458: if (context->current_tag->name) start_element(context);
459: context->state = S_text;
460: break;
461: }
462: HTChunkPutc(string, c);
463: context->state = S_attr; /* Get attribute */
464: break;
465:
466: /* accumulating value */
467: case S_attr:
468: if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
469: HTChunkTerminate(string) ;
470: handle_attribute_name(context, string->data);
471: string->size = 0;
472: if (c=='>') { /* End of tag */
473: if (context->current_tag->name) start_element(context);
474: context->state = S_text;
475: break;
476: }
477: context->state = (c=='=' ? S_equals: S_attr_gap);
478: } else {
479: HTChunkPutc(string, c);
480: }
481: break;
482:
483: case S_attr_gap: /* Expecting attribute or = or > */
484: if (WHITE(c)) break; /* Gap after attribute */
485: if (c=='>') { /* End of tag */
486: if (context->current_tag->name) start_element(context);
487: context->state = S_text;
488: break;
489: } else if (c=='=') {
490: context->state = S_equals;
491: break;
492: }
493: HTChunkPutc(string, c);
494: context->state = S_attr; /* Get next attribute */
495: break;
496:
497: case S_equals: /* After attr = */
498: if (WHITE(c)) break; /* Before attribute value */
499: if (c=='>') { /* End of tag */
500: fprintf(stderr, "SGML: found = but no value\n");
501: if (context->current_tag->name) start_element(context);
502: context->state = S_text;
503: break;
504:
505: } else if (c=='\'') {
506: context->state = S_squoted;
507: break;
508:
509: } else if (c=='"') {
510: context->state = S_dquoted;
511: break;
512: }
513: HTChunkPutc(string, c);
514: context->state = S_value;
515: break;
516:
517: case S_value:
518: if (WHITE(c) || (c=='>')) { /* End of word */
519: HTChunkTerminate(string) ;
520: handle_attribute_value(context, string->data);
521: string->size = 0;
522: if (c=='>') { /* End of tag */
523: if (context->current_tag->name) start_element(context);
524: context->state = S_text;
525: break;
526: }
527: else context->state = S_tag_gap;
528: } else {
529: HTChunkPutc(string, c);
530: }
531: break;
532:
533: case S_squoted: /* Quoted attribute value */
534: if (c=='\'') { /* End of attribute value */
535: HTChunkTerminate(string) ;
536: handle_attribute_value(context, string->data);
537: string->size = 0;
538: context->state = S_tag_gap;
539: } else {
540: HTChunkPutc(string, c);
541: }
542: break;
543:
544: case S_dquoted: /* Quoted attribute value */
545: if (c=='"') { /* End of attribute value */
546: HTChunkTerminate(string) ;
547: handle_attribute_value(context, string->data);
548: string->size = 0;
549: context->state = S_tag_gap;
550: } else {
551: HTChunkPutc(string, c);
552: }
553: break;
554:
555: case S_end: /* </ */
556: if (isalnum(c))
557: HTChunkPutc(string, c);
558: else { /* End of end tag name */
1.2 timbl 559: int t;
1.1 timbl 560: HTChunkTerminate(string) ;
561: if (c!='>') {
562: if (TRACE) fprintf(stderr,"SGML: `</%s%c' found!\n",
563: string->data, c);
564: context->state = S_junk_tag;
565: break;
566: }
1.2 timbl 567: t = find_tag(dtd, string->data);
568: if (t == INVALID) {
1.1 timbl 569: if(TRACE) fprintf(stderr,
570: "Unknown end tag </%s>\n", string->data);
1.2 timbl 571: } else {
572: context->current_tag = &dtd->tags[t];
573: end_element( context, context->current_tag);
1.1 timbl 574: }
1.2 timbl 575:
1.1 timbl 576: string->size = 0;
1.2 timbl 577: context->current_attribute_number = INVALID;
1.1 timbl 578: context->state = S_text;
579: }
580: break;
581:
582:
583: case S_junk_tag:
584: if (c=='>') {
585: context->state = S_text;
586: }
587:
588: } /* switch on context->state */
589:
590: } /* SGML_character */
1.2 timbl 591:
592:
593: PUBLIC void SGML_string ARGS2(HTStream *, context, CONST char*, str)
594: {
595: CONST char *p;
596: for(p=str; *p; p++)
597: SGML_character(context, *p);
598: }
599:
600:
601: PUBLIC void SGML_write ARGS3(HTStream *, context, CONST char*, str, int, l)
602: {
603: CONST char *p;
604: CONST char *e = str+l;
605: for(p=str; p<e; p++)
606: SGML_character(context, *p);
607: }
608:
609: /*_______________________________________________________________________
610: */
611:
612: /* Structured Object Class
613: ** -----------------------
614: */
615: PUBLIC CONST HTStreamClass SGMLParser =
616: {
617: "SGMLParser",
618: SGML_free,
619: SGML_end,
620: SGML_character, SGML_string, SGML_write,
621: };
622:
623: /* Create SGML Engine
624: ** ------------------
625: **
626: ** On entry,
627: ** dtd represents the DTD, along with
628: ** actions is the sink for the data as a set of routines.
629: **
630: */
631:
632: PUBLIC HTStream* SGML_new ARGS2(
633: CONST SGML_dtd *, dtd,
634: HTStructured *, target)
635: {
636: int i;
637: HTStream* context = (HTStream *) malloc(sizeof(*context));
638: if (!context) outofmem(__FILE__, "SGML_begin");
639:
640: context->isa = &SGMLParser;
641: context->string = HTChunkCreate(128); /* Grow by this much */
642: context->dtd = dtd;
643: context->target = target;
644: context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
645: /* Ugh: no OO */
646: context->state = S_text;
647: context->element_stack = 0; /* empty */
648: #ifdef CALLERDATA
649: context->callerData = (void*) callerData;
650: #endif
651: for(i=0; i<MAX_ATTRIBUTES; i++) context->value[i] = 0;
652:
653: return context;
654: }
655:
Webmaster