Annotation of libwww/Library/src/SGML.c, revision 1.1.1.1
1.1 timbl 1: /* General SGML Parser code SGML.c
2: ** ========================
3: **
4: ** This module implements an HTSGMLContext object. To parse an
5: ** SGML file, create this object which is a parser. The object
6: ** is (currently) created by being passed a DTD structure.
7: **
8: **
9: */
10: #include "SGML.h"
11:
12: #include <ctype.h>
13: #include <stdio.h>
14: #include "HTUtils.h"
15: #include "HTChunk.h"
16: #include "tcp.h" /* For FROMASCII */
17:
18: /* The State (context) of the parser
19: **
20: ** This is passed with each call to make the parser recursive
21: **
22: */
23:
24: struct _HTSGMLContext {
25: SGML_dtd *dtd;
26: void (*contents_treatment) PARAMS((void * data, char c));
27: HTTag *current_tag;
28: attr *current_attribute;
29: HTChunk *string;
30: HTElement *element_stack;
31: enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
32: S_attr, S_attr_gap, S_equals, S_value,
33: S_ero, S_cro,
34: S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
35: void * callerData;
36: };
37:
38:
39: /* Handle Attribute
40: ** ----------------
41: */
42: /* PUBLIC CONST char * SGML_default = ""; ?? */
43:
44: #ifdef __STDC__
45: PRIVATE void handle_attribute_name(HTSGMLContext context, const char * s)
46: #else
47: PRIVATE void handle_attribute_name(context, s)
48: HTSGMLContext context;
49: char *s;
50: #endif
51: {
52: attr* a;
53: for( a = context->current_tag->attributes;
54: a->name;
55: a++) {
56: if (0==strcasecomp(a->name, s))
57: break;
58: }
59: if (!a->name) {
60: if (TRACE)
61: fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
62: s, context->current_tag->name);
63: context->current_attribute = 0; /* Invalid */
64: return;
65: }
66: a->present = YES;
67: if (a->value) {
68: free(a->value);
69: a->value = 0;
70: }
71: context->current_attribute = a;
72: }
73:
74:
75: /* Handle attribute value
76: ** ----------------------
77: */
78: #ifdef __STDC__
79: PRIVATE void handle_attribute_value(HTSGMLContext context, const char * s)
80: #else
81: PRIVATE void handle_attribute_value(context, s)
82: HTSGMLContext context;
83: char *s;
84: #endif
85: {
86: if (context->current_attribute) {
87: StrAllocCopy(context->current_attribute->value, s);
88: } else {
89: if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
90: }
91: context->current_attribute = 0; /* can't have two assignments! */
92: }
93:
94: /* Handle entity
95: ** -------------
96: **
97: ** On entry,
98: ** s contains the entity name zero terminated
99: ** Bugs:
100: ** If the entity name is unknown, the terminator is treated as
101: ** a printable non-special character in all cases, even if it is '<'
102: */
103: #ifdef __STDC__
104: PRIVATE void handle_entity(HTSGMLContext context, char term)
105: #else
106: PRIVATE void handle_entity(context, term)
107: HTSGMLContext context;
108: char term;
109: #endif
110: {
111: entity * e;
112: entity * entities = context->dtd->entities;
113: CONST char *s = context->string->data;
114:
115: for(e = entities; e->name; e++) {
116: if (0==strcmp(e->name, s)) {
117: char * p;
118: for (p=e->representation; *p; p++) {
119: (*context->contents_treatment)(context->callerData, *p);
120: }
121: return; /* Good */
122: }
123: }
124: /* If entity string not found, display as text */
125: if (TRACE)
126: fprintf(stderr, "SGML: Unknown entity %s\n", s);
127: (*context->contents_treatment)(context->callerData, '&');
128: {
129: CONST char *p;
130: for (p=s; *p; p++) {
131: (*context->contents_treatment)(context->callerData, *p);
132: }
133: }
134: (*context->contents_treatment)(context->callerData, term);
135: }
136:
137: /* End element
138: */
139: #ifdef __STDC__
140: PRIVATE void end_element(HTSGMLContext context, HTTag * old_tag)
141: #else
142: PRIVATE void end_element(context, old_tag)
143: HTTag * old_tag;
144: HTSGMLContext context;
145: #endif
146: {
147: if (TRACE) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
148: if (!old_tag->end) {
149: if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
150: old_tag->name);
151: return;
152: }
153: while (context->element_stack) {/* Loop is error path only */
154: HTElement * N = context->element_stack;
155: HTTag * t = N->tag;
156:
157: if (old_tag != t) { /* Mismatch: syntax error */
158: if (context->element_stack->next) { /* This is not the last level */
159: if (TRACE) fprintf(stderr,
160: "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
161: old_tag->name, t->name, t->name);
162: } else { /* last level */
163: if (TRACE) fprintf(stderr,
164: "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
165: old_tag->name, t->name, old_tag->name);
166: return; /* Ignore */
167: }
168: }
169:
170: context->element_stack = N->next; /* Remove from stack */
171: free(N);
172: (t->end)(context->callerData,
173: t,
174: context->element_stack); /* Assume tag end */
175: if (context->element_stack) /* not end of document */
176: context->contents_treatment = context->element_stack->tag->treat;
177: if (old_tag == t) return; /* Correct sequence */
178:
179: /* Syntax error path only */
180:
181: }
182: fprintf(stderr,
183: "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
184: }
185:
186:
187: /* Start a element
188: */
189: #ifdef __STDC__
190: PRIVATE void start_element(HTSGMLContext context)
191: #else
192: PRIVATE void start_element(context)
193: HTSGMLContext context;
194: #endif
195: {
196: HTTag * new_tag = context->current_tag;
197:
198: if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
199: (*new_tag->begin)(context->callerData, new_tag, context->element_stack);
200: if (new_tag->end) { /* i.e. tag not empty */
201: HTElement * N = (HTElement *)malloc(sizeof(HTElement));
202: if (N == NULL) outofmem(__FILE__, "start_element");
203: N->next = context->element_stack;
204: N->tag = new_tag;
205: context->element_stack = N;
206: context->contents_treatment = new_tag->treat;
207: }
208: }
209:
210: /*________________________________________________________________________
211: ** Public Methods
212: */
213:
214: /* Create SGML Engine
215: ** ------------------
216: **
217: ** On entry,
218: ** dtd->tags represents the DTD, along with
219: ** dtd->entities
220: **
221: ** default_tag represents the initial and final actions,
222: ** and the character processing, for data outside
223: ** any tags. May not be empty.
224: */
225:
226: PUBLIC HTSGMLContext SGML_begin ARGS1(SGML_dtd *,dtd)
227: {
228: HTSGMLContext context = (HTSGMLContext) malloc(sizeof(*context));
229: if (!context) outofmem(__FILE__, "SGML_begin");
230:
231: context->string = HTChunkCreate(128); /* Grow by this much */
232: context->dtd = dtd;
233: context->state = S_text;
234: context->element_stack = 0; /* empty */
235: context->callerData = (void*) 0; /* unspcified as yet */
236: context->current_tag = dtd->default_tag;
237: start_element(context); /* Start document */
238: return context;
239: }
240:
241:
242: PUBLIC void SGML_end ARGS1(HTSGMLContext, context)
243: {
244: end_element(context, context->dtd->default_tag); /* End document */
245: HTChunkFree(context->string);
246: free(context);
247: }
248:
249: /* Read and write user callback handle
250: ** -----------------------------------
251: **
252: ** The callbacks from the SGML parser have an SGML context parameter.
253: ** These calls allow the caller to associate his own context with a
254: ** particular SGML context.
255: */
256:
257: PUBLIC void* SGML_callerData ARGS1(HTSGMLContext, context)
258: {
259: return context->callerData;
260: }
261:
262: PUBLIC void SGML_setCallerData ARGS2(HTSGMLContext, context, void*, data)
263: {
264: context->callerData = data;
265: }
266:
267:
268: PUBLIC void SGML_string ARGS2(HTSGMLContext, context, char*, str)
269: {
270: char *p;
271: for(p=str; *p; p++)
272: SGML_character(context, *p);
273: }
274:
275: PUBLIC void SGML_character ARGS2(HTSGMLContext, context, char,c)
276:
277: {
278: SGML_dtd *dtd = context->dtd;
279: HTChunk *string = context->string;
280:
281: switch(context->state) {
282: case S_text:
283: if (c=='&' && !(context->element_stack &&
284: context->element_stack->tag &&
285: context->element_stack->tag->litteral)) {
286: string->size = 0;
287: context->state = S_ero;
288:
289: } else if (c=='<') {
290: string->size = 0;
291: context->state = (context->element_stack &&
292: context->element_stack->tag &&
293: context->element_stack->tag->litteral) ?
294: S_litteral : S_tag;
295: } else (*context->contents_treatment)(context->callerData, c);
296: break;
297:
298: /* In litteral mode, waits only for specific end tag!
299: */
300: case S_litteral :
301: HTChunkPutc(string, c);
302: if ( TOUPPER(c) != ((string->size ==1) ? '/'
303: : context->element_stack->tag->name[string->size-2])) {
304: int i;
305:
306: /* If complete match, end litteral */
307: if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
308: end_element(context, context->element_stack->tag);
309: string->size = 0;
310: context->current_attribute = (attr *) 0;
311: context->state = S_text;
312: break;
313: } /* If Mismatch: recover string. */
314: (*context->contents_treatment)(context->callerData, '<');
315: for (i=0; i<string->size; i++) /* recover */
316: (*context->contents_treatment)(context->callerData,
317: string->data[i]);
318: context->state = S_text;
319: }
320:
321: break;
322:
323: /* Character reference or Entity
324: */
325: case S_ero:
326: if (c=='#') {
327: context->state = S_cro; /* &# is Char Ref Open */
328: break;
329: }
330: context->state = S_entity; /* Fall through! */
331:
332: /* Handle Entities
333: */
334: case S_entity:
335: if (isalnum(c))
336: HTChunkPutc(string, c);
337: else {
338: HTChunkTerminate(string);
339: handle_entity(context, c);
340: context->state = S_text;
341: }
342: break;
343:
344: /* Character reference
345: */
346: case S_cro:
347: if (isalnum(c))
348: HTChunkPutc(string, c); /* accumulate a character NUMBER */
349: else {
350: int value;
351: HTChunkTerminate(string);
352: if (sscanf(string->data, "%d", &value)==1)
353: (*context->contents_treatment)(context->callerData,
354: FROMASCII((char)value));
355: context->state = S_text;
356: }
357: break;
358:
359: /* Tag
360: */
361: case S_tag: /* new tag */
362: if (isalnum(c))
363: HTChunkPutc(string, c);
364: else { /* End of tag name */
365: attr * a;
366: if (c=='/') {
367: if (TRACE) if (string->size!=0)
368: fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
369: context->state = S_end;
370: break;
371: }
372: HTChunkTerminate(string) ;
373: for(context->current_tag = dtd->tags;
374: context->current_tag->name; context->current_tag++) {
375: if (0==strcasecomp(context->current_tag->name, string->data)) {
376: break;
377: }
378: }
379: if (!context->current_tag->name) {
380: if(TRACE) fprintf(stderr, "Unknown tag %s\n",
381: string->data);
382: context->state = (c=='>') ? S_text : S_junk_tag;
383: break;
384: }
385:
386: for (a = context->current_tag->attributes; a->name; a++ ) {
387: a->present = NO;
388: }
389: string->size = 0;
390: context->current_attribute = (attr *) 0;
391:
392: if (c=='>') {
393: if (context->current_tag->name) start_element(context);
394: context->state = S_text;
395: } else {
396: context->state = S_tag_gap;
397: }
398: }
399: break;
400:
401:
402: case S_tag_gap: /* Expecting attribute or > */
403: if (WHITE(c)) break; /* Gap between attributes */
404: if (c=='>') { /* End of tag */
405: if (context->current_tag->name) start_element(context);
406: context->state = S_text;
407: break;
408: }
409: HTChunkPutc(string, c);
410: context->state = S_attr; /* Get attribute */
411: break;
412:
413: /* accumulating value */
414: case S_attr:
415: if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
416: HTChunkTerminate(string) ;
417: handle_attribute_name(context, string->data);
418: string->size = 0;
419: if (c=='>') { /* End of tag */
420: if (context->current_tag->name) start_element(context);
421: context->state = S_text;
422: break;
423: }
424: context->state = (c=='=' ? S_equals: S_attr_gap);
425: } else {
426: HTChunkPutc(string, c);
427: }
428: break;
429:
430: case S_attr_gap: /* Expecting attribute or = or > */
431: if (WHITE(c)) break; /* Gap after attribute */
432: if (c=='>') { /* End of tag */
433: if (context->current_tag->name) start_element(context);
434: context->state = S_text;
435: break;
436: } else if (c=='=') {
437: context->state = S_equals;
438: break;
439: }
440: HTChunkPutc(string, c);
441: context->state = S_attr; /* Get next attribute */
442: break;
443:
444: case S_equals: /* After attr = */
445: if (WHITE(c)) break; /* Before attribute value */
446: if (c=='>') { /* End of tag */
447: fprintf(stderr, "SGML: found = but no value\n");
448: if (context->current_tag->name) start_element(context);
449: context->state = S_text;
450: break;
451:
452: } else if (c=='\'') {
453: context->state = S_squoted;
454: break;
455:
456: } else if (c=='"') {
457: context->state = S_dquoted;
458: break;
459: }
460: HTChunkPutc(string, c);
461: context->state = S_value;
462: break;
463:
464: case S_value:
465: if (WHITE(c) || (c=='>')) { /* End of word */
466: HTChunkTerminate(string) ;
467: handle_attribute_value(context, string->data);
468: string->size = 0;
469: if (c=='>') { /* End of tag */
470: if (context->current_tag->name) start_element(context);
471: context->state = S_text;
472: break;
473: }
474: else context->state = S_tag_gap;
475: } else {
476: HTChunkPutc(string, c);
477: }
478: break;
479:
480: case S_squoted: /* Quoted attribute value */
481: if (c=='\'') { /* End of attribute value */
482: HTChunkTerminate(string) ;
483: handle_attribute_value(context, string->data);
484: string->size = 0;
485: context->state = S_tag_gap;
486: } else {
487: HTChunkPutc(string, c);
488: }
489: break;
490:
491: case S_dquoted: /* Quoted attribute value */
492: if (c=='"') { /* End of attribute value */
493: HTChunkTerminate(string) ;
494: handle_attribute_value(context, string->data);
495: string->size = 0;
496: context->state = S_tag_gap;
497: } else {
498: HTChunkPutc(string, c);
499: }
500: break;
501:
502: case S_end: /* </ */
503: if (isalnum(c))
504: HTChunkPutc(string, c);
505: else { /* End of end tag name */
506: HTChunkTerminate(string) ;
507: if (c!='>') {
508: if (TRACE) fprintf(stderr,"SGML: `</%s%c' found!\n",
509: string->data, c);
510: context->state = S_junk_tag;
511: break;
512: }
513: for(context->current_tag = dtd->tags;
514: context->current_tag->name; context->current_tag++) {
515: if (0==strcasecomp(context->current_tag->name, string->data)) {
516: end_element( context, context->current_tag);
517: break;
518: }
519: }
520: if (!context->current_tag->name) {
521: if(TRACE) fprintf(stderr,
522: "Unknown end tag </%s>\n", string->data);
523: }
524: string->size = 0;
525: context->current_attribute = (attr *) 0;
526: context->state = S_text;
527: }
528: break;
529:
530:
531: case S_junk_tag:
532: if (c=='>') {
533: context->state = S_text;
534: }
535:
536: } /* switch on context->state */
537:
538: } /* SGML_character */
Webmaster