Annotation of libwww/Library/src/SGML.c, revision 1.52
1.23 frystyk 1: /* SGML.c
2: ** GENERAL SGML PARSER CODE
3: **
1.27 frystyk 4: ** (c) COPYRIGHT MIT 1995.
1.23 frystyk 5: ** Please first read the full copyright statement in the file COPYRIGH.
1.52 ! frystyk 6: ** @(#) $Id: SGML.c,v 1.51 1999/02/22 22:10:12 frystyk Exp $
1.1 timbl 7: **
1.2 timbl 8: ** This module implements an HTStream object. To parse an
1.1 timbl 9: ** SGML file, create this object which is a parser. The object
1.2 timbl 10: ** is (currently) created by being passed a DTD structure,
11: ** and a target HTStructured object at which to throw the parsed stuff.
1.1 timbl 12: **
1.19 duns 13: ** 6 Feb 93 Binary searches used. Interface modified.
14: ** 8 Jul 94 FM Insulate free() from _free structure element.
1.42 frystyk 15: ** Nov 1996 msa Strip down the parser to minimal HTML tokenizer,
16: ** Stop allocating space for the attribute values,
17: ** use pointers to the string chunk instead.
1.1 timbl 18: */
19:
1.52 ! frystyk 20: #include <assert.h>
! 21:
1.25 frystyk 22: /* Library include files */
1.45 frystyk 23: #include "wwwsys.h"
1.1 timbl 24: #include "HTUtils.h"
1.25 frystyk 25: #include "HTString.h"
1.1 timbl 26: #include "HTChunk.h"
1.20 frystyk 27: #include "SGML.h"
1.1 timbl 28:
/* Sentinel stored in current_attribute_number when the attribute being
** parsed is unknown (or no attribute is being parsed at all). */
#define INVALID (-1)
30:
/* The State (context) of the parser
** ---------------------------------
**
** The state lives in the stream context that is handed to every call,
** which is what makes the parser reentrant.
*/
typedef enum _sgml_state
{
    S_text,             /* plain character data */
    S_literal,          /* after '<' inside SGML_LITERAL content */
    S_tag,              /* collecting an element name after '<' */
    S_tag_gap,          /* inside a start tag: expecting attribute or '>' */
    S_attr,             /* collecting an attribute name */
    S_attr_gap,         /* after attribute name: expecting '=', '>' or name */
    S_equals,           /* after '=': expecting the attribute value */
    S_value,            /* collecting an unquoted attribute value */
    S_after_open,       /* first character after a start tag's '>' */
    S_nl,               /* newline seen in text */
    S_nl_tago,          /* newline followed by '<' */
    S_ero,              /* after '&' */
    S_cro,              /* after "&#" */
#ifdef ISO_2022_JP
    S_esc,              /* ESC seen in text */
    S_dollar,           /* ESC '$' seen */
    S_paren,            /* ESC '(' seen */
    S_nonascii_text,    /* text following ESC '$' '@' or ESC '$' 'B' */
#endif
    S_squoted,          /* inside a single quoted attribute value */
    S_dquoted,          /* inside a double quoted attribute value */
    S_end,              /* collecting an element name after "</" */
    S_entity,           /* collecting an entity name */
    S_junk_tag,         /* skipping up to '>' after an end tag name */
    S_md,               /* inside a "<!" declaration */
    S_md_sqs,           /* single quoted string inside a declaration */
    S_md_dqs,           /* double quoted string inside a declaration */
    S_com_1,            /* "<!-" seen */
    S_com,              /* inside a comment */
    S_com_2,            /* '-' seen inside a comment */
    S_com_2a            /* "--" seen inside a comment: expecting '>' */
} sgml_state;
1.21 frystyk 48:
49:
1.2 timbl 50: /* Internal Context Data Structure
51: ** -------------------------------
52: */
1.42 frystyk 53: struct _HTStream
54: {
55: const HTStreamClass *isa; /* inherited from HTStream */
56: const SGML_dtd *dtd;
57: HTStructuredClass *actions; /* target class */
58: HTStructured *target; /* target object */
1.2 timbl 59:
1.42 frystyk 60: HTTag *current_tag;
61: int current_attribute_number;
62: SGMLContent contents; /* current content mode */
63: HTChunk *string;
64: int token; /* ptr into string buffer */
65: sgml_state state;
66: BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
67: int value[MAX_ATTRIBUTES]; /* Offset pointers to the string */
68: };
1.2 timbl 69:
70:
/* Forward one character / one block of characters to the target.
** Both macros expect a variable named `context' to be in scope. */
#define PUTC(ch) \
    ((*context->actions->put_character)(context->target, (ch)))
#define PUTB(b,l) \
    ((*context->actions->put_block)(context->target, (b), (l)))
1.2 timbl 73:
1.17 timbl 74: /* Find Attribute Number
75: ** ---------------------
76: */
1.40 frystyk 77: PRIVATE int SGMLFindAttribute (HTTag* tag, const char * s)
1.42 frystyk 78: {
1.47 frystyk 79: HTAttr* attributes = tag->attributes;
1.17 timbl 80:
1.42 frystyk 81: int high, low, i, diff; /* Binary search for attribute name */
1.52 ! frystyk 82:
! 83: assert(tag->number_of_attributes <= MAX_ATTRIBUTES);
! 84:
1.42 frystyk 85: for(low=0, high=tag->number_of_attributes;
86: high > low ;
87: diff < 0 ? (low = i+1) : (high = i) )
88: {
89: i = (low + (high-low)/2);
90: diff = strcasecomp(attributes[i].name, s);
91: if (diff==0)
92: return i; /* success: found it */
93: }
94: return -1;
95: }
1.17 timbl 96:
1.1 timbl 97:
98: /* Handle Attribute
99: ** ----------------
100: */
1.38 frystyk 101: /* PUBLIC const char * SGML_default = ""; ?? */
1.1 timbl 102:
1.38 frystyk 103: PRIVATE void handle_attribute_name (HTStream * context, const char * s)
1.42 frystyk 104: {
105: HTTag * tag = context->current_tag;
1.2 timbl 106:
1.42 frystyk 107: /* Note: if tag==NULL, we are skipping unknown tag... */
108: if (tag)
109: {
110: int i = SGMLFindAttribute(tag, s);
111: if (i >= 0)
112: {
113: context->current_attribute_number = i;
114: context->present[i] = YES;
115: return;
116: }
1.51 frystyk 117: HTTRACE(SGML_TRACE, "Unknown attribute %s for tag %s\n" _
118: s _ context->current_tag->name);
1.42 frystyk 119: }
120: context->current_attribute_number = INVALID; /* Invalid */
121: }
1.2 timbl 122:
1.1 timbl 123:
124: /* Handle attribute value
125: ** ----------------------
126: */
1.42 frystyk 127: PRIVATE void handle_attribute_value (HTStream * context)
128: {
129: /* Deal with attributes only if tag is known,
130: ignore silently otherwise */
131:
132: if (context->current_tag)
133: {
134: if (context->current_attribute_number != INVALID)
135: context->value[context->current_attribute_number] =
136: context->token;
1.48 frystyk 137: else {
138: char * data = HTChunk_data(context->string);
1.51 frystyk 139: HTTRACE(SGML_TRACE, "Attribute value %s ignored\n" _
1.48 frystyk 140: data ? data+context->token : "<null>");
141: }
1.42 frystyk 142: }
143: context->current_attribute_number = INVALID; /* can't have two assignments! */
1.1 timbl 144: }
145:
146: /* Handle entity
147: ** -------------
148: **
149: ** On entry,
150: ** s contains the entity name zero terminated
151: */
1.42 frystyk 152: PRIVATE void handle_entity (HTStream * context)
1.1 timbl 153: {
1.42 frystyk 154: const char ** entities = context->dtd->entity_names;
1.48 frystyk 155: const char *s = HTChunk_data(context->string);
1.1 timbl 156:
1.42 frystyk 157: int high, low, i, diff;
158: for(low=0, high = context->dtd->number_of_entities;
159: high > low ;
160: diff < 0 ? (low = i+1) : (high = i))
161: {
162: i = (low + (high-low)/2);
163: diff = strcmp(entities[i], s); /* Case sensitive! */
164: if (diff==0)
165: { /* success: found it */
166: (*context->actions->put_entity)(context->target, i);
167: return;
168: }
169: }
1.47 frystyk 170:
171: /* If entity string not found */
1.51 frystyk 172: HTTRACE(SGML_TRACE, "Unknown entity %s\n" _ s);
1.47 frystyk 173: (*context->actions->unparsed_entity)
1.48 frystyk 174: (context->target, HTChunk_data(context->string), HTChunk_size(context->string));
1.35 frystyk 175: }
1.2 timbl 176:
1.1 timbl 177: /* End element
1.2 timbl 178: ** -----------
1.1 timbl 179: */
1.42 frystyk 180: PRIVATE void end_element (HTStream * context, HTTag *tag)
181: {
1.51 frystyk 182: HTTRACE(SGML_TRACE, "End </%s>\n" _ tag->name);
1.42 frystyk 183: (*context->actions->end_element)
184: (context->target, tag - context->dtd->tags);
1.1 timbl 185: }
186:
1.17 timbl 187: /* Start an element
188: ** ----------------
1.1 timbl 189: */
1.31 frystyk 190: PRIVATE void start_element (HTStream * context)
1.42 frystyk 191: {
192: int i;
193: char *value[MAX_ATTRIBUTES];
194: HTTag *tag = context->current_tag;
195:
1.51 frystyk 196: HTTRACE(SGML_TRACE, "Start <%s>\n" _ tag->name);
1.42 frystyk 197: context->contents = tag->contents;
198:
199: /*
200: ** Build the actual pointers to the value strings stored in the
201: ** chunk buffer. (Must use offsets while collecting the values,
202: ** because the string chunk may get resized during the collection
203: ** and potentially relocated).
204: */
205: for (i = 0; i < MAX_ATTRIBUTES; ++i)
206: value[i] = context->value[i] < 0 ? NULL :
1.48 frystyk 207: HTChunk_data(context->string) + context->value[i];
1.42 frystyk 208: (*context->actions->start_element)
209: (context->target,
210: tag - context->dtd->tags,
211: context->present,
212: (const char**)value); /* coerce type for think c */
1.1 timbl 213: }
214:
215:
1.2 timbl 216: /* Find Tag in DTD tag list
217: ** ------------------------
1.1 timbl 218: **
219: ** On entry,
1.2 timbl 220: ** dtd points to dtd structire including valid tag list
221: ** string points to name of tag in question
1.1 timbl 222: **
1.2 timbl 223: ** On exit,
224: ** returns:
1.7 timbl 225: ** NULL tag not found
226: ** else address of tag structure in dtd
1.2 timbl 227: */
1.40 frystyk 228: PRIVATE HTTag * SGMLFindTag (const SGML_dtd* dtd, const char * string)
1.42 frystyk 229: {
230: int high, low, i, diff;
231: for(low=0, high=dtd->number_of_tags;
232: high > low ;
233: diff < 0 ? (low = i+1) : (high = i))
234: { /* Binary serach */
235: i = (low + (high-low)/2);
236: diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
237: if (diff==0)
238: /* success: found it */
239: return &dtd->tags[i];
240: }
241: return NULL;
1.2 timbl 242: }
243:
244: /*________________________________________________________________________
245: ** Public Methods
1.1 timbl 246: */
247:
1.2 timbl 248:
249: /* Could check that we are back to bottom of stack! @@ */
1.40 frystyk 250: PRIVATE int SGML_flush (HTStream * context)
1.42 frystyk 251: {
252: return (*context->actions->flush)(context->target);
1.26 frystyk 253: }
1.1 timbl 254:
1.40 frystyk 255: PRIVATE int SGML_free (HTStream * context)
1.42 frystyk 256: {
257: int status;
1.15 frystyk 258:
1.42 frystyk 259: if ((status = (*context->actions->_free)(context->target)) != HT_OK)
260: return status;
261: HTChunk_delete(context->string);
262: HT_FREE(context);
263: return HT_OK;
1.15 frystyk 264: }
1.1 timbl 265:
1.40 frystyk 266: PRIVATE int SGML_abort (HTStream * context, HTList * e)
1.42 frystyk 267: {
268: (*context->actions->abort)(context->target, e);
269: HTChunk_delete(context->string);
270: HT_FREE(context);
271: return HT_ERROR;
1.15 frystyk 272: }
1.1 timbl 273:
1.41 frystyk 274: PRIVATE int SGML_write (HTStream * context, const char * b, int l)
1.42 frystyk 275: {
276: const SGML_dtd *dtd = context->dtd;
277: HTChunk *string = context->string;
278: const char *text = b;
279: int count = 0;
1.18 timbl 280:
1.42 frystyk 281: while (l-- > 0)
282: {
283: char c = *b++;
284: switch(context->state)
285: {
286: got_element_open:
287: /*
288: ** The label is jumped when the '>' of a the element
289: ** start tag has been detected. This DOES NOT FALL TO
290: ** THE CODE S_after_open, only processes the tag and
291: ** sets the state (c should still contain the
292: ** terminating character of the tag ('>'))
293: */
294: if (context->current_tag && context->current_tag->name)
295: start_element(context);
296: context->state = S_after_open;
297: break;
1.18 timbl 298:
1.42 frystyk 299: case S_after_open:
300: /*
301: ** State S_after_open is entered only for single
302: ** character after the element opening tag to test
303: ** against newline. Strip one trainling newline only
304: ** after opening nonempty element. - SGML: Ugh!
305: */
306: text = b;
307: count = 0;
308: if (c == '\n' && (context->contents != SGML_EMPTY))
309: {
310: context->state = S_text;
311: break;
312: }
313: --text;
314: goto S_text;
315:
316: S_text:
317: context->state = S_text;
318: case S_text:
1.13 timbl 319: #ifdef ISO_2022_JP
1.42 frystyk 320: if (c == '\033')
321: {
322: context->state = S_esc;
323: ++count;
324: break;
325: }
1.13 timbl 326: #endif /* ISO_2022_JP */
1.42 frystyk 327: if (c == '&')
328: {
329: if (count > 0)
330: PUTB(text, count);
331: count = 0;
1.48 frystyk 332: HTChunk_clear(string);
1.42 frystyk 333: context->state = S_ero;
334: }
335: else if (c == '<')
336: {
337: if (count > 0)
338: PUTB(text, count);
339: count = 0;
1.48 frystyk 340: HTChunk_clear(string);
1.42 frystyk 341: /* should scrap LITERAL, and use CDATA and
342: RCDATA -- msa */
343: context->state =
344: (context->contents == SGML_LITERAL) ?
345: S_literal : S_tag;
346: }
347: else if (c == '\n')
348: /* Newline - ignore if before end tag! */
349: context->state = S_nl;
350: else
351: ++count;
352: break;
1.13 timbl 353:
1.42 frystyk 354: case S_nl:
355: if (c == '<')
356: {
357: if (count > 0)
358: PUTB(text, count);
359: count = 0;
1.48 frystyk 360: HTChunk_clear(string);
1.42 frystyk 361: context->state =
362: (context->contents == SGML_LITERAL) ?
363: S_literal : S_nl_tago;
364: }
365: else
366: {
367: ++count;
368: goto S_text;
369: }
370: break;
1.18 timbl 371:
1.42 frystyk 372: case S_nl_tago: /* Had newline and tag opener */
373: if (c != '/')
374: PUTC('\n'); /* Only ignore newline before </ */
375: context->state = S_tag;
376: goto handle_S_tag;
1.18 timbl 377:
1.13 timbl 378: #ifdef ISO_2022_JP
1.42 frystyk 379: case S_esc:
380: if (c=='$')
381: context->state = S_dollar;
382: else if (c=='(')
383: context->state = S_paren;
384: else
385: context->state = S_text;
386: ++count;
387: break;
388:
389: case S_dollar:
390: if (c=='@' || c=='B')
391: context->state = S_nonascii_text;
392: else
393: context->state = S_text;
394: ++count;
395: break;
396:
397: case S_paren:
398: if (c=='B' || c=='J')
399: context->state = S_text;
400: else
401: context->state = S_text;
402: ++count;
403: break;
404:
405: case S_nonascii_text:
406: if (c == '\033')
407: context->state = S_esc;
408: ++count;
409: break;
1.13 timbl 410: #endif /* ISO_2022_JP */
1.1 timbl 411:
1.42 frystyk 412: /* In literal mode, waits only for specific end tag!
413: ** Only foir compatibility with old servers.
414: */
415: case S_literal:
416: HTChunk_putc(string, c);
417: if ( TOUPPER(c) !=
1.48 frystyk 418: ((HTChunk_size(string) == 1) ? '/'
419: : context->current_tag->name[HTChunk_size(string)-2]))
1.42 frystyk 420: {
1.1 timbl 421:
1.42 frystyk 422: /* If complete match, end literal */
423: if ((c == '>') &&
1.48 frystyk 424: (!context->current_tag->name[HTChunk_size(string)-2]))
1.42 frystyk 425: {
426: end_element
427: (context,context->current_tag);
428: /*
429: ...setting SGML_MIXED below is a
430: bit of kludge, but a good guess that
431: currently works, anything other than
432: SGML_LITERAL would work... -- msa */
433: context->contents = SGML_MIXED;
434: }
435: else
436: {
437: /* If Mismatch: recover string. */
438: PUTC( '<');
1.48 frystyk 439: PUTB(HTChunk_data(string), HTChunk_size(string));
1.42 frystyk 440: }
441: context->state = S_text;
442: text = b;
443: count = 0;
444: }
445: break;
1.1 timbl 446:
1.42 frystyk 447: /*
448: ** Character reference or Entity
449: */
450: case S_ero:
451: if (c == '#')
452: {
453: /* &# is Char Ref Open */
454: context->state = S_cro;
455: break;
456: }
457: context->state = S_entity;
1.1 timbl 458:
1.42 frystyk 459: /** FALL THROUGH TO S_entity !! ***/
1.18 timbl 460:
1.42 frystyk 461: /*
462: ** Handle Entities
463: */
464: case S_entity:
1.43 frystyk 465: if (isalnum((int) c))
1.42 frystyk 466: HTChunk_putc(string, c);
467: else
468: {
469: HTChunk_terminate(string);
470: handle_entity(context);
471: text = b;
472: count = 0;
473: if (c != ';')
474: {
475: --text;
476: goto S_text;
477: }
478: context->state = S_text;
479: }
480: break;
1.2 timbl 481:
1.42 frystyk 482: /* Character reference
483: */
484: case S_cro:
1.43 frystyk 485: if (isalnum((int)c))
1.42 frystyk 486: /* accumulate a character NUMBER */
487: HTChunk_putc(string, c);
488: else
489: {
490: int value;
491: HTChunk_terminate(string);
1.48 frystyk 492: if (sscanf(HTChunk_data(string), "%d", &value)==1)
1.42 frystyk 493: PUTC((char)value);
494: else
495: {
496: PUTB("&#", 2);
1.48 frystyk 497: PUTB(HTChunk_data(string), HTChunk_size(string)-1);
1.42 frystyk 498: }
499: text = b;
500: count = 0;
501: if (c != ';')
502: {
503: --text;
504: goto S_text;
505: }
506: context->state = S_text;
507: }
508: break;
1.1 timbl 509:
1.42 frystyk 510: case S_tag: /* new tag */
511: handle_S_tag:
1.43 frystyk 512: if (isalnum((int)c))
1.42 frystyk 513: HTChunk_putc(string, c);
1.48 frystyk 514: else { /* End of tag name */
515: int i;
516: if (c == '/') {
517: if (HTChunk_size(string) > 0)
1.51 frystyk 518: HTTRACE(SGML_TRACE, "`<%s/' found!\n" _ HTChunk_data(string));
1.48 frystyk 519: context->state = S_end;
520: break;
521: } else if (c == '!') {
522: if (HTChunk_size(string) > 0)
1.51 frystyk 523: HTTRACE(SGML_TRACE, " `<%s!' found!\n" _ HTChunk_data(string));
1.48 frystyk 524: context->state = S_md;
525: break;
526: }
527: HTChunk_terminate(string);
528: context->current_tag = SGMLFindTag(dtd, HTChunk_data(string));
529: if (context->current_tag == NULL) {
1.51 frystyk 530: HTTRACE(SGML_TRACE, "*** Unknown element %s\n" _ HTChunk_data(string));
1.48 frystyk 531: (*context->actions->unparsed_begin_element)
532: (context->target, HTChunk_data(string), HTChunk_size(string));
533: } else {
534: for (i=0; i<context->current_tag->number_of_attributes; i++) {
535: context->present[i] = NO;
536: context->value[i] = -1;
1.47 frystyk 537: }
1.42 frystyk 538: }
1.48 frystyk 539: context->token = 0;
540: HTChunk_clear(string);
541: context->current_attribute_number = INVALID;
542: goto S_tag_gap;
543: }
1.42 frystyk 544: break;
545:
546: S_tag_gap:
547: context->state = S_tag_gap;
548: case S_tag_gap: /* Expecting attribute or > */
1.43 frystyk 549: if (isspace((int) c))
1.42 frystyk 550: break; /* Gap between attributes */
551:
552: if (c == '>')
553: goto got_element_open;
554: else
555: goto S_attr;
556:
557: S_attr:
558: /*
559: ** Start collecting the attribute name and collect
560: ** it in S_attr.
561: */
562: context->state = S_attr;
1.48 frystyk 563: HTChunk_truncate(string, context->token);
1.42 frystyk 564: case S_attr:
1.43 frystyk 565: if (isspace((int) c) || c == '>' || c == '=')
1.42 frystyk 566: goto got_attribute_name;
567: else
568: HTChunk_putc(string, c);
569: break;
570:
571: got_attribute_name:
572: /*
573: ** This label is entered when attribute name has been
574: ** collected. Process it and enter S_attr_gap for
575: ** potential value or start of the next attribute.
576: */
577: HTChunk_terminate(string) ;
578: handle_attribute_name
1.48 frystyk 579: (context, HTChunk_data(string) + context->token);
580: HTChunk_truncate(string, context->token);
1.42 frystyk 581: context->state = S_attr_gap;
582: case S_attr_gap: /* Expecting attribute or = or > */
1.43 frystyk 583: if (isspace((int) c))
1.42 frystyk 584: break; /* Gap after attribute */
585:
586: if (c == '>')
587: goto got_element_open;
588: else if (c == '=')
589: context->state = S_equals;
590: else
591: goto S_attr; /* Get next attribute */
592: break;
593:
594: case S_equals: /* After attr = */
1.43 frystyk 595: if (isspace((int) c))
1.42 frystyk 596: break; /* Before attribute value */
597:
598: if (c == '>')
599: { /* End of tag */
1.51 frystyk 600: HTTRACE(SGML_TRACE, "found = but no value\n");
1.42 frystyk 601: goto got_element_open;
602: }
603: else if (c == '\'')
604: context->state = S_squoted;
605: else if (c == '"')
606: context->state = S_dquoted;
607: else
608: goto S_value;
609: break;
610:
611: S_value:
612: context->state = S_value;
1.48 frystyk 613: HTChunk_truncate(string, context->token);
1.42 frystyk 614: case S_value:
1.43 frystyk 615: if (isspace((int) c) || c == '>')
1.42 frystyk 616: {
617: HTChunk_terminate(string);
618: handle_attribute_value(context);
1.48 frystyk 619: context->token = HTChunk_size(string);
1.42 frystyk 620: goto S_tag_gap;
621: }
622: else
623: HTChunk_putc(string, c);
624: break;
1.1 timbl 625:
1.42 frystyk 626: case S_squoted: /* Quoted attribute value */
627: if (c == '\'')
628: {
629: HTChunk_terminate(string);
630: handle_attribute_value(context);
1.48 frystyk 631: context->token = HTChunk_size(string);
1.42 frystyk 632: context->state = S_tag_gap;
633: }
634: else if (c && c != '\n' && c != '\r')
635: HTChunk_putc(string, c);
636: break;
1.1 timbl 637:
1.42 frystyk 638: case S_dquoted: /* Quoted attribute value */
639: if (c == '"')
640: {
641: HTChunk_terminate(string);
642: handle_attribute_value(context);
1.48 frystyk 643: context->token = HTChunk_size(string);
1.42 frystyk 644: context->state = S_tag_gap;
645: }
646: else if (c && c != '\n' && c != '\r')
647: HTChunk_putc(string, c);
648: break;
1.2 timbl 649:
1.42 frystyk 650: case S_end: /* </ */
1.43 frystyk 651: if (isalnum((int) c))
1.42 frystyk 652: HTChunk_putc(string, c);
653: else
654: { /* End of end tag name */
655: HTTag *t;
1.48 frystyk 656: char * first;
1.42 frystyk 657: HTChunk_terminate(string);
1.48 frystyk 658: if ((first=HTChunk_data(string))!=NULL && *first != '\0')
659: t = SGMLFindTag(dtd, HTChunk_data(string));
1.42 frystyk 660: else
661: /* Empty end tag */
662: /* Original code popped here one
663: from the stack. If this feature
664: is required, I have to put the
665: stack back... -- msa */
666: t = NULL;
1.47 frystyk 667: if (!t) {
1.51 frystyk 668: HTTRACE(SGML_TRACE, "Unknown end tag </%s>\n" _ HTChunk_data(string));
1.47 frystyk 669: (*context->actions->unparsed_end_element)
1.48 frystyk 670: (context->target, HTChunk_data(string), HTChunk_size(string));
1.47 frystyk 671: } else {
672: context->current_tag = NULL;
673: end_element(context, t);
674: }
1.48 frystyk 675: HTChunk_clear(string);
1.42 frystyk 676: context->current_attribute_number = INVALID;
677: if (c != '>')
678: {
1.43 frystyk 679: if (!isspace((int) c))
1.51 frystyk 680: HTTRACE(SGML_TRACE, "`</%s%c' found!\n" _ HTChunk_data(string) _ c);
1.42 frystyk 681: context->state = S_junk_tag;
682: }
683: else
684: {
685: text = b;
686: count = 0;
687: context->state = S_text;
688: }
689: }
690: break;
691:
692: case S_junk_tag:
693: if (c == '>')
694: {
695: text = b;
696: count = 0;
697: context->state = S_text;
698: }
699: break;
700:
701: /*
702: ** Scanning (actually skipping) declarations
703: */
704: case S_md:
705: if (c == '-')
706: context->state = S_com_1;
707: else if (c == '"')
708: context->state = S_md_dqs;
709: else if (c == '\'')
710: context->state = S_md_sqs;
711: else if (c == '>')
712: {
713: text = b;
714: count = 0;
715: context->state = S_text;
716: }
717: break;
718:
719: case S_md_dqs: /* Skip double quoted string */
720: if (c == '"')
721: context->state = S_md;
1.46 frystyk 722: else if (c == '>')
723: {
724: text = b;
725: count = 0;
726: context->state = S_text;
727: }
1.42 frystyk 728: break;
729:
730: case S_md_sqs: /* Skip single quoted string */
731: if (c == '\'')
732: context->state = S_md;
1.46 frystyk 733: else if (c == '>')
734: {
735: text = b;
736: count = 0;
737: context->state = S_text;
738: }
1.42 frystyk 739: break;
740:
741: case S_com_1: /* Starting a comment? */
742: context->state = (c == '-') ? S_com : S_md;
1.46 frystyk 743: if (c == '>')
744: {
745: text = b;
746: count = 0;
747: context->state = S_text;
748: }
1.42 frystyk 749: break;
750:
751: case S_com: /* ..within comment */
752: if (c == '-')
753: context->state = S_com_2;
754: break;
755:
756: case S_com_2: /* Ending a comment ? */
1.44 frystyk 757: context->state = (c == '-') ? S_com_2a : S_com;
758: break;
759:
760: case S_com_2a:
761: if (c == '>') {
762: text = b;
763: count = 0;
764: context->state = S_text;
765: } else
766: context->state = S_com;
1.42 frystyk 767: break;
768: }
1.7 timbl 769: }
1.42 frystyk 770: if (count > 0)
771: PUTB(text, count);
772: return HT_OK;
773: }
1.1 timbl 774:
1.2 timbl 775:
1.40 frystyk 776: PRIVATE int SGML_string (HTStream * context, const char* s)
1.42 frystyk 777: {
778: return SGML_write(context, s, (int) strlen(s));
779: }
1.2 timbl 780:
781:
1.41 frystyk 782: PRIVATE int SGML_character (HTStream * context, char c)
1.42 frystyk 783: {
784: return SGML_write(context, &c, 1);
785: }
1.2 timbl 786:
787: /*_______________________________________________________________________
788: */
789:
790: /* Structured Object Class
791: ** -----------------------
792: */
1.38 frystyk 793: PRIVATE const HTStreamClass SGMLParser =
1.47 frystyk 794: {
795: "SGML",
796: SGML_flush,
797: SGML_free,
798: SGML_abort,
799: SGML_character,
800: SGML_string,
801: SGML_write
802: };
1.2 timbl 803:
804: /* Create SGML Engine
805: ** ------------------
806: **
807: ** On entry,
808: ** dtd represents the DTD, along with
809: ** actions is the sink for the data as a set of routines.
810: **
811: */
1.42 frystyk 812: PUBLIC HTStream *SGML_new(const SGML_dtd * dtd, HTStructured * target)
1.47 frystyk 813: {
814: int i;
815: HTStream* context;
816: if ((context = (HTStream *) HT_CALLOC(1, sizeof(HTStream))) == NULL)
817: HT_OUTOFMEM("SGML_begin");
818:
819: context->isa = &SGMLParser;
820: context->string = HTChunk_new(128); /* Grow by this much */
821: context->dtd = dtd;
822: context->target = target;
823: context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
824: /* Ugh: no OO */
825: context->state = S_text;
826: for(i=0; i<MAX_ATTRIBUTES; i++)
827: context->value[i] = 0;
828: return context;
829: }
830:
831: PUBLIC HTTag * SGML_findTag (SGML_dtd * dtd, int element_number)
832: {
833: return (dtd && element_number>=0 && element_number<dtd->number_of_tags) ?
834: (dtd->tags+element_number) : NULL;
835: }
836:
837: PUBLIC char * SGML_findTagName (SGML_dtd * dtd, int element_number)
838: {
839: return (dtd && element_number>=0 && element_number<dtd->number_of_tags) ?
840: (dtd->tags+element_number)->name : NULL;
841: }
842:
843: PUBLIC SGMLContent SGML_findTagContents (SGML_dtd * dtd, int element_number)
844: {
845: return (dtd && element_number>=0 && element_number<dtd->number_of_tags) ?
846: (dtd->tags+element_number)->contents : SGML_ELEMENT;
847: }
848:
1.50 frystyk 849: PUBLIC int SGML_findElementNumber (SGML_dtd * dtd, char * name_element)
850: {
851: if (dtd && name_element) {
852: int i;
853: HTTag *ct;
854: for (i = 0; i< dtd->number_of_tags; i++) {
855: ct = &(dtd->tags[i]);
856: if (!strcasecomp(ct->name,name_element))
857: return i;
858: }
859: }
860: return -1;
861: }
862:
1.47 frystyk 863: PUBLIC char * HTTag_name (HTTag * tag)
864: {
865: return tag ? tag->name : NULL;
1.49 frystyk 866: }
867:
868: PUBLIC SGMLContent HTTag_content (HTTag * tag)
869: {
870: return tag ? tag->contents : SGML_EMPTY;
1.47 frystyk 871: }
872:
873: PUBLIC int HTTag_attributes (HTTag * tag)
874: {
875: return tag ? tag->number_of_attributes : -1;
876: }
877:
878: PUBLIC char * HTTag_attributeName (HTTag * tag, int attribute_number)
879: {
880: return (tag && attribute_number>=0 && attribute_number<tag->number_of_attributes) ?
881: (tag->attributes+attribute_number)->name : NULL;
882: }
883:
884: PUBLIC char * HTAttr_name (HTAttr * attr)
885: {
886: return attr ? attr->name : NULL;
887: }
1.50 frystyk 888:
889:
890:
Webmaster