Annotation of libwww/Library/src/SGML.c, revision 1.51
1.23 frystyk 1: /* SGML.c
2: ** GENERAL SGML PARSER CODE
3: **
1.27 frystyk 4: ** (c) COPYRIGHT MIT 1995.
1.23 frystyk 5: ** Please first read the full copyright statement in the file COPYRIGH.
1.51 ! frystyk 6: ** @(#) $Id: SGML.c,v 1.50 1999/02/22 01:04:24 frystyk Exp $
1.1 timbl 7: **
1.2 timbl 8: ** This module implements an HTStream object. To parse an
1.1 timbl 9: ** SGML file, create this object which is a parser. The object
1.2 timbl 10: ** is (currently) created by being passed a DTD structure,
11: ** and a target HTStructured object at which to throw the parsed stuff.
1.1 timbl 12: **
1.19 duns 13: ** 6 Feb 93 Binary searches used. Interface modified.
14: ** 8 Jul 94 FM Insulate free() from _free structure element.
1.42 frystyk 15: ** Nov 1996 msa Strip down the parser to minimal HTML tokenizer,
16: ** Stop allocating space for the attribute values,
17: ** use pointers to the string chunk instead.
1.1 timbl 18: */
19:
1.25 frystyk 20: /* Library include files */
1.45 frystyk 21: #include "wwwsys.h"
1.1 timbl 22: #include "HTUtils.h"
1.25 frystyk 23: #include "HTString.h"
1.1 timbl 24: #include "HTChunk.h"
1.20 frystyk 25: #include "SGML.h"
1.1 timbl 26:
/* Sentinel stored in current_attribute_number when no attribute is selected */
#define INVALID (-1)
28:
1.1 timbl 29: /* The State (context) of the parser
30: **
1.2 timbl 31: ** This is passed with each call to make the parser reentrant
1.1 timbl 32: **
33: */
/*
**	Parser states. The current state is kept in the stream context and
**	selects the case that SGML_write() runs for each input character.
**	The order of the enumerators is unchanged.
*/
typedef enum _sgml_state
{
    S_text,		/* plain character data */
    S_literal,		/* after '<' while the content mode is SGML_LITERAL */
    S_tag,		/* after '<': collecting an element name */
    S_tag_gap,		/* inside a start tag: expecting attribute or '>' */
    S_attr,		/* collecting an attribute name */
    S_attr_gap,		/* after an attribute name: expecting attribute, '=' or '>' */
    S_equals,		/* after '=': expecting an attribute value */
    S_value,		/* collecting an unquoted attribute value */
    S_after_open,	/* the single character following a start tag */
    S_nl,		/* a newline was seen in text */
    S_nl_tago,		/* newline followed by '<' */
    S_ero,		/* after '&' */
    S_cro,		/* after "&#": collecting a character number */
#ifdef ISO_2022_JP
    S_esc,		/* after ESC */
    S_dollar,		/* after ESC '$' */
    S_paren,		/* after ESC '(' */
    S_nonascii_text,	/* text following ESC '$' '@' or ESC '$' 'B' */
#endif
    S_squoted,		/* inside a single quoted attribute value */
    S_dquoted,		/* inside a double quoted attribute value */
    S_end,		/* after "</": collecting an end tag name */
    S_entity,		/* collecting an entity name */
    S_junk_tag,		/* skipping up to the next '>' */
    S_md,		/* inside a "<!" declaration */
    S_md_sqs,		/* single quoted string inside a declaration */
    S_md_dqs,		/* double quoted string inside a declaration */
    S_com_1,		/* one '-' seen inside a declaration */
    S_com,		/* inside a comment */
    S_com_2,		/* one '-' seen inside a comment */
    S_com_2a		/* "--" seen inside a comment */
} sgml_state;
1.21 frystyk 46:
47:
1.2 timbl 48: /* Internal Context Data Structure
49: ** -------------------------------
50: */
/*
**	Parser context. One instance is allocated per stream by SGML_new()
**	and handed to every stream method, so all parsing state lives here.
*/
struct _HTStream
{
    const HTStreamClass *isa;		/* inherited from HTStream */
    const SGML_dtd *dtd;		/* DTD supplying the tag and entity tables */
    HTStructuredClass *actions;		/* target class */
    HTStructured *target;		/* target object */

    HTTag *current_tag;			/* tag being parsed; NULL when the tag is unknown */
    int current_attribute_number;	/* attribute awaiting a value, or INVALID */
    SGMLContent contents;		/* current content mode */
    HTChunk *string;			/* buffer collecting names and attribute values */
    int token;				/* ptr into string buffer */
    sgml_state state;			/* current state of the tokenizer */
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    int value[MAX_ATTRIBUTES];		/* Offset pointers to the string */
};
1.2 timbl 67:
68:
/*
**	Output helpers: hand one character (PUTC) or a block of characters
**	(PUTB) to the target's put_character / put_block method. Both expand
**	to an expression that uses a variable named `context', which must be
**	in scope at the point of use.
*/
#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))
#define PUTB(b,l) ((*context->actions->put_block)(context->target, b, l))
1.2 timbl 71:
1.17 timbl 72: /* Find Attribute Number
73: ** ---------------------
74: */
1.40 frystyk 75: PRIVATE int SGMLFindAttribute (HTTag* tag, const char * s)
1.42 frystyk 76: {
1.47 frystyk 77: HTAttr* attributes = tag->attributes;
1.17 timbl 78:
1.42 frystyk 79: int high, low, i, diff; /* Binary search for attribute name */
80: for(low=0, high=tag->number_of_attributes;
81: high > low ;
82: diff < 0 ? (low = i+1) : (high = i) )
83: {
84: i = (low + (high-low)/2);
85: diff = strcasecomp(attributes[i].name, s);
86: if (diff==0)
87: return i; /* success: found it */
88: }
89: return -1;
90: }
1.17 timbl 91:
1.1 timbl 92:
93: /* Handle Attribute
94: ** ----------------
95: */
1.38 frystyk 96: /* PUBLIC const char * SGML_default = ""; ?? */
1.1 timbl 97:
1.38 frystyk 98: PRIVATE void handle_attribute_name (HTStream * context, const char * s)
1.42 frystyk 99: {
100: HTTag * tag = context->current_tag;
1.2 timbl 101:
1.42 frystyk 102: /* Note: if tag==NULL, we are skipping unknown tag... */
103: if (tag)
104: {
105: int i = SGMLFindAttribute(tag, s);
106: if (i >= 0)
107: {
108: context->current_attribute_number = i;
109: context->present[i] = YES;
110: return;
111: }
1.51 ! frystyk 112: HTTRACE(SGML_TRACE, "Unknown attribute %s for tag %s\n" _
! 113: s _ context->current_tag->name);
1.42 frystyk 114: }
115: context->current_attribute_number = INVALID; /* Invalid */
116: }
1.2 timbl 117:
1.1 timbl 118:
119: /* Handle attribute value
120: ** ----------------------
121: */
1.42 frystyk 122: PRIVATE void handle_attribute_value (HTStream * context)
123: {
124: /* Deal with attributes only if tag is known,
125: ignore silently otherwise */
126:
127: if (context->current_tag)
128: {
129: if (context->current_attribute_number != INVALID)
130: context->value[context->current_attribute_number] =
131: context->token;
1.48 frystyk 132: else {
133: char * data = HTChunk_data(context->string);
1.51 ! frystyk 134: HTTRACE(SGML_TRACE, "Attribute value %s ignored\n" _
1.48 frystyk 135: data ? data+context->token : "<null>");
136: }
1.42 frystyk 137: }
138: context->current_attribute_number = INVALID; /* can't have two assignments! */
1.1 timbl 139: }
140:
141: /* Handle entity
142: ** -------------
143: **
144: ** On entry,
145: ** s contains the entity name zero terminated
146: */
1.42 frystyk 147: PRIVATE void handle_entity (HTStream * context)
1.1 timbl 148: {
1.42 frystyk 149: const char ** entities = context->dtd->entity_names;
1.48 frystyk 150: const char *s = HTChunk_data(context->string);
1.1 timbl 151:
1.42 frystyk 152: int high, low, i, diff;
153: for(low=0, high = context->dtd->number_of_entities;
154: high > low ;
155: diff < 0 ? (low = i+1) : (high = i))
156: {
157: i = (low + (high-low)/2);
158: diff = strcmp(entities[i], s); /* Case sensitive! */
159: if (diff==0)
160: { /* success: found it */
161: (*context->actions->put_entity)(context->target, i);
162: return;
163: }
164: }
1.47 frystyk 165:
166: /* If entity string not found */
1.51 ! frystyk 167: HTTRACE(SGML_TRACE, "Unknown entity %s\n" _ s);
1.47 frystyk 168: (*context->actions->unparsed_entity)
1.48 frystyk 169: (context->target, HTChunk_data(context->string), HTChunk_size(context->string));
1.35 frystyk 170: }
1.2 timbl 171:
1.1 timbl 172: /* End element
1.2 timbl 173: ** -----------
1.1 timbl 174: */
1.42 frystyk 175: PRIVATE void end_element (HTStream * context, HTTag *tag)
176: {
1.51 ! frystyk 177: HTTRACE(SGML_TRACE, "End </%s>\n" _ tag->name);
1.42 frystyk 178: (*context->actions->end_element)
179: (context->target, tag - context->dtd->tags);
1.1 timbl 180: }
181:
1.17 timbl 182: /* Start an element
183: ** ----------------
1.1 timbl 184: */
1.31 frystyk 185: PRIVATE void start_element (HTStream * context)
1.42 frystyk 186: {
187: int i;
188: char *value[MAX_ATTRIBUTES];
189: HTTag *tag = context->current_tag;
190:
1.51 ! frystyk 191: HTTRACE(SGML_TRACE, "Start <%s>\n" _ tag->name);
1.42 frystyk 192: context->contents = tag->contents;
193:
194: /*
195: ** Build the actual pointers to the value strings stored in the
196: ** chunk buffer. (Must use offsets while collecting the values,
197: ** because the string chunk may get resized during the collection
198: ** and potentially relocated).
199: */
200: for (i = 0; i < MAX_ATTRIBUTES; ++i)
201: value[i] = context->value[i] < 0 ? NULL :
1.48 frystyk 202: HTChunk_data(context->string) + context->value[i];
1.42 frystyk 203: (*context->actions->start_element)
204: (context->target,
205: tag - context->dtd->tags,
206: context->present,
207: (const char**)value); /* coerce type for think c */
1.1 timbl 208: }
209:
210:
1.2 timbl 211: /* Find Tag in DTD tag list
212: ** ------------------------
1.1 timbl 213: **
214: ** On entry,
1.2 timbl 215: ** dtd points to dtd structire including valid tag list
216: ** string points to name of tag in question
1.1 timbl 217: **
1.2 timbl 218: ** On exit,
219: ** returns:
1.7 timbl 220: ** NULL tag not found
221: ** else address of tag structure in dtd
1.2 timbl 222: */
1.40 frystyk 223: PRIVATE HTTag * SGMLFindTag (const SGML_dtd* dtd, const char * string)
1.42 frystyk 224: {
225: int high, low, i, diff;
226: for(low=0, high=dtd->number_of_tags;
227: high > low ;
228: diff < 0 ? (low = i+1) : (high = i))
229: { /* Binary serach */
230: i = (low + (high-low)/2);
231: diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
232: if (diff==0)
233: /* success: found it */
234: return &dtd->tags[i];
235: }
236: return NULL;
1.2 timbl 237: }
238:
239: /*________________________________________________________________________
240: ** Public Methods
1.1 timbl 241: */
242:
1.2 timbl 243:
244: /* Could check that we are back to bottom of stack! @@ */
1.40 frystyk 245: PRIVATE int SGML_flush (HTStream * context)
1.42 frystyk 246: {
247: return (*context->actions->flush)(context->target);
1.26 frystyk 248: }
1.1 timbl 249:
1.40 frystyk 250: PRIVATE int SGML_free (HTStream * context)
1.42 frystyk 251: {
252: int status;
1.15 frystyk 253:
1.42 frystyk 254: if ((status = (*context->actions->_free)(context->target)) != HT_OK)
255: return status;
256: HTChunk_delete(context->string);
257: HT_FREE(context);
258: return HT_OK;
1.15 frystyk 259: }
1.1 timbl 260:
1.40 frystyk 261: PRIVATE int SGML_abort (HTStream * context, HTList * e)
1.42 frystyk 262: {
263: (*context->actions->abort)(context->target, e);
264: HTChunk_delete(context->string);
265: HT_FREE(context);
266: return HT_ERROR;
1.15 frystyk 267: }
1.1 timbl 268:
1.41 frystyk 269: PRIVATE int SGML_write (HTStream * context, const char * b, int l)
1.42 frystyk 270: {
271: const SGML_dtd *dtd = context->dtd;
272: HTChunk *string = context->string;
273: const char *text = b;
274: int count = 0;
1.18 timbl 275:
1.42 frystyk 276: while (l-- > 0)
277: {
278: char c = *b++;
279: switch(context->state)
280: {
281: got_element_open:
282: /*
283: ** The label is jumped when the '>' of a the element
284: ** start tag has been detected. This DOES NOT FALL TO
285: ** THE CODE S_after_open, only processes the tag and
286: ** sets the state (c should still contain the
287: ** terminating character of the tag ('>'))
288: */
289: if (context->current_tag && context->current_tag->name)
290: start_element(context);
291: context->state = S_after_open;
292: break;
1.18 timbl 293:
1.42 frystyk 294: case S_after_open:
295: /*
296: ** State S_after_open is entered only for single
297: ** character after the element opening tag to test
298: ** against newline. Strip one trainling newline only
299: ** after opening nonempty element. - SGML: Ugh!
300: */
301: text = b;
302: count = 0;
303: if (c == '\n' && (context->contents != SGML_EMPTY))
304: {
305: context->state = S_text;
306: break;
307: }
308: --text;
309: goto S_text;
310:
311: S_text:
312: context->state = S_text;
313: case S_text:
1.13 timbl 314: #ifdef ISO_2022_JP
1.42 frystyk 315: if (c == '\033')
316: {
317: context->state = S_esc;
318: ++count;
319: break;
320: }
1.13 timbl 321: #endif /* ISO_2022_JP */
1.42 frystyk 322: if (c == '&')
323: {
324: if (count > 0)
325: PUTB(text, count);
326: count = 0;
1.48 frystyk 327: HTChunk_clear(string);
1.42 frystyk 328: context->state = S_ero;
329: }
330: else if (c == '<')
331: {
332: if (count > 0)
333: PUTB(text, count);
334: count = 0;
1.48 frystyk 335: HTChunk_clear(string);
1.42 frystyk 336: /* should scrap LITERAL, and use CDATA and
337: RCDATA -- msa */
338: context->state =
339: (context->contents == SGML_LITERAL) ?
340: S_literal : S_tag;
341: }
342: else if (c == '\n')
343: /* Newline - ignore if before end tag! */
344: context->state = S_nl;
345: else
346: ++count;
347: break;
1.13 timbl 348:
1.42 frystyk 349: case S_nl:
350: if (c == '<')
351: {
352: if (count > 0)
353: PUTB(text, count);
354: count = 0;
1.48 frystyk 355: HTChunk_clear(string);
1.42 frystyk 356: context->state =
357: (context->contents == SGML_LITERAL) ?
358: S_literal : S_nl_tago;
359: }
360: else
361: {
362: ++count;
363: goto S_text;
364: }
365: break;
1.18 timbl 366:
1.42 frystyk 367: case S_nl_tago: /* Had newline and tag opener */
368: if (c != '/')
369: PUTC('\n'); /* Only ignore newline before </ */
370: context->state = S_tag;
371: goto handle_S_tag;
1.18 timbl 372:
1.13 timbl 373: #ifdef ISO_2022_JP
1.42 frystyk 374: case S_esc:
375: if (c=='$')
376: context->state = S_dollar;
377: else if (c=='(')
378: context->state = S_paren;
379: else
380: context->state = S_text;
381: ++count;
382: break;
383:
384: case S_dollar:
385: if (c=='@' || c=='B')
386: context->state = S_nonascii_text;
387: else
388: context->state = S_text;
389: ++count;
390: break;
391:
392: case S_paren:
393: if (c=='B' || c=='J')
394: context->state = S_text;
395: else
396: context->state = S_text;
397: ++count;
398: break;
399:
400: case S_nonascii_text:
401: if (c == '\033')
402: context->state = S_esc;
403: ++count;
404: break;
1.13 timbl 405: #endif /* ISO_2022_JP */
1.1 timbl 406:
1.42 frystyk 407: /* In literal mode, waits only for specific end tag!
408: ** Only foir compatibility with old servers.
409: */
410: case S_literal:
411: HTChunk_putc(string, c);
412: if ( TOUPPER(c) !=
1.48 frystyk 413: ((HTChunk_size(string) == 1) ? '/'
414: : context->current_tag->name[HTChunk_size(string)-2]))
1.42 frystyk 415: {
1.1 timbl 416:
1.42 frystyk 417: /* If complete match, end literal */
418: if ((c == '>') &&
1.48 frystyk 419: (!context->current_tag->name[HTChunk_size(string)-2]))
1.42 frystyk 420: {
421: end_element
422: (context,context->current_tag);
423: /*
424: ...setting SGML_MIXED below is a
425: bit of kludge, but a good guess that
426: currently works, anything other than
427: SGML_LITERAL would work... -- msa */
428: context->contents = SGML_MIXED;
429: }
430: else
431: {
432: /* If Mismatch: recover string. */
433: PUTC( '<');
1.48 frystyk 434: PUTB(HTChunk_data(string), HTChunk_size(string));
1.42 frystyk 435: }
436: context->state = S_text;
437: text = b;
438: count = 0;
439: }
440: break;
1.1 timbl 441:
1.42 frystyk 442: /*
443: ** Character reference or Entity
444: */
445: case S_ero:
446: if (c == '#')
447: {
448: /* &# is Char Ref Open */
449: context->state = S_cro;
450: break;
451: }
452: context->state = S_entity;
1.1 timbl 453:
1.42 frystyk 454: /** FALL THROUGH TO S_entity !! ***/
1.18 timbl 455:
1.42 frystyk 456: /*
457: ** Handle Entities
458: */
459: case S_entity:
1.43 frystyk 460: if (isalnum((int) c))
1.42 frystyk 461: HTChunk_putc(string, c);
462: else
463: {
464: HTChunk_terminate(string);
465: handle_entity(context);
466: text = b;
467: count = 0;
468: if (c != ';')
469: {
470: --text;
471: goto S_text;
472: }
473: context->state = S_text;
474: }
475: break;
1.2 timbl 476:
1.42 frystyk 477: /* Character reference
478: */
479: case S_cro:
1.43 frystyk 480: if (isalnum((int)c))
1.42 frystyk 481: /* accumulate a character NUMBER */
482: HTChunk_putc(string, c);
483: else
484: {
485: int value;
486: HTChunk_terminate(string);
1.48 frystyk 487: if (sscanf(HTChunk_data(string), "%d", &value)==1)
1.42 frystyk 488: PUTC((char)value);
489: else
490: {
491: PUTB("&#", 2);
1.48 frystyk 492: PUTB(HTChunk_data(string), HTChunk_size(string)-1);
1.42 frystyk 493: }
494: text = b;
495: count = 0;
496: if (c != ';')
497: {
498: --text;
499: goto S_text;
500: }
501: context->state = S_text;
502: }
503: break;
1.1 timbl 504:
1.42 frystyk 505: case S_tag: /* new tag */
506: handle_S_tag:
1.43 frystyk 507: if (isalnum((int)c))
1.42 frystyk 508: HTChunk_putc(string, c);
1.48 frystyk 509: else { /* End of tag name */
510: int i;
511: if (c == '/') {
512: if (HTChunk_size(string) > 0)
1.51 ! frystyk 513: HTTRACE(SGML_TRACE, "`<%s/' found!\n" _ HTChunk_data(string));
1.48 frystyk 514: context->state = S_end;
515: break;
516: } else if (c == '!') {
517: if (HTChunk_size(string) > 0)
1.51 ! frystyk 518: HTTRACE(SGML_TRACE, " `<%s!' found!\n" _ HTChunk_data(string));
1.48 frystyk 519: context->state = S_md;
520: break;
521: }
522: HTChunk_terminate(string);
523: context->current_tag = SGMLFindTag(dtd, HTChunk_data(string));
524: if (context->current_tag == NULL) {
1.51 ! frystyk 525: HTTRACE(SGML_TRACE, "*** Unknown element %s\n" _ HTChunk_data(string));
1.48 frystyk 526: (*context->actions->unparsed_begin_element)
527: (context->target, HTChunk_data(string), HTChunk_size(string));
528: } else {
529: for (i=0; i<context->current_tag->number_of_attributes; i++) {
530: context->present[i] = NO;
531: context->value[i] = -1;
1.47 frystyk 532: }
1.42 frystyk 533: }
1.48 frystyk 534: context->token = 0;
535: HTChunk_clear(string);
536: context->current_attribute_number = INVALID;
537: goto S_tag_gap;
538: }
1.42 frystyk 539: break;
540:
541: S_tag_gap:
542: context->state = S_tag_gap;
543: case S_tag_gap: /* Expecting attribute or > */
1.43 frystyk 544: if (isspace((int) c))
1.42 frystyk 545: break; /* Gap between attributes */
546:
547: if (c == '>')
548: goto got_element_open;
549: else
550: goto S_attr;
551:
552: S_attr:
553: /*
554: ** Start collecting the attribute name and collect
555: ** it in S_attr.
556: */
557: context->state = S_attr;
1.48 frystyk 558: HTChunk_truncate(string, context->token);
1.42 frystyk 559: case S_attr:
1.43 frystyk 560: if (isspace((int) c) || c == '>' || c == '=')
1.42 frystyk 561: goto got_attribute_name;
562: else
563: HTChunk_putc(string, c);
564: break;
565:
566: got_attribute_name:
567: /*
568: ** This label is entered when attribute name has been
569: ** collected. Process it and enter S_attr_gap for
570: ** potential value or start of the next attribute.
571: */
572: HTChunk_terminate(string) ;
573: handle_attribute_name
1.48 frystyk 574: (context, HTChunk_data(string) + context->token);
575: HTChunk_truncate(string, context->token);
1.42 frystyk 576: context->state = S_attr_gap;
577: case S_attr_gap: /* Expecting attribute or = or > */
1.43 frystyk 578: if (isspace((int) c))
1.42 frystyk 579: break; /* Gap after attribute */
580:
581: if (c == '>')
582: goto got_element_open;
583: else if (c == '=')
584: context->state = S_equals;
585: else
586: goto S_attr; /* Get next attribute */
587: break;
588:
589: case S_equals: /* After attr = */
1.43 frystyk 590: if (isspace((int) c))
1.42 frystyk 591: break; /* Before attribute value */
592:
593: if (c == '>')
594: { /* End of tag */
1.51 ! frystyk 595: HTTRACE(SGML_TRACE, "found = but no value\n");
1.42 frystyk 596: goto got_element_open;
597: }
598: else if (c == '\'')
599: context->state = S_squoted;
600: else if (c == '"')
601: context->state = S_dquoted;
602: else
603: goto S_value;
604: break;
605:
606: S_value:
607: context->state = S_value;
1.48 frystyk 608: HTChunk_truncate(string, context->token);
1.42 frystyk 609: case S_value:
1.43 frystyk 610: if (isspace((int) c) || c == '>')
1.42 frystyk 611: {
612: HTChunk_terminate(string);
613: handle_attribute_value(context);
1.48 frystyk 614: context->token = HTChunk_size(string);
1.42 frystyk 615: goto S_tag_gap;
616: }
617: else
618: HTChunk_putc(string, c);
619: break;
1.1 timbl 620:
1.42 frystyk 621: case S_squoted: /* Quoted attribute value */
622: if (c == '\'')
623: {
624: HTChunk_terminate(string);
625: handle_attribute_value(context);
1.48 frystyk 626: context->token = HTChunk_size(string);
1.42 frystyk 627: context->state = S_tag_gap;
628: }
629: else if (c && c != '\n' && c != '\r')
630: HTChunk_putc(string, c);
631: break;
1.1 timbl 632:
1.42 frystyk 633: case S_dquoted: /* Quoted attribute value */
634: if (c == '"')
635: {
636: HTChunk_terminate(string);
637: handle_attribute_value(context);
1.48 frystyk 638: context->token = HTChunk_size(string);
1.42 frystyk 639: context->state = S_tag_gap;
640: }
641: else if (c && c != '\n' && c != '\r')
642: HTChunk_putc(string, c);
643: break;
1.2 timbl 644:
1.42 frystyk 645: case S_end: /* </ */
1.43 frystyk 646: if (isalnum((int) c))
1.42 frystyk 647: HTChunk_putc(string, c);
648: else
649: { /* End of end tag name */
650: HTTag *t;
1.48 frystyk 651: char * first;
1.42 frystyk 652: HTChunk_terminate(string);
1.48 frystyk 653: if ((first=HTChunk_data(string))!=NULL && *first != '\0')
654: t = SGMLFindTag(dtd, HTChunk_data(string));
1.42 frystyk 655: else
656: /* Empty end tag */
657: /* Original code popped here one
658: from the stack. If this feature
659: is required, I have to put the
660: stack back... -- msa */
661: t = NULL;
1.47 frystyk 662: if (!t) {
1.51 ! frystyk 663: HTTRACE(SGML_TRACE, "Unknown end tag </%s>\n" _ HTChunk_data(string));
1.47 frystyk 664: (*context->actions->unparsed_end_element)
1.48 frystyk 665: (context->target, HTChunk_data(string), HTChunk_size(string));
1.47 frystyk 666: } else {
667: context->current_tag = NULL;
668: end_element(context, t);
669: }
1.48 frystyk 670: HTChunk_clear(string);
1.42 frystyk 671: context->current_attribute_number = INVALID;
672: if (c != '>')
673: {
1.43 frystyk 674: if (!isspace((int) c))
1.51 ! frystyk 675: HTTRACE(SGML_TRACE, "`</%s%c' found!\n" _ HTChunk_data(string) _ c);
1.42 frystyk 676: context->state = S_junk_tag;
677: }
678: else
679: {
680: text = b;
681: count = 0;
682: context->state = S_text;
683: }
684: }
685: break;
686:
687: case S_junk_tag:
688: if (c == '>')
689: {
690: text = b;
691: count = 0;
692: context->state = S_text;
693: }
694: break;
695:
696: /*
697: ** Scanning (actually skipping) declarations
698: */
699: case S_md:
700: if (c == '-')
701: context->state = S_com_1;
702: else if (c == '"')
703: context->state = S_md_dqs;
704: else if (c == '\'')
705: context->state = S_md_sqs;
706: else if (c == '>')
707: {
708: text = b;
709: count = 0;
710: context->state = S_text;
711: }
712: break;
713:
714: case S_md_dqs: /* Skip double quoted string */
715: if (c == '"')
716: context->state = S_md;
1.46 frystyk 717: else if (c == '>')
718: {
719: text = b;
720: count = 0;
721: context->state = S_text;
722: }
1.42 frystyk 723: break;
724:
725: case S_md_sqs: /* Skip single quoted string */
726: if (c == '\'')
727: context->state = S_md;
1.46 frystyk 728: else if (c == '>')
729: {
730: text = b;
731: count = 0;
732: context->state = S_text;
733: }
1.42 frystyk 734: break;
735:
736: case S_com_1: /* Starting a comment? */
737: context->state = (c == '-') ? S_com : S_md;
1.46 frystyk 738: if (c == '>')
739: {
740: text = b;
741: count = 0;
742: context->state = S_text;
743: }
1.42 frystyk 744: break;
745:
746: case S_com: /* ..within comment */
747: if (c == '-')
748: context->state = S_com_2;
749: break;
750:
751: case S_com_2: /* Ending a comment ? */
1.44 frystyk 752: context->state = (c == '-') ? S_com_2a : S_com;
753: break;
754:
755: case S_com_2a:
756: if (c == '>') {
757: text = b;
758: count = 0;
759: context->state = S_text;
760: } else
761: context->state = S_com;
1.42 frystyk 762: break;
763: }
1.7 timbl 764: }
1.42 frystyk 765: if (count > 0)
766: PUTB(text, count);
767: return HT_OK;
768: }
1.1 timbl 769:
1.2 timbl 770:
1.40 frystyk 771: PRIVATE int SGML_string (HTStream * context, const char* s)
1.42 frystyk 772: {
773: return SGML_write(context, s, (int) strlen(s));
774: }
1.2 timbl 775:
776:
1.41 frystyk 777: PRIVATE int SGML_character (HTStream * context, char c)
1.42 frystyk 778: {
779: return SGML_write(context, &c, 1);
780: }
1.2 timbl 781:
782: /*_______________________________________________________________________
783: */
784:
/*	Structured Object Class
**	-----------------------
**	Stream class record of the parser. SGML_new() stores its address in
**	the `isa' member of every context it creates. The entries are the
**	class name followed by the stream methods defined in this file.
*/
PRIVATE const HTStreamClass SGMLParser =
{
    "SGML",
    SGML_flush,
    SGML_free,
    SGML_abort,
    SGML_character,
    SGML_string,
    SGML_write
};
1.2 timbl 798:
799: /* Create SGML Engine
800: ** ------------------
801: **
802: ** On entry,
803: ** dtd represents the DTD, along with
804: ** actions is the sink for the data as a set of routines.
805: **
806: */
1.42 frystyk 807: PUBLIC HTStream *SGML_new(const SGML_dtd * dtd, HTStructured * target)
1.47 frystyk 808: {
809: int i;
810: HTStream* context;
811: if ((context = (HTStream *) HT_CALLOC(1, sizeof(HTStream))) == NULL)
812: HT_OUTOFMEM("SGML_begin");
813:
814: context->isa = &SGMLParser;
815: context->string = HTChunk_new(128); /* Grow by this much */
816: context->dtd = dtd;
817: context->target = target;
818: context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
819: /* Ugh: no OO */
820: context->state = S_text;
821: for(i=0; i<MAX_ATTRIBUTES; i++)
822: context->value[i] = 0;
823: return context;
824: }
825:
826: PUBLIC HTTag * SGML_findTag (SGML_dtd * dtd, int element_number)
827: {
828: return (dtd && element_number>=0 && element_number<dtd->number_of_tags) ?
829: (dtd->tags+element_number) : NULL;
830: }
831:
832: PUBLIC char * SGML_findTagName (SGML_dtd * dtd, int element_number)
833: {
834: return (dtd && element_number>=0 && element_number<dtd->number_of_tags) ?
835: (dtd->tags+element_number)->name : NULL;
836: }
837:
838: PUBLIC SGMLContent SGML_findTagContents (SGML_dtd * dtd, int element_number)
839: {
840: return (dtd && element_number>=0 && element_number<dtd->number_of_tags) ?
841: (dtd->tags+element_number)->contents : SGML_ELEMENT;
842: }
843:
1.50 frystyk 844: PUBLIC int SGML_findElementNumber (SGML_dtd * dtd, char * name_element)
845: {
846: if (dtd && name_element) {
847: int i;
848: HTTag *ct;
849: for (i = 0; i< dtd->number_of_tags; i++) {
850: ct = &(dtd->tags[i]);
851: if (!strcasecomp(ct->name,name_element))
852: return i;
853: }
854: }
855: return -1;
856: }
857:
1.47 frystyk 858: PUBLIC char * HTTag_name (HTTag * tag)
859: {
860: return tag ? tag->name : NULL;
1.49 frystyk 861: }
862:
863: PUBLIC SGMLContent HTTag_content (HTTag * tag)
864: {
865: return tag ? tag->contents : SGML_EMPTY;
1.47 frystyk 866: }
867:
868: PUBLIC int HTTag_attributes (HTTag * tag)
869: {
870: return tag ? tag->number_of_attributes : -1;
871: }
872:
873: PUBLIC char * HTTag_attributeName (HTTag * tag, int attribute_number)
874: {
875: return (tag && attribute_number>=0 && attribute_number<tag->number_of_attributes) ?
876: (tag->attributes+attribute_number)->name : NULL;
877: }
878:
879: PUBLIC char * HTAttr_name (HTAttr * attr)
880: {
881: return attr ? attr->name : NULL;
882: }
1.50 frystyk 883:
884:
885:
Webmaster