Annotation of libwww/Library/src/SGML.c, revision 1.1
1.1 ! timbl 1: /* General SGML Parser code SGML.c
! 2: ** ========================
! 3: **
! 4: ** This module implements an HTSGMLContext object. To parse an
! 5: ** SGML file, create this object which is a parser. The object
! 6: ** is (currently) created by being passed a DTD structure.
! 7: **
! 8: **
! 9: */
! 10: #include "SGML.h"
! 11:
! 12: #include <ctype.h>
! 13: #include <stdio.h>
! 14: #include "HTUtils.h"
! 15: #include "HTChunk.h"
! 16: #include "tcp.h" /* For FROMASCII */
! 17:
! 18: /* The State (context) of the parser
! 19: **
! 20: ** This is passed with each call to make the parser recursive
! 21: **
! 22: */
! 23:
! 24: struct _HTSGMLContext {
! 25: SGML_dtd *dtd;
! 26: void (*contents_treatment) PARAMS((void * data, char c));
! 27: HTTag *current_tag;
! 28: attr *current_attribute;
! 29: HTChunk *string;
! 30: HTElement *element_stack;
! 31: enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
! 32: S_attr, S_attr_gap, S_equals, S_value,
! 33: S_ero, S_cro,
! 34: S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
! 35: void * callerData;
! 36: };
! 37:
! 38:
! 39: /* Handle Attribute
! 40: ** ----------------
! 41: */
! 42: /* PUBLIC CONST char * SGML_default = ""; ?? */
! 43:
! 44: #ifdef __STDC__
! 45: PRIVATE void handle_attribute_name(HTSGMLContext context, const char * s)
! 46: #else
! 47: PRIVATE void handle_attribute_name(context, s)
! 48: HTSGMLContext context;
! 49: char *s;
! 50: #endif
! 51: {
! 52: attr* a;
! 53: for( a = context->current_tag->attributes;
! 54: a->name;
! 55: a++) {
! 56: if (0==strcasecomp(a->name, s))
! 57: break;
! 58: }
! 59: if (!a->name) {
! 60: if (TRACE)
! 61: fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
! 62: s, context->current_tag->name);
! 63: context->current_attribute = 0; /* Invalid */
! 64: return;
! 65: }
! 66: a->present = YES;
! 67: if (a->value) {
! 68: free(a->value);
! 69: a->value = 0;
! 70: }
! 71: context->current_attribute = a;
! 72: }
! 73:
! 74:
! 75: /* Handle attribute value
! 76: ** ----------------------
! 77: */
! 78: #ifdef __STDC__
! 79: PRIVATE void handle_attribute_value(HTSGMLContext context, const char * s)
! 80: #else
! 81: PRIVATE void handle_attribute_value(context, s)
! 82: HTSGMLContext context;
! 83: char *s;
! 84: #endif
! 85: {
! 86: if (context->current_attribute) {
! 87: StrAllocCopy(context->current_attribute->value, s);
! 88: } else {
! 89: if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
! 90: }
! 91: context->current_attribute = 0; /* can't have two assignments! */
! 92: }
! 93:
! 94: /* Handle entity
! 95: ** -------------
! 96: **
! 97: ** On entry,
! 98: ** s contains the entity name zero terminated
! 99: ** Bugs:
! 100: ** If the entity name is unknown, the terminator is treated as
! 101: ** a printable non-special character in all cases, even if it is '<'
! 102: */
! 103: #ifdef __STDC__
! 104: PRIVATE void handle_entity(HTSGMLContext context, char term)
! 105: #else
! 106: PRIVATE void handle_entity(context, term)
! 107: HTSGMLContext context;
! 108: char term;
! 109: #endif
! 110: {
! 111: entity * e;
! 112: entity * entities = context->dtd->entities;
! 113: CONST char *s = context->string->data;
! 114:
! 115: for(e = entities; e->name; e++) {
! 116: if (0==strcmp(e->name, s)) {
! 117: char * p;
! 118: for (p=e->representation; *p; p++) {
! 119: (*context->contents_treatment)(context->callerData, *p);
! 120: }
! 121: return; /* Good */
! 122: }
! 123: }
! 124: /* If entity string not found, display as text */
! 125: if (TRACE)
! 126: fprintf(stderr, "SGML: Unknown entity %s\n", s);
! 127: (*context->contents_treatment)(context->callerData, '&');
! 128: {
! 129: CONST char *p;
! 130: for (p=s; *p; p++) {
! 131: (*context->contents_treatment)(context->callerData, *p);
! 132: }
! 133: }
! 134: (*context->contents_treatment)(context->callerData, term);
! 135: }
! 136:
! 137: /* End element
! 138: */
! 139: #ifdef __STDC__
! 140: PRIVATE void end_element(HTSGMLContext context, HTTag * old_tag)
! 141: #else
! 142: PRIVATE void end_element(context, old_tag)
! 143: HTTag * old_tag;
! 144: HTSGMLContext context;
! 145: #endif
! 146: {
! 147: if (TRACE) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
! 148: if (!old_tag->end) {
! 149: if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
! 150: old_tag->name);
! 151: return;
! 152: }
! 153: while (context->element_stack) {/* Loop is error path only */
! 154: HTElement * N = context->element_stack;
! 155: HTTag * t = N->tag;
! 156:
! 157: if (old_tag != t) { /* Mismatch: syntax error */
! 158: if (context->element_stack->next) { /* This is not the last level */
! 159: if (TRACE) fprintf(stderr,
! 160: "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
! 161: old_tag->name, t->name, t->name);
! 162: } else { /* last level */
! 163: if (TRACE) fprintf(stderr,
! 164: "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
! 165: old_tag->name, t->name, old_tag->name);
! 166: return; /* Ignore */
! 167: }
! 168: }
! 169:
! 170: context->element_stack = N->next; /* Remove from stack */
! 171: free(N);
! 172: (t->end)(context->callerData,
! 173: t,
! 174: context->element_stack); /* Assume tag end */
! 175: if (context->element_stack) /* not end of document */
! 176: context->contents_treatment = context->element_stack->tag->treat;
! 177: if (old_tag == t) return; /* Correct sequence */
! 178:
! 179: /* Syntax error path only */
! 180:
! 181: }
! 182: fprintf(stderr,
! 183: "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
! 184: }
! 185:
! 186:
! 187: /* Start a element
! 188: */
! 189: #ifdef __STDC__
! 190: PRIVATE void start_element(HTSGMLContext context)
! 191: #else
! 192: PRIVATE void start_element(context)
! 193: HTSGMLContext context;
! 194: #endif
! 195: {
! 196: HTTag * new_tag = context->current_tag;
! 197:
! 198: if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
! 199: (*new_tag->begin)(context->callerData, new_tag, context->element_stack);
! 200: if (new_tag->end) { /* i.e. tag not empty */
! 201: HTElement * N = (HTElement *)malloc(sizeof(HTElement));
! 202: if (N == NULL) outofmem(__FILE__, "start_element");
! 203: N->next = context->element_stack;
! 204: N->tag = new_tag;
! 205: context->element_stack = N;
! 206: context->contents_treatment = new_tag->treat;
! 207: }
! 208: }
! 209:
! 210: /*________________________________________________________________________
! 211: ** Public Methods
! 212: */
! 213:
! 214: /* Create SGML Engine
! 215: ** ------------------
! 216: **
! 217: ** On entry,
! 218: ** dtd->tags represents the DTD, along with
! 219: ** dtd->entities
! 220: **
! 221: ** default_tag represents the initial and final actions,
! 222: ** and the character processing, for data outside
! 223: ** any tags. May not be empty.
! 224: */
! 225:
! 226: PUBLIC HTSGMLContext SGML_begin ARGS1(SGML_dtd *,dtd)
! 227: {
! 228: HTSGMLContext context = (HTSGMLContext) malloc(sizeof(*context));
! 229: if (!context) outofmem(__FILE__, "SGML_begin");
! 230:
! 231: context->string = HTChunkCreate(128); /* Grow by this much */
! 232: context->dtd = dtd;
! 233: context->state = S_text;
! 234: context->element_stack = 0; /* empty */
! 235: context->callerData = (void*) 0; /* unspcified as yet */
! 236: context->current_tag = dtd->default_tag;
! 237: start_element(context); /* Start document */
! 238: return context;
! 239: }
! 240:
! 241:
! 242: PUBLIC void SGML_end ARGS1(HTSGMLContext, context)
! 243: {
! 244: end_element(context, context->dtd->default_tag); /* End document */
! 245: HTChunkFree(context->string);
! 246: free(context);
! 247: }
! 248:
! 249: /* Read and write user callback handle
! 250: ** -----------------------------------
! 251: **
! 252: ** The callbacks from the SGML parser have an SGML context parameter.
! 253: ** These calls allow the caller to associate his own context with a
! 254: ** particular SGML context.
! 255: */
! 256:
! 257: PUBLIC void* SGML_callerData ARGS1(HTSGMLContext, context)
! 258: {
! 259: return context->callerData;
! 260: }
! 261:
! 262: PUBLIC void SGML_setCallerData ARGS2(HTSGMLContext, context, void*, data)
! 263: {
! 264: context->callerData = data;
! 265: }
! 266:
! 267:
! 268: PUBLIC void SGML_string ARGS2(HTSGMLContext, context, char*, str)
! 269: {
! 270: char *p;
! 271: for(p=str; *p; p++)
! 272: SGML_character(context, *p);
! 273: }
! 274:
! 275: PUBLIC void SGML_character ARGS2(HTSGMLContext, context, char,c)
! 276:
! 277: {
! 278: SGML_dtd *dtd = context->dtd;
! 279: HTChunk *string = context->string;
! 280:
! 281: switch(context->state) {
! 282: case S_text:
! 283: if (c=='&' && !(context->element_stack &&
! 284: context->element_stack->tag &&
! 285: context->element_stack->tag->litteral)) {
! 286: string->size = 0;
! 287: context->state = S_ero;
! 288:
! 289: } else if (c=='<') {
! 290: string->size = 0;
! 291: context->state = (context->element_stack &&
! 292: context->element_stack->tag &&
! 293: context->element_stack->tag->litteral) ?
! 294: S_litteral : S_tag;
! 295: } else (*context->contents_treatment)(context->callerData, c);
! 296: break;
! 297:
! 298: /* In litteral mode, waits only for specific end tag!
! 299: */
! 300: case S_litteral :
! 301: HTChunkPutc(string, c);
! 302: if ( TOUPPER(c) != ((string->size ==1) ? '/'
! 303: : context->element_stack->tag->name[string->size-2])) {
! 304: int i;
! 305:
! 306: /* If complete match, end litteral */
! 307: if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
! 308: end_element(context, context->element_stack->tag);
! 309: string->size = 0;
! 310: context->current_attribute = (attr *) 0;
! 311: context->state = S_text;
! 312: break;
! 313: } /* If Mismatch: recover string. */
! 314: (*context->contents_treatment)(context->callerData, '<');
! 315: for (i=0; i<string->size; i++) /* recover */
! 316: (*context->contents_treatment)(context->callerData,
! 317: string->data[i]);
! 318: context->state = S_text;
! 319: }
! 320:
! 321: break;
! 322:
! 323: /* Character reference or Entity
! 324: */
! 325: case S_ero:
! 326: if (c=='#') {
! 327: context->state = S_cro; /* &# is Char Ref Open */
! 328: break;
! 329: }
! 330: context->state = S_entity; /* Fall through! */
! 331:
! 332: /* Handle Entities
! 333: */
! 334: case S_entity:
! 335: if (isalnum(c))
! 336: HTChunkPutc(string, c);
! 337: else {
! 338: HTChunkTerminate(string);
! 339: handle_entity(context, c);
! 340: context->state = S_text;
! 341: }
! 342: break;
! 343:
! 344: /* Character reference
! 345: */
! 346: case S_cro:
! 347: if (isalnum(c))
! 348: HTChunkPutc(string, c); /* accumulate a character NUMBER */
! 349: else {
! 350: int value;
! 351: HTChunkTerminate(string);
! 352: if (sscanf(string->data, "%d", &value)==1)
! 353: (*context->contents_treatment)(context->callerData,
! 354: FROMASCII((char)value));
! 355: context->state = S_text;
! 356: }
! 357: break;
! 358:
! 359: /* Tag
! 360: */
! 361: case S_tag: /* new tag */
! 362: if (isalnum(c))
! 363: HTChunkPutc(string, c);
! 364: else { /* End of tag name */
! 365: attr * a;
! 366: if (c=='/') {
! 367: if (TRACE) if (string->size!=0)
! 368: fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
! 369: context->state = S_end;
! 370: break;
! 371: }
! 372: HTChunkTerminate(string) ;
! 373: for(context->current_tag = dtd->tags;
! 374: context->current_tag->name; context->current_tag++) {
! 375: if (0==strcasecomp(context->current_tag->name, string->data)) {
! 376: break;
! 377: }
! 378: }
! 379: if (!context->current_tag->name) {
! 380: if(TRACE) fprintf(stderr, "Unknown tag %s\n",
! 381: string->data);
! 382: context->state = (c=='>') ? S_text : S_junk_tag;
! 383: break;
! 384: }
! 385:
! 386: for (a = context->current_tag->attributes; a->name; a++ ) {
! 387: a->present = NO;
! 388: }
! 389: string->size = 0;
! 390: context->current_attribute = (attr *) 0;
! 391:
! 392: if (c=='>') {
! 393: if (context->current_tag->name) start_element(context);
! 394: context->state = S_text;
! 395: } else {
! 396: context->state = S_tag_gap;
! 397: }
! 398: }
! 399: break;
! 400:
! 401:
! 402: case S_tag_gap: /* Expecting attribute or > */
! 403: if (WHITE(c)) break; /* Gap between attributes */
! 404: if (c=='>') { /* End of tag */
! 405: if (context->current_tag->name) start_element(context);
! 406: context->state = S_text;
! 407: break;
! 408: }
! 409: HTChunkPutc(string, c);
! 410: context->state = S_attr; /* Get attribute */
! 411: break;
! 412:
! 413: /* accumulating value */
! 414: case S_attr:
! 415: if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
! 416: HTChunkTerminate(string) ;
! 417: handle_attribute_name(context, string->data);
! 418: string->size = 0;
! 419: if (c=='>') { /* End of tag */
! 420: if (context->current_tag->name) start_element(context);
! 421: context->state = S_text;
! 422: break;
! 423: }
! 424: context->state = (c=='=' ? S_equals: S_attr_gap);
! 425: } else {
! 426: HTChunkPutc(string, c);
! 427: }
! 428: break;
! 429:
! 430: case S_attr_gap: /* Expecting attribute or = or > */
! 431: if (WHITE(c)) break; /* Gap after attribute */
! 432: if (c=='>') { /* End of tag */
! 433: if (context->current_tag->name) start_element(context);
! 434: context->state = S_text;
! 435: break;
! 436: } else if (c=='=') {
! 437: context->state = S_equals;
! 438: break;
! 439: }
! 440: HTChunkPutc(string, c);
! 441: context->state = S_attr; /* Get next attribute */
! 442: break;
! 443:
! 444: case S_equals: /* After attr = */
! 445: if (WHITE(c)) break; /* Before attribute value */
! 446: if (c=='>') { /* End of tag */
! 447: fprintf(stderr, "SGML: found = but no value\n");
! 448: if (context->current_tag->name) start_element(context);
! 449: context->state = S_text;
! 450: break;
! 451:
! 452: } else if (c=='\'') {
! 453: context->state = S_squoted;
! 454: break;
! 455:
! 456: } else if (c=='"') {
! 457: context->state = S_dquoted;
! 458: break;
! 459: }
! 460: HTChunkPutc(string, c);
! 461: context->state = S_value;
! 462: break;
! 463:
! 464: case S_value:
! 465: if (WHITE(c) || (c=='>')) { /* End of word */
! 466: HTChunkTerminate(string) ;
! 467: handle_attribute_value(context, string->data);
! 468: string->size = 0;
! 469: if (c=='>') { /* End of tag */
! 470: if (context->current_tag->name) start_element(context);
! 471: context->state = S_text;
! 472: break;
! 473: }
! 474: else context->state = S_tag_gap;
! 475: } else {
! 476: HTChunkPutc(string, c);
! 477: }
! 478: break;
! 479:
! 480: case S_squoted: /* Quoted attribute value */
! 481: if (c=='\'') { /* End of attribute value */
! 482: HTChunkTerminate(string) ;
! 483: handle_attribute_value(context, string->data);
! 484: string->size = 0;
! 485: context->state = S_tag_gap;
! 486: } else {
! 487: HTChunkPutc(string, c);
! 488: }
! 489: break;
! 490:
! 491: case S_dquoted: /* Quoted attribute value */
! 492: if (c=='"') { /* End of attribute value */
! 493: HTChunkTerminate(string) ;
! 494: handle_attribute_value(context, string->data);
! 495: string->size = 0;
! 496: context->state = S_tag_gap;
! 497: } else {
! 498: HTChunkPutc(string, c);
! 499: }
! 500: break;
! 501:
! 502: case S_end: /* </ */
! 503: if (isalnum(c))
! 504: HTChunkPutc(string, c);
! 505: else { /* End of end tag name */
! 506: HTChunkTerminate(string) ;
! 507: if (c!='>') {
! 508: if (TRACE) fprintf(stderr,"SGML: `</%s%c' found!\n",
! 509: string->data, c);
! 510: context->state = S_junk_tag;
! 511: break;
! 512: }
! 513: for(context->current_tag = dtd->tags;
! 514: context->current_tag->name; context->current_tag++) {
! 515: if (0==strcasecomp(context->current_tag->name, string->data)) {
! 516: end_element( context, context->current_tag);
! 517: break;
! 518: }
! 519: }
! 520: if (!context->current_tag->name) {
! 521: if(TRACE) fprintf(stderr,
! 522: "Unknown end tag </%s>\n", string->data);
! 523: }
! 524: string->size = 0;
! 525: context->current_attribute = (attr *) 0;
! 526: context->state = S_text;
! 527: }
! 528: break;
! 529:
! 530:
! 531: case S_junk_tag:
! 532: if (c=='>') {
! 533: context->state = S_text;
! 534: }
! 535:
! 536: } /* switch on context->state */
! 537:
! 538: } /* SGML_character */
Webmaster