Annotation of libwww/Library/src/HTML.c, revision 1.1
1.1 ! timbl 1: /* HTML Parser
! 2: ** ===========
! 3: **
! 4: ** An HTML displayable object has associated with it
! 5: **
! 6: ** - The underlying text object for display
! 7: ** - An SGML parsing context
! 8: ** - An anchor representing the whole object
! 9: ** - A style sheet, in the case of a style-oriented version
! 10: **
! 11: ** The first three could logically be represented by multiple inheritance if
! 12: ** that were supported, as an HTML object is like a subclass of all three.
! 13: **
! 14: ** In practice in C,
! 15: **
! 16: ** - a HText object is created by this module (when needed)
! 17: ** - an SGML parsing object is created by this module
! 18: ** - the anchor representing the object is given at creation time
! 19: **
! 20: ** Those using structured HTML objects will wish to override this module
! 21: ** completely
! 22: */
! 23: #include "HTML.h"
! 24:
! 25: #include <ctype.h>
! 26: #include <stdio.h>
! 27:
! 28: #include "HTAtom.h"
! 29: #include "HTChunk.h"
! 30: #include "HText.h"
! 31: #include "HTStyle.h"
! 32:
! 33:
! 34: /* SPECIAL HTML CODE
! 35: ** =================
! 36: */
! 37:
! 38: extern HTStyleSheet * styleSheet; /* Application-wide */
! 39:
! 40: /* Module-wide style cache
! 41: */
! 42: PRIVATE HTStyle * glossary_style;
! 43: PRIVATE HTStyle * list_compact_style;
! 44: PRIVATE HTStyle * glossary_compact_style;
! 45: PRIVATE int got_styles = 0;
! 46:
! 47:
! 48: /* HTML Object
! 49: ** -----------
! 50: */
! 51: struct _HTML {
! 52: HTParentAnchor * node_anchor;
! 53: HText * text;
! 54: HTSGMLContext context;
! 55:
! 56: HTChunk title; /* Grow by 128 */
! 57:
! 58: /* Used in parsing: */
! 59:
! 60: BOOL style_change;
! 61: HTStyle * new_style;
! 62: HTStyle * old_style;
! 63: BOOL in_word; /* Have just had a non-white character */
! 64: };
! 65:
! 66:
! 67: /* Forward declarations of routines
! 68: */
! 69: PRIVATE void get_styles NOPARAMS;
! 70:
! 71: /* For dtd: */
! 72: PRIVATE void no_change PARAMS((void*this, HTTag * t, HTElement * e));
! 73: PRIVATE void begin_litteral PARAMS((void*this, HTTag * t, HTElement * e));
! 74: PRIVATE void begin_element PARAMS((void*this, HTTag * t, HTElement * e));
! 75: PRIVATE void end_element PARAMS((void*this, HTTag * t, HTElement * e));
! 76: PRIVATE void begin_document PARAMS((void*this, HTTag * t, HTElement * e));
! 77: PRIVATE void end_document PARAMS((void*this, HTTag * t, HTElement * e));
! 78: PRIVATE void begin_anchor PARAMS((void*this, HTTag * t, HTElement * e));
! 79: PRIVATE void end_anchor PARAMS((void*this, HTTag * t, HTElement * e));
! 80: PRIVATE void begin_list PARAMS((void*this, HTTag * t, HTElement * e));
! 81: PRIVATE void list_element PARAMS((void*this, HTTag * t, HTElement * e));
! 82: PRIVATE void end_list PARAMS((void*this, HTTag * t, HTElement * e));
! 83: PRIVATE void begin_glossary PARAMS((void*this, HTTag * t, HTElement * e));
! 84: PRIVATE void end_glossary PARAMS((void*this, HTTag * t, HTElement * e));
! 85:
! 86: PRIVATE void actually_set_style PARAMS((HTML_id this));
! 87: PRIVATE void change_style PARAMS((HTML_id this, HTStyle * style));
! 88:
/*	View the untyped caller data as the HTML object
**
**	The element and character handlers receive their object as a void *
**	parameter named "this"; THIS casts it to HTML_id.
*/
#define THIS ((HTML_id)this)

/*	Style buffering avoids dummy paragraph begin/ends.
**
**	A requested style is held back (style_change set) and only applied
**	by UPDATE_STYLE when something is about to be put into the text.
**
**	The expansion is wrapped in do { } while (0) so that "UPDATE_STYLE;"
**	is exactly one statement: a bare "if (...) { }" would capture a
**	following "else" if the macro were ever used as the body of an
**	unbraced if.
*/
#define UPDATE_STYLE \
	do { if (THIS->style_change) actually_set_style(THIS); } while (0)
! 94:
! 95: /* Things affecting the anchor but not the document itself
! 96: ** -------------------------------------------------------
! 97: */
! 98:
! 99:
! 100: /* TITLE
! 101: */
! 102:
! 103: /* Accumulate a character of title
! 104: */
! 105: static void accumulate_string ARGS2(void *, this, char, c)
! 106:
! 107: {
! 108: HTChunkPutc(&THIS->title, c);
! 109: }
! 110:
! 111:
! 112: /* Clear the title
! 113: */
! 114: PRIVATE void clear_string ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 115: {
! 116: HTChunkClear(&THIS->title);
! 117: }
! 118:
! 119: PRIVATE void set_title ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 120: {
! 121: HTChunkTerminate(&THIS->title);
! 122: HTAnchor_setTitle(THIS->node_anchor, THIS->title.data);
! 123: }
! 124:
! 125: PRIVATE void set_index ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 126: {
! 127: HTAnchor_setIndex(THIS->node_anchor);
! 128: }
! 129:
! 130: /* Things affecting the document
! 131: ** -----------------------------
! 132: */
! 133: /* Character handling
! 134: */
! 135: PRIVATE void pass_character ARGS2(void *, this, char, c)
! 136: {
! 137: if (THIS->style_change) {
! 138: if ((c=='\n') || (c==' ')) return; /* Ignore it */
! 139: UPDATE_STYLE;
! 140: }
! 141: if (c=='\n') {
! 142: if (THIS->in_word) {
! 143: HText_appendCharacter(THIS->text, ' ');
! 144: THIS->in_word = NO;
! 145: }
! 146: } else {
! 147: HText_appendCharacter(THIS->text, c);
! 148: THIS->in_word = YES;
! 149: }
! 150: }
! 151:
! 152: PRIVATE void litteral_text ARGS2(void *, this, char, c)
! 153: {
! 154: /* We guarrantee that the style is up-to-date in begin_litteral
! 155: */
! 156: HText_appendCharacter(THIS->text, c); /* @@@@@ */
! 157: }
! 158:
! 159: PRIVATE void ignore_text ARGS2(void *, this, char, c)
! 160: {
! 161: /* Do nothing */
! 162: }
! 163:
! 164: PRIVATE void set_next_id ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 165: {
! 166: /* Not needed */
! 167: }
! 168:
! 169: PRIVATE void new_paragraph ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 170: {
! 171: UPDATE_STYLE;
! 172: HText_appendParagraph(THIS->text);
! 173: THIS->in_word = NO;
! 174: }
! 175:
! 176: PRIVATE void term ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 177: {
! 178: if (!THIS->style_change) {
! 179: HText_appendParagraph(THIS->text);
! 180: THIS->in_word = NO;
! 181: }
! 182: }
! 183:
! 184: PRIVATE void definition ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 185: {
! 186: UPDATE_STYLE;
! 187: pass_character(this, '\t'); /* Just tab out one stop */
! 188: THIS->in_word = NO;
! 189: }
! 190:
! 191: /* Our Static DTD for HTML
! 192: ** -----------------------
! 193: */
! 194:
! 195: static entity entities[] = {
! 196: { "lt", "<" },
! 197: { "gt", ">" },
! 198: { "amp", "&" },
! 199: #ifdef NeXT
! 200: { "bullet" , "\267" }, /* @@@ NeXT only */
! 201: #endif
! 202: /* The following accented characters are from peter Flynn, curia project */
! 203:
! 204: /* these ifdefs don't solve the problem of a simple terminal emulator
! 205: ** with a different character set to the client machine. But nothing does,
! 206: ** except looking at the TERM setting */
! 207:
! 208: { "ocus" , "&" }, /* for CURIA */
! 209: #ifdef IBMPC
! 210: { "aacute" , "\240" }, /* For PC display */
! 211: { "eacute" , "\202" },
! 212: { "iacute" , "\241" },
! 213: { "oacute" , "\242" },
! 214: { "uacute" , "\243" },
! 215: { "Aacute" , "\101" },
! 216: { "Eacute" , "\220" },
! 217: { "Iacute" , "\111" },
! 218: { "Oacute" , "\117" },
! 219: { "Uacute" , "\125" },
! 220: #else
! 221: { "aacute" , "\341" }, /* Works for openwindows -- Peter Flynn */
! 222: { "eacute" , "\351" },
! 223: { "iacute" , "\355" },
! 224: { "oacute" , "\363" },
! 225: { "uacute" , "\372" },
! 226: { "Aacute" , "\301" },
! 227: { "Eacute" , "\310" },
! 228: { "Iacute" , "\315" },
! 229: { "Oacute" , "\323" },
! 230: { "Uacute" , "\332" },
! 231: #endif
! 232: { 0, 0 } /* Terminate list */
! 233: };
! 234:
! 235: static attr no_attr[] = {{ 0, 0 , 0}};
! 236:
! 237: static attr a_attr[] = { /* Anchor attributes */
! 238: #define A_ID 0
! 239: { "NAME", 0, 0 }, /* Should be ID */
! 240: #define A_TYPE 1
! 241: { "TYPE", 0, 0 },
! 242: #define A_HREF 2
! 243: { "HREF", 0, 0 },
! 244: { 0, 0 , 0} /* Terminate list */
! 245: };
! 246: static attr list_attr[] = {
! 247: #define LIST_COMPACT 0
! 248: { "COMPACT", 0, 0 },
! 249: { 0, 0, 0 } /* Terminate list */
! 250: };
! 251:
! 252: static attr glossary_attr[] = {
! 253: #define GLOSSARY_COMPACT 0
! 254: { "COMPACT", 0, 0 },
! 255: { 0, 0, 0 } /* Terminate list */
! 256: };
! 257:
! 258: static HTTag default_tag =
! 259: { "DOCUMENT", no_attr , 0, 0, begin_document, pass_character, end_document };
! 260: /* NAME ATTR STYLE LITERAL? ON_BEGIN ON__CHARACTER ON_END
! 261: */
! 262: static HTTag tags[] = {
! 263: #define TITLE_TAG 0
! 264: { "TITLE", no_attr, 0, 0, clear_string, accumulate_string, set_title },
! 265: #define ISINDEX_TAG 1
! 266: { "ISINDEX", no_attr, 0, 0, set_index, 0 , 0 },
! 267: #define NEXTID_TAG 2
! 268: { "NEXTID", no_attr, 0, 0, set_next_id, 0, 0 },
! 269: #define ADDRESS_TAG 3
! 270: { "ADDRESS" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 271: #define H1_TAG 4
! 272: { "H1" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 273: { "H2" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 274: { "H3" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 275: { "H4" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 276: { "H5" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 277: { "H6" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 278: { "H7" , no_attr, 0, 0, begin_element, pass_character, end_element },
! 279: #define UL_TAG 11
! 280: { "UL" , list_attr, 0, 0, begin_list, pass_character, end_list },
! 281: #define OL_TAG 12
! 282: { "OL" , list_attr, 0, 0, begin_list, pass_character, end_list },
! 283: #define MENU_TAG 13
! 284: { "MENU" , list_attr, 0, 0, begin_list, pass_character, end_list },
! 285: #define DIR_TAG 14
! 286: { "DIR" , list_attr, 0, 0, begin_list, pass_character, end_list },
! 287: #define LI_TAG 15
! 288: { "LI" , list_attr, 0, 0, list_element, pass_character, 0 },
! 289: #define DL_TAG 16
! 290: { "DL" , glossary_attr, 0, 0, begin_glossary, pass_character, end_glossary },
! 291: { "DT" , no_attr, 0, 0, term, pass_character, 0 },
! 292: { "DD" , no_attr, 0, 0, definition, pass_character, 0 },
! 293: { "A" , a_attr, 0, 0, begin_anchor, pass_character, end_anchor },
! 294: #define P_TAG 20
! 295: { "P" , no_attr, 0, 0, new_paragraph, pass_character, 0 },
! 296: #define XMP_TAG 21
! 297: { "XMP" , no_attr, 0, YES, begin_litteral, litteral_text, end_element },
! 298: #define PRE_TAG 22
! 299: { "PRE" , no_attr, 0, 0, begin_litteral, litteral_text, end_element },
! 300: #define LISTING_TAG 23
! 301: { "LISTING" , no_attr, 0, YES,begin_litteral, litteral_text, end_element },
! 302: #define PLAINTEXT_TAG 24
! 303: { "PLAINTEXT", no_attr, 0, YES, begin_litteral, litteral_text, end_element },
! 304: #define COMMENT_TAG 25
! 305: { "COMMENT", no_attr, 0, YES, no_change, ignore_text, no_change },
! 306: { 0, 0, 0, 0, 0, 0 , 0} /* Terminate list */
! 307: };
! 308:
! 309: PUBLIC SGML_dtd HTML_dtd = { tags, &default_tag, entities };
! 310:
! 311:
! 312: /* Flattening the style structure
! 313: ** ------------------------------
! 314: **
! 315: On the NeXT, and on any read-only browser, it is simpler for the text to have
! 316: a sequence of styles, rather than a nested tree of styles. In this
! 317: case we have to flatten the structure as it arrives from SGML tags into
! 318: a sequence of styles.
! 319: */
! 320:
! 321: /* If style really needs to be set, call this
! 322: */
! 323: PRIVATE void actually_set_style ARGS1(HTML_id, this)
! 324: {
! 325: if (!THIS->text) { /* First time through */
! 326: THIS->text = HText_new(THIS->node_anchor);
! 327: HText_beginAppend(THIS->text);
! 328: HText_setStyle(THIS->text, THIS->new_style);
! 329: THIS->in_word = NO;
! 330: } else {
! 331: HText_setStyle(THIS->text, THIS->new_style);
! 332: }
! 333: THIS->old_style = THIS->new_style;
! 334: THIS->style_change = NO;
! 335: }
! 336:
! 337: /* If you THINK you need to change style, call this
! 338: */
! 339:
! 340: PRIVATE void change_style ARGS2(HTML_id, this, HTStyle *,style)
! 341: {
! 342: if (THIS->new_style!=style) {
! 343: THIS->style_change = YES /* was old_style == new_style */ ;
! 344: THIS->new_style = style;
! 345: }
! 346: }
! 347:
! 348: /* Anchor handling
! 349: ** ---------------
! 350: */
! 351: PRIVATE void begin_anchor ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 352: {
! 353: HTChildAnchor * source = HTAnchor_findChildAndLink(
! 354: THIS->node_anchor, /* parent */
! 355: a_attr[A_ID].present ? a_attr[A_ID].value : 0, /* Tag */
! 356: a_attr[A_HREF].present ? a_attr[A_HREF].value : 0, /* Addresss */
! 357: a_attr[A_TYPE].present ?
! 358: (HTLinkType*)HTAtom_for(a_attr[A_TYPE].value)
! 359: : 0);
! 360:
! 361: UPDATE_STYLE;
! 362: HText_beginAnchor(THIS->text, source);
! 363: }
! 364:
! 365: PRIVATE void end_anchor ARGS3(void *, this, HTTag *, t,
! 366: HTElement *, e)
! 367: {
! 368: UPDATE_STYLE;
! 369: HText_endAnchor(THIS->text);
! 370: }
! 371:
! 372:
! 373: /* General SGML Element Handling
! 374: ** -----------------------------
! 375: */
! 376: PRIVATE void begin_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 377: {
! 378: change_style(THIS, (HTStyle*)(t->style));
! 379: }
! 380: PRIVATE void no_change ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 381: {
! 382: /* Do nothing */;
! 383: }
! 384: PRIVATE void begin_litteral ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 385: {
! 386: change_style(THIS, t->style);
! 387: UPDATE_STYLE;
! 388: }
! 389: /* End Element
! 390: **
! 391: ** When we end an element, the style must be returned to that
! 392: ** in effect before that element. Note that anchors (etc?)
! 393: ** don't have an associated style, so that we must scan down the
! 394: ** stack for an element with a defined style. (In fact, the styles
! 395: ** should be linked to the whole stack not just the top one.)
! 396: ** TBL 921119
! 397: */
! 398: PRIVATE void end_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 399: {
! 400: /* if (e) change_style(THIS, e->tag->style); */
! 401: while (e) {
! 402: if (e->tag->style) {
! 403: change_style(THIS, e->tag->style);
! 404: return;
! 405: }
! 406: e = e->next;
! 407: }
! 408: }
! 409:
! 410: /* Lists
! 411: */
! 412: PRIVATE void begin_list ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 413: {
! 414: change_style(THIS, list_attr[LIST_COMPACT].present
! 415: ? list_compact_style
! 416: : (HTStyle*)(t->style));
! 417: THIS->in_word = NO;
! 418: }
! 419:
! 420: PRIVATE void end_list ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 421: {
! 422: change_style(THIS, e->tag->style);
! 423: THIS->in_word = NO;
! 424: }
! 425:
! 426: PRIVATE void list_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 427: {
! 428: UPDATE_STYLE;
! 429: if (e->tag != &tags[DIR_TAG])
! 430: HText_appendParagraph(THIS->text);
! 431: else
! 432: HText_appendCharacter(THIS->text, '\t'); /* Tab @@ nl for UL? */
! 433: THIS->in_word = NO;
! 434: }
! 435:
! 436:
! 437: PRIVATE void begin_glossary ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 438: {
! 439: change_style(THIS, glossary_attr[GLOSSARY_COMPACT].present
! 440: ? glossary_compact_style
! 441: : glossary_style);
! 442: THIS->in_word = NO;
! 443: }
! 444:
! 445: PRIVATE void end_glossary ARGS3(void *, this, HTTag *,t, HTElement *,e)
! 446: {
! 447: change_style(THIS, e->tag->style);
! 448: THIS->in_word = NO;
! 449: }
! 450:
! 451:
! 452: /* Create an HTML object
! 453: ** ---------------------
! 454: */
! 455: PUBLIC HTML_id HTML_new ARGS1(HTParentAnchor *,anchor)
! 456: {
! 457:
! 458: HTML_id this = malloc(sizeof(*this));
! 459:
! 460: if (!got_styles) get_styles();
! 461:
! 462: this->node_anchor = anchor;
! 463: this->title.size = 0;
! 464: this->title.growby = 128;
! 465: this->title.allocated = 0;
! 466: this->title.data = 0;
! 467: this->text = 0;
! 468: this->style_change = YES; /* Force check leading to text creation */
! 469: this->new_style = this->old_style = 0;
! 470:
! 471: this->context = SGML_begin(&HTML_dtd);
! 472: SGML_setCallerData(this->context, this);
! 473:
! 474: return this;
! 475: }
! 476:
! 477:
! 478: /* Free an HTML object
! 479: ** -------------------
! 480: **
! 481: ** Note that the SGML parsing context is freed, but the created object is not,
! 482: ** as it takes on an existence of its own unless explicitly freed.
! 483: */
! 484: PUBLIC void HTML_free ARGS1(HTML_id, this)
! 485: {
! 486: SGML_end(this->context);
! 487: free(this);
! 488: }
! 489:
! 490: PUBLIC HTSGMLContext HTML_SGMLContext ARGS1(HTML_id, this)
! 491: {
! 492: return this->context;
! 493: }
! 494:
! 495: PRIVATE void begin_document ARGS3(void *, this, HTTag *, t, HTElement *, e)
! 496: {
! 497: /* Can't do much, THIS is undefined here */
! 498: }
! 499:
! 500: PRIVATE void end_document ARGS3(void *, this, HTTag *, t, HTElement *, e)
! 501: /* If the document is empty, the text object will not yet exist.
! 502: So we could in fact abandon creating the document and return
! 503: an error code. In fact an empty document is an important type
! 504: of document, so we don't.
! 505: */
! 506: {
! 507: UPDATE_STYLE; /* Create empty document here! */
! 508: HText_endAppend(THIS->text);
! 509:
! 510: }
! 511:
! 512: /* Get Styles from style sheet
! 513: ** ---------------------------
! 514: */
! 515: PRIVATE void get_styles NOARGS
! 516: {
! 517: got_styles = YES;
! 518:
! 519: tags[P_TAG].style =
! 520: default_tag.style = HTStyleNamed(styleSheet, "Normal");
! 521: tags[H1_TAG].style = HTStyleNamed(styleSheet, "Heading1");
! 522: tags[H1_TAG+1].style = HTStyleNamed(styleSheet, "Heading2");
! 523: tags[H1_TAG+2].style = HTStyleNamed(styleSheet, "Heading3");
! 524: tags[H1_TAG+3].style = HTStyleNamed(styleSheet, "Heading4");
! 525: tags[H1_TAG+4].style = HTStyleNamed(styleSheet, "Heading5");
! 526: tags[H1_TAG+5].style = HTStyleNamed(styleSheet, "Heading6");
! 527: tags[H1_TAG+6].style = HTStyleNamed(styleSheet, "Heading7");
! 528: tags[DL_TAG].style = HTStyleNamed(styleSheet, "Glossary");
! 529: tags[UL_TAG].style = HTStyleNamed(styleSheet, "List");
! 530: tags[OL_TAG].style = HTStyleNamed(styleSheet, "List");
! 531: tags[MENU_TAG].style = HTStyleNamed(styleSheet, "Menu");
! 532: list_compact_style =
! 533: tags[DIR_TAG].style = HTStyleNamed(styleSheet, "Dir");
! 534: glossary_style = HTStyleNamed(styleSheet, "Glossary");
! 535: glossary_compact_style = HTStyleNamed(styleSheet, "GlossaryCompact");
! 536: tags[ADDRESS_TAG].style= HTStyleNamed(styleSheet, "Address");
! 537: tags[PLAINTEXT_TAG].style =
! 538: tags[XMP_TAG].style = HTStyleNamed(styleSheet, "Example");
! 539: tags[PRE_TAG].style = HTStyleNamed(styleSheet, "Preformatted");
! 540: tags[LISTING_TAG].style = HTStyleNamed(styleSheet, "Listing");
! 541: }
! 542:
! 543:
! 544: /* Parse an HTML file
! 545: ** ------------------
! 546: **
! 547: ** This version takes a pointer to the routine to call
! 548: ** to get each character.
! 549: */
! 550: BOOL HTML_Parse
! 551: #ifdef __STDC__
! 552: (HTParentAnchor * anchor, char (*next_char)() )
! 553: #else
! 554: (anchor, next_char)
! 555: HTParentAnchor * anchor;
! 556: char (*next_char)();
! 557: #endif
! 558: {
! 559: HTSGMLContext context;
! 560: HTML_id this = HTML_new(anchor);
! 561: context = SGML_begin(&HTML_dtd);
! 562: SGML_setCallerData(context, this);
! 563: for(;;) {
! 564: char character;
! 565: character = (*next_char)();
! 566: if (character == (char)EOF) break;
! 567:
! 568: SGML_character(context, character);
! 569: }
! 570: SGML_end(context);
! 571: free(this);
! 572: return YES;
! 573: }
Webmaster