Annotation of libwww/Library/src/HTML.c, revision 1.1.1.1
1.1 timbl 1: /* HTML Parser
2: ** ===========
3: **
4: ** An HTML displayable object has associated with it
5: **
6: ** - The underlying text object for display
7: ** - An SGML parsing context
8: ** - An anchor representing the whole object
9: ** - A style sheet, in the case of a style-oriented version
10: **
11: ** The first three could logically be represented by multiple inheritance if
12: ** that were supported, as an HTML object is like a subclass of all three.
13: **
14: ** In practice in C,
15: **
16: ** - a HText object is created by this module (when needed)
17: ** - an SGML parsing object is created by this module
18: ** - the anchor representing the object is given at creation time
19: **
20: ** Those using structured HTML objects will wish to override this module
21: ** completely
22: */
23: #include "HTML.h"
24:
25: #include <ctype.h>
26: #include <stdio.h>
27:
28: #include "HTAtom.h"
29: #include "HTChunk.h"
30: #include "HText.h"
31: #include "HTStyle.h"
32:
33:
34: /* SPECIAL HTML CODE
35: ** =================
36: */
37:
38: extern HTStyleSheet * styleSheet; /* Application-wide */
39:
40: /* Module-wide style cache
41: */
42: PRIVATE HTStyle * glossary_style;
43: PRIVATE HTStyle * list_compact_style;
44: PRIVATE HTStyle * glossary_compact_style;
45: PRIVATE int got_styles = 0;
46:
47:
48: /* HTML Object
49: ** -----------
50: */
51: struct _HTML {
52: HTParentAnchor * node_anchor;
53: HText * text;
54: HTSGMLContext context;
55:
56: HTChunk title; /* Grow by 128 */
57:
58: /* Used in parsing: */
59:
60: BOOL style_change;
61: HTStyle * new_style;
62: HTStyle * old_style;
63: BOOL in_word; /* Have just had a non-white character */
64: };
65:
66:
67: /* Forward declarations of routines
68: */
69: PRIVATE void get_styles NOPARAMS;
70:
71: /* For dtd: */
72: PRIVATE void no_change PARAMS((void*this, HTTag * t, HTElement * e));
73: PRIVATE void begin_litteral PARAMS((void*this, HTTag * t, HTElement * e));
74: PRIVATE void begin_element PARAMS((void*this, HTTag * t, HTElement * e));
75: PRIVATE void end_element PARAMS((void*this, HTTag * t, HTElement * e));
76: PRIVATE void begin_document PARAMS((void*this, HTTag * t, HTElement * e));
77: PRIVATE void end_document PARAMS((void*this, HTTag * t, HTElement * e));
78: PRIVATE void begin_anchor PARAMS((void*this, HTTag * t, HTElement * e));
79: PRIVATE void end_anchor PARAMS((void*this, HTTag * t, HTElement * e));
80: PRIVATE void begin_list PARAMS((void*this, HTTag * t, HTElement * e));
81: PRIVATE void list_element PARAMS((void*this, HTTag * t, HTElement * e));
82: PRIVATE void end_list PARAMS((void*this, HTTag * t, HTElement * e));
83: PRIVATE void begin_glossary PARAMS((void*this, HTTag * t, HTElement * e));
84: PRIVATE void end_glossary PARAMS((void*this, HTTag * t, HTElement * e));
85:
86: PRIVATE void actually_set_style PARAMS((HTML_id this));
87: PRIVATE void change_style PARAMS((HTML_id this, HTStyle * style));
88:
/*	Style buffering avoids dummy paragraph begin/ends.
**
**	THIS views the generic callback argument `this' as the HTML object.
**
**	UPDATE_STYLE applies a pending style change, if there is one.  It is
**	wrapped in do { } while (0) so that "UPDATE_STYLE;" is exactly one
**	statement: the bare if-block form left a stray empty statement behind
**	and could not be followed by an `else'.
*/
#define THIS ((HTML_id)this)

#define UPDATE_STYLE \
    do { if (THIS->style_change) { actually_set_style(THIS); } } while (0)
94:
95: /* Things affecting the anchor but not the document itself
96: ** -------------------------------------------------------
97: */
98:
99:
100: /* TITLE
101: */
102:
103: /* Accumulate a character of title
104: */
105: static void accumulate_string ARGS2(void *, this, char, c)
106:
107: {
108: HTChunkPutc(&THIS->title, c);
109: }
110:
111:
112: /* Clear the title
113: */
114: PRIVATE void clear_string ARGS3(void *, this, HTTag *,t, HTElement *,e)
115: {
116: HTChunkClear(&THIS->title);
117: }
118:
119: PRIVATE void set_title ARGS3(void *, this, HTTag *,t, HTElement *,e)
120: {
121: HTChunkTerminate(&THIS->title);
122: HTAnchor_setTitle(THIS->node_anchor, THIS->title.data);
123: }
124:
125: PRIVATE void set_index ARGS3(void *, this, HTTag *,t, HTElement *,e)
126: {
127: HTAnchor_setIndex(THIS->node_anchor);
128: }
129:
130: /* Things affecting the document
131: ** -----------------------------
132: */
133: /* Character handling
134: */
135: PRIVATE void pass_character ARGS2(void *, this, char, c)
136: {
137: if (THIS->style_change) {
138: if ((c=='\n') || (c==' ')) return; /* Ignore it */
139: UPDATE_STYLE;
140: }
141: if (c=='\n') {
142: if (THIS->in_word) {
143: HText_appendCharacter(THIS->text, ' ');
144: THIS->in_word = NO;
145: }
146: } else {
147: HText_appendCharacter(THIS->text, c);
148: THIS->in_word = YES;
149: }
150: }
151:
152: PRIVATE void litteral_text ARGS2(void *, this, char, c)
153: {
154: /* We guarrantee that the style is up-to-date in begin_litteral
155: */
156: HText_appendCharacter(THIS->text, c); /* @@@@@ */
157: }
158:
159: PRIVATE void ignore_text ARGS2(void *, this, char, c)
160: {
161: /* Do nothing */
162: }
163:
164: PRIVATE void set_next_id ARGS3(void *, this, HTTag *,t, HTElement *,e)
165: {
166: /* Not needed */
167: }
168:
169: PRIVATE void new_paragraph ARGS3(void *, this, HTTag *,t, HTElement *,e)
170: {
171: UPDATE_STYLE;
172: HText_appendParagraph(THIS->text);
173: THIS->in_word = NO;
174: }
175:
176: PRIVATE void term ARGS3(void *, this, HTTag *,t, HTElement *,e)
177: {
178: if (!THIS->style_change) {
179: HText_appendParagraph(THIS->text);
180: THIS->in_word = NO;
181: }
182: }
183:
184: PRIVATE void definition ARGS3(void *, this, HTTag *,t, HTElement *,e)
185: {
186: UPDATE_STYLE;
187: pass_character(this, '\t'); /* Just tab out one stop */
188: THIS->in_word = NO;
189: }
190:
191: /* Our Static DTD for HTML
192: ** -----------------------
193: */
194:
195: static entity entities[] = {
196: { "lt", "<" },
197: { "gt", ">" },
198: { "amp", "&" },
199: #ifdef NeXT
200: { "bullet" , "\267" }, /* @@@ NeXT only */
201: #endif
202: /* The following accented characters are from peter Flynn, curia project */
203:
204: /* these ifdefs don't solve the problem of a simple terminal emulator
205: ** with a different character set to the client machine. But nothing does,
206: ** except looking at the TERM setting */
207:
208: { "ocus" , "&" }, /* for CURIA */
209: #ifdef IBMPC
210: { "aacute" , "\240" }, /* For PC display */
211: { "eacute" , "\202" },
212: { "iacute" , "\241" },
213: { "oacute" , "\242" },
214: { "uacute" , "\243" },
215: { "Aacute" , "\101" },
216: { "Eacute" , "\220" },
217: { "Iacute" , "\111" },
218: { "Oacute" , "\117" },
219: { "Uacute" , "\125" },
220: #else
221: { "aacute" , "\341" }, /* Works for openwindows -- Peter Flynn */
222: { "eacute" , "\351" },
223: { "iacute" , "\355" },
224: { "oacute" , "\363" },
225: { "uacute" , "\372" },
226: { "Aacute" , "\301" },
227: { "Eacute" , "\310" },
228: { "Iacute" , "\315" },
229: { "Oacute" , "\323" },
230: { "Uacute" , "\332" },
231: #endif
232: { 0, 0 } /* Terminate list */
233: };
234:
235: static attr no_attr[] = {{ 0, 0 , 0}};
236:
237: static attr a_attr[] = { /* Anchor attributes */
238: #define A_ID 0
239: { "NAME", 0, 0 }, /* Should be ID */
240: #define A_TYPE 1
241: { "TYPE", 0, 0 },
242: #define A_HREF 2
243: { "HREF", 0, 0 },
244: { 0, 0 , 0} /* Terminate list */
245: };
246: static attr list_attr[] = {
247: #define LIST_COMPACT 0
248: { "COMPACT", 0, 0 },
249: { 0, 0, 0 } /* Terminate list */
250: };
251:
252: static attr glossary_attr[] = {
253: #define GLOSSARY_COMPACT 0
254: { "COMPACT", 0, 0 },
255: { 0, 0, 0 } /* Terminate list */
256: };
257:
258: static HTTag default_tag =
259: { "DOCUMENT", no_attr , 0, 0, begin_document, pass_character, end_document };
260: /* NAME ATTR STYLE LITERAL? ON_BEGIN ON__CHARACTER ON_END
261: */
262: static HTTag tags[] = {
263: #define TITLE_TAG 0
264: { "TITLE", no_attr, 0, 0, clear_string, accumulate_string, set_title },
265: #define ISINDEX_TAG 1
266: { "ISINDEX", no_attr, 0, 0, set_index, 0 , 0 },
267: #define NEXTID_TAG 2
268: { "NEXTID", no_attr, 0, 0, set_next_id, 0, 0 },
269: #define ADDRESS_TAG 3
270: { "ADDRESS" , no_attr, 0, 0, begin_element, pass_character, end_element },
271: #define H1_TAG 4
272: { "H1" , no_attr, 0, 0, begin_element, pass_character, end_element },
273: { "H2" , no_attr, 0, 0, begin_element, pass_character, end_element },
274: { "H3" , no_attr, 0, 0, begin_element, pass_character, end_element },
275: { "H4" , no_attr, 0, 0, begin_element, pass_character, end_element },
276: { "H5" , no_attr, 0, 0, begin_element, pass_character, end_element },
277: { "H6" , no_attr, 0, 0, begin_element, pass_character, end_element },
278: { "H7" , no_attr, 0, 0, begin_element, pass_character, end_element },
279: #define UL_TAG 11
280: { "UL" , list_attr, 0, 0, begin_list, pass_character, end_list },
281: #define OL_TAG 12
282: { "OL" , list_attr, 0, 0, begin_list, pass_character, end_list },
283: #define MENU_TAG 13
284: { "MENU" , list_attr, 0, 0, begin_list, pass_character, end_list },
285: #define DIR_TAG 14
286: { "DIR" , list_attr, 0, 0, begin_list, pass_character, end_list },
287: #define LI_TAG 15
288: { "LI" , list_attr, 0, 0, list_element, pass_character, 0 },
289: #define DL_TAG 16
290: { "DL" , glossary_attr, 0, 0, begin_glossary, pass_character, end_glossary },
291: { "DT" , no_attr, 0, 0, term, pass_character, 0 },
292: { "DD" , no_attr, 0, 0, definition, pass_character, 0 },
293: { "A" , a_attr, 0, 0, begin_anchor, pass_character, end_anchor },
294: #define P_TAG 20
295: { "P" , no_attr, 0, 0, new_paragraph, pass_character, 0 },
296: #define XMP_TAG 21
297: { "XMP" , no_attr, 0, YES, begin_litteral, litteral_text, end_element },
298: #define PRE_TAG 22
299: { "PRE" , no_attr, 0, 0, begin_litteral, litteral_text, end_element },
300: #define LISTING_TAG 23
301: { "LISTING" , no_attr, 0, YES,begin_litteral, litteral_text, end_element },
302: #define PLAINTEXT_TAG 24
303: { "PLAINTEXT", no_attr, 0, YES, begin_litteral, litteral_text, end_element },
304: #define COMMENT_TAG 25
305: { "COMMENT", no_attr, 0, YES, no_change, ignore_text, no_change },
306: { 0, 0, 0, 0, 0, 0 , 0} /* Terminate list */
307: };
308:
309: PUBLIC SGML_dtd HTML_dtd = { tags, &default_tag, entities };
310:
311:
312: /* Flattening the style structure
313: ** ------------------------------
314: **
315: On the NeXT, and on any read-only browser, it is simpler for the text to have
316: a sequence of styles, rather than a nested tree of styles. In this
317: case we have to flatten the structure as it arrives from SGML tags into
318: a sequence of styles.
319: */
320:
321: /* If style really needs to be set, call this
322: */
323: PRIVATE void actually_set_style ARGS1(HTML_id, this)
324: {
325: if (!THIS->text) { /* First time through */
326: THIS->text = HText_new(THIS->node_anchor);
327: HText_beginAppend(THIS->text);
328: HText_setStyle(THIS->text, THIS->new_style);
329: THIS->in_word = NO;
330: } else {
331: HText_setStyle(THIS->text, THIS->new_style);
332: }
333: THIS->old_style = THIS->new_style;
334: THIS->style_change = NO;
335: }
336:
337: /* If you THINK you need to change style, call this
338: */
339:
340: PRIVATE void change_style ARGS2(HTML_id, this, HTStyle *,style)
341: {
342: if (THIS->new_style!=style) {
343: THIS->style_change = YES /* was old_style == new_style */ ;
344: THIS->new_style = style;
345: }
346: }
347:
348: /* Anchor handling
349: ** ---------------
350: */
351: PRIVATE void begin_anchor ARGS3(void *, this, HTTag *,t, HTElement *,e)
352: {
353: HTChildAnchor * source = HTAnchor_findChildAndLink(
354: THIS->node_anchor, /* parent */
355: a_attr[A_ID].present ? a_attr[A_ID].value : 0, /* Tag */
356: a_attr[A_HREF].present ? a_attr[A_HREF].value : 0, /* Addresss */
357: a_attr[A_TYPE].present ?
358: (HTLinkType*)HTAtom_for(a_attr[A_TYPE].value)
359: : 0);
360:
361: UPDATE_STYLE;
362: HText_beginAnchor(THIS->text, source);
363: }
364:
365: PRIVATE void end_anchor ARGS3(void *, this, HTTag *, t,
366: HTElement *, e)
367: {
368: UPDATE_STYLE;
369: HText_endAnchor(THIS->text);
370: }
371:
372:
373: /* General SGML Element Handling
374: ** -----------------------------
375: */
376: PRIVATE void begin_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
377: {
378: change_style(THIS, (HTStyle*)(t->style));
379: }
380: PRIVATE void no_change ARGS3(void *, this, HTTag *,t, HTElement *,e)
381: {
382: /* Do nothing */;
383: }
384: PRIVATE void begin_litteral ARGS3(void *, this, HTTag *,t, HTElement *,e)
385: {
386: change_style(THIS, t->style);
387: UPDATE_STYLE;
388: }
389: /* End Element
390: **
391: ** When we end an element, the style must be returned to that
392: ** in effect before that element. Note that anchors (etc?)
393: ** don't have an associated style, so that we must scan down the
394: ** stack for an element with a defined style. (In fact, the styles
395: ** should be linked to the whole stack not just the top one.)
396: ** TBL 921119
397: */
398: PRIVATE void end_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
399: {
400: /* if (e) change_style(THIS, e->tag->style); */
401: while (e) {
402: if (e->tag->style) {
403: change_style(THIS, e->tag->style);
404: return;
405: }
406: e = e->next;
407: }
408: }
409:
410: /* Lists
411: */
412: PRIVATE void begin_list ARGS3(void *, this, HTTag *,t, HTElement *,e)
413: {
414: change_style(THIS, list_attr[LIST_COMPACT].present
415: ? list_compact_style
416: : (HTStyle*)(t->style));
417: THIS->in_word = NO;
418: }
419:
420: PRIVATE void end_list ARGS3(void *, this, HTTag *,t, HTElement *,e)
421: {
422: change_style(THIS, e->tag->style);
423: THIS->in_word = NO;
424: }
425:
426: PRIVATE void list_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
427: {
428: UPDATE_STYLE;
429: if (e->tag != &tags[DIR_TAG])
430: HText_appendParagraph(THIS->text);
431: else
432: HText_appendCharacter(THIS->text, '\t'); /* Tab @@ nl for UL? */
433: THIS->in_word = NO;
434: }
435:
436:
437: PRIVATE void begin_glossary ARGS3(void *, this, HTTag *,t, HTElement *,e)
438: {
439: change_style(THIS, glossary_attr[GLOSSARY_COMPACT].present
440: ? glossary_compact_style
441: : glossary_style);
442: THIS->in_word = NO;
443: }
444:
445: PRIVATE void end_glossary ARGS3(void *, this, HTTag *,t, HTElement *,e)
446: {
447: change_style(THIS, e->tag->style);
448: THIS->in_word = NO;
449: }
450:
451:
452: /* Create an HTML object
453: ** ---------------------
454: */
455: PUBLIC HTML_id HTML_new ARGS1(HTParentAnchor *,anchor)
456: {
457:
458: HTML_id this = malloc(sizeof(*this));
459:
460: if (!got_styles) get_styles();
461:
462: this->node_anchor = anchor;
463: this->title.size = 0;
464: this->title.growby = 128;
465: this->title.allocated = 0;
466: this->title.data = 0;
467: this->text = 0;
468: this->style_change = YES; /* Force check leading to text creation */
469: this->new_style = this->old_style = 0;
470:
471: this->context = SGML_begin(&HTML_dtd);
472: SGML_setCallerData(this->context, this);
473:
474: return this;
475: }
476:
477:
478: /* Free an HTML object
479: ** -------------------
480: **
481: ** Note that the SGML parsing context is freed, but the created object is not,
482: ** as it takes on an existence of its own unless explicitly freed.
483: */
484: PUBLIC void HTML_free ARGS1(HTML_id, this)
485: {
486: SGML_end(this->context);
487: free(this);
488: }
489:
490: PUBLIC HTSGMLContext HTML_SGMLContext ARGS1(HTML_id, this)
491: {
492: return this->context;
493: }
494:
495: PRIVATE void begin_document ARGS3(void *, this, HTTag *, t, HTElement *, e)
496: {
497: /* Can't do much, THIS is undefined here */
498: }
499:
500: PRIVATE void end_document ARGS3(void *, this, HTTag *, t, HTElement *, e)
501: /* If the document is empty, the text object will not yet exist.
502: So we could in fact abandon creating the document and return
503: an error code. In fact an empty document is an important type
504: of document, so we don't.
505: */
506: {
507: UPDATE_STYLE; /* Create empty document here! */
508: HText_endAppend(THIS->text);
509:
510: }
511:
512: /* Get Styles from style sheet
513: ** ---------------------------
514: */
515: PRIVATE void get_styles NOARGS
516: {
517: got_styles = YES;
518:
519: tags[P_TAG].style =
520: default_tag.style = HTStyleNamed(styleSheet, "Normal");
521: tags[H1_TAG].style = HTStyleNamed(styleSheet, "Heading1");
522: tags[H1_TAG+1].style = HTStyleNamed(styleSheet, "Heading2");
523: tags[H1_TAG+2].style = HTStyleNamed(styleSheet, "Heading3");
524: tags[H1_TAG+3].style = HTStyleNamed(styleSheet, "Heading4");
525: tags[H1_TAG+4].style = HTStyleNamed(styleSheet, "Heading5");
526: tags[H1_TAG+5].style = HTStyleNamed(styleSheet, "Heading6");
527: tags[H1_TAG+6].style = HTStyleNamed(styleSheet, "Heading7");
528: tags[DL_TAG].style = HTStyleNamed(styleSheet, "Glossary");
529: tags[UL_TAG].style = HTStyleNamed(styleSheet, "List");
530: tags[OL_TAG].style = HTStyleNamed(styleSheet, "List");
531: tags[MENU_TAG].style = HTStyleNamed(styleSheet, "Menu");
532: list_compact_style =
533: tags[DIR_TAG].style = HTStyleNamed(styleSheet, "Dir");
534: glossary_style = HTStyleNamed(styleSheet, "Glossary");
535: glossary_compact_style = HTStyleNamed(styleSheet, "GlossaryCompact");
536: tags[ADDRESS_TAG].style= HTStyleNamed(styleSheet, "Address");
537: tags[PLAINTEXT_TAG].style =
538: tags[XMP_TAG].style = HTStyleNamed(styleSheet, "Example");
539: tags[PRE_TAG].style = HTStyleNamed(styleSheet, "Preformatted");
540: tags[LISTING_TAG].style = HTStyleNamed(styleSheet, "Listing");
541: }
542:
543:
544: /* Parse an HTML file
545: ** ------------------
546: **
547: ** This version takes a pointer to the routine to call
548: ** to get each character.
549: */
550: BOOL HTML_Parse
551: #ifdef __STDC__
552: (HTParentAnchor * anchor, char (*next_char)() )
553: #else
554: (anchor, next_char)
555: HTParentAnchor * anchor;
556: char (*next_char)();
557: #endif
558: {
559: HTSGMLContext context;
560: HTML_id this = HTML_new(anchor);
561: context = SGML_begin(&HTML_dtd);
562: SGML_setCallerData(context, this);
563: for(;;) {
564: char character;
565: character = (*next_char)();
566: if (character == (char)EOF) break;
567:
568: SGML_character(context, character);
569: }
570: SGML_end(context);
571: free(this);
572: return YES;
573: }
Webmaster