Annotation of XML/HTMLparser.c, revision 1.7
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
10: #define HAVE_FCNTL_H
11: #include <io.h>
12: #else
13: #include <config.h>
14: #endif
15: #include <stdio.h>
16: #include <ctype.h>
17: #include <string.h> /* for memset() only */
18: #include <stdlib.h>
19: #include <sys/stat.h>
20: #ifdef HAVE_FCNTL_H
21: #include <fcntl.h>
22: #endif
23: #ifdef HAVE_UNISTD_H
24: #include <unistd.h>
25: #endif
26: #ifdef HAVE_ZLIB_H
27: #include <zlib.h>
28: #endif
29:
30: #include "tree.h"
31: #include "HTMLparser.h"
32: #include "entities.h"
33: #include "encoding.h"
34: #include "valid.h"
35: #include "parserInternals.h"
1.5 daniel 36: #include "xmlIO.h"
37:
38: #define HTML_MAX_NAMELEN 1000
39: #define INPUT_CHUNK 50
1.1 daniel 40:
41: /* #define DEBUG */
42:
43: /************************************************************************
44: * *
45: * Parser stacks related functions and macros *
46: * *
47: ************************************************************************/
48:
49: /*
50: * Generic function for accessing stacks in the Parser Context
51: */
52:
53: #define PUSH_AND_POP(type, name) \
54: int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
55: if (ctxt->name##Nr >= ctxt->name##Max) { \
56: ctxt->name##Max *= 2; \
57: ctxt->name##Tab = (void *) realloc(ctxt->name##Tab, \
58: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
59: if (ctxt->name##Tab == NULL) { \
60: fprintf(stderr, "realloc failed !\n"); \
61: exit(1); \
62: } \
63: } \
64: ctxt->name##Tab[ctxt->name##Nr] = value; \
65: ctxt->name = value; \
66: return(ctxt->name##Nr++); \
67: } \
68: type html##name##Pop(htmlParserCtxtPtr ctxt) { \
69: type ret; \
70: if (ctxt->name##Nr <= 0) return(0); \
71: ctxt->name##Nr--; \
72: if (ctxt->name##Nr > 0) \
73: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
74: else \
75: ctxt->name = NULL; \
76: ret = ctxt->name##Tab[ctxt->name##Nr]; \
77: ctxt->name##Tab[ctxt->name##Nr] = 0; \
78: return(ret); \
79: } \
80:
81: PUSH_AND_POP(xmlNodePtr, node)
82:
83: /*
84: * Macros for accessing the content. Those should be used only by the parser,
85: * and not exported.
86: *
87: * Dirty macros, i.e. one need to make assumption on the context to use them
88: *
89: * CUR_PTR return the current pointer to the CHAR to be parsed.
90: * CUR returns the current CHAR value, i.e. a 8 bit value if compiled
91: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
92: * in UNICODE mode. This should be used internally by the parser
93: * only to compare to ASCII values otherwise it would break when
94: * running with UTF-8 encoding.
95: * NXT(n) returns the n'th next CHAR. Same as CUR it should be used only
96: * to compare on ASCII based substring.
97: * UPP(n) returns the n'th next CHAR converted to uppercase. Same as CUR
98: * it should be used only to compare on ASCII based substring.
99: * SKIP(n) Skip n CHAR, and must also be used only to skip ASCII defined
100: * strings within the parser.
101: *
102: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
103: *
104: * CURRENT Returns the current char value, with the full decoding of
105: * UTF-8 if we are using this mode. It returns an int.
106: * NEXT Skip to the next character, this does the proper decoding
107: * in UTF-8 mode. It also pop-up unfinished entities on the fly.
108: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
109: */
110:
/*
 * Raw accessors on the current input buffer.  Every one of them expects a
 * variable named ctxt to be in scope where it is used.
 *
 * SKIP, NXT and CUR_PTR are wrapped in parentheses so that each expands
 * to a single operand: without them SKIP(n) is a bare "+=" assignment and
 * an expression such as SKIP(1) + 1 would advance by two instead.
 * CUR_PTR remains usable as an lvalue.
 */
#define CUR (*ctxt->input->cur)
#define UPPER (toupper(*ctxt->input->cur))
#define SKIP(val) (ctxt->input->cur += (val))
#define NXT(val) (ctxt->input->cur[(val)])
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
#define CUR_PTR (ctxt->input->cur)
#define SHRINK xmlParserInputShrink(ctxt->input)
#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
1.1 daniel 119:
/* Consume every blank character found at the current input position. */
#define SKIP_BLANKS                                                     \
    while (IS_BLANK(*(ctxt->input->cur))) NEXT

#ifndef USE_UTF_8
/* The character at the current input position. */
#define CURRENT (*ctxt->input->cur)

/*
 * Step over one character of input.
 *
 * When the current character is 0 the buffer is first asked to grow; if
 * that yields nothing, xmlPopInput() is called and no character is
 * consumed.  Otherwise the line/column counters are updated (a '\n'
 * starts a new line at column 1), the cursor moves forward by one, and
 * the buffer is asked to grow again if the new current character is 0.
 *
 * NEXT is a bare brace block and SKIP_BLANKS uses it without a trailing
 * semicolon, so its shape must stay a complete statement on its own.
 */
#define NEXT {                                                          \
    if ((*ctxt->input->cur != 0) ||                                     \
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) > 0)) {           \
        if (*(ctxt->input->cur) != '\n') {                              \
            ctxt->input->col++;                                         \
        } else {                                                        \
            ctxt->input->line++;                                        \
            ctxt->input->col = 1;                                       \
        }                                                               \
        ctxt->input->cur++;                                             \
        if (*ctxt->input->cur == 0)                                     \
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);               \
    } else {                                                            \
        xmlPopInput(ctxt);                                              \
    }}

#else
/* No CURRENT / NEXT definitions are provided when USE_UTF_8 is set. */
#endif
154:
155:
1.5 daniel 156:
1.1 daniel 157: /************************************************************************
158: * *
159: * The list of HTML elements and their properties *
160: * *
161: ************************************************************************/
162:
163: /*
164: * Start Tag: 1 means the start tag can be ommited
165: * End Tag: 1 means the end tag can be ommited
166: * 2 means it's forbidden (empty elements)
167: * Depr: this element is deprecated
168: * DTD: 1 means that this element is valid only in the Loose DTD
169: * 2 means that this element is valid only in the Frameset DTD
170: *
171: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
172: */
173: htmlElemDesc html40ElementTable[] = {
174: { "A", 0, 0, 0, 0, 0, "anchor " },
175: { "ABBR", 0, 0, 0, 0, 0, "abbreviated form" },
176: { "ACRONYM", 0, 0, 0, 0, 0, "" },
177: { "ADDRESS", 0, 0, 0, 0, 0, "information on author " },
178: { "APPLET", 0, 0, 0, 1, 1, "Java applet " },
179: { "AREA", 0, 2, 1, 0, 0, "client-side image map area " },
180: { "B", 0, 0, 0, 0, 0, "bold text style" },
181: { "BASE", 0, 2, 1, 0, 0, "document base URI " },
182: { "BASEFONT", 0, 2, 1, 1, 1, "base font size " },
183: { "BDO", 0, 0, 0, 0, 0, "I18N BiDi over-ride " },
184: { "BIG", 0, 0, 0, 0, 0, "large text style" },
185: { "BLOCKQUOTE", 0, 0, 0, 0, 0, "long quotation " },
186: { "BODY", 1, 1, 0, 0, 0, "document body " },
187: { "BR", 0, 2, 1, 0, 0, "forced line break " },
188: { "BUTTON", 0, 0, 0, 0, 0, "push button " },
189: { "CAPTION", 0, 0, 0, 0, 0, "table caption " },
190: { "CENTER", 0, 0, 0, 1, 1, "shorthand for DIV align=center " },
191: { "CITE", 0, 0, 0, 0, 0, "citation" },
192: { "CODE", 0, 0, 0, 0, 0, "computer code fragment" },
193: { "COL", 0, 2, 1, 0, 0, "table column " },
194: { "COLGROUP", 0, 1, 0, 0, 0, "table column group " },
195: { "DD", 0, 1, 0, 0, 0, "definition description " },
196: { "DEL", 0, 0, 0, 0, 0, "deleted text " },
197: { "DFN", 0, 0, 0, 0, 0, "instance definition" },
198: { "DIR", 0, 0, 0, 1, 1, "directory list" },
199: { "DIV", 0, 0, 0, 0, 0, "generic language/style container"},
200: { "DL", 0, 0, 0, 0, 0, "definition list " },
201: { "DT", 0, 1, 0, 0, 0, "definition term " },
202: { "EM", 0, 0, 0, 0, 0, "emphasis" },
203: { "FIELDSET", 0, 0, 0, 0, 0, "form control group " },
204: { "FONT", 0, 0, 0, 1, 1, "local change to font " },
205: { "FORM", 0, 0, 0, 0, 0, "interactive form " },
206: { "FRAME", 0, 2, 1, 0, 2, "subwindow " },
207: { "FRAMESET", 0, 0, 0, 0, 2, "window subdivision" },
208: { "H1", 0, 0, 0, 0, 0, "heading " },
209: { "H2", 0, 0, 0, 0, 0, "heading " },
210: { "H3", 0, 0, 0, 0, 0, "heading " },
211: { "H4", 0, 0, 0, 0, 0, "heading " },
212: { "H5", 0, 0, 0, 0, 0, "heading " },
213: { "H6", 0, 0, 0, 0, 0, "heading " },
214: { "HEAD", 1, 1, 0, 0, 0, "document head " },
215: { "HR", 0, 2, 1, 0, 0, "horizontal rule " },
216: { "HTML", 1, 1, 0, 0, 0, "document root element " },
217: { "I", 0, 0, 0, 0, 0, "italic text style" },
218: { "IFRAME", 0, 0, 0, 0, 1, "inline subwindow " },
219: { "IMG", 0, 2, 1, 0, 0, "Embedded image " },
220: { "INPUT", 0, 2, 1, 0, 0, "form control " },
221: { "INS", 0, 0, 0, 0, 0, "inserted text" },
222: { "ISINDEX", 0, 2, 1, 1, 1, "single line prompt " },
223: { "KBD", 0, 0, 0, 0, 0, "text to be entered by the user" },
224: { "LABEL", 0, 0, 0, 0, 0, "form field label text " },
225: { "LEGEND", 0, 0, 0, 0, 0, "fieldset legend " },
226: { "LI", 0, 1, 0, 0, 0, "list item " },
227: { "LINK", 0, 2, 1, 0, 0, "a media-independent link " },
228: { "MAP", 0, 0, 0, 0, 0, "client-side image map " },
229: { "MENU", 0, 0, 0, 1, 1, "menu list " },
230: { "META", 0, 2, 1, 0, 0, "generic metainformation " },
231: { "NOFRAMES", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
232: { "NOSCRIPT", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
233: { "OBJECT", 0, 0, 0, 0, 0, "generic embedded object " },
234: { "OL", 0, 0, 0, 0, 0, "ordered list " },
235: { "OPTGROUP", 0, 0, 0, 0, 0, "option group " },
236: { "OPTION", 0, 1, 0, 0, 0, "selectable choice " },
237: { "P", 0, 1, 0, 0, 0, "paragraph " },
238: { "PARAM", 0, 2, 1, 0, 0, "named property value " },
239: { "PRE", 0, 0, 0, 0, 0, "preformatted text " },
240: { "Q", 0, 0, 0, 0, 0, "short inline quotation " },
241: { "S", 0, 0, 0, 1, 1, "strike-through text style" },
242: { "SAMP", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
243: { "SCRIPT", 0, 0, 0, 0, 0, "script statements " },
244: { "SELECT", 0, 0, 0, 0, 0, "option selector " },
245: { "SMALL", 0, 0, 0, 0, 0, "small text style" },
246: { "SPAN", 0, 0, 0, 0, 0, "generic language/style container " },
247: { "STRIKE", 0, 0, 0, 1, 1, "strike-through text" },
248: { "STRONG", 0, 0, 0, 0, 0, "strong emphasis" },
249: { "STYLE", 0, 0, 0, 0, 0, "style info " },
250: { "SUB", 0, 0, 0, 0, 0, "subscript" },
251: { "SUP", 0, 0, 0, 0, 0, "superscript " },
252: { "TABLE", 0, 0, 0, 0, 0, " " },
253: { "TBODY", 1, 1, 0, 0, 0, "table body " },
254: { "TD", 0, 1, 0, 0, 0, "table data cell" },
255: { "TEXTAREA", 0, 0, 0, 0, 0, "multi-line text field " },
256: { "TFOOT", 0, 1, 0, 0, 0, "table footer " },
257: { "TH", 0, 1, 0, 0, 0, "table header cell" },
258: { "THEAD", 0, 1, 0, 0, 0, "table header " },
259: { "TITLE", 0, 0, 0, 0, 0, "document title " },
260: { "TR", 0, 1, 0, 0, 0, "table row " },
261: { "TT", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
262: { "U", 0, 0, 0, 1, 1, "underlined text style" },
263: { "UL", 0, 0, 0, 0, 0, "unordered list " },
264: { "VAR", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
265: };
266:
267: /*
268: * start tags that imply the end of a current element
269: * any tag of each line implies the end of the current element if the type of
270: * that element is in the same line
271: */
272: CHAR *htmlEquEnd[] = {
273: "DT", "DD", "LI", "OPTION", NULL,
274: "H1", "H2", "H3", "H4", "H5", "H6", NULL,
275: "OL", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", NULL,
276: NULL
277: };
278: /*
279: * according to the HTML DTD, HR should be added to the 2nd line above, as it
280: * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
281: * because many documents contain rules in headings...
282: */
283:
284: /*
285: * start tags that imply the end of current element
286: */
287: CHAR *htmlStartClose[] = {
288: "FORM", "FORM", "P", "HR", "H1", "H2", "H3", "H4", "H5", "H6",
289: "DL", "UL", "OL", "MENU", "DIR", "ADDRESS", "PRE",
290: "LISTING", "XMP", "HEAD", NULL,
291: "HEAD", "P", NULL,
292: "TITLE", "P", NULL,
293: "BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
294: "LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
295: "PRE", "LISTING", "XMP", "HEAD", NULL,
296: "HR", "P", "HEAD", NULL,
297: "H1", "P", "HEAD", NULL,
298: "H2", "P", "HEAD", NULL,
299: "H3", "P", "HEAD", NULL,
300: "H4", "P", "HEAD", NULL,
301: "H5", "P", "HEAD", NULL,
302: "H6", "P", "HEAD", NULL,
303: "DIR", "P", "HEAD", NULL,
304: "ADDRESS", "P", "HEAD", "UL", NULL,
305: "PRE", "P", "HEAD", "UL", NULL,
306: "LISTING", "P", "HEAD", NULL,
307: "XMP", "P", "HEAD", NULL,
308: "BLOCKQUOTE", "P", "HEAD", NULL,
309: "DL", "P", "DT", "MENU", "DIR", "ADDRESS", "PRE", "LISTING",
310: "XMP", "HEAD", NULL,
311: "DT", "P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
312: "DD", "P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
313: "UL", "P", "HEAD", "OL", "MENU", "DIR", "ADDRESS", "PRE",
314: "LISTING", "XMP", NULL,
315: "OL", "P", "HEAD", "UL", NULL,
316: "MENU", "P", "HEAD", "UL", NULL,
317: "P", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", NULL,
318: "DIV", "P", "HEAD", NULL,
319: "NOSCRIPT", "P", "HEAD", NULL,
320: "CENTER", "FONT", "B", "I", "P", "HEAD", NULL,
321: "A", "A", NULL,
322: "CAPTION", "P", NULL,
323: "COLGROUP", "CAPTION", "COLGROUP", "COL", "P", NULL,
324: "COL", "CAPTION", "COL", "P", NULL,
325: "TABLE", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", "PRE",
326: "LISTING", "XMP", "A", NULL,
327: "TH", "TH", "TD", NULL,
328: "TD", "TH", "TD", NULL,
329: "TR", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", NULL,
330: "THEAD", "CAPTION", "COL", "COLGROUP", NULL,
331: "TFOOT", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
332: "TBODY", NULL,
333: "TBODY", "TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
334: "TFOOT", "TBODY", NULL,
335: "OPTGROUP", "OPTION", NULL,
336: "FIELDSET", "LEGEND", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6",
337: "PRE", "LISTING", "XMP", "A", NULL,
338: NULL
339: };
340:
341: static CHAR** htmlStartCloseIndex[100];
342: static int htmlStartCloseIndexinitialized = 0;
343:
344: /************************************************************************
345: * *
346: * functions to handle HTML specific data *
347: * *
348: ************************************************************************/
349:
350: /**
351: * htmlInitAutoClose:
352: *
353: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
354: *
355: */
356: void
357: htmlInitAutoClose(void) {
358: int index, i = 0;
359:
360: if (htmlStartCloseIndexinitialized) return;
361:
362: for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
363: index = 0;
364: while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
365: htmlStartCloseIndex[index++] = &htmlStartClose[i];
366: while (htmlStartClose[i] != NULL) i++;
367: i++;
368: }
369: }
370:
371: /**
372: * htmlTagLookup:
373: * @tag: The tag name
374: *
375: * Lookup the HTML tag in the ElementTable
376: *
377: * Returns the related htmlElemDescPtr or NULL if not found.
378: */
379: htmlElemDescPtr
380: htmlTagLookup(const CHAR *tag) {
381: int i = 0;
382:
383: for (i = 0; i < (sizeof(html40ElementTable) /
384: sizeof(html40ElementTable[0]));i++) {
385: if (!xmlStrcmp(tag, html40ElementTable[i].name))
386: return(&html40ElementTable[i]);
387: }
388: return(NULL);
389: }
390:
391: /**
392: * htmlCheckAutoClose:
393: * @new: The new tag name
394: * @old: The old tag name
395: *
396: * Checks wether the new tag is one of the registered valid tags for closing old.
397: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
398: *
399: * Returns 0 if no, 1 if yes.
400: */
401: int
402: htmlCheckAutoClose(const CHAR *new, const CHAR *old) {
403: int i, index;
404: CHAR **close;
405:
406: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
407:
408: /* inefficient, but not a big deal */
409: for (index = 0; index < 100;index++) {
410: close = htmlStartCloseIndex[index];
411: if (close == NULL) return(0);
412: if (!xmlStrcmp(*close, new)) break;
413: }
414:
415: i = close - htmlStartClose;
416: i++;
417: while (htmlStartClose[i] != NULL) {
418: if (!xmlStrcmp(htmlStartClose[i], old)) {
419: return(1);
420: }
421: i++;
422: }
423: return(0);
424: }
425:
426: /**
427: * htmlAutoClose:
428: * @ctxt: an HTML parser context
429: * @new: The new tag name
430: *
431: * The HTmL DtD allows a tag to implicitely close other tags.
432: * The list is kept in htmlStartClose array. This function is
433: * called when a new tag has been detected and generates the
434: * appropriates closes if possible/needed.
435: */
436: void
437: htmlAutoClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
438:
439: while ((ctxt->node != NULL) &&
440: (htmlCheckAutoClose(new, ctxt->node->name))) {
441: #ifdef DEBUG
442: printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name);
443: #endif
444: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
445: ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
446: }
447: }
448:
449: /**
450: * htmlAutoCloseOnClose:
451: * @ctxt: an HTML parser context
452: * @new: The new tag name
453: *
454: * The HTmL DtD allows an ending tag to implicitely close other tags.
455: */
456: void
457: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
458: htmlElemDescPtr info;
459:
460: while ((ctxt->node != NULL) &&
461: (xmlStrcmp(new, ctxt->node->name))) {
462: info = htmlTagLookup(ctxt->node->name);
463: if ((info == NULL) || (info->endTag == 1)) {
464: #ifdef DEBUG
465: printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name);
466: #endif
467: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
468: ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
469: } else
470: break;
471: }
472: }
473:
474: /************************************************************************
475: * *
476: * The list of HTML predefined entities *
477: * *
478: ************************************************************************/
479:
480:
481: htmlEntityDesc html40EntitiesTable[] = {
482: /*
483: * the 4 absolute ones,
484: */
485: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
486: { 38, "amp", "ampersand, U+0026 ISOnum" },
1.4 daniel 487: { 39, "apos", "single quote" },
1.1 daniel 488: { 60, "lt", "less-than sign, U+003C ISOnum" },
489: { 62, "gt", "greater-than sign, U+003E ISOnum" },
490:
491: /*
492: * A bunch still in the 128-255 range
493: * Replacing them depend really on the charset used.
494: */
495: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
496: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
497: { 162, "cent", "cent sign, U+00A2 ISOnum" },
498: { 163, "pound","pound sign, U+00A3 ISOnum" },
499: { 164, "curren","currency sign, U+00A4 ISOnum" },
500: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
501: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
502: { 167, "sect", "section sign, U+00A7 ISOnum" },
503: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
504: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
505: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
506: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
507: { 172, "not", "not sign, U+00AC ISOnum" },
508: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
509: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
510: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
511: { 176, "deg", "degree sign, U+00B0 ISOnum" },
512: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
513: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
514: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
515: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
516: { 181, "micro","micro sign, U+00B5 ISOnum" },
517: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 ! daniel 518: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 519: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
520: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
521: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 ! daniel 522: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 523: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
524: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
525: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
526: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
527: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
528: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
529: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
530: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
531: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
532: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
533: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
534: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
535: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
536: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
537: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
538: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
539: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
540: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
541: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
542: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
543: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
544: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
545: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
546: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
547: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
548: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
549: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
550: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 ! daniel 551: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 552: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
553: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
554: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
555: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
556: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
557: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
558: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
559: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
560: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
561: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
562: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
563: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
564: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
565: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
566: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
567: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
568: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
569: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
570: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
571: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
572: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
573: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
574: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
575: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
576: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
577: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
578: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
579: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
580: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
581: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
582: { 247, "divide","division sign, U+00F7 ISOnum" },
583: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
584: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
585: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
586: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
587: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
588: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
589: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
590: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
591:
592: /*
593: * Anything below should really be kept as entities references
594: */
595: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
596:
597: { 913, "Alpha","greek capital letter alpha, U+0391" },
598: { 914, "Beta", "greek capital letter beta, U+0392" },
599: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
600: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
601: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
602: { 918, "Zeta", "greek capital letter zeta, U+0396" },
603: { 919, "Eta", "greek capital letter eta, U+0397" },
604: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
605: { 921, "Iota", "greek capital letter iota, U+0399" },
606: { 922, "Kappa","greek capital letter kappa, U+039A" },
607: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
608: { 924, "Mu", "greek capital letter mu, U+039C" },
609: { 925, "Nu", "greek capital letter nu, U+039D" },
610: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
611: { 927, "Omicron","greek capital letter omicron, U+039F" },
612: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
613: { 929, "Rho", "greek capital letter rho, U+03A1" },
614: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
615: { 932, "Tau", "greek capital letter tau, U+03A4" },
616: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
617: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
618: { 935, "Chi", "greek capital letter chi, U+03A7" },
619: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
620: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
621:
622: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
623: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
624: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
625: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
626: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
627: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
628: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
629: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
630: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
631: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
632: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
633: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
634: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
635: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
636: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
637: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
638: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
639: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
640: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
641: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
642: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
643: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
644: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
645: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
646: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
647: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
648: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
649: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
650:
651: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
652: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
653: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
654: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
655: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
656: { 8260, "frasl","fraction slash, U+2044 NEW" },
657:
1.7 ! daniel 658: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 659: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
660: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
661: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
662: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
663: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
664: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
665: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
666: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
667: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
668: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
669: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
670: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
671: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
672: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
673: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
674:
675:
676: { 8704, "forall","for all, U+2200 ISOtech" },
677: { 8706, "part", "partial differential, U+2202 ISOtech" },
678: { 8707, "exist","there exists, U+2203 ISOtech" },
679: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
680: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
681: { 8712, "isin", "element of, U+2208 ISOtech" },
682: { 8713, "notin","not an element of, U+2209 ISOtech" },
683: { 8715, "ni", "contains as member, U+220B ISOtech" },
684: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
685: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
686: { 8722, "minus","minus sign, U+2212 ISOtech" },
687: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
688: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
689: { 8733, "prop", "proportional to, U+221D ISOtech" },
690: { 8734, "infin","infinity, U+221E ISOtech" },
691: { 8736, "ang", "angle, U+2220 ISOamso" },
692: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
693: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
694: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
695: { 8746, "cup", "union = cup, U+222A ISOtech" },
696: { 8747, "int", "integral, U+222B ISOtech" },
697: { 8756, "there4","therefore, U+2234 ISOtech" },
698: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
699: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
700: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
701: { 8800, "ne", "not equal to, U+2260 ISOtech" },
702: { 8801, "equiv","identical to, U+2261 ISOtech" },
703: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
704: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
705: { 8834, "sub", "subset of, U+2282 ISOtech" },
706: { 8835, "sup", "superset of, U+2283 ISOtech" },
707: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
708: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
709: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
710: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
711: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
712: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
713: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
714: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
715: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
716: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
717: { 8971, "rfloor","right floor, U+230B ISOamsc" },
718: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
719: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
720: { 9674, "loz", "lozenge, U+25CA ISOpub" },
721:
722: { 9824, "spades","black spade suit, U+2660 ISOpub" },
723: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
724: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
725: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
726:
727: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
728: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
729: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
730: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
731: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
732: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
733: { 732, "tilde","small tilde, U+02DC ISOdia" },
734:
735: { 8194, "ensp", "en space, U+2002 ISOpub" },
736: { 8195, "emsp", "em space, U+2003 ISOpub" },
737: { 8201, "thinsp","thin space, U+2009 ISOpub" },
738: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
739: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
740: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
741: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
742: { 8211, "ndash","en dash, U+2013 ISOpub" },
743: { 8212, "mdash","em dash, U+2014 ISOpub" },
744: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
745: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
746: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
747: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
748: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
749: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
750: { 8224, "dagger","dagger, U+2020 ISOpub" },
751: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
752: { 8240, "permil","per mille sign, U+2030 ISOtech" },
753: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1.7 ! daniel 754: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1.1 daniel 755: { 8364, "euro", "euro sign, U+20AC NEW" }
756: };
757:
758: /************************************************************************
759: * *
760: * Commodity functions to handle entities *
761: * *
762: ************************************************************************/
763:
764: /*
765: * Macro used to grow the current buffer.
766: */
767: #define growBuffer(buffer) { \
768: buffer##_size *= 2; \
769: buffer = (CHAR *) realloc(buffer, buffer##_size * sizeof(CHAR)); \
770: if (buffer == NULL) { \
771: perror("realloc failed"); \
772: exit(1); \
773: } \
774: }
775:
776: /**
777: * htmlEntityLookup:
778: * @name: the entity name
779: *
780: * Lookup the given entity in EntitiesTable
781: *
782: * TODO: the linear scan is really ugly, an hash table is really needed.
783: *
784: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
785: */
786: htmlEntityDescPtr
787: htmlEntityLookup(const CHAR *name) {
788: int i;
789:
790: for (i = 0;i < (sizeof(html40EntitiesTable)/
791: sizeof(html40EntitiesTable[0]));i++) {
792: if (!xmlStrcmp(name, html40EntitiesTable[i].name)) {
793: #ifdef DEBUG
794: printf("Found entity %s\n", name);
795: #endif
796: return(&html40EntitiesTable[i]);
797: }
798: }
799: return(NULL);
800: }
801:
802:
803: /**
804: * htmlDecodeEntities:
805: * @ctxt: the parser context
806: * @len: the len to decode (in bytes !), -1 for no size limit
807: * @end: an end marker CHAR, 0 if none
808: * @end2: an end marker CHAR, 0 if none
809: * @end3: an end marker CHAR, 0 if none
810: *
811: * Subtitute the HTML entities by their value
812: *
813: * TODO: once the internal representation will be UTF-8, all entities
814: * will be substituable, in the meantime we only apply the substitution
815: * to the one with values in the 0-255 UNICODE range
816: *
817: * Returns A newly allocated string with the substitution done. The caller
818: * must deallocate it !
819: */
820: CHAR *
821: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
822: CHAR end, CHAR end2, CHAR end3) {
823: CHAR *buffer = NULL;
824: int buffer_size = 0;
825: CHAR *out = NULL;
826: CHAR *name = NULL;
827:
828: CHAR *cur = NULL;
829: htmlEntityDescPtr ent;
1.5 daniel 830: int nbchars = 0;
1.1 daniel 831: unsigned int max = (unsigned int) len;
832:
833: /*
834: * allocate a translation buffer.
835: */
836: buffer_size = 1000;
837: buffer = (CHAR *) malloc(buffer_size * sizeof(CHAR));
838: if (buffer == NULL) {
839: perror("htmlDecodeEntities: malloc failed");
840: return(NULL);
841: }
842: out = buffer;
843:
844: /*
845: * Ok loop until we reach one of the ending char or a size limit.
846: */
1.5 daniel 847: while ((nbchars < max) && (CUR != end) &&
1.1 daniel 848: (CUR != end2) && (CUR != end3)) {
849:
850: if (CUR == '&') {
851: if (NXT(1) == '#') {
852: int val = htmlParseCharRef(ctxt);
853: /* TODO: invalid for UTF-8 variable encoding !!! */
854: *out++ = val;
1.5 daniel 855: nbchars += 3; /* !!!! */
1.1 daniel 856: } else {
857: ent = htmlParseEntityRef(ctxt, &name);
858: if (name != NULL) {
859: if ((ent == NULL) || (ent->value <= 0) ||
860: (ent->value >= 255)) {
861: *out++ = '&';
862: cur = name;
863: while (*cur != 0) {
864: if (out - buffer > buffer_size - 100) {
865: int index = out - buffer;
866:
867: growBuffer(buffer);
868: out = &buffer[index];
869: }
870: *out++ = *cur++;
871: }
872: *out++ = ';';
873: } else {
874: /* TODO: invalid for UTF-8 variable encoding !!! */
875: *out++ = (CHAR)ent->value;
876: if (out - buffer > buffer_size - 100) {
877: int index = out - buffer;
878:
879: growBuffer(buffer);
880: out = &buffer[index];
881: }
882: }
1.5 daniel 883: nbchars += 2 + xmlStrlen(name);
1.1 daniel 884: free(name);
885: }
886: }
887: } else {
888: /* TODO: invalid for UTF-8 , use COPY(out); */
889: *out++ = CUR;
1.5 daniel 890: nbchars++;
1.1 daniel 891: if (out - buffer > buffer_size - 100) {
892: int index = out - buffer;
893:
894: growBuffer(buffer);
895: out = &buffer[index];
896: }
897: NEXT;
898: }
899: }
900: *out++ = 0;
901: return(buffer);
902: }
903:
904:
905: /************************************************************************
906: * *
907: * Commodity functions to handle encodings *
908: * *
909: ************************************************************************/
910:
911: /**
912: * htmlSwitchEncoding:
913: * @ctxt: the parser context
914: * @len: the len of @cur
915: *
916: * change the input functions when discovering the character encoding
917: * of a given entity.
918: *
919: */
920: void
921: htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
922: {
923: switch (enc) {
924: case XML_CHAR_ENCODING_ERROR:
925: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
926: ctxt->sax->error(ctxt->userData, "encoding unknown\n");
927: ctxt->wellFormed = 0;
928: break;
929: case XML_CHAR_ENCODING_NONE:
930: /* let's assume it's UTF-8 without the XML decl */
931: return;
932: case XML_CHAR_ENCODING_UTF8:
933: /* default encoding, no conversion should be needed */
934: return;
935: case XML_CHAR_ENCODING_UTF16LE:
936: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
937: ctxt->sax->error(ctxt->userData,
938: "char encoding UTF16 little endian not supported\n");
939: break;
940: case XML_CHAR_ENCODING_UTF16BE:
941: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
942: ctxt->sax->error(ctxt->userData,
943: "char encoding UTF16 big endian not supported\n");
944: break;
945: case XML_CHAR_ENCODING_UCS4LE:
946: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
947: ctxt->sax->error(ctxt->userData,
948: "char encoding USC4 little endian not supported\n");
949: break;
950: case XML_CHAR_ENCODING_UCS4BE:
951: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
952: ctxt->sax->error(ctxt->userData,
953: "char encoding USC4 big endian not supported\n");
954: break;
955: case XML_CHAR_ENCODING_EBCDIC:
956: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
957: ctxt->sax->error(ctxt->userData,
958: "char encoding EBCDIC not supported\n");
959: break;
960: case XML_CHAR_ENCODING_UCS4_2143:
961: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
962: ctxt->sax->error(ctxt->userData,
963: "char encoding UCS4 2143 not supported\n");
964: break;
965: case XML_CHAR_ENCODING_UCS4_3412:
966: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
967: ctxt->sax->error(ctxt->userData,
968: "char encoding UCS4 3412 not supported\n");
969: break;
970: case XML_CHAR_ENCODING_UCS2:
971: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
972: ctxt->sax->error(ctxt->userData,
973: "char encoding UCS2 not supported\n");
974: break;
975: case XML_CHAR_ENCODING_8859_1:
976: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
977: ctxt->sax->error(ctxt->userData,
978: "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
979: break;
980: case XML_CHAR_ENCODING_8859_2:
981: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
982: ctxt->sax->error(ctxt->userData,
983: "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
984: break;
985: case XML_CHAR_ENCODING_8859_3:
986: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
987: ctxt->sax->error(ctxt->userData,
988: "char encoding ISO_8859_3 not supported\n");
989: break;
990: case XML_CHAR_ENCODING_8859_4:
991: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
992: ctxt->sax->error(ctxt->userData,
993: "char encoding ISO_8859_4 not supported\n");
994: break;
995: case XML_CHAR_ENCODING_8859_5:
996: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
997: ctxt->sax->error(ctxt->userData,
998: "char encoding ISO_8859_5 not supported\n");
999: break;
1000: case XML_CHAR_ENCODING_8859_6:
1001: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1002: ctxt->sax->error(ctxt->userData,
1003: "char encoding ISO_8859_6 not supported\n");
1004: break;
1005: case XML_CHAR_ENCODING_8859_7:
1006: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1007: ctxt->sax->error(ctxt->userData,
1008: "char encoding ISO_8859_7 not supported\n");
1009: break;
1010: case XML_CHAR_ENCODING_8859_8:
1011: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1012: ctxt->sax->error(ctxt->userData,
1013: "char encoding ISO_8859_8 not supported\n");
1014: break;
1015: case XML_CHAR_ENCODING_8859_9:
1016: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1017: ctxt->sax->error(ctxt->userData,
1018: "char encoding ISO_8859_9 not supported\n");
1019: break;
1020: case XML_CHAR_ENCODING_2022_JP:
1021: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1022: ctxt->sax->error(ctxt->userData,
1023: "char encoding ISO-2022-JPnot supported\n");
1024: break;
1025: case XML_CHAR_ENCODING_SHIFT_JIS:
1026: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1027: ctxt->sax->error(ctxt->userData,
1028: "char encoding Shift_JISnot supported\n");
1029: break;
1030: case XML_CHAR_ENCODING_EUC_JP:
1031: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1032: ctxt->sax->error(ctxt->userData,
1033: "char encoding EUC-JPnot supported\n");
1034: break;
1035: }
1036: }
1037:
1038:
1039: /************************************************************************
1040: * *
1041: * Commodity functions, cleanup needed ? *
1042: * *
1043: ************************************************************************/
1044:
1045: /**
1046: * areBlanks:
1047: * @ctxt: an HTML parser context
1048: * @str: a CHAR *
1049: * @len: the size of @str
1050: *
1051: * Is this a sequence of blank chars that one can ignore ?
1052: *
1053: * TODO: to be corrected accodingly to DTD information if available
1054: *
1055: * Returns 1 if ignorable 0 otherwise.
1056: */
1057:
1058: static int areBlanks(htmlParserCtxtPtr ctxt, const CHAR *str, int len) {
1059: int i;
1060: xmlNodePtr lastChild;
1061:
1062: for (i = 0;i < len;i++)
1063: if (!(IS_BLANK(str[i]))) return(0);
1064:
1065: if (CUR != '<') return(0);
1066: if (ctxt->node == NULL) return(0);
1067: lastChild = xmlGetLastChild(ctxt->node);
1068: if (lastChild == NULL) {
1069: if (ctxt->node->content != NULL) return(0);
1070: } else if (xmlNodeIsText(lastChild))
1071: return(0);
1072: return(1);
1073: }
1074:
1075: /**
1076: * htmlHandleEntity:
1077: * @ctxt: an HTML parser context
1078: * @entity: an XML entity pointer.
1079: *
1080: * Default handling of an HTML entity, call the parser with the
1081: * substitution string
1082: */
1083:
1084: void
1085: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1086: int len;
1087:
1088: if (entity->content == NULL) {
1089: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1090: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1091: entity->name);
1092: ctxt->wellFormed = 0;
1093: return;
1094: }
1095: len = xmlStrlen(entity->content);
1096:
1097: /*
1098: * Just handle the content as a set of chars.
1099: */
1100: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1101: ctxt->sax->characters(ctxt->userData, entity->content, len);
1102:
1103: }
1104:
1105: /**
1106: * htmlNewDoc:
1107: * @URI: URI for the dtd, or NULL
1108: * @ExternalID: the external ID of the DTD, or NULL
1109: *
1110: * Returns a new document
1111: */
1112: htmlDocPtr
1113: htmlNewDoc(const CHAR *URI, const CHAR *ExternalID) {
1114: xmlDocPtr cur;
1115:
1116: /*
1117: * Allocate a new document and fill the fields.
1118: */
1119: cur = (xmlDocPtr) malloc(sizeof(xmlDoc));
1120: if (cur == NULL) {
1121: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1122: return(NULL);
1123: }
1124:
1125: cur->type = XML_DOCUMENT_NODE;
1126: cur->version = NULL;
1127: cur->intSubset = NULL;
1128: xmlCreateIntSubset(cur, "HTML", ExternalID, URI);
1129: cur->name = NULL;
1130: cur->root = NULL;
1131: cur->extSubset = NULL;
1132: cur->oldNs = NULL;
1133: cur->encoding = NULL;
1134: cur->standalone = 1;
1135: cur->compression = 0;
1136: #ifndef XML_WITHOUT_CORBA
1137: cur->_private = NULL;
1138: cur->vepv = NULL;
1139: #endif
1140: return(cur);
1141: }
1142:
1143:
1144: /************************************************************************
1145: * *
1146: * The parser itself *
1147: * Relates to http://www.w3.org/TR/html40 *
1148: * *
1149: ************************************************************************/
1150:
1151: /************************************************************************
1152: * *
1153: * The parser itself *
1154: * *
1155: ************************************************************************/
1156:
1157: /**
1158: * htmlParseHTMLName:
1159: * @ctxt: an HTML parser context
1160: *
1161: * parse an HTML tag or attribute name, note that we convert it to uppercase
1162: * since HTML names are not case-sensitive.
1163: *
1164: * Returns the Tag Name parsed or NULL
1165: */
1166:
1167: CHAR *
1168: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1169: CHAR *ret = NULL;
1170: int i = 0;
1171: CHAR loc[100];
1172:
1173: if (!IS_LETTER(CUR) && (CUR != '_') &&
1174: (CUR != ':')) return(NULL);
1175:
1176: while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
1177: if ((CUR >= 0x61) && (CUR <= 0x7a)) loc[i] = CUR - 0x20;
1178: else loc[i] = CUR;
1179: i++;
1180:
1181: NEXT;
1182: }
1183:
1184: ret = xmlStrndup(loc, i);
1185:
1186: return(ret);
1187: }
1188:
1189: /**
1190: * htmlParseName:
1191: * @ctxt: an HTML parser context
1192: *
1193: * parse an HTML name, this routine is case sensistive.
1194: *
1195: * Returns the Name parsed or NULL
1196: */
1197:
1198: CHAR *
1199: htmlParseName(htmlParserCtxtPtr ctxt) {
1.5 daniel 1200: CHAR buf[HTML_MAX_NAMELEN];
1201: int len = 0;
1.1 daniel 1202:
1.5 daniel 1203: GROW;
1204: if (!IS_LETTER(CUR) && (CUR != '_')) {
1205: return(NULL);
1206: }
1.1 daniel 1207:
1208: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1209: (CUR == '.') || (CUR == '-') ||
1210: (CUR == '_') || (CUR == ':') ||
1211: (IS_COMBINING(CUR)) ||
1.5 daniel 1212: (IS_EXTENDER(CUR))) {
1213: buf[len++] = CUR;
1.1 daniel 1214: NEXT;
1.5 daniel 1215: if (len >= HTML_MAX_NAMELEN) {
1216: fprintf(stderr,
1217: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1218: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1219: (CUR == '.') || (CUR == '-') ||
1220: (CUR == '_') || (CUR == ':') ||
1221: (IS_COMBINING(CUR)) ||
1222: (IS_EXTENDER(CUR)))
1223: NEXT;
1224: break;
1225: }
1226: }
1227: return(xmlStrndup(buf, len));
1.1 daniel 1228: }
1229:
1230: /**
1231: * htmlParseHTMLAttribute:
1232: * @ctxt: an HTML parser context
1233: *
1.5 daniel 1234: * parse an HTML attribute value (without quotes).
1.1 daniel 1235: *
1236: * Returns the Nmtoken parsed or NULL
1237: */
1238:
1239: CHAR *
1240: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt) {
1.5 daniel 1241: CHAR buf[HTML_MAX_NAMELEN];
1242: int len = 0;
1.1 daniel 1243:
1.5 daniel 1244: GROW;
1.1 daniel 1245: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1246: (CUR != '&') && (CUR != '>') &&
1.5 daniel 1247: (CUR != '\'') && (CUR != '"')) {
1248: buf[len++] = CUR;
1.1 daniel 1249: NEXT;
1.5 daniel 1250: if (len >= HTML_MAX_NAMELEN) {
1251: fprintf(stderr,
1252: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1253: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1254: (CUR != '&') && (CUR != '>') &&
1255: (CUR != '\'') && (CUR != '"'))
1256: NEXT;
1257: break;
1258: }
1259: }
1260: return(xmlStrndup(buf, len));
1.1 daniel 1261: }
1262:
1263: /**
1264: * htmlParseNmtoken:
1265: * @ctxt: an HTML parser context
1266: *
1267: * parse an HTML Nmtoken.
1268: *
1269: * Returns the Nmtoken parsed or NULL
1270: */
1271:
1272: CHAR *
1273: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.5 daniel 1274: CHAR buf[HTML_MAX_NAMELEN];
1275: int len = 0;
1.1 daniel 1276:
1.5 daniel 1277: GROW;
1.1 daniel 1278: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1279: (CUR == '.') || (CUR == '-') ||
1280: (CUR == '_') || (CUR == ':') ||
1281: (IS_COMBINING(CUR)) ||
1.5 daniel 1282: (IS_EXTENDER(CUR))) {
1283: buf[len++] = CUR;
1.1 daniel 1284: NEXT;
1.5 daniel 1285: if (len >= HTML_MAX_NAMELEN) {
1286: fprintf(stderr,
1287: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1288: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1289: (CUR == '.') || (CUR == '-') ||
1290: (CUR == '_') || (CUR == ':') ||
1291: (IS_COMBINING(CUR)) ||
1292: (IS_EXTENDER(CUR)))
1293: NEXT;
1294: break;
1295: }
1296: }
1297: return(xmlStrndup(buf, len));
1.1 daniel 1298: }
1299:
1300: /**
1301: * htmlParseEntityRef:
1302: * @ctxt: an HTML parser context
1303: * @str: location to store the entity name
1304: *
1305: * parse an HTML ENTITY references
1306: *
1307: * [68] EntityRef ::= '&' Name ';'
1308: *
1309: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1310: * if non-NULL *str will have to be freed by the caller.
1311: */
1312: htmlEntityDescPtr
1313: htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str) {
1314: CHAR *name;
1315: htmlEntityDescPtr ent = NULL;
1316: *str = NULL;
1317:
1318: if (CUR == '&') {
1319: NEXT;
1320: name = htmlParseName(ctxt);
1321: if (name == NULL) {
1322: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1323: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1324: ctxt->wellFormed = 0;
1325: } else {
1.5 daniel 1326: GROW;
1.1 daniel 1327: if (CUR == ';') {
1328: NEXT;
1329: *str = name;
1330:
1331: /*
1332: * Lookup the entity in the table.
1333: */
1334: ent = htmlEntityLookup(name);
1335: } else {
1336: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1337: ctxt->sax->error(ctxt->userData,
1338: "htmlParseEntityRef: expecting ';'\n");
1339: ctxt->wellFormed = 0;
1340: if (ctxt->sax->characters != NULL) {
1341: ctxt->sax->characters(ctxt->userData, "&", 1);
1342: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1343: }
1344: free(name);
1345: }
1346: }
1347: }
1348: return(ent);
1349: }
1350:
1351: /**
1352: * htmlParseAttValue:
1353: * @ctxt: an HTML parser context
1354: *
1355: * parse a value for an attribute
1356: * Note: the parser won't do substitution of entities here, this
1357: * will be handled later in xmlStringGetNodeList, unless it was
1358: * asked for ctxt->replaceEntities != 0
1359: *
1360: * Returns the AttValue parsed or NULL.
1361: */
1362:
1363: CHAR *
1364: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1365: CHAR *ret = NULL;
1366:
1367: if (CUR == '"') {
1368: NEXT;
1369: ret = htmlDecodeEntities(ctxt, -1, '"', '<', 0);
1370: if (CUR == '<') {
1371: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1372: ctxt->sax->error(ctxt->userData,
1373: "Unescaped '<' not allowed in attributes values\n");
1374: ctxt->wellFormed = 0;
1375: }
1376: if (CUR != '"') {
1377: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1378: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1379: ctxt->wellFormed = 0;
1380: } else
1381: NEXT;
1382: } else if (CUR == '\'') {
1383: NEXT;
1384: ret = htmlDecodeEntities(ctxt, -1, '\'', '<', 0);
1385: if (CUR == '<') {
1386: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1387: ctxt->sax->error(ctxt->userData,
1388: "Unescaped '<' not allowed in attributes values\n");
1389: ctxt->wellFormed = 0;
1390: }
1391: if (CUR != '\'') {
1392: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1393: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1394: ctxt->wellFormed = 0;
1395: } else
1396: NEXT;
1397: } else {
1398: /*
1399: * That's an HTMLism, the attribute value may not be quoted
1400: */
1401: ret = htmlParseHTMLAttribute(ctxt);
1402: if (ret == NULL) {
1403: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1404: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1405: ctxt->wellFormed = 0;
1406: }
1407: }
1408:
1409: return(ret);
1410: }
1411:
1412: /**
1413: * htmlParseSystemLiteral:
1414: * @ctxt: an HTML parser context
1415: *
1416: * parse an HTML Literal
1417: *
1418: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1419: *
1420: * Returns the SystemLiteral parsed or NULL
1421: */
1422:
1423: CHAR *
1424: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1425: const CHAR *q;
1426: CHAR *ret = NULL;
1427:
1428: if (CUR == '"') {
1429: NEXT;
1430: q = CUR_PTR;
1431: while ((IS_CHAR(CUR)) && (CUR != '"'))
1432: NEXT;
1433: if (!IS_CHAR(CUR)) {
1434: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1435: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1436: ctxt->wellFormed = 0;
1437: } else {
1438: ret = xmlStrndup(q, CUR_PTR - q);
1439: NEXT;
1440: }
1441: } else if (CUR == '\'') {
1442: NEXT;
1443: q = CUR_PTR;
1444: while ((IS_CHAR(CUR)) && (CUR != '\''))
1445: NEXT;
1446: if (!IS_CHAR(CUR)) {
1447: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1448: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1449: ctxt->wellFormed = 0;
1450: } else {
1451: ret = xmlStrndup(q, CUR_PTR - q);
1452: NEXT;
1453: }
1454: } else {
1455: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1456: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1457: ctxt->wellFormed = 0;
1458: }
1459:
1460: return(ret);
1461: }
1462:
1463: /**
1464: * htmlParsePubidLiteral:
1465: * @ctxt: an HTML parser context
1466: *
1467: * parse an HTML public literal
1468: *
1469: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1470: *
1471: * Returns the PubidLiteral parsed or NULL.
1472: */
1473:
1474: CHAR *
1475: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1476: const CHAR *q;
1477: CHAR *ret = NULL;
1478: /*
1479: * Name ::= (Letter | '_') (NameChar)*
1480: */
1481: if (CUR == '"') {
1482: NEXT;
1483: q = CUR_PTR;
1484: while (IS_PUBIDCHAR(CUR)) NEXT;
1485: if (CUR != '"') {
1486: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1487: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1488: ctxt->wellFormed = 0;
1489: } else {
1490: ret = xmlStrndup(q, CUR_PTR - q);
1491: NEXT;
1492: }
1493: } else if (CUR == '\'') {
1494: NEXT;
1495: q = CUR_PTR;
1496: while ((IS_LETTER(CUR)) && (CUR != '\''))
1497: NEXT;
1498: if (!IS_LETTER(CUR)) {
1499: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1500: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1501: ctxt->wellFormed = 0;
1502: } else {
1503: ret = xmlStrndup(q, CUR_PTR - q);
1504: NEXT;
1505: }
1506: } else {
1507: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1508: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1509: ctxt->wellFormed = 0;
1510: }
1511:
1512: return(ret);
1513: }
1514:
1515: /**
1516: * htmlParseCharData:
1517: * @ctxt: an HTML parser context
1518: * @cdata: int indicating whether we are within a CDATA section
1519: *
1520: * parse a CharData section.
1521: * if we are within a CDATA section ']]>' marks an end of section.
1522: *
1523: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1524: */
1525:
1526: void
1527: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1528: const CHAR *q;
1529:
1530: q = CUR_PTR;
1531: while ((IS_CHAR(CUR)) && (CUR != '<') &&
1532: (CUR != '&')) {
1533: if ((CUR == ']') && (NXT(1) == ']') &&
1534: (NXT(2) == '>')) {
1535: if (cdata) break;
1536: else {
1537: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1538: ctxt->sax->error(ctxt->userData,
1539: "Sequence ']]>' not allowed in content\n");
1540: ctxt->wellFormed = 0;
1541: }
1542: }
1543: NEXT;
1544: }
1545: if (q == CUR_PTR) return;
1546:
1547: /*
1548: * Ok the segment [q CUR_PTR] is to be consumed as chars.
1549: */
1550: if (ctxt->sax != NULL) {
1551: if (areBlanks(ctxt, q, CUR_PTR - q)) {
1552: if (ctxt->sax->ignorableWhitespace != NULL)
1553: ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q);
1554: } else {
1555: if (ctxt->sax->characters != NULL)
1556: ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q);
1557: }
1558: }
1559: }
1560:
1561: /**
1562: * htmlParseExternalID:
1563: * @ctxt: an HTML parser context
1564: * @publicID: a CHAR** receiving PubidLiteral
1565: * @strict: indicate whether we should restrict parsing to only
1566: * production [75], see NOTE below
1567: *
1568: * Parse an External ID or a Public ID
1569: *
1570: * NOTE: Productions [75] and [83] interract badly since [75] can generate
1571: * 'PUBLIC' S PubidLiteral S SystemLiteral
1572: *
1573: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1574: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1575: *
1576: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1577: *
1578: * Returns the function returns SystemLiteral and in the second
1579: * case publicID receives PubidLiteral, is strict is off
1580: * it is possible to return NULL and have publicID set.
1581: */
1582:
1583: CHAR *
1584: htmlParseExternalID(htmlParserCtxtPtr ctxt, CHAR **publicID, int strict) {
1585: CHAR *URI = NULL;
1586:
1587: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1588: (UPP(2) == 'S') && (UPP(3) == 'T') &&
1589: (UPP(4) == 'E') && (UPP(5) == 'M')) {
1590: SKIP(6);
1591: if (!IS_BLANK(CUR)) {
1592: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1593: ctxt->sax->error(ctxt->userData,
1594: "Space required after 'SYSTEM'\n");
1595: ctxt->wellFormed = 0;
1596: }
1597: SKIP_BLANKS;
1598: URI = htmlParseSystemLiteral(ctxt);
1599: if (URI == NULL) {
1600: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1601: ctxt->sax->error(ctxt->userData,
1602: "htmlParseExternalID: SYSTEM, no URI\n");
1603: ctxt->wellFormed = 0;
1604: }
1605: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1606: (UPP(2) == 'B') && (UPP(3) == 'L') &&
1607: (UPP(4) == 'I') && (UPP(5) == 'C')) {
1608: SKIP(6);
1609: if (!IS_BLANK(CUR)) {
1610: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1611: ctxt->sax->error(ctxt->userData,
1612: "Space required after 'PUBLIC'\n");
1613: ctxt->wellFormed = 0;
1614: }
1615: SKIP_BLANKS;
1616: *publicID = htmlParsePubidLiteral(ctxt);
1617: if (*publicID == NULL) {
1618: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1619: ctxt->sax->error(ctxt->userData,
1620: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1621: ctxt->wellFormed = 0;
1622: }
1.5 daniel 1623: SKIP_BLANKS;
1624: if ((CUR == '"') || (CUR == '\'')) {
1625: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 1626: }
1627: }
1628: return(URI);
1629: }
1630:
1631: /**
1632: * htmlParseComment:
1633: * @ctxt: an HTML parser context
1634: * @create: should we create a node, or just skip the content
1635: *
1636: * Parse an XML (SGML) comment <!-- .... -->
1637: *
1638: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1639: */
1640: void
1641: htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
1642: const CHAR *q, *start;
1643: const CHAR *r;
1644: CHAR *val;
1645:
1646: /*
1647: * Check that there is a comment right here.
1648: */
1649: if ((CUR != '<') || (NXT(1) != '!') ||
1650: (NXT(2) != '-') || (NXT(3) != '-')) return;
1651:
1652: SKIP(4);
1653: start = q = CUR_PTR;
1654: NEXT;
1655: r = CUR_PTR;
1656: NEXT;
1657: while (IS_CHAR(CUR) &&
1658: ((CUR == ':') || (CUR != '>') ||
1659: (*r != '-') || (*q != '-'))) {
1660: if ((*r == '-') && (*q == '-')) {
1661: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1662: ctxt->sax->error(ctxt->userData,
1663: "Comment must not contain '--' (double-hyphen)`\n");
1664: ctxt->wellFormed = 0;
1665: }
1666: NEXT;r++;q++;
1667: }
1668: if (!IS_CHAR(CUR)) {
1669: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1670: ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", start);
1671: ctxt->wellFormed = 0;
1672: } else {
1673: NEXT;
1674: if (create) {
1675: val = xmlStrndup(start, q - start);
1676: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL))
1677: ctxt->sax->comment(ctxt->userData, val);
1678: free(val);
1679: }
1680: }
1681: }
1682:
1683: /**
1684: * htmlParseCharRef:
1685: * @ctxt: an HTML parser context
1686: *
1687: * parse Reference declarations
1688: *
1689: * [66] CharRef ::= '&#' [0-9]+ ';' |
1690: * '&#x' [0-9a-fA-F]+ ';'
1691: *
1692: * Returns the value parsed (as an int)
1693: */
1694: int
1695: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1696: int val = 0;
1697:
1698: if ((CUR == '&') && (NXT(1) == '#') &&
1699: (NXT(2) == 'x')) {
1700: SKIP(3);
1701: while (CUR != ';') {
1702: if ((CUR >= '0') && (CUR <= '9'))
1703: val = val * 16 + (CUR - '0');
1704: else if ((CUR >= 'a') && (CUR <= 'f'))
1705: val = val * 16 + (CUR - 'a') + 10;
1706: else if ((CUR >= 'A') && (CUR <= 'F'))
1707: val = val * 16 + (CUR - 'A') + 10;
1708: else {
1709: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1710: ctxt->sax->error(ctxt->userData,
1711: "htmlParseCharRef: invalid hexadecimal value\n");
1712: ctxt->wellFormed = 0;
1713: val = 0;
1714: break;
1715: }
1716: NEXT;
1717: }
1718: if (CUR == ';')
1719: NEXT;
1720: } else if ((CUR == '&') && (NXT(1) == '#')) {
1721: SKIP(2);
1722: while (CUR != ';') {
1723: if ((CUR >= '0') && (CUR <= '9'))
1724: val = val * 10 + (CUR - '0');
1725: else {
1726: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1727: ctxt->sax->error(ctxt->userData,
1728: "htmlParseCharRef: invalid decimal value\n");
1729: ctxt->wellFormed = 0;
1730: val = 0;
1731: break;
1732: }
1733: NEXT;
1734: }
1735: if (CUR == ';')
1736: NEXT;
1737: } else {
1738: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1739: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1740: ctxt->wellFormed = 0;
1741: }
1742: /*
1743: * Check the value IS_CHAR ...
1744: */
1745: if (IS_CHAR(val)) {
1746: return(val);
1747: } else {
1748: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1749: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid CHAR value %d\n",
1750: val);
1751: ctxt->wellFormed = 0;
1752: }
1753: return(0);
1754: }
1755:
1756:
1757: /**
1758: * htmlParseDocTypeDecl :
1759: * @ctxt: an HTML parser context
1760: *
1761: * parse a DOCTYPE declaration
1762: *
1763: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1764: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1765: */
1766:
1767: void
1768: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1769: CHAR *name;
1770: CHAR *ExternalID = NULL;
1771: CHAR *URI = NULL;
1772:
1773: /*
1774: * We know that '<!DOCTYPE' has been detected.
1775: */
1776: SKIP(9);
1777:
1778: SKIP_BLANKS;
1779:
1780: /*
1781: * Parse the DOCTYPE name.
1782: */
1783: name = htmlParseName(ctxt);
1784: if (name == NULL) {
1785: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1786: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
1787: ctxt->wellFormed = 0;
1788: }
1789: /*
1790: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
1791: */
1792:
1793: SKIP_BLANKS;
1794:
1795: /*
1796: * Check for SystemID and ExternalID
1797: */
1.5 daniel 1798: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 1799: SKIP_BLANKS;
1800:
1801: /*
1802: * We should be at the end of the DOCTYPE declaration.
1803: */
1804: if (CUR != '>') {
1805: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1806: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
1807: ctxt->wellFormed = 0;
1808: /* We shouldn't try to resynchronize ... */
1809: } else {
1810: }
1811: NEXT;
1812:
1813: /*
1814: * Create the document accordingly to the DOCTYPE
1815: */
1816: ctxt->myDoc = htmlNewDoc(URI, ExternalID);
1817:
1818: /*
1819: * Cleanup, since we don't use all those identifiers
1820: * TODO : the DOCTYPE if available should be stored !
1821: */
1822: if (URI != NULL) free(URI);
1823: if (ExternalID != NULL) free(ExternalID);
1824: if (name != NULL) free(name);
1825: }
1826:
1827: /**
1828: * htmlParseAttribute:
1829: * @ctxt: an HTML parser context
1830: * @value: a CHAR ** used to store the value of the attribute
1831: *
1832: * parse an attribute
1833: *
1834: * [41] Attribute ::= Name Eq AttValue
1835: *
1836: * [25] Eq ::= S? '=' S?
1837: *
1838: * With namespace:
1839: *
1840: * [NS 11] Attribute ::= QName Eq AttValue
1841: *
1842: * Also the case QName == xmlns:??? is handled independently as a namespace
1843: * definition.
1844: *
1845: * Returns the attribute name, and the value in *value.
1846: */
1847:
1848: CHAR *
1849: htmlParseAttribute(htmlParserCtxtPtr ctxt, CHAR **value) {
1850: CHAR *name, *val;
1851:
1852: *value = NULL;
1853: name = htmlParseName(ctxt);
1854: if (name == NULL) {
1855: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1856: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
1857: ctxt->wellFormed = 0;
1858: return(NULL);
1859: }
1860:
1861: /*
1862: * read the value
1863: */
1864: SKIP_BLANKS;
1865: if (CUR == '=') {
1866: NEXT;
1867: SKIP_BLANKS;
1868: val = htmlParseAttValue(ctxt);
1869: } else {
1870: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1871: ctxt->sax->error(ctxt->userData,
1872: "Specification mandate value for attribute %s\n", name);
1873: ctxt->wellFormed = 0;
1874: return(NULL);
1875: }
1876:
1877: *value = val;
1878: return(name);
1879: }
1880:
1881: /**
1882: * htmlParseStartTag:
1883: * @ctxt: an HTML parser context
1884: *
1885: * parse a start of tag either for rule element or
1886: * EmptyElement. In both case we don't parse the tag closing chars.
1887: *
1888: * [40] STag ::= '<' Name (S Attribute)* S? '>'
1889: *
1890: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1891: *
1892: * With namespace:
1893: *
1894: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
1895: *
1896: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
1897: *
1898: * Returns the element name parsed
1899: */
1900:
1901: CHAR *
1902: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1903: CHAR *name;
1904: CHAR *attname;
1905: CHAR *attvalue;
1906: const CHAR **atts = NULL;
1907: int nbatts = 0;
1908: int maxatts = 0;
1909: int i;
1910:
1911: if (CUR != '<') return(NULL);
1912: NEXT;
1913:
1914: name = htmlParseHTMLName(ctxt);
1915: if (name == NULL) {
1916: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1917: ctxt->sax->error(ctxt->userData,
1918: "htmlParseStartTag: invalid element name\n");
1919: ctxt->wellFormed = 0;
1920: return(NULL);
1921: }
1922:
1923: /*
1924: * Check for auto-closure of HTML elements.
1925: */
1926: htmlAutoClose(ctxt, name);
1927:
1928: /*
1929: * Now parse the attributes, it ends up with the ending
1930: *
1931: * (S Attribute)* S?
1932: */
1933: SKIP_BLANKS;
1934: while ((IS_CHAR(CUR)) &&
1935: (CUR != '>') &&
1936: ((CUR != '/') || (NXT(1) != '>'))) {
1937: const CHAR *q = CUR_PTR;
1938:
1939: attname = htmlParseAttribute(ctxt, &attvalue);
1940: if ((attname != NULL) && (attvalue != NULL)) {
1941: /*
1942: * Well formedness requires at most one declaration of an attribute
1943: */
1944: for (i = 0; i < nbatts;i += 2) {
1945: if (!xmlStrcmp(atts[i], attname)) {
1946: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1947: ctxt->sax->error(ctxt->userData, "Attribute %s redefined\n",
1948: name);
1949: ctxt->wellFormed = 0;
1950: free(attname);
1951: free(attvalue);
1952: break;
1953: }
1954: }
1955:
1956: /*
1957: * Add the pair to atts
1958: */
1959: if (atts == NULL) {
1960: maxatts = 10;
1961: atts = (const CHAR **) malloc(maxatts * sizeof(CHAR *));
1962: if (atts == NULL) {
1963: fprintf(stderr, "malloc of %ld byte failed\n",
1964: maxatts * (long)sizeof(CHAR *));
1965: return(NULL);
1966: }
1967: } else if (nbatts + 2 < maxatts) {
1968: maxatts *= 2;
1969: atts = (const CHAR **) realloc(atts, maxatts * sizeof(CHAR *));
1970: if (atts == NULL) {
1971: fprintf(stderr, "realloc of %ld byte failed\n",
1972: maxatts * (long)sizeof(CHAR *));
1973: return(NULL);
1974: }
1975: }
1976: atts[nbatts++] = attname;
1977: atts[nbatts++] = attvalue;
1978: atts[nbatts] = NULL;
1979: atts[nbatts + 1] = NULL;
1980: }
1981:
1982: SKIP_BLANKS;
1983: if (q == CUR_PTR) {
1984: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1985: ctxt->sax->error(ctxt->userData,
1986: "htmlParseStartTag: problem parsing attributes\n");
1987: ctxt->wellFormed = 0;
1988: break;
1989: }
1990: }
1991:
1992: /*
1993: * SAX: Start of Element !
1994: */
1995: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1996: ctxt->sax->startElement(ctxt->userData, name, atts);
1997:
1998: if (atts != NULL) {
1999: for (i = 0;i < nbatts;i++) free((CHAR *) atts[i]);
2000: free(atts);
2001: }
2002: return(name);
2003: }
2004:
2005: /**
2006: * htmlParseEndTag:
2007: * @ctxt: an HTML parser context
2008: * @tagname: the tag name as parsed in the opening tag.
2009: *
2010: * parse an end of tag
2011: *
2012: * [42] ETag ::= '</' Name S? '>'
2013: *
2014: * With namespace
2015: *
2016: * [NS 9] ETag ::= '</' QName S? '>'
2017: */
2018:
2019: void
2020: htmlParseEndTag(htmlParserCtxtPtr ctxt, const CHAR *tagname) {
2021: CHAR *name;
2022: int i;
2023:
2024: if ((CUR != '<') || (NXT(1) != '/')) {
2025: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2026: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2027: ctxt->wellFormed = 0;
2028: return;
2029: }
2030: SKIP(2);
2031:
2032: name = htmlParseHTMLName(ctxt);
2033:
2034: /*
2035: * We should definitely be at the ending "S? '>'" part
2036: */
2037: SKIP_BLANKS;
2038: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2039: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2040: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2041: ctxt->wellFormed = 0;
2042: } else
2043: NEXT;
2044:
2045: /*
2046: * Check that we are not closing an already closed tag,
2047: * <p><b>...</p></b> is a really common error !
2048: */
2049: for (i = ctxt->nodeNr - 1;i >= 0;i--) {
2050: if ((ctxt->nodeTab[i] != NULL) &&
2051: (!xmlStrcmp(tagname, ctxt->nodeTab[i]->name)))
2052: break;
2053: }
2054: if (i < 0) {
2055: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2056: ctxt->sax->error(ctxt->userData,
2057: "htmlParseEndTag: unexpected close for tag %s\n",
2058: tagname);
1.6 veillard 2059: free(name);
1.1 daniel 2060: ctxt->wellFormed = 0;
2061: return;
2062: }
2063:
2064: /*
2065: * Check for auto-closure of HTML elements.
2066: */
2067: htmlAutoCloseOnClose(ctxt, name);
2068:
2069: /*
2070: * Well formedness constraints, opening and closing must match.
2071: * With the exception that the autoclose may have popped stuff out
2072: * of the stack.
2073: */
2074: if (xmlStrcmp(name, tagname)) {
2075: if ((ctxt->node != NULL) &&
2076: (xmlStrcmp(ctxt->node->name, name))) {
2077: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2078: ctxt->sax->error(ctxt->userData,
2079: "Opening and ending tag mismatch: %s and %s\n",
2080: name, ctxt->node->name);
2081: ctxt->wellFormed = 0;
2082: }
2083: }
2084:
2085: /*
2086: * SAX: End of Tag
2087: */
2088: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2089: ctxt->sax->endElement(ctxt->userData, name);
2090:
2091: if (name != NULL)
2092: free(name);
2093:
2094: return;
2095: }
2096:
2097:
2098: /**
2099: * htmlParseReference:
2100: * @ctxt: an HTML parser context
2101: *
2102: * parse and handle entity references in content,
2103: * this will end-up in a call to character() since this is either a
2104: * CharRef, or a predefined entity.
2105: */
2106: void
2107: htmlParseReference(htmlParserCtxtPtr ctxt) {
2108: htmlEntityDescPtr ent;
2109: CHAR out[2];
2110: CHAR *name;
2111: int val;
2112: if (CUR != '&') return;
2113:
2114: if (NXT(1) == '#') {
2115: val = htmlParseCharRef(ctxt);
2116: /* TODO: invalid for UTF-8 variable encoding !!! */
2117: out[0] = val;
2118: out[1] = 0;
2119: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2120: ctxt->sax->characters(ctxt->userData, out, 1);
2121: } else {
2122: ent = htmlParseEntityRef(ctxt, &name);
2123: if (name == NULL) return; /* Shall we output & anyway ? */
2124: if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2125: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
2126: ctxt->sax->characters(ctxt->userData, "&", 1);
2127: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
2128: ctxt->sax->characters(ctxt->userData, ";", 1);
2129: }
2130: } else {
2131: /* TODO: invalid for UTF-8 variable encoding !!! */
2132: out[0] = ent->value;
2133: out[1] = 0;
2134: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2135: ctxt->sax->characters(ctxt->userData, out, 1);
2136: }
2137: free(name);
2138: }
2139: }
2140:
2141: /**
2142: * htmlParseContent:
2143: * @ctxt: an HTML parser context
2144: * @name: the node name
2145: *
2146: * Parse a content: comment, sub-element, reference or text.
2147: *
2148: */
2149:
2150: void
2151: htmlParseContent(htmlParserCtxtPtr ctxt, const CHAR *name) {
2152: htmlNodePtr currentNode;
2153:
2154: currentNode = ctxt->node;
2155: while ((CUR != '<') || (NXT(1) != '/')) {
2156: const CHAR *test = CUR_PTR;
2157:
2158: /*
2159: * Has this node been popped out during parsing of
2160: * the next element
2161: */
2162: if (currentNode != ctxt->node) return;
2163:
2164: /*
2165: * First case : a comment
2166: */
2167: if ((CUR == '<') && (NXT(1) == '!') &&
2168: (NXT(2) == '-') && (NXT(3) == '-')) {
2169: htmlParseComment(ctxt, 1);
2170: }
2171:
2172: /*
2173: * Second case : a sub-element.
2174: */
2175: else if (CUR == '<') {
2176: htmlParseElement(ctxt);
2177: }
2178:
2179: /*
2180: * Third case : a reference. If if has not been resolved,
2181: * parsing returns it's Name, create the node
2182: */
2183: else if (CUR == '&') {
2184: htmlParseReference(ctxt);
2185: }
2186:
2187: /*
2188: * Last case, text. Note that References are handled directly.
2189: */
2190: else {
2191: htmlParseCharData(ctxt, 0);
2192: }
2193:
2194: if (test == CUR_PTR) {
2195: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2196: ctxt->sax->error(ctxt->userData,
2197: "detected an error in element content\n");
2198: ctxt->wellFormed = 0;
2199: break;
2200: }
1.5 daniel 2201: GROW;
1.1 daniel 2202: }
2203:
2204: /*
2205: * parse the end of tag: '</' should be here.
2206: */
2207: htmlParseEndTag(ctxt, name);
2208: }
2209:
2210: /**
2211: * htmlParseElement:
2212: * @ctxt: an HTML parser context
2213: *
2214: * parse an HTML element, this is highly recursive
2215: *
2216: * [39] element ::= EmptyElemTag | STag content ETag
2217: *
2218: * [41] Attribute ::= Name Eq AttValue
2219: */
2220:
2221: void
2222: htmlParseElement(htmlParserCtxtPtr ctxt) {
2223: const CHAR *openTag = CUR_PTR;
2224: CHAR *name;
2225: htmlParserNodeInfo node_info;
2226: htmlNodePtr currentNode;
2227: htmlElemDescPtr info;
2228:
2229: /* Capture start position */
2230: node_info.begin_pos = CUR_PTR - ctxt->input->base;
2231: node_info.begin_line = ctxt->input->line;
2232:
2233: name = htmlParseStartTag(ctxt);
2234: if (name == NULL) {
2235: return;
2236: }
2237:
2238: /*
2239: * Lookup the info for that element.
2240: */
2241: info = htmlTagLookup(name);
2242: if (info == NULL) {
2243: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2244: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2245: name);
2246: ctxt->wellFormed = 0;
2247: } else if (info->depr) {
2248: /***************************
2249: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2250: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2251: name);
2252: ***************************/
2253: }
2254:
2255: /*
2256: * Check for an Empty Element labelled the XML/SGML way
2257: */
2258: if ((CUR == '/') && (NXT(1) == '>')) {
2259: SKIP(2);
2260: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2261: ctxt->sax->endElement(ctxt->userData, name);
2262: free(name);
2263: return;
2264: }
2265:
1.5 daniel 2266: if (CUR == '>') {
2267: NEXT;
2268: } else {
1.1 daniel 2269: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2270: ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2271: openTag);
2272: ctxt->wellFormed = 0;
2273:
2274: /*
2275: * end of parsing of this node.
2276: */
2277: nodePop(ctxt);
2278: free(name);
2279: return;
2280: }
2281:
2282: /*
2283: * Check for an Empty Element from DTD definition
2284: */
2285: if ((info != NULL) && (info->empty)) {
2286: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2287: ctxt->sax->endElement(ctxt->userData, name);
2288: free(name);
2289: return;
2290: }
2291:
2292: /*
2293: * Parse the content of the element:
2294: */
2295: currentNode = ctxt->node;
2296: htmlParseContent(ctxt, name);
2297:
2298: /*
2299: * check whether the element get popped due to auto closure
2300: * on start tag
2301: */
2302: if (currentNode != ctxt->node) {
2303: free(name);
2304: return;
2305: }
2306:
2307: if (!IS_CHAR(CUR)) {
2308: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2309: ctxt->sax->error(ctxt->userData,
2310: "Premature end of data in tag %.30s\n", openTag);
2311: ctxt->wellFormed = 0;
2312:
2313: /*
2314: * end of parsing of this node.
2315: */
2316: nodePop(ctxt);
2317: free(name);
2318: return;
2319: }
2320:
2321: free(name);
2322: }
2323:
2324: /**
2325: * htmlParseDocument :
2326: * @ctxt: an HTML parser context
2327: *
2328: * parse an HTML document (and build a tree if using the standard SAX
2329: * interface).
2330: *
2331: * Returns 0, -1 in case of error. the parser context is augmented
2332: * as a result of the parsing.
2333: */
2334:
2335: int
2336: htmlParseDocument(htmlParserCtxtPtr ctxt) {
2337: htmlDefaultSAXHandlerInit();
2338: ctxt->html = 1;
2339:
1.5 daniel 2340: GROW;
1.1 daniel 2341: /*
2342: * SAX: beginning of the document processing TODO: update for HTML.
2343: */
2344: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2345: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2346:
2347: /*
2348: * We should check for encoding here and plug-in some
2349: * conversion code TODO !!!!
2350: */
2351:
2352: /*
2353: * Wipe out everything which is before the first '<'
2354: */
2355: if (IS_BLANK(CUR)) {
2356: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2357: ctxt->sax->error(ctxt->userData,
2358: "Extra spaces at the beginning of the document are not allowed\n");
2359: ctxt->wellFormed = 0;
2360: SKIP_BLANKS;
2361: }
2362:
2363: if (CUR == 0) {
2364: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2365: ctxt->sax->error(ctxt->userData, "Document is empty\n");
2366: ctxt->wellFormed = 0;
2367: }
2368:
2369:
2370: /*
2371: * Then possibly doc type declaration(s) and more Misc
2372: * (doctypedecl Misc*)?
2373: */
2374: if ((CUR == '<') && (NXT(1) == '!') &&
2375: (UPP(2) == 'D') && (UPP(3) == 'O') &&
2376: (UPP(4) == 'C') && (UPP(5) == 'T') &&
2377: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2378: (UPP(8) == 'E')) {
2379: htmlParseDocTypeDecl(ctxt);
2380: }
2381: SKIP_BLANKS;
2382:
2383: /*
2384: * Create the document if not done already.
2385: */
2386: if (ctxt->myDoc == NULL) {
2387: ctxt->myDoc = htmlNewDoc(NULL, NULL);
2388: }
2389:
2390: /*
2391: * Time to start parsing the tree itself
2392: */
2393: htmlParseElement(ctxt);
2394:
2395: /*
2396: * SAX: end of the document processing.
2397: */
2398: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2399: ctxt->sax->endDocument(ctxt->userData);
2400: if (! ctxt->wellFormed) return(-1);
2401: return(0);
2402: }
2403:
2404:
2405: /********************************************************************************
2406: * *
2407: * Parser contexts handling *
2408: * *
2409: ********************************************************************************/
2410:
2411: /**
2412: * xmlInitParserCtxt:
2413: * @ctxt: an HTML parser context
2414: *
2415: * Initialize a parser context
2416: */
2417:
2418: void
2419: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2420: {
2421: htmlSAXHandler *sax;
2422:
2423: sax = (htmlSAXHandler *) malloc(sizeof(htmlSAXHandler));
2424: if (sax == NULL) {
2425: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2426: }
2427:
2428: /* Allocate the Input stack */
2429: ctxt->inputTab = (htmlParserInputPtr *) malloc(5 * sizeof(htmlParserInputPtr));
2430: ctxt->inputNr = 0;
2431: ctxt->inputMax = 5;
2432: ctxt->input = NULL;
2433: ctxt->version = NULL;
2434: ctxt->encoding = NULL;
2435: ctxt->standalone = -1;
2436:
2437: /* Allocate the Node stack */
2438: ctxt->nodeTab = (htmlNodePtr *) malloc(10 * sizeof(htmlNodePtr));
2439: ctxt->nodeNr = 0;
2440: ctxt->nodeMax = 10;
2441: ctxt->node = NULL;
2442:
2443: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2444: else {
2445: ctxt->sax = sax;
2446: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2447: }
2448: ctxt->userData = ctxt;
2449: ctxt->myDoc = NULL;
2450: ctxt->wellFormed = 1;
2451: ctxt->replaceEntities = 0;
2452: ctxt->html = 1;
2453: ctxt->record_info = 0;
2454: xmlInitNodeInfoSeq(&ctxt->node_seq);
2455: }
2456:
2457: /**
2458: * htmlFreeParserCtxt:
2459: * @ctxt: an HTML parser context
2460: *
2461: * Free all the memory used by a parser context. However the parsed
2462: * document in ctxt->myDoc is not freed.
2463: */
2464:
2465: void
2466: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2467: {
2468: htmlParserInputPtr input;
2469:
2470: if (ctxt == NULL) return;
2471:
2472: while ((input = inputPop(ctxt)) != NULL) {
2473: xmlFreeInputStream(input);
2474: }
2475:
2476: if (ctxt->nodeTab != NULL) free(ctxt->nodeTab);
2477: if (ctxt->inputTab != NULL) free(ctxt->inputTab);
2478: if (ctxt->version != NULL) free((char *) ctxt->version);
2479: if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
2480: free(ctxt->sax);
2481: free(ctxt);
2482: }
2483:
2484: /**
2485: * htmlCreateDocParserCtxt :
2486: * @cur: a pointer to an array of CHAR
2487: * @encoding: a free form C string describing the HTML document encoding, or NULL
2488: *
2489: * Create a parser context for an HTML document.
2490: *
2491: * Returns the new parser context or NULL
2492: */
2493: htmlParserCtxtPtr
2494: htmlCreateDocParserCtxt(CHAR *cur, const char *encoding) {
2495: htmlParserCtxtPtr ctxt;
2496: htmlParserInputPtr input;
2497: /* htmlCharEncoding enc; */
2498:
2499: ctxt = (htmlParserCtxtPtr) malloc(sizeof(htmlParserCtxt));
2500: if (ctxt == NULL) {
2501: perror("malloc");
2502: return(NULL);
2503: }
2504: htmlInitParserCtxt(ctxt);
2505: input = (htmlParserInputPtr) malloc(sizeof(htmlParserInput));
2506: if (input == NULL) {
2507: perror("malloc");
2508: free(ctxt);
2509: return(NULL);
2510: }
2511:
2512: /*
2513: * plug some encoding conversion routines here. !!!
2514: if (encoding != NULL) {
2515: enc = htmlDetectCharEncoding(cur);
2516: htmlSwitchEncoding(ctxt, enc);
2517: }
2518: */
2519:
2520: input->filename = NULL;
2521: input->line = 1;
2522: input->col = 1;
2523: input->base = cur;
2524: input->cur = cur;
2525: input->free = NULL;
1.5 daniel 2526: input->buf = NULL;
1.1 daniel 2527:
2528: inputPush(ctxt, input);
2529: return(ctxt);
2530: }
2531:
2532: /********************************************************************************
2533: * *
2534: * User entry points *
2535: * *
2536: ********************************************************************************/
2537:
2538: /**
2539: * htmlSAXParseDoc :
2540: * @cur: a pointer to an array of CHAR
2541: * @encoding: a free form C string describing the HTML document encoding, or NULL
2542: * @sax: the SAX handler block
2543: * @userData: if using SAX, this pointer will be provided on callbacks.
2544: *
2545: * parse an HTML in-memory document and build a tree.
2546: * It use the given SAX function block to handle the parsing callback.
2547: * If sax is NULL, fallback to the default DOM tree building routines.
2548: *
2549: * Returns the resulting document tree
2550: */
2551:
2552: htmlDocPtr
2553: htmlSAXParseDoc(CHAR *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
2554: htmlDocPtr ret;
2555: htmlParserCtxtPtr ctxt;
2556:
2557: if (cur == NULL) return(NULL);
2558:
2559:
2560: ctxt = htmlCreateDocParserCtxt(cur, encoding);
2561: if (ctxt == NULL) return(NULL);
2562: if (sax != NULL) {
2563: ctxt->sax = sax;
2564: ctxt->userData = userData;
2565: }
2566:
2567: htmlParseDocument(ctxt);
2568: ret = ctxt->myDoc;
2569: if (sax != NULL) {
2570: ctxt->sax = NULL;
2571: ctxt->userData = NULL;
2572: }
2573: htmlFreeParserCtxt(ctxt);
2574:
2575: return(ret);
2576: }
2577:
2578: /**
2579: * htmlParseDoc :
2580: * @cur: a pointer to an array of CHAR
2581: * @encoding: a free form C string describing the HTML document encoding, or NULL
2582: *
2583: * parse an HTML in-memory document and build a tree.
2584: *
2585: * Returns the resulting document tree
2586: */
2587:
2588: htmlDocPtr
2589: htmlParseDoc(CHAR *cur, const char *encoding) {
2590: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
2591: }
2592:
2593:
2594: /**
2595: * htmlCreateFileParserCtxt :
2596: * @filename: the filename
2597: * @encoding: a free form C string describing the HTML document encoding, or NULL
2598: *
2599: * Create a parser context for a file content.
2600: * Automatic support for ZLIB/Compress compressed document is provided
2601: * by default if found at compile-time.
2602: *
2603: * Returns the new parser context or NULL
2604: */
2605: htmlParserCtxtPtr
2606: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
2607: {
2608: htmlParserCtxtPtr ctxt;
2609: htmlParserInputPtr inputStream;
1.5 daniel 2610: xmlParserInputBufferPtr buf;
1.1 daniel 2611: /* htmlCharEncoding enc; */
2612:
1.5 daniel 2613: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
2614: if (buf == NULL) return(NULL);
1.1 daniel 2615:
2616: ctxt = (htmlParserCtxtPtr) malloc(sizeof(htmlParserCtxt));
2617: if (ctxt == NULL) {
2618: perror("malloc");
2619: return(NULL);
2620: }
2621: htmlInitParserCtxt(ctxt);
2622: inputStream = (htmlParserInputPtr) malloc(sizeof(htmlParserInput));
2623: if (inputStream == NULL) {
2624: perror("malloc");
2625: free(ctxt);
2626: return(NULL);
2627: }
2628:
2629: inputStream->filename = strdup(filename);
2630: inputStream->line = 1;
2631: inputStream->col = 1;
1.5 daniel 2632: inputStream->buf = buf;
1.1 daniel 2633:
1.5 daniel 2634: inputStream->base = inputStream->buf->buffer->content;
2635: inputStream->cur = inputStream->buf->buffer->content;
2636: inputStream->free = NULL;
1.1 daniel 2637:
2638: inputPush(ctxt, inputStream);
2639: return(ctxt);
2640: }
2641:
2642: /**
2643: * htmlSAXParseFile :
2644: * @filename: the filename
2645: * @encoding: a free form C string describing the HTML document encoding, or NULL
2646: * @sax: the SAX handler block
2647: * @userData: if using SAX, this pointer will be provided on callbacks.
2648: *
2649: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2650: * compressed document is provided by default if found at compile-time.
2651: * It use the given SAX function block to handle the parsing callback.
2652: * If sax is NULL, fallback to the default DOM tree building routines.
2653: *
2654: * Returns the resulting document tree
2655: */
2656:
2657: htmlDocPtr
2658: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
2659: void *userData) {
2660: htmlDocPtr ret;
2661: htmlParserCtxtPtr ctxt;
2662:
2663: ctxt = htmlCreateFileParserCtxt(filename, encoding);
2664: if (ctxt == NULL) return(NULL);
2665: if (sax != NULL) {
2666: ctxt->sax = sax;
2667: ctxt->userData = userData;
2668: }
2669:
2670: htmlParseDocument(ctxt);
2671:
2672: ret = ctxt->myDoc;
2673: if (sax != NULL) {
2674: ctxt->sax = NULL;
2675: ctxt->userData = NULL;
2676: }
2677: htmlFreeParserCtxt(ctxt);
2678:
2679: return(ret);
2680: }
2681:
2682: /**
2683: * htmlParseFile :
2684: * @filename: the filename
2685: * @encoding: a free form C string describing the HTML document encoding, or NULL
2686: *
2687: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2688: * compressed document is provided by default if found at compile-time.
2689: *
2690: * Returns the resulting document tree
2691: */
2692:
2693: htmlDocPtr
2694: htmlParseFile(const char *filename, const char *encoding) {
2695: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
2696: }
Webmaster