Annotation of XML/HTMLparser.c, revision 1.9
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
10: #define HAVE_FCNTL_H
11: #include <io.h>
12: #else
13: #include <config.h>
14: #endif
15: #include <stdio.h>
16: #include <ctype.h>
17: #include <string.h> /* for memset() only */
18: #include <stdlib.h>
19: #include <sys/stat.h>
20: #ifdef HAVE_FCNTL_H
21: #include <fcntl.h>
22: #endif
23: #ifdef HAVE_UNISTD_H
24: #include <unistd.h>
25: #endif
26: #ifdef HAVE_ZLIB_H
27: #include <zlib.h>
28: #endif
29:
30: #include "tree.h"
31: #include "HTMLparser.h"
32: #include "entities.h"
33: #include "encoding.h"
34: #include "valid.h"
35: #include "parserInternals.h"
1.5 daniel 36: #include "xmlIO.h"
37:
/*
 * NOTE(review): presumably the maximum length accepted for a parsed name;
 * it is not referenced in the visible part of this file - confirm at the
 * use sites.
 */
38: #define HTML_MAX_NAMELEN 1000
/*
 * Second argument handed to xmlParserInputGrow() by the GROW and NEXT
 * macros each time more input is requested.
 */
39: #define INPUT_CHUNK 50
1.1 daniel 40:
41: /* #define DEBUG */
42:
43: /************************************************************************
44: * *
45: * Parser stacks related functions and macros *
46: * *
47: ************************************************************************/
48:
49: /*
50: * Generic function for accessing stacks in the Parser Context
51: */
52:
53: #define PUSH_AND_POP(type, name) \
54: int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
55: if (ctxt->name##Nr >= ctxt->name##Max) { \
56: ctxt->name##Max *= 2; \
57: ctxt->name##Tab = (void *) realloc(ctxt->name##Tab, \
58: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
59: if (ctxt->name##Tab == NULL) { \
60: fprintf(stderr, "realloc failed !\n"); \
61: exit(1); \
62: } \
63: } \
64: ctxt->name##Tab[ctxt->name##Nr] = value; \
65: ctxt->name = value; \
66: return(ctxt->name##Nr++); \
67: } \
68: type html##name##Pop(htmlParserCtxtPtr ctxt) { \
69: type ret; \
70: if (ctxt->name##Nr <= 0) return(0); \
71: ctxt->name##Nr--; \
72: if (ctxt->name##Nr > 0) \
73: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
74: else \
75: ctxt->name = NULL; \
76: ret = ctxt->name##Tab[ctxt->name##Nr]; \
77: ctxt->name##Tab[ctxt->name##Nr] = 0; \
78: return(ret); \
79: } \
80:
81: PUSH_AND_POP(xmlNodePtr, node)
82:
83: /*
84: * Macros for accessing the content. They should be used only by the parser,
85: * and not exported. All of them expect a variable named ctxt in scope.
86: *
87: * Dirty macros, i.e. one needs to make assumptions on the context to use them
88: *
89: * CUR_PTR returns the current pointer to the CHAR to be parsed.
90: * CUR returns the current CHAR value, i.e. a 8 bit value if compiled
91: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
92: * in UNICODE mode. This should be used internally by the parser
93: * only to compare to ASCII values otherwise it would break when
94: * running with UTF-8 encoding.
95: * NXT(n) returns the n'th next CHAR. Same as CUR it should be used only
96: * to compare on ASCII based substring.
97: * UPP(n) returns the n'th next CHAR converted to uppercase. Same as CUR
98: * it should be used only to compare on ASCII based substring.
99: * SKIP(n) Skip n CHAR, and must also be used only to skip ASCII defined
100: * strings within the parser.
 * UPPER returns the current CHAR passed through toupper().
 * SHRINK calls xmlParserInputShrink() on the current input.
 * GROW calls xmlParserInputGrow() on the current input, asking for
 * INPUT_CHUNK.
 * SKIP_BLANKS applies NEXT for as long as IS_BLANK() accepts the current
 * CHAR. It expands to a bare while loop whose body is NEXT.
101: *
102: * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
103: *
104: * CURRENT Returns the current char value, with the full decoding of
105: * UTF-8 if we are using this mode. It returns an int.
106: * NEXT Skip to the next character, this does the proper decoding
107: * in UTF-8 mode. It also pops up unfinished entities on the fly.
108: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * NOTE(review): COPY is described above but is not defined in the visible
 * part of this file; SKIP, NXT and CUR_PTR expand without outer
 * parentheses, so they should not be embedded in larger expressions.
109: */
110:
111: #define CUR (*ctxt->input->cur)
112: #define UPPER (toupper(*ctxt->input->cur))
113: #define SKIP(val) ctxt->input->cur += (val)
114: #define NXT(val) ctxt->input->cur[(val)]
115: #define UPP(val) (toupper(ctxt->input->cur[(val)]))
116: #define CUR_PTR ctxt->input->cur
1.5 daniel 117: #define SHRINK xmlParserInputShrink(ctxt->input)
118: #define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
1.1 daniel 119:
120: #define SKIP_BLANKS \
121: while (IS_BLANK(*(ctxt->input->cur))) NEXT
122:
/*
 * CURRENT and NEXT for builds where USE_UTF_8 is not defined.
 *
 * CURRENT is the CHAR under the input cursor.
 *
 * NEXT expands to a brace-enclosed block (a statement, not an expression):
 *  - when the cursor sits on a 0 CHAR and xmlParserInputGrow() returns a
 *    value <= 0, xmlPopInput(ctxt) is called;
 *  - otherwise the position is updated (a '\n' increments line and resets
 *    col to 1, any other CHAR increments col), the cursor advances by one
 *    CHAR and, if it now sits on a 0 CHAR, xmlParserInputGrow() is called
 *    again.
 *
 * When USE_UTF_8 is defined the #else branch below is empty, so neither
 * macro is defined by this section.
 */
123: #ifndef USE_UTF_8
124: #define CURRENT (*ctxt->input->cur)
1.5 daniel 125: #define NEXT { \
126: if ((*ctxt->input->cur == 0) && \
127: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { \
128: xmlPopInput(ctxt); \
129: } else { \
130: if (*(ctxt->input->cur) == '\n') { \
131: ctxt->input->line++; ctxt->input->col = 1; \
132: } else ctxt->input->col++; \
133: ctxt->input->cur++; \
134: if (*ctxt->input->cur == 0) \
135: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
136: }}
137:
/* Disabled expression-style variant of NEXT, kept commented out. */
138: /****************************************
1.1 daniel 139: #define NEXT ((*ctxt->input->cur) ? \
140: (((*(ctxt->input->cur) == '\n') ? \
141: (ctxt->input->line++, ctxt->input->col = 1) : \
1.5 daniel 142: (ctxt->input->col++)), \
143: (ctxt->input->cur++), \
144: ((*ctxt->input->cur) ? \
145: (xmlParserInputGrow(ctxt->input, 100), \
146: ctxt->input->cur): \
147: (ctxt->input->cur))) : \
148: ((xmlParserInputGrow(ctxt->input, 100) > 0) ? \
149: ctxt->input->cur: \
150: (xmlPopInput(ctxt), ctxt->input->cur)))
151: ****************************************/
1.1 daniel 152: #else
/* Nothing is defined for the USE_UTF_8 case here. */
153: #endif
154:
155:
1.5 daniel 156:
1.1 daniel 157: /************************************************************************
158: * *
159: * The list of HTML elements and their properties *
160: * *
161: ************************************************************************/
162:
163: /*
 * Table of the HTML 4.0 elements, one row per element, with names stored
 * in upper case. htmlTagLookup() scans it linearly and compares names
 * with xmlStrcmp().
 *
164: * Start Tag: 1 means the start tag can be omitted
165: * End Tag: 1 means the end tag can be omitted
166: * 2 means it's forbidden (empty elements)
 * Empty: set to 1 on exactly the rows whose End Tag value is 2
167: * Depr: this element is deprecated
168: * DTD: 1 means that this element is valid only in the Loose DTD
169: * 2 means that this element is valid only in the Frameset DTD
170: *
171: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
172: */
173: htmlElemDesc html40ElementTable[] = {
174: { "A", 0, 0, 0, 0, 0, "anchor " },
175: { "ABBR", 0, 0, 0, 0, 0, "abbreviated form" },
176: { "ACRONYM", 0, 0, 0, 0, 0, "" },
177: { "ADDRESS", 0, 0, 0, 0, 0, "information on author " },
178: { "APPLET", 0, 0, 0, 1, 1, "Java applet " },
179: { "AREA", 0, 2, 1, 0, 0, "client-side image map area " },
180: { "B", 0, 0, 0, 0, 0, "bold text style" },
181: { "BASE", 0, 2, 1, 0, 0, "document base URI " },
182: { "BASEFONT", 0, 2, 1, 1, 1, "base font size " },
183: { "BDO", 0, 0, 0, 0, 0, "I18N BiDi over-ride " },
184: { "BIG", 0, 0, 0, 0, 0, "large text style" },
185: { "BLOCKQUOTE", 0, 0, 0, 0, 0, "long quotation " },
186: { "BODY", 1, 1, 0, 0, 0, "document body " },
187: { "BR", 0, 2, 1, 0, 0, "forced line break " },
188: { "BUTTON", 0, 0, 0, 0, 0, "push button " },
189: { "CAPTION", 0, 0, 0, 0, 0, "table caption " },
190: { "CENTER", 0, 0, 0, 1, 1, "shorthand for DIV align=center " },
191: { "CITE", 0, 0, 0, 0, 0, "citation" },
192: { "CODE", 0, 0, 0, 0, 0, "computer code fragment" },
193: { "COL", 0, 2, 1, 0, 0, "table column " },
194: { "COLGROUP", 0, 1, 0, 0, 0, "table column group " },
195: { "DD", 0, 1, 0, 0, 0, "definition description " },
196: { "DEL", 0, 0, 0, 0, 0, "deleted text " },
197: { "DFN", 0, 0, 0, 0, 0, "instance definition" },
198: { "DIR", 0, 0, 0, 1, 1, "directory list" },
199: { "DIV", 0, 0, 0, 0, 0, "generic language/style container"},
200: { "DL", 0, 0, 0, 0, 0, "definition list " },
201: { "DT", 0, 1, 0, 0, 0, "definition term " },
202: { "EM", 0, 0, 0, 0, 0, "emphasis" },
203: { "FIELDSET", 0, 0, 0, 0, 0, "form control group " },
204: { "FONT", 0, 0, 0, 1, 1, "local change to font " },
205: { "FORM", 0, 0, 0, 0, 0, "interactive form " },
206: { "FRAME", 0, 2, 1, 0, 2, "subwindow " },
207: { "FRAMESET", 0, 0, 0, 0, 2, "window subdivision" },
208: { "H1", 0, 0, 0, 0, 0, "heading " },
209: { "H2", 0, 0, 0, 0, 0, "heading " },
210: { "H3", 0, 0, 0, 0, 0, "heading " },
211: { "H4", 0, 0, 0, 0, 0, "heading " },
212: { "H5", 0, 0, 0, 0, 0, "heading " },
213: { "H6", 0, 0, 0, 0, 0, "heading " },
214: { "HEAD", 1, 1, 0, 0, 0, "document head " },
215: { "HR", 0, 2, 1, 0, 0, "horizontal rule " },
216: { "HTML", 1, 1, 0, 0, 0, "document root element " },
217: { "I", 0, 0, 0, 0, 0, "italic text style" },
218: { "IFRAME", 0, 0, 0, 0, 1, "inline subwindow " },
219: { "IMG", 0, 2, 1, 0, 0, "Embedded image " },
220: { "INPUT", 0, 2, 1, 0, 0, "form control " },
221: { "INS", 0, 0, 0, 0, 0, "inserted text" },
222: { "ISINDEX", 0, 2, 1, 1, 1, "single line prompt " },
223: { "KBD", 0, 0, 0, 0, 0, "text to be entered by the user" },
224: { "LABEL", 0, 0, 0, 0, 0, "form field label text " },
225: { "LEGEND", 0, 0, 0, 0, 0, "fieldset legend " },
226: { "LI", 0, 1, 0, 0, 0, "list item " },
227: { "LINK", 0, 2, 1, 0, 0, "a media-independent link " },
228: { "MAP", 0, 0, 0, 0, 0, "client-side image map " },
229: { "MENU", 0, 0, 0, 1, 1, "menu list " },
230: { "META", 0, 2, 1, 0, 0, "generic metainformation " },
231: { "NOFRAMES", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
232: { "NOSCRIPT", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
233: { "OBJECT", 0, 0, 0, 0, 0, "generic embedded object " },
234: { "OL", 0, 0, 0, 0, 0, "ordered list " },
235: { "OPTGROUP", 0, 0, 0, 0, 0, "option group " },
236: { "OPTION", 0, 1, 0, 0, 0, "selectable choice " },
237: { "P", 0, 1, 0, 0, 0, "paragraph " },
238: { "PARAM", 0, 2, 1, 0, 0, "named property value " },
239: { "PRE", 0, 0, 0, 0, 0, "preformatted text " },
240: { "Q", 0, 0, 0, 0, 0, "short inline quotation " },
241: { "S", 0, 0, 0, 1, 1, "strike-through text style" },
242: { "SAMP", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
243: { "SCRIPT", 0, 0, 0, 0, 0, "script statements " },
244: { "SELECT", 0, 0, 0, 0, 0, "option selector " },
245: { "SMALL", 0, 0, 0, 0, 0, "small text style" },
246: { "SPAN", 0, 0, 0, 0, 0, "generic language/style container " },
247: { "STRIKE", 0, 0, 0, 1, 1, "strike-through text" },
248: { "STRONG", 0, 0, 0, 0, 0, "strong emphasis" },
249: { "STYLE", 0, 0, 0, 0, 0, "style info " },
250: { "SUB", 0, 0, 0, 0, 0, "subscript" },
251: { "SUP", 0, 0, 0, 0, 0, "superscript " },
252: { "TABLE", 0, 0, 0, 0, 0, " " },
253: { "TBODY", 1, 1, 0, 0, 0, "table body " },
254: { "TD", 0, 1, 0, 0, 0, "table data cell" },
255: { "TEXTAREA", 0, 0, 0, 0, 0, "multi-line text field " },
256: { "TFOOT", 0, 1, 0, 0, 0, "table footer " },
257: { "TH", 0, 1, 0, 0, 0, "table header cell" },
258: { "THEAD", 0, 1, 0, 0, 0, "table header " },
259: { "TITLE", 0, 0, 0, 0, 0, "document title " },
260: { "TR", 0, 1, 0, 0, 0, "table row " },
261: { "TT", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
262: { "U", 0, 0, 0, 1, 1, "underlined text style" },
263: { "UL", 0, 0, 0, 0, 0, "unordered list " },
264: { "VAR", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
265: };
266:
267: /*
268: * start tags that imply the end of a current element
269: * any tag of each line implies the end of the current element if the type of
270: * that element is in the same line
 *
 * Layout: each group of names is terminated by a NULL entry and the whole
 * list is terminated by one additional NULL.
 * NOTE(review): this table is not referenced in the visible part of the
 * file.
271: */
1.8 daniel 272: char *htmlEquEnd[] = {
1.1 daniel 273: "DT", "DD", "LI", "OPTION", NULL,
274: "H1", "H2", "H3", "H4", "H5", "H6", NULL,
275: "OL", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", NULL,
276: NULL
277: };
278: /*
279: * According to the HTML DTD, HR should be added to the 2nd line above, as it
280: * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
281: * because many documents contain rules in headings...
282: */
283:
/*
 * start tags that imply the end of current element
 *
 * The list is a sequence of groups, each terminated by NULL; one extra
 * NULL terminates the whole list.  In every group the first name is the
 * start tag being opened and the remaining names are the currently open
 * elements that this start tag closes (see htmlCheckAutoClose()).
 */
char *htmlStartClose[] = {
"FORM",		"FORM", "P", "HR", "H1", "H2", "H3", "H4", "H5", "H6",
		"DL", "UL", "OL", "MENU", "DIR", "ADDRESS", "PRE",
		"LISTING", "XMP", "HEAD", NULL,
"HEAD",		"P", NULL,
"TITLE",	"P", NULL,
"BODY",		"HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
"LI",		"P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
		"PRE", "LISTING", "XMP", "HEAD", NULL,
"HR",		"P", "HEAD", NULL,
"H1",		"P", "HEAD", NULL,
"H2",		"P", "HEAD", NULL,
"H3",		"P", "HEAD", NULL,
"H4",		"P", "HEAD", NULL,
"H5",		"P", "HEAD", NULL,
"H6",		"P", "HEAD", NULL,
"DIR",		"P", "HEAD", NULL,
"ADDRESS",	"P", "HEAD", "UL", NULL,
"PRE",		"P", "HEAD", "UL", NULL,
"LISTING",	"P", "HEAD", NULL,
"XMP",		"P", "HEAD", NULL,
"BLOCKQUOTE",	"P", "HEAD", NULL,
"DL",		"P", "DT", "MENU", "DIR", "ADDRESS", "PRE", "LISTING",
		"XMP", "HEAD", NULL,
"DT",		"P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
"DD",		"P", "MENU", "DIR", "ADDRESS", "PRE", "LISTING", "XMP", "HEAD", NULL,
"UL",		"P", "HEAD", "OL", "MENU", "DIR", "ADDRESS", "PRE",
		"LISTING", "XMP", NULL,
"OL",		"P", "HEAD", "UL", NULL,
"MENU",		"P", "HEAD", "UL", NULL,
"P",		"P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", NULL,
"DIV",		"P", "HEAD", NULL,
"NOSCRIPT",	"P", "HEAD", NULL,
"CENTER",	"FONT", "B", "I", "P", "HEAD", NULL,
"A",		"A", NULL,
"CAPTION",	"P", NULL,
"COLGROUP",	"CAPTION", "COLGROUP", "COL", "P", NULL,
"COL",		"CAPTION", "COL", "P", NULL,
"TABLE",	"P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6", "PRE",
		"LISTING", "XMP", "A", NULL,
"TH",		"TH", "TD", NULL,
"TD",		"TH", "TD", NULL,
"TR",		"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", NULL,
"THEAD",	"CAPTION", "COL", "COLGROUP", NULL,
"TFOOT",	"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
		"TBODY", NULL,
"TBODY",	"TH", "TD", "TR", "CAPTION", "COL", "COLGROUP", "THEAD",
		"TFOOT", "TBODY", NULL,
"OPTGROUP",	"OPTION", NULL,
"FIELDSET",	"LEGEND", "P", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6",
		"PRE", "LISTING", "XMP", "A", NULL,
NULL
};

/*
 * htmlStartCloseIndex[n] points at the first name of the n-th group of
 * htmlStartClose; unused slots are NULL, and the last slot is always left
 * NULL so that a scan of the index is guaranteed to terminate.
 */
static char** htmlStartCloseIndex[100];
/* Non-zero once htmlStartCloseIndex has been filled in. */
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 *
 * The index is built only once: the initialized flag is raised when the
 * work is done, so later calls return immediately.
 */
void
htmlInitAutoClose(void) {
    const int slots = (int) (sizeof(htmlStartCloseIndex) /
                             sizeof(htmlStartCloseIndex[0]));
    int indx, i = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (indx = 0; indx < slots; indx++) htmlStartCloseIndex[indx] = NULL;

    indx = 0;
    /* stop one slot early: the final entry must stay NULL as a sentinel */
    while ((htmlStartClose[i] != NULL) && (indx < slots - 1)) {
        htmlStartCloseIndex[indx++] = &htmlStartClose[i];
        /* skip the rest of the group, then its NULL terminator */
        while (htmlStartClose[i] != NULL) i++;
        i++;
    }

    /* record completion so the index is not rebuilt on every lookup */
    htmlStartCloseIndexinitialized = 1;
}
370:
371: /**
372: * htmlTagLookup:
373: * @tag: The tag name
374: *
375: * Lookup the HTML tag in the ElementTable
376: *
377: * Returns the related htmlElemDescPtr or NULL if not found.
378: */
379: htmlElemDescPtr
380: htmlTagLookup(const CHAR *tag) {
381: int i = 0;
382:
383: for (i = 0; i < (sizeof(html40ElementTable) /
384: sizeof(html40ElementTable[0]));i++) {
1.8 daniel 385: if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 386: return(&html40ElementTable[i]);
387: }
388: return(NULL);
389: }
390:
391: /**
392: * htmlCheckAutoClose:
393: * @new: The new tag name
394: * @old: The old tag name
395: *
396: * Checks wether the new tag is one of the registered valid tags for closing old.
397: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
398: *
399: * Returns 0 if no, 1 if yes.
400: */
401: int
402: htmlCheckAutoClose(const CHAR *new, const CHAR *old) {
403: int i, index;
1.8 daniel 404: char **close;
1.1 daniel 405:
406: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
407:
408: /* inefficient, but not a big deal */
409: for (index = 0; index < 100;index++) {
410: close = htmlStartCloseIndex[index];
411: if (close == NULL) return(0);
1.8 daniel 412: if (!xmlStrcmp(BAD_CAST *close, new)) break;
1.1 daniel 413: }
414:
415: i = close - htmlStartClose;
416: i++;
417: while (htmlStartClose[i] != NULL) {
1.8 daniel 418: if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
1.1 daniel 419: return(1);
420: }
421: i++;
422: }
423: return(0);
424: }
425:
426: /**
427: * htmlAutoClose:
428: * @ctxt: an HTML parser context
429: * @new: The new tag name
430: *
431: * The HTmL DtD allows a tag to implicitely close other tags.
432: * The list is kept in htmlStartClose array. This function is
433: * called when a new tag has been detected and generates the
434: * appropriates closes if possible/needed.
435: */
436: void
437: htmlAutoClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
438:
439: while ((ctxt->node != NULL) &&
440: (htmlCheckAutoClose(new, ctxt->node->name))) {
441: #ifdef DEBUG
442: printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name);
443: #endif
444: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
445: ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
446: }
447: }
448:
449: /**
450: * htmlAutoCloseOnClose:
451: * @ctxt: an HTML parser context
452: * @new: The new tag name
453: *
454: * The HTmL DtD allows an ending tag to implicitely close other tags.
455: */
456: void
457: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const CHAR *new) {
458: htmlElemDescPtr info;
459:
460: while ((ctxt->node != NULL) &&
461: (xmlStrcmp(new, ctxt->node->name))) {
462: info = htmlTagLookup(ctxt->node->name);
463: if ((info == NULL) || (info->endTag == 1)) {
464: #ifdef DEBUG
465: printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name);
466: #endif
467: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
468: ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
469: } else
470: break;
471: }
472: }
473:
474: /************************************************************************
475: * *
476: * The list of HTML predefined entities *
477: * *
478: ************************************************************************/
479:
480:
481: htmlEntityDesc html40EntitiesTable[] = {
482: /*
483: * the 4 absolute ones,
484: */
485: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
486: { 38, "amp", "ampersand, U+0026 ISOnum" },
1.4 daniel 487: { 39, "apos", "single quote" },
1.1 daniel 488: { 60, "lt", "less-than sign, U+003C ISOnum" },
489: { 62, "gt", "greater-than sign, U+003E ISOnum" },
490:
491: /*
492: * A bunch still in the 128-255 range
493: * Replacing them depend really on the charset used.
494: */
495: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
496: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
497: { 162, "cent", "cent sign, U+00A2 ISOnum" },
498: { 163, "pound","pound sign, U+00A3 ISOnum" },
499: { 164, "curren","currency sign, U+00A4 ISOnum" },
500: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
501: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
502: { 167, "sect", "section sign, U+00A7 ISOnum" },
503: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
504: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
505: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
506: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
507: { 172, "not", "not sign, U+00AC ISOnum" },
508: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
509: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
510: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
511: { 176, "deg", "degree sign, U+00B0 ISOnum" },
512: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
513: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
514: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
515: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
516: { 181, "micro","micro sign, U+00B5 ISOnum" },
517: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 518: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 519: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
520: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
521: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 522: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 523: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
524: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
525: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
526: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
527: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
528: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
529: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
530: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
531: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
532: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
533: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
534: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
535: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
536: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
537: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
538: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
539: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
540: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
541: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
542: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
543: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
544: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
545: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
546: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
547: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
548: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
549: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
550: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 551: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 552: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
553: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
554: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
555: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
556: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
557: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
558: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
559: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
560: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
561: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
562: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
563: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
564: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
565: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
566: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
567: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
568: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
569: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
570: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
571: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
572: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
573: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
574: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
575: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
576: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
577: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
578: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
579: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
580: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
581: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
582: { 247, "divide","division sign, U+00F7 ISOnum" },
583: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
584: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
585: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
586: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
587: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
588: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
589: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
590: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
591:
592: /*
593: * Anything below should really be kept as entities references
594: */
595: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
596:
597: { 913, "Alpha","greek capital letter alpha, U+0391" },
598: { 914, "Beta", "greek capital letter beta, U+0392" },
599: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
600: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
601: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
602: { 918, "Zeta", "greek capital letter zeta, U+0396" },
603: { 919, "Eta", "greek capital letter eta, U+0397" },
604: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
605: { 921, "Iota", "greek capital letter iota, U+0399" },
606: { 922, "Kappa","greek capital letter kappa, U+039A" },
607: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
608: { 924, "Mu", "greek capital letter mu, U+039C" },
609: { 925, "Nu", "greek capital letter nu, U+039D" },
610: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
611: { 927, "Omicron","greek capital letter omicron, U+039F" },
612: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
613: { 929, "Rho", "greek capital letter rho, U+03A1" },
614: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
615: { 932, "Tau", "greek capital letter tau, U+03A4" },
616: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
617: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
618: { 935, "Chi", "greek capital letter chi, U+03A7" },
619: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
620: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
621:
622: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
623: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
624: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
625: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
626: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
627: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
628: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
629: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
630: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
631: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
632: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
633: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
634: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
635: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
636: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
637: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
638: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
639: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
640: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
641: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
642: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
643: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
644: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
645: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
646: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
647: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
648: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
649: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
650:
651: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
652: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
653: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
654: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
655: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
656: { 8260, "frasl","fraction slash, U+2044 NEW" },
657:
1.7 daniel 658: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 659: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
660: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
661: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
662: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
663: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
664: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
665: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
666: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
667: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
668: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
669: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
670: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
671: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
672: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
673: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
674:
675:
676: { 8704, "forall","for all, U+2200 ISOtech" },
677: { 8706, "part", "partial differential, U+2202 ISOtech" },
678: { 8707, "exist","there exists, U+2203 ISOtech" },
679: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
680: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
681: { 8712, "isin", "element of, U+2208 ISOtech" },
682: { 8713, "notin","not an element of, U+2209 ISOtech" },
683: { 8715, "ni", "contains as member, U+220B ISOtech" },
684: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
685: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
686: { 8722, "minus","minus sign, U+2212 ISOtech" },
687: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
688: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
689: { 8733, "prop", "proportional to, U+221D ISOtech" },
690: { 8734, "infin","infinity, U+221E ISOtech" },
691: { 8736, "ang", "angle, U+2220 ISOamso" },
692: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
693: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
694: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
695: { 8746, "cup", "union = cup, U+222A ISOtech" },
696: { 8747, "int", "integral, U+222B ISOtech" },
697: { 8756, "there4","therefore, U+2234 ISOtech" },
698: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
699: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
700: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
701: { 8800, "ne", "not equal to, U+2260 ISOtech" },
702: { 8801, "equiv","identical to, U+2261 ISOtech" },
703: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
704: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
705: { 8834, "sub", "subset of, U+2282 ISOtech" },
706: { 8835, "sup", "superset of, U+2283 ISOtech" },
707: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
708: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
709: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
710: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
711: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
712: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
713: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
714: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
715: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
716: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
717: { 8971, "rfloor","right floor, U+230B ISOamsc" },
718: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
719: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
720: { 9674, "loz", "lozenge, U+25CA ISOpub" },
721:
722: { 9824, "spades","black spade suit, U+2660 ISOpub" },
723: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
724: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
725: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
726:
727: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
728: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
729: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
730: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
731: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
732: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
733: { 732, "tilde","small tilde, U+02DC ISOdia" },
734:
735: { 8194, "ensp", "en space, U+2002 ISOpub" },
736: { 8195, "emsp", "em space, U+2003 ISOpub" },
737: { 8201, "thinsp","thin space, U+2009 ISOpub" },
738: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
739: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
740: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
741: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
742: { 8211, "ndash","en dash, U+2013 ISOpub" },
743: { 8212, "mdash","em dash, U+2014 ISOpub" },
744: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
745: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
746: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
747: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
748: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
749: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
750: { 8224, "dagger","dagger, U+2020 ISOpub" },
751: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
752: { 8240, "permil","per mille sign, U+2030 ISOtech" },
753: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1.7 daniel 754: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1.1 daniel 755: { 8364, "euro", "euro sign, U+20AC NEW" }
756: };
757:
758: /************************************************************************
759: * *
760: * Commodity functions to handle entities *
761: * *
762: ************************************************************************/
763:
/*
 * growBuffer:
 * @buffer:  name of a CHAR * variable pointing at a heap block
 *
 * Doubles the capacity of @buffer. The macro relies on a companion
 * variable called <buffer>_size holding the current capacity in CHARs;
 * that variable is doubled first and the block is then reallocated to the
 * new size. If realloc() fails the error is reported with perror() and the
 * process exits with status 1.
 *
 * The block may move: any pointer into the old storage has to be
 * recomputed by the caller after the macro has run.
 */
#define growBuffer(buffer) {						\
    buffer##_size = buffer##_size * 2;					\
    buffer = (CHAR *) realloc(buffer, sizeof(CHAR) * buffer##_size);	\
    if (!buffer) {							\
	perror("realloc failed");					\
	exit(1);							\
    }									\
}
775:
776: /**
777: * htmlEntityLookup:
778: * @name: the entity name
779: *
780: * Lookup the given entity in EntitiesTable
781: *
782: * TODO: the linear scan is really ugly, an hash table is really needed.
783: *
784: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
785: */
786: htmlEntityDescPtr
787: htmlEntityLookup(const CHAR *name) {
788: int i;
789:
790: for (i = 0;i < (sizeof(html40EntitiesTable)/
791: sizeof(html40EntitiesTable[0]));i++) {
1.8 daniel 792: if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 793: #ifdef DEBUG
794: printf("Found entity %s\n", name);
795: #endif
796: return(&html40EntitiesTable[i]);
797: }
798: }
799: return(NULL);
800: }
801:
802:
803: /**
804: * htmlDecodeEntities:
805: * @ctxt: the parser context
806: * @len: the len to decode (in bytes !), -1 for no size limit
807: * @end: an end marker CHAR, 0 if none
808: * @end2: an end marker CHAR, 0 if none
809: * @end3: an end marker CHAR, 0 if none
810: *
811: * Subtitute the HTML entities by their value
812: *
813: * TODO: once the internal representation will be UTF-8, all entities
814: * will be substituable, in the meantime we only apply the substitution
815: * to the one with values in the 0-255 UNICODE range
816: *
817: * Returns A newly allocated string with the substitution done. The caller
818: * must deallocate it !
819: */
820: CHAR *
821: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
822: CHAR end, CHAR end2, CHAR end3) {
823: CHAR *buffer = NULL;
824: int buffer_size = 0;
825: CHAR *out = NULL;
826: CHAR *name = NULL;
827:
828: CHAR *cur = NULL;
829: htmlEntityDescPtr ent;
1.5 daniel 830: int nbchars = 0;
1.1 daniel 831: unsigned int max = (unsigned int) len;
832:
833: /*
834: * allocate a translation buffer.
835: */
836: buffer_size = 1000;
837: buffer = (CHAR *) malloc(buffer_size * sizeof(CHAR));
838: if (buffer == NULL) {
839: perror("htmlDecodeEntities: malloc failed");
840: return(NULL);
841: }
842: out = buffer;
843:
844: /*
845: * Ok loop until we reach one of the ending char or a size limit.
846: */
1.5 daniel 847: while ((nbchars < max) && (CUR != end) &&
1.1 daniel 848: (CUR != end2) && (CUR != end3)) {
849:
850: if (CUR == '&') {
851: if (NXT(1) == '#') {
852: int val = htmlParseCharRef(ctxt);
1.8 daniel 853: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 854: *out++ = val;
1.5 daniel 855: nbchars += 3; /* !!!! */
1.1 daniel 856: } else {
857: ent = htmlParseEntityRef(ctxt, &name);
858: if (name != NULL) {
859: if ((ent == NULL) || (ent->value <= 0) ||
860: (ent->value >= 255)) {
861: *out++ = '&';
862: cur = name;
863: while (*cur != 0) {
864: if (out - buffer > buffer_size - 100) {
865: int index = out - buffer;
866:
867: growBuffer(buffer);
868: out = &buffer[index];
869: }
870: *out++ = *cur++;
871: }
872: *out++ = ';';
873: } else {
1.8 daniel 874: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 875: *out++ = (CHAR)ent->value;
876: if (out - buffer > buffer_size - 100) {
877: int index = out - buffer;
878:
879: growBuffer(buffer);
880: out = &buffer[index];
881: }
882: }
1.5 daniel 883: nbchars += 2 + xmlStrlen(name);
1.1 daniel 884: free(name);
885: }
886: }
887: } else {
1.8 daniel 888: /* invalid for UTF-8 , use COPY(out); !!!!! */
1.1 daniel 889: *out++ = CUR;
1.5 daniel 890: nbchars++;
1.1 daniel 891: if (out - buffer > buffer_size - 100) {
892: int index = out - buffer;
893:
894: growBuffer(buffer);
895: out = &buffer[index];
896: }
897: NEXT;
898: }
899: }
900: *out++ = 0;
901: return(buffer);
902: }
903:
904:
905: /************************************************************************
906: * *
907: * Commodity functions to handle encodings *
908: * *
909: ************************************************************************/
910:
911: /**
912: * htmlSwitchEncoding:
913: * @ctxt: the parser context
914: * @len: the len of @cur
915: *
916: * change the input functions when discovering the character encoding
917: * of a given entity.
918: *
919: */
920: void
921: htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
922: {
923: switch (enc) {
924: case XML_CHAR_ENCODING_ERROR:
925: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
926: ctxt->sax->error(ctxt->userData, "encoding unknown\n");
927: ctxt->wellFormed = 0;
928: break;
929: case XML_CHAR_ENCODING_NONE:
930: /* let's assume it's UTF-8 without the XML decl */
931: return;
932: case XML_CHAR_ENCODING_UTF8:
933: /* default encoding, no conversion should be needed */
934: return;
935: case XML_CHAR_ENCODING_UTF16LE:
936: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
937: ctxt->sax->error(ctxt->userData,
938: "char encoding UTF16 little endian not supported\n");
939: break;
940: case XML_CHAR_ENCODING_UTF16BE:
941: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
942: ctxt->sax->error(ctxt->userData,
943: "char encoding UTF16 big endian not supported\n");
944: break;
945: case XML_CHAR_ENCODING_UCS4LE:
946: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
947: ctxt->sax->error(ctxt->userData,
948: "char encoding USC4 little endian not supported\n");
949: break;
950: case XML_CHAR_ENCODING_UCS4BE:
951: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
952: ctxt->sax->error(ctxt->userData,
953: "char encoding USC4 big endian not supported\n");
954: break;
955: case XML_CHAR_ENCODING_EBCDIC:
956: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
957: ctxt->sax->error(ctxt->userData,
958: "char encoding EBCDIC not supported\n");
959: break;
960: case XML_CHAR_ENCODING_UCS4_2143:
961: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
962: ctxt->sax->error(ctxt->userData,
963: "char encoding UCS4 2143 not supported\n");
964: break;
965: case XML_CHAR_ENCODING_UCS4_3412:
966: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
967: ctxt->sax->error(ctxt->userData,
968: "char encoding UCS4 3412 not supported\n");
969: break;
970: case XML_CHAR_ENCODING_UCS2:
971: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
972: ctxt->sax->error(ctxt->userData,
973: "char encoding UCS2 not supported\n");
974: break;
975: case XML_CHAR_ENCODING_8859_1:
976: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
977: ctxt->sax->error(ctxt->userData,
978: "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
979: break;
980: case XML_CHAR_ENCODING_8859_2:
981: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
982: ctxt->sax->error(ctxt->userData,
983: "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
984: break;
985: case XML_CHAR_ENCODING_8859_3:
986: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
987: ctxt->sax->error(ctxt->userData,
988: "char encoding ISO_8859_3 not supported\n");
989: break;
990: case XML_CHAR_ENCODING_8859_4:
991: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
992: ctxt->sax->error(ctxt->userData,
993: "char encoding ISO_8859_4 not supported\n");
994: break;
995: case XML_CHAR_ENCODING_8859_5:
996: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
997: ctxt->sax->error(ctxt->userData,
998: "char encoding ISO_8859_5 not supported\n");
999: break;
1000: case XML_CHAR_ENCODING_8859_6:
1001: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1002: ctxt->sax->error(ctxt->userData,
1003: "char encoding ISO_8859_6 not supported\n");
1004: break;
1005: case XML_CHAR_ENCODING_8859_7:
1006: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1007: ctxt->sax->error(ctxt->userData,
1008: "char encoding ISO_8859_7 not supported\n");
1009: break;
1010: case XML_CHAR_ENCODING_8859_8:
1011: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1012: ctxt->sax->error(ctxt->userData,
1013: "char encoding ISO_8859_8 not supported\n");
1014: break;
1015: case XML_CHAR_ENCODING_8859_9:
1016: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1017: ctxt->sax->error(ctxt->userData,
1018: "char encoding ISO_8859_9 not supported\n");
1019: break;
1020: case XML_CHAR_ENCODING_2022_JP:
1021: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1022: ctxt->sax->error(ctxt->userData,
1023: "char encoding ISO-2022-JPnot supported\n");
1024: break;
1025: case XML_CHAR_ENCODING_SHIFT_JIS:
1026: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1027: ctxt->sax->error(ctxt->userData,
1028: "char encoding Shift_JISnot supported\n");
1029: break;
1030: case XML_CHAR_ENCODING_EUC_JP:
1031: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1032: ctxt->sax->error(ctxt->userData,
1033: "char encoding EUC-JPnot supported\n");
1034: break;
1035: }
1036: }
1037:
1038:
1039: /************************************************************************
1040: * *
1041: * Commodity functions, cleanup needed ? *
1042: * *
1043: ************************************************************************/
1044:
1045: /**
1046: * areBlanks:
1047: * @ctxt: an HTML parser context
1048: * @str: a CHAR *
1049: * @len: the size of @str
1050: *
1051: * Is this a sequence of blank chars that one can ignore ?
1052: *
1053: * Returns 1 if ignorable 0 otherwise.
1054: */
1055:
1056: static int areBlanks(htmlParserCtxtPtr ctxt, const CHAR *str, int len) {
1057: int i;
1058: xmlNodePtr lastChild;
1059:
1060: for (i = 0;i < len;i++)
1061: if (!(IS_BLANK(str[i]))) return(0);
1062:
1063: if (CUR != '<') return(0);
1064: if (ctxt->node == NULL) return(0);
1065: lastChild = xmlGetLastChild(ctxt->node);
1066: if (lastChild == NULL) {
1067: if (ctxt->node->content != NULL) return(0);
1068: } else if (xmlNodeIsText(lastChild))
1069: return(0);
1070: return(1);
1071: }
1072:
1073: /**
1074: * htmlHandleEntity:
1075: * @ctxt: an HTML parser context
1076: * @entity: an XML entity pointer.
1077: *
1078: * Default handling of an HTML entity, call the parser with the
1079: * substitution string
1080: */
1081:
1082: void
1083: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1084: int len;
1085:
1086: if (entity->content == NULL) {
1087: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1088: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1089: entity->name);
1090: ctxt->wellFormed = 0;
1091: return;
1092: }
1093: len = xmlStrlen(entity->content);
1094:
1095: /*
1096: * Just handle the content as a set of chars.
1097: */
1098: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1099: ctxt->sax->characters(ctxt->userData, entity->content, len);
1100:
1101: }
1102:
1103: /**
1104: * htmlNewDoc:
1105: * @URI: URI for the dtd, or NULL
1106: * @ExternalID: the external ID of the DTD, or NULL
1107: *
1108: * Returns a new document
1109: */
1110: htmlDocPtr
1111: htmlNewDoc(const CHAR *URI, const CHAR *ExternalID) {
1112: xmlDocPtr cur;
1113:
1114: /*
1115: * Allocate a new document and fill the fields.
1116: */
1117: cur = (xmlDocPtr) malloc(sizeof(xmlDoc));
1118: if (cur == NULL) {
1119: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1120: return(NULL);
1121: }
1122:
1123: cur->type = XML_DOCUMENT_NODE;
1124: cur->version = NULL;
1125: cur->intSubset = NULL;
1.8 daniel 1126: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.1 daniel 1127: cur->name = NULL;
1128: cur->root = NULL;
1129: cur->extSubset = NULL;
1130: cur->oldNs = NULL;
1131: cur->encoding = NULL;
1132: cur->standalone = 1;
1133: cur->compression = 0;
1134: #ifndef XML_WITHOUT_CORBA
1135: cur->_private = NULL;
1136: cur->vepv = NULL;
1137: #endif
1138: return(cur);
1139: }
1140:
1141:
1142: /************************************************************************
1143: * *
1144: * The parser itself *
1145: * Relates to http://www.w3.org/TR/html40 *
1146: * *
1147: ************************************************************************/
1148:
1149: /************************************************************************
1150: * *
1151: * The parser itself *
1152: * *
1153: ************************************************************************/
1154:
1155: /**
1156: * htmlParseHTMLName:
1157: * @ctxt: an HTML parser context
1158: *
1159: * parse an HTML tag or attribute name, note that we convert it to uppercase
1160: * since HTML names are not case-sensitive.
1161: *
1162: * Returns the Tag Name parsed or NULL
1163: */
1164:
1165: CHAR *
1166: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1167: CHAR *ret = NULL;
1168: int i = 0;
1169: CHAR loc[100];
1170:
1171: if (!IS_LETTER(CUR) && (CUR != '_') &&
1172: (CUR != ':')) return(NULL);
1173:
1174: while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
1175: if ((CUR >= 0x61) && (CUR <= 0x7a)) loc[i] = CUR - 0x20;
1176: else loc[i] = CUR;
1177: i++;
1178:
1179: NEXT;
1180: }
1181:
1182: ret = xmlStrndup(loc, i);
1183:
1184: return(ret);
1185: }
1186:
1187: /**
1188: * htmlParseName:
1189: * @ctxt: an HTML parser context
1190: *
1191: * parse an HTML name, this routine is case sensistive.
1192: *
1193: * Returns the Name parsed or NULL
1194: */
1195:
1196: CHAR *
1197: htmlParseName(htmlParserCtxtPtr ctxt) {
1.5 daniel 1198: CHAR buf[HTML_MAX_NAMELEN];
1199: int len = 0;
1.1 daniel 1200:
1.5 daniel 1201: GROW;
1202: if (!IS_LETTER(CUR) && (CUR != '_')) {
1203: return(NULL);
1204: }
1.1 daniel 1205:
1206: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1207: (CUR == '.') || (CUR == '-') ||
1208: (CUR == '_') || (CUR == ':') ||
1209: (IS_COMBINING(CUR)) ||
1.5 daniel 1210: (IS_EXTENDER(CUR))) {
1211: buf[len++] = CUR;
1.1 daniel 1212: NEXT;
1.5 daniel 1213: if (len >= HTML_MAX_NAMELEN) {
1214: fprintf(stderr,
1215: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1216: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1217: (CUR == '.') || (CUR == '-') ||
1218: (CUR == '_') || (CUR == ':') ||
1219: (IS_COMBINING(CUR)) ||
1220: (IS_EXTENDER(CUR)))
1221: NEXT;
1222: break;
1223: }
1224: }
1225: return(xmlStrndup(buf, len));
1.1 daniel 1226: }
1227:
1228: /**
1229: * htmlParseHTMLAttribute:
1230: * @ctxt: an HTML parser context
1231: *
1.5 daniel 1232: * parse an HTML attribute value (without quotes).
1.1 daniel 1233: *
1234: * Returns the Nmtoken parsed or NULL
1235: */
1236:
1237: CHAR *
1238: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt) {
1.5 daniel 1239: CHAR buf[HTML_MAX_NAMELEN];
1240: int len = 0;
1.1 daniel 1241:
1.5 daniel 1242: GROW;
1.1 daniel 1243: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1244: (CUR != '&') && (CUR != '>') &&
1.5 daniel 1245: (CUR != '\'') && (CUR != '"')) {
1246: buf[len++] = CUR;
1.1 daniel 1247: NEXT;
1.5 daniel 1248: if (len >= HTML_MAX_NAMELEN) {
1249: fprintf(stderr,
1250: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1251: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1252: (CUR != '&') && (CUR != '>') &&
1253: (CUR != '\'') && (CUR != '"'))
1254: NEXT;
1255: break;
1256: }
1257: }
1258: return(xmlStrndup(buf, len));
1.1 daniel 1259: }
1260:
1261: /**
1262: * htmlParseNmtoken:
1263: * @ctxt: an HTML parser context
1264: *
1265: * parse an HTML Nmtoken.
1266: *
1267: * Returns the Nmtoken parsed or NULL
1268: */
1269:
1270: CHAR *
1271: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.5 daniel 1272: CHAR buf[HTML_MAX_NAMELEN];
1273: int len = 0;
1.1 daniel 1274:
1.5 daniel 1275: GROW;
1.1 daniel 1276: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1277: (CUR == '.') || (CUR == '-') ||
1278: (CUR == '_') || (CUR == ':') ||
1279: (IS_COMBINING(CUR)) ||
1.5 daniel 1280: (IS_EXTENDER(CUR))) {
1281: buf[len++] = CUR;
1.1 daniel 1282: NEXT;
1.5 daniel 1283: if (len >= HTML_MAX_NAMELEN) {
1284: fprintf(stderr,
1285: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1286: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1287: (CUR == '.') || (CUR == '-') ||
1288: (CUR == '_') || (CUR == ':') ||
1289: (IS_COMBINING(CUR)) ||
1290: (IS_EXTENDER(CUR)))
1291: NEXT;
1292: break;
1293: }
1294: }
1295: return(xmlStrndup(buf, len));
1.1 daniel 1296: }
1297:
1298: /**
1299: * htmlParseEntityRef:
1300: * @ctxt: an HTML parser context
1301: * @str: location to store the entity name
1302: *
1303: * parse an HTML ENTITY references
1304: *
1305: * [68] EntityRef ::= '&' Name ';'
1306: *
1307: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1308: * if non-NULL *str will have to be freed by the caller.
1309: */
1310: htmlEntityDescPtr
1311: htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str) {
1312: CHAR *name;
1313: htmlEntityDescPtr ent = NULL;
1314: *str = NULL;
1315:
1316: if (CUR == '&') {
1317: NEXT;
1318: name = htmlParseName(ctxt);
1319: if (name == NULL) {
1320: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1321: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1322: ctxt->wellFormed = 0;
1323: } else {
1.5 daniel 1324: GROW;
1.1 daniel 1325: if (CUR == ';') {
1326: NEXT;
1327: *str = name;
1328:
1329: /*
1330: * Lookup the entity in the table.
1331: */
1332: ent = htmlEntityLookup(name);
1333: } else {
1334: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1335: ctxt->sax->error(ctxt->userData,
1336: "htmlParseEntityRef: expecting ';'\n");
1337: ctxt->wellFormed = 0;
1338: if (ctxt->sax->characters != NULL) {
1.8 daniel 1339: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 1340: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1341: }
1342: free(name);
1343: }
1344: }
1345: }
1346: return(ent);
1347: }
1348:
1349: /**
1350: * htmlParseAttValue:
1351: * @ctxt: an HTML parser context
1352: *
1353: * parse a value for an attribute
1354: * Note: the parser won't do substitution of entities here, this
1355: * will be handled later in xmlStringGetNodeList, unless it was
1356: * asked for ctxt->replaceEntities != 0
1357: *
1358: * Returns the AttValue parsed or NULL.
1359: */
1360:
1361: CHAR *
1362: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1363: CHAR *ret = NULL;
1364:
1365: if (CUR == '"') {
1366: NEXT;
1367: ret = htmlDecodeEntities(ctxt, -1, '"', '<', 0);
1368: if (CUR == '<') {
1369: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1370: ctxt->sax->error(ctxt->userData,
1371: "Unescaped '<' not allowed in attributes values\n");
1372: ctxt->wellFormed = 0;
1373: }
1374: if (CUR != '"') {
1375: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1376: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1377: ctxt->wellFormed = 0;
1378: } else
1379: NEXT;
1380: } else if (CUR == '\'') {
1381: NEXT;
1382: ret = htmlDecodeEntities(ctxt, -1, '\'', '<', 0);
1383: if (CUR == '<') {
1384: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1385: ctxt->sax->error(ctxt->userData,
1386: "Unescaped '<' not allowed in attributes values\n");
1387: ctxt->wellFormed = 0;
1388: }
1389: if (CUR != '\'') {
1390: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1391: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1392: ctxt->wellFormed = 0;
1393: } else
1394: NEXT;
1395: } else {
1396: /*
1397: * That's an HTMLism, the attribute value may not be quoted
1398: */
1399: ret = htmlParseHTMLAttribute(ctxt);
1400: if (ret == NULL) {
1401: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1402: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1403: ctxt->wellFormed = 0;
1404: }
1405: }
1406:
1407: return(ret);
1408: }
1409:
1410: /**
1411: * htmlParseSystemLiteral:
1412: * @ctxt: an HTML parser context
1413: *
1414: * parse an HTML Literal
1415: *
1416: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1417: *
1418: * Returns the SystemLiteral parsed or NULL
1419: */
1420:
1421: CHAR *
1422: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1423: const CHAR *q;
1424: CHAR *ret = NULL;
1425:
1426: if (CUR == '"') {
1427: NEXT;
1428: q = CUR_PTR;
1429: while ((IS_CHAR(CUR)) && (CUR != '"'))
1430: NEXT;
1431: if (!IS_CHAR(CUR)) {
1432: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1433: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1434: ctxt->wellFormed = 0;
1435: } else {
1436: ret = xmlStrndup(q, CUR_PTR - q);
1437: NEXT;
1438: }
1439: } else if (CUR == '\'') {
1440: NEXT;
1441: q = CUR_PTR;
1442: while ((IS_CHAR(CUR)) && (CUR != '\''))
1443: NEXT;
1444: if (!IS_CHAR(CUR)) {
1445: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1446: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1447: ctxt->wellFormed = 0;
1448: } else {
1449: ret = xmlStrndup(q, CUR_PTR - q);
1450: NEXT;
1451: }
1452: } else {
1453: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1454: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1455: ctxt->wellFormed = 0;
1456: }
1457:
1458: return(ret);
1459: }
1460:
1461: /**
1462: * htmlParsePubidLiteral:
1463: * @ctxt: an HTML parser context
1464: *
1465: * parse an HTML public literal
1466: *
1467: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1468: *
1469: * Returns the PubidLiteral parsed or NULL.
1470: */
1471:
1472: CHAR *
1473: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1474: const CHAR *q;
1475: CHAR *ret = NULL;
1476: /*
1477: * Name ::= (Letter | '_') (NameChar)*
1478: */
1479: if (CUR == '"') {
1480: NEXT;
1481: q = CUR_PTR;
1482: while (IS_PUBIDCHAR(CUR)) NEXT;
1483: if (CUR != '"') {
1484: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1485: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1486: ctxt->wellFormed = 0;
1487: } else {
1488: ret = xmlStrndup(q, CUR_PTR - q);
1489: NEXT;
1490: }
1491: } else if (CUR == '\'') {
1492: NEXT;
1493: q = CUR_PTR;
1494: while ((IS_LETTER(CUR)) && (CUR != '\''))
1495: NEXT;
1496: if (!IS_LETTER(CUR)) {
1497: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1498: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1499: ctxt->wellFormed = 0;
1500: } else {
1501: ret = xmlStrndup(q, CUR_PTR - q);
1502: NEXT;
1503: }
1504: } else {
1505: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1506: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1507: ctxt->wellFormed = 0;
1508: }
1509:
1510: return(ret);
1511: }
1512:
1513: /**
1514: * htmlParseCharData:
1515: * @ctxt: an HTML parser context
1516: * @cdata: int indicating whether we are within a CDATA section
1517: *
1518: * parse a CharData section.
1519: * if we are within a CDATA section ']]>' marks an end of section.
1520: *
1521: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1522: */
1523:
1524: void
1525: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1526: const CHAR *q;
1527:
1528: q = CUR_PTR;
1529: while ((IS_CHAR(CUR)) && (CUR != '<') &&
1530: (CUR != '&')) {
1531: if ((CUR == ']') && (NXT(1) == ']') &&
1532: (NXT(2) == '>')) {
1533: if (cdata) break;
1534: else {
1535: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1536: ctxt->sax->error(ctxt->userData,
1537: "Sequence ']]>' not allowed in content\n");
1538: ctxt->wellFormed = 0;
1539: }
1540: }
1541: NEXT;
1542: }
1543: if (q == CUR_PTR) return;
1544:
1545: /*
1546: * Ok the segment [q CUR_PTR] is to be consumed as chars.
1547: */
1548: if (ctxt->sax != NULL) {
1549: if (areBlanks(ctxt, q, CUR_PTR - q)) {
1550: if (ctxt->sax->ignorableWhitespace != NULL)
1551: ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q);
1552: } else {
1553: if (ctxt->sax->characters != NULL)
1554: ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q);
1555: }
1556: }
1557: }
1558:
1559: /**
1560: * htmlParseExternalID:
1561: * @ctxt: an HTML parser context
1562: * @publicID: a CHAR** receiving PubidLiteral
1563: * @strict: indicate whether we should restrict parsing to only
1564: * production [75], see NOTE below
1565: *
1566: * Parse an External ID or a Public ID
1567: *
1568: * NOTE: Productions [75] and [83] interract badly since [75] can generate
1569: * 'PUBLIC' S PubidLiteral S SystemLiteral
1570: *
1571: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1572: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1573: *
1574: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1575: *
1576: * Returns the function returns SystemLiteral and in the second
1577: * case publicID receives PubidLiteral, is strict is off
1578: * it is possible to return NULL and have publicID set.
1579: */
1580:
1581: CHAR *
1582: htmlParseExternalID(htmlParserCtxtPtr ctxt, CHAR **publicID, int strict) {
1583: CHAR *URI = NULL;
1584:
1585: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1586: (UPP(2) == 'S') && (UPP(3) == 'T') &&
1587: (UPP(4) == 'E') && (UPP(5) == 'M')) {
1588: SKIP(6);
1589: if (!IS_BLANK(CUR)) {
1590: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1591: ctxt->sax->error(ctxt->userData,
1592: "Space required after 'SYSTEM'\n");
1593: ctxt->wellFormed = 0;
1594: }
1595: SKIP_BLANKS;
1596: URI = htmlParseSystemLiteral(ctxt);
1597: if (URI == NULL) {
1598: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1599: ctxt->sax->error(ctxt->userData,
1600: "htmlParseExternalID: SYSTEM, no URI\n");
1601: ctxt->wellFormed = 0;
1602: }
1603: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1604: (UPP(2) == 'B') && (UPP(3) == 'L') &&
1605: (UPP(4) == 'I') && (UPP(5) == 'C')) {
1606: SKIP(6);
1607: if (!IS_BLANK(CUR)) {
1608: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1609: ctxt->sax->error(ctxt->userData,
1610: "Space required after 'PUBLIC'\n");
1611: ctxt->wellFormed = 0;
1612: }
1613: SKIP_BLANKS;
1614: *publicID = htmlParsePubidLiteral(ctxt);
1615: if (*publicID == NULL) {
1616: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1617: ctxt->sax->error(ctxt->userData,
1618: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1619: ctxt->wellFormed = 0;
1620: }
1.5 daniel 1621: SKIP_BLANKS;
1622: if ((CUR == '"') || (CUR == '\'')) {
1623: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 1624: }
1625: }
1626: return(URI);
1627: }
1628:
1629: /**
1630: * htmlParseComment:
1631: * @ctxt: an HTML parser context
1632: * @create: should we create a node, or just skip the content
1633: *
1634: * Parse an XML (SGML) comment <!-- .... -->
1635: *
1636: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1637: */
1638: void
1639: htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
1640: const CHAR *q, *start;
1641: const CHAR *r;
1642: CHAR *val;
1643:
1644: /*
1645: * Check that there is a comment right here.
1646: */
1647: if ((CUR != '<') || (NXT(1) != '!') ||
1648: (NXT(2) != '-') || (NXT(3) != '-')) return;
1649:
1650: SKIP(4);
1651: start = q = CUR_PTR;
1652: NEXT;
1653: r = CUR_PTR;
1654: NEXT;
1655: while (IS_CHAR(CUR) &&
1656: ((CUR == ':') || (CUR != '>') ||
1657: (*r != '-') || (*q != '-'))) {
1658: if ((*r == '-') && (*q == '-')) {
1659: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1660: ctxt->sax->error(ctxt->userData,
1661: "Comment must not contain '--' (double-hyphen)`\n");
1662: ctxt->wellFormed = 0;
1663: }
1664: NEXT;r++;q++;
1665: }
1666: if (!IS_CHAR(CUR)) {
1667: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1668: ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", start);
1669: ctxt->wellFormed = 0;
1670: } else {
1671: NEXT;
1672: if (create) {
1673: val = xmlStrndup(start, q - start);
1674: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL))
1675: ctxt->sax->comment(ctxt->userData, val);
1676: free(val);
1677: }
1678: }
1679: }
1680:
1681: /**
1682: * htmlParseCharRef:
1683: * @ctxt: an HTML parser context
1684: *
1685: * parse Reference declarations
1686: *
1687: * [66] CharRef ::= '&#' [0-9]+ ';' |
1688: * '&#x' [0-9a-fA-F]+ ';'
1689: *
1690: * Returns the value parsed (as an int)
1691: */
1692: int
1693: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1694: int val = 0;
1695:
1696: if ((CUR == '&') && (NXT(1) == '#') &&
1697: (NXT(2) == 'x')) {
1698: SKIP(3);
1699: while (CUR != ';') {
1700: if ((CUR >= '0') && (CUR <= '9'))
1701: val = val * 16 + (CUR - '0');
1702: else if ((CUR >= 'a') && (CUR <= 'f'))
1703: val = val * 16 + (CUR - 'a') + 10;
1704: else if ((CUR >= 'A') && (CUR <= 'F'))
1705: val = val * 16 + (CUR - 'A') + 10;
1706: else {
1707: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1708: ctxt->sax->error(ctxt->userData,
1709: "htmlParseCharRef: invalid hexadecimal value\n");
1710: ctxt->wellFormed = 0;
1711: val = 0;
1712: break;
1713: }
1714: NEXT;
1715: }
1716: if (CUR == ';')
1717: NEXT;
1718: } else if ((CUR == '&') && (NXT(1) == '#')) {
1719: SKIP(2);
1720: while (CUR != ';') {
1721: if ((CUR >= '0') && (CUR <= '9'))
1722: val = val * 10 + (CUR - '0');
1723: else {
1724: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1725: ctxt->sax->error(ctxt->userData,
1726: "htmlParseCharRef: invalid decimal value\n");
1727: ctxt->wellFormed = 0;
1728: val = 0;
1729: break;
1730: }
1731: NEXT;
1732: }
1733: if (CUR == ';')
1734: NEXT;
1735: } else {
1736: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1737: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1738: ctxt->wellFormed = 0;
1739: }
1740: /*
1741: * Check the value IS_CHAR ...
1742: */
1743: if (IS_CHAR(val)) {
1744: return(val);
1745: } else {
1746: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1747: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid CHAR value %d\n",
1748: val);
1749: ctxt->wellFormed = 0;
1750: }
1751: return(0);
1752: }
1753:
1754:
1755: /**
1756: * htmlParseDocTypeDecl :
1757: * @ctxt: an HTML parser context
1758: *
1759: * parse a DOCTYPE declaration
1760: *
1761: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
1762: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
1763: */
1764:
1765: void
1766: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1767: CHAR *name;
1768: CHAR *ExternalID = NULL;
1769: CHAR *URI = NULL;
1770:
1771: /*
1772: * We know that '<!DOCTYPE' has been detected.
1773: */
1774: SKIP(9);
1775:
1776: SKIP_BLANKS;
1777:
1778: /*
1779: * Parse the DOCTYPE name.
1780: */
1781: name = htmlParseName(ctxt);
1782: if (name == NULL) {
1783: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1784: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
1785: ctxt->wellFormed = 0;
1786: }
1787: /*
1788: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
1789: */
1790:
1791: SKIP_BLANKS;
1792:
1793: /*
1794: * Check for SystemID and ExternalID
1795: */
1.5 daniel 1796: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 1797: SKIP_BLANKS;
1798:
1799: /*
1800: * We should be at the end of the DOCTYPE declaration.
1801: */
1802: if (CUR != '>') {
1803: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1804: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
1805: ctxt->wellFormed = 0;
1806: /* We shouldn't try to resynchronize ... */
1807: } else {
1808: }
1809: NEXT;
1810:
1811: /*
1812: * Create the document accordingly to the DOCTYPE
1813: */
1814: ctxt->myDoc = htmlNewDoc(URI, ExternalID);
1815:
1816: /*
1817: * Cleanup, since we don't use all those identifiers
1818: */
1819: if (URI != NULL) free(URI);
1820: if (ExternalID != NULL) free(ExternalID);
1821: if (name != NULL) free(name);
1822: }
1823:
1824: /**
1825: * htmlParseAttribute:
1826: * @ctxt: an HTML parser context
1827: * @value: a CHAR ** used to store the value of the attribute
1828: *
1829: * parse an attribute
1830: *
1831: * [41] Attribute ::= Name Eq AttValue
1832: *
1833: * [25] Eq ::= S? '=' S?
1834: *
1835: * With namespace:
1836: *
1837: * [NS 11] Attribute ::= QName Eq AttValue
1838: *
1839: * Also the case QName == xmlns:??? is handled independently as a namespace
1840: * definition.
1841: *
1842: * Returns the attribute name, and the value in *value.
1843: */
1844:
1845: CHAR *
1846: htmlParseAttribute(htmlParserCtxtPtr ctxt, CHAR **value) {
1847: CHAR *name, *val;
1848:
1849: *value = NULL;
1850: name = htmlParseName(ctxt);
1851: if (name == NULL) {
1852: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1853: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
1854: ctxt->wellFormed = 0;
1855: return(NULL);
1856: }
1857:
1858: /*
1859: * read the value
1860: */
1861: SKIP_BLANKS;
1862: if (CUR == '=') {
1863: NEXT;
1864: SKIP_BLANKS;
1865: val = htmlParseAttValue(ctxt);
1866: } else {
1867: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1868: ctxt->sax->error(ctxt->userData,
1869: "Specification mandate value for attribute %s\n", name);
1870: ctxt->wellFormed = 0;
1871: return(NULL);
1872: }
1873:
1874: *value = val;
1875: return(name);
1876: }
1877:
1878: /**
1879: * htmlParseStartTag:
1880: * @ctxt: an HTML parser context
1881: *
1882: * parse a start of tag either for rule element or
1883: * EmptyElement. In both case we don't parse the tag closing chars.
1884: *
1885: * [40] STag ::= '<' Name (S Attribute)* S? '>'
1886: *
1887: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1888: *
1889: * With namespace:
1890: *
1891: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
1892: *
1893: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
1894: *
1895: * Returns the element name parsed
1896: */
1897:
1898: CHAR *
1899: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1900: CHAR *name;
1901: CHAR *attname;
1902: CHAR *attvalue;
1903: const CHAR **atts = NULL;
1904: int nbatts = 0;
1905: int maxatts = 0;
1906: int i;
1907:
1908: if (CUR != '<') return(NULL);
1909: NEXT;
1910:
1911: name = htmlParseHTMLName(ctxt);
1912: if (name == NULL) {
1913: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1914: ctxt->sax->error(ctxt->userData,
1915: "htmlParseStartTag: invalid element name\n");
1916: ctxt->wellFormed = 0;
1917: return(NULL);
1918: }
1919:
1920: /*
1921: * Check for auto-closure of HTML elements.
1922: */
1923: htmlAutoClose(ctxt, name);
1924:
1925: /*
1926: * Now parse the attributes, it ends up with the ending
1927: *
1928: * (S Attribute)* S?
1929: */
1930: SKIP_BLANKS;
1931: while ((IS_CHAR(CUR)) &&
1932: (CUR != '>') &&
1933: ((CUR != '/') || (NXT(1) != '>'))) {
1934: const CHAR *q = CUR_PTR;
1935:
1936: attname = htmlParseAttribute(ctxt, &attvalue);
1937: if ((attname != NULL) && (attvalue != NULL)) {
1938: /*
1939: * Well formedness requires at most one declaration of an attribute
1940: */
1941: for (i = 0; i < nbatts;i += 2) {
1942: if (!xmlStrcmp(atts[i], attname)) {
1943: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1944: ctxt->sax->error(ctxt->userData, "Attribute %s redefined\n",
1945: name);
1946: ctxt->wellFormed = 0;
1947: free(attname);
1948: free(attvalue);
1949: break;
1950: }
1951: }
1952:
1953: /*
1954: * Add the pair to atts
1955: */
1956: if (atts == NULL) {
1957: maxatts = 10;
1958: atts = (const CHAR **) malloc(maxatts * sizeof(CHAR *));
1959: if (atts == NULL) {
1960: fprintf(stderr, "malloc of %ld byte failed\n",
1961: maxatts * (long)sizeof(CHAR *));
1962: return(NULL);
1963: }
1964: } else if (nbatts + 2 < maxatts) {
1965: maxatts *= 2;
1966: atts = (const CHAR **) realloc(atts, maxatts * sizeof(CHAR *));
1967: if (atts == NULL) {
1968: fprintf(stderr, "realloc of %ld byte failed\n",
1969: maxatts * (long)sizeof(CHAR *));
1970: return(NULL);
1971: }
1972: }
1973: atts[nbatts++] = attname;
1974: atts[nbatts++] = attvalue;
1975: atts[nbatts] = NULL;
1976: atts[nbatts + 1] = NULL;
1977: }
1978:
1979: SKIP_BLANKS;
1980: if (q == CUR_PTR) {
1981: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1982: ctxt->sax->error(ctxt->userData,
1983: "htmlParseStartTag: problem parsing attributes\n");
1984: ctxt->wellFormed = 0;
1985: break;
1986: }
1987: }
1988:
1989: /*
1990: * SAX: Start of Element !
1991: */
1992: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1993: ctxt->sax->startElement(ctxt->userData, name, atts);
1994:
1995: if (atts != NULL) {
1996: for (i = 0;i < nbatts;i++) free((CHAR *) atts[i]);
1997: free(atts);
1998: }
1999: return(name);
2000: }
2001:
2002: /**
2003: * htmlParseEndTag:
2004: * @ctxt: an HTML parser context
2005: * @tagname: the tag name as parsed in the opening tag.
2006: *
2007: * parse an end of tag
2008: *
2009: * [42] ETag ::= '</' Name S? '>'
2010: *
2011: * With namespace
2012: *
2013: * [NS 9] ETag ::= '</' QName S? '>'
2014: */
2015:
2016: void
2017: htmlParseEndTag(htmlParserCtxtPtr ctxt, const CHAR *tagname) {
2018: CHAR *name;
2019: int i;
2020:
2021: if ((CUR != '<') || (NXT(1) != '/')) {
2022: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2023: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2024: ctxt->wellFormed = 0;
2025: return;
2026: }
2027: SKIP(2);
2028:
2029: name = htmlParseHTMLName(ctxt);
2030:
2031: /*
2032: * We should definitely be at the ending "S? '>'" part
2033: */
2034: SKIP_BLANKS;
2035: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2036: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2037: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2038: ctxt->wellFormed = 0;
2039: } else
2040: NEXT;
2041:
2042: /*
2043: * Check that we are not closing an already closed tag,
2044: * <p><b>...</p></b> is a really common error !
2045: */
2046: for (i = ctxt->nodeNr - 1;i >= 0;i--) {
2047: if ((ctxt->nodeTab[i] != NULL) &&
2048: (!xmlStrcmp(tagname, ctxt->nodeTab[i]->name)))
2049: break;
2050: }
2051: if (i < 0) {
2052: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2053: ctxt->sax->error(ctxt->userData,
2054: "htmlParseEndTag: unexpected close for tag %s\n",
2055: tagname);
1.6 veillard 2056: free(name);
1.1 daniel 2057: ctxt->wellFormed = 0;
2058: return;
2059: }
2060:
2061: /*
2062: * Check for auto-closure of HTML elements.
2063: */
2064: htmlAutoCloseOnClose(ctxt, name);
2065:
2066: /*
2067: * Well formedness constraints, opening and closing must match.
2068: * With the exception that the autoclose may have popped stuff out
2069: * of the stack.
2070: */
2071: if (xmlStrcmp(name, tagname)) {
2072: if ((ctxt->node != NULL) &&
2073: (xmlStrcmp(ctxt->node->name, name))) {
2074: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2075: ctxt->sax->error(ctxt->userData,
2076: "Opening and ending tag mismatch: %s and %s\n",
2077: name, ctxt->node->name);
2078: ctxt->wellFormed = 0;
2079: }
2080: }
2081:
2082: /*
2083: * SAX: End of Tag
2084: */
2085: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2086: ctxt->sax->endElement(ctxt->userData, name);
2087:
2088: if (name != NULL)
2089: free(name);
2090:
2091: return;
2092: }
2093:
2094:
2095: /**
2096: * htmlParseReference:
2097: * @ctxt: an HTML parser context
2098: *
2099: * parse and handle entity references in content,
2100: * this will end-up in a call to character() since this is either a
2101: * CharRef, or a predefined entity.
2102: */
2103: void
2104: htmlParseReference(htmlParserCtxtPtr ctxt) {
2105: htmlEntityDescPtr ent;
2106: CHAR out[2];
2107: CHAR *name;
2108: int val;
2109: if (CUR != '&') return;
2110:
2111: if (NXT(1) == '#') {
2112: val = htmlParseCharRef(ctxt);
1.8 daniel 2113: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2114: out[0] = val;
2115: out[1] = 0;
2116: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2117: ctxt->sax->characters(ctxt->userData, out, 1);
2118: } else {
2119: ent = htmlParseEntityRef(ctxt, &name);
2120: if (name == NULL) return; /* Shall we output & anyway ? */
2121: if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2122: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 2123: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 2124: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.8 daniel 2125: ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1);
1.1 daniel 2126: }
2127: } else {
1.8 daniel 2128: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2129: out[0] = ent->value;
2130: out[1] = 0;
2131: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2132: ctxt->sax->characters(ctxt->userData, out, 1);
2133: }
2134: free(name);
2135: }
2136: }
2137:
2138: /**
2139: * htmlParseContent:
2140: * @ctxt: an HTML parser context
2141: * @name: the node name
2142: *
2143: * Parse a content: comment, sub-element, reference or text.
2144: *
2145: */
2146:
2147: void
2148: htmlParseContent(htmlParserCtxtPtr ctxt, const CHAR *name) {
2149: htmlNodePtr currentNode;
2150:
2151: currentNode = ctxt->node;
2152: while ((CUR != '<') || (NXT(1) != '/')) {
2153: const CHAR *test = CUR_PTR;
2154:
2155: /*
2156: * Has this node been popped out during parsing of
2157: * the next element
2158: */
2159: if (currentNode != ctxt->node) return;
2160:
2161: /*
2162: * First case : a comment
2163: */
2164: if ((CUR == '<') && (NXT(1) == '!') &&
2165: (NXT(2) == '-') && (NXT(3) == '-')) {
2166: htmlParseComment(ctxt, 1);
2167: }
2168:
2169: /*
2170: * Second case : a sub-element.
2171: */
2172: else if (CUR == '<') {
2173: htmlParseElement(ctxt);
2174: }
2175:
2176: /*
2177: * Third case : a reference. If if has not been resolved,
2178: * parsing returns it's Name, create the node
2179: */
2180: else if (CUR == '&') {
2181: htmlParseReference(ctxt);
2182: }
2183:
2184: /*
2185: * Last case, text. Note that References are handled directly.
2186: */
2187: else {
2188: htmlParseCharData(ctxt, 0);
2189: }
2190:
2191: if (test == CUR_PTR) {
2192: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2193: ctxt->sax->error(ctxt->userData,
2194: "detected an error in element content\n");
2195: ctxt->wellFormed = 0;
2196: break;
2197: }
1.5 daniel 2198: GROW;
1.1 daniel 2199: }
2200:
2201: /*
2202: * parse the end of tag: '</' should be here.
2203: */
2204: htmlParseEndTag(ctxt, name);
2205: }
2206:
2207: /**
2208: * htmlParseElement:
2209: * @ctxt: an HTML parser context
2210: *
2211: * parse an HTML element, this is highly recursive
2212: *
2213: * [39] element ::= EmptyElemTag | STag content ETag
2214: *
2215: * [41] Attribute ::= Name Eq AttValue
2216: */
2217:
2218: void
2219: htmlParseElement(htmlParserCtxtPtr ctxt) {
2220: const CHAR *openTag = CUR_PTR;
2221: CHAR *name;
2222: htmlParserNodeInfo node_info;
2223: htmlNodePtr currentNode;
2224: htmlElemDescPtr info;
2225:
2226: /* Capture start position */
2227: node_info.begin_pos = CUR_PTR - ctxt->input->base;
2228: node_info.begin_line = ctxt->input->line;
2229:
2230: name = htmlParseStartTag(ctxt);
2231: if (name == NULL) {
2232: return;
2233: }
2234:
2235: /*
2236: * Lookup the info for that element.
2237: */
2238: info = htmlTagLookup(name);
2239: if (info == NULL) {
2240: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2241: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2242: name);
2243: ctxt->wellFormed = 0;
2244: } else if (info->depr) {
2245: /***************************
2246: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2247: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2248: name);
2249: ***************************/
2250: }
2251:
2252: /*
2253: * Check for an Empty Element labelled the XML/SGML way
2254: */
2255: if ((CUR == '/') && (NXT(1) == '>')) {
2256: SKIP(2);
2257: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2258: ctxt->sax->endElement(ctxt->userData, name);
2259: free(name);
2260: return;
2261: }
2262:
1.5 daniel 2263: if (CUR == '>') {
2264: NEXT;
2265: } else {
1.1 daniel 2266: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2267: ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2268: openTag);
2269: ctxt->wellFormed = 0;
2270:
2271: /*
2272: * end of parsing of this node.
2273: */
2274: nodePop(ctxt);
2275: free(name);
2276: return;
2277: }
2278:
2279: /*
2280: * Check for an Empty Element from DTD definition
2281: */
2282: if ((info != NULL) && (info->empty)) {
2283: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2284: ctxt->sax->endElement(ctxt->userData, name);
2285: free(name);
2286: return;
2287: }
2288:
2289: /*
2290: * Parse the content of the element:
2291: */
2292: currentNode = ctxt->node;
2293: htmlParseContent(ctxt, name);
2294:
2295: /*
2296: * check whether the element get popped due to auto closure
2297: * on start tag
2298: */
2299: if (currentNode != ctxt->node) {
2300: free(name);
2301: return;
2302: }
2303:
2304: if (!IS_CHAR(CUR)) {
2305: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2306: ctxt->sax->error(ctxt->userData,
2307: "Premature end of data in tag %.30s\n", openTag);
2308: ctxt->wellFormed = 0;
2309:
2310: /*
2311: * end of parsing of this node.
2312: */
2313: nodePop(ctxt);
2314: free(name);
2315: return;
2316: }
2317:
2318: free(name);
2319: }
2320:
2321: /**
2322: * htmlParseDocument :
2323: * @ctxt: an HTML parser context
2324: *
2325: * parse an HTML document (and build a tree if using the standard SAX
2326: * interface).
2327: *
2328: * Returns 0, -1 in case of error. the parser context is augmented
2329: * as a result of the parsing.
2330: */
2331:
2332: int
2333: htmlParseDocument(htmlParserCtxtPtr ctxt) {
2334: htmlDefaultSAXHandlerInit();
2335: ctxt->html = 1;
2336:
1.5 daniel 2337: GROW;
1.1 daniel 2338: /*
1.9 ! daniel 2339: * SAX: beginning of the document processing.
1.1 daniel 2340: */
2341: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2342: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2343:
2344: /*
2345: * Wipe out everything which is before the first '<'
2346: */
2347: if (IS_BLANK(CUR)) {
2348: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2349: ctxt->sax->error(ctxt->userData,
2350: "Extra spaces at the beginning of the document are not allowed\n");
2351: ctxt->wellFormed = 0;
2352: SKIP_BLANKS;
2353: }
2354:
2355: if (CUR == 0) {
2356: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2357: ctxt->sax->error(ctxt->userData, "Document is empty\n");
2358: ctxt->wellFormed = 0;
2359: }
2360:
2361:
2362: /*
2363: * Then possibly doc type declaration(s) and more Misc
2364: * (doctypedecl Misc*)?
2365: */
2366: if ((CUR == '<') && (NXT(1) == '!') &&
2367: (UPP(2) == 'D') && (UPP(3) == 'O') &&
2368: (UPP(4) == 'C') && (UPP(5) == 'T') &&
2369: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2370: (UPP(8) == 'E')) {
2371: htmlParseDocTypeDecl(ctxt);
2372: }
2373: SKIP_BLANKS;
2374:
2375: /*
2376: * Create the document if not done already.
2377: */
2378: if (ctxt->myDoc == NULL) {
2379: ctxt->myDoc = htmlNewDoc(NULL, NULL);
2380: }
2381:
2382: /*
2383: * Time to start parsing the tree itself
2384: */
2385: htmlParseElement(ctxt);
2386:
2387: /*
2388: * SAX: end of the document processing.
2389: */
2390: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2391: ctxt->sax->endDocument(ctxt->userData);
2392: if (! ctxt->wellFormed) return(-1);
2393: return(0);
2394: }
2395:
2396:
2397: /********************************************************************************
2398: * *
2399: * Parser contexts handling *
2400: * *
2401: ********************************************************************************/
2402:
2403: /**
2404: * xmlInitParserCtxt:
2405: * @ctxt: an HTML parser context
2406: *
2407: * Initialize a parser context
2408: */
2409:
2410: void
2411: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2412: {
2413: htmlSAXHandler *sax;
2414:
2415: sax = (htmlSAXHandler *) malloc(sizeof(htmlSAXHandler));
2416: if (sax == NULL) {
2417: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2418: }
2419:
2420: /* Allocate the Input stack */
2421: ctxt->inputTab = (htmlParserInputPtr *) malloc(5 * sizeof(htmlParserInputPtr));
2422: ctxt->inputNr = 0;
2423: ctxt->inputMax = 5;
2424: ctxt->input = NULL;
2425: ctxt->version = NULL;
2426: ctxt->encoding = NULL;
2427: ctxt->standalone = -1;
2428:
2429: /* Allocate the Node stack */
2430: ctxt->nodeTab = (htmlNodePtr *) malloc(10 * sizeof(htmlNodePtr));
2431: ctxt->nodeNr = 0;
2432: ctxt->nodeMax = 10;
2433: ctxt->node = NULL;
2434:
2435: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2436: else {
2437: ctxt->sax = sax;
2438: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2439: }
2440: ctxt->userData = ctxt;
2441: ctxt->myDoc = NULL;
2442: ctxt->wellFormed = 1;
2443: ctxt->replaceEntities = 0;
2444: ctxt->html = 1;
2445: ctxt->record_info = 0;
2446: xmlInitNodeInfoSeq(&ctxt->node_seq);
2447: }
2448:
2449: /**
2450: * htmlFreeParserCtxt:
2451: * @ctxt: an HTML parser context
2452: *
2453: * Free all the memory used by a parser context. However the parsed
2454: * document in ctxt->myDoc is not freed.
2455: */
2456:
2457: void
2458: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2459: {
2460: htmlParserInputPtr input;
2461:
2462: if (ctxt == NULL) return;
2463:
2464: while ((input = inputPop(ctxt)) != NULL) {
2465: xmlFreeInputStream(input);
2466: }
2467:
2468: if (ctxt->nodeTab != NULL) free(ctxt->nodeTab);
2469: if (ctxt->inputTab != NULL) free(ctxt->inputTab);
2470: if (ctxt->version != NULL) free((char *) ctxt->version);
2471: if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
2472: free(ctxt->sax);
2473: free(ctxt);
2474: }
2475:
2476: /**
2477: * htmlCreateDocParserCtxt :
2478: * @cur: a pointer to an array of CHAR
2479: * @encoding: a free form C string describing the HTML document encoding, or NULL
2480: *
2481: * Create a parser context for an HTML document.
2482: *
2483: * Returns the new parser context or NULL
2484: */
2485: htmlParserCtxtPtr
2486: htmlCreateDocParserCtxt(CHAR *cur, const char *encoding) {
2487: htmlParserCtxtPtr ctxt;
2488: htmlParserInputPtr input;
2489: /* htmlCharEncoding enc; */
2490:
2491: ctxt = (htmlParserCtxtPtr) malloc(sizeof(htmlParserCtxt));
2492: if (ctxt == NULL) {
2493: perror("malloc");
2494: return(NULL);
2495: }
2496: htmlInitParserCtxt(ctxt);
2497: input = (htmlParserInputPtr) malloc(sizeof(htmlParserInput));
2498: if (input == NULL) {
2499: perror("malloc");
2500: free(ctxt);
2501: return(NULL);
2502: }
2503:
2504: /*
2505: * plug some encoding conversion routines here. !!!
2506: if (encoding != NULL) {
2507: enc = htmlDetectCharEncoding(cur);
2508: htmlSwitchEncoding(ctxt, enc);
2509: }
2510: */
2511:
2512: input->filename = NULL;
2513: input->line = 1;
2514: input->col = 1;
2515: input->base = cur;
2516: input->cur = cur;
2517: input->free = NULL;
1.5 daniel 2518: input->buf = NULL;
1.1 daniel 2519:
2520: inputPush(ctxt, input);
2521: return(ctxt);
2522: }
2523:
2524: /********************************************************************************
2525: * *
2526: * User entry points *
2527: * *
2528: ********************************************************************************/
2529:
2530: /**
2531: * htmlSAXParseDoc :
2532: * @cur: a pointer to an array of CHAR
2533: * @encoding: a free form C string describing the HTML document encoding, or NULL
2534: * @sax: the SAX handler block
2535: * @userData: if using SAX, this pointer will be provided on callbacks.
2536: *
2537: * parse an HTML in-memory document and build a tree.
2538: * It use the given SAX function block to handle the parsing callback.
2539: * If sax is NULL, fallback to the default DOM tree building routines.
2540: *
2541: * Returns the resulting document tree
2542: */
2543:
2544: htmlDocPtr
2545: htmlSAXParseDoc(CHAR *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
2546: htmlDocPtr ret;
2547: htmlParserCtxtPtr ctxt;
2548:
2549: if (cur == NULL) return(NULL);
2550:
2551:
2552: ctxt = htmlCreateDocParserCtxt(cur, encoding);
2553: if (ctxt == NULL) return(NULL);
2554: if (sax != NULL) {
2555: ctxt->sax = sax;
2556: ctxt->userData = userData;
2557: }
2558:
2559: htmlParseDocument(ctxt);
2560: ret = ctxt->myDoc;
2561: if (sax != NULL) {
2562: ctxt->sax = NULL;
2563: ctxt->userData = NULL;
2564: }
2565: htmlFreeParserCtxt(ctxt);
2566:
2567: return(ret);
2568: }
2569:
2570: /**
2571: * htmlParseDoc :
2572: * @cur: a pointer to an array of CHAR
2573: * @encoding: a free form C string describing the HTML document encoding, or NULL
2574: *
2575: * parse an HTML in-memory document and build a tree.
2576: *
2577: * Returns the resulting document tree
2578: */
2579:
2580: htmlDocPtr
2581: htmlParseDoc(CHAR *cur, const char *encoding) {
2582: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
2583: }
2584:
2585:
2586: /**
2587: * htmlCreateFileParserCtxt :
2588: * @filename: the filename
2589: * @encoding: a free form C string describing the HTML document encoding, or NULL
2590: *
2591: * Create a parser context for a file content.
2592: * Automatic support for ZLIB/Compress compressed document is provided
2593: * by default if found at compile-time.
2594: *
2595: * Returns the new parser context or NULL
2596: */
2597: htmlParserCtxtPtr
2598: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
2599: {
2600: htmlParserCtxtPtr ctxt;
2601: htmlParserInputPtr inputStream;
1.5 daniel 2602: xmlParserInputBufferPtr buf;
1.1 daniel 2603: /* htmlCharEncoding enc; */
2604:
1.5 daniel 2605: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
2606: if (buf == NULL) return(NULL);
1.1 daniel 2607:
2608: ctxt = (htmlParserCtxtPtr) malloc(sizeof(htmlParserCtxt));
2609: if (ctxt == NULL) {
2610: perror("malloc");
2611: return(NULL);
2612: }
2613: htmlInitParserCtxt(ctxt);
2614: inputStream = (htmlParserInputPtr) malloc(sizeof(htmlParserInput));
2615: if (inputStream == NULL) {
2616: perror("malloc");
2617: free(ctxt);
2618: return(NULL);
2619: }
2620:
2621: inputStream->filename = strdup(filename);
2622: inputStream->line = 1;
2623: inputStream->col = 1;
1.5 daniel 2624: inputStream->buf = buf;
1.1 daniel 2625:
1.5 daniel 2626: inputStream->base = inputStream->buf->buffer->content;
2627: inputStream->cur = inputStream->buf->buffer->content;
2628: inputStream->free = NULL;
1.1 daniel 2629:
2630: inputPush(ctxt, inputStream);
2631: return(ctxt);
2632: }
2633:
2634: /**
2635: * htmlSAXParseFile :
2636: * @filename: the filename
2637: * @encoding: a free form C string describing the HTML document encoding, or NULL
2638: * @sax: the SAX handler block
2639: * @userData: if using SAX, this pointer will be provided on callbacks.
2640: *
2641: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2642: * compressed document is provided by default if found at compile-time.
2643: * It use the given SAX function block to handle the parsing callback.
2644: * If sax is NULL, fallback to the default DOM tree building routines.
2645: *
2646: * Returns the resulting document tree
2647: */
2648:
2649: htmlDocPtr
2650: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
2651: void *userData) {
2652: htmlDocPtr ret;
2653: htmlParserCtxtPtr ctxt;
2654:
2655: ctxt = htmlCreateFileParserCtxt(filename, encoding);
2656: if (ctxt == NULL) return(NULL);
2657: if (sax != NULL) {
2658: ctxt->sax = sax;
2659: ctxt->userData = userData;
2660: }
2661:
2662: htmlParseDocument(ctxt);
2663:
2664: ret = ctxt->myDoc;
2665: if (sax != NULL) {
2666: ctxt->sax = NULL;
2667: ctxt->userData = NULL;
2668: }
2669: htmlFreeParserCtxt(ctxt);
2670:
2671: return(ret);
2672: }
2673:
2674: /**
2675: * htmlParseFile :
2676: * @filename: the filename
2677: * @encoding: a free form C string describing the HTML document encoding, or NULL
2678: *
2679: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
2680: * compressed document is provided by default if found at compile-time.
2681: *
2682: * Returns the resulting document tree
2683: */
2684:
2685: htmlDocPtr
2686: htmlParseFile(const char *filename, const char *encoding) {
2687: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
2688: }
Webmaster