Annotation of XML/HTMLparser.c, revision 1.43
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
1.29 daniel 10: #include "win32config.h"
1.1 daniel 11: #else
1.13 daniel 12: #include "config.h"
1.1 daniel 13: #endif
1.13 daniel 14:
1.39 daniel 15: #include "xmlversion.h"
16: #ifdef LIBXML_HTML_ENABLED
17:
1.1 daniel 18: #include <stdio.h>
1.13 daniel 19: #include <string.h> /* for memset() only */
20: #ifdef HAVE_CTYPE_H
1.1 daniel 21: #include <ctype.h>
1.13 daniel 22: #endif
23: #ifdef HAVE_STDLIB_H
1.1 daniel 24: #include <stdlib.h>
1.13 daniel 25: #endif
26: #ifdef HAVE_SYS_STAT_H
1.1 daniel 27: #include <sys/stat.h>
1.13 daniel 28: #endif
1.1 daniel 29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
1.39 daniel 39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/HTMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
44: #include <libxml/valid.h>
45: #include <libxml/parserInternals.h>
46: #include <libxml/xmlIO.h>
1.31 daniel 47: #include "xml-error.h"
1.5 daniel 48:
/*
 * Size limits used by the HTML parser.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1024
#define HTML_PARSER_BUFFER_SIZE 100

/* Amount handed to xmlParserInputGrow() whenever more input is requested. */
#define INPUT_CHUNK 50

/*
 * Debugging switches: DEBUG enables the fprintf(stderr, ...) traces of the
 * auto-close machinery; both are disabled by default.
 */
/* #define DEBUG */
/* #define DEBUG_PUSH */
1.1 daniel 56:
57: /************************************************************************
58: * *
59: * Parser stacks related functions and macros *
60: * *
61: ************************************************************************/
62:
63: /*
64: * Generic function for accessing stacks in the Parser Context
65: */
66:
1.30 daniel 67: #define PUSH_AND_POP(scope, type, name) \
68: scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
1.1 daniel 69: if (ctxt->name##Nr >= ctxt->name##Max) { \
70: ctxt->name##Max *= 2; \
1.11 daniel 71: ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
1.1 daniel 72: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
73: if (ctxt->name##Tab == NULL) { \
74: fprintf(stderr, "realloc failed !\n"); \
1.33 daniel 75: return(0); \
1.1 daniel 76: } \
77: } \
78: ctxt->name##Tab[ctxt->name##Nr] = value; \
79: ctxt->name = value; \
80: return(ctxt->name##Nr++); \
81: } \
1.30 daniel 82: scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
1.1 daniel 83: type ret; \
1.18 daniel 84: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 85: ctxt->name##Nr--; \
1.18 daniel 86: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 87: if (ctxt->name##Nr > 0) \
88: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
89: else \
90: ctxt->name = NULL; \
91: ret = ctxt->name##Tab[ctxt->name##Nr]; \
92: ctxt->name##Tab[ctxt->name##Nr] = 0; \
93: return(ret); \
94: } \
95:
1.30 daniel 96: PUSH_AND_POP(extern, xmlNodePtr, node)
97: PUSH_AND_POP(extern, xmlChar*, name)
1.1 daniel 98:
/*
 * Macros for accessing the content being parsed. They are meant for the
 * parser only and must not be exported: every one of them expects a
 * variable named `ctxt' (the parser context) to be in scope.
 *
 * Byte-level macros, to be used only for comparisons against ASCII values:
 *
 *   CUR_PTR   the current input pointer (ctxt->input->cur).
 *   CUR       the xmlChar at the current position, as an int.
 *   CURRENT   defined here with exactly the same expansion as CUR.
 *   UPPER     the current xmlChar passed through toupper().
 *   NXT(n)    the xmlChar n positions after the current one.
 *   UPP(n)    same as NXT(n), passed through toupper().
 *   SKIP(n)   advance the input pointer by n xmlChars and add n to
 *             ctxt->nbChars; line and column counters are not updated.
 *
 * Input buffer management:
 *
 *   SHRINK    calls xmlParserInputShrink() on the current input.
 *   GROW      calls xmlParserInputGrow() on the current input, asking for
 *             INPUT_CHUNK more.
 *
 * Moving forward:
 *
 *   NEXT         advance by one char through htmlNextChar().
 *   SKIP_BLANKS  skip blank chars through htmlSkipBlankChars().
 *
 * NOTE: the expansions of NEXT and SKIP_BLANKS already end with a
 * semicolon, so `NEXT;' yields an extra empty statement; take care when
 * using them as the unbraced body of an if that has an else.
 */

/* reading without moving */
#define CUR_PTR ctxt->input->cur

#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define UPPER (toupper(*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* raw advance */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

/* input buffer management */
#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* char-wise advance */
#define NEXT htmlNextChar(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);
1.35 daniel 148:
149: /**
150: * htmlNextChar:
151: * @ctxt: the HTML parser context
152: *
153: * Skip to the next char input char.
154: */
155:
156: void
157: htmlNextChar(htmlParserCtxtPtr ctxt) {
158: if ((*ctxt->input->cur == 0) &&
159: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
160: xmlPopInput(ctxt);
161: } else {
162: if (*(ctxt->input->cur) == '\n') {
163: ctxt->input->line++; ctxt->input->col = 1;
164: } else ctxt->input->col++;
165: ctxt->input->cur++;
166: ctxt->nbChars++;
167: if (*ctxt->input->cur == 0)
168: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
169: }
170: }
1.5 daniel 171:
1.36 daniel 172: /**
173: * htmlSkipBlankChars:
174: * @ctxt: the HTML parser context
175: *
176: * skip all blanks character found at that point in the input streams.
177: *
178: * Returns the number of space chars skipped
179: */
180:
181: int
182: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
183: int res = 0;
184:
185: while (IS_BLANK(*(ctxt->input->cur))) {
186: if ((*ctxt->input->cur == 0) &&
187: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
188: xmlPopInput(ctxt);
189: } else {
190: if (*(ctxt->input->cur) == '\n') {
191: ctxt->input->line++; ctxt->input->col = 1;
192: } else ctxt->input->col++;
193: ctxt->input->cur++;
194: ctxt->nbChars++;
195: if (*ctxt->input->cur == 0)
196: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
197: }
198: res++;
199: }
200: return(res);
201: }
1.1 daniel 202:
203:
1.5 daniel 204:
1.1 daniel 205: /************************************************************************
206: * *
207: * The list of HTML elements and their properties *
208: * *
209: ************************************************************************/
210:
211: /*
212: * Start Tag: 1 means the start tag can be ommited
213: * End Tag: 1 means the end tag can be ommited
214: * 2 means it's forbidden (empty elements)
215: * Depr: this element is deprecated
216: * DTD: 1 means that this element is valid only in the Loose DTD
217: * 2 means that this element is valid only in the Frameset DTD
218: *
219: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
220: */
221: htmlElemDesc html40ElementTable[] = {
1.26 daniel 222: { "a", 0, 0, 0, 0, 0, "anchor " },
223: { "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
224: { "acronym", 0, 0, 0, 0, 0, "" },
225: { "address", 0, 0, 0, 0, 0, "information on author " },
226: { "applet", 0, 0, 0, 1, 1, "java applet " },
227: { "area", 0, 2, 1, 0, 0, "client-side image map area " },
228: { "b", 0, 0, 0, 0, 0, "bold text style" },
229: { "base", 0, 2, 1, 0, 0, "document base uri " },
230: { "basefont", 0, 2, 1, 1, 1, "base font size " },
231: { "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
232: { "big", 0, 0, 0, 0, 0, "large text style" },
233: { "blockquote", 0, 0, 0, 0, 0, "long quotation " },
234: { "body", 1, 1, 0, 0, 0, "document body " },
235: { "br", 0, 2, 1, 0, 0, "forced line break " },
236: { "button", 0, 0, 0, 0, 0, "push button " },
237: { "caption", 0, 0, 0, 0, 0, "table caption " },
238: { "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
239: { "cite", 0, 0, 0, 0, 0, "citation" },
240: { "code", 0, 0, 0, 0, 0, "computer code fragment" },
241: { "col", 0, 2, 1, 0, 0, "table column " },
242: { "colgroup", 0, 1, 0, 0, 0, "table column group " },
243: { "dd", 0, 1, 0, 0, 0, "definition description " },
244: { "del", 0, 0, 0, 0, 0, "deleted text " },
245: { "dfn", 0, 0, 0, 0, 0, "instance definition" },
246: { "dir", 0, 0, 0, 1, 1, "directory list" },
247: { "div", 0, 0, 0, 0, 0, "generic language/style container"},
248: { "dl", 0, 0, 0, 0, 0, "definition list " },
249: { "dt", 0, 1, 0, 0, 0, "definition term " },
250: { "em", 0, 0, 0, 0, 0, "emphasis" },
251: { "fieldset", 0, 0, 0, 0, 0, "form control group " },
252: { "font", 0, 0, 0, 1, 1, "local change to font " },
253: { "form", 0, 0, 0, 0, 0, "interactive form " },
254: { "frame", 0, 2, 1, 0, 2, "subwindow " },
255: { "frameset", 0, 0, 0, 0, 2, "window subdivision" },
256: { "h1", 0, 0, 0, 0, 0, "heading " },
257: { "h2", 0, 0, 0, 0, 0, "heading " },
258: { "h3", 0, 0, 0, 0, 0, "heading " },
259: { "h4", 0, 0, 0, 0, 0, "heading " },
260: { "h5", 0, 0, 0, 0, 0, "heading " },
261: { "h6", 0, 0, 0, 0, 0, "heading " },
262: { "head", 1, 1, 0, 0, 0, "document head " },
263: { "hr", 0, 2, 1, 0, 0, "horizontal rule " },
264: { "html", 1, 1, 0, 0, 0, "document root element " },
265: { "i", 0, 0, 0, 0, 0, "italic text style" },
266: { "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
267: { "img", 0, 2, 1, 0, 0, "embedded image " },
268: { "input", 0, 2, 1, 0, 0, "form control " },
269: { "ins", 0, 0, 0, 0, 0, "inserted text" },
270: { "isindex", 0, 2, 1, 1, 1, "single line prompt " },
271: { "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
272: { "label", 0, 0, 0, 0, 0, "form field label text " },
273: { "legend", 0, 0, 0, 0, 0, "fieldset legend " },
274: { "li", 0, 1, 0, 0, 0, "list item " },
275: { "link", 0, 2, 1, 0, 0, "a media-independent link " },
276: { "map", 0, 0, 0, 0, 0, "client-side image map " },
277: { "menu", 0, 0, 0, 1, 1, "menu list " },
278: { "meta", 0, 2, 1, 0, 0, "generic metainformation " },
279: { "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
280: { "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
281: { "object", 0, 0, 0, 0, 0, "generic embedded object " },
282: { "ol", 0, 0, 0, 0, 0, "ordered list " },
283: { "optgroup", 0, 0, 0, 0, 0, "option group " },
284: { "option", 0, 1, 0, 0, 0, "selectable choice " },
285: { "p", 0, 1, 0, 0, 0, "paragraph " },
286: { "param", 0, 2, 1, 0, 0, "named property value " },
287: { "pre", 0, 0, 0, 0, 0, "preformatted text " },
288: { "q", 0, 0, 0, 0, 0, "short inline quotation " },
289: { "s", 0, 0, 0, 1, 1, "strike-through text style" },
290: { "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
291: { "script", 0, 0, 0, 0, 0, "script statements " },
292: { "select", 0, 0, 0, 0, 0, "option selector " },
293: { "small", 0, 0, 0, 0, 0, "small text style" },
294: { "span", 0, 0, 0, 0, 0, "generic language/style container " },
295: { "strike", 0, 0, 0, 1, 1, "strike-through text" },
296: { "strong", 0, 0, 0, 0, 0, "strong emphasis" },
297: { "style", 0, 0, 0, 0, 0, "style info " },
298: { "sub", 0, 0, 0, 0, 0, "subscript" },
299: { "sup", 0, 0, 0, 0, 0, "superscript " },
300: { "table", 0, 0, 0, 0, 0, " " },
301: { "tbody", 1, 1, 0, 0, 0, "table body " },
302: { "td", 0, 1, 0, 0, 0, "table data cell" },
303: { "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
304: { "tfoot", 0, 1, 0, 0, 0, "table footer " },
305: { "th", 0, 1, 0, 0, 0, "table header cell" },
306: { "thead", 0, 1, 0, 0, 0, "table header " },
307: { "title", 0, 0, 0, 0, 0, "document title " },
308: { "tr", 0, 1, 0, 0, 0, "table row " },
309: { "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
310: { "u", 0, 0, 0, 1, 1, "underlined text style" },
311: { "ul", 0, 0, 0, 0, 0, "unordered list " },
312: { "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
1.1 daniel 313: };
314:
/*
 * Groups of equivalent elements with respect to implied ends: a start tag
 * listed in a group implies the end of the current element when the type
 * of that element belongs to the same group.
 *
 * Each group is terminated by NULL, and the whole list by an extra NULL.
 */
char *htmlEquEnd[] = {
    "dt", "dd", "li", "option",
    NULL,
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    NULL
};
/*
 * According to the HTML DTD, HR should be added to the second group above
 * (the headings), as it is not allowed within a H1, H2, H3, etc. That case
 * is tolerated on purpose because many documents contain rules in
 * headings...
 */
331:
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups, itself terminated by
 * an extra NULL. In each group the first string is the start tag being
 * opened and the strings following it are the elements that this start tag
 * closes (see htmlCheckAutoClose()).
 */
char *htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", NULL,
"td",		"th", "td", "p", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};


/*
 * Index over htmlStartClose: each used slot points to the first string of
 * one group, the unused slots are NULL. Built once by htmlInitAutoClose().
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 *
 * The index is built only once: the function records that it ran in
 * htmlStartCloseIndexinitialized and later calls return immediately.
 * At most (number of slots - 1) groups are indexed so that the index
 * always ends with a NULL slot.
 */
void
htmlInitAutoClose(void) {
    const int slots = (int) (sizeof(htmlStartCloseIndex) /
                             sizeof(htmlStartCloseIndex[0]));
    int slot, i = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (slot = 0; slot < slots; slot++) htmlStartCloseIndex[slot] = NULL;

    slot = 0;
    while ((htmlStartClose[i] != NULL) && (slot < slots - 1)) {
        /* htmlStartClose[i] is the start tag heading a group */
        htmlStartCloseIndex[slot++] = &htmlStartClose[i];
        /* skip the rest of the group and its NULL terminator */
        while (htmlStartClose[i] != NULL) i++;
        i++;
    }

    /* remember the work is done, otherwise every lookup rebuilds the index */
    htmlStartCloseIndexinitialized = 1;
}
421:
422: /**
423: * htmlTagLookup:
424: * @tag: The tag name
425: *
426: * Lookup the HTML tag in the ElementTable
427: *
428: * Returns the related htmlElemDescPtr or NULL if not found.
429: */
430: htmlElemDescPtr
1.14 daniel 431: htmlTagLookup(const xmlChar *tag) {
1.1 daniel 432: int i = 0;
433:
434: for (i = 0; i < (sizeof(html40ElementTable) /
435: sizeof(html40ElementTable[0]));i++) {
1.8 daniel 436: if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 437: return(&html40ElementTable[i]);
438: }
439: return(NULL);
440: }
441:
442: /**
443: * htmlCheckAutoClose:
444: * @new: The new tag name
445: * @old: The old tag name
446: *
447: * Checks wether the new tag is one of the registered valid tags for closing old.
448: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
449: *
450: * Returns 0 if no, 1 if yes.
451: */
452: int
1.14 daniel 453: htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
1.1 daniel 454: int i, index;
1.8 daniel 455: char **close;
1.1 daniel 456:
457: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
458:
459: /* inefficient, but not a big deal */
460: for (index = 0; index < 100;index++) {
461: close = htmlStartCloseIndex[index];
462: if (close == NULL) return(0);
1.8 daniel 463: if (!xmlStrcmp(BAD_CAST *close, new)) break;
1.1 daniel 464: }
465:
466: i = close - htmlStartClose;
467: i++;
468: while (htmlStartClose[i] != NULL) {
1.8 daniel 469: if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
1.1 daniel 470: return(1);
471: }
472: i++;
473: }
474: return(0);
475: }
476:
477: /**
478: * htmlAutoClose:
479: * @ctxt: an HTML parser context
480: * @new: The new tag name
481: *
482: * The HTmL DtD allows a tag to implicitely close other tags.
483: * The list is kept in htmlStartClose array. This function is
484: * called when a new tag has been detected and generates the
485: * appropriates closes if possible/needed.
486: */
487: void
1.14 daniel 488: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
1.15 daniel 489: xmlChar *oldname;
490: while ((ctxt->name != NULL) &&
491: (htmlCheckAutoClose(new, ctxt->name))) {
1.1 daniel 492: #ifdef DEBUG
1.18 daniel 493: fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
1.1 daniel 494: #endif
495: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1.15 daniel 496: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 497: oldname = htmlnamePop(ctxt);
1.18 daniel 498: if (oldname != NULL) {
499: #ifdef DEBUG
500: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
501: #endif
1.17 daniel 502: xmlFree(oldname);
1.18 daniel 503: }
1.1 daniel 504: }
505: }
506:
507: /**
1.28 daniel 508: * htmlAutoCloseTag:
509: * @doc: the HTML document
510: * @name: The tag name
511: * @elem: the HTML element
512: *
513: * The HTmL DtD allows a tag to implicitely close other tags.
514: * The list is kept in htmlStartClose array. This function checks
515: * if the element or one of it's children would autoclose the
516: * given tag.
517: *
518: * Returns 1 if autoclose, 0 otherwise
519: */
520: int
521: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
522: htmlNodePtr child;
523:
524: if (elem == NULL) return(1);
525: if (!xmlStrcmp(name, elem->name)) return(0);
526: if (htmlCheckAutoClose(elem->name, name)) return(1);
1.37 daniel 527: child = elem->children;
1.28 daniel 528: while (child != NULL) {
529: if (htmlAutoCloseTag(doc, name, child)) return(1);
530: child = child->next;
531: }
532: return(0);
533: }
534:
535: /**
536: * htmlIsAutoClosed:
537: * @doc: the HTML document
538: * @elem: the HTML element
539: *
540: * The HTmL DtD allows a tag to implicitely close other tags.
541: * The list is kept in htmlStartClose array. This function checks
542: * if a tag is autoclosed by one of it's child
543: *
544: * Returns 1 if autoclosed, 0 otherwise
545: */
546: int
547: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
548: htmlNodePtr child;
549:
550: if (elem == NULL) return(1);
1.37 daniel 551: child = elem->children;
1.28 daniel 552: while (child != NULL) {
553: if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
554: child = child->next;
555: }
556: return(0);
557: }
558:
559: /**
1.1 daniel 560: * htmlAutoCloseOnClose:
561: * @ctxt: an HTML parser context
562: * @new: The new tag name
563: *
564: * The HTmL DtD allows an ending tag to implicitely close other tags.
565: */
566: void
1.14 daniel 567: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
1.1 daniel 568: htmlElemDescPtr info;
1.15 daniel 569: xmlChar *oldname;
1.18 daniel 570: int i;
1.1 daniel 571:
1.18 daniel 572: #ifdef DEBUG
573: fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
574: for (i = 0;i < ctxt->nameNr;i++)
575: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
576: #endif
577:
578: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
579: if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
580: }
581: if (i < 0) return;
582:
583: while (xmlStrcmp(new, ctxt->name)) {
1.15 daniel 584: info = htmlTagLookup(ctxt->name);
1.1 daniel 585: if ((info == NULL) || (info->endTag == 1)) {
586: #ifdef DEBUG
1.18 daniel 587: fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
588: #endif
589: } else {
590: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
591: ctxt->sax->error(ctxt->userData,
592: "Opening and ending tag mismatch: %s and %s\n",
593: new, ctxt->name);
594: ctxt->wellFormed = 0;
595: }
596: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
597: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 598: oldname = htmlnamePop(ctxt);
1.18 daniel 599: if (oldname != NULL) {
600: #ifdef DEBUG
601: fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
1.1 daniel 602: #endif
1.18 daniel 603: xmlFree(oldname);
604: }
1.1 daniel 605: }
606: }
607:
1.43 ! daniel 608: /**
! 609: * htmlCheckImplied:
! 610: * @ctxt: an HTML parser context
! 611: * @new: The new tag name
! 612: *
! 613: * The HTmL DtD allows a tag to exists only implicitely
! 614: * called when a new tag has been detected and generates the
! 615: * appropriates implicit tags if missing
! 616: */
! 617: void
! 618: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) {
! 619: if (!strcmp(new, "html"))
! 620: return;
! 621: if (ctxt->nameNr <= 0) {
! 622: #ifdef DEBUG
! 623: fprintf(stderr,"Implied element html: pushed html\n");
! 624: #endif
! 625: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
! 626: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
! 627: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
! 628: }
! 629: if ((!strcmp(new, "body")) || (!strcmp(new, "head")))
! 630: return;
! 631: if (ctxt->nameNr <= 1) {
! 632: if ((!strcmp(new, "script")) || (!strcmp(new, "style")) ||
! 633: (!strcmp(new, "meta")) || (!strcmp(new, "link")) ||
! 634: (!strcmp(new, "title")) || (!strcmp(new, "base"))) {
! 635: /*
! 636: * dropped OBJECT ... i you put it first BODY will be
! 637: * assumed !
! 638: */
! 639: #ifdef DEBUG
! 640: fprintf(stderr,"Implied element head: pushed head\n");
! 641: #endif
! 642: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
! 643: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
! 644: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
! 645: } else {
! 646: #ifdef DEBUG
! 647: fprintf(stderr,"Implied element body: pushed body\n");
! 648: #endif
! 649: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
! 650: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
! 651: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
! 652: }
! 653: }
! 654: }
! 655:
1.1 daniel 656: /************************************************************************
657: * *
658: * The list of HTML predefined entities *
659: * *
660: ************************************************************************/
661:
662:
663: htmlEntityDesc html40EntitiesTable[] = {
664: /*
665: * the 4 absolute ones,
666: */
667: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
668: { 38, "amp", "ampersand, U+0026 ISOnum" },
669: { 60, "lt", "less-than sign, U+003C ISOnum" },
670: { 62, "gt", "greater-than sign, U+003E ISOnum" },
671:
672: /*
673: * A bunch still in the 128-255 range
674: * Replacing them depend really on the charset used.
675: */
1.28 daniel 676: { 39, "apos", "single quote" },
1.1 daniel 677: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
678: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
679: { 162, "cent", "cent sign, U+00A2 ISOnum" },
680: { 163, "pound","pound sign, U+00A3 ISOnum" },
681: { 164, "curren","currency sign, U+00A4 ISOnum" },
682: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
683: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
684: { 167, "sect", "section sign, U+00A7 ISOnum" },
685: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
686: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
687: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
688: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
689: { 172, "not", "not sign, U+00AC ISOnum" },
690: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
691: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
692: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
693: { 176, "deg", "degree sign, U+00B0 ISOnum" },
694: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
695: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
696: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
697: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
698: { 181, "micro","micro sign, U+00B5 ISOnum" },
699: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 700: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 701: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
702: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
703: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 704: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 705: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
706: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
707: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
708: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
709: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
710: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
711: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
712: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
713: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
714: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
715: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
716: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
717: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
718: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
719: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
720: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
721: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
722: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
723: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
724: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
725: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
726: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
727: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
728: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
729: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
730: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
731: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
732: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 733: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 734: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
735: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
736: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
737: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
738: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
739: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
740: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
741: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
742: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
743: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
744: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
745: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
746: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
747: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
748: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
749: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
750: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
751: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
752: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
753: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
754: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
755: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
756: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
757: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
758: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
759: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
760: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
761: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
762: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
763: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
764: { 247, "divide","division sign, U+00F7 ISOnum" },
765: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
766: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
767: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
768: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
769: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
770: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
771: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
772: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
773:
774: /*
775: * Anything below should really be kept as entities references
776: */
777: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
778:
779: { 913, "Alpha","greek capital letter alpha, U+0391" },
780: { 914, "Beta", "greek capital letter beta, U+0392" },
781: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
782: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
783: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
784: { 918, "Zeta", "greek capital letter zeta, U+0396" },
785: { 919, "Eta", "greek capital letter eta, U+0397" },
786: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
787: { 921, "Iota", "greek capital letter iota, U+0399" },
788: { 922, "Kappa","greek capital letter kappa, U+039A" },
789: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
790: { 924, "Mu", "greek capital letter mu, U+039C" },
791: { 925, "Nu", "greek capital letter nu, U+039D" },
792: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
793: { 927, "Omicron","greek capital letter omicron, U+039F" },
794: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
795: { 929, "Rho", "greek capital letter rho, U+03A1" },
796: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
797: { 932, "Tau", "greek capital letter tau, U+03A4" },
798: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
799: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
800: { 935, "Chi", "greek capital letter chi, U+03A7" },
801: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
802: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
803:
804: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
805: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
806: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
807: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
808: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
809: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
810: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
811: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
812: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
813: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
814: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
815: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
816: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
817: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
818: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
819: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
820: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
821: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
822: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
823: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
824: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
825: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
826: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
827: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
828: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
829: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
830: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
831: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
832:
833: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
834: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
835: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
836: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
837: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
838: { 8260, "frasl","fraction slash, U+2044 NEW" },
839:
1.7 daniel 840: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 841: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
842: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
843: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
844: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
845: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
846: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
847: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
848: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
849: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
850: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
851: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
852: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
853: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
854: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
855: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
856:
857:
858: { 8704, "forall","for all, U+2200 ISOtech" },
859: { 8706, "part", "partial differential, U+2202 ISOtech" },
860: { 8707, "exist","there exists, U+2203 ISOtech" },
861: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
862: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
863: { 8712, "isin", "element of, U+2208 ISOtech" },
864: { 8713, "notin","not an element of, U+2209 ISOtech" },
865: { 8715, "ni", "contains as member, U+220B ISOtech" },
866: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
867: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
868: { 8722, "minus","minus sign, U+2212 ISOtech" },
869: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
870: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
871: { 8733, "prop", "proportional to, U+221D ISOtech" },
872: { 8734, "infin","infinity, U+221E ISOtech" },
873: { 8736, "ang", "angle, U+2220 ISOamso" },
874: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
875: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
876: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
877: { 8746, "cup", "union = cup, U+222A ISOtech" },
878: { 8747, "int", "integral, U+222B ISOtech" },
879: { 8756, "there4","therefore, U+2234 ISOtech" },
880: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
881: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
882: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
883: { 8800, "ne", "not equal to, U+2260 ISOtech" },
884: { 8801, "equiv","identical to, U+2261 ISOtech" },
885: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
886: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
887: { 8834, "sub", "subset of, U+2282 ISOtech" },
888: { 8835, "sup", "superset of, U+2283 ISOtech" },
889: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
890: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
891: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
892: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
893: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
894: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
895: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
896: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
897: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
898: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
899: { 8971, "rfloor","right floor, U+230B ISOamsc" },
900: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
901: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
902: { 9674, "loz", "lozenge, U+25CA ISOpub" },
903:
904: { 9824, "spades","black spade suit, U+2660 ISOpub" },
905: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
906: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
907: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
908:
909: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
910: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
911: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
912: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
913: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
914: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
915: { 732, "tilde","small tilde, U+02DC ISOdia" },
916:
917: { 8194, "ensp", "en space, U+2002 ISOpub" },
918: { 8195, "emsp", "em space, U+2003 ISOpub" },
919: { 8201, "thinsp","thin space, U+2009 ISOpub" },
920: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
921: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
922: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
923: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
924: { 8211, "ndash","en dash, U+2013 ISOpub" },
925: { 8212, "mdash","em dash, U+2014 ISOpub" },
926: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
927: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
928: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
929: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
930: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
931: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
932: { 8224, "dagger","dagger, U+2020 ISOpub" },
933: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
934: { 8240, "permil","per mille sign, U+2030 ISOtech" },
935: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1.7 daniel 936: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1.1 daniel 937: { 8364, "euro", "euro sign, U+20AC NEW" }
938: };
939:
940: /************************************************************************
941: * *
942: * Commodity functions to handle entities *
943: * *
944: ************************************************************************/
945:
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() is held in a temporary so that a failed
 * reallocation does not overwrite (and thereby leak) the only pointer to
 * the existing allocation: on failure the old block is released, @buffer
 * is set to NULL and the enclosing function returns NULL.
 *
 * Requirements on the expansion site:
 *  - a variable named <buffer>_size must be in scope;
 *  - the enclosing function must return a pointer type;
 *  - any pointer into the old buffer must be recomputed afterwards.
 *
 * NOTE(review): releasing the old block assumes xmlRealloc() follows the
 * realloc() contract (original block untouched on failure) - confirm for
 * the memory-debugging allocator.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                                 buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	buffer = NULL;							\
	return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
957:
958: /**
959: * htmlEntityLookup:
960: * @name: the entity name
961: *
962: * Lookup the given entity in EntitiesTable
963: *
964: * TODO: the linear scan is really ugly, an hash table is really needed.
965: *
966: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
967: */
968: htmlEntityDescPtr
1.14 daniel 969: htmlEntityLookup(const xmlChar *name) {
1.1 daniel 970: int i;
971:
972: for (i = 0;i < (sizeof(html40EntitiesTable)/
973: sizeof(html40EntitiesTable[0]));i++) {
1.8 daniel 974: if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 975: #ifdef DEBUG
1.18 daniel 976: fprintf(stderr,"Found entity %s\n", name);
1.1 daniel 977: #endif
978: return(&html40EntitiesTable[i]);
979: }
980: }
981: return(NULL);
982: }
983:
984:
985: /**
986: * htmlDecodeEntities:
987: * @ctxt: the parser context
988: * @len: the len to decode (in bytes !), -1 for no size limit
1.14 daniel 989: * @end: an end marker xmlChar, 0 if none
990: * @end2: an end marker xmlChar, 0 if none
991: * @end3: an end marker xmlChar, 0 if none
1.1 daniel 992: *
993: * Subtitute the HTML entities by their value
994: *
1.19 daniel 995: * DEPRECATED !!!!
1.1 daniel 996: *
997: * Returns A newly allocated string with the substitution done. The caller
998: * must deallocate it !
999: */
1.14 daniel 1000: xmlChar *
1.1 daniel 1001: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1.14 daniel 1002: xmlChar end, xmlChar end2, xmlChar end3) {
1003: xmlChar *buffer = NULL;
1.1 daniel 1004: int buffer_size = 0;
1.14 daniel 1005: xmlChar *out = NULL;
1006: xmlChar *name = NULL;
1.1 daniel 1007:
1.14 daniel 1008: xmlChar *cur = NULL;
1.1 daniel 1009: htmlEntityDescPtr ent;
1.5 daniel 1010: int nbchars = 0;
1.1 daniel 1011: unsigned int max = (unsigned int) len;
1012:
1013: /*
1014: * allocate a translation buffer.
1015: */
1.31 daniel 1016: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1.14 daniel 1017: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1.1 daniel 1018: if (buffer == NULL) {
1019: perror("htmlDecodeEntities: malloc failed");
1020: return(NULL);
1021: }
1022: out = buffer;
1023:
1024: /*
1025: * Ok loop until we reach one of the ending char or a size limit.
1026: */
1.5 daniel 1027: while ((nbchars < max) && (CUR != end) &&
1.1 daniel 1028: (CUR != end2) && (CUR != end3)) {
1029:
1030: if (CUR == '&') {
1031: if (NXT(1) == '#') {
1032: int val = htmlParseCharRef(ctxt);
1.8 daniel 1033: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 1034: *out++ = val;
1.5 daniel 1035: nbchars += 3; /* !!!! */
1.1 daniel 1036: } else {
1037: ent = htmlParseEntityRef(ctxt, &name);
1038: if (name != NULL) {
1039: if ((ent == NULL) || (ent->value <= 0) ||
1040: (ent->value >= 255)) {
1041: *out++ = '&';
1042: cur = name;
1043: while (*cur != 0) {
1044: if (out - buffer > buffer_size - 100) {
1045: int index = out - buffer;
1046:
1047: growBuffer(buffer);
1048: out = &buffer[index];
1049: }
1050: *out++ = *cur++;
1051: }
1052: *out++ = ';';
1053: } else {
1.8 daniel 1054: /* invalid for UTF-8 variable encoding !!!!! */
1.14 daniel 1055: *out++ = (xmlChar)ent->value;
1.1 daniel 1056: if (out - buffer > buffer_size - 100) {
1057: int index = out - buffer;
1058:
1059: growBuffer(buffer);
1060: out = &buffer[index];
1061: }
1062: }
1.5 daniel 1063: nbchars += 2 + xmlStrlen(name);
1.11 daniel 1064: xmlFree(name);
1.1 daniel 1065: }
1066: }
1067: } else {
1.8 daniel 1068: /* invalid for UTF-8 , use COPY(out); !!!!! */
1.1 daniel 1069: *out++ = CUR;
1.5 daniel 1070: nbchars++;
1.1 daniel 1071: if (out - buffer > buffer_size - 100) {
1072: int index = out - buffer;
1073:
1074: growBuffer(buffer);
1075: out = &buffer[index];
1076: }
1077: NEXT;
1078: }
1079: }
1080: *out++ = 0;
1081: return(buffer);
1082: }
1083:
1084:
1085: /************************************************************************
1086: * *
1087: * Commodity functions to handle encodings *
1088: * *
1089: ************************************************************************/
1090:
1091: /**
1092: * htmlSwitchEncoding:
1093: * @ctxt: the parser context
1094: * @len: the len of @cur
1095: *
1096: * change the input functions when discovering the character encoding
1097: * of a given entity.
1098: *
1099: */
1100: void
1101: htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
1102: {
1103: switch (enc) {
1104: case XML_CHAR_ENCODING_ERROR:
1105: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1106: ctxt->sax->error(ctxt->userData, "encoding unknown\n");
1107: ctxt->wellFormed = 0;
1108: break;
1109: case XML_CHAR_ENCODING_NONE:
1110: /* let's assume it's UTF-8 without the XML decl */
1111: return;
1112: case XML_CHAR_ENCODING_UTF8:
1113: /* default encoding, no conversion should be needed */
1114: return;
1115: case XML_CHAR_ENCODING_UTF16LE:
1116: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1117: ctxt->sax->error(ctxt->userData,
1118: "char encoding UTF16 little endian not supported\n");
1119: break;
1120: case XML_CHAR_ENCODING_UTF16BE:
1121: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1122: ctxt->sax->error(ctxt->userData,
1123: "char encoding UTF16 big endian not supported\n");
1124: break;
1125: case XML_CHAR_ENCODING_UCS4LE:
1126: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1127: ctxt->sax->error(ctxt->userData,
1128: "char encoding USC4 little endian not supported\n");
1129: break;
1130: case XML_CHAR_ENCODING_UCS4BE:
1131: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1132: ctxt->sax->error(ctxt->userData,
1133: "char encoding USC4 big endian not supported\n");
1134: break;
1135: case XML_CHAR_ENCODING_EBCDIC:
1136: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1137: ctxt->sax->error(ctxt->userData,
1138: "char encoding EBCDIC not supported\n");
1139: break;
1140: case XML_CHAR_ENCODING_UCS4_2143:
1141: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1142: ctxt->sax->error(ctxt->userData,
1143: "char encoding UCS4 2143 not supported\n");
1144: break;
1145: case XML_CHAR_ENCODING_UCS4_3412:
1146: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1147: ctxt->sax->error(ctxt->userData,
1148: "char encoding UCS4 3412 not supported\n");
1149: break;
1150: case XML_CHAR_ENCODING_UCS2:
1151: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1152: ctxt->sax->error(ctxt->userData,
1153: "char encoding UCS2 not supported\n");
1154: break;
1155: case XML_CHAR_ENCODING_8859_1:
1156: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1157: ctxt->sax->error(ctxt->userData,
1158: "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1159: break;
1160: case XML_CHAR_ENCODING_8859_2:
1161: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1162: ctxt->sax->error(ctxt->userData,
1163: "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1164: break;
1165: case XML_CHAR_ENCODING_8859_3:
1166: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1167: ctxt->sax->error(ctxt->userData,
1168: "char encoding ISO_8859_3 not supported\n");
1169: break;
1170: case XML_CHAR_ENCODING_8859_4:
1171: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1172: ctxt->sax->error(ctxt->userData,
1173: "char encoding ISO_8859_4 not supported\n");
1174: break;
1175: case XML_CHAR_ENCODING_8859_5:
1176: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1177: ctxt->sax->error(ctxt->userData,
1178: "char encoding ISO_8859_5 not supported\n");
1179: break;
1180: case XML_CHAR_ENCODING_8859_6:
1181: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1182: ctxt->sax->error(ctxt->userData,
1183: "char encoding ISO_8859_6 not supported\n");
1184: break;
1185: case XML_CHAR_ENCODING_8859_7:
1186: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1187: ctxt->sax->error(ctxt->userData,
1188: "char encoding ISO_8859_7 not supported\n");
1189: break;
1190: case XML_CHAR_ENCODING_8859_8:
1191: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1192: ctxt->sax->error(ctxt->userData,
1193: "char encoding ISO_8859_8 not supported\n");
1194: break;
1195: case XML_CHAR_ENCODING_8859_9:
1196: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1197: ctxt->sax->error(ctxt->userData,
1198: "char encoding ISO_8859_9 not supported\n");
1199: break;
1200: case XML_CHAR_ENCODING_2022_JP:
1201: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1202: ctxt->sax->error(ctxt->userData,
1203: "char encoding ISO-2022-JPnot supported\n");
1204: break;
1205: case XML_CHAR_ENCODING_SHIFT_JIS:
1206: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1207: ctxt->sax->error(ctxt->userData,
1208: "char encoding Shift_JISnot supported\n");
1209: break;
1210: case XML_CHAR_ENCODING_EUC_JP:
1211: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1212: ctxt->sax->error(ctxt->userData,
1213: "char encoding EUC-JPnot supported\n");
1214: break;
1215: }
1216: }
1217:
1.31 daniel 1218: /************************************************************************
1219: * *
1220: * Commodity functions to handle streams *
1221: * *
1222: ************************************************************************/
1223:
1224: /**
1225: * htmlFreeInputStream:
1226: * @input: an htmlParserInputPtr
1227: *
1228: * Free up an input stream.
1229: */
1230: void
1231: htmlFreeInputStream(htmlParserInputPtr input) {
1232: if (input == NULL) return;
1233:
1234: if (input->filename != NULL) xmlFree((char *) input->filename);
1235: if (input->directory != NULL) xmlFree((char *) input->directory);
1236: if ((input->free != NULL) && (input->base != NULL))
1237: input->free((xmlChar *) input->base);
1238: if (input->buf != NULL)
1239: xmlFreeParserInputBuffer(input->buf);
1240: memset(input, -1, sizeof(htmlParserInput));
1241: xmlFree(input);
1242: }
1243:
1244: /**
1245: * htmlNewInputStream:
1246: * @ctxt: an HTML parser context
1247: *
1248: * Create a new input stream structure
1249: * Returns the new input stream or NULL
1250: */
1251: htmlParserInputPtr
1252: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1253: htmlParserInputPtr input;
1254:
1255: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1256: if (input == NULL) {
1257: ctxt->errNo = XML_ERR_NO_MEMORY;
1258: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1259: ctxt->sax->error(ctxt->userData,
1260: "malloc: couldn't allocate a new input stream\n");
1261: ctxt->errNo = XML_ERR_NO_MEMORY;
1262: return(NULL);
1263: }
1264: input->filename = NULL;
1265: input->directory = NULL;
1266: input->base = NULL;
1267: input->cur = NULL;
1268: input->buf = NULL;
1269: input->line = 1;
1270: input->col = 1;
1271: input->buf = NULL;
1272: input->free = NULL;
1273: input->consumed = 0;
1274: input->length = 0;
1275: return(input);
1276: }
1277:
1.1 daniel 1278:
1279: /************************************************************************
1280: * *
1281: * Commodity functions, cleanup needed ? *
1282: * *
1283: ************************************************************************/
1284:
1285: /**
1286: * areBlanks:
1287: * @ctxt: an HTML parser context
1.14 daniel 1288: * @str: a xmlChar *
1.1 daniel 1289: * @len: the size of @str
1290: *
1291: * Is this a sequence of blank chars that one can ignore ?
1292: *
1293: * Returns 1 if ignorable 0 otherwise.
1294: */
1295:
1.14 daniel 1296: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1.1 daniel 1297: int i;
1298: xmlNodePtr lastChild;
1299:
1300: for (i = 0;i < len;i++)
1301: if (!(IS_BLANK(str[i]))) return(0);
1302:
1303: if (CUR != '<') return(0);
1304: if (ctxt->node == NULL) return(0);
1305: lastChild = xmlGetLastChild(ctxt->node);
1306: if (lastChild == NULL) {
1307: if (ctxt->node->content != NULL) return(0);
1308: } else if (xmlNodeIsText(lastChild))
1309: return(0);
1310: return(1);
1311: }
1312:
1313: /**
1314: * htmlHandleEntity:
1315: * @ctxt: an HTML parser context
1316: * @entity: an XML entity pointer.
1317: *
1318: * Default handling of an HTML entity, call the parser with the
1319: * substitution string
1320: */
1321:
1322: void
1323: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1324: int len;
1325:
1326: if (entity->content == NULL) {
1327: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1328: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1329: entity->name);
1330: ctxt->wellFormed = 0;
1331: return;
1332: }
1333: len = xmlStrlen(entity->content);
1334:
1335: /*
1336: * Just handle the content as a set of chars.
1337: */
1338: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1339: ctxt->sax->characters(ctxt->userData, entity->content, len);
1340:
1341: }
1342:
1343: /**
1344: * htmlNewDoc:
1345: * @URI: URI for the dtd, or NULL
1346: * @ExternalID: the external ID of the DTD, or NULL
1347: *
1348: * Returns a new document
1349: */
1350: htmlDocPtr
1.14 daniel 1351: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1.1 daniel 1352: xmlDocPtr cur;
1353:
1354: /*
1355: * Allocate a new document and fill the fields.
1356: */
1.11 daniel 1357: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1.1 daniel 1358: if (cur == NULL) {
1359: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1360: return(NULL);
1361: }
1.10 daniel 1362: memset(cur, 0, sizeof(xmlDoc));
1.1 daniel 1363:
1.20 daniel 1364: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 1365: cur->version = NULL;
1366: cur->intSubset = NULL;
1.28 daniel 1367: if ((ExternalID == NULL) &&
1368: (URI == NULL))
1369: xmlCreateIntSubset(cur, BAD_CAST "HTML",
1370: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1371: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1372: else
1373: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.41 daniel 1374: cur->doc = cur;
1.1 daniel 1375: cur->name = NULL;
1.37 daniel 1376: cur->children = NULL;
1.1 daniel 1377: cur->extSubset = NULL;
1378: cur->oldNs = NULL;
1379: cur->encoding = NULL;
1380: cur->standalone = 1;
1381: cur->compression = 0;
1.12 daniel 1382: cur->ids = NULL;
1383: cur->refs = NULL;
1.1 daniel 1384: #ifndef XML_WITHOUT_CORBA
1385: cur->_private = NULL;
1386: #endif
1387: return(cur);
1388: }
1389:
1390:
1391: /************************************************************************
1392: * *
1393: * The parser itself *
1394: * Relates to http://www.w3.org/TR/html40 *
1395: * *
1396: ************************************************************************/
1397:
1398: /************************************************************************
1399: * *
1400: * The parser itself *
1401: * *
1402: ************************************************************************/
1403:
1404: /**
1405: * htmlParseHTMLName:
1406: * @ctxt: an HTML parser context
1407: *
1.26 daniel 1408: * parse an HTML tag or attribute name, note that we convert it to lowercase
1.1 daniel 1409: * since HTML names are not case-sensitive.
1410: *
1411: * Returns the Tag Name parsed or NULL
1412: */
1413:
1.14 daniel 1414: xmlChar *
1.1 daniel 1415: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1416: xmlChar *ret = NULL;
1.1 daniel 1417: int i = 0;
1.31 daniel 1418: xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1.1 daniel 1419:
1420: if (!IS_LETTER(CUR) && (CUR != '_') &&
1421: (CUR != ':')) return(NULL);
1422:
1.31 daniel 1423: while ((i < HTML_PARSER_BUFFER_SIZE) &&
1424: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
1.26 daniel 1425: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1.1 daniel 1426: else loc[i] = CUR;
1427: i++;
1428:
1429: NEXT;
1430: }
1431:
1432: ret = xmlStrndup(loc, i);
1433:
1434: return(ret);
1435: }
1436:
1437: /**
1438: * htmlParseName:
1439: * @ctxt: an HTML parser context
1440: *
1441: * parse an HTML name, this routine is case sensistive.
1442: *
1443: * Returns the Name parsed or NULL
1444: */
1445:
1.14 daniel 1446: xmlChar *
1.1 daniel 1447: htmlParseName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1448: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1449: int len = 0;
1.1 daniel 1450:
1.5 daniel 1451: GROW;
1452: if (!IS_LETTER(CUR) && (CUR != '_')) {
1453: return(NULL);
1454: }
1.1 daniel 1455:
1456: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1457: (CUR == '.') || (CUR == '-') ||
1458: (CUR == '_') || (CUR == ':') ||
1459: (IS_COMBINING(CUR)) ||
1.5 daniel 1460: (IS_EXTENDER(CUR))) {
1461: buf[len++] = CUR;
1.1 daniel 1462: NEXT;
1.5 daniel 1463: if (len >= HTML_MAX_NAMELEN) {
1464: fprintf(stderr,
1465: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1466: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1467: (CUR == '.') || (CUR == '-') ||
1468: (CUR == '_') || (CUR == ':') ||
1469: (IS_COMBINING(CUR)) ||
1470: (IS_EXTENDER(CUR)))
1471: NEXT;
1472: break;
1473: }
1474: }
1475: return(xmlStrndup(buf, len));
1.1 daniel 1476: }
1477:
1478: /**
1479: * htmlParseHTMLAttribute:
1480: * @ctxt: an HTML parser context
1.19 daniel 1481: * @stop: a char stop value
1.1 daniel 1482: *
1.19 daniel 1483: * parse an HTML attribute value till the stop (quote), if
1484: * stop is 0 then it stops at the first space
1.1 daniel 1485: *
1.19 daniel 1486: * Returns the attribute parsed or NULL
1.1 daniel 1487: */
1488:
1.14 daniel 1489: xmlChar *
1.19 daniel 1490: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1.32 daniel 1491: #if 0
1.14 daniel 1492: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1493: int len = 0;
1.1 daniel 1494:
1.5 daniel 1495: GROW;
1.19 daniel 1496: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1497: if ((stop == 0) && (IS_BLANK(CUR))) break;
1.5 daniel 1498: buf[len++] = CUR;
1.1 daniel 1499: NEXT;
1.5 daniel 1500: if (len >= HTML_MAX_NAMELEN) {
1501: fprintf(stderr,
1502: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1503: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1.19 daniel 1504: (CUR != '>') &&
1.5 daniel 1505: (CUR != '\'') && (CUR != '"'))
1506: NEXT;
1507: break;
1508: }
1509: }
1510: return(xmlStrndup(buf, len));
1.32 daniel 1511: #else
1512: xmlChar *buffer = NULL;
1513: int buffer_size = 0;
1514: xmlChar *out = NULL;
1515: xmlChar *name = NULL;
1516:
1517: xmlChar *cur = NULL;
1518: htmlEntityDescPtr ent;
1519:
1520: /*
1521: * allocate a translation buffer.
1522: */
1523: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1524: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1525: if (buffer == NULL) {
1526: perror("htmlParseHTMLAttribute: malloc failed");
1527: return(NULL);
1528: }
1529: out = buffer;
1530:
1531: /*
1532: * Ok loop until we reach one of the ending chars
1533: */
1534: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1535: if ((stop == 0) && (IS_BLANK(CUR))) break;
1536: if (CUR == '&') {
1537: if (NXT(1) == '#') {
1538: int val = htmlParseCharRef(ctxt);
1539: *out++ = val;
1540: } else {
1541: ent = htmlParseEntityRef(ctxt, &name);
1542: if (name == NULL) {
1543: *out++ = '&';
1544: if (out - buffer > buffer_size - 100) {
1545: int index = out - buffer;
1546:
1547: growBuffer(buffer);
1548: out = &buffer[index];
1549: }
1550: } else if ((ent == NULL) || (ent->value <= 0) ||
1551: (ent->value >= 255)) {
1552: *out++ = '&';
1553: cur = name;
1554: while (*cur != 0) {
1555: if (out - buffer > buffer_size - 100) {
1556: int index = out - buffer;
1557:
1558: growBuffer(buffer);
1559: out = &buffer[index];
1560: }
1561: *out++ = *cur++;
1562: }
1563: xmlFree(name);
1564: } else {
1565: *out++ = ent->value;
1566: if (out - buffer > buffer_size - 100) {
1567: int index = out - buffer;
1568:
1569: growBuffer(buffer);
1570: out = &buffer[index];
1571: }
1572: xmlFree(name);
1573: }
1574: }
1575: } else {
1576: *out++ = CUR;
1577: if (out - buffer > buffer_size - 100) {
1578: int index = out - buffer;
1579:
1580: growBuffer(buffer);
1581: out = &buffer[index];
1582: }
1583: NEXT;
1584: }
1585: }
1586: *out++ = 0;
1587: return(buffer);
1588: #endif
1.1 daniel 1589: }
1590:
1591: /**
1592: * htmlParseNmtoken:
1593: * @ctxt: an HTML parser context
1594: *
1595: * parse an HTML Nmtoken.
1596: *
1597: * Returns the Nmtoken parsed or NULL
1598: */
1599:
1.14 daniel 1600: xmlChar *
1.1 daniel 1601: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.14 daniel 1602: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1603: int len = 0;
1.1 daniel 1604:
1.5 daniel 1605: GROW;
1.1 daniel 1606: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1607: (CUR == '.') || (CUR == '-') ||
1608: (CUR == '_') || (CUR == ':') ||
1609: (IS_COMBINING(CUR)) ||
1.5 daniel 1610: (IS_EXTENDER(CUR))) {
1611: buf[len++] = CUR;
1.1 daniel 1612: NEXT;
1.5 daniel 1613: if (len >= HTML_MAX_NAMELEN) {
1614: fprintf(stderr,
1615: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1616: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1617: (CUR == '.') || (CUR == '-') ||
1618: (CUR == '_') || (CUR == ':') ||
1619: (IS_COMBINING(CUR)) ||
1620: (IS_EXTENDER(CUR)))
1621: NEXT;
1622: break;
1623: }
1624: }
1625: return(xmlStrndup(buf, len));
1.1 daniel 1626: }
1627:
1628: /**
1629: * htmlParseEntityRef:
1630: * @ctxt: an HTML parser context
1631: * @str: location to store the entity name
1632: *
1633: * parse an HTML ENTITY references
1634: *
1635: * [68] EntityRef ::= '&' Name ';'
1636: *
1637: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1638: * if non-NULL *str will have to be freed by the caller.
1639: */
1640: htmlEntityDescPtr
1.14 daniel 1641: htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1642: xmlChar *name;
1.1 daniel 1643: htmlEntityDescPtr ent = NULL;
1644: *str = NULL;
1645:
1646: if (CUR == '&') {
1647: NEXT;
1648: name = htmlParseName(ctxt);
1649: if (name == NULL) {
1650: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1651: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1652: ctxt->wellFormed = 0;
1653: } else {
1.5 daniel 1654: GROW;
1.1 daniel 1655: if (CUR == ';') {
1656: *str = name;
1657:
1658: /*
1659: * Lookup the entity in the table.
1660: */
1661: ent = htmlEntityLookup(name);
1.32 daniel 1662: if (ent != NULL) /* OK that's ugly !!! */
1663: NEXT;
1.1 daniel 1664: } else {
1665: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1666: ctxt->sax->error(ctxt->userData,
1667: "htmlParseEntityRef: expecting ';'\n");
1.32 daniel 1668: *str = name;
1.1 daniel 1669: }
1670: }
1671: }
1672: return(ent);
1673: }
1674:
1675: /**
1676: * htmlParseAttValue:
1677: * @ctxt: an HTML parser context
1678: *
1679: * parse a value for an attribute
1680: * Note: the parser won't do substitution of entities here, this
1681: * will be handled later in xmlStringGetNodeList, unless it was
1682: * asked for ctxt->replaceEntities != 0
1683: *
1684: * Returns the AttValue parsed or NULL.
1685: */
1686:
1.14 daniel 1687: xmlChar *
1.1 daniel 1688: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1.14 daniel 1689: xmlChar *ret = NULL;
1.1 daniel 1690:
1691: if (CUR == '"') {
1692: NEXT;
1.19 daniel 1693: ret = htmlParseHTMLAttribute(ctxt, '"');
1.1 daniel 1694: if (CUR != '"') {
1695: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1696: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1697: ctxt->wellFormed = 0;
1698: } else
1699: NEXT;
1700: } else if (CUR == '\'') {
1701: NEXT;
1.19 daniel 1702: ret = htmlParseHTMLAttribute(ctxt, '\'');
1.1 daniel 1703: if (CUR != '\'') {
1704: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1705: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1706: ctxt->wellFormed = 0;
1707: } else
1708: NEXT;
1709: } else {
1710: /*
1711: * That's an HTMLism, the attribute value may not be quoted
1712: */
1.19 daniel 1713: ret = htmlParseHTMLAttribute(ctxt, 0);
1.1 daniel 1714: if (ret == NULL) {
1715: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1716: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1717: ctxt->wellFormed = 0;
1718: }
1719: }
1720: return(ret);
1721: }
1722:
1723: /**
1724: * htmlParseSystemLiteral:
1725: * @ctxt: an HTML parser context
1726: *
1727: * parse an HTML Literal
1728: *
1729: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1730: *
1731: * Returns the SystemLiteral parsed or NULL
1732: */
1733:
1.14 daniel 1734: xmlChar *
1.1 daniel 1735: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1736: const xmlChar *q;
1737: xmlChar *ret = NULL;
1.1 daniel 1738:
1739: if (CUR == '"') {
1740: NEXT;
1741: q = CUR_PTR;
1742: while ((IS_CHAR(CUR)) && (CUR != '"'))
1743: NEXT;
1744: if (!IS_CHAR(CUR)) {
1745: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1746: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1747: ctxt->wellFormed = 0;
1748: } else {
1749: ret = xmlStrndup(q, CUR_PTR - q);
1750: NEXT;
1751: }
1752: } else if (CUR == '\'') {
1753: NEXT;
1754: q = CUR_PTR;
1755: while ((IS_CHAR(CUR)) && (CUR != '\''))
1756: NEXT;
1757: if (!IS_CHAR(CUR)) {
1758: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1759: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1760: ctxt->wellFormed = 0;
1761: } else {
1762: ret = xmlStrndup(q, CUR_PTR - q);
1763: NEXT;
1764: }
1765: } else {
1766: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.38 daniel 1767: ctxt->sax->error(ctxt->userData,
1768: "SystemLiteral \" or ' expected\n");
1.1 daniel 1769: ctxt->wellFormed = 0;
1770: }
1771:
1772: return(ret);
1773: }
1774:
1775: /**
1776: * htmlParsePubidLiteral:
1777: * @ctxt: an HTML parser context
1778: *
1779: * parse an HTML public literal
1780: *
1781: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1782: *
1783: * Returns the PubidLiteral parsed or NULL.
1784: */
1785:
1.14 daniel 1786: xmlChar *
1.1 daniel 1787: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1788: const xmlChar *q;
1789: xmlChar *ret = NULL;
1.1 daniel 1790: /*
1791: * Name ::= (Letter | '_') (NameChar)*
1792: */
1793: if (CUR == '"') {
1794: NEXT;
1795: q = CUR_PTR;
1796: while (IS_PUBIDCHAR(CUR)) NEXT;
1797: if (CUR != '"') {
1798: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1799: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1800: ctxt->wellFormed = 0;
1801: } else {
1802: ret = xmlStrndup(q, CUR_PTR - q);
1803: NEXT;
1804: }
1805: } else if (CUR == '\'') {
1806: NEXT;
1807: q = CUR_PTR;
1808: while ((IS_LETTER(CUR)) && (CUR != '\''))
1809: NEXT;
1810: if (!IS_LETTER(CUR)) {
1811: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1812: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1813: ctxt->wellFormed = 0;
1814: } else {
1815: ret = xmlStrndup(q, CUR_PTR - q);
1816: NEXT;
1817: }
1818: } else {
1819: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1820: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1821: ctxt->wellFormed = 0;
1822: }
1823:
1824: return(ret);
1825: }
1826:
1827: /**
1828: * htmlParseCharData:
1829: * @ctxt: an HTML parser context
1830: * @cdata: int indicating whether we are within a CDATA section
1831: *
1832: * parse a CharData section.
1833: * if we are within a CDATA section ']]>' marks an end of section.
1834: *
1835: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1836: */
1837:
1838: void
1839: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1.25 daniel 1840: xmlChar *buf = NULL;
1841: int len = 0;
1.31 daniel 1842: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1843: xmlChar q;
1844:
1845: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1846: if (buf == NULL) {
1847: fprintf(stderr, "malloc of %d byte failed\n", size);
1848: return;
1849: }
1.1 daniel 1850:
1.25 daniel 1851: q = CUR;
1852: while ((IS_CHAR(q)) && (q != '<') &&
1853: (q != '&')) {
1854: if ((q == ']') && (NXT(1) == ']') &&
1.1 daniel 1855: (NXT(2) == '>')) {
1856: if (cdata) break;
1857: else {
1858: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1859: ctxt->sax->error(ctxt->userData,
1860: "Sequence ']]>' not allowed in content\n");
1861: ctxt->wellFormed = 0;
1862: }
1863: }
1.25 daniel 1864: if (len + 1 >= size) {
1865: size *= 2;
1866: buf = xmlRealloc(buf, size * sizeof(xmlChar));
1867: if (buf == NULL) {
1868: fprintf(stderr, "realloc of %d byte failed\n", size);
1869: return;
1870: }
1871: }
1872: buf[len++] = q;
1.1 daniel 1873: NEXT;
1.25 daniel 1874: q = CUR;
1875: }
1876: if (len == 0) {
1877: xmlFree(buf);
1878: return;
1.1 daniel 1879: }
1880:
1881: /*
1.25 daniel 1882: * Ok the buffer is to be consumed as chars.
1.1 daniel 1883: */
1884: if (ctxt->sax != NULL) {
1.25 daniel 1885: if (areBlanks(ctxt, buf, len)) {
1.1 daniel 1886: if (ctxt->sax->ignorableWhitespace != NULL)
1.25 daniel 1887: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
1.1 daniel 1888: } else {
1889: if (ctxt->sax->characters != NULL)
1.25 daniel 1890: ctxt->sax->characters(ctxt->userData, buf, len);
1.1 daniel 1891: }
1892: }
1.25 daniel 1893: xmlFree(buf);
1.1 daniel 1894: }
1895:
1896: /**
1897: * htmlParseExternalID:
1898: * @ctxt: an HTML parser context
1.14 daniel 1899: * @publicID: a xmlChar** receiving PubidLiteral
1.1 daniel 1900: * @strict: indicate whether we should restrict parsing to only
1901: * production [75], see NOTE below
1902: *
1903: * Parse an External ID or a Public ID
1904: *
1905: * NOTE: Productions [75] and [83] interract badly since [75] can generate
1906: * 'PUBLIC' S PubidLiteral S SystemLiteral
1907: *
1908: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1909: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1910: *
1911: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1912: *
1913: * Returns the function returns SystemLiteral and in the second
1914: * case publicID receives PubidLiteral, is strict is off
1915: * it is possible to return NULL and have publicID set.
1916: */
1917:
1.14 daniel 1918: xmlChar *
1919: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1920: xmlChar *URI = NULL;
1.1 daniel 1921:
1922: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1923: (UPP(2) == 'S') && (UPP(3) == 'T') &&
1924: (UPP(4) == 'E') && (UPP(5) == 'M')) {
1925: SKIP(6);
1926: if (!IS_BLANK(CUR)) {
1927: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1928: ctxt->sax->error(ctxt->userData,
1929: "Space required after 'SYSTEM'\n");
1930: ctxt->wellFormed = 0;
1931: }
1932: SKIP_BLANKS;
1933: URI = htmlParseSystemLiteral(ctxt);
1934: if (URI == NULL) {
1935: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1936: ctxt->sax->error(ctxt->userData,
1937: "htmlParseExternalID: SYSTEM, no URI\n");
1938: ctxt->wellFormed = 0;
1939: }
1940: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1941: (UPP(2) == 'B') && (UPP(3) == 'L') &&
1942: (UPP(4) == 'I') && (UPP(5) == 'C')) {
1943: SKIP(6);
1944: if (!IS_BLANK(CUR)) {
1945: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1946: ctxt->sax->error(ctxt->userData,
1947: "Space required after 'PUBLIC'\n");
1948: ctxt->wellFormed = 0;
1949: }
1950: SKIP_BLANKS;
1951: *publicID = htmlParsePubidLiteral(ctxt);
1952: if (*publicID == NULL) {
1953: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1954: ctxt->sax->error(ctxt->userData,
1955: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1956: ctxt->wellFormed = 0;
1957: }
1.5 daniel 1958: SKIP_BLANKS;
1959: if ((CUR == '"') || (CUR == '\'')) {
1960: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 1961: }
1962: }
1963: return(URI);
1964: }
1965:
1966: /**
1967: * htmlParseComment:
1968: * @ctxt: an HTML parser context
1969: *
1970: * Parse an XML (SGML) comment <!-- .... -->
1971: *
1972: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1973: */
1974: void
1.31 daniel 1975: htmlParseComment(htmlParserCtxtPtr ctxt) {
1.25 daniel 1976: xmlChar *buf = NULL;
1977: int len = 0;
1.31 daniel 1978: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1979: register xmlChar s, r, q;
1.1 daniel 1980:
1981: /*
1982: * Check that there is a comment right here.
1983: */
1984: if ((CUR != '<') || (NXT(1) != '!') ||
1985: (NXT(2) != '-') || (NXT(3) != '-')) return;
1986:
1.25 daniel 1987: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1988: if (buf == NULL) {
1989: fprintf(stderr, "malloc of %d byte failed\n", size);
1990: return;
1991: }
1992: q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1.1 daniel 1993: SKIP(4);
1.25 daniel 1994: s = CUR;
1995:
1996: while (IS_CHAR(s) &&
1997: ((s != '>') || (r != '-') || (q != '-'))) {
1998: if (len + 1 >= size) {
1999: size *= 2;
2000: buf = xmlRealloc(buf, size * sizeof(xmlChar));
2001: if (buf == NULL) {
2002: fprintf(stderr, "realloc of %d byte failed\n", size);
2003: return;
2004: }
2005: }
2006: buf[len++] = s;
2007: NEXT;
2008: q = r;
2009: r = s;
2010: s = CUR;
1.1 daniel 2011: }
1.25 daniel 2012: buf[len - 2] = 0;
2013: if (!IS_CHAR(s)) {
1.1 daniel 2014: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.25 daniel 2015: ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
1.1 daniel 2016: ctxt->wellFormed = 0;
2017: } else {
2018: NEXT;
1.31 daniel 2019: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
2020: ctxt->sax->comment(ctxt->userData, buf);
1.1 daniel 2021: }
2022: }
1.25 daniel 2023: xmlFree(buf);
1.1 daniel 2024: }
2025:
2026: /**
2027: * htmlParseCharRef:
2028: * @ctxt: an HTML parser context
2029: *
2030: * parse Reference declarations
2031: *
2032: * [66] CharRef ::= '&#' [0-9]+ ';' |
2033: * '&#x' [0-9a-fA-F]+ ';'
2034: *
2035: * Returns the value parsed (as an int)
2036: */
2037: int
2038: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2039: int val = 0;
2040:
2041: if ((CUR == '&') && (NXT(1) == '#') &&
2042: (NXT(2) == 'x')) {
2043: SKIP(3);
2044: while (CUR != ';') {
2045: if ((CUR >= '0') && (CUR <= '9'))
2046: val = val * 16 + (CUR - '0');
2047: else if ((CUR >= 'a') && (CUR <= 'f'))
2048: val = val * 16 + (CUR - 'a') + 10;
2049: else if ((CUR >= 'A') && (CUR <= 'F'))
2050: val = val * 16 + (CUR - 'A') + 10;
2051: else {
2052: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2053: ctxt->sax->error(ctxt->userData,
2054: "htmlParseCharRef: invalid hexadecimal value\n");
2055: ctxt->wellFormed = 0;
2056: val = 0;
2057: break;
2058: }
2059: NEXT;
2060: }
2061: if (CUR == ';')
2062: NEXT;
2063: } else if ((CUR == '&') && (NXT(1) == '#')) {
2064: SKIP(2);
2065: while (CUR != ';') {
2066: if ((CUR >= '0') && (CUR <= '9'))
2067: val = val * 10 + (CUR - '0');
2068: else {
2069: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2070: ctxt->sax->error(ctxt->userData,
2071: "htmlParseCharRef: invalid decimal value\n");
2072: ctxt->wellFormed = 0;
2073: val = 0;
2074: break;
2075: }
2076: NEXT;
2077: }
2078: if (CUR == ';')
2079: NEXT;
2080: } else {
2081: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2082: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2083: ctxt->wellFormed = 0;
2084: }
2085: /*
2086: * Check the value IS_CHAR ...
2087: */
2088: if (IS_CHAR(val)) {
2089: return(val);
2090: } else {
2091: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.14 daniel 2092: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
1.1 daniel 2093: val);
2094: ctxt->wellFormed = 0;
2095: }
2096: return(0);
2097: }
2098:
2099:
2100: /**
2101: * htmlParseDocTypeDecl :
2102: * @ctxt: an HTML parser context
2103: *
2104: * parse a DOCTYPE declaration
2105: *
2106: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2107: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2108: */
2109:
2110: void
2111: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1.14 daniel 2112: xmlChar *name;
2113: xmlChar *ExternalID = NULL;
2114: xmlChar *URI = NULL;
1.1 daniel 2115:
2116: /*
2117: * We know that '<!DOCTYPE' has been detected.
2118: */
2119: SKIP(9);
2120:
2121: SKIP_BLANKS;
2122:
2123: /*
2124: * Parse the DOCTYPE name.
2125: */
2126: name = htmlParseName(ctxt);
2127: if (name == NULL) {
2128: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2129: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2130: ctxt->wellFormed = 0;
2131: }
2132: /*
2133: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2134: */
2135:
2136: SKIP_BLANKS;
2137:
2138: /*
2139: * Check for SystemID and ExternalID
2140: */
1.5 daniel 2141: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 2142: SKIP_BLANKS;
2143:
2144: /*
2145: * We should be at the end of the DOCTYPE declaration.
2146: */
2147: if (CUR != '>') {
2148: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2149: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2150: ctxt->wellFormed = 0;
2151: /* We shouldn't try to resynchronize ... */
2152: } else {
2153: }
2154: NEXT;
2155:
2156: /*
2157: * Create the document accordingly to the DOCTYPE
2158: */
1.31 daniel 2159: if (ctxt->myDoc != NULL)
2160: xmlFreeDoc(ctxt->myDoc);
2161:
1.1 daniel 2162: ctxt->myDoc = htmlNewDoc(URI, ExternalID);
2163:
2164: /*
2165: * Cleanup, since we don't use all those identifiers
2166: */
1.11 daniel 2167: if (URI != NULL) xmlFree(URI);
2168: if (ExternalID != NULL) xmlFree(ExternalID);
2169: if (name != NULL) xmlFree(name);
1.1 daniel 2170: }
2171:
2172: /**
2173: * htmlParseAttribute:
2174: * @ctxt: an HTML parser context
1.14 daniel 2175: * @value: a xmlChar ** used to store the value of the attribute
1.1 daniel 2176: *
2177: * parse an attribute
2178: *
2179: * [41] Attribute ::= Name Eq AttValue
2180: *
2181: * [25] Eq ::= S? '=' S?
2182: *
2183: * With namespace:
2184: *
2185: * [NS 11] Attribute ::= QName Eq AttValue
2186: *
2187: * Also the case QName == xmlns:??? is handled independently as a namespace
2188: * definition.
2189: *
2190: * Returns the attribute name, and the value in *value.
2191: */
2192:
1.14 daniel 2193: xmlChar *
2194: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1.31 daniel 2195: xmlChar *name, *val = NULL;
1.1 daniel 2196:
2197: *value = NULL;
2198: name = htmlParseName(ctxt);
2199: if (name == NULL) {
2200: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2201: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2202: ctxt->wellFormed = 0;
2203: return(NULL);
2204: }
2205:
2206: /*
2207: * read the value
2208: */
2209: SKIP_BLANKS;
2210: if (CUR == '=') {
2211: NEXT;
2212: SKIP_BLANKS;
2213: val = htmlParseAttValue(ctxt);
1.42 daniel 2214: /******
1.1 daniel 2215: } else {
1.42 daniel 2216: * TODO : some attribute must have values, some may not
1.1 daniel 2217: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.31 daniel 2218: ctxt->sax->warning(ctxt->userData,
1.42 daniel 2219: "No value for attribute %s\n", name); */
1.1 daniel 2220: }
2221:
2222: *value = val;
2223: return(name);
2224: }
2225:
2226: /**
2227: * htmlParseStartTag:
2228: * @ctxt: an HTML parser context
2229: *
2230: * parse a start of tag either for rule element or
2231: * EmptyElement. In both case we don't parse the tag closing chars.
2232: *
2233: * [40] STag ::= '<' Name (S Attribute)* S? '>'
2234: *
2235: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2236: *
2237: * With namespace:
2238: *
2239: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2240: *
2241: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2242: *
2243: */
2244:
1.18 daniel 2245: void
1.1 daniel 2246: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2247: xmlChar *name;
2248: xmlChar *attname;
2249: xmlChar *attvalue;
2250: const xmlChar **atts = NULL;
1.1 daniel 2251: int nbatts = 0;
2252: int maxatts = 0;
2253: int i;
2254:
1.18 daniel 2255: if (CUR != '<') return;
1.1 daniel 2256: NEXT;
2257:
1.19 daniel 2258: GROW;
1.1 daniel 2259: name = htmlParseHTMLName(ctxt);
2260: if (name == NULL) {
2261: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2262: ctxt->sax->error(ctxt->userData,
2263: "htmlParseStartTag: invalid element name\n");
2264: ctxt->wellFormed = 0;
1.18 daniel 2265: return;
1.1 daniel 2266: }
2267:
2268: /*
2269: * Check for auto-closure of HTML elements.
2270: */
2271: htmlAutoClose(ctxt, name);
1.43 ! daniel 2272:
! 2273: /*
! 2274: * Check for implied HTML elements.
! 2275: */
! 2276: htmlCheckImplied(ctxt, name);
1.1 daniel 2277:
2278: /*
2279: * Now parse the attributes, it ends up with the ending
2280: *
2281: * (S Attribute)* S?
2282: */
2283: SKIP_BLANKS;
2284: while ((IS_CHAR(CUR)) &&
2285: (CUR != '>') &&
2286: ((CUR != '/') || (NXT(1) != '>'))) {
1.26 daniel 2287: long cons = ctxt->nbChars;
1.1 daniel 2288:
1.19 daniel 2289: GROW;
1.1 daniel 2290: attname = htmlParseAttribute(ctxt, &attvalue);
1.31 daniel 2291: if (attname != NULL) {
1.1 daniel 2292: /*
2293: * Well formedness requires at most one declaration of an attribute
2294: */
2295: for (i = 0; i < nbatts;i += 2) {
2296: if (!xmlStrcmp(atts[i], attname)) {
2297: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.19 daniel 2298: ctxt->sax->error(ctxt->userData,
2299: "Attribute %s redefined\n",
2300: attname);
1.1 daniel 2301: ctxt->wellFormed = 0;
1.11 daniel 2302: xmlFree(attname);
1.31 daniel 2303: if (attvalue != NULL)
2304: xmlFree(attvalue);
1.19 daniel 2305: goto failed;
1.1 daniel 2306: }
2307: }
2308:
2309: /*
2310: * Add the pair to atts
2311: */
2312: if (atts == NULL) {
2313: maxatts = 10;
1.14 daniel 2314: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
1.1 daniel 2315: if (atts == NULL) {
2316: fprintf(stderr, "malloc of %ld byte failed\n",
1.14 daniel 2317: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2318: if (name != NULL) xmlFree(name);
2319: return;
1.1 daniel 2320: }
1.23 daniel 2321: } else if (nbatts + 4 > maxatts) {
1.1 daniel 2322: maxatts *= 2;
1.14 daniel 2323: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
1.1 daniel 2324: if (atts == NULL) {
2325: fprintf(stderr, "realloc of %ld byte failed\n",
1.14 daniel 2326: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2327: if (name != NULL) xmlFree(name);
2328: return;
1.1 daniel 2329: }
2330: }
2331: atts[nbatts++] = attname;
2332: atts[nbatts++] = attvalue;
2333: atts[nbatts] = NULL;
2334: atts[nbatts + 1] = NULL;
2335: }
2336:
1.19 daniel 2337: failed:
1.1 daniel 2338: SKIP_BLANKS;
1.26 daniel 2339: if (cons == ctxt->nbChars) {
1.1 daniel 2340: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2341: ctxt->sax->error(ctxt->userData,
2342: "htmlParseStartTag: problem parsing attributes\n");
2343: ctxt->wellFormed = 0;
2344: break;
2345: }
2346: }
2347:
2348: /*
2349: * SAX: Start of Element !
2350: */
1.15 daniel 2351: htmlnamePush(ctxt, xmlStrdup(name));
1.18 daniel 2352: #ifdef DEBUG
2353: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2354: #endif
1.1 daniel 2355: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2356: ctxt->sax->startElement(ctxt->userData, name, atts);
2357:
2358: if (atts != NULL) {
1.31 daniel 2359: for (i = 0;i < nbatts;i++) {
2360: if (atts[i] != NULL)
2361: xmlFree((xmlChar *) atts[i]);
2362: }
1.11 daniel 2363: xmlFree(atts);
1.1 daniel 2364: }
1.18 daniel 2365: if (name != NULL) xmlFree(name);
1.1 daniel 2366: }
2367:
2368: /**
2369: * htmlParseEndTag:
2370: * @ctxt: an HTML parser context
2371: *
2372: * parse an end of tag
2373: *
2374: * [42] ETag ::= '</' Name S? '>'
2375: *
2376: * With namespace
2377: *
2378: * [NS 9] ETag ::= '</' QName S? '>'
2379: */
2380:
2381: void
1.18 daniel 2382: htmlParseEndTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2383: xmlChar *name;
1.15 daniel 2384: xmlChar *oldname;
1.1 daniel 2385: int i;
2386:
2387: if ((CUR != '<') || (NXT(1) != '/')) {
2388: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2389: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2390: ctxt->wellFormed = 0;
2391: return;
2392: }
2393: SKIP(2);
2394:
2395: name = htmlParseHTMLName(ctxt);
1.24 daniel 2396: if (name == NULL) return;
1.1 daniel 2397:
2398: /*
2399: * We should definitely be at the ending "S? '>'" part
2400: */
2401: SKIP_BLANKS;
2402: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2403: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2404: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2405: ctxt->wellFormed = 0;
2406: } else
2407: NEXT;
2408:
2409: /*
1.18 daniel 2410: * If the name read is not one of the element in the parsing stack
2411: * then return, it's just an error.
1.1 daniel 2412: */
1.18 daniel 2413: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2414: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
1.1 daniel 2415: }
2416: if (i < 0) {
2417: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.18 daniel 2418: ctxt->sax->error(ctxt->userData,
2419: "Unexpected end tag : %s\n", name);
1.11 daniel 2420: xmlFree(name);
1.1 daniel 2421: ctxt->wellFormed = 0;
2422: return;
2423: }
2424:
1.18 daniel 2425:
1.1 daniel 2426: /*
2427: * Check for auto-closure of HTML elements.
2428: */
1.18 daniel 2429:
1.1 daniel 2430: htmlAutoCloseOnClose(ctxt, name);
2431:
2432: /*
2433: * Well formedness constraints, opening and closing must match.
2434: * With the exception that the autoclose may have popped stuff out
2435: * of the stack.
2436: */
1.18 daniel 2437: if (xmlStrcmp(name, ctxt->name)) {
2438: #ifdef DEBUG
2439: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2440: #endif
1.15 daniel 2441: if ((ctxt->name != NULL) &&
2442: (xmlStrcmp(ctxt->name, name))) {
1.1 daniel 2443: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2444: ctxt->sax->error(ctxt->userData,
2445: "Opening and ending tag mismatch: %s and %s\n",
1.15 daniel 2446: name, ctxt->name);
1.1 daniel 2447: ctxt->wellFormed = 0;
2448: }
2449: }
2450:
2451: /*
2452: * SAX: End of Tag
2453: */
1.15 daniel 2454: oldname = ctxt->name;
1.24 daniel 2455: if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
1.18 daniel 2456: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2457: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2458: oldname = htmlnamePop(ctxt);
1.18 daniel 2459: if (oldname != NULL) {
2460: #ifdef DEBUG
2461: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2462: #endif
2463: xmlFree(oldname);
2464: #ifdef DEBUG
2465: } else {
2466: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2467: #endif
2468: }
2469: }
1.1 daniel 2470:
2471: if (name != NULL)
1.11 daniel 2472: xmlFree(name);
1.1 daniel 2473:
2474: return;
2475: }
2476:
2477:
2478: /**
2479: * htmlParseReference:
2480: * @ctxt: an HTML parser context
2481: *
2482: * parse and handle entity references in content,
2483: * this will end-up in a call to character() since this is either a
2484: * CharRef, or a predefined entity.
2485: */
2486: void
2487: htmlParseReference(htmlParserCtxtPtr ctxt) {
2488: htmlEntityDescPtr ent;
1.14 daniel 2489: xmlChar out[2];
2490: xmlChar *name;
1.1 daniel 2491: int val;
2492: if (CUR != '&') return;
2493:
2494: if (NXT(1) == '#') {
2495: val = htmlParseCharRef(ctxt);
1.8 daniel 2496: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2497: out[0] = val;
2498: out[1] = 0;
2499: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2500: ctxt->sax->characters(ctxt->userData, out, 1);
2501: } else {
2502: ent = htmlParseEntityRef(ctxt, &name);
1.32 daniel 2503: if (name == NULL) {
2504: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2505: return;
2506: }
1.1 daniel 2507: if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2508: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 2509: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 2510: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.32 daniel 2511: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
1.1 daniel 2512: }
2513: } else {
1.8 daniel 2514: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2515: out[0] = ent->value;
2516: out[1] = 0;
2517: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2518: ctxt->sax->characters(ctxt->userData, out, 1);
2519: }
1.11 daniel 2520: xmlFree(name);
1.1 daniel 2521: }
2522: }
2523:
2524: /**
2525: * htmlParseContent:
2526: * @ctxt: an HTML parser context
2527: * @name: the node name
2528: *
2529: * Parse a content: comment, sub-element, reference or text.
2530: *
2531: */
2532:
2533: void
1.18 daniel 2534: htmlParseContent(htmlParserCtxtPtr ctxt) {
1.15 daniel 2535: xmlChar *currentNode;
1.18 daniel 2536: int depth;
1.1 daniel 2537:
1.26 daniel 2538: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2539: depth = ctxt->nameNr;
2540: while (1) {
1.26 daniel 2541: long cons = ctxt->nbChars;
1.1 daniel 2542:
1.18 daniel 2543: GROW;
2544: /*
2545: * Our tag or one of it's parent or children is ending.
2546: */
2547: if ((CUR == '<') && (NXT(1) == '/')) {
2548: htmlParseEndTag(ctxt);
1.26 daniel 2549: if (currentNode != NULL) xmlFree(currentNode);
1.18 daniel 2550: return;
2551: }
2552:
2553: /*
2554: * Has this node been popped out during parsing of
2555: * the next element
2556: */
1.26 daniel 2557: if ((xmlStrcmp(currentNode, ctxt->name)) &&
2558: (depth >= ctxt->nameNr)) {
2559: if (currentNode != NULL) xmlFree(currentNode);
2560: return;
2561: }
1.18 daniel 2562:
1.1 daniel 2563: /*
2564: * First case : a comment
2565: */
2566: if ((CUR == '<') && (NXT(1) == '!') &&
2567: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2568: htmlParseComment(ctxt);
1.1 daniel 2569: }
2570:
2571: /*
2572: * Second case : a sub-element.
2573: */
2574: else if (CUR == '<') {
2575: htmlParseElement(ctxt);
2576: }
2577:
2578: /*
2579: * Third case : a reference. If if has not been resolved,
2580: * parsing returns it's Name, create the node
2581: */
2582: else if (CUR == '&') {
2583: htmlParseReference(ctxt);
2584: }
2585:
2586: /*
2587: * Last case, text. Note that References are handled directly.
2588: */
2589: else {
2590: htmlParseCharData(ctxt, 0);
2591: }
2592:
1.26 daniel 2593: if (cons == ctxt->nbChars) {
1.22 daniel 2594: if (ctxt->node != NULL) {
2595: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2596: ctxt->sax->error(ctxt->userData,
2597: "detected an error in element content\n");
2598: ctxt->wellFormed = 0;
2599: }
1.1 daniel 2600: break;
2601: }
1.17 daniel 2602:
1.5 daniel 2603: GROW;
1.1 daniel 2604: }
1.26 daniel 2605: if (currentNode != NULL) xmlFree(currentNode);
1.1 daniel 2606: }
2607:
2608: /**
2609: * htmlParseElement:
2610: * @ctxt: an HTML parser context
2611: *
2612: * parse an HTML element, this is highly recursive
2613: *
2614: * [39] element ::= EmptyElemTag | STag content ETag
2615: *
2616: * [41] Attribute ::= Name Eq AttValue
2617: */
2618:
2619: void
2620: htmlParseElement(htmlParserCtxtPtr ctxt) {
1.14 daniel 2621: const xmlChar *openTag = CUR_PTR;
2622: xmlChar *name;
1.16 daniel 2623: xmlChar *currentNode = NULL;
1.1 daniel 2624: htmlElemDescPtr info;
1.10 daniel 2625: htmlParserNodeInfo node_info;
1.31 daniel 2626: xmlChar *oldname;
1.18 daniel 2627: int depth = ctxt->nameNr;
1.1 daniel 2628:
2629: /* Capture start position */
1.10 daniel 2630: if (ctxt->record_info) {
2631: node_info.begin_pos = ctxt->input->consumed +
2632: (CUR_PTR - ctxt->input->base);
2633: node_info.begin_line = ctxt->input->line;
2634: }
1.1 daniel 2635:
1.26 daniel 2636: oldname = xmlStrdup(ctxt->name);
1.18 daniel 2637: htmlParseStartTag(ctxt);
2638: name = ctxt->name;
1.19 daniel 2639: #ifdef DEBUG
2640: if (oldname == NULL)
2641: fprintf(stderr, "Start of element %s\n", name);
2642: else if (name == NULL)
2643: fprintf(stderr, "Start of element failed, was %s\n", oldname);
2644: else
2645: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2646: #endif
1.26 daniel 2647: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
1.18 daniel 2648: (name == NULL)) {
1.19 daniel 2649: if (CUR == '>')
2650: NEXT;
1.26 daniel 2651: if (oldname != NULL)
2652: xmlFree(oldname);
1.1 daniel 2653: return;
2654: }
1.26 daniel 2655: if (oldname != NULL)
2656: xmlFree(oldname);
1.1 daniel 2657:
2658: /*
2659: * Lookup the info for that element.
2660: */
2661: info = htmlTagLookup(name);
2662: if (info == NULL) {
2663: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2664: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2665: name);
2666: ctxt->wellFormed = 0;
2667: } else if (info->depr) {
2668: /***************************
2669: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2670: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2671: name);
2672: ***************************/
2673: }
2674:
2675: /*
2676: * Check for an Empty Element labelled the XML/SGML way
2677: */
2678: if ((CUR == '/') && (NXT(1) == '>')) {
2679: SKIP(2);
2680: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2681: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2682: oldname = htmlnamePop(ctxt);
1.18 daniel 2683: #ifdef DEBUG
2684: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2685: #endif
1.17 daniel 2686: if (oldname != NULL)
2687: xmlFree(oldname);
1.1 daniel 2688: return;
2689: }
2690:
1.5 daniel 2691: if (CUR == '>') {
2692: NEXT;
2693: } else {
1.1 daniel 2694: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2695: ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2696: openTag);
2697: ctxt->wellFormed = 0;
2698:
2699: /*
2700: * end of parsing of this node.
2701: */
1.18 daniel 2702: if (!xmlStrcmp(name, ctxt->name)) {
2703: nodePop(ctxt);
1.24 daniel 2704: oldname = htmlnamePop(ctxt);
1.18 daniel 2705: #ifdef DEBUG
2706: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2707: #endif
2708: if (oldname != NULL)
2709: xmlFree(oldname);
2710: }
1.10 daniel 2711:
2712: /*
2713: * Capture end position and add node
2714: */
2715: if ( currentNode != NULL && ctxt->record_info ) {
2716: node_info.end_pos = ctxt->input->consumed +
2717: (CUR_PTR - ctxt->input->base);
2718: node_info.end_line = ctxt->input->line;
1.15 daniel 2719: node_info.node = ctxt->node;
1.10 daniel 2720: xmlParserAddNodeInfo(ctxt, &node_info);
2721: }
1.1 daniel 2722: return;
2723: }
2724:
2725: /*
2726: * Check for an Empty Element from DTD definition
2727: */
2728: if ((info != NULL) && (info->empty)) {
2729: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2730: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2731: oldname = htmlnamePop(ctxt);
1.18 daniel 2732: #ifdef DEBUG
2733: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2734: #endif
1.17 daniel 2735: if (oldname != NULL)
2736: xmlFree(oldname);
1.1 daniel 2737: return;
2738: }
2739:
2740: /*
2741: * Parse the content of the element:
2742: */
1.26 daniel 2743: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2744: depth = ctxt->nameNr;
2745: while (IS_CHAR(CUR)) {
2746: htmlParseContent(ctxt);
2747: if (ctxt->nameNr < depth) break;
2748: }
1.1 daniel 2749:
2750: if (!IS_CHAR(CUR)) {
2751: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2752: ctxt->sax->error(ctxt->userData,
1.18 daniel 2753: "Premature end of data in tag %s\n", currentNode);
1.1 daniel 2754: ctxt->wellFormed = 0;
2755:
2756: /*
2757: * end of parsing of this node.
2758: */
2759: nodePop(ctxt);
1.24 daniel 2760: oldname = htmlnamePop(ctxt);
1.18 daniel 2761: #ifdef DEBUG
2762: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2763: #endif
1.17 daniel 2764: if (oldname != NULL)
2765: xmlFree(oldname);
1.26 daniel 2766: if (currentNode != NULL)
2767: xmlFree(currentNode);
1.1 daniel 2768: return;
2769: }
1.10 daniel 2770:
2771: /*
2772: * Capture end position and add node
2773: */
2774: if ( currentNode != NULL && ctxt->record_info ) {
2775: node_info.end_pos = ctxt->input->consumed +
2776: (CUR_PTR - ctxt->input->base);
2777: node_info.end_line = ctxt->input->line;
1.15 daniel 2778: node_info.node = ctxt->node;
1.10 daniel 2779: xmlParserAddNodeInfo(ctxt, &node_info);
2780: }
1.26 daniel 2781: if (currentNode != NULL)
2782: xmlFree(currentNode);
1.1 daniel 2783: }
2784:
2785: /**
2786: * htmlParseDocument :
2787: * @ctxt: an HTML parser context
2788: *
2789: * parse an HTML document (and build a tree if using the standard SAX
2790: * interface).
2791: *
2792: * Returns 0, -1 in case of error. the parser context is augmented
2793: * as a result of the parsing.
2794: */
2795:
2796: int
2797: htmlParseDocument(htmlParserCtxtPtr ctxt) {
2798: htmlDefaultSAXHandlerInit();
2799: ctxt->html = 1;
2800:
1.5 daniel 2801: GROW;
1.1 daniel 2802: /*
1.9 daniel 2803: * SAX: beginning of the document processing.
1.1 daniel 2804: */
2805: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2806: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2807:
2808: /*
2809: * Wipe out everything which is before the first '<'
2810: */
1.22 daniel 2811: SKIP_BLANKS;
1.1 daniel 2812: if (CUR == 0) {
2813: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2814: ctxt->sax->error(ctxt->userData, "Document is empty\n");
2815: ctxt->wellFormed = 0;
2816: }
2817:
1.40 daniel 2818: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
2819: ctxt->sax->startDocument(ctxt->userData);
2820:
2821:
1.22 daniel 2822: /*
2823: * Parse possible comments before any content
2824: */
2825: while ((CUR == '<') && (NXT(1) == '!') &&
2826: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2827: if (ctxt->myDoc == NULL)
2828: ctxt->myDoc = htmlNewDoc(NULL, NULL);
2829: htmlParseComment(ctxt);
1.22 daniel 2830: SKIP_BLANKS;
2831: }
2832:
1.1 daniel 2833:
2834: /*
2835: * Then possibly doc type declaration(s) and more Misc
2836: * (doctypedecl Misc*)?
2837: */
2838: if ((CUR == '<') && (NXT(1) == '!') &&
2839: (UPP(2) == 'D') && (UPP(3) == 'O') &&
2840: (UPP(4) == 'C') && (UPP(5) == 'T') &&
2841: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2842: (UPP(8) == 'E')) {
2843: htmlParseDocTypeDecl(ctxt);
2844: }
2845: SKIP_BLANKS;
2846:
2847: /*
2848: * Create the document if not done already.
2849: */
2850: if (ctxt->myDoc == NULL) {
2851: ctxt->myDoc = htmlNewDoc(NULL, NULL);
2852: }
2853:
2854: /*
2855: * Time to start parsing the tree itself
2856: */
1.22 daniel 2857: htmlParseContent(ctxt);
1.1 daniel 2858:
2859: /*
2860: * SAX: end of the document processing.
2861: */
2862: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2863: ctxt->sax->endDocument(ctxt->userData);
2864: if (! ctxt->wellFormed) return(-1);
2865: return(0);
2866: }
2867:
2868:
1.30 daniel 2869: /************************************************************************
2870: * *
2871: * Parser contexts handling *
2872: * *
2873: ************************************************************************/
1.1 daniel 2874:
2875: /**
2876: * xmlInitParserCtxt:
2877: * @ctxt: an HTML parser context
2878: *
2879: * Initialize a parser context
2880: */
2881:
2882: void
2883: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2884: {
2885: htmlSAXHandler *sax;
2886:
1.21 daniel 2887: if (ctxt == NULL) return;
2888: memset(ctxt, 0, sizeof(htmlParserCtxt));
2889:
1.11 daniel 2890: sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
1.1 daniel 2891: if (sax == NULL) {
2892: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2893: }
1.19 daniel 2894: memset(sax, 0, sizeof(htmlSAXHandler));
1.1 daniel 2895:
2896: /* Allocate the Input stack */
1.19 daniel 2897: ctxt->inputTab = (htmlParserInputPtr *)
2898: xmlMalloc(5 * sizeof(htmlParserInputPtr));
2899: if (ctxt->inputTab == NULL) {
2900: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2901: }
1.1 daniel 2902: ctxt->inputNr = 0;
2903: ctxt->inputMax = 5;
2904: ctxt->input = NULL;
2905: ctxt->version = NULL;
2906: ctxt->encoding = NULL;
2907: ctxt->standalone = -1;
1.30 daniel 2908: ctxt->instate = XML_PARSER_START;
1.1 daniel 2909:
2910: /* Allocate the Node stack */
1.11 daniel 2911: ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
1.1 daniel 2912: ctxt->nodeNr = 0;
2913: ctxt->nodeMax = 10;
2914: ctxt->node = NULL;
2915:
1.15 daniel 2916: /* Allocate the Name stack */
2917: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2918: ctxt->nameNr = 0;
2919: ctxt->nameMax = 10;
2920: ctxt->name = NULL;
2921:
1.1 daniel 2922: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2923: else {
2924: ctxt->sax = sax;
2925: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2926: }
2927: ctxt->userData = ctxt;
2928: ctxt->myDoc = NULL;
2929: ctxt->wellFormed = 1;
2930: ctxt->replaceEntities = 0;
2931: ctxt->html = 1;
2932: ctxt->record_info = 0;
1.21 daniel 2933: ctxt->validate = 0;
1.26 daniel 2934: ctxt->nbChars = 0;
1.30 daniel 2935: ctxt->checkIndex = 0;
1.1 daniel 2936: xmlInitNodeInfoSeq(&ctxt->node_seq);
2937: }
2938:
2939: /**
2940: * htmlFreeParserCtxt:
2941: * @ctxt: an HTML parser context
2942: *
2943: * Free all the memory used by a parser context. However the parsed
2944: * document in ctxt->myDoc is not freed.
2945: */
2946:
2947: void
2948: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2949: {
2950: htmlParserInputPtr input;
1.15 daniel 2951: xmlChar *oldname;
1.1 daniel 2952:
2953: if (ctxt == NULL) return;
2954:
2955: while ((input = inputPop(ctxt)) != NULL) {
2956: xmlFreeInputStream(input);
2957: }
2958:
1.11 daniel 2959: if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
1.24 daniel 2960: while ((oldname = htmlnamePop(ctxt)) != NULL) {
2961: xmlFree(oldname);
1.15 daniel 2962: }
2963: if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
1.31 daniel 2964: if (ctxt->directory != NULL) xmlFree(ctxt->directory);
1.11 daniel 2965: if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2966: if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
1.1 daniel 2967: if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
1.11 daniel 2968: xmlFree(ctxt->sax);
2969: xmlFree(ctxt);
1.1 daniel 2970: }
2971:
2972: /**
2973: * htmlCreateDocParserCtxt :
1.14 daniel 2974: * @cur: a pointer to an array of xmlChar
1.1 daniel 2975: * @encoding: a free form C string describing the HTML document encoding, or NULL
2976: *
2977: * Create a parser context for an HTML document.
2978: *
2979: * Returns the new parser context or NULL
2980: */
2981: htmlParserCtxtPtr
1.14 daniel 2982: htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
1.1 daniel 2983: htmlParserCtxtPtr ctxt;
2984: htmlParserInputPtr input;
2985: /* htmlCharEncoding enc; */
2986:
1.11 daniel 2987: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 2988: if (ctxt == NULL) {
2989: perror("malloc");
2990: return(NULL);
2991: }
2992: htmlInitParserCtxt(ctxt);
1.11 daniel 2993: input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 2994: if (input == NULL) {
2995: perror("malloc");
1.11 daniel 2996: xmlFree(ctxt);
1.1 daniel 2997: return(NULL);
2998: }
1.19 daniel 2999: memset(input, 0, sizeof(htmlParserInput));
1.1 daniel 3000:
3001: input->line = 1;
3002: input->col = 1;
3003: input->base = cur;
3004: input->cur = cur;
3005:
3006: inputPush(ctxt, input);
3007: return(ctxt);
3008: }
3009:
1.31 daniel 3010: /************************************************************************
3011: * *
3012: * Progressive parsing interfaces *
3013: * *
3014: ************************************************************************/
3015:
3016: /**
3017: * htmlParseLookupSequence:
3018: * @ctxt: an HTML parser context
3019: * @first: the first char to lookup
3020: * @next: the next char to lookup or zero
3021: * @third: the next char to lookup or zero
3022: *
3023: * Try to find if a sequence (first, next, third) or just (first next) or
3024: * (first) is available in the input stream.
3025: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3026: * to avoid rescanning sequences of bytes, it DOES change the state of the
3027: * parser, do not use liberally.
3028: * This is basically similar to xmlParseLookupSequence()
3029: *
3030: * Returns the index to the current parsing point if the full sequence
3031: * is available, -1 otherwise.
3032: */
3033: int
3034: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3035: xmlChar next, xmlChar third) {
3036: int base, len;
3037: htmlParserInputPtr in;
3038: const xmlChar *buf;
3039:
3040: in = ctxt->input;
3041: if (in == NULL) return(-1);
3042: base = in->cur - in->base;
3043: if (base < 0) return(-1);
3044: if (ctxt->checkIndex > base)
3045: base = ctxt->checkIndex;
3046: if (in->buf == NULL) {
3047: buf = in->base;
3048: len = in->length;
3049: } else {
3050: buf = in->buf->buffer->content;
3051: len = in->buf->buffer->use;
3052: }
3053: /* take into account the sequence length */
3054: if (third) len -= 2;
3055: else if (next) len --;
3056: for (;base < len;base++) {
3057: if (buf[base] == first) {
3058: if (third != 0) {
3059: if ((buf[base + 1] != next) ||
3060: (buf[base + 2] != third)) continue;
3061: } else if (next != 0) {
3062: if (buf[base + 1] != next) continue;
3063: }
3064: ctxt->checkIndex = 0;
3065: #ifdef DEBUG_PUSH
3066: if (next == 0)
3067: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3068: first, base);
3069: else if (third == 0)
3070: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3071: first, next, base);
3072: else
3073: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3074: first, next, third, base);
3075: #endif
3076: return(base - (in->cur - in->base));
3077: }
3078: }
3079: ctxt->checkIndex = base;
3080: #ifdef DEBUG_PUSH
3081: if (next == 0)
3082: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3083: else if (third == 0)
3084: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3085: else
3086: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3087: #endif
3088: return(-1);
3089: }
3090:
3091: /**
1.32 daniel 3092: * htmlParseTryOrFinish:
1.31 daniel 3093: * @ctxt: an HTML parser context
1.32 daniel 3094: * @terminate: last chunk indicator
1.31 daniel 3095: *
3096: * Try to progress on parsing
3097: *
3098: * Returns zero if no parsing was possible
3099: */
3100: int
1.32 daniel 3101: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
1.31 daniel 3102: int ret = 0;
3103: htmlParserInputPtr in;
3104: int avail;
3105: xmlChar cur, next;
3106:
3107: #ifdef DEBUG_PUSH
3108: switch (ctxt->instate) {
3109: case XML_PARSER_EOF:
3110: fprintf(stderr, "HPP: try EOF\n"); break;
3111: case XML_PARSER_START:
3112: fprintf(stderr, "HPP: try START\n"); break;
3113: case XML_PARSER_MISC:
3114: fprintf(stderr, "HPP: try MISC\n");break;
3115: case XML_PARSER_COMMENT:
3116: fprintf(stderr, "HPP: try COMMENT\n");break;
3117: case XML_PARSER_PROLOG:
3118: fprintf(stderr, "HPP: try PROLOG\n");break;
3119: case XML_PARSER_START_TAG:
3120: fprintf(stderr, "HPP: try START_TAG\n");break;
3121: case XML_PARSER_CONTENT:
3122: fprintf(stderr, "HPP: try CONTENT\n");break;
3123: case XML_PARSER_CDATA_SECTION:
3124: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3125: case XML_PARSER_END_TAG:
3126: fprintf(stderr, "HPP: try END_TAG\n");break;
3127: case XML_PARSER_ENTITY_DECL:
3128: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3129: case XML_PARSER_ENTITY_VALUE:
3130: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3131: case XML_PARSER_ATTRIBUTE_VALUE:
3132: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3133: case XML_PARSER_DTD:
3134: fprintf(stderr, "HPP: try DTD\n");break;
3135: case XML_PARSER_EPILOG:
3136: fprintf(stderr, "HPP: try EPILOG\n");break;
3137: case XML_PARSER_PI:
3138: fprintf(stderr, "HPP: try PI\n");break;
3139: }
3140: #endif
3141:
3142: while (1) {
3143:
3144: in = ctxt->input;
3145: if (in == NULL) break;
3146: if (in->buf == NULL)
3147: avail = in->length - (in->cur - in->base);
3148: else
3149: avail = in->buf->buffer->use - (in->cur - in->base);
3150: if (avail < 1)
3151: goto done;
3152: switch (ctxt->instate) {
3153: case XML_PARSER_EOF:
3154: /*
3155: * Document parsing is done !
3156: */
3157: goto done;
3158: case XML_PARSER_START:
3159: /*
3160: * Very first chars read from the document flow.
3161: */
3162: cur = in->cur[0];
3163: if (IS_BLANK(cur)) {
3164: SKIP_BLANKS;
3165: if (in->buf == NULL)
3166: avail = in->length - (in->cur - in->base);
3167: else
3168: avail = in->buf->buffer->use - (in->cur - in->base);
3169: }
3170: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3171: ctxt->sax->setDocumentLocator(ctxt->userData,
3172: &xmlDefaultSAXLocator);
3173: cur = in->cur[0];
3174: next = in->cur[1];
3175: if ((cur == '<') && (next == '!') &&
3176: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3177: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3178: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3179: (UPP(8) == 'E')) {
1.32 daniel 3180: if ((!terminate) &&
3181: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3182: goto done;
3183: #ifdef DEBUG_PUSH
3184: fprintf(stderr, "HPP: Parsing internal subset\n");
3185: #endif
3186: htmlParseDocTypeDecl(ctxt);
3187: ctxt->instate = XML_PARSER_PROLOG;
3188: #ifdef DEBUG_PUSH
3189: fprintf(stderr, "HPP: entering PROLOG\n");
3190: #endif
3191: } else {
3192: ctxt->myDoc = htmlNewDoc(NULL, NULL);
3193: ctxt->instate = XML_PARSER_MISC;
3194: }
3195: #ifdef DEBUG_PUSH
3196: fprintf(stderr, "HPP: entering MISC\n");
3197: #endif
3198: break;
3199: case XML_PARSER_MISC:
3200: SKIP_BLANKS;
3201: if (in->buf == NULL)
3202: avail = in->length - (in->cur - in->base);
3203: else
3204: avail = in->buf->buffer->use - (in->cur - in->base);
3205: if (avail < 2)
3206: goto done;
3207: cur = in->cur[0];
3208: next = in->cur[1];
3209: if ((cur == '<') && (next == '!') &&
3210: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3211: if ((!terminate) &&
3212: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3213: goto done;
3214: #ifdef DEBUG_PUSH
3215: fprintf(stderr, "HPP: Parsing Comment\n");
3216: #endif
3217: htmlParseComment(ctxt);
3218: ctxt->instate = XML_PARSER_MISC;
3219: } else if ((cur == '<') && (next == '!') &&
3220: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3221: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3222: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3223: (UPP(8) == 'E')) {
1.32 daniel 3224: if ((!terminate) &&
3225: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3226: goto done;
3227: #ifdef DEBUG_PUSH
3228: fprintf(stderr, "HPP: Parsing internal subset\n");
3229: #endif
3230: htmlParseDocTypeDecl(ctxt);
3231: ctxt->instate = XML_PARSER_PROLOG;
3232: #ifdef DEBUG_PUSH
3233: fprintf(stderr, "HPP: entering PROLOG\n");
3234: #endif
3235: } else if ((cur == '<') && (next == '!') &&
3236: (avail < 9)) {
3237: goto done;
3238: } else {
3239: ctxt->instate = XML_PARSER_START_TAG;
3240: #ifdef DEBUG_PUSH
3241: fprintf(stderr, "HPP: entering START_TAG\n");
3242: #endif
3243: }
3244: break;
3245: case XML_PARSER_PROLOG:
3246: SKIP_BLANKS;
3247: if (in->buf == NULL)
3248: avail = in->length - (in->cur - in->base);
3249: else
3250: avail = in->buf->buffer->use - (in->cur - in->base);
3251: if (avail < 2)
3252: goto done;
3253: cur = in->cur[0];
3254: next = in->cur[1];
3255: if ((cur == '<') && (next == '!') &&
3256: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3257: if ((!terminate) &&
3258: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3259: goto done;
3260: #ifdef DEBUG_PUSH
3261: fprintf(stderr, "HPP: Parsing Comment\n");
3262: #endif
3263: htmlParseComment(ctxt);
3264: ctxt->instate = XML_PARSER_PROLOG;
3265: } else if ((cur == '<') && (next == '!') &&
3266: (avail < 4)) {
3267: goto done;
3268: } else {
3269: ctxt->instate = XML_PARSER_START_TAG;
3270: #ifdef DEBUG_PUSH
3271: fprintf(stderr, "HPP: entering START_TAG\n");
3272: #endif
3273: }
3274: break;
3275: case XML_PARSER_EPILOG:
3276: SKIP_BLANKS;
3277: if (in->buf == NULL)
3278: avail = in->length - (in->cur - in->base);
3279: else
3280: avail = in->buf->buffer->use - (in->cur - in->base);
3281: if (avail < 2)
3282: goto done;
3283: cur = in->cur[0];
3284: next = in->cur[1];
3285: if ((cur == '<') && (next == '!') &&
3286: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3287: if ((!terminate) &&
3288: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3289: goto done;
3290: #ifdef DEBUG_PUSH
3291: fprintf(stderr, "HPP: Parsing Comment\n");
3292: #endif
3293: htmlParseComment(ctxt);
3294: ctxt->instate = XML_PARSER_EPILOG;
3295: } else if ((cur == '<') && (next == '!') &&
3296: (avail < 4)) {
3297: goto done;
3298: } else {
3299: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3300: ctxt->sax->error(ctxt->userData,
3301: "Extra content at the end of the document\n");
3302: ctxt->wellFormed = 0;
3303: ctxt->errNo = XML_ERR_DOCUMENT_END;
3304: ctxt->instate = XML_PARSER_EOF;
3305: #ifdef DEBUG_PUSH
3306: fprintf(stderr, "HPP: entering EOF\n");
3307: #endif
3308: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3309: ctxt->sax->endDocument(ctxt->userData);
3310: goto done;
3311: }
3312: break;
3313: case XML_PARSER_START_TAG: {
3314: xmlChar *name, *oldname;
3315: int depth = ctxt->nameNr;
3316: htmlElemDescPtr info;
3317:
3318: if (avail < 2)
3319: goto done;
3320: cur = in->cur[0];
3321: if (cur != '<') {
3322: ctxt->instate = XML_PARSER_CONTENT;
3323: #ifdef DEBUG_PUSH
3324: fprintf(stderr, "HPP: entering CONTENT\n");
3325: #endif
3326: break;
3327: }
1.32 daniel 3328: if ((!terminate) &&
3329: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3330: goto done;
3331:
3332: oldname = xmlStrdup(ctxt->name);
3333: htmlParseStartTag(ctxt);
3334: name = ctxt->name;
3335: #ifdef DEBUG
3336: if (oldname == NULL)
3337: fprintf(stderr, "Start of element %s\n", name);
3338: else if (name == NULL)
3339: fprintf(stderr, "Start of element failed, was %s\n",
3340: oldname);
3341: else
3342: fprintf(stderr, "Start of element %s, was %s\n",
3343: name, oldname);
3344: #endif
3345: if (((depth == ctxt->nameNr) &&
3346: (!xmlStrcmp(oldname, ctxt->name))) ||
3347: (name == NULL)) {
3348: if (CUR == '>')
3349: NEXT;
3350: if (oldname != NULL)
3351: xmlFree(oldname);
3352: break;
3353: }
3354: if (oldname != NULL)
3355: xmlFree(oldname);
3356:
3357: /*
3358: * Lookup the info for that element.
3359: */
3360: info = htmlTagLookup(name);
3361: if (info == NULL) {
3362: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3363: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3364: name);
3365: ctxt->wellFormed = 0;
3366: } else if (info->depr) {
3367: /***************************
3368: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3369: ctxt->sax->warning(ctxt->userData,
3370: "Tag %s is deprecated\n",
3371: name);
3372: ***************************/
3373: }
3374:
3375: /*
3376: * Check for an Empty Element labelled the XML/SGML way
3377: */
3378: if ((CUR == '/') && (NXT(1) == '>')) {
3379: SKIP(2);
3380: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3381: ctxt->sax->endElement(ctxt->userData, name);
3382: oldname = htmlnamePop(ctxt);
3383: #ifdef DEBUG
3384: fprintf(stderr,"End of tag the XML way: popping out %s\n",
3385: oldname);
3386: #endif
3387: if (oldname != NULL)
3388: xmlFree(oldname);
3389: ctxt->instate = XML_PARSER_CONTENT;
3390: #ifdef DEBUG_PUSH
3391: fprintf(stderr, "HPP: entering CONTENT\n");
3392: #endif
3393: break;
3394: }
3395:
3396: if (CUR == '>') {
3397: NEXT;
3398: } else {
3399: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3400: ctxt->sax->error(ctxt->userData,
3401: "Couldn't find end of Start Tag %s\n",
3402: name);
3403: ctxt->wellFormed = 0;
3404:
3405: /*
3406: * end of parsing of this node.
3407: */
3408: if (!xmlStrcmp(name, ctxt->name)) {
3409: nodePop(ctxt);
3410: oldname = htmlnamePop(ctxt);
3411: #ifdef DEBUG
3412: fprintf(stderr,
3413: "End of start tag problem: popping out %s\n", oldname);
3414: #endif
3415: if (oldname != NULL)
3416: xmlFree(oldname);
3417: }
3418:
3419: ctxt->instate = XML_PARSER_CONTENT;
3420: #ifdef DEBUG_PUSH
3421: fprintf(stderr, "HPP: entering CONTENT\n");
3422: #endif
3423: break;
3424: }
3425:
3426: /*
3427: * Check for an Empty Element from DTD definition
3428: */
3429: if ((info != NULL) && (info->empty)) {
3430: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3431: ctxt->sax->endElement(ctxt->userData, name);
3432: oldname = htmlnamePop(ctxt);
3433: #ifdef DEBUG
3434: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3435: #endif
3436: if (oldname != NULL)
3437: xmlFree(oldname);
3438: }
3439: ctxt->instate = XML_PARSER_CONTENT;
3440: #ifdef DEBUG_PUSH
3441: fprintf(stderr, "HPP: entering CONTENT\n");
3442: #endif
3443: break;
3444: }
3445: case XML_PARSER_CONTENT:
3446: /*
3447: * Handle preparsed entities and charRef
3448: */
3449: if (ctxt->token != 0) {
3450: xmlChar cur[2] = { 0 , 0 } ;
3451:
3452: cur[0] = (xmlChar) ctxt->token;
3453: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3454: ctxt->sax->characters(ctxt->userData, cur, 1);
3455: ctxt->token = 0;
3456: ctxt->checkIndex = 0;
3457: }
3458: if (avail < 2)
3459: goto done;
3460: cur = in->cur[0];
3461: next = in->cur[1];
3462: if ((cur == '<') && (next == '!') &&
3463: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3464: if ((!terminate) &&
3465: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3466: goto done;
3467: #ifdef DEBUG_PUSH
3468: fprintf(stderr, "HPP: Parsing Comment\n");
3469: #endif
3470: htmlParseComment(ctxt);
3471: ctxt->instate = XML_PARSER_CONTENT;
3472: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3473: goto done;
3474: } else if ((cur == '<') && (next == '/')) {
3475: ctxt->instate = XML_PARSER_END_TAG;
3476: ctxt->checkIndex = 0;
3477: #ifdef DEBUG_PUSH
3478: fprintf(stderr, "HPP: entering END_TAG\n");
3479: #endif
3480: break;
3481: } else if (cur == '<') {
3482: ctxt->instate = XML_PARSER_START_TAG;
3483: ctxt->checkIndex = 0;
3484: #ifdef DEBUG_PUSH
3485: fprintf(stderr, "HPP: entering START_TAG\n");
3486: #endif
3487: break;
3488: } else if (cur == '&') {
1.32 daniel 3489: if ((!terminate) &&
3490: (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
1.31 daniel 3491: goto done;
3492: #ifdef DEBUG_PUSH
3493: fprintf(stderr, "HPP: Parsing Reference\n");
3494: #endif
3495: /* TODO: check generation of subtrees if noent !!! */
3496: htmlParseReference(ctxt);
3497: } else {
3498: /* TODO Avoid the extra copy, handle directly !!!!!! */
3499: /*
3500: * Goal of the following test is :
3501: * - minimize calls to the SAX 'character' callback
3502: * when they are mergeable
3503: */
3504: if ((ctxt->inputNr == 1) &&
3505: (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
1.32 daniel 3506: if ((!terminate) &&
3507: (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
1.31 daniel 3508: goto done;
3509: }
3510: ctxt->checkIndex = 0;
3511: #ifdef DEBUG_PUSH
3512: fprintf(stderr, "HPP: Parsing char data\n");
3513: #endif
3514: htmlParseCharData(ctxt, 0);
3515: }
3516: break;
3517: case XML_PARSER_END_TAG:
3518: if (avail < 2)
3519: goto done;
1.32 daniel 3520: if ((!terminate) &&
3521: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3522: goto done;
3523: htmlParseEndTag(ctxt);
3524: if (ctxt->nameNr == 0) {
3525: ctxt->instate = XML_PARSER_EPILOG;
3526: } else {
3527: ctxt->instate = XML_PARSER_CONTENT;
3528: }
3529: ctxt->checkIndex = 0;
3530: #ifdef DEBUG_PUSH
3531: fprintf(stderr, "HPP: entering CONTENT\n");
3532: #endif
3533: break;
3534: case XML_PARSER_CDATA_SECTION:
3535: fprintf(stderr, "HPP: internal error, state == CDATA\n");
3536: ctxt->instate = XML_PARSER_CONTENT;
3537: ctxt->checkIndex = 0;
3538: #ifdef DEBUG_PUSH
3539: fprintf(stderr, "HPP: entering CONTENT\n");
3540: #endif
3541: break;
3542: case XML_PARSER_DTD:
3543: fprintf(stderr, "HPP: internal error, state == DTD\n");
3544: ctxt->instate = XML_PARSER_CONTENT;
3545: ctxt->checkIndex = 0;
3546: #ifdef DEBUG_PUSH
3547: fprintf(stderr, "HPP: entering CONTENT\n");
3548: #endif
3549: break;
3550: case XML_PARSER_COMMENT:
3551: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3552: ctxt->instate = XML_PARSER_CONTENT;
3553: ctxt->checkIndex = 0;
3554: #ifdef DEBUG_PUSH
3555: fprintf(stderr, "HPP: entering CONTENT\n");
3556: #endif
3557: break;
3558: case XML_PARSER_PI:
3559: fprintf(stderr, "HPP: internal error, state == PI\n");
3560: ctxt->instate = XML_PARSER_CONTENT;
3561: ctxt->checkIndex = 0;
3562: #ifdef DEBUG_PUSH
3563: fprintf(stderr, "HPP: entering CONTENT\n");
3564: #endif
3565: break;
3566: case XML_PARSER_ENTITY_DECL:
3567: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3568: ctxt->instate = XML_PARSER_CONTENT;
3569: ctxt->checkIndex = 0;
3570: #ifdef DEBUG_PUSH
3571: fprintf(stderr, "HPP: entering CONTENT\n");
3572: #endif
3573: break;
3574: case XML_PARSER_ENTITY_VALUE:
3575: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3576: ctxt->instate = XML_PARSER_CONTENT;
3577: ctxt->checkIndex = 0;
3578: #ifdef DEBUG_PUSH
3579: fprintf(stderr, "HPP: entering DTD\n");
3580: #endif
3581: break;
3582: case XML_PARSER_ATTRIBUTE_VALUE:
3583: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3584: ctxt->instate = XML_PARSER_START_TAG;
3585: ctxt->checkIndex = 0;
3586: #ifdef DEBUG_PUSH
3587: fprintf(stderr, "HPP: entering START_TAG\n");
3588: #endif
3589: break;
3590: }
3591: }
3592: done:
3593: #ifdef DEBUG_PUSH
3594: fprintf(stderr, "HPP: done %d\n", ret);
3595: #endif
3596: return(ret);
3597: }
3598:
3599: /**
1.32 daniel 3600: * htmlParseTry:
3601: * @ctxt: an HTML parser context
3602: *
3603: * Try to progress on parsing
3604: *
3605: * Returns zero if no parsing was possible
3606: */
3607: int
3608: htmlParseTry(htmlParserCtxtPtr ctxt) {
3609: return(htmlParseTryOrFinish(ctxt, 0));
3610: }
3611:
3612: /**
1.31 daniel 3613: * htmlParseChunk:
3614: * @ctxt: an XML parser context
3615: * @chunk: an char array
3616: * @size: the size in byte of the chunk
3617: * @terminate: last chunk indicator
3618: *
3619: * Parse a Chunk of memory
3620: *
3621: * Returns zero if no error, the xmlParserErrors otherwise.
3622: */
3623: int
3624: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3625: int terminate) {
3626: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3627: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3628: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3629: int cur = ctxt->input->cur - ctxt->input->base;
3630:
3631: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3632: ctxt->input->base = ctxt->input->buf->buffer->content + base;
3633: ctxt->input->cur = ctxt->input->base + cur;
3634: #ifdef DEBUG_PUSH
3635: fprintf(stderr, "HPP: pushed %d\n", size);
3636: #endif
3637:
1.34 daniel 3638: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
3639: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3640: } else if (ctxt->instate != XML_PARSER_EOF)
1.32 daniel 3641: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3642: if (terminate) {
3643: if ((ctxt->instate != XML_PARSER_EOF) &&
3644: (ctxt->instate != XML_PARSER_EPILOG) &&
3645: (ctxt->instate != XML_PARSER_MISC)) {
3646: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3647: ctxt->sax->error(ctxt->userData,
3648: "Extra content at the end of the document\n");
3649: ctxt->wellFormed = 0;
3650: ctxt->errNo = XML_ERR_DOCUMENT_END;
3651: }
3652: if (ctxt->instate != XML_PARSER_EOF) {
3653: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3654: ctxt->sax->endDocument(ctxt->userData);
3655: }
3656: ctxt->instate = XML_PARSER_EOF;
3657: }
3658: return((xmlParserErrors) ctxt->errNo);
3659: }
3660:
3661: /************************************************************************
3662: * *
3663: * User entry points *
3664: * *
3665: ************************************************************************/
3666:
3667: /**
3668: * htmlCreatePushParserCtxt :
3669: * @sax: a SAX handler
3670: * @user_data: The user data returned on SAX callbacks
3671: * @chunk: a pointer to an array of chars
3672: * @size: number of chars in the array
3673: * @filename: an optional file name or URI
3674: * @enc: an optional encoding
3675: *
3676: * Create a parser context for using the HTML parser in push mode
3677: * To allow content encoding detection, @size should be >= 4
3678: * The value of @filename is used for fetching external entities
3679: * and error/warning reports.
3680: *
3681: * Returns the new parser context or NULL
3682: */
3683: htmlParserCtxtPtr
3684: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
3685: const char *chunk, int size, const char *filename,
3686: xmlCharEncoding enc) {
3687: htmlParserCtxtPtr ctxt;
3688: htmlParserInputPtr inputStream;
3689: xmlParserInputBufferPtr buf;
3690:
3691: buf = xmlAllocParserInputBuffer(enc);
3692: if (buf == NULL) return(NULL);
3693:
3694: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3695: if (ctxt == NULL) {
3696: xmlFree(buf);
3697: return(NULL);
3698: }
3699: memset(ctxt, 0, sizeof(htmlParserCtxt));
3700: htmlInitParserCtxt(ctxt);
3701: if (sax != NULL) {
3702: if (ctxt->sax != &htmlDefaultSAXHandler)
3703: xmlFree(ctxt->sax);
3704: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
3705: if (ctxt->sax == NULL) {
3706: xmlFree(buf);
3707: xmlFree(ctxt);
3708: return(NULL);
3709: }
3710: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
3711: if (user_data != NULL)
3712: ctxt->userData = user_data;
3713: }
3714: if (filename == NULL) {
3715: ctxt->directory = NULL;
3716: } else {
3717: ctxt->directory = xmlParserGetDirectory(filename);
3718: }
3719:
3720: inputStream = htmlNewInputStream(ctxt);
3721: if (inputStream == NULL) {
3722: xmlFreeParserCtxt(ctxt);
3723: return(NULL);
3724: }
3725:
3726: if (filename == NULL)
3727: inputStream->filename = NULL;
3728: else
3729: inputStream->filename = xmlMemStrdup(filename);
3730: inputStream->buf = buf;
3731: inputStream->base = inputStream->buf->buffer->content;
3732: inputStream->cur = inputStream->buf->buffer->content;
3733:
3734: inputPush(ctxt, inputStream);
3735:
3736: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3737: (ctxt->input->buf != NULL)) {
3738: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3739: #ifdef DEBUG_PUSH
3740: fprintf(stderr, "HPP: pushed %d\n", size);
3741: #endif
3742: }
3743:
3744: return(ctxt);
3745: }
1.1 daniel 3746:
3747: /**
3748: * htmlSAXParseDoc :
1.14 daniel 3749: * @cur: a pointer to an array of xmlChar
1.1 daniel 3750: * @encoding: a free form C string describing the HTML document encoding, or NULL
3751: * @sax: the SAX handler block
3752: * @userData: if using SAX, this pointer will be provided on callbacks.
3753: *
3754: * parse an HTML in-memory document and build a tree.
3755: * It use the given SAX function block to handle the parsing callback.
3756: * If sax is NULL, fallback to the default DOM tree building routines.
3757: *
3758: * Returns the resulting document tree
3759: */
3760:
3761: htmlDocPtr
1.14 daniel 3762: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
1.1 daniel 3763: htmlDocPtr ret;
3764: htmlParserCtxtPtr ctxt;
3765:
3766: if (cur == NULL) return(NULL);
3767:
3768:
3769: ctxt = htmlCreateDocParserCtxt(cur, encoding);
3770: if (ctxt == NULL) return(NULL);
3771: if (sax != NULL) {
3772: ctxt->sax = sax;
3773: ctxt->userData = userData;
3774: }
3775:
3776: htmlParseDocument(ctxt);
3777: ret = ctxt->myDoc;
3778: if (sax != NULL) {
3779: ctxt->sax = NULL;
3780: ctxt->userData = NULL;
3781: }
3782: htmlFreeParserCtxt(ctxt);
3783:
3784: return(ret);
3785: }
3786:
3787: /**
3788: * htmlParseDoc :
1.14 daniel 3789: * @cur: a pointer to an array of xmlChar
1.1 daniel 3790: * @encoding: a free form C string describing the HTML document encoding, or NULL
3791: *
3792: * parse an HTML in-memory document and build a tree.
3793: *
3794: * Returns the resulting document tree
3795: */
3796:
3797: htmlDocPtr
1.14 daniel 3798: htmlParseDoc(xmlChar *cur, const char *encoding) {
1.1 daniel 3799: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
3800: }
3801:
3802:
3803: /**
3804: * htmlCreateFileParserCtxt :
3805: * @filename: the filename
3806: * @encoding: a free form C string describing the HTML document encoding, or NULL
3807: *
3808: * Create a parser context for a file content.
3809: * Automatic support for ZLIB/Compress compressed document is provided
3810: * by default if found at compile-time.
3811: *
3812: * Returns the new parser context or NULL
3813: */
3814: htmlParserCtxtPtr
3815: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
3816: {
3817: htmlParserCtxtPtr ctxt;
3818: htmlParserInputPtr inputStream;
1.5 daniel 3819: xmlParserInputBufferPtr buf;
1.1 daniel 3820: /* htmlCharEncoding enc; */
3821:
1.5 daniel 3822: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
3823: if (buf == NULL) return(NULL);
1.1 daniel 3824:
1.11 daniel 3825: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3826: if (ctxt == NULL) {
3827: perror("malloc");
3828: return(NULL);
3829: }
1.19 daniel 3830: memset(ctxt, 0, sizeof(htmlParserCtxt));
1.1 daniel 3831: htmlInitParserCtxt(ctxt);
1.11 daniel 3832: inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3833: if (inputStream == NULL) {
3834: perror("malloc");
1.11 daniel 3835: xmlFree(ctxt);
1.1 daniel 3836: return(NULL);
3837: }
1.19 daniel 3838: memset(inputStream, 0, sizeof(htmlParserInput));
1.1 daniel 3839:
1.11 daniel 3840: inputStream->filename = xmlMemStrdup(filename);
1.1 daniel 3841: inputStream->line = 1;
3842: inputStream->col = 1;
1.5 daniel 3843: inputStream->buf = buf;
1.21 daniel 3844: inputStream->directory = NULL;
1.1 daniel 3845:
1.5 daniel 3846: inputStream->base = inputStream->buf->buffer->content;
3847: inputStream->cur = inputStream->buf->buffer->content;
3848: inputStream->free = NULL;
1.1 daniel 3849:
3850: inputPush(ctxt, inputStream);
3851: return(ctxt);
3852: }
3853:
3854: /**
3855: * htmlSAXParseFile :
3856: * @filename: the filename
3857: * @encoding: a free form C string describing the HTML document encoding, or NULL
3858: * @sax: the SAX handler block
3859: * @userData: if using SAX, this pointer will be provided on callbacks.
3860: *
3861: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3862: * compressed document is provided by default if found at compile-time.
3863: * It use the given SAX function block to handle the parsing callback.
3864: * If sax is NULL, fallback to the default DOM tree building routines.
3865: *
3866: * Returns the resulting document tree
3867: */
3868:
3869: htmlDocPtr
3870: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
3871: void *userData) {
3872: htmlDocPtr ret;
3873: htmlParserCtxtPtr ctxt;
3874:
3875: ctxt = htmlCreateFileParserCtxt(filename, encoding);
3876: if (ctxt == NULL) return(NULL);
3877: if (sax != NULL) {
3878: ctxt->sax = sax;
3879: ctxt->userData = userData;
3880: }
3881:
3882: htmlParseDocument(ctxt);
3883:
3884: ret = ctxt->myDoc;
3885: if (sax != NULL) {
3886: ctxt->sax = NULL;
3887: ctxt->userData = NULL;
3888: }
3889: htmlFreeParserCtxt(ctxt);
3890:
3891: return(ret);
3892: }
3893:
3894: /**
3895: * htmlParseFile :
3896: * @filename: the filename
3897: * @encoding: a free form C string describing the HTML document encoding, or NULL
3898: *
3899: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3900: * compressed document is provided by default if found at compile-time.
3901: *
3902: * Returns the resulting document tree
3903: */
3904:
3905: htmlDocPtr
3906: htmlParseFile(const char *filename, const char *encoding) {
3907: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
3908: }
1.39 daniel 3909:
3910: #endif /* LIBXML_HTML_ENABLED */
Webmaster