Annotation of XML/HTMLparser.c, revision 1.51
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
1.29 daniel 10: #include "win32config.h"
1.1 daniel 11: #else
1.13 daniel 12: #include "config.h"
1.1 daniel 13: #endif
1.13 daniel 14:
1.39 daniel 15: #include "xmlversion.h"
16: #ifdef LIBXML_HTML_ENABLED
17:
1.1 daniel 18: #include <stdio.h>
1.50 veillard 19: #include <string.h>
1.13 daniel 20: #ifdef HAVE_CTYPE_H
1.1 daniel 21: #include <ctype.h>
1.13 daniel 22: #endif
23: #ifdef HAVE_STDLIB_H
1.1 daniel 24: #include <stdlib.h>
1.13 daniel 25: #endif
26: #ifdef HAVE_SYS_STAT_H
1.1 daniel 27: #include <sys/stat.h>
1.13 daniel 28: #endif
1.1 daniel 29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
1.39 daniel 39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/HTMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
1.50 veillard 44: #include <libxml/parser.h>
1.39 daniel 45: #include <libxml/valid.h>
46: #include <libxml/parserInternals.h>
47: #include <libxml/xmlIO.h>
1.31 daniel 48: #include "xml-error.h"
1.5 daniel 49:
50: #define HTML_MAX_NAMELEN 1000
51: #define INPUT_CHUNK 50
1.31 daniel 52: #define HTML_PARSER_BIG_BUFFER_SIZE 1024
53: #define HTML_PARSER_BUFFER_SIZE 100
1.1 daniel 54:
55: /* #define DEBUG */
1.31 daniel 56: /* #define DEBUG_PUSH */
1.1 daniel 57:
58: /************************************************************************
59: * *
60: * Parser stacks related functions and macros *
61: * *
62: ************************************************************************/
63:
64: /*
65: * Generic function for accessing stacks in the Parser Context
66: */
67:
1.30 daniel 68: #define PUSH_AND_POP(scope, type, name) \
69: scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
1.1 daniel 70: if (ctxt->name##Nr >= ctxt->name##Max) { \
71: ctxt->name##Max *= 2; \
1.50 veillard 72: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
1.1 daniel 73: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74: if (ctxt->name##Tab == NULL) { \
75: fprintf(stderr, "realloc failed !\n"); \
1.33 daniel 76: return(0); \
1.1 daniel 77: } \
78: } \
79: ctxt->name##Tab[ctxt->name##Nr] = value; \
80: ctxt->name = value; \
81: return(ctxt->name##Nr++); \
82: } \
1.30 daniel 83: scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
1.1 daniel 84: type ret; \
1.18 daniel 85: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 86: ctxt->name##Nr--; \
1.18 daniel 87: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 88: if (ctxt->name##Nr > 0) \
89: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90: else \
91: ctxt->name = NULL; \
92: ret = ctxt->name##Tab[ctxt->name##Nr]; \
93: ctxt->name##Tab[ctxt->name##Nr] = 0; \
94: return(ret); \
95: } \
96:
1.30 daniel 97: PUSH_AND_POP(extern, xmlNodePtr, node)
98: PUSH_AND_POP(extern, xmlChar*, name)
1.1 daniel 99:
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. They all expect a variable named `ctxt' to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value, converted to int. It should be used
 *            internally by the parser only to compare to ASCII values,
 *            otherwise it would break when running with UTF-8 encoding.
 *   UPPER    the current xmlChar converted to uppercase.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChar; must also be used only to skip ASCII defined
 *            strings within the parser. The expansion is a single
 *            parenthesized expression, so it is safe in any expression
 *            or statement context.
 *   SHRINK   call xmlParserInputShrink() on the current input.
 *   GROW     call xmlParserInputGrow() on the current input, asking for
 *            INPUT_CHUNK more.
 *
 * Clean macros, not dependent on an ASCII context
 *
 *   CURRENT      the current char value, as an int.
 *   NEXT         skip to the next character through htmlNextChar().
 *   SKIP_BLANKS  skip blanks through htmlSkipBlankChars().
 *
 * NOTE: NEXT and SKIP_BLANKS carry their own trailing semicolon. This is
 * kept as-is for compatibility with existing uses; do not use them as the
 * unbraced body of an if that has an else branch.
 */

#define CUR ((int) (*ctxt->input->cur))

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define NEXT htmlNextChar(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);
1.35 daniel 149:
150: /**
151: * htmlNextChar:
152: * @ctxt: the HTML parser context
153: *
154: * Skip to the next char input char.
155: */
156:
157: void
158: htmlNextChar(htmlParserCtxtPtr ctxt) {
1.44 daniel 159: if (ctxt->instate == XML_PARSER_EOF)
160: return;
1.35 daniel 161: if ((*ctxt->input->cur == 0) &&
162: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
163: xmlPopInput(ctxt);
164: } else {
165: if (*(ctxt->input->cur) == '\n') {
166: ctxt->input->line++; ctxt->input->col = 1;
167: } else ctxt->input->col++;
168: ctxt->input->cur++;
169: ctxt->nbChars++;
170: if (*ctxt->input->cur == 0)
171: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
172: }
173: }
1.5 daniel 174:
1.36 daniel 175: /**
176: * htmlSkipBlankChars:
177: * @ctxt: the HTML parser context
178: *
179: * skip all blanks character found at that point in the input streams.
180: *
181: * Returns the number of space chars skipped
182: */
183:
184: int
185: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
186: int res = 0;
187:
188: while (IS_BLANK(*(ctxt->input->cur))) {
189: if ((*ctxt->input->cur == 0) &&
190: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
191: xmlPopInput(ctxt);
192: } else {
193: if (*(ctxt->input->cur) == '\n') {
194: ctxt->input->line++; ctxt->input->col = 1;
195: } else ctxt->input->col++;
196: ctxt->input->cur++;
197: ctxt->nbChars++;
198: if (*ctxt->input->cur == 0)
199: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
200: }
201: res++;
202: }
203: return(res);
204: }
1.1 daniel 205:
206:
1.5 daniel 207:
1.1 daniel 208: /************************************************************************
209: * *
210: * The list of HTML elements and their properties *
211: * *
212: ************************************************************************/
213:
214: /*
215: * Start Tag: 1 means the start tag can be ommited
216: * End Tag: 1 means the end tag can be ommited
217: * 2 means it's forbidden (empty elements)
218: * Depr: this element is deprecated
219: * DTD: 1 means that this element is valid only in the Loose DTD
220: * 2 means that this element is valid only in the Frameset DTD
221: *
222: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
223: */
224: htmlElemDesc html40ElementTable[] = {
1.26 daniel 225: { "a", 0, 0, 0, 0, 0, "anchor " },
226: { "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
227: { "acronym", 0, 0, 0, 0, 0, "" },
228: { "address", 0, 0, 0, 0, 0, "information on author " },
229: { "applet", 0, 0, 0, 1, 1, "java applet " },
230: { "area", 0, 2, 1, 0, 0, "client-side image map area " },
231: { "b", 0, 0, 0, 0, 0, "bold text style" },
232: { "base", 0, 2, 1, 0, 0, "document base uri " },
233: { "basefont", 0, 2, 1, 1, 1, "base font size " },
234: { "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
235: { "big", 0, 0, 0, 0, 0, "large text style" },
236: { "blockquote", 0, 0, 0, 0, 0, "long quotation " },
237: { "body", 1, 1, 0, 0, 0, "document body " },
238: { "br", 0, 2, 1, 0, 0, "forced line break " },
239: { "button", 0, 0, 0, 0, 0, "push button " },
240: { "caption", 0, 0, 0, 0, 0, "table caption " },
241: { "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
242: { "cite", 0, 0, 0, 0, 0, "citation" },
243: { "code", 0, 0, 0, 0, 0, "computer code fragment" },
244: { "col", 0, 2, 1, 0, 0, "table column " },
245: { "colgroup", 0, 1, 0, 0, 0, "table column group " },
246: { "dd", 0, 1, 0, 0, 0, "definition description " },
247: { "del", 0, 0, 0, 0, 0, "deleted text " },
248: { "dfn", 0, 0, 0, 0, 0, "instance definition" },
249: { "dir", 0, 0, 0, 1, 1, "directory list" },
250: { "div", 0, 0, 0, 0, 0, "generic language/style container"},
251: { "dl", 0, 0, 0, 0, 0, "definition list " },
252: { "dt", 0, 1, 0, 0, 0, "definition term " },
253: { "em", 0, 0, 0, 0, 0, "emphasis" },
254: { "fieldset", 0, 0, 0, 0, 0, "form control group " },
255: { "font", 0, 0, 0, 1, 1, "local change to font " },
256: { "form", 0, 0, 0, 0, 0, "interactive form " },
257: { "frame", 0, 2, 1, 0, 2, "subwindow " },
258: { "frameset", 0, 0, 0, 0, 2, "window subdivision" },
259: { "h1", 0, 0, 0, 0, 0, "heading " },
260: { "h2", 0, 0, 0, 0, 0, "heading " },
261: { "h3", 0, 0, 0, 0, 0, "heading " },
262: { "h4", 0, 0, 0, 0, 0, "heading " },
263: { "h5", 0, 0, 0, 0, 0, "heading " },
264: { "h6", 0, 0, 0, 0, 0, "heading " },
265: { "head", 1, 1, 0, 0, 0, "document head " },
266: { "hr", 0, 2, 1, 0, 0, "horizontal rule " },
267: { "html", 1, 1, 0, 0, 0, "document root element " },
268: { "i", 0, 0, 0, 0, 0, "italic text style" },
269: { "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
270: { "img", 0, 2, 1, 0, 0, "embedded image " },
271: { "input", 0, 2, 1, 0, 0, "form control " },
272: { "ins", 0, 0, 0, 0, 0, "inserted text" },
273: { "isindex", 0, 2, 1, 1, 1, "single line prompt " },
274: { "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
275: { "label", 0, 0, 0, 0, 0, "form field label text " },
276: { "legend", 0, 0, 0, 0, 0, "fieldset legend " },
277: { "li", 0, 1, 0, 0, 0, "list item " },
278: { "link", 0, 2, 1, 0, 0, "a media-independent link " },
279: { "map", 0, 0, 0, 0, 0, "client-side image map " },
280: { "menu", 0, 0, 0, 1, 1, "menu list " },
281: { "meta", 0, 2, 1, 0, 0, "generic metainformation " },
282: { "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
283: { "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
284: { "object", 0, 0, 0, 0, 0, "generic embedded object " },
285: { "ol", 0, 0, 0, 0, 0, "ordered list " },
286: { "optgroup", 0, 0, 0, 0, 0, "option group " },
287: { "option", 0, 1, 0, 0, 0, "selectable choice " },
288: { "p", 0, 1, 0, 0, 0, "paragraph " },
289: { "param", 0, 2, 1, 0, 0, "named property value " },
290: { "pre", 0, 0, 0, 0, 0, "preformatted text " },
291: { "q", 0, 0, 0, 0, 0, "short inline quotation " },
292: { "s", 0, 0, 0, 1, 1, "strike-through text style" },
293: { "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
294: { "script", 0, 0, 0, 0, 0, "script statements " },
295: { "select", 0, 0, 0, 0, 0, "option selector " },
296: { "small", 0, 0, 0, 0, 0, "small text style" },
297: { "span", 0, 0, 0, 0, 0, "generic language/style container " },
298: { "strike", 0, 0, 0, 1, 1, "strike-through text" },
299: { "strong", 0, 0, 0, 0, 0, "strong emphasis" },
300: { "style", 0, 0, 0, 0, 0, "style info " },
301: { "sub", 0, 0, 0, 0, 0, "subscript" },
302: { "sup", 0, 0, 0, 0, 0, "superscript " },
303: { "table", 0, 0, 0, 0, 0, " " },
304: { "tbody", 1, 1, 0, 0, 0, "table body " },
305: { "td", 0, 1, 0, 0, 0, "table data cell" },
306: { "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
307: { "tfoot", 0, 1, 0, 0, 0, "table footer " },
308: { "th", 0, 1, 0, 0, 0, "table header cell" },
309: { "thead", 0, 1, 0, 0, 0, "table header " },
310: { "title", 0, 0, 0, 0, 0, "document title " },
311: { "tr", 0, 1, 0, 0, 0, "table row " },
312: { "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
313: { "u", 0, 0, 0, 1, 1, "underlined text style" },
314: { "ul", 0, 0, 0, 0, 0, "unordered list " },
315: { "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
1.1 daniel 316: };
317:
/*
 * Start tags that imply the end of a current element.
 *
 * The array is a sequence of NULL-terminated groups, itself terminated
 * by an extra NULL: any tag of a group implies the end of the current
 * element if the type of that element is in the same group.
 *
 * According to the HTML DTD, HR should be added to the heading group, as
 * it is not allowed within a H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block level containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
334:
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups, itself terminated by
 * an extra NULL. In each group the first string is the start tag being
 * opened, and the following strings are the elements that this start tag
 * closes implicitly.
 */
char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", NULL,
    "td",
        "th", "td", "p", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    /* end of table */
    NULL
};
393:
1.43 daniel 394:
1.8 daniel 395: static char** htmlStartCloseIndex[100];
1.1 daniel 396: static int htmlStartCloseIndexinitialized = 0;
397:
398: /************************************************************************
399: * *
400: * functions to handle HTML specific data *
401: * *
402: ************************************************************************/
403:
404: /**
405: * htmlInitAutoClose:
406: *
407: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
408: *
409: */
410: void
411: htmlInitAutoClose(void) {
412: int index, i = 0;
413:
414: if (htmlStartCloseIndexinitialized) return;
415:
416: for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
417: index = 0;
418: while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
419: htmlStartCloseIndex[index++] = &htmlStartClose[i];
420: while (htmlStartClose[i] != NULL) i++;
421: i++;
422: }
423: }
424:
425: /**
426: * htmlTagLookup:
427: * @tag: The tag name
428: *
429: * Lookup the HTML tag in the ElementTable
430: *
431: * Returns the related htmlElemDescPtr or NULL if not found.
432: */
433: htmlElemDescPtr
1.14 daniel 434: htmlTagLookup(const xmlChar *tag) {
1.1 daniel 435: int i = 0;
436:
437: for (i = 0; i < (sizeof(html40ElementTable) /
438: sizeof(html40ElementTable[0]));i++) {
1.8 daniel 439: if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 440: return(&html40ElementTable[i]);
441: }
442: return(NULL);
443: }
444:
445: /**
446: * htmlCheckAutoClose:
1.50 veillard 447: * @newtag: The new tag name
448: * @oldtag: The old tag name
1.1 daniel 449: *
450: * Checks wether the new tag is one of the registered valid tags for closing old.
451: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
452: *
453: * Returns 0 if no, 1 if yes.
454: */
455: int
1.50 veillard 456: htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
1.1 daniel 457: int i, index;
1.8 daniel 458: char **close;
1.1 daniel 459:
460: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
461:
462: /* inefficient, but not a big deal */
463: for (index = 0; index < 100;index++) {
464: close = htmlStartCloseIndex[index];
465: if (close == NULL) return(0);
1.50 veillard 466: if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
1.1 daniel 467: }
468:
469: i = close - htmlStartClose;
470: i++;
471: while (htmlStartClose[i] != NULL) {
1.50 veillard 472: if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) {
1.1 daniel 473: return(1);
474: }
475: i++;
476: }
477: return(0);
478: }
479:
480: /**
1.50 veillard 481: * htmlAutoCloseOnClose:
482: * @ctxt: an HTML parser context
483: * @newtag: The new tag name
484: *
485: * The HTmL DtD allows an ending tag to implicitely close other tags.
486: */
487: void
488: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
489: htmlElemDescPtr info;
490: xmlChar *oldname;
491: int i;
492:
493: #ifdef DEBUG
494: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
495: for (i = 0;i < ctxt->nameNr;i++)
496: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
497: #endif
498:
499: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
500: if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
501: }
502: if (i < 0) return;
503:
504: while (xmlStrcmp(newtag, ctxt->name)) {
505: info = htmlTagLookup(ctxt->name);
506: if ((info == NULL) || (info->endTag == 1)) {
507: #ifdef DEBUG
508: fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
509: #endif
510: } else {
511: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
512: ctxt->sax->error(ctxt->userData,
513: "Opening and ending tag mismatch: %s and %s\n",
514: newtag, ctxt->name);
515: ctxt->wellFormed = 0;
516: }
517: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
518: ctxt->sax->endElement(ctxt->userData, ctxt->name);
519: oldname = htmlnamePop(ctxt);
520: if (oldname != NULL) {
521: #ifdef DEBUG
522: fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
523: #endif
524: xmlFree(oldname);
525: }
526: }
527: }
528:
529: /**
1.1 daniel 530: * htmlAutoClose:
531: * @ctxt: an HTML parser context
1.50 veillard 532: * @newtag: The new tag name or NULL
1.1 daniel 533: *
534: * The HTmL DtD allows a tag to implicitely close other tags.
535: * The list is kept in htmlStartClose array. This function is
536: * called when a new tag has been detected and generates the
537: * appropriates closes if possible/needed.
1.50 veillard 538: * If newtag is NULL this mean we are at the end of the resource
1.47 daniel 539: * and we should check
1.1 daniel 540: */
541: void
1.50 veillard 542: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1.15 daniel 543: xmlChar *oldname;
1.50 veillard 544: while ((newtag != NULL) && (ctxt->name != NULL) &&
545: (htmlCheckAutoClose(newtag, ctxt->name))) {
1.1 daniel 546: #ifdef DEBUG
1.50 veillard 547: fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1.1 daniel 548: #endif
549: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1.15 daniel 550: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 551: oldname = htmlnamePop(ctxt);
1.18 daniel 552: if (oldname != NULL) {
553: #ifdef DEBUG
554: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
555: #endif
1.17 daniel 556: xmlFree(oldname);
1.18 daniel 557: }
1.1 daniel 558: }
1.50 veillard 559: if (newtag == NULL) {
1.49 daniel 560: htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
561: htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
562: htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
563: }
1.50 veillard 564: while ((newtag == NULL) && (ctxt->name != NULL) &&
1.47 daniel 565: ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
566: (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
567: (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
568: #ifdef DEBUG
569: fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
570: #endif
571: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
572: ctxt->sax->endElement(ctxt->userData, ctxt->name);
573: oldname = htmlnamePop(ctxt);
574: if (oldname != NULL) {
575: #ifdef DEBUG
576: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
577: #endif
578: xmlFree(oldname);
579: }
580: }
581:
1.1 daniel 582: }
583:
584: /**
1.28 daniel 585: * htmlAutoCloseTag:
586: * @doc: the HTML document
587: * @name: The tag name
588: * @elem: the HTML element
589: *
590: * The HTmL DtD allows a tag to implicitely close other tags.
591: * The list is kept in htmlStartClose array. This function checks
592: * if the element or one of it's children would autoclose the
593: * given tag.
594: *
595: * Returns 1 if autoclose, 0 otherwise
596: */
597: int
598: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
599: htmlNodePtr child;
600:
601: if (elem == NULL) return(1);
602: if (!xmlStrcmp(name, elem->name)) return(0);
603: if (htmlCheckAutoClose(elem->name, name)) return(1);
1.37 daniel 604: child = elem->children;
1.28 daniel 605: while (child != NULL) {
606: if (htmlAutoCloseTag(doc, name, child)) return(1);
607: child = child->next;
608: }
609: return(0);
610: }
611:
612: /**
613: * htmlIsAutoClosed:
614: * @doc: the HTML document
615: * @elem: the HTML element
616: *
617: * The HTmL DtD allows a tag to implicitely close other tags.
618: * The list is kept in htmlStartClose array. This function checks
619: * if a tag is autoclosed by one of it's child
620: *
621: * Returns 1 if autoclosed, 0 otherwise
622: */
623: int
624: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
625: htmlNodePtr child;
626:
627: if (elem == NULL) return(1);
1.37 daniel 628: child = elem->children;
1.28 daniel 629: while (child != NULL) {
630: if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
631: child = child->next;
632: }
633: return(0);
634: }
635:
636: /**
1.43 daniel 637: * htmlCheckImplied:
638: * @ctxt: an HTML parser context
1.50 veillard 639: * @newtag: The new tag name
1.43 daniel 640: *
641: * The HTmL DtD allows a tag to exists only implicitely
642: * called when a new tag has been detected and generates the
643: * appropriates implicit tags if missing
644: */
645: void
1.50 veillard 646: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
647: if (!xmlStrcmp(newtag, BAD_CAST"html"))
1.43 daniel 648: return;
649: if (ctxt->nameNr <= 0) {
650: #ifdef DEBUG
651: fprintf(stderr,"Implied element html: pushed html\n");
652: #endif
653: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
654: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
655: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
656: }
1.50 veillard 657: if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
1.43 daniel 658: return;
659: if (ctxt->nameNr <= 1) {
1.50 veillard 660: if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
661: (!xmlStrcmp(newtag, BAD_CAST"style")) ||
662: (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
663: (!xmlStrcmp(newtag, BAD_CAST"link")) ||
664: (!xmlStrcmp(newtag, BAD_CAST"title")) ||
665: (!xmlStrcmp(newtag, BAD_CAST"base"))) {
1.43 daniel 666: /*
667: * dropped OBJECT ... i you put it first BODY will be
668: * assumed !
669: */
670: #ifdef DEBUG
671: fprintf(stderr,"Implied element head: pushed head\n");
672: #endif
673: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
674: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
675: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
676: } else {
677: #ifdef DEBUG
678: fprintf(stderr,"Implied element body: pushed body\n");
679: #endif
680: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
681: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
682: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
683: }
684: }
685: }
686:
1.1 daniel 687: /************************************************************************
688: * *
689: * The list of HTML predefined entities *
690: * *
691: ************************************************************************/
692:
693:
694: htmlEntityDesc html40EntitiesTable[] = {
695: /*
696: * the 4 absolute ones,
697: */
698: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
699: { 38, "amp", "ampersand, U+0026 ISOnum" },
700: { 60, "lt", "less-than sign, U+003C ISOnum" },
701: { 62, "gt", "greater-than sign, U+003E ISOnum" },
702:
703: /*
704: * A bunch still in the 128-255 range
705: * Replacing them depend really on the charset used.
706: */
1.28 daniel 707: { 39, "apos", "single quote" },
1.1 daniel 708: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
709: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
710: { 162, "cent", "cent sign, U+00A2 ISOnum" },
711: { 163, "pound","pound sign, U+00A3 ISOnum" },
712: { 164, "curren","currency sign, U+00A4 ISOnum" },
713: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
714: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
715: { 167, "sect", "section sign, U+00A7 ISOnum" },
716: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
717: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
718: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
719: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
720: { 172, "not", "not sign, U+00AC ISOnum" },
721: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
722: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
723: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
724: { 176, "deg", "degree sign, U+00B0 ISOnum" },
725: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
726: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
727: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
728: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
729: { 181, "micro","micro sign, U+00B5 ISOnum" },
730: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 731: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 732: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
733: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
734: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 735: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 736: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
737: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
738: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
739: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
740: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
741: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
742: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
743: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
744: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
745: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
746: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
747: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
748: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
749: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
750: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
751: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
752: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
753: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
754: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
755: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
756: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
757: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
758: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
759: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
760: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
761: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
762: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
763: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 764: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 765: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
766: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
767: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
768: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
769: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
770: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
771: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
772: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
773: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
774: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
775: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
776: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
777: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
778: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
779: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
780: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
781: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
782: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
783: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
784: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
785: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
786: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
787: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
788: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
789: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
790: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
791: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
792: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
793: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
794: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
795: { 247, "divide","division sign, U+00F7 ISOnum" },
796: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
797: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
798: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
799: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
800: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
801: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
802: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
803: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
804:
805: /*
806: * Anything below should really be kept as entities references
807: */
808: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
809:
810: { 913, "Alpha","greek capital letter alpha, U+0391" },
811: { 914, "Beta", "greek capital letter beta, U+0392" },
812: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
813: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
814: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
815: { 918, "Zeta", "greek capital letter zeta, U+0396" },
816: { 919, "Eta", "greek capital letter eta, U+0397" },
817: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
818: { 921, "Iota", "greek capital letter iota, U+0399" },
819: { 922, "Kappa","greek capital letter kappa, U+039A" },
820: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
821: { 924, "Mu", "greek capital letter mu, U+039C" },
822: { 925, "Nu", "greek capital letter nu, U+039D" },
823: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
824: { 927, "Omicron","greek capital letter omicron, U+039F" },
825: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
826: { 929, "Rho", "greek capital letter rho, U+03A1" },
827: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
828: { 932, "Tau", "greek capital letter tau, U+03A4" },
829: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
830: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
831: { 935, "Chi", "greek capital letter chi, U+03A7" },
832: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
833: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
834:
835: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
836: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
837: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
838: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
839: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
840: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
841: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
842: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
843: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
844: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
845: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
846: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
847: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
848: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
849: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
850: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
851: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
852: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
853: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
854: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
855: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
856: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
857: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
858: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
859: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
860: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
861: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
862: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
863:
864: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
865: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
866: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
867: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
868: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
869: { 8260, "frasl","fraction slash, U+2044 NEW" },
870:
1.7 daniel 871: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 872: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
873: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
874: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
875: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
876: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
877: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
878: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
879: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
880: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
881: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
882: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
883: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
884: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
885: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
886: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
887:
888:
889: { 8704, "forall","for all, U+2200 ISOtech" },
890: { 8706, "part", "partial differential, U+2202 ISOtech" },
891: { 8707, "exist","there exists, U+2203 ISOtech" },
892: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
893: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
894: { 8712, "isin", "element of, U+2208 ISOtech" },
895: { 8713, "notin","not an element of, U+2209 ISOtech" },
896: { 8715, "ni", "contains as member, U+220B ISOtech" },
897: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
898: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
899: { 8722, "minus","minus sign, U+2212 ISOtech" },
900: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
901: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
902: { 8733, "prop", "proportional to, U+221D ISOtech" },
903: { 8734, "infin","infinity, U+221E ISOtech" },
904: { 8736, "ang", "angle, U+2220 ISOamso" },
905: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
906: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
907: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
908: { 8746, "cup", "union = cup, U+222A ISOtech" },
909: { 8747, "int", "integral, U+222B ISOtech" },
910: { 8756, "there4","therefore, U+2234 ISOtech" },
911: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
912: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
913: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
914: { 8800, "ne", "not equal to, U+2260 ISOtech" },
915: { 8801, "equiv","identical to, U+2261 ISOtech" },
916: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
917: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
918: { 8834, "sub", "subset of, U+2282 ISOtech" },
919: { 8835, "sup", "superset of, U+2283 ISOtech" },
920: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
921: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
922: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
923: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
924: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
925: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
926: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
927: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
928: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
929: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
930: { 8971, "rfloor","right floor, U+230B ISOamsc" },
931: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
932: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
933: { 9674, "loz", "lozenge, U+25CA ISOpub" },
934:
935: { 9824, "spades","black spade suit, U+2660 ISOpub" },
936: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
937: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
938: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
939:
940: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
941: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
942: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
943: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
944: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
945: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
946: { 732, "tilde","small tilde, U+02DC ISOdia" },
947:
948: { 8194, "ensp", "en space, U+2002 ISOpub" },
949: { 8195, "emsp", "em space, U+2003 ISOpub" },
950: { 8201, "thinsp","thin space, U+2009 ISOpub" },
951: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
952: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
953: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
954: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
955: { 8211, "ndash","en dash, U+2013 ISOpub" },
956: { 8212, "mdash","em dash, U+2014 ISOpub" },
957: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
958: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
959: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
960: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
961: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
962: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
963: { 8224, "dagger","dagger, U+2020 ISOpub" },
964: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
965: { 8240, "permil","per mille sign, U+2030 ISOtech" },
966: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1.7 daniel 967: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1.1 daniel 968: { 8364, "euro", "euro sign, U+20AC NEW" }
969: };
970:
971: /************************************************************************
972: * *
973: * Commodity functions to handle entities *
974: * *
975: ************************************************************************/
976:
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to the new size.  It
 * expects an integer variable named <buffer>_size to be in scope and must
 * be expanded inside a function returning a pointer: when the
 * reallocation fails it reports the failure with perror(), frees the old
 * block and makes the enclosing function return NULL.
 *
 * The result of xmlRealloc is kept in a temporary so the old block is not
 * lost on failure (this assumes xmlRealloc follows realloc() semantics and
 * leaves the original allocation valid when it returns NULL).
 *
 * The plain-brace form is kept on purpose so that existing
 * "growBuffer(x);" call sites expand exactly as before.
 */
#define growBuffer(buffer) {						\
    void *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = xmlRealloc(buffer,					\
                                buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = (xmlChar *) buffer##_grown;				\
}
988:
989: /**
990: * htmlEntityLookup:
991: * @name: the entity name
992: *
993: * Lookup the given entity in EntitiesTable
994: *
995: * TODO: the linear scan is really ugly, an hash table is really needed.
996: *
997: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
998: */
999: htmlEntityDescPtr
1.14 daniel 1000: htmlEntityLookup(const xmlChar *name) {
1.1 daniel 1001: int i;
1002:
1003: for (i = 0;i < (sizeof(html40EntitiesTable)/
1004: sizeof(html40EntitiesTable[0]));i++) {
1.8 daniel 1005: if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 1006: #ifdef DEBUG
1.18 daniel 1007: fprintf(stderr,"Found entity %s\n", name);
1.1 daniel 1008: #endif
1009: return(&html40EntitiesTable[i]);
1010: }
1011: }
1012: return(NULL);
1013: }
1014:
1015:
1016: /**
1017: * htmlDecodeEntities:
1018: * @ctxt: the parser context
1019: * @len: the len to decode (in bytes !), -1 for no size limit
1.14 daniel 1020: * @end: an end marker xmlChar, 0 if none
1021: * @end2: an end marker xmlChar, 0 if none
1022: * @end3: an end marker xmlChar, 0 if none
1.1 daniel 1023: *
1024: * Subtitute the HTML entities by their value
1025: *
1.19 daniel 1026: * DEPRECATED !!!!
1.1 daniel 1027: *
1028: * Returns A newly allocated string with the substitution done. The caller
1029: * must deallocate it !
1030: */
1.14 daniel 1031: xmlChar *
1.1 daniel 1032: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1.14 daniel 1033: xmlChar end, xmlChar end2, xmlChar end3) {
1034: xmlChar *buffer = NULL;
1.1 daniel 1035: int buffer_size = 0;
1.14 daniel 1036: xmlChar *out = NULL;
1037: xmlChar *name = NULL;
1.1 daniel 1038:
1.14 daniel 1039: xmlChar *cur = NULL;
1.1 daniel 1040: htmlEntityDescPtr ent;
1.5 daniel 1041: int nbchars = 0;
1.1 daniel 1042: unsigned int max = (unsigned int) len;
1043:
1044: /*
1045: * allocate a translation buffer.
1046: */
1.31 daniel 1047: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1.14 daniel 1048: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1.1 daniel 1049: if (buffer == NULL) {
1050: perror("htmlDecodeEntities: malloc failed");
1051: return(NULL);
1052: }
1053: out = buffer;
1054:
1055: /*
1056: * Ok loop until we reach one of the ending char or a size limit.
1057: */
1.45 daniel 1058: while ((nbchars < (int) max) && (CUR != end) &&
1.1 daniel 1059: (CUR != end2) && (CUR != end3)) {
1060:
1061: if (CUR == '&') {
1062: if (NXT(1) == '#') {
1063: int val = htmlParseCharRef(ctxt);
1.8 daniel 1064: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 1065: *out++ = val;
1.5 daniel 1066: nbchars += 3; /* !!!! */
1.1 daniel 1067: } else {
1068: ent = htmlParseEntityRef(ctxt, &name);
1069: if (name != NULL) {
1070: if ((ent == NULL) || (ent->value <= 0) ||
1071: (ent->value >= 255)) {
1072: *out++ = '&';
1073: cur = name;
1074: while (*cur != 0) {
1075: if (out - buffer > buffer_size - 100) {
1076: int index = out - buffer;
1077:
1078: growBuffer(buffer);
1079: out = &buffer[index];
1080: }
1081: *out++ = *cur++;
1082: }
1083: *out++ = ';';
1084: } else {
1.8 daniel 1085: /* invalid for UTF-8 variable encoding !!!!! */
1.14 daniel 1086: *out++ = (xmlChar)ent->value;
1.1 daniel 1087: if (out - buffer > buffer_size - 100) {
1088: int index = out - buffer;
1089:
1090: growBuffer(buffer);
1091: out = &buffer[index];
1092: }
1093: }
1.5 daniel 1094: nbchars += 2 + xmlStrlen(name);
1.11 daniel 1095: xmlFree(name);
1.1 daniel 1096: }
1097: }
1098: } else {
1.8 daniel 1099: /* invalid for UTF-8 , use COPY(out); !!!!! */
1.1 daniel 1100: *out++ = CUR;
1.5 daniel 1101: nbchars++;
1.1 daniel 1102: if (out - buffer > buffer_size - 100) {
1103: int index = out - buffer;
1104:
1105: growBuffer(buffer);
1106: out = &buffer[index];
1107: }
1108: NEXT;
1109: }
1110: }
1111: *out++ = 0;
1112: return(buffer);
1113: }
1114:
1.31 daniel 1115: /************************************************************************
1116: * *
1117: * Commodity functions to handle streams *
1118: * *
1119: ************************************************************************/
1120:
1121: /**
1122: * htmlFreeInputStream:
1123: * @input: an htmlParserInputPtr
1124: *
1125: * Free up an input stream.
1126: */
1127: void
1128: htmlFreeInputStream(htmlParserInputPtr input) {
1129: if (input == NULL) return;
1130:
1131: if (input->filename != NULL) xmlFree((char *) input->filename);
1132: if (input->directory != NULL) xmlFree((char *) input->directory);
1133: if ((input->free != NULL) && (input->base != NULL))
1134: input->free((xmlChar *) input->base);
1135: if (input->buf != NULL)
1136: xmlFreeParserInputBuffer(input->buf);
1137: memset(input, -1, sizeof(htmlParserInput));
1138: xmlFree(input);
1139: }
1140:
1141: /**
1142: * htmlNewInputStream:
1143: * @ctxt: an HTML parser context
1144: *
1145: * Create a new input stream structure
1146: * Returns the new input stream or NULL
1147: */
1148: htmlParserInputPtr
1149: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1150: htmlParserInputPtr input;
1151:
1152: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1153: if (input == NULL) {
1154: ctxt->errNo = XML_ERR_NO_MEMORY;
1155: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1156: ctxt->sax->error(ctxt->userData,
1157: "malloc: couldn't allocate a new input stream\n");
1158: ctxt->errNo = XML_ERR_NO_MEMORY;
1159: return(NULL);
1160: }
1.51 ! veillard 1161: memset(input, 0, sizeof(htmlParserInput));
1.31 daniel 1162: input->filename = NULL;
1163: input->directory = NULL;
1164: input->base = NULL;
1165: input->cur = NULL;
1166: input->buf = NULL;
1167: input->line = 1;
1168: input->col = 1;
1169: input->buf = NULL;
1170: input->free = NULL;
1.51 ! veillard 1171: input->version = NULL;
1.31 daniel 1172: input->consumed = 0;
1173: input->length = 0;
1174: return(input);
1175: }
1176:
1.1 daniel 1177:
1178: /************************************************************************
1179: * *
1180: * Commodity functions, cleanup needed ? *
1181: * *
1182: ************************************************************************/
1183:
1184: /**
1185: * areBlanks:
1186: * @ctxt: an HTML parser context
1.14 daniel 1187: * @str: a xmlChar *
1.1 daniel 1188: * @len: the size of @str
1189: *
1190: * Is this a sequence of blank chars that one can ignore ?
1191: *
1192: * Returns 1 if ignorable 0 otherwise.
1193: */
1194:
1.14 daniel 1195: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1.1 daniel 1196: int i;
1197: xmlNodePtr lastChild;
1198:
1199: for (i = 0;i < len;i++)
1200: if (!(IS_BLANK(str[i]))) return(0);
1201:
1.48 daniel 1202: if (CUR == 0) return(1);
1.1 daniel 1203: if (CUR != '<') return(0);
1204: if (ctxt->node == NULL) return(0);
1205: lastChild = xmlGetLastChild(ctxt->node);
1206: if (lastChild == NULL) {
1207: if (ctxt->node->content != NULL) return(0);
1208: } else if (xmlNodeIsText(lastChild))
1209: return(0);
1210: return(1);
1211: }
1212:
1213: /**
1214: * htmlHandleEntity:
1215: * @ctxt: an HTML parser context
1216: * @entity: an XML entity pointer.
1217: *
1218: * Default handling of an HTML entity, call the parser with the
1219: * substitution string
1220: */
1221:
1222: void
1223: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1224: int len;
1225:
1226: if (entity->content == NULL) {
1227: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1228: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1229: entity->name);
1230: ctxt->wellFormed = 0;
1231: return;
1232: }
1233: len = xmlStrlen(entity->content);
1234:
1235: /*
1236: * Just handle the content as a set of chars.
1237: */
1238: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1239: ctxt->sax->characters(ctxt->userData, entity->content, len);
1240:
1241: }
1242:
1243: /**
1244: * htmlNewDoc:
1245: * @URI: URI for the dtd, or NULL
1246: * @ExternalID: the external ID of the DTD, or NULL
1247: *
1248: * Returns a new document
1249: */
1250: htmlDocPtr
1.14 daniel 1251: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1.1 daniel 1252: xmlDocPtr cur;
1253:
1254: /*
1255: * Allocate a new document and fill the fields.
1256: */
1.11 daniel 1257: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1.1 daniel 1258: if (cur == NULL) {
1259: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1260: return(NULL);
1261: }
1.10 daniel 1262: memset(cur, 0, sizeof(xmlDoc));
1.1 daniel 1263:
1.20 daniel 1264: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 1265: cur->version = NULL;
1266: cur->intSubset = NULL;
1.28 daniel 1267: if ((ExternalID == NULL) &&
1268: (URI == NULL))
1269: xmlCreateIntSubset(cur, BAD_CAST "HTML",
1270: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1271: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1272: else
1273: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.41 daniel 1274: cur->doc = cur;
1.1 daniel 1275: cur->name = NULL;
1.37 daniel 1276: cur->children = NULL;
1.1 daniel 1277: cur->extSubset = NULL;
1278: cur->oldNs = NULL;
1279: cur->encoding = NULL;
1280: cur->standalone = 1;
1281: cur->compression = 0;
1.12 daniel 1282: cur->ids = NULL;
1283: cur->refs = NULL;
1.1 daniel 1284: #ifndef XML_WITHOUT_CORBA
1285: cur->_private = NULL;
1286: #endif
1287: return(cur);
1288: }
1289:
1290:
1291: /************************************************************************
1292: * *
1293: * The parser itself *
1294: * Relates to http://www.w3.org/TR/html40 *
1295: * *
1296: ************************************************************************/
1297:
1298: /************************************************************************
1299: * *
1300: * The parser itself *
1301: * *
1302: ************************************************************************/
1303:
1304: /**
1305: * htmlParseHTMLName:
1306: * @ctxt: an HTML parser context
1307: *
1.26 daniel 1308: * parse an HTML tag or attribute name, note that we convert it to lowercase
1.1 daniel 1309: * since HTML names are not case-sensitive.
1310: *
1311: * Returns the Tag Name parsed or NULL
1312: */
1313:
1.14 daniel 1314: xmlChar *
1.1 daniel 1315: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1316: xmlChar *ret = NULL;
1.1 daniel 1317: int i = 0;
1.31 daniel 1318: xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1.1 daniel 1319:
1320: if (!IS_LETTER(CUR) && (CUR != '_') &&
1321: (CUR != ':')) return(NULL);
1322:
1.31 daniel 1323: while ((i < HTML_PARSER_BUFFER_SIZE) &&
1.45 daniel 1324: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1325: (CUR == ':') || (CUR == '_'))) {
1.26 daniel 1326: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1.1 daniel 1327: else loc[i] = CUR;
1328: i++;
1329:
1330: NEXT;
1331: }
1332:
1333: ret = xmlStrndup(loc, i);
1334:
1335: return(ret);
1336: }
1337:
1338: /**
1339: * htmlParseName:
1340: * @ctxt: an HTML parser context
1341: *
1342: * parse an HTML name, this routine is case sensistive.
1343: *
1344: * Returns the Name parsed or NULL
1345: */
1346:
1.14 daniel 1347: xmlChar *
1.1 daniel 1348: htmlParseName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1349: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1350: int len = 0;
1.1 daniel 1351:
1.5 daniel 1352: GROW;
1353: if (!IS_LETTER(CUR) && (CUR != '_')) {
1354: return(NULL);
1355: }
1.1 daniel 1356:
1357: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1358: (CUR == '.') || (CUR == '-') ||
1359: (CUR == '_') || (CUR == ':') ||
1360: (IS_COMBINING(CUR)) ||
1.5 daniel 1361: (IS_EXTENDER(CUR))) {
1362: buf[len++] = CUR;
1.1 daniel 1363: NEXT;
1.5 daniel 1364: if (len >= HTML_MAX_NAMELEN) {
1365: fprintf(stderr,
1366: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1367: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1368: (CUR == '.') || (CUR == '-') ||
1369: (CUR == '_') || (CUR == ':') ||
1370: (IS_COMBINING(CUR)) ||
1371: (IS_EXTENDER(CUR)))
1372: NEXT;
1373: break;
1374: }
1375: }
1376: return(xmlStrndup(buf, len));
1.1 daniel 1377: }
1378:
1379: /**
1380: * htmlParseHTMLAttribute:
1381: * @ctxt: an HTML parser context
1.19 daniel 1382: * @stop: a char stop value
1.1 daniel 1383: *
1.19 daniel 1384: * parse an HTML attribute value till the stop (quote), if
1385: * stop is 0 then it stops at the first space
1.1 daniel 1386: *
1.19 daniel 1387: * Returns the attribute parsed or NULL
1.1 daniel 1388: */
1389:
1.14 daniel 1390: xmlChar *
1.19 daniel 1391: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1.32 daniel 1392: #if 0
1.14 daniel 1393: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1394: int len = 0;
1.1 daniel 1395:
1.5 daniel 1396: GROW;
1.19 daniel 1397: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1398: if ((stop == 0) && (IS_BLANK(CUR))) break;
1.5 daniel 1399: buf[len++] = CUR;
1.1 daniel 1400: NEXT;
1.5 daniel 1401: if (len >= HTML_MAX_NAMELEN) {
1402: fprintf(stderr,
1403: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1404: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1.19 daniel 1405: (CUR != '>') &&
1.5 daniel 1406: (CUR != '\'') && (CUR != '"'))
1407: NEXT;
1408: break;
1409: }
1410: }
1411: return(xmlStrndup(buf, len));
1.32 daniel 1412: #else
1413: xmlChar *buffer = NULL;
1414: int buffer_size = 0;
1415: xmlChar *out = NULL;
1416: xmlChar *name = NULL;
1417:
1418: xmlChar *cur = NULL;
1419: htmlEntityDescPtr ent;
1420:
1421: /*
1422: * allocate a translation buffer.
1423: */
1424: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1425: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1426: if (buffer == NULL) {
1427: perror("htmlParseHTMLAttribute: malloc failed");
1428: return(NULL);
1429: }
1430: out = buffer;
1431:
1432: /*
1433: * Ok loop until we reach one of the ending chars
1434: */
1435: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1436: if ((stop == 0) && (IS_BLANK(CUR))) break;
1437: if (CUR == '&') {
1438: if (NXT(1) == '#') {
1439: int val = htmlParseCharRef(ctxt);
1440: *out++ = val;
1441: } else {
1442: ent = htmlParseEntityRef(ctxt, &name);
1443: if (name == NULL) {
1444: *out++ = '&';
1445: if (out - buffer > buffer_size - 100) {
1446: int index = out - buffer;
1447:
1448: growBuffer(buffer);
1449: out = &buffer[index];
1450: }
1451: } else if ((ent == NULL) || (ent->value <= 0) ||
1452: (ent->value >= 255)) {
1453: *out++ = '&';
1454: cur = name;
1455: while (*cur != 0) {
1456: if (out - buffer > buffer_size - 100) {
1457: int index = out - buffer;
1458:
1459: growBuffer(buffer);
1460: out = &buffer[index];
1461: }
1462: *out++ = *cur++;
1463: }
1464: xmlFree(name);
1465: } else {
1466: *out++ = ent->value;
1467: if (out - buffer > buffer_size - 100) {
1468: int index = out - buffer;
1469:
1470: growBuffer(buffer);
1471: out = &buffer[index];
1472: }
1473: xmlFree(name);
1474: }
1475: }
1476: } else {
1477: *out++ = CUR;
1478: if (out - buffer > buffer_size - 100) {
1479: int index = out - buffer;
1480:
1481: growBuffer(buffer);
1482: out = &buffer[index];
1483: }
1484: NEXT;
1485: }
1486: }
1487: *out++ = 0;
1488: return(buffer);
1489: #endif
1.1 daniel 1490: }
1491:
1492: /**
1493: * htmlParseNmtoken:
1494: * @ctxt: an HTML parser context
1495: *
1496: * parse an HTML Nmtoken.
1497: *
1498: * Returns the Nmtoken parsed or NULL
1499: */
1500:
1.14 daniel 1501: xmlChar *
1.1 daniel 1502: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.14 daniel 1503: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1504: int len = 0;
1.1 daniel 1505:
1.5 daniel 1506: GROW;
1.1 daniel 1507: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1508: (CUR == '.') || (CUR == '-') ||
1509: (CUR == '_') || (CUR == ':') ||
1510: (IS_COMBINING(CUR)) ||
1.5 daniel 1511: (IS_EXTENDER(CUR))) {
1512: buf[len++] = CUR;
1.1 daniel 1513: NEXT;
1.5 daniel 1514: if (len >= HTML_MAX_NAMELEN) {
1515: fprintf(stderr,
1516: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1517: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1518: (CUR == '.') || (CUR == '-') ||
1519: (CUR == '_') || (CUR == ':') ||
1520: (IS_COMBINING(CUR)) ||
1521: (IS_EXTENDER(CUR)))
1522: NEXT;
1523: break;
1524: }
1525: }
1526: return(xmlStrndup(buf, len));
1.1 daniel 1527: }
1528:
1529: /**
1530: * htmlParseEntityRef:
1531: * @ctxt: an HTML parser context
1532: * @str: location to store the entity name
1533: *
1534: * parse an HTML ENTITY references
1535: *
1536: * [68] EntityRef ::= '&' Name ';'
1537: *
1538: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1539: * if non-NULL *str will have to be freed by the caller.
1540: */
1541: htmlEntityDescPtr
1.14 daniel 1542: htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1543: xmlChar *name;
1.1 daniel 1544: htmlEntityDescPtr ent = NULL;
1545: *str = NULL;
1546:
1547: if (CUR == '&') {
1548: NEXT;
1549: name = htmlParseName(ctxt);
1550: if (name == NULL) {
1551: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1552: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1553: ctxt->wellFormed = 0;
1554: } else {
1.5 daniel 1555: GROW;
1.1 daniel 1556: if (CUR == ';') {
1557: *str = name;
1558:
1559: /*
1560: * Lookup the entity in the table.
1561: */
1562: ent = htmlEntityLookup(name);
1.32 daniel 1563: if (ent != NULL) /* OK that's ugly !!! */
1564: NEXT;
1.1 daniel 1565: } else {
1566: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1567: ctxt->sax->error(ctxt->userData,
1568: "htmlParseEntityRef: expecting ';'\n");
1.32 daniel 1569: *str = name;
1.1 daniel 1570: }
1571: }
1572: }
1573: return(ent);
1574: }
1575:
1576: /**
1577: * htmlParseAttValue:
1578: * @ctxt: an HTML parser context
1579: *
1580: * parse a value for an attribute
1581: * Note: the parser won't do substitution of entities here, this
1582: * will be handled later in xmlStringGetNodeList, unless it was
1583: * asked for ctxt->replaceEntities != 0
1584: *
1585: * Returns the AttValue parsed or NULL.
1586: */
1587:
1.14 daniel 1588: xmlChar *
1.1 daniel 1589: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1.14 daniel 1590: xmlChar *ret = NULL;
1.1 daniel 1591:
1592: if (CUR == '"') {
1593: NEXT;
1.19 daniel 1594: ret = htmlParseHTMLAttribute(ctxt, '"');
1.1 daniel 1595: if (CUR != '"') {
1596: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1597: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1598: ctxt->wellFormed = 0;
1599: } else
1600: NEXT;
1601: } else if (CUR == '\'') {
1602: NEXT;
1.19 daniel 1603: ret = htmlParseHTMLAttribute(ctxt, '\'');
1.1 daniel 1604: if (CUR != '\'') {
1605: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1606: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1607: ctxt->wellFormed = 0;
1608: } else
1609: NEXT;
1610: } else {
1611: /*
1612: * That's an HTMLism, the attribute value may not be quoted
1613: */
1.19 daniel 1614: ret = htmlParseHTMLAttribute(ctxt, 0);
1.1 daniel 1615: if (ret == NULL) {
1616: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1617: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1618: ctxt->wellFormed = 0;
1619: }
1620: }
1621: return(ret);
1622: }
1623:
1624: /**
1625: * htmlParseSystemLiteral:
1626: * @ctxt: an HTML parser context
1627: *
1628: * parse an HTML Literal
1629: *
1630: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1631: *
1632: * Returns the SystemLiteral parsed or NULL
1633: */
1634:
1.14 daniel 1635: xmlChar *
1.1 daniel 1636: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1637: const xmlChar *q;
1638: xmlChar *ret = NULL;
1.1 daniel 1639:
1640: if (CUR == '"') {
1641: NEXT;
1642: q = CUR_PTR;
1643: while ((IS_CHAR(CUR)) && (CUR != '"'))
1644: NEXT;
1645: if (!IS_CHAR(CUR)) {
1646: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1647: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1648: ctxt->wellFormed = 0;
1649: } else {
1650: ret = xmlStrndup(q, CUR_PTR - q);
1651: NEXT;
1652: }
1653: } else if (CUR == '\'') {
1654: NEXT;
1655: q = CUR_PTR;
1656: while ((IS_CHAR(CUR)) && (CUR != '\''))
1657: NEXT;
1658: if (!IS_CHAR(CUR)) {
1659: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1660: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1661: ctxt->wellFormed = 0;
1662: } else {
1663: ret = xmlStrndup(q, CUR_PTR - q);
1664: NEXT;
1665: }
1666: } else {
1667: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.38 daniel 1668: ctxt->sax->error(ctxt->userData,
1669: "SystemLiteral \" or ' expected\n");
1.1 daniel 1670: ctxt->wellFormed = 0;
1671: }
1672:
1673: return(ret);
1674: }
1675:
1676: /**
1677: * htmlParsePubidLiteral:
1678: * @ctxt: an HTML parser context
1679: *
1680: * parse an HTML public literal
1681: *
1682: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1683: *
1684: * Returns the PubidLiteral parsed or NULL.
1685: */
1686:
1.14 daniel 1687: xmlChar *
1.1 daniel 1688: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1689: const xmlChar *q;
1690: xmlChar *ret = NULL;
1.1 daniel 1691: /*
1692: * Name ::= (Letter | '_') (NameChar)*
1693: */
1694: if (CUR == '"') {
1695: NEXT;
1696: q = CUR_PTR;
1697: while (IS_PUBIDCHAR(CUR)) NEXT;
1698: if (CUR != '"') {
1699: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1700: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1701: ctxt->wellFormed = 0;
1702: } else {
1703: ret = xmlStrndup(q, CUR_PTR - q);
1704: NEXT;
1705: }
1706: } else if (CUR == '\'') {
1707: NEXT;
1708: q = CUR_PTR;
1709: while ((IS_LETTER(CUR)) && (CUR != '\''))
1710: NEXT;
1711: if (!IS_LETTER(CUR)) {
1712: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1713: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1714: ctxt->wellFormed = 0;
1715: } else {
1716: ret = xmlStrndup(q, CUR_PTR - q);
1717: NEXT;
1718: }
1719: } else {
1720: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1721: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1722: ctxt->wellFormed = 0;
1723: }
1724:
1725: return(ret);
1726: }
1727:
1728: /**
1729: * htmlParseCharData:
1730: * @ctxt: an HTML parser context
1731: * @cdata: int indicating whether we are within a CDATA section
1732: *
1733: * parse a CharData section.
1734: * if we are within a CDATA section ']]>' marks an end of section.
1735: *
1736: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1737: */
1738:
1739: void
1740: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1.25 daniel 1741: xmlChar *buf = NULL;
1742: int len = 0;
1.31 daniel 1743: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1744: xmlChar q;
1745:
1746: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1747: if (buf == NULL) {
1748: fprintf(stderr, "malloc of %d byte failed\n", size);
1749: return;
1750: }
1.1 daniel 1751:
1.25 daniel 1752: q = CUR;
1753: while ((IS_CHAR(q)) && (q != '<') &&
1754: (q != '&')) {
1755: if ((q == ']') && (NXT(1) == ']') &&
1.1 daniel 1756: (NXT(2) == '>')) {
1757: if (cdata) break;
1758: else {
1759: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1760: ctxt->sax->error(ctxt->userData,
1761: "Sequence ']]>' not allowed in content\n");
1762: ctxt->wellFormed = 0;
1763: }
1764: }
1.25 daniel 1765: if (len + 1 >= size) {
1766: size *= 2;
1.50 veillard 1767: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
1.25 daniel 1768: if (buf == NULL) {
1769: fprintf(stderr, "realloc of %d byte failed\n", size);
1770: return;
1771: }
1772: }
1773: buf[len++] = q;
1.1 daniel 1774: NEXT;
1.25 daniel 1775: q = CUR;
1776: }
1777: if (len == 0) {
1778: xmlFree(buf);
1779: return;
1.1 daniel 1780: }
1781:
1782: /*
1.25 daniel 1783: * Ok the buffer is to be consumed as chars.
1.1 daniel 1784: */
1785: if (ctxt->sax != NULL) {
1.25 daniel 1786: if (areBlanks(ctxt, buf, len)) {
1.1 daniel 1787: if (ctxt->sax->ignorableWhitespace != NULL)
1.25 daniel 1788: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
1.1 daniel 1789: } else {
1790: if (ctxt->sax->characters != NULL)
1.25 daniel 1791: ctxt->sax->characters(ctxt->userData, buf, len);
1.1 daniel 1792: }
1793: }
1.25 daniel 1794: xmlFree(buf);
1.1 daniel 1795: }
1796:
1797: /**
1798: * htmlParseExternalID:
1799: * @ctxt: an HTML parser context
1.14 daniel 1800: * @publicID: a xmlChar** receiving PubidLiteral
1.1 daniel 1801: * @strict: indicate whether we should restrict parsing to only
1802: * production [75], see NOTE below
1803: *
1804: * Parse an External ID or a Public ID
1805: *
1806: * NOTE: Productions [75] and [83] interract badly since [75] can generate
1807: * 'PUBLIC' S PubidLiteral S SystemLiteral
1808: *
1809: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1810: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1811: *
1812: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1813: *
1814: * Returns the function returns SystemLiteral and in the second
1815: * case publicID receives PubidLiteral, is strict is off
1816: * it is possible to return NULL and have publicID set.
1817: */
1818:
1.14 daniel 1819: xmlChar *
1820: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1821: xmlChar *URI = NULL;
1.1 daniel 1822:
1823: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1824: (UPP(2) == 'S') && (UPP(3) == 'T') &&
1825: (UPP(4) == 'E') && (UPP(5) == 'M')) {
1826: SKIP(6);
1827: if (!IS_BLANK(CUR)) {
1828: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1829: ctxt->sax->error(ctxt->userData,
1830: "Space required after 'SYSTEM'\n");
1831: ctxt->wellFormed = 0;
1832: }
1833: SKIP_BLANKS;
1834: URI = htmlParseSystemLiteral(ctxt);
1835: if (URI == NULL) {
1836: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1837: ctxt->sax->error(ctxt->userData,
1838: "htmlParseExternalID: SYSTEM, no URI\n");
1839: ctxt->wellFormed = 0;
1840: }
1841: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1842: (UPP(2) == 'B') && (UPP(3) == 'L') &&
1843: (UPP(4) == 'I') && (UPP(5) == 'C')) {
1844: SKIP(6);
1845: if (!IS_BLANK(CUR)) {
1846: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1847: ctxt->sax->error(ctxt->userData,
1848: "Space required after 'PUBLIC'\n");
1849: ctxt->wellFormed = 0;
1850: }
1851: SKIP_BLANKS;
1852: *publicID = htmlParsePubidLiteral(ctxt);
1853: if (*publicID == NULL) {
1854: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1855: ctxt->sax->error(ctxt->userData,
1856: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1857: ctxt->wellFormed = 0;
1858: }
1.5 daniel 1859: SKIP_BLANKS;
1860: if ((CUR == '"') || (CUR == '\'')) {
1861: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 1862: }
1863: }
1864: return(URI);
1865: }
1866:
1867: /**
1868: * htmlParseComment:
1869: * @ctxt: an HTML parser context
1870: *
1871: * Parse an XML (SGML) comment <!-- .... -->
1872: *
1873: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1874: */
1875: void
1.31 daniel 1876: htmlParseComment(htmlParserCtxtPtr ctxt) {
1.25 daniel 1877: xmlChar *buf = NULL;
1878: int len = 0;
1.31 daniel 1879: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1880: register xmlChar s, r, q;
1.1 daniel 1881:
1882: /*
1883: * Check that there is a comment right here.
1884: */
1885: if ((CUR != '<') || (NXT(1) != '!') ||
1886: (NXT(2) != '-') || (NXT(3) != '-')) return;
1887:
1.25 daniel 1888: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1889: if (buf == NULL) {
1890: fprintf(stderr, "malloc of %d byte failed\n", size);
1891: return;
1892: }
1893: q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1.1 daniel 1894: SKIP(4);
1.25 daniel 1895: s = CUR;
1896:
1897: while (IS_CHAR(s) &&
1898: ((s != '>') || (r != '-') || (q != '-'))) {
1899: if (len + 1 >= size) {
1900: size *= 2;
1.50 veillard 1901: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
1.25 daniel 1902: if (buf == NULL) {
1903: fprintf(stderr, "realloc of %d byte failed\n", size);
1904: return;
1905: }
1906: }
1907: buf[len++] = s;
1908: NEXT;
1909: q = r;
1910: r = s;
1911: s = CUR;
1.1 daniel 1912: }
1.25 daniel 1913: buf[len - 2] = 0;
1914: if (!IS_CHAR(s)) {
1.1 daniel 1915: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.25 daniel 1916: ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
1.1 daniel 1917: ctxt->wellFormed = 0;
1918: } else {
1919: NEXT;
1.31 daniel 1920: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
1921: ctxt->sax->comment(ctxt->userData, buf);
1.1 daniel 1922: }
1923: }
1.25 daniel 1924: xmlFree(buf);
1.1 daniel 1925: }
1926:
1927: /**
1928: * htmlParseCharRef:
1929: * @ctxt: an HTML parser context
1930: *
1931: * parse Reference declarations
1932: *
1933: * [66] CharRef ::= '&#' [0-9]+ ';' |
1934: * '&#x' [0-9a-fA-F]+ ';'
1935: *
1936: * Returns the value parsed (as an int)
1937: */
1938: int
1939: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1940: int val = 0;
1941:
1942: if ((CUR == '&') && (NXT(1) == '#') &&
1943: (NXT(2) == 'x')) {
1944: SKIP(3);
1945: while (CUR != ';') {
1946: if ((CUR >= '0') && (CUR <= '9'))
1947: val = val * 16 + (CUR - '0');
1948: else if ((CUR >= 'a') && (CUR <= 'f'))
1949: val = val * 16 + (CUR - 'a') + 10;
1950: else if ((CUR >= 'A') && (CUR <= 'F'))
1951: val = val * 16 + (CUR - 'A') + 10;
1952: else {
1953: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1954: ctxt->sax->error(ctxt->userData,
1955: "htmlParseCharRef: invalid hexadecimal value\n");
1956: ctxt->wellFormed = 0;
1957: val = 0;
1958: break;
1959: }
1960: NEXT;
1961: }
1962: if (CUR == ';')
1963: NEXT;
1964: } else if ((CUR == '&') && (NXT(1) == '#')) {
1965: SKIP(2);
1966: while (CUR != ';') {
1967: if ((CUR >= '0') && (CUR <= '9'))
1968: val = val * 10 + (CUR - '0');
1969: else {
1970: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1971: ctxt->sax->error(ctxt->userData,
1972: "htmlParseCharRef: invalid decimal value\n");
1973: ctxt->wellFormed = 0;
1974: val = 0;
1975: break;
1976: }
1977: NEXT;
1978: }
1979: if (CUR == ';')
1980: NEXT;
1981: } else {
1982: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1983: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
1984: ctxt->wellFormed = 0;
1985: }
1986: /*
1987: * Check the value IS_CHAR ...
1988: */
1989: if (IS_CHAR(val)) {
1990: return(val);
1991: } else {
1992: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.14 daniel 1993: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
1.1 daniel 1994: val);
1995: ctxt->wellFormed = 0;
1996: }
1997: return(0);
1998: }
1999:
2000:
2001: /**
2002: * htmlParseDocTypeDecl :
2003: * @ctxt: an HTML parser context
2004: *
2005: * parse a DOCTYPE declaration
2006: *
2007: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2008: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2009: */
2010:
2011: void
2012: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1.14 daniel 2013: xmlChar *name;
2014: xmlChar *ExternalID = NULL;
2015: xmlChar *URI = NULL;
1.1 daniel 2016:
2017: /*
2018: * We know that '<!DOCTYPE' has been detected.
2019: */
2020: SKIP(9);
2021:
2022: SKIP_BLANKS;
2023:
2024: /*
2025: * Parse the DOCTYPE name.
2026: */
2027: name = htmlParseName(ctxt);
2028: if (name == NULL) {
2029: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2030: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2031: ctxt->wellFormed = 0;
2032: }
2033: /*
2034: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2035: */
2036:
2037: SKIP_BLANKS;
2038:
2039: /*
2040: * Check for SystemID and ExternalID
2041: */
1.5 daniel 2042: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 2043: SKIP_BLANKS;
2044:
2045: /*
2046: * We should be at the end of the DOCTYPE declaration.
2047: */
2048: if (CUR != '>') {
2049: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2050: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2051: ctxt->wellFormed = 0;
2052: /* We shouldn't try to resynchronize ... */
2053: }
2054: NEXT;
2055:
2056: /*
1.46 daniel 2057: * Create or update the document accordingly to the DOCTYPE
1.1 daniel 2058: */
1.46 daniel 2059: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2060: (!ctxt->disableSAX))
2061: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
1.1 daniel 2062:
2063: /*
2064: * Cleanup, since we don't use all those identifiers
2065: */
1.11 daniel 2066: if (URI != NULL) xmlFree(URI);
2067: if (ExternalID != NULL) xmlFree(ExternalID);
2068: if (name != NULL) xmlFree(name);
1.1 daniel 2069: }
2070:
2071: /**
2072: * htmlParseAttribute:
2073: * @ctxt: an HTML parser context
1.14 daniel 2074: * @value: a xmlChar ** used to store the value of the attribute
1.1 daniel 2075: *
2076: * parse an attribute
2077: *
2078: * [41] Attribute ::= Name Eq AttValue
2079: *
2080: * [25] Eq ::= S? '=' S?
2081: *
2082: * With namespace:
2083: *
2084: * [NS 11] Attribute ::= QName Eq AttValue
2085: *
2086: * Also the case QName == xmlns:??? is handled independently as a namespace
2087: * definition.
2088: *
2089: * Returns the attribute name, and the value in *value.
2090: */
2091:
1.14 daniel 2092: xmlChar *
2093: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1.31 daniel 2094: xmlChar *name, *val = NULL;
1.1 daniel 2095:
2096: *value = NULL;
2097: name = htmlParseName(ctxt);
2098: if (name == NULL) {
2099: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2100: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2101: ctxt->wellFormed = 0;
2102: return(NULL);
2103: }
2104:
2105: /*
2106: * read the value
2107: */
2108: SKIP_BLANKS;
2109: if (CUR == '=') {
2110: NEXT;
2111: SKIP_BLANKS;
2112: val = htmlParseAttValue(ctxt);
1.42 daniel 2113: /******
1.1 daniel 2114: } else {
1.42 daniel 2115: * TODO : some attribute must have values, some may not
1.1 daniel 2116: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.31 daniel 2117: ctxt->sax->warning(ctxt->userData,
1.42 daniel 2118: "No value for attribute %s\n", name); */
1.1 daniel 2119: }
2120:
2121: *value = val;
2122: return(name);
2123: }
2124:
2125: /**
1.47 daniel 2126: * htmlCheckEncoding:
2127: * @ctxt: an HTML parser context
2128: * @attvalue: the attribute value
2129: *
2130: * Checks an http-equiv attribute from a Meta tag to detect
2131: * the encoding
2132: * If a new encoding is detected the parser is switched to decode
2133: * it and pass UTF8
2134: */
2135: void
2136: htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2137: const xmlChar *encoding;
2138:
2139: if ((ctxt == NULL) || (attvalue == NULL))
2140: return;
2141:
2142: fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue);
2143:
2144: encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
2145: if (encoding == NULL)
2146: encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
2147: if (encoding == NULL)
2148: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
2149: if (encoding != NULL) {
2150: encoding += 8;
2151: } else {
2152: encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
2153: if (encoding == NULL)
2154: encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
2155: if (encoding == NULL)
2156: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
2157: if (encoding != NULL)
2158: encoding += 9;
2159: }
2160: if (encoding != NULL) {
2161: xmlCharEncoding enc;
2162: xmlCharEncodingHandlerPtr handler;
2163:
2164: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2165:
2166: if (ctxt->input->encoding != NULL)
2167: xmlFree((xmlChar *) ctxt->input->encoding);
2168: ctxt->input->encoding = xmlStrdup(encoding);
2169:
2170: enc = xmlParseCharEncoding((const char *) encoding);
2171: /*
2172: * registered set of known encodings
2173: */
2174: if (enc != XML_CHAR_ENCODING_ERROR) {
2175: xmlSwitchEncoding(ctxt, enc);
2176: } else {
2177: /*
2178: * fallback for unknown encodings
2179: */
2180: handler = xmlFindCharEncodingHandler((const char *) encoding);
2181: if (handler != NULL) {
2182: xmlSwitchToEncoding(ctxt, handler);
2183: } else {
2184: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2185: }
2186: }
2187: }
2188: }
2189:
2190: /**
2191: * htmlCheckMeta:
2192: * @ctxt: an HTML parser context
2193: * @atts: the attributes values
2194: *
2195: * Checks an attributes from a Meta tag
2196: */
2197: void
2198: htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2199: int i;
2200: const xmlChar *att, *value;
2201: int http = 0;
2202: const xmlChar *content = NULL;
2203:
2204: if ((ctxt == NULL) || (atts == NULL))
2205: return;
2206:
2207: i = 0;
2208: att = atts[i++];
2209: while (att != NULL) {
2210: value = atts[i++];
2211: if ((value != NULL) &&
2212: ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
2213: (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
2214: (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
2215: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
2216: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
2217: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
2218: http = 1;
2219: else if ((value != NULL) &&
2220: ((!xmlStrcmp(att, BAD_CAST"content")) ||
2221: (!xmlStrcmp(att, BAD_CAST"Content")) ||
2222: (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
2223: content = value;
2224: att = atts[i++];
2225: }
2226: if ((http) && (content != NULL))
2227: htmlCheckEncoding(ctxt, content);
2228:
2229: }
2230:
2231: /**
1.1 daniel 2232: * htmlParseStartTag:
2233: * @ctxt: an HTML parser context
2234: *
2235: * parse a start of tag either for rule element or
2236: * EmptyElement. In both case we don't parse the tag closing chars.
2237: *
2238: * [40] STag ::= '<' Name (S Attribute)* S? '>'
2239: *
2240: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2241: *
2242: * With namespace:
2243: *
2244: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2245: *
2246: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2247: *
2248: */
2249:
1.18 daniel 2250: void
1.1 daniel 2251: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2252: xmlChar *name;
2253: xmlChar *attname;
2254: xmlChar *attvalue;
2255: const xmlChar **atts = NULL;
1.1 daniel 2256: int nbatts = 0;
2257: int maxatts = 0;
1.47 daniel 2258: int meta = 0;
1.1 daniel 2259: int i;
2260:
1.18 daniel 2261: if (CUR != '<') return;
1.1 daniel 2262: NEXT;
2263:
1.19 daniel 2264: GROW;
1.1 daniel 2265: name = htmlParseHTMLName(ctxt);
2266: if (name == NULL) {
2267: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2268: ctxt->sax->error(ctxt->userData,
2269: "htmlParseStartTag: invalid element name\n");
2270: ctxt->wellFormed = 0;
1.18 daniel 2271: return;
1.1 daniel 2272: }
1.47 daniel 2273: if (!xmlStrcmp(name, BAD_CAST"meta"))
2274: meta = 1;
1.1 daniel 2275:
2276: /*
2277: * Check for auto-closure of HTML elements.
2278: */
2279: htmlAutoClose(ctxt, name);
1.43 daniel 2280:
2281: /*
2282: * Check for implied HTML elements.
2283: */
2284: htmlCheckImplied(ctxt, name);
1.1 daniel 2285:
2286: /*
2287: * Now parse the attributes, it ends up with the ending
2288: *
2289: * (S Attribute)* S?
2290: */
2291: SKIP_BLANKS;
2292: while ((IS_CHAR(CUR)) &&
2293: (CUR != '>') &&
2294: ((CUR != '/') || (NXT(1) != '>'))) {
1.26 daniel 2295: long cons = ctxt->nbChars;
1.1 daniel 2296:
1.19 daniel 2297: GROW;
1.1 daniel 2298: attname = htmlParseAttribute(ctxt, &attvalue);
1.31 daniel 2299: if (attname != NULL) {
1.47 daniel 2300:
1.1 daniel 2301: /*
2302: * Well formedness requires at most one declaration of an attribute
2303: */
2304: for (i = 0; i < nbatts;i += 2) {
2305: if (!xmlStrcmp(atts[i], attname)) {
2306: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.19 daniel 2307: ctxt->sax->error(ctxt->userData,
2308: "Attribute %s redefined\n",
2309: attname);
1.1 daniel 2310: ctxt->wellFormed = 0;
1.11 daniel 2311: xmlFree(attname);
1.31 daniel 2312: if (attvalue != NULL)
2313: xmlFree(attvalue);
1.19 daniel 2314: goto failed;
1.1 daniel 2315: }
2316: }
2317:
2318: /*
2319: * Add the pair to atts
2320: */
2321: if (atts == NULL) {
2322: maxatts = 10;
1.14 daniel 2323: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
1.1 daniel 2324: if (atts == NULL) {
2325: fprintf(stderr, "malloc of %ld byte failed\n",
1.14 daniel 2326: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2327: if (name != NULL) xmlFree(name);
2328: return;
1.1 daniel 2329: }
1.23 daniel 2330: } else if (nbatts + 4 > maxatts) {
1.1 daniel 2331: maxatts *= 2;
1.14 daniel 2332: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
1.1 daniel 2333: if (atts == NULL) {
2334: fprintf(stderr, "realloc of %ld byte failed\n",
1.14 daniel 2335: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2336: if (name != NULL) xmlFree(name);
2337: return;
1.1 daniel 2338: }
2339: }
2340: atts[nbatts++] = attname;
2341: atts[nbatts++] = attvalue;
2342: atts[nbatts] = NULL;
2343: atts[nbatts + 1] = NULL;
2344: }
2345:
1.19 daniel 2346: failed:
1.1 daniel 2347: SKIP_BLANKS;
1.26 daniel 2348: if (cons == ctxt->nbChars) {
1.1 daniel 2349: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2350: ctxt->sax->error(ctxt->userData,
2351: "htmlParseStartTag: problem parsing attributes\n");
2352: ctxt->wellFormed = 0;
2353: break;
2354: }
2355: }
2356:
2357: /*
1.47 daniel 2358: * Handle specific association to the META tag
2359: */
2360: if (meta)
2361: htmlCheckMeta(ctxt, atts);
2362:
2363: /*
1.1 daniel 2364: * SAX: Start of Element !
2365: */
1.15 daniel 2366: htmlnamePush(ctxt, xmlStrdup(name));
1.18 daniel 2367: #ifdef DEBUG
2368: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2369: #endif
1.1 daniel 2370: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2371: ctxt->sax->startElement(ctxt->userData, name, atts);
2372:
2373: if (atts != NULL) {
1.31 daniel 2374: for (i = 0;i < nbatts;i++) {
2375: if (atts[i] != NULL)
2376: xmlFree((xmlChar *) atts[i]);
2377: }
1.45 daniel 2378: xmlFree((void *) atts);
1.1 daniel 2379: }
1.18 daniel 2380: if (name != NULL) xmlFree(name);
1.1 daniel 2381: }
2382:
2383: /**
2384: * htmlParseEndTag:
2385: * @ctxt: an HTML parser context
2386: *
2387: * parse an end of tag
2388: *
2389: * [42] ETag ::= '</' Name S? '>'
2390: *
2391: * With namespace
2392: *
2393: * [NS 9] ETag ::= '</' QName S? '>'
2394: */
2395:
2396: void
1.18 daniel 2397: htmlParseEndTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2398: xmlChar *name;
1.15 daniel 2399: xmlChar *oldname;
1.1 daniel 2400: int i;
2401:
2402: if ((CUR != '<') || (NXT(1) != '/')) {
2403: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2404: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2405: ctxt->wellFormed = 0;
2406: return;
2407: }
2408: SKIP(2);
2409:
2410: name = htmlParseHTMLName(ctxt);
1.24 daniel 2411: if (name == NULL) return;
1.1 daniel 2412:
2413: /*
2414: * We should definitely be at the ending "S? '>'" part
2415: */
2416: SKIP_BLANKS;
2417: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2418: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2419: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2420: ctxt->wellFormed = 0;
2421: } else
2422: NEXT;
2423:
2424: /*
1.18 daniel 2425: * If the name read is not one of the element in the parsing stack
2426: * then return, it's just an error.
1.1 daniel 2427: */
1.18 daniel 2428: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2429: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
1.1 daniel 2430: }
2431: if (i < 0) {
2432: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.18 daniel 2433: ctxt->sax->error(ctxt->userData,
2434: "Unexpected end tag : %s\n", name);
1.11 daniel 2435: xmlFree(name);
1.1 daniel 2436: ctxt->wellFormed = 0;
2437: return;
2438: }
2439:
1.18 daniel 2440:
1.1 daniel 2441: /*
2442: * Check for auto-closure of HTML elements.
2443: */
1.18 daniel 2444:
1.1 daniel 2445: htmlAutoCloseOnClose(ctxt, name);
2446:
2447: /*
2448: * Well formedness constraints, opening and closing must match.
2449: * With the exception that the autoclose may have popped stuff out
2450: * of the stack.
2451: */
1.18 daniel 2452: if (xmlStrcmp(name, ctxt->name)) {
2453: #ifdef DEBUG
2454: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2455: #endif
1.15 daniel 2456: if ((ctxt->name != NULL) &&
2457: (xmlStrcmp(ctxt->name, name))) {
1.1 daniel 2458: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2459: ctxt->sax->error(ctxt->userData,
2460: "Opening and ending tag mismatch: %s and %s\n",
1.15 daniel 2461: name, ctxt->name);
1.1 daniel 2462: ctxt->wellFormed = 0;
2463: }
2464: }
2465:
2466: /*
2467: * SAX: End of Tag
2468: */
1.15 daniel 2469: oldname = ctxt->name;
1.24 daniel 2470: if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
1.18 daniel 2471: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2472: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2473: oldname = htmlnamePop(ctxt);
1.18 daniel 2474: if (oldname != NULL) {
2475: #ifdef DEBUG
2476: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2477: #endif
2478: xmlFree(oldname);
2479: #ifdef DEBUG
2480: } else {
2481: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2482: #endif
2483: }
2484: }
1.1 daniel 2485:
2486: if (name != NULL)
1.11 daniel 2487: xmlFree(name);
1.1 daniel 2488:
2489: return;
2490: }
2491:
2492:
2493: /**
2494: * htmlParseReference:
2495: * @ctxt: an HTML parser context
2496: *
2497: * parse and handle entity references in content,
2498: * this will end-up in a call to character() since this is either a
2499: * CharRef, or a predefined entity.
2500: */
2501: void
2502: htmlParseReference(htmlParserCtxtPtr ctxt) {
2503: htmlEntityDescPtr ent;
1.14 daniel 2504: xmlChar out[2];
2505: xmlChar *name;
1.1 daniel 2506: int val;
2507: if (CUR != '&') return;
2508:
2509: if (NXT(1) == '#') {
2510: val = htmlParseCharRef(ctxt);
1.8 daniel 2511: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2512: out[0] = val;
2513: out[1] = 0;
2514: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2515: ctxt->sax->characters(ctxt->userData, out, 1);
2516: } else {
2517: ent = htmlParseEntityRef(ctxt, &name);
1.32 daniel 2518: if (name == NULL) {
2519: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2520: return;
2521: }
1.1 daniel 2522: if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2523: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 2524: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 2525: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.32 daniel 2526: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
1.1 daniel 2527: }
2528: } else {
1.8 daniel 2529: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2530: out[0] = ent->value;
2531: out[1] = 0;
2532: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2533: ctxt->sax->characters(ctxt->userData, out, 1);
2534: }
1.11 daniel 2535: xmlFree(name);
1.1 daniel 2536: }
2537: }
2538:
2539: /**
2540: * htmlParseContent:
2541: * @ctxt: an HTML parser context
2542: * @name: the node name
2543: *
2544: * Parse a content: comment, sub-element, reference or text.
2545: *
2546: */
2547:
2548: void
1.18 daniel 2549: htmlParseContent(htmlParserCtxtPtr ctxt) {
1.15 daniel 2550: xmlChar *currentNode;
1.18 daniel 2551: int depth;
1.1 daniel 2552:
1.26 daniel 2553: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2554: depth = ctxt->nameNr;
2555: while (1) {
1.26 daniel 2556: long cons = ctxt->nbChars;
1.1 daniel 2557:
1.18 daniel 2558: GROW;
2559: /*
2560: * Our tag or one of it's parent or children is ending.
2561: */
2562: if ((CUR == '<') && (NXT(1) == '/')) {
2563: htmlParseEndTag(ctxt);
1.26 daniel 2564: if (currentNode != NULL) xmlFree(currentNode);
1.18 daniel 2565: return;
2566: }
2567:
2568: /*
2569: * Has this node been popped out during parsing of
2570: * the next element
2571: */
1.26 daniel 2572: if ((xmlStrcmp(currentNode, ctxt->name)) &&
2573: (depth >= ctxt->nameNr)) {
2574: if (currentNode != NULL) xmlFree(currentNode);
2575: return;
2576: }
1.18 daniel 2577:
1.1 daniel 2578: /*
2579: * First case : a comment
2580: */
2581: if ((CUR == '<') && (NXT(1) == '!') &&
2582: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2583: htmlParseComment(ctxt);
1.1 daniel 2584: }
2585:
2586: /*
2587: * Second case : a sub-element.
2588: */
2589: else if (CUR == '<') {
2590: htmlParseElement(ctxt);
2591: }
2592:
2593: /*
2594: * Third case : a reference. If if has not been resolved,
2595: * parsing returns it's Name, create the node
2596: */
2597: else if (CUR == '&') {
2598: htmlParseReference(ctxt);
2599: }
2600:
2601: /*
1.47 daniel 2602: * Fourth : end of the resource
2603: */
2604: else if (CUR == 0) {
2605: htmlAutoClose(ctxt, NULL);
2606: }
2607:
2608: /*
1.1 daniel 2609: * Last case, text. Note that References are handled directly.
2610: */
2611: else {
2612: htmlParseCharData(ctxt, 0);
2613: }
2614:
1.26 daniel 2615: if (cons == ctxt->nbChars) {
1.22 daniel 2616: if (ctxt->node != NULL) {
2617: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2618: ctxt->sax->error(ctxt->userData,
2619: "detected an error in element content\n");
2620: ctxt->wellFormed = 0;
2621: }
1.1 daniel 2622: break;
2623: }
1.17 daniel 2624:
1.5 daniel 2625: GROW;
1.1 daniel 2626: }
1.26 daniel 2627: if (currentNode != NULL) xmlFree(currentNode);
1.1 daniel 2628: }
2629:
2630: /**
2631: * htmlParseElement:
2632: * @ctxt: an HTML parser context
2633: *
2634: * parse an HTML element, this is highly recursive
2635: *
2636: * [39] element ::= EmptyElemTag | STag content ETag
2637: *
2638: * [41] Attribute ::= Name Eq AttValue
2639: */
2640:
2641: void
2642: htmlParseElement(htmlParserCtxtPtr ctxt) {
1.14 daniel 2643: const xmlChar *openTag = CUR_PTR;
2644: xmlChar *name;
1.16 daniel 2645: xmlChar *currentNode = NULL;
1.1 daniel 2646: htmlElemDescPtr info;
1.10 daniel 2647: htmlParserNodeInfo node_info;
1.31 daniel 2648: xmlChar *oldname;
1.18 daniel 2649: int depth = ctxt->nameNr;
1.1 daniel 2650:
2651: /* Capture start position */
1.10 daniel 2652: if (ctxt->record_info) {
2653: node_info.begin_pos = ctxt->input->consumed +
2654: (CUR_PTR - ctxt->input->base);
2655: node_info.begin_line = ctxt->input->line;
2656: }
1.1 daniel 2657:
1.26 daniel 2658: oldname = xmlStrdup(ctxt->name);
1.18 daniel 2659: htmlParseStartTag(ctxt);
2660: name = ctxt->name;
1.19 daniel 2661: #ifdef DEBUG
2662: if (oldname == NULL)
2663: fprintf(stderr, "Start of element %s\n", name);
2664: else if (name == NULL)
2665: fprintf(stderr, "Start of element failed, was %s\n", oldname);
2666: else
2667: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2668: #endif
1.26 daniel 2669: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
1.18 daniel 2670: (name == NULL)) {
1.19 daniel 2671: if (CUR == '>')
2672: NEXT;
1.26 daniel 2673: if (oldname != NULL)
2674: xmlFree(oldname);
1.1 daniel 2675: return;
2676: }
1.26 daniel 2677: if (oldname != NULL)
2678: xmlFree(oldname);
1.1 daniel 2679:
2680: /*
2681: * Lookup the info for that element.
2682: */
2683: info = htmlTagLookup(name);
2684: if (info == NULL) {
2685: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2686: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2687: name);
2688: ctxt->wellFormed = 0;
2689: } else if (info->depr) {
2690: /***************************
2691: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2692: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2693: name);
2694: ***************************/
2695: }
2696:
2697: /*
2698: * Check for an Empty Element labelled the XML/SGML way
2699: */
2700: if ((CUR == '/') && (NXT(1) == '>')) {
2701: SKIP(2);
2702: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2703: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2704: oldname = htmlnamePop(ctxt);
1.18 daniel 2705: #ifdef DEBUG
2706: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2707: #endif
1.17 daniel 2708: if (oldname != NULL)
2709: xmlFree(oldname);
1.1 daniel 2710: return;
2711: }
2712:
1.5 daniel 2713: if (CUR == '>') {
2714: NEXT;
2715: } else {
1.1 daniel 2716: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2717: ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2718: openTag);
2719: ctxt->wellFormed = 0;
2720:
2721: /*
2722: * end of parsing of this node.
2723: */
1.18 daniel 2724: if (!xmlStrcmp(name, ctxt->name)) {
2725: nodePop(ctxt);
1.24 daniel 2726: oldname = htmlnamePop(ctxt);
1.18 daniel 2727: #ifdef DEBUG
2728: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2729: #endif
2730: if (oldname != NULL)
2731: xmlFree(oldname);
2732: }
1.10 daniel 2733:
2734: /*
2735: * Capture end position and add node
2736: */
2737: if ( currentNode != NULL && ctxt->record_info ) {
2738: node_info.end_pos = ctxt->input->consumed +
2739: (CUR_PTR - ctxt->input->base);
2740: node_info.end_line = ctxt->input->line;
1.15 daniel 2741: node_info.node = ctxt->node;
1.10 daniel 2742: xmlParserAddNodeInfo(ctxt, &node_info);
2743: }
1.1 daniel 2744: return;
2745: }
2746:
2747: /*
2748: * Check for an Empty Element from DTD definition
2749: */
2750: if ((info != NULL) && (info->empty)) {
2751: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2752: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2753: oldname = htmlnamePop(ctxt);
1.18 daniel 2754: #ifdef DEBUG
2755: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2756: #endif
1.17 daniel 2757: if (oldname != NULL)
2758: xmlFree(oldname);
1.1 daniel 2759: return;
2760: }
2761:
2762: /*
2763: * Parse the content of the element:
2764: */
1.26 daniel 2765: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2766: depth = ctxt->nameNr;
2767: while (IS_CHAR(CUR)) {
2768: htmlParseContent(ctxt);
2769: if (ctxt->nameNr < depth) break;
2770: }
1.1 daniel 2771:
2772: if (!IS_CHAR(CUR)) {
1.49 daniel 2773: /************
1.1 daniel 2774: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2775: ctxt->sax->error(ctxt->userData,
1.18 daniel 2776: "Premature end of data in tag %s\n", currentNode);
1.1 daniel 2777: ctxt->wellFormed = 0;
1.49 daniel 2778: *************/
1.1 daniel 2779:
2780: /*
2781: * end of parsing of this node.
2782: */
2783: nodePop(ctxt);
1.24 daniel 2784: oldname = htmlnamePop(ctxt);
1.18 daniel 2785: #ifdef DEBUG
2786: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2787: #endif
1.17 daniel 2788: if (oldname != NULL)
2789: xmlFree(oldname);
1.26 daniel 2790: if (currentNode != NULL)
2791: xmlFree(currentNode);
1.1 daniel 2792: return;
2793: }
1.10 daniel 2794:
2795: /*
2796: * Capture end position and add node
2797: */
2798: if ( currentNode != NULL && ctxt->record_info ) {
2799: node_info.end_pos = ctxt->input->consumed +
2800: (CUR_PTR - ctxt->input->base);
2801: node_info.end_line = ctxt->input->line;
1.15 daniel 2802: node_info.node = ctxt->node;
1.10 daniel 2803: xmlParserAddNodeInfo(ctxt, &node_info);
2804: }
1.26 daniel 2805: if (currentNode != NULL)
2806: xmlFree(currentNode);
1.1 daniel 2807: }
2808:
2809: /**
2810: * htmlParseDocument :
2811: * @ctxt: an HTML parser context
2812: *
2813: * parse an HTML document (and build a tree if using the standard SAX
2814: * interface).
2815: *
2816: * Returns 0, -1 in case of error. the parser context is augmented
2817: * as a result of the parsing.
2818: */
2819:
2820: int
2821: htmlParseDocument(htmlParserCtxtPtr ctxt) {
2822: htmlDefaultSAXHandlerInit();
2823: ctxt->html = 1;
2824:
1.5 daniel 2825: GROW;
1.1 daniel 2826: /*
1.9 daniel 2827: * SAX: beginning of the document processing.
1.1 daniel 2828: */
2829: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2830: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2831:
2832: /*
2833: * Wipe out everything which is before the first '<'
2834: */
1.22 daniel 2835: SKIP_BLANKS;
1.1 daniel 2836: if (CUR == 0) {
2837: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2838: ctxt->sax->error(ctxt->userData, "Document is empty\n");
2839: ctxt->wellFormed = 0;
2840: }
2841:
1.40 daniel 2842: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
2843: ctxt->sax->startDocument(ctxt->userData);
2844:
2845:
1.22 daniel 2846: /*
2847: * Parse possible comments before any content
2848: */
2849: while ((CUR == '<') && (NXT(1) == '!') &&
2850: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2851: htmlParseComment(ctxt);
1.22 daniel 2852: SKIP_BLANKS;
2853: }
2854:
1.1 daniel 2855:
2856: /*
2857: * Then possibly doc type declaration(s) and more Misc
2858: * (doctypedecl Misc*)?
2859: */
2860: if ((CUR == '<') && (NXT(1) == '!') &&
2861: (UPP(2) == 'D') && (UPP(3) == 'O') &&
2862: (UPP(4) == 'C') && (UPP(5) == 'T') &&
2863: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2864: (UPP(8) == 'E')) {
2865: htmlParseDocTypeDecl(ctxt);
2866: }
2867: SKIP_BLANKS;
2868:
2869: /*
2870: * Time to start parsing the tree itself
2871: */
1.22 daniel 2872: htmlParseContent(ctxt);
1.1 daniel 2873:
2874: /*
1.47 daniel 2875: * autoclose
2876: */
2877: if (CUR == 0)
2878: htmlAutoClose(ctxt, NULL);
2879:
2880:
2881: /*
1.1 daniel 2882: * SAX: end of the document processing.
2883: */
2884: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2885: ctxt->sax->endDocument(ctxt->userData);
2886: if (! ctxt->wellFormed) return(-1);
2887: return(0);
2888: }
2889:
2890:
1.30 daniel 2891: /************************************************************************
2892: * *
2893: * Parser contexts handling *
2894: * *
2895: ************************************************************************/
1.1 daniel 2896:
2897: /**
2898: * xmlInitParserCtxt:
2899: * @ctxt: an HTML parser context
2900: *
2901: * Initialize a parser context
2902: */
2903:
2904: void
2905: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2906: {
2907: htmlSAXHandler *sax;
2908:
1.21 daniel 2909: if (ctxt == NULL) return;
2910: memset(ctxt, 0, sizeof(htmlParserCtxt));
2911:
1.11 daniel 2912: sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
1.1 daniel 2913: if (sax == NULL) {
2914: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2915: }
1.19 daniel 2916: memset(sax, 0, sizeof(htmlSAXHandler));
1.1 daniel 2917:
2918: /* Allocate the Input stack */
1.19 daniel 2919: ctxt->inputTab = (htmlParserInputPtr *)
2920: xmlMalloc(5 * sizeof(htmlParserInputPtr));
2921: if (ctxt->inputTab == NULL) {
2922: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2923: }
1.1 daniel 2924: ctxt->inputNr = 0;
2925: ctxt->inputMax = 5;
2926: ctxt->input = NULL;
2927: ctxt->version = NULL;
2928: ctxt->encoding = NULL;
2929: ctxt->standalone = -1;
1.30 daniel 2930: ctxt->instate = XML_PARSER_START;
1.1 daniel 2931:
2932: /* Allocate the Node stack */
1.11 daniel 2933: ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
1.1 daniel 2934: ctxt->nodeNr = 0;
2935: ctxt->nodeMax = 10;
2936: ctxt->node = NULL;
2937:
1.15 daniel 2938: /* Allocate the Name stack */
2939: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2940: ctxt->nameNr = 0;
2941: ctxt->nameMax = 10;
2942: ctxt->name = NULL;
2943:
1.1 daniel 2944: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2945: else {
2946: ctxt->sax = sax;
2947: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2948: }
2949: ctxt->userData = ctxt;
2950: ctxt->myDoc = NULL;
2951: ctxt->wellFormed = 1;
2952: ctxt->replaceEntities = 0;
2953: ctxt->html = 1;
2954: ctxt->record_info = 0;
1.21 daniel 2955: ctxt->validate = 0;
1.26 daniel 2956: ctxt->nbChars = 0;
1.30 daniel 2957: ctxt->checkIndex = 0;
1.1 daniel 2958: xmlInitNodeInfoSeq(&ctxt->node_seq);
2959: }
2960:
2961: /**
2962: * htmlFreeParserCtxt:
2963: * @ctxt: an HTML parser context
2964: *
2965: * Free all the memory used by a parser context. However the parsed
2966: * document in ctxt->myDoc is not freed.
2967: */
2968:
2969: void
2970: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2971: {
1.47 daniel 2972: xmlFreeParserCtxt(ctxt);
1.1 daniel 2973: }
2974:
2975: /**
2976: * htmlCreateDocParserCtxt :
1.14 daniel 2977: * @cur: a pointer to an array of xmlChar
1.1 daniel 2978: * @encoding: a free form C string describing the HTML document encoding, or NULL
2979: *
2980: * Create a parser context for an HTML document.
2981: *
2982: * Returns the new parser context or NULL
2983: */
2984: htmlParserCtxtPtr
1.14 daniel 2985: htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
1.1 daniel 2986: htmlParserCtxtPtr ctxt;
2987: htmlParserInputPtr input;
2988: /* htmlCharEncoding enc; */
2989:
1.11 daniel 2990: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 2991: if (ctxt == NULL) {
2992: perror("malloc");
2993: return(NULL);
2994: }
2995: htmlInitParserCtxt(ctxt);
1.11 daniel 2996: input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 2997: if (input == NULL) {
2998: perror("malloc");
1.11 daniel 2999: xmlFree(ctxt);
1.1 daniel 3000: return(NULL);
3001: }
1.19 daniel 3002: memset(input, 0, sizeof(htmlParserInput));
1.1 daniel 3003:
3004: input->line = 1;
3005: input->col = 1;
3006: input->base = cur;
3007: input->cur = cur;
3008:
3009: inputPush(ctxt, input);
3010: return(ctxt);
3011: }
3012:
1.31 daniel 3013: /************************************************************************
3014: * *
3015: * Progressive parsing interfaces *
3016: * *
3017: ************************************************************************/
3018:
3019: /**
3020: * htmlParseLookupSequence:
3021: * @ctxt: an HTML parser context
3022: * @first: the first char to lookup
3023: * @next: the next char to lookup or zero
3024: * @third: the next char to lookup or zero
3025: *
3026: * Try to find if a sequence (first, next, third) or just (first next) or
3027: * (first) is available in the input stream.
3028: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3029: * to avoid rescanning sequences of bytes, it DOES change the state of the
3030: * parser, do not use liberally.
3031: * This is basically similar to xmlParseLookupSequence()
3032: *
3033: * Returns the index to the current parsing point if the full sequence
3034: * is available, -1 otherwise.
3035: */
3036: int
3037: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3038: xmlChar next, xmlChar third) {
3039: int base, len;
3040: htmlParserInputPtr in;
3041: const xmlChar *buf;
3042:
3043: in = ctxt->input;
3044: if (in == NULL) return(-1);
3045: base = in->cur - in->base;
3046: if (base < 0) return(-1);
3047: if (ctxt->checkIndex > base)
3048: base = ctxt->checkIndex;
3049: if (in->buf == NULL) {
3050: buf = in->base;
3051: len = in->length;
3052: } else {
3053: buf = in->buf->buffer->content;
3054: len = in->buf->buffer->use;
3055: }
3056: /* take into account the sequence length */
3057: if (third) len -= 2;
3058: else if (next) len --;
3059: for (;base < len;base++) {
3060: if (buf[base] == first) {
3061: if (third != 0) {
3062: if ((buf[base + 1] != next) ||
3063: (buf[base + 2] != third)) continue;
3064: } else if (next != 0) {
3065: if (buf[base + 1] != next) continue;
3066: }
3067: ctxt->checkIndex = 0;
3068: #ifdef DEBUG_PUSH
3069: if (next == 0)
3070: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3071: first, base);
3072: else if (third == 0)
3073: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3074: first, next, base);
3075: else
3076: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3077: first, next, third, base);
3078: #endif
3079: return(base - (in->cur - in->base));
3080: }
3081: }
3082: ctxt->checkIndex = base;
3083: #ifdef DEBUG_PUSH
3084: if (next == 0)
3085: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3086: else if (third == 0)
3087: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3088: else
3089: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3090: #endif
3091: return(-1);
3092: }
3093:
3094: /**
1.32 daniel 3095: * htmlParseTryOrFinish:
1.31 daniel 3096: * @ctxt: an HTML parser context
1.32 daniel 3097: * @terminate: last chunk indicator
1.31 daniel 3098: *
3099: * Try to progress on parsing
3100: *
3101: * Returns zero if no parsing was possible
3102: */
3103: int
1.32 daniel 3104: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
1.31 daniel 3105: int ret = 0;
3106: htmlParserInputPtr in;
1.47 daniel 3107: int avail = 0;
1.31 daniel 3108: xmlChar cur, next;
3109:
3110: #ifdef DEBUG_PUSH
3111: switch (ctxt->instate) {
3112: case XML_PARSER_EOF:
3113: fprintf(stderr, "HPP: try EOF\n"); break;
3114: case XML_PARSER_START:
3115: fprintf(stderr, "HPP: try START\n"); break;
3116: case XML_PARSER_MISC:
3117: fprintf(stderr, "HPP: try MISC\n");break;
3118: case XML_PARSER_COMMENT:
3119: fprintf(stderr, "HPP: try COMMENT\n");break;
3120: case XML_PARSER_PROLOG:
3121: fprintf(stderr, "HPP: try PROLOG\n");break;
3122: case XML_PARSER_START_TAG:
3123: fprintf(stderr, "HPP: try START_TAG\n");break;
3124: case XML_PARSER_CONTENT:
3125: fprintf(stderr, "HPP: try CONTENT\n");break;
3126: case XML_PARSER_CDATA_SECTION:
3127: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3128: case XML_PARSER_END_TAG:
3129: fprintf(stderr, "HPP: try END_TAG\n");break;
3130: case XML_PARSER_ENTITY_DECL:
3131: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3132: case XML_PARSER_ENTITY_VALUE:
3133: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3134: case XML_PARSER_ATTRIBUTE_VALUE:
3135: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3136: case XML_PARSER_DTD:
3137: fprintf(stderr, "HPP: try DTD\n");break;
3138: case XML_PARSER_EPILOG:
3139: fprintf(stderr, "HPP: try EPILOG\n");break;
3140: case XML_PARSER_PI:
3141: fprintf(stderr, "HPP: try PI\n");break;
3142: }
3143: #endif
3144:
3145: while (1) {
3146:
3147: in = ctxt->input;
3148: if (in == NULL) break;
3149: if (in->buf == NULL)
3150: avail = in->length - (in->cur - in->base);
3151: else
3152: avail = in->buf->buffer->use - (in->cur - in->base);
1.47 daniel 3153: if ((avail == 0) && (terminate)) {
3154: htmlAutoClose(ctxt, NULL);
3155: if (ctxt->nameNr == 0)
3156: ctxt->instate = XML_PARSER_EOF;
3157: }
1.31 daniel 3158: if (avail < 1)
3159: goto done;
3160: switch (ctxt->instate) {
3161: case XML_PARSER_EOF:
3162: /*
3163: * Document parsing is done !
3164: */
3165: goto done;
3166: case XML_PARSER_START:
3167: /*
3168: * Very first chars read from the document flow.
3169: */
3170: cur = in->cur[0];
3171: if (IS_BLANK(cur)) {
3172: SKIP_BLANKS;
3173: if (in->buf == NULL)
3174: avail = in->length - (in->cur - in->base);
3175: else
3176: avail = in->buf->buffer->use - (in->cur - in->base);
3177: }
3178: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3179: ctxt->sax->setDocumentLocator(ctxt->userData,
3180: &xmlDefaultSAXLocator);
1.46 daniel 3181: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3182: (!ctxt->disableSAX))
3183: ctxt->sax->startDocument(ctxt->userData);
3184:
1.31 daniel 3185: cur = in->cur[0];
3186: next = in->cur[1];
3187: if ((cur == '<') && (next == '!') &&
3188: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3189: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3190: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3191: (UPP(8) == 'E')) {
1.32 daniel 3192: if ((!terminate) &&
3193: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3194: goto done;
3195: #ifdef DEBUG_PUSH
3196: fprintf(stderr, "HPP: Parsing internal subset\n");
3197: #endif
3198: htmlParseDocTypeDecl(ctxt);
3199: ctxt->instate = XML_PARSER_PROLOG;
3200: #ifdef DEBUG_PUSH
3201: fprintf(stderr, "HPP: entering PROLOG\n");
3202: #endif
3203: } else {
3204: ctxt->instate = XML_PARSER_MISC;
3205: }
3206: #ifdef DEBUG_PUSH
3207: fprintf(stderr, "HPP: entering MISC\n");
3208: #endif
3209: break;
3210: case XML_PARSER_MISC:
3211: SKIP_BLANKS;
3212: if (in->buf == NULL)
3213: avail = in->length - (in->cur - in->base);
3214: else
3215: avail = in->buf->buffer->use - (in->cur - in->base);
3216: if (avail < 2)
3217: goto done;
3218: cur = in->cur[0];
3219: next = in->cur[1];
3220: if ((cur == '<') && (next == '!') &&
3221: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3222: if ((!terminate) &&
3223: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3224: goto done;
3225: #ifdef DEBUG_PUSH
3226: fprintf(stderr, "HPP: Parsing Comment\n");
3227: #endif
3228: htmlParseComment(ctxt);
3229: ctxt->instate = XML_PARSER_MISC;
3230: } else if ((cur == '<') && (next == '!') &&
3231: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3232: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3233: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3234: (UPP(8) == 'E')) {
1.32 daniel 3235: if ((!terminate) &&
3236: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3237: goto done;
3238: #ifdef DEBUG_PUSH
3239: fprintf(stderr, "HPP: Parsing internal subset\n");
3240: #endif
3241: htmlParseDocTypeDecl(ctxt);
3242: ctxt->instate = XML_PARSER_PROLOG;
3243: #ifdef DEBUG_PUSH
3244: fprintf(stderr, "HPP: entering PROLOG\n");
3245: #endif
3246: } else if ((cur == '<') && (next == '!') &&
3247: (avail < 9)) {
3248: goto done;
3249: } else {
3250: ctxt->instate = XML_PARSER_START_TAG;
3251: #ifdef DEBUG_PUSH
3252: fprintf(stderr, "HPP: entering START_TAG\n");
3253: #endif
3254: }
3255: break;
3256: case XML_PARSER_PROLOG:
3257: SKIP_BLANKS;
3258: if (in->buf == NULL)
3259: avail = in->length - (in->cur - in->base);
3260: else
3261: avail = in->buf->buffer->use - (in->cur - in->base);
3262: if (avail < 2)
3263: goto done;
3264: cur = in->cur[0];
3265: next = in->cur[1];
3266: if ((cur == '<') && (next == '!') &&
3267: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3268: if ((!terminate) &&
3269: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3270: goto done;
3271: #ifdef DEBUG_PUSH
3272: fprintf(stderr, "HPP: Parsing Comment\n");
3273: #endif
3274: htmlParseComment(ctxt);
3275: ctxt->instate = XML_PARSER_PROLOG;
3276: } else if ((cur == '<') && (next == '!') &&
3277: (avail < 4)) {
3278: goto done;
3279: } else {
3280: ctxt->instate = XML_PARSER_START_TAG;
3281: #ifdef DEBUG_PUSH
3282: fprintf(stderr, "HPP: entering START_TAG\n");
3283: #endif
3284: }
3285: break;
3286: case XML_PARSER_EPILOG:
3287: SKIP_BLANKS;
3288: if (in->buf == NULL)
3289: avail = in->length - (in->cur - in->base);
3290: else
3291: avail = in->buf->buffer->use - (in->cur - in->base);
3292: if (avail < 2)
3293: goto done;
3294: cur = in->cur[0];
3295: next = in->cur[1];
3296: if ((cur == '<') && (next == '!') &&
3297: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3298: if ((!terminate) &&
3299: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3300: goto done;
3301: #ifdef DEBUG_PUSH
3302: fprintf(stderr, "HPP: Parsing Comment\n");
3303: #endif
3304: htmlParseComment(ctxt);
3305: ctxt->instate = XML_PARSER_EPILOG;
3306: } else if ((cur == '<') && (next == '!') &&
3307: (avail < 4)) {
3308: goto done;
3309: } else {
3310: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3311: ctxt->sax->error(ctxt->userData,
3312: "Extra content at the end of the document\n");
3313: ctxt->wellFormed = 0;
3314: ctxt->errNo = XML_ERR_DOCUMENT_END;
3315: ctxt->instate = XML_PARSER_EOF;
3316: #ifdef DEBUG_PUSH
3317: fprintf(stderr, "HPP: entering EOF\n");
3318: #endif
3319: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3320: ctxt->sax->endDocument(ctxt->userData);
3321: goto done;
3322: }
3323: break;
3324: case XML_PARSER_START_TAG: {
3325: xmlChar *name, *oldname;
3326: int depth = ctxt->nameNr;
3327: htmlElemDescPtr info;
3328:
3329: if (avail < 2)
3330: goto done;
3331: cur = in->cur[0];
3332: if (cur != '<') {
3333: ctxt->instate = XML_PARSER_CONTENT;
3334: #ifdef DEBUG_PUSH
3335: fprintf(stderr, "HPP: entering CONTENT\n");
3336: #endif
3337: break;
3338: }
1.32 daniel 3339: if ((!terminate) &&
3340: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3341: goto done;
3342:
3343: oldname = xmlStrdup(ctxt->name);
3344: htmlParseStartTag(ctxt);
3345: name = ctxt->name;
3346: #ifdef DEBUG
3347: if (oldname == NULL)
3348: fprintf(stderr, "Start of element %s\n", name);
3349: else if (name == NULL)
3350: fprintf(stderr, "Start of element failed, was %s\n",
3351: oldname);
3352: else
3353: fprintf(stderr, "Start of element %s, was %s\n",
3354: name, oldname);
3355: #endif
3356: if (((depth == ctxt->nameNr) &&
3357: (!xmlStrcmp(oldname, ctxt->name))) ||
3358: (name == NULL)) {
3359: if (CUR == '>')
3360: NEXT;
3361: if (oldname != NULL)
3362: xmlFree(oldname);
3363: break;
3364: }
3365: if (oldname != NULL)
3366: xmlFree(oldname);
3367:
3368: /*
3369: * Lookup the info for that element.
3370: */
3371: info = htmlTagLookup(name);
3372: if (info == NULL) {
3373: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3374: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3375: name);
3376: ctxt->wellFormed = 0;
3377: } else if (info->depr) {
3378: /***************************
3379: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3380: ctxt->sax->warning(ctxt->userData,
3381: "Tag %s is deprecated\n",
3382: name);
3383: ***************************/
3384: }
3385:
3386: /*
3387: * Check for an Empty Element labelled the XML/SGML way
3388: */
3389: if ((CUR == '/') && (NXT(1) == '>')) {
3390: SKIP(2);
3391: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3392: ctxt->sax->endElement(ctxt->userData, name);
3393: oldname = htmlnamePop(ctxt);
3394: #ifdef DEBUG
3395: fprintf(stderr,"End of tag the XML way: popping out %s\n",
3396: oldname);
3397: #endif
3398: if (oldname != NULL)
3399: xmlFree(oldname);
3400: ctxt->instate = XML_PARSER_CONTENT;
3401: #ifdef DEBUG_PUSH
3402: fprintf(stderr, "HPP: entering CONTENT\n");
3403: #endif
3404: break;
3405: }
3406:
3407: if (CUR == '>') {
3408: NEXT;
3409: } else {
3410: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3411: ctxt->sax->error(ctxt->userData,
3412: "Couldn't find end of Start Tag %s\n",
3413: name);
3414: ctxt->wellFormed = 0;
3415:
3416: /*
3417: * end of parsing of this node.
3418: */
3419: if (!xmlStrcmp(name, ctxt->name)) {
3420: nodePop(ctxt);
3421: oldname = htmlnamePop(ctxt);
3422: #ifdef DEBUG
3423: fprintf(stderr,
3424: "End of start tag problem: popping out %s\n", oldname);
3425: #endif
3426: if (oldname != NULL)
3427: xmlFree(oldname);
3428: }
3429:
3430: ctxt->instate = XML_PARSER_CONTENT;
3431: #ifdef DEBUG_PUSH
3432: fprintf(stderr, "HPP: entering CONTENT\n");
3433: #endif
3434: break;
3435: }
3436:
3437: /*
3438: * Check for an Empty Element from DTD definition
3439: */
3440: if ((info != NULL) && (info->empty)) {
3441: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3442: ctxt->sax->endElement(ctxt->userData, name);
3443: oldname = htmlnamePop(ctxt);
3444: #ifdef DEBUG
3445: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3446: #endif
3447: if (oldname != NULL)
3448: xmlFree(oldname);
3449: }
3450: ctxt->instate = XML_PARSER_CONTENT;
3451: #ifdef DEBUG_PUSH
3452: fprintf(stderr, "HPP: entering CONTENT\n");
3453: #endif
3454: break;
3455: }
3456: case XML_PARSER_CONTENT:
3457: /*
3458: * Handle preparsed entities and charRef
3459: */
3460: if (ctxt->token != 0) {
1.47 daniel 3461: xmlChar chr[2] = { 0 , 0 } ;
1.31 daniel 3462:
1.47 daniel 3463: chr[0] = (xmlChar) ctxt->token;
1.31 daniel 3464: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.47 daniel 3465: ctxt->sax->characters(ctxt->userData, chr, 1);
1.31 daniel 3466: ctxt->token = 0;
3467: ctxt->checkIndex = 0;
3468: }
1.47 daniel 3469: if ((avail == 1) && (terminate)) {
3470: cur = in->cur[0];
3471: if ((cur != '<') && (cur != '&')) {
1.48 daniel 3472: if (ctxt->sax != NULL) {
3473: if (IS_BLANK(cur)) {
3474: if (ctxt->sax->ignorableWhitespace != NULL)
3475: ctxt->sax->ignorableWhitespace(
3476: ctxt->userData, &cur, 1);
3477: } else {
3478: if (ctxt->sax->characters != NULL)
3479: ctxt->sax->characters(
3480: ctxt->userData, &cur, 1);
3481: }
3482: }
1.47 daniel 3483: ctxt->token = 0;
3484: ctxt->checkIndex = 0;
3485: NEXT;
3486: }
3487: break;
3488: }
1.31 daniel 3489: if (avail < 2)
3490: goto done;
3491: cur = in->cur[0];
3492: next = in->cur[1];
3493: if ((cur == '<') && (next == '!') &&
3494: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3495: if ((!terminate) &&
3496: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3497: goto done;
3498: #ifdef DEBUG_PUSH
3499: fprintf(stderr, "HPP: Parsing Comment\n");
3500: #endif
3501: htmlParseComment(ctxt);
3502: ctxt->instate = XML_PARSER_CONTENT;
3503: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3504: goto done;
3505: } else if ((cur == '<') && (next == '/')) {
3506: ctxt->instate = XML_PARSER_END_TAG;
3507: ctxt->checkIndex = 0;
3508: #ifdef DEBUG_PUSH
3509: fprintf(stderr, "HPP: entering END_TAG\n");
3510: #endif
3511: break;
3512: } else if (cur == '<') {
3513: ctxt->instate = XML_PARSER_START_TAG;
3514: ctxt->checkIndex = 0;
3515: #ifdef DEBUG_PUSH
3516: fprintf(stderr, "HPP: entering START_TAG\n");
3517: #endif
3518: break;
3519: } else if (cur == '&') {
1.32 daniel 3520: if ((!terminate) &&
3521: (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
1.31 daniel 3522: goto done;
3523: #ifdef DEBUG_PUSH
3524: fprintf(stderr, "HPP: Parsing Reference\n");
3525: #endif
3526: /* TODO: check generation of subtrees if noent !!! */
3527: htmlParseReference(ctxt);
3528: } else {
3529: /* TODO Avoid the extra copy, handle directly !!!!!! */
3530: /*
3531: * Goal of the following test is :
3532: * - minimize calls to the SAX 'character' callback
3533: * when they are mergeable
3534: */
3535: if ((ctxt->inputNr == 1) &&
3536: (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
1.32 daniel 3537: if ((!terminate) &&
3538: (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
1.31 daniel 3539: goto done;
3540: }
3541: ctxt->checkIndex = 0;
3542: #ifdef DEBUG_PUSH
3543: fprintf(stderr, "HPP: Parsing char data\n");
3544: #endif
3545: htmlParseCharData(ctxt, 0);
3546: }
3547: break;
3548: case XML_PARSER_END_TAG:
3549: if (avail < 2)
3550: goto done;
1.32 daniel 3551: if ((!terminate) &&
3552: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3553: goto done;
3554: htmlParseEndTag(ctxt);
3555: if (ctxt->nameNr == 0) {
3556: ctxt->instate = XML_PARSER_EPILOG;
3557: } else {
3558: ctxt->instate = XML_PARSER_CONTENT;
3559: }
3560: ctxt->checkIndex = 0;
3561: #ifdef DEBUG_PUSH
3562: fprintf(stderr, "HPP: entering CONTENT\n");
3563: #endif
3564: break;
3565: case XML_PARSER_CDATA_SECTION:
3566: fprintf(stderr, "HPP: internal error, state == CDATA\n");
3567: ctxt->instate = XML_PARSER_CONTENT;
3568: ctxt->checkIndex = 0;
3569: #ifdef DEBUG_PUSH
3570: fprintf(stderr, "HPP: entering CONTENT\n");
3571: #endif
3572: break;
3573: case XML_PARSER_DTD:
3574: fprintf(stderr, "HPP: internal error, state == DTD\n");
3575: ctxt->instate = XML_PARSER_CONTENT;
3576: ctxt->checkIndex = 0;
3577: #ifdef DEBUG_PUSH
3578: fprintf(stderr, "HPP: entering CONTENT\n");
3579: #endif
3580: break;
3581: case XML_PARSER_COMMENT:
3582: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3583: ctxt->instate = XML_PARSER_CONTENT;
3584: ctxt->checkIndex = 0;
3585: #ifdef DEBUG_PUSH
3586: fprintf(stderr, "HPP: entering CONTENT\n");
3587: #endif
3588: break;
3589: case XML_PARSER_PI:
3590: fprintf(stderr, "HPP: internal error, state == PI\n");
3591: ctxt->instate = XML_PARSER_CONTENT;
3592: ctxt->checkIndex = 0;
3593: #ifdef DEBUG_PUSH
3594: fprintf(stderr, "HPP: entering CONTENT\n");
3595: #endif
3596: break;
3597: case XML_PARSER_ENTITY_DECL:
3598: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3599: ctxt->instate = XML_PARSER_CONTENT;
3600: ctxt->checkIndex = 0;
3601: #ifdef DEBUG_PUSH
3602: fprintf(stderr, "HPP: entering CONTENT\n");
3603: #endif
3604: break;
3605: case XML_PARSER_ENTITY_VALUE:
3606: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3607: ctxt->instate = XML_PARSER_CONTENT;
3608: ctxt->checkIndex = 0;
3609: #ifdef DEBUG_PUSH
3610: fprintf(stderr, "HPP: entering DTD\n");
3611: #endif
3612: break;
3613: case XML_PARSER_ATTRIBUTE_VALUE:
3614: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3615: ctxt->instate = XML_PARSER_START_TAG;
3616: ctxt->checkIndex = 0;
3617: #ifdef DEBUG_PUSH
3618: fprintf(stderr, "HPP: entering START_TAG\n");
3619: #endif
3620: break;
3621: }
3622: }
3623: done:
1.47 daniel 3624: if ((avail == 0) && (terminate)) {
3625: htmlAutoClose(ctxt, NULL);
3626: if (ctxt->nameNr == 0)
3627: ctxt->instate = XML_PARSER_EOF;
3628: }
1.31 daniel 3629: #ifdef DEBUG_PUSH
3630: fprintf(stderr, "HPP: done %d\n", ret);
3631: #endif
3632: return(ret);
3633: }
3634:
3635: /**
1.32 daniel 3636: * htmlParseTry:
3637: * @ctxt: an HTML parser context
3638: *
3639: * Try to progress on parsing
3640: *
3641: * Returns zero if no parsing was possible
3642: */
3643: int
3644: htmlParseTry(htmlParserCtxtPtr ctxt) {
3645: return(htmlParseTryOrFinish(ctxt, 0));
3646: }
3647:
3648: /**
1.31 daniel 3649: * htmlParseChunk:
3650: * @ctxt: an XML parser context
3651: * @chunk: an char array
3652: * @size: the size in byte of the chunk
3653: * @terminate: last chunk indicator
3654: *
3655: * Parse a Chunk of memory
3656: *
3657: * Returns zero if no error, the xmlParserErrors otherwise.
3658: */
3659: int
3660: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3661: int terminate) {
3662: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3663: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3664: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3665: int cur = ctxt->input->cur - ctxt->input->base;
3666:
3667: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3668: ctxt->input->base = ctxt->input->buf->buffer->content + base;
3669: ctxt->input->cur = ctxt->input->base + cur;
3670: #ifdef DEBUG_PUSH
3671: fprintf(stderr, "HPP: pushed %d\n", size);
3672: #endif
3673:
1.34 daniel 3674: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
3675: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3676: } else if (ctxt->instate != XML_PARSER_EOF)
1.32 daniel 3677: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3678: if (terminate) {
3679: if ((ctxt->instate != XML_PARSER_EOF) &&
3680: (ctxt->instate != XML_PARSER_EPILOG) &&
3681: (ctxt->instate != XML_PARSER_MISC)) {
3682: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3683: ctxt->sax->error(ctxt->userData,
3684: "Extra content at the end of the document\n");
3685: ctxt->wellFormed = 0;
3686: ctxt->errNo = XML_ERR_DOCUMENT_END;
3687: }
3688: if (ctxt->instate != XML_PARSER_EOF) {
3689: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3690: ctxt->sax->endDocument(ctxt->userData);
3691: }
3692: ctxt->instate = XML_PARSER_EOF;
3693: }
3694: return((xmlParserErrors) ctxt->errNo);
3695: }
3696:
3697: /************************************************************************
3698: * *
3699: * User entry points *
3700: * *
3701: ************************************************************************/
3702:
3703: /**
3704: * htmlCreatePushParserCtxt :
3705: * @sax: a SAX handler
3706: * @user_data: The user data returned on SAX callbacks
3707: * @chunk: a pointer to an array of chars
3708: * @size: number of chars in the array
3709: * @filename: an optional file name or URI
3710: * @enc: an optional encoding
3711: *
3712: * Create a parser context for using the HTML parser in push mode
3713: * To allow content encoding detection, @size should be >= 4
3714: * The value of @filename is used for fetching external entities
3715: * and error/warning reports.
3716: *
3717: * Returns the new parser context or NULL
3718: */
3719: htmlParserCtxtPtr
3720: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
3721: const char *chunk, int size, const char *filename,
3722: xmlCharEncoding enc) {
3723: htmlParserCtxtPtr ctxt;
3724: htmlParserInputPtr inputStream;
3725: xmlParserInputBufferPtr buf;
3726:
3727: buf = xmlAllocParserInputBuffer(enc);
3728: if (buf == NULL) return(NULL);
3729:
3730: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3731: if (ctxt == NULL) {
3732: xmlFree(buf);
3733: return(NULL);
3734: }
3735: memset(ctxt, 0, sizeof(htmlParserCtxt));
3736: htmlInitParserCtxt(ctxt);
3737: if (sax != NULL) {
3738: if (ctxt->sax != &htmlDefaultSAXHandler)
3739: xmlFree(ctxt->sax);
3740: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
3741: if (ctxt->sax == NULL) {
3742: xmlFree(buf);
3743: xmlFree(ctxt);
3744: return(NULL);
3745: }
3746: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
3747: if (user_data != NULL)
3748: ctxt->userData = user_data;
3749: }
3750: if (filename == NULL) {
3751: ctxt->directory = NULL;
3752: } else {
3753: ctxt->directory = xmlParserGetDirectory(filename);
3754: }
3755:
3756: inputStream = htmlNewInputStream(ctxt);
3757: if (inputStream == NULL) {
3758: xmlFreeParserCtxt(ctxt);
3759: return(NULL);
3760: }
3761:
3762: if (filename == NULL)
3763: inputStream->filename = NULL;
3764: else
3765: inputStream->filename = xmlMemStrdup(filename);
3766: inputStream->buf = buf;
3767: inputStream->base = inputStream->buf->buffer->content;
3768: inputStream->cur = inputStream->buf->buffer->content;
3769:
3770: inputPush(ctxt, inputStream);
3771:
3772: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3773: (ctxt->input->buf != NULL)) {
3774: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3775: #ifdef DEBUG_PUSH
3776: fprintf(stderr, "HPP: pushed %d\n", size);
3777: #endif
3778: }
3779:
3780: return(ctxt);
3781: }
1.1 daniel 3782:
3783: /**
3784: * htmlSAXParseDoc :
1.14 daniel 3785: * @cur: a pointer to an array of xmlChar
1.1 daniel 3786: * @encoding: a free form C string describing the HTML document encoding, or NULL
3787: * @sax: the SAX handler block
3788: * @userData: if using SAX, this pointer will be provided on callbacks.
3789: *
3790: * parse an HTML in-memory document and build a tree.
3791: * It use the given SAX function block to handle the parsing callback.
3792: * If sax is NULL, fallback to the default DOM tree building routines.
3793: *
3794: * Returns the resulting document tree
3795: */
3796:
3797: htmlDocPtr
1.14 daniel 3798: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
1.1 daniel 3799: htmlDocPtr ret;
3800: htmlParserCtxtPtr ctxt;
3801:
3802: if (cur == NULL) return(NULL);
3803:
3804:
3805: ctxt = htmlCreateDocParserCtxt(cur, encoding);
3806: if (ctxt == NULL) return(NULL);
3807: if (sax != NULL) {
3808: ctxt->sax = sax;
3809: ctxt->userData = userData;
3810: }
3811:
3812: htmlParseDocument(ctxt);
3813: ret = ctxt->myDoc;
3814: if (sax != NULL) {
3815: ctxt->sax = NULL;
3816: ctxt->userData = NULL;
3817: }
3818: htmlFreeParserCtxt(ctxt);
3819:
3820: return(ret);
3821: }
3822:
3823: /**
3824: * htmlParseDoc :
1.14 daniel 3825: * @cur: a pointer to an array of xmlChar
1.1 daniel 3826: * @encoding: a free form C string describing the HTML document encoding, or NULL
3827: *
3828: * parse an HTML in-memory document and build a tree.
3829: *
3830: * Returns the resulting document tree
3831: */
3832:
3833: htmlDocPtr
1.14 daniel 3834: htmlParseDoc(xmlChar *cur, const char *encoding) {
1.1 daniel 3835: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
3836: }
3837:
3838:
3839: /**
3840: * htmlCreateFileParserCtxt :
3841: * @filename: the filename
3842: * @encoding: a free form C string describing the HTML document encoding, or NULL
3843: *
3844: * Create a parser context for a file content.
3845: * Automatic support for ZLIB/Compress compressed document is provided
3846: * by default if found at compile-time.
3847: *
3848: * Returns the new parser context or NULL
3849: */
3850: htmlParserCtxtPtr
3851: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
3852: {
3853: htmlParserCtxtPtr ctxt;
3854: htmlParserInputPtr inputStream;
1.5 daniel 3855: xmlParserInputBufferPtr buf;
1.1 daniel 3856: /* htmlCharEncoding enc; */
3857:
1.5 daniel 3858: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
3859: if (buf == NULL) return(NULL);
1.1 daniel 3860:
1.11 daniel 3861: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3862: if (ctxt == NULL) {
3863: perror("malloc");
3864: return(NULL);
3865: }
1.19 daniel 3866: memset(ctxt, 0, sizeof(htmlParserCtxt));
1.1 daniel 3867: htmlInitParserCtxt(ctxt);
1.11 daniel 3868: inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3869: if (inputStream == NULL) {
3870: perror("malloc");
1.11 daniel 3871: xmlFree(ctxt);
1.1 daniel 3872: return(NULL);
3873: }
1.19 daniel 3874: memset(inputStream, 0, sizeof(htmlParserInput));
1.1 daniel 3875:
1.11 daniel 3876: inputStream->filename = xmlMemStrdup(filename);
1.1 daniel 3877: inputStream->line = 1;
3878: inputStream->col = 1;
1.5 daniel 3879: inputStream->buf = buf;
1.21 daniel 3880: inputStream->directory = NULL;
1.1 daniel 3881:
1.5 daniel 3882: inputStream->base = inputStream->buf->buffer->content;
3883: inputStream->cur = inputStream->buf->buffer->content;
3884: inputStream->free = NULL;
1.1 daniel 3885:
3886: inputPush(ctxt, inputStream);
3887: return(ctxt);
3888: }
3889:
3890: /**
3891: * htmlSAXParseFile :
3892: * @filename: the filename
3893: * @encoding: a free form C string describing the HTML document encoding, or NULL
3894: * @sax: the SAX handler block
3895: * @userData: if using SAX, this pointer will be provided on callbacks.
3896: *
3897: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3898: * compressed document is provided by default if found at compile-time.
3899: * It use the given SAX function block to handle the parsing callback.
3900: * If sax is NULL, fallback to the default DOM tree building routines.
3901: *
3902: * Returns the resulting document tree
3903: */
3904:
3905: htmlDocPtr
3906: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
3907: void *userData) {
3908: htmlDocPtr ret;
3909: htmlParserCtxtPtr ctxt;
3910:
3911: ctxt = htmlCreateFileParserCtxt(filename, encoding);
3912: if (ctxt == NULL) return(NULL);
3913: if (sax != NULL) {
3914: ctxt->sax = sax;
3915: ctxt->userData = userData;
3916: }
3917:
3918: htmlParseDocument(ctxt);
3919:
3920: ret = ctxt->myDoc;
3921: if (sax != NULL) {
3922: ctxt->sax = NULL;
3923: ctxt->userData = NULL;
3924: }
3925: htmlFreeParserCtxt(ctxt);
3926:
3927: return(ret);
3928: }
3929:
3930: /**
3931: * htmlParseFile :
3932: * @filename: the filename
3933: * @encoding: a free form C string describing the HTML document encoding, or NULL
3934: *
3935: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3936: * compressed document is provided by default if found at compile-time.
3937: *
3938: * Returns the resulting document tree
3939: */
3940:
3941: htmlDocPtr
3942: htmlParseFile(const char *filename, const char *encoding) {
3943: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
3944: }
1.39 daniel 3945:
3946: #endif /* LIBXML_HTML_ENABLED */
Webmaster