Annotation of XML/HTMLparser.c, revision 1.38
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
1.29 daniel 10: #include "win32config.h"
1.1 daniel 11: #else
1.13 daniel 12: #include "config.h"
1.1 daniel 13: #endif
1.13 daniel 14:
1.1 daniel 15: #include <stdio.h>
1.13 daniel 16: #include <string.h> /* for memset() only */
17: #ifdef HAVE_CTYPE_H
1.1 daniel 18: #include <ctype.h>
1.13 daniel 19: #endif
20: #ifdef HAVE_STDLIB_H
1.1 daniel 21: #include <stdlib.h>
1.13 daniel 22: #endif
23: #ifdef HAVE_SYS_STAT_H
1.1 daniel 24: #include <sys/stat.h>
1.13 daniel 25: #endif
1.1 daniel 26: #ifdef HAVE_FCNTL_H
27: #include <fcntl.h>
28: #endif
29: #ifdef HAVE_UNISTD_H
30: #include <unistd.h>
31: #endif
32: #ifdef HAVE_ZLIB_H
33: #include <zlib.h>
34: #endif
35:
1.11 daniel 36: #include "xmlmemory.h"
1.1 daniel 37: #include "tree.h"
38: #include "HTMLparser.h"
39: #include "entities.h"
40: #include "encoding.h"
41: #include "valid.h"
42: #include "parserInternals.h"
1.5 daniel 43: #include "xmlIO.h"
1.31 daniel 44: #include "xml-error.h"
1.5 daniel 45:
/* NOTE(review): presumably the longest name the parser accepts; it is not
 * used in this part of the file — confirm against the name-parsing code. */
46: #define HTML_MAX_NAMELEN 1000
/* Size passed to xmlParserInputGrow() each time this file asks for more input. */
47: #define INPUT_CHUNK 50
/* NOTE(review): the two buffer sizes below are not used in this part of the
 * file; their exact role should be confirmed where they are referenced. */
1.31 daniel 48: #define HTML_PARSER_BIG_BUFFER_SIZE 1024
49: #define HTML_PARSER_BUFFER_SIZE 100
1.1 daniel 50:
51: /* #define DEBUG */
1.31 daniel 52: /* #define DEBUG_PUSH */
1.1 daniel 53:
54: /************************************************************************
55: * *
56: * Parser stacks related functions and macros *
57: * *
58: ************************************************************************/
59:
60: /*
61: * Generic function for accessing stacks in the Parser Context
62: */
63:
1.30 daniel 64: #define PUSH_AND_POP(scope, type, name) \
65: scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
1.1 daniel 66: if (ctxt->name##Nr >= ctxt->name##Max) { \
67: ctxt->name##Max *= 2; \
1.11 daniel 68: ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \
1.1 daniel 69: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
70: if (ctxt->name##Tab == NULL) { \
71: fprintf(stderr, "realloc failed !\n"); \
1.33 daniel 72: return(0); \
1.1 daniel 73: } \
74: } \
75: ctxt->name##Tab[ctxt->name##Nr] = value; \
76: ctxt->name = value; \
77: return(ctxt->name##Nr++); \
78: } \
1.30 daniel 79: scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
1.1 daniel 80: type ret; \
1.18 daniel 81: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 82: ctxt->name##Nr--; \
1.18 daniel 83: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 84: if (ctxt->name##Nr > 0) \
85: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
86: else \
87: ctxt->name = NULL; \
88: ret = ctxt->name##Tab[ctxt->name##Nr]; \
89: ctxt->name##Tab[ctxt->name##Nr] = 0; \
90: return(ret); \
91: } \
92:
1.30 daniel 93: PUSH_AND_POP(extern, xmlNodePtr, node)
94: PUSH_AND_POP(extern, xmlChar*, name)
1.1 daniel 95:
96: /*
97: * Macros for accessing the content. Those should be used only by the parser,
98: * and not exported.
99: *
100: * Dirty macros, i.e. one needs to make assumptions about the context to use them
101: *
1.14 daniel 102: * CUR_PTR return the current pointer to the xmlChar to be parsed.
103: * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
1.1 daniel 104: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
105: * in UNICODE mode. This should be used internally by the parser
106: * only to compare to ASCII values otherwise it would break when
107: * running with UTF-8 encoding.
1.14 daniel 108: * NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
1.1 daniel 109: * to compare on ASCII based substring.
1.14 daniel 110: * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
1.1 daniel 111: * it should be used only to compare on ASCII based substring.
1.14 daniel 112: * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
1.1 daniel 113: * strings within the parser.
114: *
115: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
116: *
117: * CURRENT Returns the current char value, with the full decoding of
118: * UTF-8 if we are using this mode. It returns an int.
119: * NEXT Skip to the next character, this does the proper decoding
120: * in UTF-8 mode. It also pop-up unfinished entities on the fly.
121: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
122: */
123:
/*
 * All the macros below expect a variable named ctxt (the parser context)
 * to be in scope at the point of use.
 *
 * Every expression-like macro is fully parenthesized so that it can be
 * embedded in a larger expression without precedence surprises; CUR_PTR
 * and NXT() remain usable as lvalues.
 */

/* Value of the unit at the current input position, as an int. */
#define CUR ((int) (*ctxt->input->cur))

/* Same unit as CUR, passed through toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input position and the nbChars counter by val units;
 * the value of the expression is the new input position. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* Unit found val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same unit as NXT(val), passed through toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input position itself. */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Currently identical to CUR. */
#define CURRENT ((int) (*ctxt->input->cur))

/*
 * NOTE(review): NEXT and SKIP_BLANKS carry their own trailing semicolon, so
 * "NEXT;" expands to two statements and neither macro can be the body of an
 * unbraced if/else.  The semicolon is kept because call sites outside this
 * part of the file may depend on it.
 */
#define NEXT htmlNextChar(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);
1.35 daniel 145:
146: /**
147: * htmlNextChar:
148: * @ctxt: the HTML parser context
149: *
150: * Skip to the next char input char.
151: */
152:
153: void
154: htmlNextChar(htmlParserCtxtPtr ctxt) {
155: if ((*ctxt->input->cur == 0) &&
156: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
157: xmlPopInput(ctxt);
158: } else {
159: if (*(ctxt->input->cur) == '\n') {
160: ctxt->input->line++; ctxt->input->col = 1;
161: } else ctxt->input->col++;
162: ctxt->input->cur++;
163: ctxt->nbChars++;
164: if (*ctxt->input->cur == 0)
165: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
166: }
167: }
1.5 daniel 168:
1.36 daniel 169: /**
170: * htmlSkipBlankChars:
171: * @ctxt: the HTML parser context
172: *
173: * skip all blanks character found at that point in the input streams.
174: *
175: * Returns the number of space chars skipped
176: */
177:
178: int
179: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
180: int res = 0;
181:
182: while (IS_BLANK(*(ctxt->input->cur))) {
183: if ((*ctxt->input->cur == 0) &&
184: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
185: xmlPopInput(ctxt);
186: } else {
187: if (*(ctxt->input->cur) == '\n') {
188: ctxt->input->line++; ctxt->input->col = 1;
189: } else ctxt->input->col++;
190: ctxt->input->cur++;
191: ctxt->nbChars++;
192: if (*ctxt->input->cur == 0)
193: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
194: }
195: res++;
196: }
197: return(res);
198: }
1.1 daniel 199:
200:
1.5 daniel 201:
1.1 daniel 202: /************************************************************************
203: * *
204: * The list of HTML elements and their properties *
205: * *
206: ************************************************************************/
207:
208: /*
 * Description of the HTML elements known to the parser, one row per
 * element.  htmlTagLookup() scans this table linearly and compares the
 * Name column with xmlStrcmp(); all names are stored in lower case.
 *
 * Meaning of the numeric columns:
209: * Start Tag: 1 means the start tag can be omitted
210: * End Tag: 1 means the end tag can be omitted
211: * 2 means it's forbidden (empty elements)
 * Empty: 1 for the empty elements; in this table it is set exactly on
 * the rows whose End Tag value is 2
212: * Depr: this element is deprecated
213: * DTD: 1 means that this element is valid only in the Loose DTD
214: * 2 means that this element is valid only in the Frameset DTD
215: *
 * Column order of each row:
216: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
217: */
218: htmlElemDesc html40ElementTable[] = {
1.26 daniel 219: { "a", 0, 0, 0, 0, 0, "anchor " },
220: { "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
221: { "acronym", 0, 0, 0, 0, 0, "" },
222: { "address", 0, 0, 0, 0, 0, "information on author " },
223: { "applet", 0, 0, 0, 1, 1, "java applet " },
224: { "area", 0, 2, 1, 0, 0, "client-side image map area " },
225: { "b", 0, 0, 0, 0, 0, "bold text style" },
226: { "base", 0, 2, 1, 0, 0, "document base uri " },
227: { "basefont", 0, 2, 1, 1, 1, "base font size " },
228: { "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
229: { "big", 0, 0, 0, 0, 0, "large text style" },
230: { "blockquote", 0, 0, 0, 0, 0, "long quotation " },
231: { "body", 1, 1, 0, 0, 0, "document body " },
232: { "br", 0, 2, 1, 0, 0, "forced line break " },
233: { "button", 0, 0, 0, 0, 0, "push button " },
234: { "caption", 0, 0, 0, 0, 0, "table caption " },
235: { "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
236: { "cite", 0, 0, 0, 0, 0, "citation" },
237: { "code", 0, 0, 0, 0, 0, "computer code fragment" },
238: { "col", 0, 2, 1, 0, 0, "table column " },
239: { "colgroup", 0, 1, 0, 0, 0, "table column group " },
240: { "dd", 0, 1, 0, 0, 0, "definition description " },
241: { "del", 0, 0, 0, 0, 0, "deleted text " },
242: { "dfn", 0, 0, 0, 0, 0, "instance definition" },
243: { "dir", 0, 0, 0, 1, 1, "directory list" },
244: { "div", 0, 0, 0, 0, 0, "generic language/style container"},
245: { "dl", 0, 0, 0, 0, 0, "definition list " },
246: { "dt", 0, 1, 0, 0, 0, "definition term " },
247: { "em", 0, 0, 0, 0, 0, "emphasis" },
248: { "fieldset", 0, 0, 0, 0, 0, "form control group " },
249: { "font", 0, 0, 0, 1, 1, "local change to font " },
250: { "form", 0, 0, 0, 0, 0, "interactive form " },
251: { "frame", 0, 2, 1, 0, 2, "subwindow " },
252: { "frameset", 0, 0, 0, 0, 2, "window subdivision" },
253: { "h1", 0, 0, 0, 0, 0, "heading " },
254: { "h2", 0, 0, 0, 0, 0, "heading " },
255: { "h3", 0, 0, 0, 0, 0, "heading " },
256: { "h4", 0, 0, 0, 0, 0, "heading " },
257: { "h5", 0, 0, 0, 0, 0, "heading " },
258: { "h6", 0, 0, 0, 0, 0, "heading " },
259: { "head", 1, 1, 0, 0, 0, "document head " },
260: { "hr", 0, 2, 1, 0, 0, "horizontal rule " },
261: { "html", 1, 1, 0, 0, 0, "document root element " },
262: { "i", 0, 0, 0, 0, 0, "italic text style" },
263: { "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
264: { "img", 0, 2, 1, 0, 0, "embedded image " },
265: { "input", 0, 2, 1, 0, 0, "form control " },
266: { "ins", 0, 0, 0, 0, 0, "inserted text" },
267: { "isindex", 0, 2, 1, 1, 1, "single line prompt " },
268: { "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
269: { "label", 0, 0, 0, 0, 0, "form field label text " },
270: { "legend", 0, 0, 0, 0, 0, "fieldset legend " },
271: { "li", 0, 1, 0, 0, 0, "list item " },
272: { "link", 0, 2, 1, 0, 0, "a media-independent link " },
273: { "map", 0, 0, 0, 0, 0, "client-side image map " },
274: { "menu", 0, 0, 0, 1, 1, "menu list " },
275: { "meta", 0, 2, 1, 0, 0, "generic metainformation " },
276: { "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
277: { "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
278: { "object", 0, 0, 0, 0, 0, "generic embedded object " },
279: { "ol", 0, 0, 0, 0, 0, "ordered list " },
280: { "optgroup", 0, 0, 0, 0, 0, "option group " },
281: { "option", 0, 1, 0, 0, 0, "selectable choice " },
282: { "p", 0, 1, 0, 0, 0, "paragraph " },
283: { "param", 0, 2, 1, 0, 0, "named property value " },
284: { "pre", 0, 0, 0, 0, 0, "preformatted text " },
285: { "q", 0, 0, 0, 0, 0, "short inline quotation " },
286: { "s", 0, 0, 0, 1, 1, "strike-through text style" },
287: { "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
288: { "script", 0, 0, 0, 0, 0, "script statements " },
289: { "select", 0, 0, 0, 0, 0, "option selector " },
290: { "small", 0, 0, 0, 0, 0, "small text style" },
291: { "span", 0, 0, 0, 0, 0, "generic language/style container " },
292: { "strike", 0, 0, 0, 1, 1, "strike-through text" },
293: { "strong", 0, 0, 0, 0, 0, "strong emphasis" },
294: { "style", 0, 0, 0, 0, 0, "style info " },
295: { "sub", 0, 0, 0, 0, 0, "subscript" },
296: { "sup", 0, 0, 0, 0, 0, "superscript " },
297: { "table", 0, 0, 0, 0, 0, " " },
298: { "tbody", 1, 1, 0, 0, 0, "table body " },
299: { "td", 0, 1, 0, 0, 0, "table data cell" },
300: { "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
301: { "tfoot", 0, 1, 0, 0, 0, "table footer " },
302: { "th", 0, 1, 0, 0, 0, "table header cell" },
303: { "thead", 0, 1, 0, 0, 0, "table header " },
304: { "title", 0, 0, 0, 0, 0, "document title " },
305: { "tr", 0, 1, 0, 0, 0, "table row " },
306: { "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
307: { "u", 0, 0, 0, 1, 1, "underlined text style" },
308: { "ul", 0, 0, 0, 0, 0, "unordered list " },
309: { "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
1.1 daniel 310: };
311:
312: /*
313: * start tags that imply the end of a current element
314: * any tag of each line implies the end of the current element if the type of
315: * that element is in the same line
 *
 * Each line (group) is terminated by a NULL entry, and the whole list is
 * terminated by one additional NULL.
 * NOTE(review): this table is not referenced in the visible part of the
 * file; confirm where (or whether) it is used.
316: */
1.8 daniel 317: char *htmlEquEnd[] = {
1.26 daniel 318: "dt", "dd", "li", "option", NULL,
319: "h1", "h2", "h3", "h4", "h5", "h6", NULL,
320: "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
1.1 daniel 321: NULL
322: };
323: /*
324: * according to the HTML DTD, HR should be added to the 2nd line above, as it
325: * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
326: * because many documents contain rules in headings...
327: */
328:
/*
 * start tags that imply the end of current element
 *
 * The table is a sequence of NULL-terminated groups, closed by one extra
 * NULL.  The first entry of a group is the name of a start tag; the
 * remaining entries are the names of the currently open elements which
 * that start tag closes (see htmlCheckAutoClose()).
 */
char *htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
		"head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", NULL,
"td",		"th", "td", "p", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};

/*
 * htmlStartCloseIndex[n] points at the first entry of the n-th group of
 * htmlStartClose; unused slots are NULL.  The flag records that the index
 * has been built.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**
 * htmlInitAutoClose:
 *
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 *
 * The index is built once: the function records completion in
 * htmlStartCloseIndexinitialized and every later call returns immediately.
 * At most 99 groups are indexed, so the last slot of the index always
 * stays NULL and acts as a terminator for htmlCheckAutoClose().
 */
void
htmlInitAutoClose(void) {
    int slot, pos = 0;

    if (htmlStartCloseIndexinitialized) return;

    for (slot = 0; slot < 100; slot++) htmlStartCloseIndex[slot] = NULL;
    slot = 0;
    while ((htmlStartClose[pos] != NULL) && (slot < 100 - 1)) {
	/* Remember where this group starts ... */
	htmlStartCloseIndex[slot++] = &htmlStartClose[pos];
	/* ... then step over its entries and its terminating NULL. */
	while (htmlStartClose[pos] != NULL) pos++;
	pos++;
    }
    /* Without this the guard above never fires and the index is rebuilt
     * on every lookup. */
    htmlStartCloseIndexinitialized = 1;
}
417:
418: /**
419: * htmlTagLookup:
420: * @tag: The tag name
421: *
422: * Lookup the HTML tag in the ElementTable
423: *
424: * Returns the related htmlElemDescPtr or NULL if not found.
425: */
426: htmlElemDescPtr
1.14 daniel 427: htmlTagLookup(const xmlChar *tag) {
1.1 daniel 428: int i = 0;
429:
430: for (i = 0; i < (sizeof(html40ElementTable) /
431: sizeof(html40ElementTable[0]));i++) {
1.8 daniel 432: if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 433: return(&html40ElementTable[i]);
434: }
435: return(NULL);
436: }
437:
438: /**
439: * htmlCheckAutoClose:
440: * @new: The new tag name
441: * @old: The old tag name
442: *
443: * Checks wether the new tag is one of the registered valid tags for closing old.
444: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
445: *
446: * Returns 0 if no, 1 if yes.
447: */
448: int
1.14 daniel 449: htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
1.1 daniel 450: int i, index;
1.8 daniel 451: char **close;
1.1 daniel 452:
453: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
454:
455: /* inefficient, but not a big deal */
456: for (index = 0; index < 100;index++) {
457: close = htmlStartCloseIndex[index];
458: if (close == NULL) return(0);
1.8 daniel 459: if (!xmlStrcmp(BAD_CAST *close, new)) break;
1.1 daniel 460: }
461:
462: i = close - htmlStartClose;
463: i++;
464: while (htmlStartClose[i] != NULL) {
1.8 daniel 465: if (!xmlStrcmp(BAD_CAST htmlStartClose[i], old)) {
1.1 daniel 466: return(1);
467: }
468: i++;
469: }
470: return(0);
471: }
472:
473: /**
474: * htmlAutoClose:
475: * @ctxt: an HTML parser context
476: * @new: The new tag name
477: *
478: * The HTmL DtD allows a tag to implicitely close other tags.
479: * The list is kept in htmlStartClose array. This function is
480: * called when a new tag has been detected and generates the
481: * appropriates closes if possible/needed.
482: */
483: void
1.14 daniel 484: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
1.15 daniel 485: xmlChar *oldname;
486: while ((ctxt->name != NULL) &&
487: (htmlCheckAutoClose(new, ctxt->name))) {
1.1 daniel 488: #ifdef DEBUG
1.18 daniel 489: fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
1.1 daniel 490: #endif
491: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1.15 daniel 492: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 493: oldname = htmlnamePop(ctxt);
1.18 daniel 494: if (oldname != NULL) {
495: #ifdef DEBUG
496: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
497: #endif
1.17 daniel 498: xmlFree(oldname);
1.18 daniel 499: }
1.1 daniel 500: }
501: }
502:
503: /**
1.28 daniel 504: * htmlAutoCloseTag:
505: * @doc: the HTML document
506: * @name: The tag name
507: * @elem: the HTML element
508: *
509: * The HTmL DtD allows a tag to implicitely close other tags.
510: * The list is kept in htmlStartClose array. This function checks
511: * if the element or one of it's children would autoclose the
512: * given tag.
513: *
514: * Returns 1 if autoclose, 0 otherwise
515: */
516: int
517: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
518: htmlNodePtr child;
519:
520: if (elem == NULL) return(1);
521: if (!xmlStrcmp(name, elem->name)) return(0);
522: if (htmlCheckAutoClose(elem->name, name)) return(1);
1.37 daniel 523: child = elem->children;
1.28 daniel 524: while (child != NULL) {
525: if (htmlAutoCloseTag(doc, name, child)) return(1);
526: child = child->next;
527: }
528: return(0);
529: }
530:
531: /**
532: * htmlIsAutoClosed:
533: * @doc: the HTML document
534: * @elem: the HTML element
535: *
536: * The HTmL DtD allows a tag to implicitely close other tags.
537: * The list is kept in htmlStartClose array. This function checks
538: * if a tag is autoclosed by one of it's child
539: *
540: * Returns 1 if autoclosed, 0 otherwise
541: */
542: int
543: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
544: htmlNodePtr child;
545:
546: if (elem == NULL) return(1);
1.37 daniel 547: child = elem->children;
1.28 daniel 548: while (child != NULL) {
549: if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
550: child = child->next;
551: }
552: return(0);
553: }
554:
555: /**
1.1 daniel 556: * htmlAutoCloseOnClose:
557: * @ctxt: an HTML parser context
558: * @new: The new tag name
559: *
560: * The HTmL DtD allows an ending tag to implicitely close other tags.
561: */
562: void
1.14 daniel 563: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
1.1 daniel 564: htmlElemDescPtr info;
1.15 daniel 565: xmlChar *oldname;
1.18 daniel 566: int i;
1.1 daniel 567:
1.18 daniel 568: #ifdef DEBUG
569: fprintf(stderr,"Close of %s stack: %d elements\n", new, ctxt->nameNr);
570: for (i = 0;i < ctxt->nameNr;i++)
571: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
572: #endif
573:
574: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
575: if (!xmlStrcmp(new, ctxt->nameTab[i])) break;
576: }
577: if (i < 0) return;
578:
579: while (xmlStrcmp(new, ctxt->name)) {
1.15 daniel 580: info = htmlTagLookup(ctxt->name);
1.1 daniel 581: if ((info == NULL) || (info->endTag == 1)) {
582: #ifdef DEBUG
1.18 daniel 583: fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
584: #endif
585: } else {
586: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
587: ctxt->sax->error(ctxt->userData,
588: "Opening and ending tag mismatch: %s and %s\n",
589: new, ctxt->name);
590: ctxt->wellFormed = 0;
591: }
592: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
593: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 594: oldname = htmlnamePop(ctxt);
1.18 daniel 595: if (oldname != NULL) {
596: #ifdef DEBUG
597: fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
1.1 daniel 598: #endif
1.18 daniel 599: xmlFree(oldname);
600: }
1.1 daniel 601: }
602: }
603:
604: /************************************************************************
605: * *
606: * The list of HTML predefined entities *
607: * *
608: ************************************************************************/
609:
610:
611: htmlEntityDesc html40EntitiesTable[] = {
612: /*
613: * the 4 absolute ones,
614: */
615: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
616: { 38, "amp", "ampersand, U+0026 ISOnum" },
617: { 60, "lt", "less-than sign, U+003C ISOnum" },
618: { 62, "gt", "greater-than sign, U+003E ISOnum" },
619:
620: /*
621: * A bunch still in the 128-255 range
622: * Replacing them depend really on the charset used.
623: */
1.28 daniel 624: { 39, "apos", "single quote" },
1.1 daniel 625: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
626: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
627: { 162, "cent", "cent sign, U+00A2 ISOnum" },
628: { 163, "pound","pound sign, U+00A3 ISOnum" },
629: { 164, "curren","currency sign, U+00A4 ISOnum" },
630: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
631: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
632: { 167, "sect", "section sign, U+00A7 ISOnum" },
633: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
634: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
635: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
636: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
637: { 172, "not", "not sign, U+00AC ISOnum" },
638: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
639: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
640: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
641: { 176, "deg", "degree sign, U+00B0 ISOnum" },
642: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
643: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
644: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
645: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
646: { 181, "micro","micro sign, U+00B5 ISOnum" },
647: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 648: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 649: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
650: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
651: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 652: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 653: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
654: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
655: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
656: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
657: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
658: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
659: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
660: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
661: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
662: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
663: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
664: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
665: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
666: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
667: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
668: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
669: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
670: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
671: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
672: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
673: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
674: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
675: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
676: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
677: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
678: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
679: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
680: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 681: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 682: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
683: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
684: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
685: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
686: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
687: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
688: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
689: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
690: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
691: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
692: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
693: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
694: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
695: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
696: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
697: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
698: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
699: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
700: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
701: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
702: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
703: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
704: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
705: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
706: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
707: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
708: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
709: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
710: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
711: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
712: { 247, "divide","division sign, U+00F7 ISOnum" },
713: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
714: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
715: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
716: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
717: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
718: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
719: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
720: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
721:
722: /*
723: * Anything below should really be kept as entities references
724: */
725: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
726:
727: { 913, "Alpha","greek capital letter alpha, U+0391" },
728: { 914, "Beta", "greek capital letter beta, U+0392" },
729: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
730: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
731: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
732: { 918, "Zeta", "greek capital letter zeta, U+0396" },
733: { 919, "Eta", "greek capital letter eta, U+0397" },
734: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
735: { 921, "Iota", "greek capital letter iota, U+0399" },
736: { 922, "Kappa","greek capital letter kappa, U+039A" },
737: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
738: { 924, "Mu", "greek capital letter mu, U+039C" },
739: { 925, "Nu", "greek capital letter nu, U+039D" },
740: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
741: { 927, "Omicron","greek capital letter omicron, U+039F" },
742: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
743: { 929, "Rho", "greek capital letter rho, U+03A1" },
744: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
745: { 932, "Tau", "greek capital letter tau, U+03A4" },
746: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
747: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
748: { 935, "Chi", "greek capital letter chi, U+03A7" },
749: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
750: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
751:
752: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
753: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
754: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
755: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
756: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
757: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
758: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
759: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
760: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
761: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
762: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
763: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
764: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
765: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
766: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
767: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
768: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
769: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
770: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
771: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
772: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
773: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
774: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
775: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
776: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
777: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
778: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
779: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
780:
781: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
782: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
783: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
784: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
785: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
786: { 8260, "frasl","fraction slash, U+2044 NEW" },
787:
1.7 daniel 788: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 789: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
790: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
791: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
792: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
793: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
794: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
795: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
796: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
797: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
798: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
799: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
800: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
801: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
802: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
803: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
804:
805:
806: { 8704, "forall","for all, U+2200 ISOtech" },
807: { 8706, "part", "partial differential, U+2202 ISOtech" },
808: { 8707, "exist","there exists, U+2203 ISOtech" },
809: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
810: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
811: { 8712, "isin", "element of, U+2208 ISOtech" },
812: { 8713, "notin","not an element of, U+2209 ISOtech" },
813: { 8715, "ni", "contains as member, U+220B ISOtech" },
814: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
815: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
816: { 8722, "minus","minus sign, U+2212 ISOtech" },
817: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
818: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
819: { 8733, "prop", "proportional to, U+221D ISOtech" },
820: { 8734, "infin","infinity, U+221E ISOtech" },
821: { 8736, "ang", "angle, U+2220 ISOamso" },
822: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
823: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
824: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
825: { 8746, "cup", "union = cup, U+222A ISOtech" },
826: { 8747, "int", "integral, U+222B ISOtech" },
827: { 8756, "there4","therefore, U+2234 ISOtech" },
828: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
829: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
830: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
831: { 8800, "ne", "not equal to, U+2260 ISOtech" },
832: { 8801, "equiv","identical to, U+2261 ISOtech" },
833: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
834: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
835: { 8834, "sub", "subset of, U+2282 ISOtech" },
836: { 8835, "sup", "superset of, U+2283 ISOtech" },
837: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
838: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
839: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
840: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
841: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
842: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
843: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
844: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
845: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
846: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
847: { 8971, "rfloor","right floor, U+230B ISOamsc" },
848: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
849: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
850: { 9674, "loz", "lozenge, U+25CA ISOpub" },
851:
852: { 9824, "spades","black spade suit, U+2660 ISOpub" },
853: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
854: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
855: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
856:
857: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
858: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
859: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
860: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
861: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
862: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
863: { 732, "tilde","small tilde, U+02DC ISOdia" },
864:
865: { 8194, "ensp", "en space, U+2002 ISOpub" },
866: { 8195, "emsp", "em space, U+2003 ISOpub" },
867: { 8201, "thinsp","thin space, U+2009 ISOpub" },
868: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
869: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
870: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
871: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
872: { 8211, "ndash","en dash, U+2013 ISOpub" },
873: { 8212, "mdash","em dash, U+2014 ISOpub" },
874: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
875: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
876: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
877: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
878: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
879: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
880: { 8224, "dagger","dagger, U+2020 ISOpub" },
881: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
882: { 8240, "permil","per mille sign, U+2030 ISOtech" },
883: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1.7 daniel 884: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1.1 daniel 885: { 8364, "euro", "euro sign, U+20AC NEW" }
886: };
887:
888: /************************************************************************
889: * *
890: * Commodity functions to handle entities *
891: * *
892: ************************************************************************/
893:
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to that many xmlChar.
 * A variable named <buffer>_size must be in scope, and the macro may only
 * be used inside a function returning a pointer: if the reallocation
 * fails, the old block is released and the enclosing function returns
 * NULL.
 *
 * The result of xmlRealloc() goes through a temporary so the pointer to
 * the original block is not overwritten (and the block leaked) on failure.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
				  buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
905:
906: /**
907: * htmlEntityLookup:
908: * @name: the entity name
909: *
910: * Lookup the given entity in EntitiesTable
911: *
912: * TODO: the linear scan is really ugly, an hash table is really needed.
913: *
914: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
915: */
916: htmlEntityDescPtr
1.14 daniel 917: htmlEntityLookup(const xmlChar *name) {
1.1 daniel 918: int i;
919:
920: for (i = 0;i < (sizeof(html40EntitiesTable)/
921: sizeof(html40EntitiesTable[0]));i++) {
1.8 daniel 922: if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 923: #ifdef DEBUG
1.18 daniel 924: fprintf(stderr,"Found entity %s\n", name);
1.1 daniel 925: #endif
926: return(&html40EntitiesTable[i]);
927: }
928: }
929: return(NULL);
930: }
931:
932:
933: /**
934: * htmlDecodeEntities:
935: * @ctxt: the parser context
936: * @len: the len to decode (in bytes !), -1 for no size limit
1.14 daniel 937: * @end: an end marker xmlChar, 0 if none
938: * @end2: an end marker xmlChar, 0 if none
939: * @end3: an end marker xmlChar, 0 if none
1.1 daniel 940: *
941: * Subtitute the HTML entities by their value
942: *
1.19 daniel 943: * DEPRECATED !!!!
1.1 daniel 944: *
945: * Returns A newly allocated string with the substitution done. The caller
946: * must deallocate it !
947: */
1.14 daniel 948: xmlChar *
1.1 daniel 949: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1.14 daniel 950: xmlChar end, xmlChar end2, xmlChar end3) {
951: xmlChar *buffer = NULL;
1.1 daniel 952: int buffer_size = 0;
1.14 daniel 953: xmlChar *out = NULL;
954: xmlChar *name = NULL;
1.1 daniel 955:
1.14 daniel 956: xmlChar *cur = NULL;
1.1 daniel 957: htmlEntityDescPtr ent;
1.5 daniel 958: int nbchars = 0;
1.1 daniel 959: unsigned int max = (unsigned int) len;
960:
961: /*
962: * allocate a translation buffer.
963: */
1.31 daniel 964: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1.14 daniel 965: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1.1 daniel 966: if (buffer == NULL) {
967: perror("htmlDecodeEntities: malloc failed");
968: return(NULL);
969: }
970: out = buffer;
971:
972: /*
973: * Ok loop until we reach one of the ending char or a size limit.
974: */
1.5 daniel 975: while ((nbchars < max) && (CUR != end) &&
1.1 daniel 976: (CUR != end2) && (CUR != end3)) {
977:
978: if (CUR == '&') {
979: if (NXT(1) == '#') {
980: int val = htmlParseCharRef(ctxt);
1.8 daniel 981: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 982: *out++ = val;
1.5 daniel 983: nbchars += 3; /* !!!! */
1.1 daniel 984: } else {
985: ent = htmlParseEntityRef(ctxt, &name);
986: if (name != NULL) {
987: if ((ent == NULL) || (ent->value <= 0) ||
988: (ent->value >= 255)) {
989: *out++ = '&';
990: cur = name;
991: while (*cur != 0) {
992: if (out - buffer > buffer_size - 100) {
993: int index = out - buffer;
994:
995: growBuffer(buffer);
996: out = &buffer[index];
997: }
998: *out++ = *cur++;
999: }
1000: *out++ = ';';
1001: } else {
1.8 daniel 1002: /* invalid for UTF-8 variable encoding !!!!! */
1.14 daniel 1003: *out++ = (xmlChar)ent->value;
1.1 daniel 1004: if (out - buffer > buffer_size - 100) {
1005: int index = out - buffer;
1006:
1007: growBuffer(buffer);
1008: out = &buffer[index];
1009: }
1010: }
1.5 daniel 1011: nbchars += 2 + xmlStrlen(name);
1.11 daniel 1012: xmlFree(name);
1.1 daniel 1013: }
1014: }
1015: } else {
1.8 daniel 1016: /* invalid for UTF-8 , use COPY(out); !!!!! */
1.1 daniel 1017: *out++ = CUR;
1.5 daniel 1018: nbchars++;
1.1 daniel 1019: if (out - buffer > buffer_size - 100) {
1020: int index = out - buffer;
1021:
1022: growBuffer(buffer);
1023: out = &buffer[index];
1024: }
1025: NEXT;
1026: }
1027: }
1028: *out++ = 0;
1029: return(buffer);
1030: }
1031:
1032:
1033: /************************************************************************
1034: * *
1035: * Commodity functions to handle encodings *
1036: * *
1037: ************************************************************************/
1038:
1039: /**
1040: * htmlSwitchEncoding:
1041: * @ctxt: the parser context
1042: * @len: the len of @cur
1043: *
1044: * change the input functions when discovering the character encoding
1045: * of a given entity.
1046: *
1047: */
1048: void
1049: htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
1050: {
1051: switch (enc) {
1052: case XML_CHAR_ENCODING_ERROR:
1053: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1054: ctxt->sax->error(ctxt->userData, "encoding unknown\n");
1055: ctxt->wellFormed = 0;
1056: break;
1057: case XML_CHAR_ENCODING_NONE:
1058: /* let's assume it's UTF-8 without the XML decl */
1059: return;
1060: case XML_CHAR_ENCODING_UTF8:
1061: /* default encoding, no conversion should be needed */
1062: return;
1063: case XML_CHAR_ENCODING_UTF16LE:
1064: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1065: ctxt->sax->error(ctxt->userData,
1066: "char encoding UTF16 little endian not supported\n");
1067: break;
1068: case XML_CHAR_ENCODING_UTF16BE:
1069: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1070: ctxt->sax->error(ctxt->userData,
1071: "char encoding UTF16 big endian not supported\n");
1072: break;
1073: case XML_CHAR_ENCODING_UCS4LE:
1074: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1075: ctxt->sax->error(ctxt->userData,
1076: "char encoding USC4 little endian not supported\n");
1077: break;
1078: case XML_CHAR_ENCODING_UCS4BE:
1079: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1080: ctxt->sax->error(ctxt->userData,
1081: "char encoding USC4 big endian not supported\n");
1082: break;
1083: case XML_CHAR_ENCODING_EBCDIC:
1084: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1085: ctxt->sax->error(ctxt->userData,
1086: "char encoding EBCDIC not supported\n");
1087: break;
1088: case XML_CHAR_ENCODING_UCS4_2143:
1089: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1090: ctxt->sax->error(ctxt->userData,
1091: "char encoding UCS4 2143 not supported\n");
1092: break;
1093: case XML_CHAR_ENCODING_UCS4_3412:
1094: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1095: ctxt->sax->error(ctxt->userData,
1096: "char encoding UCS4 3412 not supported\n");
1097: break;
1098: case XML_CHAR_ENCODING_UCS2:
1099: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1100: ctxt->sax->error(ctxt->userData,
1101: "char encoding UCS2 not supported\n");
1102: break;
1103: case XML_CHAR_ENCODING_8859_1:
1104: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1105: ctxt->sax->error(ctxt->userData,
1106: "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
1107: break;
1108: case XML_CHAR_ENCODING_8859_2:
1109: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1110: ctxt->sax->error(ctxt->userData,
1111: "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
1112: break;
1113: case XML_CHAR_ENCODING_8859_3:
1114: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1115: ctxt->sax->error(ctxt->userData,
1116: "char encoding ISO_8859_3 not supported\n");
1117: break;
1118: case XML_CHAR_ENCODING_8859_4:
1119: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1120: ctxt->sax->error(ctxt->userData,
1121: "char encoding ISO_8859_4 not supported\n");
1122: break;
1123: case XML_CHAR_ENCODING_8859_5:
1124: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1125: ctxt->sax->error(ctxt->userData,
1126: "char encoding ISO_8859_5 not supported\n");
1127: break;
1128: case XML_CHAR_ENCODING_8859_6:
1129: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1130: ctxt->sax->error(ctxt->userData,
1131: "char encoding ISO_8859_6 not supported\n");
1132: break;
1133: case XML_CHAR_ENCODING_8859_7:
1134: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1135: ctxt->sax->error(ctxt->userData,
1136: "char encoding ISO_8859_7 not supported\n");
1137: break;
1138: case XML_CHAR_ENCODING_8859_8:
1139: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1140: ctxt->sax->error(ctxt->userData,
1141: "char encoding ISO_8859_8 not supported\n");
1142: break;
1143: case XML_CHAR_ENCODING_8859_9:
1144: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1145: ctxt->sax->error(ctxt->userData,
1146: "char encoding ISO_8859_9 not supported\n");
1147: break;
1148: case XML_CHAR_ENCODING_2022_JP:
1149: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1150: ctxt->sax->error(ctxt->userData,
1151: "char encoding ISO-2022-JPnot supported\n");
1152: break;
1153: case XML_CHAR_ENCODING_SHIFT_JIS:
1154: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1155: ctxt->sax->error(ctxt->userData,
1156: "char encoding Shift_JISnot supported\n");
1157: break;
1158: case XML_CHAR_ENCODING_EUC_JP:
1159: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1160: ctxt->sax->error(ctxt->userData,
1161: "char encoding EUC-JPnot supported\n");
1162: break;
1163: }
1164: }
1165:
1.31 daniel 1166: /************************************************************************
1167: * *
1168: * Commodity functions to handle streams *
1169: * *
1170: ************************************************************************/
1171:
1172: /**
1173: * htmlFreeInputStream:
1174: * @input: an htmlParserInputPtr
1175: *
1176: * Free up an input stream.
1177: */
1178: void
1179: htmlFreeInputStream(htmlParserInputPtr input) {
1180: if (input == NULL) return;
1181:
1182: if (input->filename != NULL) xmlFree((char *) input->filename);
1183: if (input->directory != NULL) xmlFree((char *) input->directory);
1184: if ((input->free != NULL) && (input->base != NULL))
1185: input->free((xmlChar *) input->base);
1186: if (input->buf != NULL)
1187: xmlFreeParserInputBuffer(input->buf);
1188: memset(input, -1, sizeof(htmlParserInput));
1189: xmlFree(input);
1190: }
1191:
1192: /**
1193: * htmlNewInputStream:
1194: * @ctxt: an HTML parser context
1195: *
1196: * Create a new input stream structure
1197: * Returns the new input stream or NULL
1198: */
1199: htmlParserInputPtr
1200: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1201: htmlParserInputPtr input;
1202:
1203: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1204: if (input == NULL) {
1205: ctxt->errNo = XML_ERR_NO_MEMORY;
1206: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1207: ctxt->sax->error(ctxt->userData,
1208: "malloc: couldn't allocate a new input stream\n");
1209: ctxt->errNo = XML_ERR_NO_MEMORY;
1210: return(NULL);
1211: }
1212: input->filename = NULL;
1213: input->directory = NULL;
1214: input->base = NULL;
1215: input->cur = NULL;
1216: input->buf = NULL;
1217: input->line = 1;
1218: input->col = 1;
1219: input->buf = NULL;
1220: input->free = NULL;
1221: input->consumed = 0;
1222: input->length = 0;
1223: return(input);
1224: }
1225:
1.1 daniel 1226:
1227: /************************************************************************
1228: * *
1229: * Commodity functions, cleanup needed ? *
1230: * *
1231: ************************************************************************/
1232:
1233: /**
1234: * areBlanks:
1235: * @ctxt: an HTML parser context
1.14 daniel 1236: * @str: a xmlChar *
1.1 daniel 1237: * @len: the size of @str
1238: *
1239: * Is this a sequence of blank chars that one can ignore ?
1240: *
1241: * Returns 1 if ignorable 0 otherwise.
1242: */
1243:
1.14 daniel 1244: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1.1 daniel 1245: int i;
1246: xmlNodePtr lastChild;
1247:
1248: for (i = 0;i < len;i++)
1249: if (!(IS_BLANK(str[i]))) return(0);
1250:
1251: if (CUR != '<') return(0);
1252: if (ctxt->node == NULL) return(0);
1253: lastChild = xmlGetLastChild(ctxt->node);
1254: if (lastChild == NULL) {
1255: if (ctxt->node->content != NULL) return(0);
1256: } else if (xmlNodeIsText(lastChild))
1257: return(0);
1258: return(1);
1259: }
1260:
1261: /**
1262: * htmlHandleEntity:
1263: * @ctxt: an HTML parser context
1264: * @entity: an XML entity pointer.
1265: *
1266: * Default handling of an HTML entity, call the parser with the
1267: * substitution string
1268: */
1269:
1270: void
1271: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1272: int len;
1273:
1274: if (entity->content == NULL) {
1275: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1276: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1277: entity->name);
1278: ctxt->wellFormed = 0;
1279: return;
1280: }
1281: len = xmlStrlen(entity->content);
1282:
1283: /*
1284: * Just handle the content as a set of chars.
1285: */
1286: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1287: ctxt->sax->characters(ctxt->userData, entity->content, len);
1288:
1289: }
1290:
1291: /**
1292: * htmlNewDoc:
1293: * @URI: URI for the dtd, or NULL
1294: * @ExternalID: the external ID of the DTD, or NULL
1295: *
1296: * Returns a new document
1297: */
1298: htmlDocPtr
1.14 daniel 1299: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1.1 daniel 1300: xmlDocPtr cur;
1301:
1302: /*
1303: * Allocate a new document and fill the fields.
1304: */
1.11 daniel 1305: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1.1 daniel 1306: if (cur == NULL) {
1307: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1308: return(NULL);
1309: }
1.10 daniel 1310: memset(cur, 0, sizeof(xmlDoc));
1.1 daniel 1311:
1.20 daniel 1312: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 1313: cur->version = NULL;
1314: cur->intSubset = NULL;
1.28 daniel 1315: if ((ExternalID == NULL) &&
1316: (URI == NULL))
1317: xmlCreateIntSubset(cur, BAD_CAST "HTML",
1318: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1319: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1320: else
1321: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.1 daniel 1322: cur->name = NULL;
1.37 daniel 1323: cur->children = NULL;
1.1 daniel 1324: cur->extSubset = NULL;
1325: cur->oldNs = NULL;
1326: cur->encoding = NULL;
1327: cur->standalone = 1;
1328: cur->compression = 0;
1.12 daniel 1329: cur->ids = NULL;
1330: cur->refs = NULL;
1.1 daniel 1331: #ifndef XML_WITHOUT_CORBA
1332: cur->_private = NULL;
1333: #endif
1334: return(cur);
1335: }
1336:
1337:
1338: /************************************************************************
1339: * *
1340: * The parser itself *
1341: * Relates to http://www.w3.org/TR/html40 *
1342: * *
1343: ************************************************************************/
1344:
1345: /************************************************************************
1346: * *
1347: * The parser itself *
1348: * *
1349: ************************************************************************/
1350:
1351: /**
1352: * htmlParseHTMLName:
1353: * @ctxt: an HTML parser context
1354: *
1.26 daniel 1355: * parse an HTML tag or attribute name, note that we convert it to lowercase
1.1 daniel 1356: * since HTML names are not case-sensitive.
1357: *
1358: * Returns the Tag Name parsed or NULL
1359: */
1360:
1.14 daniel 1361: xmlChar *
1.1 daniel 1362: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1363: xmlChar *ret = NULL;
1.1 daniel 1364: int i = 0;
1.31 daniel 1365: xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1.1 daniel 1366:
1367: if (!IS_LETTER(CUR) && (CUR != '_') &&
1368: (CUR != ':')) return(NULL);
1369:
1.31 daniel 1370: while ((i < HTML_PARSER_BUFFER_SIZE) &&
1371: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
1.26 daniel 1372: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1.1 daniel 1373: else loc[i] = CUR;
1374: i++;
1375:
1376: NEXT;
1377: }
1378:
1379: ret = xmlStrndup(loc, i);
1380:
1381: return(ret);
1382: }
1383:
1384: /**
1385: * htmlParseName:
1386: * @ctxt: an HTML parser context
1387: *
1388: * parse an HTML name, this routine is case sensistive.
1389: *
1390: * Returns the Name parsed or NULL
1391: */
1392:
1.14 daniel 1393: xmlChar *
1.1 daniel 1394: htmlParseName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1395: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1396: int len = 0;
1.1 daniel 1397:
1.5 daniel 1398: GROW;
1399: if (!IS_LETTER(CUR) && (CUR != '_')) {
1400: return(NULL);
1401: }
1.1 daniel 1402:
1403: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1404: (CUR == '.') || (CUR == '-') ||
1405: (CUR == '_') || (CUR == ':') ||
1406: (IS_COMBINING(CUR)) ||
1.5 daniel 1407: (IS_EXTENDER(CUR))) {
1408: buf[len++] = CUR;
1.1 daniel 1409: NEXT;
1.5 daniel 1410: if (len >= HTML_MAX_NAMELEN) {
1411: fprintf(stderr,
1412: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1413: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1414: (CUR == '.') || (CUR == '-') ||
1415: (CUR == '_') || (CUR == ':') ||
1416: (IS_COMBINING(CUR)) ||
1417: (IS_EXTENDER(CUR)))
1418: NEXT;
1419: break;
1420: }
1421: }
1422: return(xmlStrndup(buf, len));
1.1 daniel 1423: }
1424:
1425: /**
1426: * htmlParseHTMLAttribute:
1427: * @ctxt: an HTML parser context
1.19 daniel 1428: * @stop: a char stop value
1.1 daniel 1429: *
1.19 daniel 1430: * parse an HTML attribute value till the stop (quote), if
1431: * stop is 0 then it stops at the first space
1.1 daniel 1432: *
1.19 daniel 1433: * Returns the attribute parsed or NULL
1.1 daniel 1434: */
1435:
1.14 daniel 1436: xmlChar *
1.19 daniel 1437: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1.32 daniel 1438: #if 0
1.14 daniel 1439: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1440: int len = 0;
1.1 daniel 1441:
1.5 daniel 1442: GROW;
1.19 daniel 1443: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1444: if ((stop == 0) && (IS_BLANK(CUR))) break;
1.5 daniel 1445: buf[len++] = CUR;
1.1 daniel 1446: NEXT;
1.5 daniel 1447: if (len >= HTML_MAX_NAMELEN) {
1448: fprintf(stderr,
1449: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1450: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1.19 daniel 1451: (CUR != '>') &&
1.5 daniel 1452: (CUR != '\'') && (CUR != '"'))
1453: NEXT;
1454: break;
1455: }
1456: }
1457: return(xmlStrndup(buf, len));
1.32 daniel 1458: #else
1459: xmlChar *buffer = NULL;
1460: int buffer_size = 0;
1461: xmlChar *out = NULL;
1462: xmlChar *name = NULL;
1463:
1464: xmlChar *cur = NULL;
1465: htmlEntityDescPtr ent;
1466:
1467: /*
1468: * allocate a translation buffer.
1469: */
1470: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1471: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1472: if (buffer == NULL) {
1473: perror("htmlParseHTMLAttribute: malloc failed");
1474: return(NULL);
1475: }
1476: out = buffer;
1477:
1478: /*
1479: * Ok loop until we reach one of the ending chars
1480: */
1481: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1482: if ((stop == 0) && (IS_BLANK(CUR))) break;
1483: if (CUR == '&') {
1484: if (NXT(1) == '#') {
1485: int val = htmlParseCharRef(ctxt);
1486: *out++ = val;
1487: } else {
1488: ent = htmlParseEntityRef(ctxt, &name);
1489: if (name == NULL) {
1490: *out++ = '&';
1491: if (out - buffer > buffer_size - 100) {
1492: int index = out - buffer;
1493:
1494: growBuffer(buffer);
1495: out = &buffer[index];
1496: }
1497: } else if ((ent == NULL) || (ent->value <= 0) ||
1498: (ent->value >= 255)) {
1499: *out++ = '&';
1500: cur = name;
1501: while (*cur != 0) {
1502: if (out - buffer > buffer_size - 100) {
1503: int index = out - buffer;
1504:
1505: growBuffer(buffer);
1506: out = &buffer[index];
1507: }
1508: *out++ = *cur++;
1509: }
1510: xmlFree(name);
1511: } else {
1512: *out++ = ent->value;
1513: if (out - buffer > buffer_size - 100) {
1514: int index = out - buffer;
1515:
1516: growBuffer(buffer);
1517: out = &buffer[index];
1518: }
1519: xmlFree(name);
1520: }
1521: }
1522: } else {
1523: *out++ = CUR;
1524: if (out - buffer > buffer_size - 100) {
1525: int index = out - buffer;
1526:
1527: growBuffer(buffer);
1528: out = &buffer[index];
1529: }
1530: NEXT;
1531: }
1532: }
1533: *out++ = 0;
1534: return(buffer);
1535: #endif
1.1 daniel 1536: }
1537:
1538: /**
1539: * htmlParseNmtoken:
1540: * @ctxt: an HTML parser context
1541: *
1542: * parse an HTML Nmtoken.
1543: *
1544: * Returns the Nmtoken parsed or NULL
1545: */
1546:
1.14 daniel 1547: xmlChar *
1.1 daniel 1548: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.14 daniel 1549: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1550: int len = 0;
1.1 daniel 1551:
1.5 daniel 1552: GROW;
1.1 daniel 1553: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1554: (CUR == '.') || (CUR == '-') ||
1555: (CUR == '_') || (CUR == ':') ||
1556: (IS_COMBINING(CUR)) ||
1.5 daniel 1557: (IS_EXTENDER(CUR))) {
1558: buf[len++] = CUR;
1.1 daniel 1559: NEXT;
1.5 daniel 1560: if (len >= HTML_MAX_NAMELEN) {
1561: fprintf(stderr,
1562: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1563: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1564: (CUR == '.') || (CUR == '-') ||
1565: (CUR == '_') || (CUR == ':') ||
1566: (IS_COMBINING(CUR)) ||
1567: (IS_EXTENDER(CUR)))
1568: NEXT;
1569: break;
1570: }
1571: }
1572: return(xmlStrndup(buf, len));
1.1 daniel 1573: }
1574:
1575: /**
1576: * htmlParseEntityRef:
1577: * @ctxt: an HTML parser context
1578: * @str: location to store the entity name
1579: *
1580: * parse an HTML ENTITY references
1581: *
1582: * [68] EntityRef ::= '&' Name ';'
1583: *
1584: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1585: * if non-NULL *str will have to be freed by the caller.
1586: */
1587: htmlEntityDescPtr
1.14 daniel 1588: htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1589: xmlChar *name;
1.1 daniel 1590: htmlEntityDescPtr ent = NULL;
1591: *str = NULL;
1592:
1593: if (CUR == '&') {
1594: NEXT;
1595: name = htmlParseName(ctxt);
1596: if (name == NULL) {
1597: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1598: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1599: ctxt->wellFormed = 0;
1600: } else {
1.5 daniel 1601: GROW;
1.1 daniel 1602: if (CUR == ';') {
1603: *str = name;
1604:
1605: /*
1606: * Lookup the entity in the table.
1607: */
1608: ent = htmlEntityLookup(name);
1.32 daniel 1609: if (ent != NULL) /* OK that's ugly !!! */
1610: NEXT;
1.1 daniel 1611: } else {
1612: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1613: ctxt->sax->error(ctxt->userData,
1614: "htmlParseEntityRef: expecting ';'\n");
1.32 daniel 1615: *str = name;
1.1 daniel 1616: }
1617: }
1618: }
1619: return(ent);
1620: }
1621:
1622: /**
1623: * htmlParseAttValue:
1624: * @ctxt: an HTML parser context
1625: *
1626: * parse a value for an attribute
1627: * Note: the parser won't do substitution of entities here, this
1628: * will be handled later in xmlStringGetNodeList, unless it was
1629: * asked for ctxt->replaceEntities != 0
1630: *
1631: * Returns the AttValue parsed or NULL.
1632: */
1633:
1.14 daniel 1634: xmlChar *
1.1 daniel 1635: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1.14 daniel 1636: xmlChar *ret = NULL;
1.1 daniel 1637:
1638: if (CUR == '"') {
1639: NEXT;
1.19 daniel 1640: ret = htmlParseHTMLAttribute(ctxt, '"');
1.1 daniel 1641: if (CUR != '"') {
1642: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1644: ctxt->wellFormed = 0;
1645: } else
1646: NEXT;
1647: } else if (CUR == '\'') {
1648: NEXT;
1.19 daniel 1649: ret = htmlParseHTMLAttribute(ctxt, '\'');
1.1 daniel 1650: if (CUR != '\'') {
1651: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1652: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1653: ctxt->wellFormed = 0;
1654: } else
1655: NEXT;
1656: } else {
1657: /*
1658: * That's an HTMLism, the attribute value may not be quoted
1659: */
1.19 daniel 1660: ret = htmlParseHTMLAttribute(ctxt, 0);
1.1 daniel 1661: if (ret == NULL) {
1662: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1663: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1664: ctxt->wellFormed = 0;
1665: }
1666: }
1667: return(ret);
1668: }
1669:
1670: /**
1671: * htmlParseSystemLiteral:
1672: * @ctxt: an HTML parser context
1673: *
1674: * parse an HTML Literal
1675: *
1676: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1677: *
1678: * Returns the SystemLiteral parsed or NULL
1679: */
1680:
1.14 daniel 1681: xmlChar *
1.1 daniel 1682: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1683: const xmlChar *q;
1684: xmlChar *ret = NULL;
1.1 daniel 1685:
1686: if (CUR == '"') {
1687: NEXT;
1688: q = CUR_PTR;
1689: while ((IS_CHAR(CUR)) && (CUR != '"'))
1690: NEXT;
1691: if (!IS_CHAR(CUR)) {
1692: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1693: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1694: ctxt->wellFormed = 0;
1695: } else {
1696: ret = xmlStrndup(q, CUR_PTR - q);
1697: NEXT;
1698: }
1699: } else if (CUR == '\'') {
1700: NEXT;
1701: q = CUR_PTR;
1702: while ((IS_CHAR(CUR)) && (CUR != '\''))
1703: NEXT;
1704: if (!IS_CHAR(CUR)) {
1705: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1706: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1707: ctxt->wellFormed = 0;
1708: } else {
1709: ret = xmlStrndup(q, CUR_PTR - q);
1710: NEXT;
1711: }
1712: } else {
1713: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.38 ! daniel 1714: ctxt->sax->error(ctxt->userData,
! 1715: "SystemLiteral \" or ' expected\n");
1.1 daniel 1716: ctxt->wellFormed = 0;
1717: }
1718:
1719: return(ret);
1720: }
1721:
1722: /**
1723: * htmlParsePubidLiteral:
1724: * @ctxt: an HTML parser context
1725: *
1726: * parse an HTML public literal
1727: *
1728: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1729: *
1730: * Returns the PubidLiteral parsed or NULL.
1731: */
1732:
1.14 daniel 1733: xmlChar *
1.1 daniel 1734: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1735: const xmlChar *q;
1736: xmlChar *ret = NULL;
1.1 daniel 1737: /*
1738: * Name ::= (Letter | '_') (NameChar)*
1739: */
1740: if (CUR == '"') {
1741: NEXT;
1742: q = CUR_PTR;
1743: while (IS_PUBIDCHAR(CUR)) NEXT;
1744: if (CUR != '"') {
1745: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1746: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1747: ctxt->wellFormed = 0;
1748: } else {
1749: ret = xmlStrndup(q, CUR_PTR - q);
1750: NEXT;
1751: }
1752: } else if (CUR == '\'') {
1753: NEXT;
1754: q = CUR_PTR;
1755: while ((IS_LETTER(CUR)) && (CUR != '\''))
1756: NEXT;
1757: if (!IS_LETTER(CUR)) {
1758: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1759: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1760: ctxt->wellFormed = 0;
1761: } else {
1762: ret = xmlStrndup(q, CUR_PTR - q);
1763: NEXT;
1764: }
1765: } else {
1766: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1767: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1768: ctxt->wellFormed = 0;
1769: }
1770:
1771: return(ret);
1772: }
1773:
1774: /**
1775: * htmlParseCharData:
1776: * @ctxt: an HTML parser context
1777: * @cdata: int indicating whether we are within a CDATA section
1778: *
1779: * parse a CharData section.
1780: * if we are within a CDATA section ']]>' marks an end of section.
1781: *
1782: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1783: */
1784:
1785: void
1786: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1.25 daniel 1787: xmlChar *buf = NULL;
1788: int len = 0;
1.31 daniel 1789: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1790: xmlChar q;
1791:
1792: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1793: if (buf == NULL) {
1794: fprintf(stderr, "malloc of %d byte failed\n", size);
1795: return;
1796: }
1.1 daniel 1797:
1.25 daniel 1798: q = CUR;
1799: while ((IS_CHAR(q)) && (q != '<') &&
1800: (q != '&')) {
1801: if ((q == ']') && (NXT(1) == ']') &&
1.1 daniel 1802: (NXT(2) == '>')) {
1803: if (cdata) break;
1804: else {
1805: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1806: ctxt->sax->error(ctxt->userData,
1807: "Sequence ']]>' not allowed in content\n");
1808: ctxt->wellFormed = 0;
1809: }
1810: }
1.25 daniel 1811: if (len + 1 >= size) {
1812: size *= 2;
1813: buf = xmlRealloc(buf, size * sizeof(xmlChar));
1814: if (buf == NULL) {
1815: fprintf(stderr, "realloc of %d byte failed\n", size);
1816: return;
1817: }
1818: }
1819: buf[len++] = q;
1.1 daniel 1820: NEXT;
1.25 daniel 1821: q = CUR;
1822: }
1823: if (len == 0) {
1824: xmlFree(buf);
1825: return;
1.1 daniel 1826: }
1827:
1828: /*
1.25 daniel 1829: * Ok the buffer is to be consumed as chars.
1.1 daniel 1830: */
1831: if (ctxt->sax != NULL) {
1.25 daniel 1832: if (areBlanks(ctxt, buf, len)) {
1.1 daniel 1833: if (ctxt->sax->ignorableWhitespace != NULL)
1.25 daniel 1834: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
1.1 daniel 1835: } else {
1836: if (ctxt->sax->characters != NULL)
1.25 daniel 1837: ctxt->sax->characters(ctxt->userData, buf, len);
1.1 daniel 1838: }
1839: }
1.25 daniel 1840: xmlFree(buf);
1.1 daniel 1841: }
1842:
1843: /**
1844: * htmlParseExternalID:
1845: * @ctxt: an HTML parser context
1.14 daniel 1846: * @publicID: a xmlChar** receiving PubidLiteral
1.1 daniel 1847: * @strict: indicate whether we should restrict parsing to only
1848: * production [75], see NOTE below
1849: *
1850: * Parse an External ID or a Public ID
1851: *
1852: * NOTE: Productions [75] and [83] interract badly since [75] can generate
1853: * 'PUBLIC' S PubidLiteral S SystemLiteral
1854: *
1855: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1856: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1857: *
1858: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1859: *
1860: * Returns the function returns SystemLiteral and in the second
1861: * case publicID receives PubidLiteral, is strict is off
1862: * it is possible to return NULL and have publicID set.
1863: */
1864:
1.14 daniel 1865: xmlChar *
1866: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1867: xmlChar *URI = NULL;
1.1 daniel 1868:
1869: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1870: (UPP(2) == 'S') && (UPP(3) == 'T') &&
1871: (UPP(4) == 'E') && (UPP(5) == 'M')) {
1872: SKIP(6);
1873: if (!IS_BLANK(CUR)) {
1874: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1875: ctxt->sax->error(ctxt->userData,
1876: "Space required after 'SYSTEM'\n");
1877: ctxt->wellFormed = 0;
1878: }
1879: SKIP_BLANKS;
1880: URI = htmlParseSystemLiteral(ctxt);
1881: if (URI == NULL) {
1882: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1883: ctxt->sax->error(ctxt->userData,
1884: "htmlParseExternalID: SYSTEM, no URI\n");
1885: ctxt->wellFormed = 0;
1886: }
1887: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1888: (UPP(2) == 'B') && (UPP(3) == 'L') &&
1889: (UPP(4) == 'I') && (UPP(5) == 'C')) {
1890: SKIP(6);
1891: if (!IS_BLANK(CUR)) {
1892: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1893: ctxt->sax->error(ctxt->userData,
1894: "Space required after 'PUBLIC'\n");
1895: ctxt->wellFormed = 0;
1896: }
1897: SKIP_BLANKS;
1898: *publicID = htmlParsePubidLiteral(ctxt);
1899: if (*publicID == NULL) {
1900: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1901: ctxt->sax->error(ctxt->userData,
1902: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1903: ctxt->wellFormed = 0;
1904: }
1.5 daniel 1905: SKIP_BLANKS;
1906: if ((CUR == '"') || (CUR == '\'')) {
1907: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 1908: }
1909: }
1910: return(URI);
1911: }
1912:
1913: /**
1914: * htmlParseComment:
1915: * @ctxt: an HTML parser context
1916: *
1917: * Parse an XML (SGML) comment <!-- .... -->
1918: *
1919: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1920: */
1921: void
1.31 daniel 1922: htmlParseComment(htmlParserCtxtPtr ctxt) {
1.25 daniel 1923: xmlChar *buf = NULL;
1924: int len = 0;
1.31 daniel 1925: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1926: register xmlChar s, r, q;
1.1 daniel 1927:
1928: /*
1929: * Check that there is a comment right here.
1930: */
1931: if ((CUR != '<') || (NXT(1) != '!') ||
1932: (NXT(2) != '-') || (NXT(3) != '-')) return;
1933:
1.25 daniel 1934: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1935: if (buf == NULL) {
1936: fprintf(stderr, "malloc of %d byte failed\n", size);
1937: return;
1938: }
1939: q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1.1 daniel 1940: SKIP(4);
1.25 daniel 1941: s = CUR;
1942:
1943: while (IS_CHAR(s) &&
1944: ((s != '>') || (r != '-') || (q != '-'))) {
1945: if (len + 1 >= size) {
1946: size *= 2;
1947: buf = xmlRealloc(buf, size * sizeof(xmlChar));
1948: if (buf == NULL) {
1949: fprintf(stderr, "realloc of %d byte failed\n", size);
1950: return;
1951: }
1952: }
1953: buf[len++] = s;
1954: NEXT;
1955: q = r;
1956: r = s;
1957: s = CUR;
1.1 daniel 1958: }
1.25 daniel 1959: buf[len - 2] = 0;
1960: if (!IS_CHAR(s)) {
1.1 daniel 1961: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.25 daniel 1962: ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
1.1 daniel 1963: ctxt->wellFormed = 0;
1964: } else {
1965: NEXT;
1.31 daniel 1966: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
1967: ctxt->sax->comment(ctxt->userData, buf);
1.1 daniel 1968: }
1969: }
1.25 daniel 1970: xmlFree(buf);
1.1 daniel 1971: }
1972:
1973: /**
1974: * htmlParseCharRef:
1975: * @ctxt: an HTML parser context
1976: *
1977: * parse Reference declarations
1978: *
1979: * [66] CharRef ::= '&#' [0-9]+ ';' |
1980: * '&#x' [0-9a-fA-F]+ ';'
1981: *
1982: * Returns the value parsed (as an int)
1983: */
1984: int
1985: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
1986: int val = 0;
1987:
1988: if ((CUR == '&') && (NXT(1) == '#') &&
1989: (NXT(2) == 'x')) {
1990: SKIP(3);
1991: while (CUR != ';') {
1992: if ((CUR >= '0') && (CUR <= '9'))
1993: val = val * 16 + (CUR - '0');
1994: else if ((CUR >= 'a') && (CUR <= 'f'))
1995: val = val * 16 + (CUR - 'a') + 10;
1996: else if ((CUR >= 'A') && (CUR <= 'F'))
1997: val = val * 16 + (CUR - 'A') + 10;
1998: else {
1999: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2000: ctxt->sax->error(ctxt->userData,
2001: "htmlParseCharRef: invalid hexadecimal value\n");
2002: ctxt->wellFormed = 0;
2003: val = 0;
2004: break;
2005: }
2006: NEXT;
2007: }
2008: if (CUR == ';')
2009: NEXT;
2010: } else if ((CUR == '&') && (NXT(1) == '#')) {
2011: SKIP(2);
2012: while (CUR != ';') {
2013: if ((CUR >= '0') && (CUR <= '9'))
2014: val = val * 10 + (CUR - '0');
2015: else {
2016: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2017: ctxt->sax->error(ctxt->userData,
2018: "htmlParseCharRef: invalid decimal value\n");
2019: ctxt->wellFormed = 0;
2020: val = 0;
2021: break;
2022: }
2023: NEXT;
2024: }
2025: if (CUR == ';')
2026: NEXT;
2027: } else {
2028: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2029: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2030: ctxt->wellFormed = 0;
2031: }
2032: /*
2033: * Check the value IS_CHAR ...
2034: */
2035: if (IS_CHAR(val)) {
2036: return(val);
2037: } else {
2038: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.14 daniel 2039: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
1.1 daniel 2040: val);
2041: ctxt->wellFormed = 0;
2042: }
2043: return(0);
2044: }
2045:
2046:
2047: /**
2048: * htmlParseDocTypeDecl :
2049: * @ctxt: an HTML parser context
2050: *
2051: * parse a DOCTYPE declaration
2052: *
2053: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2054: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2055: */
2056:
2057: void
2058: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1.14 daniel 2059: xmlChar *name;
2060: xmlChar *ExternalID = NULL;
2061: xmlChar *URI = NULL;
1.1 daniel 2062:
2063: /*
2064: * We know that '<!DOCTYPE' has been detected.
2065: */
2066: SKIP(9);
2067:
2068: SKIP_BLANKS;
2069:
2070: /*
2071: * Parse the DOCTYPE name.
2072: */
2073: name = htmlParseName(ctxt);
2074: if (name == NULL) {
2075: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2076: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2077: ctxt->wellFormed = 0;
2078: }
2079: /*
2080: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2081: */
2082:
2083: SKIP_BLANKS;
2084:
2085: /*
2086: * Check for SystemID and ExternalID
2087: */
1.5 daniel 2088: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 2089: SKIP_BLANKS;
2090:
2091: /*
2092: * We should be at the end of the DOCTYPE declaration.
2093: */
2094: if (CUR != '>') {
2095: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2096: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2097: ctxt->wellFormed = 0;
2098: /* We shouldn't try to resynchronize ... */
2099: } else {
2100: }
2101: NEXT;
2102:
2103: /*
2104: * Create the document accordingly to the DOCTYPE
2105: */
1.31 daniel 2106: if (ctxt->myDoc != NULL)
2107: xmlFreeDoc(ctxt->myDoc);
2108:
1.1 daniel 2109: ctxt->myDoc = htmlNewDoc(URI, ExternalID);
2110:
2111: /*
2112: * Cleanup, since we don't use all those identifiers
2113: */
1.11 daniel 2114: if (URI != NULL) xmlFree(URI);
2115: if (ExternalID != NULL) xmlFree(ExternalID);
2116: if (name != NULL) xmlFree(name);
1.1 daniel 2117: }
2118:
2119: /**
2120: * htmlParseAttribute:
2121: * @ctxt: an HTML parser context
1.14 daniel 2122: * @value: a xmlChar ** used to store the value of the attribute
1.1 daniel 2123: *
2124: * parse an attribute
2125: *
2126: * [41] Attribute ::= Name Eq AttValue
2127: *
2128: * [25] Eq ::= S? '=' S?
2129: *
2130: * With namespace:
2131: *
2132: * [NS 11] Attribute ::= QName Eq AttValue
2133: *
2134: * Also the case QName == xmlns:??? is handled independently as a namespace
2135: * definition.
2136: *
2137: * Returns the attribute name, and the value in *value.
2138: */
2139:
1.14 daniel 2140: xmlChar *
2141: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1.31 daniel 2142: xmlChar *name, *val = NULL;
1.1 daniel 2143:
2144: *value = NULL;
2145: name = htmlParseName(ctxt);
2146: if (name == NULL) {
2147: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2148: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2149: ctxt->wellFormed = 0;
2150: return(NULL);
2151: }
2152:
2153: /*
2154: * read the value
2155: */
2156: SKIP_BLANKS;
2157: if (CUR == '=') {
2158: NEXT;
2159: SKIP_BLANKS;
2160: val = htmlParseAttValue(ctxt);
2161: } else {
1.27 daniel 2162: /* TODO : some attribute must have values, some may not */
1.1 daniel 2163: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.31 daniel 2164: ctxt->sax->warning(ctxt->userData,
2165: "No value for attribute %s\n", name);
1.1 daniel 2166: }
2167:
2168: *value = val;
2169: return(name);
2170: }
2171:
2172: /**
2173: * htmlParseStartTag:
2174: * @ctxt: an HTML parser context
2175: *
2176: * parse a start of tag either for rule element or
2177: * EmptyElement. In both case we don't parse the tag closing chars.
2178: *
2179: * [40] STag ::= '<' Name (S Attribute)* S? '>'
2180: *
2181: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2182: *
2183: * With namespace:
2184: *
2185: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2186: *
2187: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2188: *
2189: */
2190:
1.18 daniel 2191: void
1.1 daniel 2192: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2193: xmlChar *name;
2194: xmlChar *attname;
2195: xmlChar *attvalue;
2196: const xmlChar **atts = NULL;
1.1 daniel 2197: int nbatts = 0;
2198: int maxatts = 0;
2199: int i;
2200:
1.18 daniel 2201: if (CUR != '<') return;
1.1 daniel 2202: NEXT;
2203:
1.19 daniel 2204: GROW;
1.1 daniel 2205: name = htmlParseHTMLName(ctxt);
2206: if (name == NULL) {
2207: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2208: ctxt->sax->error(ctxt->userData,
2209: "htmlParseStartTag: invalid element name\n");
2210: ctxt->wellFormed = 0;
1.18 daniel 2211: return;
1.1 daniel 2212: }
2213:
2214: /*
2215: * Check for auto-closure of HTML elements.
2216: */
2217: htmlAutoClose(ctxt, name);
2218:
2219: /*
2220: * Now parse the attributes, it ends up with the ending
2221: *
2222: * (S Attribute)* S?
2223: */
2224: SKIP_BLANKS;
2225: while ((IS_CHAR(CUR)) &&
2226: (CUR != '>') &&
2227: ((CUR != '/') || (NXT(1) != '>'))) {
1.26 daniel 2228: long cons = ctxt->nbChars;
1.1 daniel 2229:
1.19 daniel 2230: GROW;
1.1 daniel 2231: attname = htmlParseAttribute(ctxt, &attvalue);
1.31 daniel 2232: if (attname != NULL) {
1.1 daniel 2233: /*
2234: * Well formedness requires at most one declaration of an attribute
2235: */
2236: for (i = 0; i < nbatts;i += 2) {
2237: if (!xmlStrcmp(atts[i], attname)) {
2238: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.19 daniel 2239: ctxt->sax->error(ctxt->userData,
2240: "Attribute %s redefined\n",
2241: attname);
1.1 daniel 2242: ctxt->wellFormed = 0;
1.11 daniel 2243: xmlFree(attname);
1.31 daniel 2244: if (attvalue != NULL)
2245: xmlFree(attvalue);
1.19 daniel 2246: goto failed;
1.1 daniel 2247: }
2248: }
2249:
2250: /*
2251: * Add the pair to atts
2252: */
2253: if (atts == NULL) {
2254: maxatts = 10;
1.14 daniel 2255: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
1.1 daniel 2256: if (atts == NULL) {
2257: fprintf(stderr, "malloc of %ld byte failed\n",
1.14 daniel 2258: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2259: if (name != NULL) xmlFree(name);
2260: return;
1.1 daniel 2261: }
1.23 daniel 2262: } else if (nbatts + 4 > maxatts) {
1.1 daniel 2263: maxatts *= 2;
1.14 daniel 2264: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
1.1 daniel 2265: if (atts == NULL) {
2266: fprintf(stderr, "realloc of %ld byte failed\n",
1.14 daniel 2267: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2268: if (name != NULL) xmlFree(name);
2269: return;
1.1 daniel 2270: }
2271: }
2272: atts[nbatts++] = attname;
2273: atts[nbatts++] = attvalue;
2274: atts[nbatts] = NULL;
2275: atts[nbatts + 1] = NULL;
2276: }
2277:
1.19 daniel 2278: failed:
1.1 daniel 2279: SKIP_BLANKS;
1.26 daniel 2280: if (cons == ctxt->nbChars) {
1.1 daniel 2281: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2282: ctxt->sax->error(ctxt->userData,
2283: "htmlParseStartTag: problem parsing attributes\n");
2284: ctxt->wellFormed = 0;
2285: break;
2286: }
2287: }
2288:
2289: /*
2290: * SAX: Start of Element !
2291: */
1.15 daniel 2292: htmlnamePush(ctxt, xmlStrdup(name));
1.18 daniel 2293: #ifdef DEBUG
2294: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2295: #endif
1.1 daniel 2296: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2297: ctxt->sax->startElement(ctxt->userData, name, atts);
2298:
2299: if (atts != NULL) {
1.31 daniel 2300: for (i = 0;i < nbatts;i++) {
2301: if (atts[i] != NULL)
2302: xmlFree((xmlChar *) atts[i]);
2303: }
1.11 daniel 2304: xmlFree(atts);
1.1 daniel 2305: }
1.18 daniel 2306: if (name != NULL) xmlFree(name);
1.1 daniel 2307: }
2308:
2309: /**
2310: * htmlParseEndTag:
2311: * @ctxt: an HTML parser context
2312: *
2313: * parse an end of tag
2314: *
2315: * [42] ETag ::= '</' Name S? '>'
2316: *
2317: * With namespace
2318: *
2319: * [NS 9] ETag ::= '</' QName S? '>'
2320: */
2321:
2322: void
1.18 daniel 2323: htmlParseEndTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2324: xmlChar *name;
1.15 daniel 2325: xmlChar *oldname;
1.1 daniel 2326: int i;
2327:
2328: if ((CUR != '<') || (NXT(1) != '/')) {
2329: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2330: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2331: ctxt->wellFormed = 0;
2332: return;
2333: }
2334: SKIP(2);
2335:
2336: name = htmlParseHTMLName(ctxt);
1.24 daniel 2337: if (name == NULL) return;
1.1 daniel 2338:
2339: /*
2340: * We should definitely be at the ending "S? '>'" part
2341: */
2342: SKIP_BLANKS;
2343: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2344: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2345: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2346: ctxt->wellFormed = 0;
2347: } else
2348: NEXT;
2349:
2350: /*
1.18 daniel 2351: * If the name read is not one of the element in the parsing stack
2352: * then return, it's just an error.
1.1 daniel 2353: */
1.18 daniel 2354: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2355: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
1.1 daniel 2356: }
2357: if (i < 0) {
2358: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.18 daniel 2359: ctxt->sax->error(ctxt->userData,
2360: "Unexpected end tag : %s\n", name);
1.11 daniel 2361: xmlFree(name);
1.1 daniel 2362: ctxt->wellFormed = 0;
2363: return;
2364: }
2365:
1.18 daniel 2366:
1.1 daniel 2367: /*
2368: * Check for auto-closure of HTML elements.
2369: */
1.18 daniel 2370:
1.1 daniel 2371: htmlAutoCloseOnClose(ctxt, name);
2372:
2373: /*
2374: * Well formedness constraints, opening and closing must match.
2375: * With the exception that the autoclose may have popped stuff out
2376: * of the stack.
2377: */
1.18 daniel 2378: if (xmlStrcmp(name, ctxt->name)) {
2379: #ifdef DEBUG
2380: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2381: #endif
1.15 daniel 2382: if ((ctxt->name != NULL) &&
2383: (xmlStrcmp(ctxt->name, name))) {
1.1 daniel 2384: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2385: ctxt->sax->error(ctxt->userData,
2386: "Opening and ending tag mismatch: %s and %s\n",
1.15 daniel 2387: name, ctxt->name);
1.1 daniel 2388: ctxt->wellFormed = 0;
2389: }
2390: }
2391:
2392: /*
2393: * SAX: End of Tag
2394: */
1.15 daniel 2395: oldname = ctxt->name;
1.24 daniel 2396: if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
1.18 daniel 2397: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2398: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2399: oldname = htmlnamePop(ctxt);
1.18 daniel 2400: if (oldname != NULL) {
2401: #ifdef DEBUG
2402: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2403: #endif
2404: xmlFree(oldname);
2405: #ifdef DEBUG
2406: } else {
2407: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2408: #endif
2409: }
2410: }
1.1 daniel 2411:
2412: if (name != NULL)
1.11 daniel 2413: xmlFree(name);
1.1 daniel 2414:
2415: return;
2416: }
2417:
2418:
2419: /**
2420: * htmlParseReference:
2421: * @ctxt: an HTML parser context
2422: *
2423: * parse and handle entity references in content,
2424: * this will end-up in a call to character() since this is either a
2425: * CharRef, or a predefined entity.
2426: */
2427: void
2428: htmlParseReference(htmlParserCtxtPtr ctxt) {
2429: htmlEntityDescPtr ent;
1.14 daniel 2430: xmlChar out[2];
2431: xmlChar *name;
1.1 daniel 2432: int val;
2433: if (CUR != '&') return;
2434:
2435: if (NXT(1) == '#') {
2436: val = htmlParseCharRef(ctxt);
1.8 daniel 2437: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2438: out[0] = val;
2439: out[1] = 0;
2440: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2441: ctxt->sax->characters(ctxt->userData, out, 1);
2442: } else {
2443: ent = htmlParseEntityRef(ctxt, &name);
1.32 daniel 2444: if (name == NULL) {
2445: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2446: return;
2447: }
1.1 daniel 2448: if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) {
2449: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 2450: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 2451: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.32 daniel 2452: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
1.1 daniel 2453: }
2454: } else {
1.8 daniel 2455: /* invalid for UTF-8 variable encoding !!!!! */
1.1 daniel 2456: out[0] = ent->value;
2457: out[1] = 0;
2458: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2459: ctxt->sax->characters(ctxt->userData, out, 1);
2460: }
1.11 daniel 2461: xmlFree(name);
1.1 daniel 2462: }
2463: }
2464:
2465: /**
2466: * htmlParseContent:
2467: * @ctxt: an HTML parser context
2468: * @name: the node name
2469: *
2470: * Parse a content: comment, sub-element, reference or text.
2471: *
2472: */
2473:
2474: void
1.18 daniel 2475: htmlParseContent(htmlParserCtxtPtr ctxt) {
1.15 daniel 2476: xmlChar *currentNode;
1.18 daniel 2477: int depth;
1.1 daniel 2478:
1.26 daniel 2479: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2480: depth = ctxt->nameNr;
2481: while (1) {
1.26 daniel 2482: long cons = ctxt->nbChars;
1.1 daniel 2483:
1.18 daniel 2484: GROW;
2485: /*
2486: * Our tag or one of it's parent or children is ending.
2487: */
2488: if ((CUR == '<') && (NXT(1) == '/')) {
2489: htmlParseEndTag(ctxt);
1.26 daniel 2490: if (currentNode != NULL) xmlFree(currentNode);
1.18 daniel 2491: return;
2492: }
2493:
2494: /*
2495: * Has this node been popped out during parsing of
2496: * the next element
2497: */
1.26 daniel 2498: if ((xmlStrcmp(currentNode, ctxt->name)) &&
2499: (depth >= ctxt->nameNr)) {
2500: if (currentNode != NULL) xmlFree(currentNode);
2501: return;
2502: }
1.18 daniel 2503:
1.1 daniel 2504: /*
2505: * First case : a comment
2506: */
2507: if ((CUR == '<') && (NXT(1) == '!') &&
2508: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2509: htmlParseComment(ctxt);
1.1 daniel 2510: }
2511:
2512: /*
2513: * Second case : a sub-element.
2514: */
2515: else if (CUR == '<') {
2516: htmlParseElement(ctxt);
2517: }
2518:
2519: /*
2520: * Third case : a reference. If if has not been resolved,
2521: * parsing returns it's Name, create the node
2522: */
2523: else if (CUR == '&') {
2524: htmlParseReference(ctxt);
2525: }
2526:
2527: /*
2528: * Last case, text. Note that References are handled directly.
2529: */
2530: else {
2531: htmlParseCharData(ctxt, 0);
2532: }
2533:
1.26 daniel 2534: if (cons == ctxt->nbChars) {
1.22 daniel 2535: if (ctxt->node != NULL) {
2536: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2537: ctxt->sax->error(ctxt->userData,
2538: "detected an error in element content\n");
2539: ctxt->wellFormed = 0;
2540: }
1.1 daniel 2541: break;
2542: }
1.17 daniel 2543:
1.5 daniel 2544: GROW;
1.1 daniel 2545: }
1.26 daniel 2546: if (currentNode != NULL) xmlFree(currentNode);
1.1 daniel 2547: }
2548:
2549: /**
2550: * htmlParseElement:
2551: * @ctxt: an HTML parser context
2552: *
2553: * parse an HTML element, this is highly recursive
2554: *
2555: * [39] element ::= EmptyElemTag | STag content ETag
2556: *
2557: * [41] Attribute ::= Name Eq AttValue
2558: */
2559:
2560: void
2561: htmlParseElement(htmlParserCtxtPtr ctxt) {
1.14 daniel 2562: const xmlChar *openTag = CUR_PTR;
2563: xmlChar *name;
1.16 daniel 2564: xmlChar *currentNode = NULL;
1.1 daniel 2565: htmlElemDescPtr info;
1.10 daniel 2566: htmlParserNodeInfo node_info;
1.31 daniel 2567: xmlChar *oldname;
1.18 daniel 2568: int depth = ctxt->nameNr;
1.1 daniel 2569:
2570: /* Capture start position */
1.10 daniel 2571: if (ctxt->record_info) {
2572: node_info.begin_pos = ctxt->input->consumed +
2573: (CUR_PTR - ctxt->input->base);
2574: node_info.begin_line = ctxt->input->line;
2575: }
1.1 daniel 2576:
1.26 daniel 2577: oldname = xmlStrdup(ctxt->name);
1.18 daniel 2578: htmlParseStartTag(ctxt);
2579: name = ctxt->name;
1.19 daniel 2580: #ifdef DEBUG
2581: if (oldname == NULL)
2582: fprintf(stderr, "Start of element %s\n", name);
2583: else if (name == NULL)
2584: fprintf(stderr, "Start of element failed, was %s\n", oldname);
2585: else
2586: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2587: #endif
1.26 daniel 2588: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
1.18 daniel 2589: (name == NULL)) {
1.19 daniel 2590: if (CUR == '>')
2591: NEXT;
1.26 daniel 2592: if (oldname != NULL)
2593: xmlFree(oldname);
1.1 daniel 2594: return;
2595: }
1.26 daniel 2596: if (oldname != NULL)
2597: xmlFree(oldname);
1.1 daniel 2598:
2599: /*
2600: * Lookup the info for that element.
2601: */
2602: info = htmlTagLookup(name);
2603: if (info == NULL) {
2604: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2605: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2606: name);
2607: ctxt->wellFormed = 0;
2608: } else if (info->depr) {
2609: /***************************
2610: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2611: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2612: name);
2613: ***************************/
2614: }
2615:
2616: /*
2617: * Check for an Empty Element labelled the XML/SGML way
2618: */
2619: if ((CUR == '/') && (NXT(1) == '>')) {
2620: SKIP(2);
2621: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2622: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2623: oldname = htmlnamePop(ctxt);
1.18 daniel 2624: #ifdef DEBUG
2625: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2626: #endif
1.17 daniel 2627: if (oldname != NULL)
2628: xmlFree(oldname);
1.1 daniel 2629: return;
2630: }
2631:
1.5 daniel 2632: if (CUR == '>') {
2633: NEXT;
2634: } else {
1.1 daniel 2635: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2636: ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2637: openTag);
2638: ctxt->wellFormed = 0;
2639:
2640: /*
2641: * end of parsing of this node.
2642: */
1.18 daniel 2643: if (!xmlStrcmp(name, ctxt->name)) {
2644: nodePop(ctxt);
1.24 daniel 2645: oldname = htmlnamePop(ctxt);
1.18 daniel 2646: #ifdef DEBUG
2647: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2648: #endif
2649: if (oldname != NULL)
2650: xmlFree(oldname);
2651: }
1.10 daniel 2652:
2653: /*
2654: * Capture end position and add node
2655: */
2656: if ( currentNode != NULL && ctxt->record_info ) {
2657: node_info.end_pos = ctxt->input->consumed +
2658: (CUR_PTR - ctxt->input->base);
2659: node_info.end_line = ctxt->input->line;
1.15 daniel 2660: node_info.node = ctxt->node;
1.10 daniel 2661: xmlParserAddNodeInfo(ctxt, &node_info);
2662: }
1.1 daniel 2663: return;
2664: }
2665:
2666: /*
2667: * Check for an Empty Element from DTD definition
2668: */
2669: if ((info != NULL) && (info->empty)) {
2670: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2671: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2672: oldname = htmlnamePop(ctxt);
1.18 daniel 2673: #ifdef DEBUG
2674: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2675: #endif
1.17 daniel 2676: if (oldname != NULL)
2677: xmlFree(oldname);
1.1 daniel 2678: return;
2679: }
2680:
2681: /*
2682: * Parse the content of the element:
2683: */
1.26 daniel 2684: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2685: depth = ctxt->nameNr;
2686: while (IS_CHAR(CUR)) {
2687: htmlParseContent(ctxt);
2688: if (ctxt->nameNr < depth) break;
2689: }
1.1 daniel 2690:
2691: if (!IS_CHAR(CUR)) {
2692: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2693: ctxt->sax->error(ctxt->userData,
1.18 daniel 2694: "Premature end of data in tag %s\n", currentNode);
1.1 daniel 2695: ctxt->wellFormed = 0;
2696:
2697: /*
2698: * end of parsing of this node.
2699: */
2700: nodePop(ctxt);
1.24 daniel 2701: oldname = htmlnamePop(ctxt);
1.18 daniel 2702: #ifdef DEBUG
2703: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2704: #endif
1.17 daniel 2705: if (oldname != NULL)
2706: xmlFree(oldname);
1.26 daniel 2707: if (currentNode != NULL)
2708: xmlFree(currentNode);
1.1 daniel 2709: return;
2710: }
1.10 daniel 2711:
2712: /*
2713: * Capture end position and add node
2714: */
2715: if ( currentNode != NULL && ctxt->record_info ) {
2716: node_info.end_pos = ctxt->input->consumed +
2717: (CUR_PTR - ctxt->input->base);
2718: node_info.end_line = ctxt->input->line;
1.15 daniel 2719: node_info.node = ctxt->node;
1.10 daniel 2720: xmlParserAddNodeInfo(ctxt, &node_info);
2721: }
1.26 daniel 2722: if (currentNode != NULL)
2723: xmlFree(currentNode);
1.1 daniel 2724: }
2725:
2726: /**
2727: * htmlParseDocument :
2728: * @ctxt: an HTML parser context
2729: *
2730: * parse an HTML document (and build a tree if using the standard SAX
2731: * interface).
2732: *
2733: * Returns 0, -1 in case of error. the parser context is augmented
2734: * as a result of the parsing.
2735: */
2736:
2737: int
2738: htmlParseDocument(htmlParserCtxtPtr ctxt) {
2739: htmlDefaultSAXHandlerInit();
2740: ctxt->html = 1;
2741:
1.5 daniel 2742: GROW;
1.1 daniel 2743: /*
1.9 daniel 2744: * SAX: beginning of the document processing.
1.1 daniel 2745: */
2746: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2747: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2748:
2749: /*
2750: * Wipe out everything which is before the first '<'
2751: */
1.22 daniel 2752: SKIP_BLANKS;
1.1 daniel 2753: if (CUR == 0) {
2754: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2755: ctxt->sax->error(ctxt->userData, "Document is empty\n");
2756: ctxt->wellFormed = 0;
2757: }
2758:
1.22 daniel 2759: /*
2760: * Parse possible comments before any content
2761: */
2762: while ((CUR == '<') && (NXT(1) == '!') &&
2763: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2764: if (ctxt->myDoc == NULL)
2765: ctxt->myDoc = htmlNewDoc(NULL, NULL);
2766: htmlParseComment(ctxt);
1.22 daniel 2767: SKIP_BLANKS;
2768: }
2769:
1.1 daniel 2770:
2771: /*
2772: * Then possibly doc type declaration(s) and more Misc
2773: * (doctypedecl Misc*)?
2774: */
2775: if ((CUR == '<') && (NXT(1) == '!') &&
2776: (UPP(2) == 'D') && (UPP(3) == 'O') &&
2777: (UPP(4) == 'C') && (UPP(5) == 'T') &&
2778: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2779: (UPP(8) == 'E')) {
2780: htmlParseDocTypeDecl(ctxt);
2781: }
2782: SKIP_BLANKS;
2783:
2784: /*
2785: * Create the document if not done already.
2786: */
2787: if (ctxt->myDoc == NULL) {
2788: ctxt->myDoc = htmlNewDoc(NULL, NULL);
2789: }
2790:
2791: /*
2792: * Time to start parsing the tree itself
2793: */
1.22 daniel 2794: htmlParseContent(ctxt);
1.1 daniel 2795:
2796: /*
2797: * SAX: end of the document processing.
2798: */
2799: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
2800: ctxt->sax->endDocument(ctxt->userData);
2801: if (! ctxt->wellFormed) return(-1);
2802: return(0);
2803: }
2804:
2805:
1.30 daniel 2806: /************************************************************************
2807: * *
2808: * Parser contexts handling *
2809: * *
2810: ************************************************************************/
1.1 daniel 2811:
2812: /**
2813: * xmlInitParserCtxt:
2814: * @ctxt: an HTML parser context
2815: *
2816: * Initialize a parser context
2817: */
2818:
2819: void
2820: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
2821: {
2822: htmlSAXHandler *sax;
2823:
1.21 daniel 2824: if (ctxt == NULL) return;
2825: memset(ctxt, 0, sizeof(htmlParserCtxt));
2826:
1.11 daniel 2827: sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
1.1 daniel 2828: if (sax == NULL) {
2829: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2830: }
1.19 daniel 2831: memset(sax, 0, sizeof(htmlSAXHandler));
1.1 daniel 2832:
2833: /* Allocate the Input stack */
1.19 daniel 2834: ctxt->inputTab = (htmlParserInputPtr *)
2835: xmlMalloc(5 * sizeof(htmlParserInputPtr));
2836: if (ctxt->inputTab == NULL) {
2837: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
2838: }
1.1 daniel 2839: ctxt->inputNr = 0;
2840: ctxt->inputMax = 5;
2841: ctxt->input = NULL;
2842: ctxt->version = NULL;
2843: ctxt->encoding = NULL;
2844: ctxt->standalone = -1;
1.30 daniel 2845: ctxt->instate = XML_PARSER_START;
1.1 daniel 2846:
2847: /* Allocate the Node stack */
1.11 daniel 2848: ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
1.1 daniel 2849: ctxt->nodeNr = 0;
2850: ctxt->nodeMax = 10;
2851: ctxt->node = NULL;
2852:
1.15 daniel 2853: /* Allocate the Name stack */
2854: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
2855: ctxt->nameNr = 0;
2856: ctxt->nameMax = 10;
2857: ctxt->name = NULL;
2858:
1.1 daniel 2859: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
2860: else {
2861: ctxt->sax = sax;
2862: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
2863: }
2864: ctxt->userData = ctxt;
2865: ctxt->myDoc = NULL;
2866: ctxt->wellFormed = 1;
2867: ctxt->replaceEntities = 0;
2868: ctxt->html = 1;
2869: ctxt->record_info = 0;
1.21 daniel 2870: ctxt->validate = 0;
1.26 daniel 2871: ctxt->nbChars = 0;
1.30 daniel 2872: ctxt->checkIndex = 0;
1.1 daniel 2873: xmlInitNodeInfoSeq(&ctxt->node_seq);
2874: }
2875:
2876: /**
2877: * htmlFreeParserCtxt:
2878: * @ctxt: an HTML parser context
2879: *
2880: * Free all the memory used by a parser context. However the parsed
2881: * document in ctxt->myDoc is not freed.
2882: */
2883:
2884: void
2885: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
2886: {
2887: htmlParserInputPtr input;
1.15 daniel 2888: xmlChar *oldname;
1.1 daniel 2889:
2890: if (ctxt == NULL) return;
2891:
2892: while ((input = inputPop(ctxt)) != NULL) {
2893: xmlFreeInputStream(input);
2894: }
2895:
1.11 daniel 2896: if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
1.24 daniel 2897: while ((oldname = htmlnamePop(ctxt)) != NULL) {
2898: xmlFree(oldname);
1.15 daniel 2899: }
2900: if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
1.31 daniel 2901: if (ctxt->directory != NULL) xmlFree(ctxt->directory);
1.11 daniel 2902: if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
2903: if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
1.1 daniel 2904: if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
1.11 daniel 2905: xmlFree(ctxt->sax);
2906: xmlFree(ctxt);
1.1 daniel 2907: }
2908:
2909: /**
2910: * htmlCreateDocParserCtxt :
1.14 daniel 2911: * @cur: a pointer to an array of xmlChar
1.1 daniel 2912: * @encoding: a free form C string describing the HTML document encoding, or NULL
2913: *
2914: * Create a parser context for an HTML document.
2915: *
2916: * Returns the new parser context or NULL
2917: */
2918: htmlParserCtxtPtr
1.14 daniel 2919: htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
1.1 daniel 2920: htmlParserCtxtPtr ctxt;
2921: htmlParserInputPtr input;
2922: /* htmlCharEncoding enc; */
2923:
1.11 daniel 2924: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 2925: if (ctxt == NULL) {
2926: perror("malloc");
2927: return(NULL);
2928: }
2929: htmlInitParserCtxt(ctxt);
1.11 daniel 2930: input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 2931: if (input == NULL) {
2932: perror("malloc");
1.11 daniel 2933: xmlFree(ctxt);
1.1 daniel 2934: return(NULL);
2935: }
1.19 daniel 2936: memset(input, 0, sizeof(htmlParserInput));
1.1 daniel 2937:
2938: input->line = 1;
2939: input->col = 1;
2940: input->base = cur;
2941: input->cur = cur;
2942:
2943: inputPush(ctxt, input);
2944: return(ctxt);
2945: }
2946:
1.31 daniel 2947: /************************************************************************
2948: * *
2949: * Progressive parsing interfaces *
2950: * *
2951: ************************************************************************/
2952:
2953: /**
2954: * htmlParseLookupSequence:
2955: * @ctxt: an HTML parser context
2956: * @first: the first char to lookup
2957: * @next: the next char to lookup or zero
2958: * @third: the next char to lookup or zero
2959: *
2960: * Try to find if a sequence (first, next, third) or just (first next) or
2961: * (first) is available in the input stream.
2962: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
2963: * to avoid rescanning sequences of bytes, it DOES change the state of the
2964: * parser, do not use liberally.
2965: * This is basically similar to xmlParseLookupSequence()
2966: *
2967: * Returns the index to the current parsing point if the full sequence
2968: * is available, -1 otherwise.
2969: */
2970: int
2971: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
2972: xmlChar next, xmlChar third) {
2973: int base, len;
2974: htmlParserInputPtr in;
2975: const xmlChar *buf;
2976:
2977: in = ctxt->input;
2978: if (in == NULL) return(-1);
2979: base = in->cur - in->base;
2980: if (base < 0) return(-1);
2981: if (ctxt->checkIndex > base)
2982: base = ctxt->checkIndex;
2983: if (in->buf == NULL) {
2984: buf = in->base;
2985: len = in->length;
2986: } else {
2987: buf = in->buf->buffer->content;
2988: len = in->buf->buffer->use;
2989: }
2990: /* take into account the sequence length */
2991: if (third) len -= 2;
2992: else if (next) len --;
2993: for (;base < len;base++) {
2994: if (buf[base] == first) {
2995: if (third != 0) {
2996: if ((buf[base + 1] != next) ||
2997: (buf[base + 2] != third)) continue;
2998: } else if (next != 0) {
2999: if (buf[base + 1] != next) continue;
3000: }
3001: ctxt->checkIndex = 0;
3002: #ifdef DEBUG_PUSH
3003: if (next == 0)
3004: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3005: first, base);
3006: else if (third == 0)
3007: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3008: first, next, base);
3009: else
3010: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3011: first, next, third, base);
3012: #endif
3013: return(base - (in->cur - in->base));
3014: }
3015: }
3016: ctxt->checkIndex = base;
3017: #ifdef DEBUG_PUSH
3018: if (next == 0)
3019: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3020: else if (third == 0)
3021: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3022: else
3023: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3024: #endif
3025: return(-1);
3026: }
3027:
3028: /**
1.32 daniel 3029: * htmlParseTryOrFinish:
1.31 daniel 3030: * @ctxt: an HTML parser context
1.32 daniel 3031: * @terminate: last chunk indicator
1.31 daniel 3032: *
3033: * Try to progress on parsing
3034: *
3035: * Returns zero if no parsing was possible
3036: */
/*
 * Push-parser state machine: dispatches on ctxt->instate and consumes as
 * much of the data currently held by the input as it can. When terminate
 * is zero, a construct is only handed to its parsing routine once
 * htmlParseLookupSequence() has found its closing delimiter; otherwise the
 * function jumps to `done` and leaves the state unchanged for a later call.
 * NOTE(review): `ret` is initialised to 0 and never assigned afterwards, so
 * the value returned is always 0.
 */
3037: int
1.32 daniel 3038: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
1.31 daniel 3039: int ret = 0;
3040: htmlParserInputPtr in;
3041: int avail;
3042: xmlChar cur, next;
3043:
3044: #ifdef DEBUG_PUSH
3045: switch (ctxt->instate) {
3046: case XML_PARSER_EOF:
3047: fprintf(stderr, "HPP: try EOF\n"); break;
3048: case XML_PARSER_START:
3049: fprintf(stderr, "HPP: try START\n"); break;
3050: case XML_PARSER_MISC:
3051: fprintf(stderr, "HPP: try MISC\n");break;
3052: case XML_PARSER_COMMENT:
3053: fprintf(stderr, "HPP: try COMMENT\n");break;
3054: case XML_PARSER_PROLOG:
3055: fprintf(stderr, "HPP: try PROLOG\n");break;
3056: case XML_PARSER_START_TAG:
3057: fprintf(stderr, "HPP: try START_TAG\n");break;
3058: case XML_PARSER_CONTENT:
3059: fprintf(stderr, "HPP: try CONTENT\n");break;
3060: case XML_PARSER_CDATA_SECTION:
3061: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3062: case XML_PARSER_END_TAG:
3063: fprintf(stderr, "HPP: try END_TAG\n");break;
3064: case XML_PARSER_ENTITY_DECL:
3065: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3066: case XML_PARSER_ENTITY_VALUE:
3067: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3068: case XML_PARSER_ATTRIBUTE_VALUE:
3069: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3070: case XML_PARSER_DTD:
3071: fprintf(stderr, "HPP: try DTD\n");break;
3072: case XML_PARSER_EPILOG:
3073: fprintf(stderr, "HPP: try EPILOG\n");break;
3074: case XML_PARSER_PI:
3075: fprintf(stderr, "HPP: try PI\n");break;
3076: }
3077: #endif
3078:
/* Each iteration recomputes the unread byte count and handles one state. */
3079: while (1) {
3080:
3081: in = ctxt->input;
3082: if (in == NULL) break;
/*
 * avail = bytes between the read cursor and the end of the data held:
 * in->length when the input has no I/O buffer, buffer->use when it has one.
 */
3083: if (in->buf == NULL)
3084: avail = in->length - (in->cur - in->base);
3085: else
3086: avail = in->buf->buffer->use - (in->cur - in->base);
3087: if (avail < 1)
3088: goto done;
3089: switch (ctxt->instate) {
3090: case XML_PARSER_EOF:
3091: /*
3092: * Document parsing is done !
3093: */
3094: goto done;
3095: case XML_PARSER_START:
3096: /*
3097: * Very first chars read from the document flow.
3098: */
3099: cur = in->cur[0];
3100: if (IS_BLANK(cur)) {
3101: SKIP_BLANKS;
3102: if (in->buf == NULL)
3103: avail = in->length - (in->cur - in->base);
3104: else
3105: avail = in->buf->buffer->use - (in->cur - in->base);
3106: }
3107: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3108: ctxt->sax->setDocumentLocator(ctxt->userData,
3109: &xmlDefaultSAXLocator);
/*
 * A "<!DOCTYPE" (any case) is parsed here and leads to PROLOG; anything
 * else creates the document right away and leads to MISC.
 */
3110: cur = in->cur[0];
3111: next = in->cur[1];
3112: if ((cur == '<') && (next == '!') &&
3113: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3114: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3115: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3116: (UPP(8) == 'E')) {
1.32 daniel 3117: if ((!terminate) &&
3118: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3119: goto done;
3120: #ifdef DEBUG_PUSH
3121: fprintf(stderr, "HPP: Parsing internal subset\n");
3122: #endif
3123: htmlParseDocTypeDecl(ctxt);
3124: ctxt->instate = XML_PARSER_PROLOG;
3125: #ifdef DEBUG_PUSH
3126: fprintf(stderr, "HPP: entering PROLOG\n");
3127: #endif
3128: } else {
3129: ctxt->myDoc = htmlNewDoc(NULL, NULL);
3130: ctxt->instate = XML_PARSER_MISC;
3131: }
/* NOTE(review): this trace is printed on both branches, PROLOG included. */
3132: #ifdef DEBUG_PUSH
3133: fprintf(stderr, "HPP: entering MISC\n");
3134: #endif
3135: break;
3136: case XML_PARSER_MISC:
3137: SKIP_BLANKS;
3138: if (in->buf == NULL)
3139: avail = in->length - (in->cur - in->base);
3140: else
3141: avail = in->buf->buffer->use - (in->cur - in->base);
3142: if (avail < 2)
3143: goto done;
/*
 * NOTE(review): only avail >= 2 has been checked at this point, yet
 * in->cur[2] and in->cur[3] are read below - presumably this relies on
 * the data being zero terminated; confirm.
 */
3144: cur = in->cur[0];
3145: next = in->cur[1];
3146: if ((cur == '<') && (next == '!') &&
3147: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3148: if ((!terminate) &&
3149: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3150: goto done;
3151: #ifdef DEBUG_PUSH
3152: fprintf(stderr, "HPP: Parsing Comment\n");
3153: #endif
3154: htmlParseComment(ctxt);
3155: ctxt->instate = XML_PARSER_MISC;
3156: } else if ((cur == '<') && (next == '!') &&
3157: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3158: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3159: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3160: (UPP(8) == 'E')) {
1.32 daniel 3161: if ((!terminate) &&
3162: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3163: goto done;
3164: #ifdef DEBUG_PUSH
3165: fprintf(stderr, "HPP: Parsing internal subset\n");
3166: #endif
3167: htmlParseDocTypeDecl(ctxt);
3168: ctxt->instate = XML_PARSER_PROLOG;
3169: #ifdef DEBUG_PUSH
3170: fprintf(stderr, "HPP: entering PROLOG\n");
3171: #endif
3172: } else if ((cur == '<') && (next == '!') &&
3173: (avail < 9)) {
3174: goto done;
3175: } else {
3176: ctxt->instate = XML_PARSER_START_TAG;
3177: #ifdef DEBUG_PUSH
3178: fprintf(stderr, "HPP: entering START_TAG\n");
3179: #endif
3180: }
3181: break;
3182: case XML_PARSER_PROLOG:
3183: SKIP_BLANKS;
3184: if (in->buf == NULL)
3185: avail = in->length - (in->cur - in->base);
3186: else
3187: avail = in->buf->buffer->use - (in->cur - in->base);
3188: if (avail < 2)
3189: goto done;
3190: cur = in->cur[0];
3191: next = in->cur[1];
3192: if ((cur == '<') && (next == '!') &&
3193: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3194: if ((!terminate) &&
3195: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3196: goto done;
3197: #ifdef DEBUG_PUSH
3198: fprintf(stderr, "HPP: Parsing Comment\n");
3199: #endif
3200: htmlParseComment(ctxt);
3201: ctxt->instate = XML_PARSER_PROLOG;
3202: } else if ((cur == '<') && (next == '!') &&
3203: (avail < 4)) {
3204: goto done;
3205: } else {
3206: ctxt->instate = XML_PARSER_START_TAG;
3207: #ifdef DEBUG_PUSH
3208: fprintf(stderr, "HPP: entering START_TAG\n");
3209: #endif
3210: }
3211: break;
/*
 * EPILOG: only blanks and comments are accepted; anything else is
 * reported as an error, endDocument is signalled and the state becomes EOF.
 */
3212: case XML_PARSER_EPILOG:
3213: SKIP_BLANKS;
3214: if (in->buf == NULL)
3215: avail = in->length - (in->cur - in->base);
3216: else
3217: avail = in->buf->buffer->use - (in->cur - in->base);
3218: if (avail < 2)
3219: goto done;
3220: cur = in->cur[0];
3221: next = in->cur[1];
3222: if ((cur == '<') && (next == '!') &&
3223: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3224: if ((!terminate) &&
3225: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3226: goto done;
3227: #ifdef DEBUG_PUSH
3228: fprintf(stderr, "HPP: Parsing Comment\n");
3229: #endif
3230: htmlParseComment(ctxt);
3231: ctxt->instate = XML_PARSER_EPILOG;
3232: } else if ((cur == '<') && (next == '!') &&
3233: (avail < 4)) {
3234: goto done;
3235: } else {
3236: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3237: ctxt->sax->error(ctxt->userData,
3238: "Extra content at the end of the document\n");
3239: ctxt->wellFormed = 0;
3240: ctxt->errNo = XML_ERR_DOCUMENT_END;
3241: ctxt->instate = XML_PARSER_EOF;
3242: #ifdef DEBUG_PUSH
3243: fprintf(stderr, "HPP: entering EOF\n");
3244: #endif
3245: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3246: ctxt->sax->endDocument(ctxt->userData);
3247: goto done;
3248: }
3249: break;
3250: case XML_PARSER_START_TAG: {
3251: xmlChar *name, *oldname;
3252: int depth = ctxt->nameNr;
3253: htmlElemDescPtr info;
3254:
3255: if (avail < 2)
3256: goto done;
3257: cur = in->cur[0];
3258: if (cur != '<') {
3259: ctxt->instate = XML_PARSER_CONTENT;
3260: #ifdef DEBUG_PUSH
3261: fprintf(stderr, "HPP: entering CONTENT\n");
3262: #endif
3263: break;
3264: }
1.32 daniel 3265: if ((!terminate) &&
3266: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3267: goto done;
3268:
/*
 * The name on top of the stack and the stack depth are sampled before
 * htmlParseStartTag() so the result can be compared with them afterwards.
 */
3269: oldname = xmlStrdup(ctxt->name);
3270: htmlParseStartTag(ctxt);
3271: name = ctxt->name;
3272: #ifdef DEBUG
3273: if (oldname == NULL)
3274: fprintf(stderr, "Start of element %s\n", name);
3275: else if (name == NULL)
3276: fprintf(stderr, "Start of element failed, was %s\n",
3277: oldname);
3278: else
3279: fprintf(stderr, "Start of element %s, was %s\n",
3280: name, oldname);
3281: #endif
/*
 * Same depth with the same top name, or no current name at all: the
 * state is left unchanged, a '>' at the cursor is skipped and the loop
 * goes round again.
 */
3282: if (((depth == ctxt->nameNr) &&
3283: (!xmlStrcmp(oldname, ctxt->name))) ||
3284: (name == NULL)) {
3285: if (CUR == '>')
3286: NEXT;
3287: if (oldname != NULL)
3288: xmlFree(oldname);
3289: break;
3290: }
3291: if (oldname != NULL)
3292: xmlFree(oldname);
3293:
3294: /*
3295: * Lookup the info for that element.
3296: */
3297: info = htmlTagLookup(name);
3298: if (info == NULL) {
3299: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3300: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3301: name);
3302: ctxt->wellFormed = 0;
3303: } else if (info->depr) {
3304: /***************************
3305: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3306: ctxt->sax->warning(ctxt->userData,
3307: "Tag %s is deprecated\n",
3308: name);
3309: ***************************/
3310: }
3311:
3312: /*
3313: * Check for an Empty Element labelled the XML/SGML way
3314: */
3315: if ((CUR == '/') && (NXT(1) == '>')) {
3316: SKIP(2);
3317: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3318: ctxt->sax->endElement(ctxt->userData, name);
3319: oldname = htmlnamePop(ctxt);
3320: #ifdef DEBUG
3321: fprintf(stderr,"End of tag the XML way: popping out %s\n",
3322: oldname);
3323: #endif
3324: if (oldname != NULL)
3325: xmlFree(oldname);
3326: ctxt->instate = XML_PARSER_CONTENT;
3327: #ifdef DEBUG_PUSH
3328: fprintf(stderr, "HPP: entering CONTENT\n");
3329: #endif
3330: break;
3331: }
3332:
3333: if (CUR == '>') {
3334: NEXT;
3335: } else {
3336: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3337: ctxt->sax->error(ctxt->userData,
3338: "Couldn't find end of Start Tag %s\n",
3339: name);
3340: ctxt->wellFormed = 0;
3341:
3342: /*
3343: * end of parsing of this node.
3344: */
3345: if (!xmlStrcmp(name, ctxt->name)) {
3346: nodePop(ctxt);
3347: oldname = htmlnamePop(ctxt);
3348: #ifdef DEBUG
3349: fprintf(stderr,
3350: "End of start tag problem: popping out %s\n", oldname);
3351: #endif
3352: if (oldname != NULL)
3353: xmlFree(oldname);
3354: }
3355:
3356: ctxt->instate = XML_PARSER_CONTENT;
3357: #ifdef DEBUG_PUSH
3358: fprintf(stderr, "HPP: entering CONTENT\n");
3359: #endif
3360: break;
3361: }
3362:
3363: /*
3364: * Check for an Empty Element from DTD definition
3365: */
3366: if ((info != NULL) && (info->empty)) {
3367: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3368: ctxt->sax->endElement(ctxt->userData, name);
3369: oldname = htmlnamePop(ctxt);
3370: #ifdef DEBUG
3371: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3372: #endif
3373: if (oldname != NULL)
3374: xmlFree(oldname);
3375: }
3376: ctxt->instate = XML_PARSER_CONTENT;
3377: #ifdef DEBUG_PUSH
3378: fprintf(stderr, "HPP: entering CONTENT\n");
3379: #endif
3380: break;
3381: }
3382: case XML_PARSER_CONTENT:
3383: /*
3384: * Handle preparsed entities and charRef
3385: */
/* A pending ctxt->token is delivered as a one-character characters() event. */
3386: if (ctxt->token != 0) {
3387: xmlChar cur[2] = { 0 , 0 } ;
3388:
3389: cur[0] = (xmlChar) ctxt->token;
3390: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3391: ctxt->sax->characters(ctxt->userData, cur, 1);
3392: ctxt->token = 0;
3393: ctxt->checkIndex = 0;
3394: }
3395: if (avail < 2)
3396: goto done;
3397: cur = in->cur[0];
3398: next = in->cur[1];
3399: if ((cur == '<') && (next == '!') &&
3400: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3401: if ((!terminate) &&
3402: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3403: goto done;
3404: #ifdef DEBUG_PUSH
3405: fprintf(stderr, "HPP: Parsing Comment\n");
3406: #endif
3407: htmlParseComment(ctxt);
3408: ctxt->instate = XML_PARSER_CONTENT;
3409: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3410: goto done;
3411: } else if ((cur == '<') && (next == '/')) {
3412: ctxt->instate = XML_PARSER_END_TAG;
3413: ctxt->checkIndex = 0;
3414: #ifdef DEBUG_PUSH
3415: fprintf(stderr, "HPP: entering END_TAG\n");
3416: #endif
3417: break;
3418: } else if (cur == '<') {
3419: ctxt->instate = XML_PARSER_START_TAG;
3420: ctxt->checkIndex = 0;
3421: #ifdef DEBUG_PUSH
3422: fprintf(stderr, "HPP: entering START_TAG\n");
3423: #endif
3424: break;
3425: } else if (cur == '&') {
1.32 daniel 3426: if ((!terminate) &&
3427: (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
1.31 daniel 3428: goto done;
3429: #ifdef DEBUG_PUSH
3430: fprintf(stderr, "HPP: Parsing Reference\n");
3431: #endif
3432: /* TODO: check generation of subtrees if noent !!! */
3433: htmlParseReference(ctxt);
3434: } else {
3435: /* TODO Avoid the extra copy, handle directly !!!!!! */
3436: /*
3437: * Goal of the following test is :
3438: * - minimize calls to the SAX 'character' callback
3439: * when they are mergeable
3440: */
/*
 * With a single input and fewer than HTML_PARSER_BIG_BUFFER_SIZE bytes
 * available, character data is only parsed once a '<' is in the buffer
 * (unless terminate is set).
 */
3441: if ((ctxt->inputNr == 1) &&
3442: (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
1.32 daniel 3443: if ((!terminate) &&
3444: (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
1.31 daniel 3445: goto done;
3446: }
3447: ctxt->checkIndex = 0;
3448: #ifdef DEBUG_PUSH
3449: fprintf(stderr, "HPP: Parsing char data\n");
3450: #endif
3451: htmlParseCharData(ctxt, 0);
3452: }
3453: break;
3454: case XML_PARSER_END_TAG:
3455: if (avail < 2)
3456: goto done;
1.32 daniel 3457: if ((!terminate) &&
3458: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3459: goto done;
3460: htmlParseEndTag(ctxt);
/*
 * An empty name stack after the end tag switches to EPILOG, otherwise
 * parsing goes back to CONTENT.
 * NOTE(review): the trace below says CONTENT in both cases.
 */
3461: if (ctxt->nameNr == 0) {
3462: ctxt->instate = XML_PARSER_EPILOG;
3463: } else {
3464: ctxt->instate = XML_PARSER_CONTENT;
3465: }
3466: ctxt->checkIndex = 0;
3467: #ifdef DEBUG_PUSH
3468: fprintf(stderr, "HPP: entering CONTENT\n");
3469: #endif
3470: break;
/*
 * The remaining states are reported as internal errors; each one resets
 * checkIndex and falls back to CONTENT (START_TAG for ATTRIBUTE_VALUE).
 */
3471: case XML_PARSER_CDATA_SECTION:
3472: fprintf(stderr, "HPP: internal error, state == CDATA\n");
3473: ctxt->instate = XML_PARSER_CONTENT;
3474: ctxt->checkIndex = 0;
3475: #ifdef DEBUG_PUSH
3476: fprintf(stderr, "HPP: entering CONTENT\n");
3477: #endif
3478: break;
3479: case XML_PARSER_DTD:
3480: fprintf(stderr, "HPP: internal error, state == DTD\n");
3481: ctxt->instate = XML_PARSER_CONTENT;
3482: ctxt->checkIndex = 0;
3483: #ifdef DEBUG_PUSH
3484: fprintf(stderr, "HPP: entering CONTENT\n");
3485: #endif
3486: break;
3487: case XML_PARSER_COMMENT:
3488: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3489: ctxt->instate = XML_PARSER_CONTENT;
3490: ctxt->checkIndex = 0;
3491: #ifdef DEBUG_PUSH
3492: fprintf(stderr, "HPP: entering CONTENT\n");
3493: #endif
3494: break;
3495: case XML_PARSER_PI:
3496: fprintf(stderr, "HPP: internal error, state == PI\n");
3497: ctxt->instate = XML_PARSER_CONTENT;
3498: ctxt->checkIndex = 0;
3499: #ifdef DEBUG_PUSH
3500: fprintf(stderr, "HPP: entering CONTENT\n");
3501: #endif
3502: break;
3503: case XML_PARSER_ENTITY_DECL:
3504: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3505: ctxt->instate = XML_PARSER_CONTENT;
3506: ctxt->checkIndex = 0;
3507: #ifdef DEBUG_PUSH
3508: fprintf(stderr, "HPP: entering CONTENT\n");
3509: #endif
3510: break;
3511: case XML_PARSER_ENTITY_VALUE:
3512: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3513: ctxt->instate = XML_PARSER_CONTENT;
3514: ctxt->checkIndex = 0;
/* NOTE(review): the trace says DTD although the state was set to CONTENT. */
3515: #ifdef DEBUG_PUSH
3516: fprintf(stderr, "HPP: entering DTD\n");
3517: #endif
3518: break;
3519: case XML_PARSER_ATTRIBUTE_VALUE:
3520: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3521: ctxt->instate = XML_PARSER_START_TAG;
3522: ctxt->checkIndex = 0;
3523: #ifdef DEBUG_PUSH
3524: fprintf(stderr, "HPP: entering START_TAG\n");
3525: #endif
3526: break;
3527: }
3528: }
3529: done:
3530: #ifdef DEBUG_PUSH
3531: fprintf(stderr, "HPP: done %d\n", ret);
3532: #endif
3533: return(ret);
3534: }
3535:
3536: /**
1.32 daniel 3537: * htmlParseTry:
3538: * @ctxt: an HTML parser context
3539: *
3540: * Try to progress on parsing
3541: *
3542: * Returns zero if no parsing was possible
3543: */
3544: int
3545: htmlParseTry(htmlParserCtxtPtr ctxt) {
3546: return(htmlParseTryOrFinish(ctxt, 0));
3547: }
3548:
3549: /**
1.31 daniel 3550: * htmlParseChunk:
3551: * @ctxt: an XML parser context
3552: * @chunk: an char array
3553: * @size: the size in byte of the chunk
3554: * @terminate: last chunk indicator
3555: *
3556: * Parse a Chunk of memory
3557: *
3558: * Returns zero if no error, the xmlParserErrors otherwise.
3559: */
3560: int
3561: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3562: int terminate) {
3563: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3564: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3565: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3566: int cur = ctxt->input->cur - ctxt->input->base;
3567:
3568: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3569: ctxt->input->base = ctxt->input->buf->buffer->content + base;
3570: ctxt->input->cur = ctxt->input->base + cur;
3571: #ifdef DEBUG_PUSH
3572: fprintf(stderr, "HPP: pushed %d\n", size);
3573: #endif
3574:
1.34 daniel 3575: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
3576: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3577: } else if (ctxt->instate != XML_PARSER_EOF)
1.32 daniel 3578: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3579: if (terminate) {
3580: if ((ctxt->instate != XML_PARSER_EOF) &&
3581: (ctxt->instate != XML_PARSER_EPILOG) &&
3582: (ctxt->instate != XML_PARSER_MISC)) {
3583: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3584: ctxt->sax->error(ctxt->userData,
3585: "Extra content at the end of the document\n");
3586: ctxt->wellFormed = 0;
3587: ctxt->errNo = XML_ERR_DOCUMENT_END;
3588: }
3589: if (ctxt->instate != XML_PARSER_EOF) {
3590: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3591: ctxt->sax->endDocument(ctxt->userData);
3592: }
3593: ctxt->instate = XML_PARSER_EOF;
3594: }
3595: return((xmlParserErrors) ctxt->errNo);
3596: }
3597:
3598: /************************************************************************
3599: * *
3600: * User entry points *
3601: * *
3602: ************************************************************************/
3603:
3604: /**
3605: * htmlCreatePushParserCtxt :
3606: * @sax: a SAX handler
3607: * @user_data: The user data returned on SAX callbacks
3608: * @chunk: a pointer to an array of chars
3609: * @size: number of chars in the array
3610: * @filename: an optional file name or URI
3611: * @enc: an optional encoding
3612: *
3613: * Create a parser context for using the HTML parser in push mode
3614: * To allow content encoding detection, @size should be >= 4
3615: * The value of @filename is used for fetching external entities
3616: * and error/warning reports.
3617: *
3618: * Returns the new parser context or NULL
3619: */
3620: htmlParserCtxtPtr
3621: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
3622: const char *chunk, int size, const char *filename,
3623: xmlCharEncoding enc) {
3624: htmlParserCtxtPtr ctxt;
3625: htmlParserInputPtr inputStream;
3626: xmlParserInputBufferPtr buf;
3627:
3628: buf = xmlAllocParserInputBuffer(enc);
3629: if (buf == NULL) return(NULL);
3630:
3631: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3632: if (ctxt == NULL) {
3633: xmlFree(buf);
3634: return(NULL);
3635: }
3636: memset(ctxt, 0, sizeof(htmlParserCtxt));
3637: htmlInitParserCtxt(ctxt);
3638: if (sax != NULL) {
3639: if (ctxt->sax != &htmlDefaultSAXHandler)
3640: xmlFree(ctxt->sax);
3641: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
3642: if (ctxt->sax == NULL) {
3643: xmlFree(buf);
3644: xmlFree(ctxt);
3645: return(NULL);
3646: }
3647: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
3648: if (user_data != NULL)
3649: ctxt->userData = user_data;
3650: }
3651: if (filename == NULL) {
3652: ctxt->directory = NULL;
3653: } else {
3654: ctxt->directory = xmlParserGetDirectory(filename);
3655: }
3656:
3657: inputStream = htmlNewInputStream(ctxt);
3658: if (inputStream == NULL) {
3659: xmlFreeParserCtxt(ctxt);
3660: return(NULL);
3661: }
3662:
3663: if (filename == NULL)
3664: inputStream->filename = NULL;
3665: else
3666: inputStream->filename = xmlMemStrdup(filename);
3667: inputStream->buf = buf;
3668: inputStream->base = inputStream->buf->buffer->content;
3669: inputStream->cur = inputStream->buf->buffer->content;
3670:
3671: inputPush(ctxt, inputStream);
3672:
3673: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3674: (ctxt->input->buf != NULL)) {
3675: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3676: #ifdef DEBUG_PUSH
3677: fprintf(stderr, "HPP: pushed %d\n", size);
3678: #endif
3679: }
3680:
3681: return(ctxt);
3682: }
1.1 daniel 3683:
3684: /**
3685: * htmlSAXParseDoc :
1.14 daniel 3686: * @cur: a pointer to an array of xmlChar
1.1 daniel 3687: * @encoding: a free form C string describing the HTML document encoding, or NULL
3688: * @sax: the SAX handler block
3689: * @userData: if using SAX, this pointer will be provided on callbacks.
3690: *
3691: * parse an HTML in-memory document and build a tree.
3692: * It use the given SAX function block to handle the parsing callback.
3693: * If sax is NULL, fallback to the default DOM tree building routines.
3694: *
3695: * Returns the resulting document tree
3696: */
3697:
3698: htmlDocPtr
1.14 daniel 3699: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
1.1 daniel 3700: htmlDocPtr ret;
3701: htmlParserCtxtPtr ctxt;
3702:
3703: if (cur == NULL) return(NULL);
3704:
3705:
3706: ctxt = htmlCreateDocParserCtxt(cur, encoding);
3707: if (ctxt == NULL) return(NULL);
3708: if (sax != NULL) {
3709: ctxt->sax = sax;
3710: ctxt->userData = userData;
3711: }
3712:
3713: htmlParseDocument(ctxt);
3714: ret = ctxt->myDoc;
3715: if (sax != NULL) {
3716: ctxt->sax = NULL;
3717: ctxt->userData = NULL;
3718: }
3719: htmlFreeParserCtxt(ctxt);
3720:
3721: return(ret);
3722: }
3723:
3724: /**
3725: * htmlParseDoc :
1.14 daniel 3726: * @cur: a pointer to an array of xmlChar
1.1 daniel 3727: * @encoding: a free form C string describing the HTML document encoding, or NULL
3728: *
3729: * parse an HTML in-memory document and build a tree.
3730: *
3731: * Returns the resulting document tree
3732: */
3733:
3734: htmlDocPtr
1.14 daniel 3735: htmlParseDoc(xmlChar *cur, const char *encoding) {
1.1 daniel 3736: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
3737: }
3738:
3739:
3740: /**
3741: * htmlCreateFileParserCtxt :
3742: * @filename: the filename
3743: * @encoding: a free form C string describing the HTML document encoding, or NULL
3744: *
3745: * Create a parser context for a file content.
3746: * Automatic support for ZLIB/Compress compressed document is provided
3747: * by default if found at compile-time.
3748: *
3749: * Returns the new parser context or NULL
3750: */
3751: htmlParserCtxtPtr
3752: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
3753: {
3754: htmlParserCtxtPtr ctxt;
3755: htmlParserInputPtr inputStream;
1.5 daniel 3756: xmlParserInputBufferPtr buf;
1.1 daniel 3757: /* htmlCharEncoding enc; */
3758:
1.5 daniel 3759: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
3760: if (buf == NULL) return(NULL);
1.1 daniel 3761:
1.11 daniel 3762: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3763: if (ctxt == NULL) {
3764: perror("malloc");
3765: return(NULL);
3766: }
1.19 daniel 3767: memset(ctxt, 0, sizeof(htmlParserCtxt));
1.1 daniel 3768: htmlInitParserCtxt(ctxt);
1.11 daniel 3769: inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3770: if (inputStream == NULL) {
3771: perror("malloc");
1.11 daniel 3772: xmlFree(ctxt);
1.1 daniel 3773: return(NULL);
3774: }
1.19 daniel 3775: memset(inputStream, 0, sizeof(htmlParserInput));
1.1 daniel 3776:
1.11 daniel 3777: inputStream->filename = xmlMemStrdup(filename);
1.1 daniel 3778: inputStream->line = 1;
3779: inputStream->col = 1;
1.5 daniel 3780: inputStream->buf = buf;
1.21 daniel 3781: inputStream->directory = NULL;
1.1 daniel 3782:
1.5 daniel 3783: inputStream->base = inputStream->buf->buffer->content;
3784: inputStream->cur = inputStream->buf->buffer->content;
3785: inputStream->free = NULL;
1.1 daniel 3786:
3787: inputPush(ctxt, inputStream);
3788: return(ctxt);
3789: }
3790:
3791: /**
3792: * htmlSAXParseFile :
3793: * @filename: the filename
3794: * @encoding: a free form C string describing the HTML document encoding, or NULL
3795: * @sax: the SAX handler block
3796: * @userData: if using SAX, this pointer will be provided on callbacks.
3797: *
3798: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3799: * compressed document is provided by default if found at compile-time.
3800: * It use the given SAX function block to handle the parsing callback.
3801: * If sax is NULL, fallback to the default DOM tree building routines.
3802: *
3803: * Returns the resulting document tree
3804: */
3805:
3806: htmlDocPtr
3807: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
3808: void *userData) {
3809: htmlDocPtr ret;
3810: htmlParserCtxtPtr ctxt;
3811:
3812: ctxt = htmlCreateFileParserCtxt(filename, encoding);
3813: if (ctxt == NULL) return(NULL);
3814: if (sax != NULL) {
3815: ctxt->sax = sax;
3816: ctxt->userData = userData;
3817: }
3818:
3819: htmlParseDocument(ctxt);
3820:
3821: ret = ctxt->myDoc;
3822: if (sax != NULL) {
3823: ctxt->sax = NULL;
3824: ctxt->userData = NULL;
3825: }
3826: htmlFreeParserCtxt(ctxt);
3827:
3828: return(ret);
3829: }
3830:
3831: /**
3832: * htmlParseFile :
3833: * @filename: the filename
3834: * @encoding: a free form C string describing the HTML document encoding, or NULL
3835: *
3836: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
3837: * compressed document is provided by default if found at compile-time.
3838: *
3839: * Returns the resulting document tree
3840: */
3841:
3842: htmlDocPtr
3843: htmlParseFile(const char *filename, const char *encoding) {
3844: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
3845: }
Webmaster