Annotation of XML/HTMLparser.c, revision 1.52
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
1.29 daniel 10: #include "win32config.h"
1.1 daniel 11: #else
1.13 daniel 12: #include "config.h"
1.1 daniel 13: #endif
1.13 daniel 14:
1.39 daniel 15: #include "xmlversion.h"
16: #ifdef LIBXML_HTML_ENABLED
17:
1.1 daniel 18: #include <stdio.h>
1.50 veillard 19: #include <string.h>
1.13 daniel 20: #ifdef HAVE_CTYPE_H
1.1 daniel 21: #include <ctype.h>
1.13 daniel 22: #endif
23: #ifdef HAVE_STDLIB_H
1.1 daniel 24: #include <stdlib.h>
1.13 daniel 25: #endif
26: #ifdef HAVE_SYS_STAT_H
1.1 daniel 27: #include <sys/stat.h>
1.13 daniel 28: #endif
1.1 daniel 29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
1.39 daniel 39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/HTMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
1.50 veillard 44: #include <libxml/parser.h>
1.39 daniel 45: #include <libxml/valid.h>
46: #include <libxml/parserInternals.h>
47: #include <libxml/xmlIO.h>
1.31 daniel 48: #include "xml-error.h"
1.5 daniel 49:
50: #define HTML_MAX_NAMELEN 1000
51: #define INPUT_CHUNK 50
1.31 daniel 52: #define HTML_PARSER_BIG_BUFFER_SIZE 1024
53: #define HTML_PARSER_BUFFER_SIZE 100
1.1 daniel 54:
55: /* #define DEBUG */
1.31 daniel 56: /* #define DEBUG_PUSH */
1.1 daniel 57:
58: /************************************************************************
59: * *
60: * Parser stacks related functions and macros *
61: * *
62: ************************************************************************/
63:
64: /*
65: * Generic function for accessing stacks in the Parser Context
66: */
67:
1.30 daniel 68: #define PUSH_AND_POP(scope, type, name) \
69: scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
1.1 daniel 70: if (ctxt->name##Nr >= ctxt->name##Max) { \
71: ctxt->name##Max *= 2; \
1.50 veillard 72: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
1.1 daniel 73: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74: if (ctxt->name##Tab == NULL) { \
75: fprintf(stderr, "realloc failed !\n"); \
1.33 daniel 76: return(0); \
1.1 daniel 77: } \
78: } \
79: ctxt->name##Tab[ctxt->name##Nr] = value; \
80: ctxt->name = value; \
81: return(ctxt->name##Nr++); \
82: } \
1.30 daniel 83: scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
1.1 daniel 84: type ret; \
1.18 daniel 85: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 86: ctxt->name##Nr--; \
1.18 daniel 87: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 88: if (ctxt->name##Nr > 0) \
89: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90: else \
91: ctxt->name = NULL; \
92: ret = ctxt->name##Tab[ctxt->name##Nr]; \
93: ctxt->name##Tab[ctxt->name##Nr] = 0; \
94: return(ret); \
95: } \
96:
1.30 daniel 97: PUSH_AND_POP(extern, xmlNodePtr, node)
98: PUSH_AND_POP(extern, xmlChar*, name)
1.1 daniel 99:
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt (the parser
 * context) to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they look at raw xmlChar values and should only be used to compare
 * against or skip over ASCII text:
 *
 *   CUR_PTR   the current pointer to the xmlChar to be parsed.
 *   CUR       the current xmlChar value, as an int.
 *   UPPER     the current xmlChar passed through toupper().
 *   NXT(n)    the n'th next xmlChar.
 *   UPP(n)    the n'th next xmlChar passed through toupper().
 *   SKIP(n)   advance the input by n xmlChar and add n to ctxt->nbChars.
 *             NOTE: n is evaluated twice, do not pass an expression with
 *             side effects.
 *   SHRINK    call xmlParserInputShrink() on the current input.
 *   GROW      call xmlParserInputGrow() on the current input, asking for
 *             INPUT_CHUNK more.
 *
 * Clean macros:
 *
 *   CURRENT     the current char value, as an int.
 *   NEXT        skip to the next character through htmlNextChar().
 *   SKIP_BLANKS skip blank characters through htmlSkipBlankChars().
 *
 * NEXT and SKIP_BLANKS already end with a ';' in their expansion; this is
 * kept as is because call sites outside this section may rely on it, but it
 * means they cannot be used as the unbraced body of an if followed by else.
 *
 * Every other expansion is fully parenthesised so that it can be used
 * safely inside a larger expression.
 */

#define CUR ((int) (*ctxt->input->cur))

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define NEXT htmlNextChar(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt);
1.35 daniel 149:
150: /**
151: * htmlNextChar:
152: * @ctxt: the HTML parser context
153: *
154: * Skip to the next char input char.
155: */
156:
157: void
158: htmlNextChar(htmlParserCtxtPtr ctxt) {
1.44 daniel 159: if (ctxt->instate == XML_PARSER_EOF)
160: return;
1.35 daniel 161: if ((*ctxt->input->cur == 0) &&
162: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
163: xmlPopInput(ctxt);
164: } else {
165: if (*(ctxt->input->cur) == '\n') {
166: ctxt->input->line++; ctxt->input->col = 1;
167: } else ctxt->input->col++;
168: ctxt->input->cur++;
169: ctxt->nbChars++;
170: if (*ctxt->input->cur == 0)
171: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
172: }
173: }
1.5 daniel 174:
1.36 daniel 175: /**
176: * htmlSkipBlankChars:
177: * @ctxt: the HTML parser context
178: *
179: * skip all blanks character found at that point in the input streams.
180: *
181: * Returns the number of space chars skipped
182: */
183:
184: int
185: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
186: int res = 0;
187:
188: while (IS_BLANK(*(ctxt->input->cur))) {
189: if ((*ctxt->input->cur == 0) &&
190: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
191: xmlPopInput(ctxt);
192: } else {
193: if (*(ctxt->input->cur) == '\n') {
194: ctxt->input->line++; ctxt->input->col = 1;
195: } else ctxt->input->col++;
196: ctxt->input->cur++;
197: ctxt->nbChars++;
198: if (*ctxt->input->cur == 0)
199: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
200: }
201: res++;
202: }
203: return(res);
204: }
1.1 daniel 205:
206:
1.5 daniel 207:
1.1 daniel 208: /************************************************************************
209: * *
210: * The list of HTML elements and their properties *
211: * *
212: ************************************************************************/
213:
214: /*
* Table of the HTML 4.0 elements known to the parser, one row per element.
*
* Meaning of the columns:
215: * Start Tag: 1 means the start tag can be omitted
216: * End Tag: 1 means the end tag can be omitted
217: * 2 means it's forbidden (empty elements)
* Empty: in the rows below this is 1 exactly for the elements whose
* End Tag column is 2
218: * Depr: this element is deprecated
219: * DTD: 1 means that this element is valid only in the Loose DTD
220: * 2 means that this element is valid only in the Frameset DTD
221: *
* Column order of each row:
222: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
*
* htmlTagLookup() walks this table from the first row to the last and
* compares each Name with xmlStrcmp(); the number of rows is derived with
* sizeof, so no terminating row is needed.
223: */
224: htmlElemDesc html40ElementTable[] = {
1.26 daniel 225: { "a", 0, 0, 0, 0, 0, "anchor " },
226: { "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
227: { "acronym", 0, 0, 0, 0, 0, "" },
228: { "address", 0, 0, 0, 0, 0, "information on author " },
229: { "applet", 0, 0, 0, 1, 1, "java applet " },
230: { "area", 0, 2, 1, 0, 0, "client-side image map area " },
231: { "b", 0, 0, 0, 0, 0, "bold text style" },
232: { "base", 0, 2, 1, 0, 0, "document base uri " },
233: { "basefont", 0, 2, 1, 1, 1, "base font size " },
234: { "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
235: { "big", 0, 0, 0, 0, 0, "large text style" },
236: { "blockquote", 0, 0, 0, 0, 0, "long quotation " },
237: { "body", 1, 1, 0, 0, 0, "document body " },
238: { "br", 0, 2, 1, 0, 0, "forced line break " },
239: { "button", 0, 0, 0, 0, 0, "push button " },
240: { "caption", 0, 0, 0, 0, 0, "table caption " },
241: { "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
242: { "cite", 0, 0, 0, 0, 0, "citation" },
243: { "code", 0, 0, 0, 0, 0, "computer code fragment" },
244: { "col", 0, 2, 1, 0, 0, "table column " },
245: { "colgroup", 0, 1, 0, 0, 0, "table column group " },
246: { "dd", 0, 1, 0, 0, 0, "definition description " },
247: { "del", 0, 0, 0, 0, 0, "deleted text " },
248: { "dfn", 0, 0, 0, 0, 0, "instance definition" },
249: { "dir", 0, 0, 0, 1, 1, "directory list" },
250: { "div", 0, 0, 0, 0, 0, "generic language/style container"},
251: { "dl", 0, 0, 0, 0, 0, "definition list " },
252: { "dt", 0, 1, 0, 0, 0, "definition term " },
253: { "em", 0, 0, 0, 0, 0, "emphasis" },
254: { "fieldset", 0, 0, 0, 0, 0, "form control group " },
255: { "font", 0, 0, 0, 1, 1, "local change to font " },
256: { "form", 0, 0, 0, 0, 0, "interactive form " },
257: { "frame", 0, 2, 1, 0, 2, "subwindow " },
258: { "frameset", 0, 0, 0, 0, 2, "window subdivision" },
259: { "h1", 0, 0, 0, 0, 0, "heading " },
260: { "h2", 0, 0, 0, 0, 0, "heading " },
261: { "h3", 0, 0, 0, 0, 0, "heading " },
262: { "h4", 0, 0, 0, 0, 0, "heading " },
263: { "h5", 0, 0, 0, 0, 0, "heading " },
264: { "h6", 0, 0, 0, 0, 0, "heading " },
265: { "head", 1, 1, 0, 0, 0, "document head " },
266: { "hr", 0, 2, 1, 0, 0, "horizontal rule " },
267: { "html", 1, 1, 0, 0, 0, "document root element " },
268: { "i", 0, 0, 0, 0, 0, "italic text style" },
269: { "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
270: { "img", 0, 2, 1, 0, 0, "embedded image " },
271: { "input", 0, 2, 1, 0, 0, "form control " },
272: { "ins", 0, 0, 0, 0, 0, "inserted text" },
273: { "isindex", 0, 2, 1, 1, 1, "single line prompt " },
274: { "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
275: { "label", 0, 0, 0, 0, 0, "form field label text " },
276: { "legend", 0, 0, 0, 0, 0, "fieldset legend " },
277: { "li", 0, 1, 0, 0, 0, "list item " },
278: { "link", 0, 2, 1, 0, 0, "a media-independent link " },
279: { "map", 0, 0, 0, 0, 0, "client-side image map " },
280: { "menu", 0, 0, 0, 1, 1, "menu list " },
281: { "meta", 0, 2, 1, 0, 0, "generic metainformation " },
282: { "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
283: { "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
284: { "object", 0, 0, 0, 0, 0, "generic embedded object " },
285: { "ol", 0, 0, 0, 0, 0, "ordered list " },
286: { "optgroup", 0, 0, 0, 0, 0, "option group " },
287: { "option", 0, 1, 0, 0, 0, "selectable choice " },
288: { "p", 0, 1, 0, 0, 0, "paragraph " },
289: { "param", 0, 2, 1, 0, 0, "named property value " },
290: { "pre", 0, 0, 0, 0, 0, "preformatted text " },
291: { "q", 0, 0, 0, 0, 0, "short inline quotation " },
292: { "s", 0, 0, 0, 1, 1, "strike-through text style" },
293: { "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
294: { "script", 0, 0, 0, 0, 0, "script statements " },
295: { "select", 0, 0, 0, 0, 0, "option selector " },
296: { "small", 0, 0, 0, 0, 0, "small text style" },
297: { "span", 0, 0, 0, 0, 0, "generic language/style container " },
298: { "strike", 0, 0, 0, 1, 1, "strike-through text" },
299: { "strong", 0, 0, 0, 0, 0, "strong emphasis" },
300: { "style", 0, 0, 0, 0, 0, "style info " },
301: { "sub", 0, 0, 0, 0, 0, "subscript" },
302: { "sup", 0, 0, 0, 0, 0, "superscript " },
303: { "table", 0, 0, 0, 0, 0, " " },
304: { "tbody", 1, 1, 0, 0, 0, "table body " },
305: { "td", 0, 1, 0, 0, 0, "table data cell" },
306: { "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
307: { "tfoot", 0, 1, 0, 0, 0, "table footer " },
308: { "th", 0, 1, 0, 0, 0, "table header cell" },
309: { "thead", 0, 1, 0, 0, 0, "table header " },
310: { "title", 0, 0, 0, 0, 0, "document title " },
311: { "tr", 0, 1, 0, 0, 0, "table row " },
312: { "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
313: { "u", 0, 0, 0, 1, 1, "underlined text style" },
314: { "ul", 0, 0, 0, 0, 0, "unordered list " },
315: { "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
1.1 daniel 316: };
317:
318: /*
319: * start tags that imply the end of a current element
320: * any tag of each line implies the end of the current element if the type of
321: * that element is in the same line
*
* Layout: each line is a NULL-terminated group of equivalent tag names and
* a lone NULL closes the whole table.
* "listing" and "xmp" appear here although html40ElementTable has no row
* for them.
* NOTE(review): no function in the reviewed part of this file reads this
* table - confirm whether it is still used elsewhere.
322: */
1.8 daniel 323: char *htmlEquEnd[] = {
1.26 daniel 324: "dt", "dd", "li", "option", NULL,
325: "h1", "h2", "h3", "h4", "h5", "h6", NULL,
326: "ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,
1.1 daniel 327: NULL
328: };
329: /*
330: * according to the HTML DTD, HR should be added to the 2nd line above, as it
331: * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
332: * because many documents contain rules in headings...
333: */
334:
335: /*
336: * start tags that imply the end of current element
*
* Layout: the table is a sequence of NULL-terminated runs followed by one
* extra NULL. In each run the first string is the name of the start tag
* being opened and the remaining strings are the names of the currently
* open elements which that start tag closes.
*
* htmlInitAutoClose() stores a pointer to the first string of each run in
* htmlStartCloseIndex (at most 99 runs are indexed), and
* htmlCheckAutoClose(newtag, oldtag) finds the run whose first string is
* newtag and then looks for oldtag among the rest of that run.
337: */
1.8 daniel 338: char *htmlStartClose[] = {
1.26 daniel 339: "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
340: "dl", "ul", "ol", "menu", "dir", "address", "pre",
341: "listing", "xmp", "head", NULL,
342: "head", "p", NULL,
343: "title", "p", NULL,
344: "body", "head", "style", "link", "title", "p", NULL,
345: "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
346: "pre", "listing", "xmp", "head", "li", NULL,
347: "hr", "p", "head", NULL,
348: "h1", "p", "head", NULL,
349: "h2", "p", "head", NULL,
350: "h3", "p", "head", NULL,
351: "h4", "p", "head", NULL,
352: "h5", "p", "head", NULL,
353: "h6", "p", "head", NULL,
354: "dir", "p", "head", NULL,
355: "address", "p", "head", "ul", NULL,
356: "pre", "p", "head", "ul", NULL,
357: "listing", "p", "head", NULL,
358: "xmp", "p", "head", NULL,
359: "blockquote", "p", "head", NULL,
360: "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
361: "xmp", "head", NULL,
362: "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
363: "head", "dd", NULL,
364: "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
365: "head", "dt", NULL,
366: "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
367: "listing", "xmp", NULL,
368: "ol", "p", "head", "ul", NULL,
369: "menu", "p", "head", "ul", NULL,
370: "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
371: "div", "p", "head", NULL,
372: "noscript", "p", "head", NULL,
373: "center", "font", "b", "i", "p", "head", NULL,
374: "a", "a", NULL,
375: "caption", "p", NULL,
376: "colgroup", "caption", "colgroup", "col", "p", NULL,
377: "col", "caption", "col", "p", NULL,
378: "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
379: "listing", "xmp", "a", NULL,
380: "th", "th", "td", NULL,
381: "td", "th", "td", "p", NULL,
382: "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
383: "thead", "caption", "col", "colgroup", NULL,
384: "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
385: "tbody", "p", NULL,
386: "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
387: "tfoot", "tbody", "p", NULL,
388: "optgroup", "option", NULL,
389: "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
390: "pre", "listing", "xmp", "a", NULL,
1.1 daniel 391: NULL
392: };
393:
1.43 daniel 394:
1.8 daniel 395: static char** htmlStartCloseIndex[100];
1.1 daniel 396: static int htmlStartCloseIndexinitialized = 0;
397:
398: /************************************************************************
399: * *
400: * functions to handle HTML specific data *
401: * *
402: ************************************************************************/
403:
404: /**
405: * htmlInitAutoClose:
406: *
407: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
408: *
409: */
410: void
411: htmlInitAutoClose(void) {
412: int index, i = 0;
413:
414: if (htmlStartCloseIndexinitialized) return;
415:
416: for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
417: index = 0;
418: while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
419: htmlStartCloseIndex[index++] = &htmlStartClose[i];
420: while (htmlStartClose[i] != NULL) i++;
421: i++;
422: }
423: }
424:
425: /**
426: * htmlTagLookup:
427: * @tag: The tag name
428: *
429: * Lookup the HTML tag in the ElementTable
430: *
431: * Returns the related htmlElemDescPtr or NULL if not found.
432: */
433: htmlElemDescPtr
1.14 daniel 434: htmlTagLookup(const xmlChar *tag) {
1.1 daniel 435: int i = 0;
436:
437: for (i = 0; i < (sizeof(html40ElementTable) /
438: sizeof(html40ElementTable[0]));i++) {
1.8 daniel 439: if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 440: return(&html40ElementTable[i]);
441: }
442: return(NULL);
443: }
444:
445: /**
446: * htmlCheckAutoClose:
1.50 veillard 447: * @newtag: The new tag name
448: * @oldtag: The old tag name
1.1 daniel 449: *
450: * Checks wether the new tag is one of the registered valid tags for closing old.
451: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
452: *
453: * Returns 0 if no, 1 if yes.
454: */
455: int
1.50 veillard 456: htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
1.1 daniel 457: int i, index;
1.8 daniel 458: char **close;
1.1 daniel 459:
460: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
461:
462: /* inefficient, but not a big deal */
463: for (index = 0; index < 100;index++) {
464: close = htmlStartCloseIndex[index];
465: if (close == NULL) return(0);
1.50 veillard 466: if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
1.1 daniel 467: }
468:
469: i = close - htmlStartClose;
470: i++;
471: while (htmlStartClose[i] != NULL) {
1.50 veillard 472: if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) {
1.1 daniel 473: return(1);
474: }
475: i++;
476: }
477: return(0);
478: }
479:
480: /**
1.50 veillard 481: * htmlAutoCloseOnClose:
482: * @ctxt: an HTML parser context
483: * @newtag: The new tag name
484: *
485: * The HTmL DtD allows an ending tag to implicitely close other tags.
486: */
487: void
488: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
489: htmlElemDescPtr info;
490: xmlChar *oldname;
491: int i;
492:
493: #ifdef DEBUG
494: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
495: for (i = 0;i < ctxt->nameNr;i++)
496: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
497: #endif
498:
499: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
500: if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
501: }
502: if (i < 0) return;
503:
504: while (xmlStrcmp(newtag, ctxt->name)) {
505: info = htmlTagLookup(ctxt->name);
506: if ((info == NULL) || (info->endTag == 1)) {
507: #ifdef DEBUG
508: fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
509: #endif
510: } else {
511: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
512: ctxt->sax->error(ctxt->userData,
513: "Opening and ending tag mismatch: %s and %s\n",
514: newtag, ctxt->name);
515: ctxt->wellFormed = 0;
516: }
517: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
518: ctxt->sax->endElement(ctxt->userData, ctxt->name);
519: oldname = htmlnamePop(ctxt);
520: if (oldname != NULL) {
521: #ifdef DEBUG
522: fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
523: #endif
524: xmlFree(oldname);
525: }
526: }
527: }
528:
529: /**
1.1 daniel 530: * htmlAutoClose:
531: * @ctxt: an HTML parser context
1.50 veillard 532: * @newtag: The new tag name or NULL
1.1 daniel 533: *
534: * The HTmL DtD allows a tag to implicitely close other tags.
535: * The list is kept in htmlStartClose array. This function is
536: * called when a new tag has been detected and generates the
537: * appropriates closes if possible/needed.
1.50 veillard 538: * If newtag is NULL this mean we are at the end of the resource
1.47 daniel 539: * and we should check
1.1 daniel 540: */
541: void
1.50 veillard 542: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1.15 daniel 543: xmlChar *oldname;
1.50 veillard 544: while ((newtag != NULL) && (ctxt->name != NULL) &&
545: (htmlCheckAutoClose(newtag, ctxt->name))) {
1.1 daniel 546: #ifdef DEBUG
1.50 veillard 547: fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1.1 daniel 548: #endif
549: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1.15 daniel 550: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 551: oldname = htmlnamePop(ctxt);
1.18 daniel 552: if (oldname != NULL) {
553: #ifdef DEBUG
554: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
555: #endif
1.17 daniel 556: xmlFree(oldname);
1.18 daniel 557: }
1.1 daniel 558: }
1.50 veillard 559: if (newtag == NULL) {
1.49 daniel 560: htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
561: htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
562: htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
563: }
1.50 veillard 564: while ((newtag == NULL) && (ctxt->name != NULL) &&
1.47 daniel 565: ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
566: (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
567: (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
568: #ifdef DEBUG
569: fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
570: #endif
571: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
572: ctxt->sax->endElement(ctxt->userData, ctxt->name);
573: oldname = htmlnamePop(ctxt);
574: if (oldname != NULL) {
575: #ifdef DEBUG
576: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
577: #endif
578: xmlFree(oldname);
579: }
580: }
581:
1.1 daniel 582: }
583:
584: /**
1.28 daniel 585: * htmlAutoCloseTag:
586: * @doc: the HTML document
587: * @name: The tag name
588: * @elem: the HTML element
589: *
590: * The HTmL DtD allows a tag to implicitely close other tags.
591: * The list is kept in htmlStartClose array. This function checks
592: * if the element or one of it's children would autoclose the
593: * given tag.
594: *
595: * Returns 1 if autoclose, 0 otherwise
596: */
597: int
598: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
599: htmlNodePtr child;
600:
601: if (elem == NULL) return(1);
602: if (!xmlStrcmp(name, elem->name)) return(0);
603: if (htmlCheckAutoClose(elem->name, name)) return(1);
1.37 daniel 604: child = elem->children;
1.28 daniel 605: while (child != NULL) {
606: if (htmlAutoCloseTag(doc, name, child)) return(1);
607: child = child->next;
608: }
609: return(0);
610: }
611:
612: /**
613: * htmlIsAutoClosed:
614: * @doc: the HTML document
615: * @elem: the HTML element
616: *
617: * The HTmL DtD allows a tag to implicitely close other tags.
618: * The list is kept in htmlStartClose array. This function checks
619: * if a tag is autoclosed by one of it's child
620: *
621: * Returns 1 if autoclosed, 0 otherwise
622: */
623: int
624: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
625: htmlNodePtr child;
626:
627: if (elem == NULL) return(1);
1.37 daniel 628: child = elem->children;
1.28 daniel 629: while (child != NULL) {
630: if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
631: child = child->next;
632: }
633: return(0);
634: }
635:
636: /**
1.43 daniel 637: * htmlCheckImplied:
638: * @ctxt: an HTML parser context
1.50 veillard 639: * @newtag: The new tag name
1.43 daniel 640: *
641: * The HTmL DtD allows a tag to exists only implicitely
642: * called when a new tag has been detected and generates the
643: * appropriates implicit tags if missing
644: */
645: void
1.50 veillard 646: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
647: if (!xmlStrcmp(newtag, BAD_CAST"html"))
1.43 daniel 648: return;
649: if (ctxt->nameNr <= 0) {
650: #ifdef DEBUG
651: fprintf(stderr,"Implied element html: pushed html\n");
652: #endif
653: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
654: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
655: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
656: }
1.50 veillard 657: if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
1.43 daniel 658: return;
659: if (ctxt->nameNr <= 1) {
1.50 veillard 660: if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
661: (!xmlStrcmp(newtag, BAD_CAST"style")) ||
662: (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
663: (!xmlStrcmp(newtag, BAD_CAST"link")) ||
664: (!xmlStrcmp(newtag, BAD_CAST"title")) ||
665: (!xmlStrcmp(newtag, BAD_CAST"base"))) {
1.43 daniel 666: /*
667: * dropped OBJECT ... i you put it first BODY will be
668: * assumed !
669: */
670: #ifdef DEBUG
671: fprintf(stderr,"Implied element head: pushed head\n");
672: #endif
673: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
674: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
675: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
676: } else {
677: #ifdef DEBUG
678: fprintf(stderr,"Implied element body: pushed body\n");
679: #endif
680: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
681: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
682: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
683: }
684: }
685: }
686:
1.1 daniel 687: /************************************************************************
688: * *
689: * The list of HTML predefined entities *
690: * *
691: ************************************************************************/
692:
693:
694: htmlEntityDesc html40EntitiesTable[] = {
695: /*
696: * the 4 absolute ones,
697: */
698: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
699: { 38, "amp", "ampersand, U+0026 ISOnum" },
700: { 60, "lt", "less-than sign, U+003C ISOnum" },
701: { 62, "gt", "greater-than sign, U+003E ISOnum" },
702:
703: /*
704: * A bunch still in the 128-255 range
705: * Replacing them depend really on the charset used.
706: */
1.28 daniel 707: { 39, "apos", "single quote" },
1.1 daniel 708: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
709: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
710: { 162, "cent", "cent sign, U+00A2 ISOnum" },
711: { 163, "pound","pound sign, U+00A3 ISOnum" },
712: { 164, "curren","currency sign, U+00A4 ISOnum" },
713: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
714: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
715: { 167, "sect", "section sign, U+00A7 ISOnum" },
716: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
717: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
718: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
719: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
720: { 172, "not", "not sign, U+00AC ISOnum" },
721: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
722: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
723: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
724: { 176, "deg", "degree sign, U+00B0 ISOnum" },
725: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
726: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
727: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
728: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
729: { 181, "micro","micro sign, U+00B5 ISOnum" },
730: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 731: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 732: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
733: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
734: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 735: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 736: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
737: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
738: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
739: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
740: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
741: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
742: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
743: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
744: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
745: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
746: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
747: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
748: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
749: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
750: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
751: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
752: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
753: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
754: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
755: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
756: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
757: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
758: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
759: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
760: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
761: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
762: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
763: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 764: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 765: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
766: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
767: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
768: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
769: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
770: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
771: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
772: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
773: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
774: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
775: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
776: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
777: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
778: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
779: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
780: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
781: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
782: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
783: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
784: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
785: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
786: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
787: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
788: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
789: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
790: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
791: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
792: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
793: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
794: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
795: { 247, "divide","division sign, U+00F7 ISOnum" },
796: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
797: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
798: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
799: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
800: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
801: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
802: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
803: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
804:
805: /*
806: * Anything below should really be kept as entities references
807: */
808: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
809:
810: { 913, "Alpha","greek capital letter alpha, U+0391" },
811: { 914, "Beta", "greek capital letter beta, U+0392" },
812: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
813: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
814: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
815: { 918, "Zeta", "greek capital letter zeta, U+0396" },
816: { 919, "Eta", "greek capital letter eta, U+0397" },
817: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
818: { 921, "Iota", "greek capital letter iota, U+0399" },
819: { 922, "Kappa","greek capital letter kappa, U+039A" },
820: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
821: { 924, "Mu", "greek capital letter mu, U+039C" },
822: { 925, "Nu", "greek capital letter nu, U+039D" },
823: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
824: { 927, "Omicron","greek capital letter omicron, U+039F" },
825: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
826: { 929, "Rho", "greek capital letter rho, U+03A1" },
827: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
828: { 932, "Tau", "greek capital letter tau, U+03A4" },
829: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
830: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
831: { 935, "Chi", "greek capital letter chi, U+03A7" },
832: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
833: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
834:
835: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
836: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
837: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
838: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
839: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
840: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
841: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
842: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
843: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
844: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
845: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
846: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
847: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
848: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
849: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
850: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
851: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
852: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
853: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
854: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
855: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
856: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
857: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
858: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
859: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
860: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
861: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
862: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
863:
864: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
865: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
866: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
867: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
868: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
869: { 8260, "frasl","fraction slash, U+2044 NEW" },
870:
1.7 daniel 871: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 872: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
873: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
874: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
875: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
876: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
877: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
878: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
879: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
880: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
881: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
882: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
883: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
884: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
885: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
886: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
887:
888:
889: { 8704, "forall","for all, U+2200 ISOtech" },
890: { 8706, "part", "partial differential, U+2202 ISOtech" },
891: { 8707, "exist","there exists, U+2203 ISOtech" },
892: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
893: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
894: { 8712, "isin", "element of, U+2208 ISOtech" },
895: { 8713, "notin","not an element of, U+2209 ISOtech" },
896: { 8715, "ni", "contains as member, U+220B ISOtech" },
897: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
898: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
899: { 8722, "minus","minus sign, U+2212 ISOtech" },
900: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
901: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
902: { 8733, "prop", "proportional to, U+221D ISOtech" },
903: { 8734, "infin","infinity, U+221E ISOtech" },
904: { 8736, "ang", "angle, U+2220 ISOamso" },
905: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
906: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
907: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
908: { 8746, "cup", "union = cup, U+222A ISOtech" },
909: { 8747, "int", "integral, U+222B ISOtech" },
910: { 8756, "there4","therefore, U+2234 ISOtech" },
911: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
912: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
913: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
914: { 8800, "ne", "not equal to, U+2260 ISOtech" },
915: { 8801, "equiv","identical to, U+2261 ISOtech" },
916: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
917: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
918: { 8834, "sub", "subset of, U+2282 ISOtech" },
919: { 8835, "sup", "superset of, U+2283 ISOtech" },
920: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
921: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
922: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
923: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
924: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
925: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
926: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
927: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
928: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
929: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
930: { 8971, "rfloor","right floor, U+230B ISOamsc" },
931: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
932: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
933: { 9674, "loz", "lozenge, U+25CA ISOpub" },
934:
935: { 9824, "spades","black spade suit, U+2660 ISOpub" },
936: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
937: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
938: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
939:
940: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
941: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
942: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
943: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
944: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
945: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
946: { 732, "tilde","small tilde, U+02DC ISOdia" },
947:
948: { 8194, "ensp", "en space, U+2002 ISOpub" },
949: { 8195, "emsp", "em space, U+2003 ISOpub" },
950: { 8201, "thinsp","thin space, U+2009 ISOpub" },
951: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
952: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
953: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
954: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
955: { 8211, "ndash","en dash, U+2013 ISOpub" },
956: { 8212, "mdash","em dash, U+2014 ISOpub" },
957: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
958: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
959: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
960: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
961: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
962: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
963: { 8224, "dagger","dagger, U+2020 ISOpub" },
964: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
965: { 8240, "permil","per mille sign, U+2030 ISOtech" },
966: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1.7 daniel 967: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1.1 daniel 968: { 8364, "euro", "euro sign, U+20AC NEW" }
969: };
970:
971: /************************************************************************
972: * *
973: * Commodity functions to handle entities *
974: * *
975: ************************************************************************/
976:
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to that many xmlChar.
 * It can only be expanded inside a function returning a pointer: when the
 * reallocation fails the previous block is released (it used to be leaked,
 * since the only pointer to it was overwritten with NULL) and the enclosing
 * function returns NULL.
 * The block may move, so any cursor pointing into the buffer has to be
 * recomputed by the caller after the macro has been expanded.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                  buffer##_size * sizeof(xmlChar));     \
    if (growBuffer_tmp == NULL) {                                       \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
988:
989: /**
990: * htmlEntityLookup:
991: * @name: the entity name
992: *
993: * Lookup the given entity in EntitiesTable
994: *
995: * TODO: the linear scan is really ugly, an hash table is really needed.
996: *
997: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
998: */
999: htmlEntityDescPtr
1.14 daniel 1000: htmlEntityLookup(const xmlChar *name) {
1.1 daniel 1001: int i;
1002:
1003: for (i = 0;i < (sizeof(html40EntitiesTable)/
1004: sizeof(html40EntitiesTable[0]));i++) {
1.8 daniel 1005: if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 1006: #ifdef DEBUG
1.18 daniel 1007: fprintf(stderr,"Found entity %s\n", name);
1.1 daniel 1008: #endif
1009: return(&html40EntitiesTable[i]);
1010: }
1011: }
1012: return(NULL);
1013: }
1014:
1015:
1016: /**
1017: * htmlDecodeEntities:
1018: * @ctxt: the parser context
1019: * @len: the len to decode (in bytes !), -1 for no size limit
1.14 daniel 1020: * @end: an end marker xmlChar, 0 if none
1021: * @end2: an end marker xmlChar, 0 if none
1022: * @end3: an end marker xmlChar, 0 if none
1.1 daniel 1023: *
1024: * Subtitute the HTML entities by their value
1025: *
1.19 daniel 1026: * DEPRECATED !!!!
1.1 daniel 1027: *
1028: * Returns A newly allocated string with the substitution done. The caller
1029: * must deallocate it !
1030: */
1.14 daniel 1031: xmlChar *
1.1 daniel 1032: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1.14 daniel 1033: xmlChar end, xmlChar end2, xmlChar end3) {
1034: xmlChar *buffer = NULL;
1.1 daniel 1035: int buffer_size = 0;
1.14 daniel 1036: xmlChar *out = NULL;
1037: xmlChar *name = NULL;
1.1 daniel 1038:
1.14 daniel 1039: xmlChar *cur = NULL;
1.1 daniel 1040: htmlEntityDescPtr ent;
1.5 daniel 1041: int nbchars = 0;
1.1 daniel 1042: unsigned int max = (unsigned int) len;
1043:
1044: /*
1045: * allocate a translation buffer.
1046: */
1.31 daniel 1047: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1.14 daniel 1048: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1.1 daniel 1049: if (buffer == NULL) {
1050: perror("htmlDecodeEntities: malloc failed");
1051: return(NULL);
1052: }
1053: out = buffer;
1054:
1055: /*
1056: * Ok loop until we reach one of the ending char or a size limit.
1057: */
1.45 daniel 1058: while ((nbchars < (int) max) && (CUR != end) &&
1.1 daniel 1059: (CUR != end2) && (CUR != end3)) {
1060:
1061: if (CUR == '&') {
1062: if (NXT(1) == '#') {
1.52 ! veillard 1063: unsigned int c;
! 1064: int bits;
! 1065:
! 1066: c = htmlParseCharRef(ctxt);
! 1067: if (c < 0x80)
! 1068: { *out++ = c; bits= -6; }
! 1069: else if (c < 0x800)
! 1070: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1071: else if (c < 0x10000)
! 1072: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1073: else
! 1074: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 1075:
! 1076: for ( ; bits >= 0; bits-= 6) {
! 1077: *out++ = ((c >> bits) & 0x3F) | 0x80;
! 1078: }
! 1079:
! 1080: nbchars += 4; /* !!!! */
1.1 daniel 1081: } else {
1082: ent = htmlParseEntityRef(ctxt, &name);
1083: if (name != NULL) {
1.52 ! veillard 1084: if (ent == NULL) {
1.1 daniel 1085: *out++ = '&';
1086: cur = name;
1087: while (*cur != 0) {
1088: if (out - buffer > buffer_size - 100) {
1089: int index = out - buffer;
1090:
1091: growBuffer(buffer);
1092: out = &buffer[index];
1093: }
1094: *out++ = *cur++;
1095: }
1096: *out++ = ';';
1097: } else {
1.52 ! veillard 1098: unsigned int c;
! 1099: int bits;
! 1100:
1.1 daniel 1101: if (out - buffer > buffer_size - 100) {
1102: int index = out - buffer;
1103:
1104: growBuffer(buffer);
1105: out = &buffer[index];
1106: }
1.52 ! veillard 1107: c = (xmlChar)ent->value;
! 1108: if (c < 0x80)
! 1109: { *out++ = c; bits= -6; }
! 1110: else if (c < 0x800)
! 1111: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1112: else if (c < 0x10000)
! 1113: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1114: else
! 1115: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 1116:
! 1117: for ( ; bits >= 0; bits-= 6) {
! 1118: *out++ = ((c >> bits) & 0x3F) | 0x80;
! 1119: }
1.1 daniel 1120: }
1.5 daniel 1121: nbchars += 2 + xmlStrlen(name);
1.11 daniel 1122: xmlFree(name);
1.1 daniel 1123: }
1124: }
1125: } else {
1.52 ! veillard 1126: unsigned int c;
! 1127: int bits;
! 1128:
1.1 daniel 1129: if (out - buffer > buffer_size - 100) {
1.52 ! veillard 1130: int index = out - buffer;
! 1131:
! 1132: growBuffer(buffer);
! 1133: out = &buffer[index];
! 1134: }
! 1135: c = CUR;
! 1136: if (c < 0x80)
! 1137: { *out++ = c; bits= -6; }
! 1138: else if (c < 0x800)
! 1139: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1140: else if (c < 0x10000)
! 1141: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1142: else
! 1143: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 1144:
! 1145: for ( ; bits >= 0; bits-= 6) {
! 1146: *out++ = ((c >> bits) & 0x3F) | 0x80;
1.1 daniel 1147: }
1148: NEXT;
1149: }
1150: }
1151: *out++ = 0;
1152: return(buffer);
1153: }
1154:
1.31 daniel 1155: /************************************************************************
1156: * *
1157: * Commodity functions to handle streams *
1158: * *
1159: ************************************************************************/
1160:
1161: /**
1162: * htmlFreeInputStream:
1163: * @input: an htmlParserInputPtr
1164: *
1165: * Free up an input stream.
1166: */
1167: void
1168: htmlFreeInputStream(htmlParserInputPtr input) {
1169: if (input == NULL) return;
1170:
1171: if (input->filename != NULL) xmlFree((char *) input->filename);
1172: if (input->directory != NULL) xmlFree((char *) input->directory);
1173: if ((input->free != NULL) && (input->base != NULL))
1174: input->free((xmlChar *) input->base);
1175: if (input->buf != NULL)
1176: xmlFreeParserInputBuffer(input->buf);
1177: memset(input, -1, sizeof(htmlParserInput));
1178: xmlFree(input);
1179: }
1180:
1181: /**
1182: * htmlNewInputStream:
1183: * @ctxt: an HTML parser context
1184: *
1185: * Create a new input stream structure
1186: * Returns the new input stream or NULL
1187: */
1188: htmlParserInputPtr
1189: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1190: htmlParserInputPtr input;
1191:
1192: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1193: if (input == NULL) {
1194: ctxt->errNo = XML_ERR_NO_MEMORY;
1195: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1196: ctxt->sax->error(ctxt->userData,
1197: "malloc: couldn't allocate a new input stream\n");
1198: ctxt->errNo = XML_ERR_NO_MEMORY;
1199: return(NULL);
1200: }
1.51 veillard 1201: memset(input, 0, sizeof(htmlParserInput));
1.31 daniel 1202: input->filename = NULL;
1203: input->directory = NULL;
1204: input->base = NULL;
1205: input->cur = NULL;
1206: input->buf = NULL;
1207: input->line = 1;
1208: input->col = 1;
1209: input->buf = NULL;
1210: input->free = NULL;
1.51 veillard 1211: input->version = NULL;
1.31 daniel 1212: input->consumed = 0;
1213: input->length = 0;
1214: return(input);
1215: }
1216:
1.1 daniel 1217:
1218: /************************************************************************
1219: * *
1220: * Commodity functions, cleanup needed ? *
1221: * *
1222: ************************************************************************/
1223:
1224: /**
1225: * areBlanks:
1226: * @ctxt: an HTML parser context
1.14 daniel 1227: * @str: a xmlChar *
1.1 daniel 1228: * @len: the size of @str
1229: *
1230: * Is this a sequence of blank chars that one can ignore ?
1231: *
1232: * Returns 1 if ignorable 0 otherwise.
1233: */
1234:
1.14 daniel 1235: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1.1 daniel 1236: int i;
1237: xmlNodePtr lastChild;
1238:
1239: for (i = 0;i < len;i++)
1240: if (!(IS_BLANK(str[i]))) return(0);
1241:
1.48 daniel 1242: if (CUR == 0) return(1);
1.1 daniel 1243: if (CUR != '<') return(0);
1244: if (ctxt->node == NULL) return(0);
1245: lastChild = xmlGetLastChild(ctxt->node);
1246: if (lastChild == NULL) {
1247: if (ctxt->node->content != NULL) return(0);
1248: } else if (xmlNodeIsText(lastChild))
1249: return(0);
1250: return(1);
1251: }
1252:
1253: /**
1254: * htmlHandleEntity:
1255: * @ctxt: an HTML parser context
1256: * @entity: an XML entity pointer.
1257: *
1258: * Default handling of an HTML entity, call the parser with the
1259: * substitution string
1260: */
1261:
1262: void
1263: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1264: int len;
1265:
1266: if (entity->content == NULL) {
1267: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1268: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1269: entity->name);
1270: ctxt->wellFormed = 0;
1271: return;
1272: }
1273: len = xmlStrlen(entity->content);
1274:
1275: /*
1276: * Just handle the content as a set of chars.
1277: */
1278: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1279: ctxt->sax->characters(ctxt->userData, entity->content, len);
1280:
1281: }
1282:
1283: /**
1284: * htmlNewDoc:
1285: * @URI: URI for the dtd, or NULL
1286: * @ExternalID: the external ID of the DTD, or NULL
1287: *
1288: * Returns a new document
1289: */
1290: htmlDocPtr
1.14 daniel 1291: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1.1 daniel 1292: xmlDocPtr cur;
1293:
1294: /*
1295: * Allocate a new document and fill the fields.
1296: */
1.11 daniel 1297: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1.1 daniel 1298: if (cur == NULL) {
1299: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1300: return(NULL);
1301: }
1.10 daniel 1302: memset(cur, 0, sizeof(xmlDoc));
1.1 daniel 1303:
1.20 daniel 1304: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 1305: cur->version = NULL;
1306: cur->intSubset = NULL;
1.28 daniel 1307: if ((ExternalID == NULL) &&
1308: (URI == NULL))
1309: xmlCreateIntSubset(cur, BAD_CAST "HTML",
1310: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1311: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1312: else
1313: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.41 daniel 1314: cur->doc = cur;
1.1 daniel 1315: cur->name = NULL;
1.37 daniel 1316: cur->children = NULL;
1.1 daniel 1317: cur->extSubset = NULL;
1318: cur->oldNs = NULL;
1319: cur->encoding = NULL;
1320: cur->standalone = 1;
1321: cur->compression = 0;
1.12 daniel 1322: cur->ids = NULL;
1323: cur->refs = NULL;
1.1 daniel 1324: #ifndef XML_WITHOUT_CORBA
1325: cur->_private = NULL;
1326: #endif
1327: return(cur);
1328: }
1329:
1330:
1331: /************************************************************************
1332: * *
1333: * The parser itself *
1334: * Relates to http://www.w3.org/TR/html40 *
1335: * *
1336: ************************************************************************/
1337:
1338: /************************************************************************
1339: * *
1340: * The parser itself *
1341: * *
1342: ************************************************************************/
1343:
1344: /**
1345: * htmlParseHTMLName:
1346: * @ctxt: an HTML parser context
1347: *
1.26 daniel 1348: * parse an HTML tag or attribute name, note that we convert it to lowercase
1.1 daniel 1349: * since HTML names are not case-sensitive.
1350: *
1351: * Returns the Tag Name parsed or NULL
1352: */
1353:
1.14 daniel 1354: xmlChar *
1.1 daniel 1355: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1356: xmlChar *ret = NULL;
1.1 daniel 1357: int i = 0;
1.31 daniel 1358: xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1.1 daniel 1359:
1360: if (!IS_LETTER(CUR) && (CUR != '_') &&
1361: (CUR != ':')) return(NULL);
1362:
1.31 daniel 1363: while ((i < HTML_PARSER_BUFFER_SIZE) &&
1.45 daniel 1364: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1365: (CUR == ':') || (CUR == '_'))) {
1.26 daniel 1366: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1.1 daniel 1367: else loc[i] = CUR;
1368: i++;
1369:
1370: NEXT;
1371: }
1372:
1373: ret = xmlStrndup(loc, i);
1374:
1375: return(ret);
1376: }
1377:
1378: /**
1379: * htmlParseName:
1380: * @ctxt: an HTML parser context
1381: *
1382: * parse an HTML name, this routine is case sensistive.
1383: *
1384: * Returns the Name parsed or NULL
1385: */
1386:
1.14 daniel 1387: xmlChar *
1.1 daniel 1388: htmlParseName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1389: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1390: int len = 0;
1.1 daniel 1391:
1.5 daniel 1392: GROW;
1393: if (!IS_LETTER(CUR) && (CUR != '_')) {
1394: return(NULL);
1395: }
1.1 daniel 1396:
1397: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1398: (CUR == '.') || (CUR == '-') ||
1399: (CUR == '_') || (CUR == ':') ||
1400: (IS_COMBINING(CUR)) ||
1.5 daniel 1401: (IS_EXTENDER(CUR))) {
1402: buf[len++] = CUR;
1.1 daniel 1403: NEXT;
1.5 daniel 1404: if (len >= HTML_MAX_NAMELEN) {
1405: fprintf(stderr,
1406: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1407: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1408: (CUR == '.') || (CUR == '-') ||
1409: (CUR == '_') || (CUR == ':') ||
1410: (IS_COMBINING(CUR)) ||
1411: (IS_EXTENDER(CUR)))
1412: NEXT;
1413: break;
1414: }
1415: }
1416: return(xmlStrndup(buf, len));
1.1 daniel 1417: }
1418:
1419: /**
1420: * htmlParseHTMLAttribute:
1421: * @ctxt: an HTML parser context
1.19 daniel 1422: * @stop: a char stop value
1.1 daniel 1423: *
1.19 daniel 1424: * parse an HTML attribute value till the stop (quote), if
1425: * stop is 0 then it stops at the first space
1.1 daniel 1426: *
1.19 daniel 1427: * Returns the attribute parsed or NULL
1.1 daniel 1428: */
1429:
1.14 daniel 1430: xmlChar *
1.19 daniel 1431: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1.32 daniel 1432: #if 0
1.14 daniel 1433: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1434: int len = 0;
1.1 daniel 1435:
1.5 daniel 1436: GROW;
1.19 daniel 1437: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1438: if ((stop == 0) && (IS_BLANK(CUR))) break;
1.5 daniel 1439: buf[len++] = CUR;
1.1 daniel 1440: NEXT;
1.5 daniel 1441: if (len >= HTML_MAX_NAMELEN) {
1442: fprintf(stderr,
1443: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1444: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1.19 daniel 1445: (CUR != '>') &&
1.5 daniel 1446: (CUR != '\'') && (CUR != '"'))
1447: NEXT;
1448: break;
1449: }
1450: }
1451: return(xmlStrndup(buf, len));
1.32 daniel 1452: #else
1453: xmlChar *buffer = NULL;
1454: int buffer_size = 0;
1455: xmlChar *out = NULL;
1456: xmlChar *name = NULL;
1457:
1458: xmlChar *cur = NULL;
1459: htmlEntityDescPtr ent;
1460:
1461: /*
1462: * allocate a translation buffer.
1463: */
1464: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1465: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1466: if (buffer == NULL) {
1467: perror("htmlParseHTMLAttribute: malloc failed");
1468: return(NULL);
1469: }
1470: out = buffer;
1471:
1472: /*
1473: * Ok loop until we reach one of the ending chars
1474: */
1475: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1476: if ((stop == 0) && (IS_BLANK(CUR))) break;
1477: if (CUR == '&') {
1478: if (NXT(1) == '#') {
1.52 ! veillard 1479: unsigned int c;
! 1480: int bits;
! 1481:
! 1482: c = htmlParseCharRef(ctxt);
! 1483: if (c < 0x80)
! 1484: { *out++ = c; bits= -6; }
! 1485: else if (c < 0x800)
! 1486: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1487: else if (c < 0x10000)
! 1488: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1489: else
! 1490: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 1491:
! 1492: for ( ; bits >= 0; bits-= 6) {
! 1493: *out++ = ((c >> bits) & 0x3F) | 0x80;
! 1494: }
1.32 daniel 1495: } else {
1496: ent = htmlParseEntityRef(ctxt, &name);
1497: if (name == NULL) {
1498: *out++ = '&';
1499: if (out - buffer > buffer_size - 100) {
1500: int index = out - buffer;
1501:
1502: growBuffer(buffer);
1503: out = &buffer[index];
1504: }
1.52 ! veillard 1505: } else if (ent == NULL) {
1.32 daniel 1506: *out++ = '&';
1507: cur = name;
1508: while (*cur != 0) {
1509: if (out - buffer > buffer_size - 100) {
1510: int index = out - buffer;
1511:
1512: growBuffer(buffer);
1513: out = &buffer[index];
1514: }
1515: *out++ = *cur++;
1516: }
1517: xmlFree(name);
1518: } else {
1.52 ! veillard 1519: unsigned int c;
! 1520: int bits;
! 1521:
1.32 daniel 1522: if (out - buffer > buffer_size - 100) {
1523: int index = out - buffer;
1524:
1525: growBuffer(buffer);
1526: out = &buffer[index];
1527: }
1.52 ! veillard 1528: c = (xmlChar)ent->value;
! 1529: if (c < 0x80)
! 1530: { *out++ = c; bits= -6; }
! 1531: else if (c < 0x800)
! 1532: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1533: else if (c < 0x10000)
! 1534: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1535: else
! 1536: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 1537:
! 1538: for ( ; bits >= 0; bits-= 6) {
! 1539: *out++ = ((c >> bits) & 0x3F) | 0x80;
! 1540: }
1.32 daniel 1541: xmlFree(name);
1542: }
1543: }
1544: } else {
1.52 ! veillard 1545: unsigned int c;
! 1546: int bits;
! 1547:
1.32 daniel 1548: if (out - buffer > buffer_size - 100) {
1.52 ! veillard 1549: int index = out - buffer;
! 1550:
! 1551: growBuffer(buffer);
! 1552: out = &buffer[index];
! 1553: }
! 1554: c = CUR;
! 1555: if (c < 0x80)
! 1556: { *out++ = c; bits= -6; }
! 1557: else if (c < 0x800)
! 1558: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1559: else if (c < 0x10000)
! 1560: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1561: else
! 1562: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 1563:
! 1564: for ( ; bits >= 0; bits-= 6) {
! 1565: *out++ = ((c >> bits) & 0x3F) | 0x80;
1.32 daniel 1566: }
1567: NEXT;
1568: }
1569: }
1570: *out++ = 0;
1571: return(buffer);
1572: #endif
1.1 daniel 1573: }
1574:
1575: /**
1576: * htmlParseNmtoken:
1577: * @ctxt: an HTML parser context
1578: *
1579: * parse an HTML Nmtoken.
1580: *
1581: * Returns the Nmtoken parsed or NULL
1582: */
1583:
1.14 daniel 1584: xmlChar *
1.1 daniel 1585: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.14 daniel 1586: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1587: int len = 0;
1.1 daniel 1588:
1.5 daniel 1589: GROW;
1.1 daniel 1590: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1591: (CUR == '.') || (CUR == '-') ||
1592: (CUR == '_') || (CUR == ':') ||
1593: (IS_COMBINING(CUR)) ||
1.5 daniel 1594: (IS_EXTENDER(CUR))) {
1595: buf[len++] = CUR;
1.1 daniel 1596: NEXT;
1.5 daniel 1597: if (len >= HTML_MAX_NAMELEN) {
1598: fprintf(stderr,
1599: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
1600: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1601: (CUR == '.') || (CUR == '-') ||
1602: (CUR == '_') || (CUR == ':') ||
1603: (IS_COMBINING(CUR)) ||
1604: (IS_EXTENDER(CUR)))
1605: NEXT;
1606: break;
1607: }
1608: }
1609: return(xmlStrndup(buf, len));
1.1 daniel 1610: }
1611:
1612: /**
1613: * htmlParseEntityRef:
1614: * @ctxt: an HTML parser context
1615: * @str: location to store the entity name
1616: *
1617: * parse an HTML ENTITY references
1618: *
1619: * [68] EntityRef ::= '&' Name ';'
1620: *
1621: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
1622: * if non-NULL *str will have to be freed by the caller.
1623: */
1624: htmlEntityDescPtr
1.14 daniel 1625: htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
1626: xmlChar *name;
1.1 daniel 1627: htmlEntityDescPtr ent = NULL;
1628: *str = NULL;
1629:
1630: if (CUR == '&') {
1631: NEXT;
1632: name = htmlParseName(ctxt);
1633: if (name == NULL) {
1634: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1635: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
1636: ctxt->wellFormed = 0;
1637: } else {
1.5 daniel 1638: GROW;
1.1 daniel 1639: if (CUR == ';') {
1640: *str = name;
1641:
1642: /*
1643: * Lookup the entity in the table.
1644: */
1645: ent = htmlEntityLookup(name);
1.32 daniel 1646: if (ent != NULL) /* OK that's ugly !!! */
1647: NEXT;
1.1 daniel 1648: } else {
1649: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1650: ctxt->sax->error(ctxt->userData,
1651: "htmlParseEntityRef: expecting ';'\n");
1.32 daniel 1652: *str = name;
1.1 daniel 1653: }
1654: }
1655: }
1656: return(ent);
1657: }
1658:
1659: /**
1660: * htmlParseAttValue:
1661: * @ctxt: an HTML parser context
1662: *
1663: * parse a value for an attribute
1664: * Note: the parser won't do substitution of entities here, this
1665: * will be handled later in xmlStringGetNodeList, unless it was
1666: * asked for ctxt->replaceEntities != 0
1667: *
1668: * Returns the AttValue parsed or NULL.
1669: */
1670:
1.14 daniel 1671: xmlChar *
1.1 daniel 1672: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1.14 daniel 1673: xmlChar *ret = NULL;
1.1 daniel 1674:
1675: if (CUR == '"') {
1676: NEXT;
1.19 daniel 1677: ret = htmlParseHTMLAttribute(ctxt, '"');
1.1 daniel 1678: if (CUR != '"') {
1679: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1680: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1681: ctxt->wellFormed = 0;
1682: } else
1683: NEXT;
1684: } else if (CUR == '\'') {
1685: NEXT;
1.19 daniel 1686: ret = htmlParseHTMLAttribute(ctxt, '\'');
1.1 daniel 1687: if (CUR != '\'') {
1688: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1689: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
1690: ctxt->wellFormed = 0;
1691: } else
1692: NEXT;
1693: } else {
1694: /*
1695: * That's an HTMLism, the attribute value may not be quoted
1696: */
1.19 daniel 1697: ret = htmlParseHTMLAttribute(ctxt, 0);
1.1 daniel 1698: if (ret == NULL) {
1699: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1700: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
1701: ctxt->wellFormed = 0;
1702: }
1703: }
1704: return(ret);
1705: }
1706:
1707: /**
1708: * htmlParseSystemLiteral:
1709: * @ctxt: an HTML parser context
1710: *
1711: * parse an HTML Literal
1712: *
1713: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1714: *
1715: * Returns the SystemLiteral parsed or NULL
1716: */
1717:
1.14 daniel 1718: xmlChar *
1.1 daniel 1719: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1720: const xmlChar *q;
1721: xmlChar *ret = NULL;
1.1 daniel 1722:
1723: if (CUR == '"') {
1724: NEXT;
1725: q = CUR_PTR;
1726: while ((IS_CHAR(CUR)) && (CUR != '"'))
1727: NEXT;
1728: if (!IS_CHAR(CUR)) {
1729: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1730: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1731: ctxt->wellFormed = 0;
1732: } else {
1733: ret = xmlStrndup(q, CUR_PTR - q);
1734: NEXT;
1735: }
1736: } else if (CUR == '\'') {
1737: NEXT;
1738: q = CUR_PTR;
1739: while ((IS_CHAR(CUR)) && (CUR != '\''))
1740: NEXT;
1741: if (!IS_CHAR(CUR)) {
1742: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1743: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
1744: ctxt->wellFormed = 0;
1745: } else {
1746: ret = xmlStrndup(q, CUR_PTR - q);
1747: NEXT;
1748: }
1749: } else {
1750: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.38 daniel 1751: ctxt->sax->error(ctxt->userData,
1752: "SystemLiteral \" or ' expected\n");
1.1 daniel 1753: ctxt->wellFormed = 0;
1754: }
1755:
1756: return(ret);
1757: }
1758:
1759: /**
1760: * htmlParsePubidLiteral:
1761: * @ctxt: an HTML parser context
1762: *
1763: * parse an HTML public literal
1764: *
1765: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1766: *
1767: * Returns the PubidLiteral parsed or NULL.
1768: */
1769:
1.14 daniel 1770: xmlChar *
1.1 daniel 1771: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 1772: const xmlChar *q;
1773: xmlChar *ret = NULL;
1.1 daniel 1774: /*
1775: * Name ::= (Letter | '_') (NameChar)*
1776: */
1777: if (CUR == '"') {
1778: NEXT;
1779: q = CUR_PTR;
1780: while (IS_PUBIDCHAR(CUR)) NEXT;
1781: if (CUR != '"') {
1782: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1783: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1784: ctxt->wellFormed = 0;
1785: } else {
1786: ret = xmlStrndup(q, CUR_PTR - q);
1787: NEXT;
1788: }
1789: } else if (CUR == '\'') {
1790: NEXT;
1791: q = CUR_PTR;
1792: while ((IS_LETTER(CUR)) && (CUR != '\''))
1793: NEXT;
1794: if (!IS_LETTER(CUR)) {
1795: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1796: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
1797: ctxt->wellFormed = 0;
1798: } else {
1799: ret = xmlStrndup(q, CUR_PTR - q);
1800: NEXT;
1801: }
1802: } else {
1803: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1804: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
1805: ctxt->wellFormed = 0;
1806: }
1807:
1808: return(ret);
1809: }
1810:
1811: /**
1812: * htmlParseCharData:
1813: * @ctxt: an HTML parser context
1814: * @cdata: int indicating whether we are within a CDATA section
1815: *
1816: * parse a CharData section.
1817: * if we are within a CDATA section ']]>' marks an end of section.
1818: *
1819: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1820: */
1821:
1822: void
1823: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1.25 daniel 1824: xmlChar *buf = NULL;
1825: int len = 0;
1.31 daniel 1826: int size = HTML_PARSER_BUFFER_SIZE;
1.52 ! veillard 1827: int q;
! 1828: int bits;
1.25 daniel 1829:
1830: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1831: if (buf == NULL) {
1832: fprintf(stderr, "malloc of %d byte failed\n", size);
1833: return;
1834: }
1.1 daniel 1835:
1.25 daniel 1836: q = CUR;
1837: while ((IS_CHAR(q)) && (q != '<') &&
1838: (q != '&')) {
1839: if ((q == ']') && (NXT(1) == ']') &&
1.1 daniel 1840: (NXT(2) == '>')) {
1841: if (cdata) break;
1842: else {
1843: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1844: ctxt->sax->error(ctxt->userData,
1845: "Sequence ']]>' not allowed in content\n");
1846: ctxt->wellFormed = 0;
1847: }
1848: }
1.25 daniel 1849: if (len + 1 >= size) {
1850: size *= 2;
1.50 veillard 1851: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
1.25 daniel 1852: if (buf == NULL) {
1853: fprintf(stderr, "realloc of %d byte failed\n", size);
1854: return;
1855: }
1856: }
1.52 ! veillard 1857:
! 1858: if (q < 0x80)
! 1859: { buf[len++] = q; bits= -6; }
! 1860: else if (q < 0x800)
! 1861: { buf[len++] =((q >> 6) & 0x1F) | 0xC0; bits= 0; }
! 1862: else if (q < 0x10000)
! 1863: { buf[len++] =((q >> 12) & 0x0F) | 0xE0; bits= 6; }
! 1864: else
! 1865: { buf[len++] =((q >> 18) & 0x07) | 0xF0; bits= 12; }
! 1866:
! 1867: for ( ; bits >= 0; bits-= 6) {
! 1868: buf[len++] = ((q >> bits) & 0x3F) | 0x80;
! 1869: }
1.1 daniel 1870: NEXT;
1.25 daniel 1871: q = CUR;
1872: }
1873: if (len == 0) {
1874: xmlFree(buf);
1875: return;
1.1 daniel 1876: }
1877:
1878: /*
1.25 daniel 1879: * Ok the buffer is to be consumed as chars.
1.1 daniel 1880: */
1881: if (ctxt->sax != NULL) {
1.25 daniel 1882: if (areBlanks(ctxt, buf, len)) {
1.1 daniel 1883: if (ctxt->sax->ignorableWhitespace != NULL)
1.25 daniel 1884: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, len);
1.1 daniel 1885: } else {
1886: if (ctxt->sax->characters != NULL)
1.25 daniel 1887: ctxt->sax->characters(ctxt->userData, buf, len);
1.1 daniel 1888: }
1889: }
1.25 daniel 1890: xmlFree(buf);
1.1 daniel 1891: }
1892:
1893: /**
1894: * htmlParseExternalID:
1895: * @ctxt: an HTML parser context
1.14 daniel 1896: * @publicID: a xmlChar** receiving PubidLiteral
1.1 daniel 1897: * @strict: indicate whether we should restrict parsing to only
1898: * production [75], see NOTE below
1899: *
1900: * Parse an External ID or a Public ID
1901: *
1902: * NOTE: Productions [75] and [83] interract badly since [75] can generate
1903: * 'PUBLIC' S PubidLiteral S SystemLiteral
1904: *
1905: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1906: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1907: *
1908: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1909: *
1910: * Returns the function returns SystemLiteral and in the second
1911: * case publicID receives PubidLiteral, is strict is off
1912: * it is possible to return NULL and have publicID set.
1913: */
1914:
1.14 daniel 1915: xmlChar *
1916: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
1917: xmlChar *URI = NULL;
1.1 daniel 1918:
1919: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
1920: (UPP(2) == 'S') && (UPP(3) == 'T') &&
1921: (UPP(4) == 'E') && (UPP(5) == 'M')) {
1922: SKIP(6);
1923: if (!IS_BLANK(CUR)) {
1924: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1925: ctxt->sax->error(ctxt->userData,
1926: "Space required after 'SYSTEM'\n");
1927: ctxt->wellFormed = 0;
1928: }
1929: SKIP_BLANKS;
1930: URI = htmlParseSystemLiteral(ctxt);
1931: if (URI == NULL) {
1932: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1933: ctxt->sax->error(ctxt->userData,
1934: "htmlParseExternalID: SYSTEM, no URI\n");
1935: ctxt->wellFormed = 0;
1936: }
1937: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
1938: (UPP(2) == 'B') && (UPP(3) == 'L') &&
1939: (UPP(4) == 'I') && (UPP(5) == 'C')) {
1940: SKIP(6);
1941: if (!IS_BLANK(CUR)) {
1942: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1943: ctxt->sax->error(ctxt->userData,
1944: "Space required after 'PUBLIC'\n");
1945: ctxt->wellFormed = 0;
1946: }
1947: SKIP_BLANKS;
1948: *publicID = htmlParsePubidLiteral(ctxt);
1949: if (*publicID == NULL) {
1950: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1951: ctxt->sax->error(ctxt->userData,
1952: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
1953: ctxt->wellFormed = 0;
1954: }
1.5 daniel 1955: SKIP_BLANKS;
1956: if ((CUR == '"') || (CUR == '\'')) {
1957: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 1958: }
1959: }
1960: return(URI);
1961: }
1962:
1963: /**
1964: * htmlParseComment:
1965: * @ctxt: an HTML parser context
1966: *
1967: * Parse an XML (SGML) comment <!-- .... -->
1968: *
1969: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1970: */
1971: void
1.31 daniel 1972: htmlParseComment(htmlParserCtxtPtr ctxt) {
1.25 daniel 1973: xmlChar *buf = NULL;
1974: int len = 0;
1.31 daniel 1975: int size = HTML_PARSER_BUFFER_SIZE;
1.25 daniel 1976: register xmlChar s, r, q;
1.1 daniel 1977:
1978: /*
1979: * Check that there is a comment right here.
1980: */
1981: if ((CUR != '<') || (NXT(1) != '!') ||
1982: (NXT(2) != '-') || (NXT(3) != '-')) return;
1983:
1.25 daniel 1984: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
1985: if (buf == NULL) {
1986: fprintf(stderr, "malloc of %d byte failed\n", size);
1987: return;
1988: }
1989: q = r = '-'; /* 0 or '-' to cover our ass against <!--> and <!---> ? !!! */
1.1 daniel 1990: SKIP(4);
1.25 daniel 1991: s = CUR;
1992:
1993: while (IS_CHAR(s) &&
1994: ((s != '>') || (r != '-') || (q != '-'))) {
1995: if (len + 1 >= size) {
1996: size *= 2;
1.50 veillard 1997: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
1.25 daniel 1998: if (buf == NULL) {
1999: fprintf(stderr, "realloc of %d byte failed\n", size);
2000: return;
2001: }
2002: }
2003: buf[len++] = s;
2004: NEXT;
2005: q = r;
2006: r = s;
2007: s = CUR;
1.1 daniel 2008: }
1.25 daniel 2009: buf[len - 2] = 0;
2010: if (!IS_CHAR(s)) {
1.1 daniel 2011: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.25 daniel 2012: ctxt->sax->error(ctxt->userData, "Comment not terminated \n<!--%.50s\n", buf);
1.1 daniel 2013: ctxt->wellFormed = 0;
2014: } else {
2015: NEXT;
1.31 daniel 2016: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
2017: ctxt->sax->comment(ctxt->userData, buf);
1.1 daniel 2018: }
2019: }
1.25 daniel 2020: xmlFree(buf);
1.1 daniel 2021: }
2022:
2023: /**
2024: * htmlParseCharRef:
2025: * @ctxt: an HTML parser context
2026: *
2027: * parse Reference declarations
2028: *
2029: * [66] CharRef ::= '&#' [0-9]+ ';' |
2030: * '&#x' [0-9a-fA-F]+ ';'
2031: *
2032: * Returns the value parsed (as an int)
2033: */
2034: int
2035: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2036: int val = 0;
2037:
2038: if ((CUR == '&') && (NXT(1) == '#') &&
2039: (NXT(2) == 'x')) {
2040: SKIP(3);
2041: while (CUR != ';') {
2042: if ((CUR >= '0') && (CUR <= '9'))
2043: val = val * 16 + (CUR - '0');
2044: else if ((CUR >= 'a') && (CUR <= 'f'))
2045: val = val * 16 + (CUR - 'a') + 10;
2046: else if ((CUR >= 'A') && (CUR <= 'F'))
2047: val = val * 16 + (CUR - 'A') + 10;
2048: else {
2049: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2050: ctxt->sax->error(ctxt->userData,
2051: "htmlParseCharRef: invalid hexadecimal value\n");
2052: ctxt->wellFormed = 0;
2053: val = 0;
2054: break;
2055: }
2056: NEXT;
2057: }
2058: if (CUR == ';')
2059: NEXT;
2060: } else if ((CUR == '&') && (NXT(1) == '#')) {
2061: SKIP(2);
2062: while (CUR != ';') {
2063: if ((CUR >= '0') && (CUR <= '9'))
2064: val = val * 10 + (CUR - '0');
2065: else {
2066: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2067: ctxt->sax->error(ctxt->userData,
2068: "htmlParseCharRef: invalid decimal value\n");
2069: ctxt->wellFormed = 0;
2070: val = 0;
2071: break;
2072: }
2073: NEXT;
2074: }
2075: if (CUR == ';')
2076: NEXT;
2077: } else {
2078: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2079: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2080: ctxt->wellFormed = 0;
2081: }
2082: /*
2083: * Check the value IS_CHAR ...
2084: */
2085: if (IS_CHAR(val)) {
2086: return(val);
2087: } else {
2088: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.14 daniel 2089: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
1.1 daniel 2090: val);
2091: ctxt->wellFormed = 0;
2092: }
2093: return(0);
2094: }
2095:
2096:
2097: /**
2098: * htmlParseDocTypeDecl :
2099: * @ctxt: an HTML parser context
2100: *
2101: * parse a DOCTYPE declaration
2102: *
2103: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2104: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2105: */
2106:
2107: void
2108: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1.14 daniel 2109: xmlChar *name;
2110: xmlChar *ExternalID = NULL;
2111: xmlChar *URI = NULL;
1.1 daniel 2112:
2113: /*
2114: * We know that '<!DOCTYPE' has been detected.
2115: */
2116: SKIP(9);
2117:
2118: SKIP_BLANKS;
2119:
2120: /*
2121: * Parse the DOCTYPE name.
2122: */
2123: name = htmlParseName(ctxt);
2124: if (name == NULL) {
2125: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2126: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2127: ctxt->wellFormed = 0;
2128: }
2129: /*
2130: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2131: */
2132:
2133: SKIP_BLANKS;
2134:
2135: /*
2136: * Check for SystemID and ExternalID
2137: */
1.5 daniel 2138: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 2139: SKIP_BLANKS;
2140:
2141: /*
2142: * We should be at the end of the DOCTYPE declaration.
2143: */
2144: if (CUR != '>') {
2145: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2146: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2147: ctxt->wellFormed = 0;
2148: /* We shouldn't try to resynchronize ... */
2149: }
2150: NEXT;
2151:
2152: /*
1.46 daniel 2153: * Create or update the document accordingly to the DOCTYPE
1.1 daniel 2154: */
1.46 daniel 2155: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2156: (!ctxt->disableSAX))
2157: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
1.1 daniel 2158:
2159: /*
2160: * Cleanup, since we don't use all those identifiers
2161: */
1.11 daniel 2162: if (URI != NULL) xmlFree(URI);
2163: if (ExternalID != NULL) xmlFree(ExternalID);
2164: if (name != NULL) xmlFree(name);
1.1 daniel 2165: }
2166:
2167: /**
2168: * htmlParseAttribute:
2169: * @ctxt: an HTML parser context
1.14 daniel 2170: * @value: a xmlChar ** used to store the value of the attribute
1.1 daniel 2171: *
2172: * parse an attribute
2173: *
2174: * [41] Attribute ::= Name Eq AttValue
2175: *
2176: * [25] Eq ::= S? '=' S?
2177: *
2178: * With namespace:
2179: *
2180: * [NS 11] Attribute ::= QName Eq AttValue
2181: *
2182: * Also the case QName == xmlns:??? is handled independently as a namespace
2183: * definition.
2184: *
2185: * Returns the attribute name, and the value in *value.
2186: */
2187:
1.14 daniel 2188: xmlChar *
2189: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1.31 daniel 2190: xmlChar *name, *val = NULL;
1.1 daniel 2191:
2192: *value = NULL;
2193: name = htmlParseName(ctxt);
2194: if (name == NULL) {
2195: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2196: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2197: ctxt->wellFormed = 0;
2198: return(NULL);
2199: }
2200:
2201: /*
2202: * read the value
2203: */
2204: SKIP_BLANKS;
2205: if (CUR == '=') {
2206: NEXT;
2207: SKIP_BLANKS;
2208: val = htmlParseAttValue(ctxt);
1.42 daniel 2209: /******
1.1 daniel 2210: } else {
1.42 daniel 2211: * TODO : some attribute must have values, some may not
1.1 daniel 2212: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.31 daniel 2213: ctxt->sax->warning(ctxt->userData,
1.42 daniel 2214: "No value for attribute %s\n", name); */
1.1 daniel 2215: }
2216:
2217: *value = val;
2218: return(name);
2219: }
2220:
2221: /**
1.47 daniel 2222: * htmlCheckEncoding:
2223: * @ctxt: an HTML parser context
2224: * @attvalue: the attribute value
2225: *
2226: * Checks an http-equiv attribute from a Meta tag to detect
2227: * the encoding
2228: * If a new encoding is detected the parser is switched to decode
2229: * it and pass UTF8
2230: */
2231: void
2232: htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2233: const xmlChar *encoding;
2234:
2235: if ((ctxt == NULL) || (attvalue == NULL))
2236: return;
2237:
2238: fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue);
2239:
2240: encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
2241: if (encoding == NULL)
2242: encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
2243: if (encoding == NULL)
2244: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
2245: if (encoding != NULL) {
2246: encoding += 8;
2247: } else {
2248: encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
2249: if (encoding == NULL)
2250: encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
2251: if (encoding == NULL)
2252: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
2253: if (encoding != NULL)
2254: encoding += 9;
2255: }
2256: if (encoding != NULL) {
2257: xmlCharEncoding enc;
2258: xmlCharEncodingHandlerPtr handler;
2259:
2260: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2261:
2262: if (ctxt->input->encoding != NULL)
2263: xmlFree((xmlChar *) ctxt->input->encoding);
2264: ctxt->input->encoding = xmlStrdup(encoding);
2265:
2266: enc = xmlParseCharEncoding((const char *) encoding);
2267: /*
2268: * registered set of known encodings
2269: */
2270: if (enc != XML_CHAR_ENCODING_ERROR) {
2271: xmlSwitchEncoding(ctxt, enc);
2272: } else {
2273: /*
2274: * fallback for unknown encodings
2275: */
2276: handler = xmlFindCharEncodingHandler((const char *) encoding);
2277: if (handler != NULL) {
2278: xmlSwitchToEncoding(ctxt, handler);
2279: } else {
2280: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2281: }
2282: }
2283: }
2284: }
2285:
2286: /**
2287: * htmlCheckMeta:
2288: * @ctxt: an HTML parser context
2289: * @atts: the attributes values
2290: *
2291: * Checks an attributes from a Meta tag
2292: */
2293: void
2294: htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2295: int i;
2296: const xmlChar *att, *value;
2297: int http = 0;
2298: const xmlChar *content = NULL;
2299:
2300: if ((ctxt == NULL) || (atts == NULL))
2301: return;
2302:
2303: i = 0;
2304: att = atts[i++];
2305: while (att != NULL) {
2306: value = atts[i++];
2307: if ((value != NULL) &&
2308: ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
2309: (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
2310: (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
2311: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
2312: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
2313: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
2314: http = 1;
2315: else if ((value != NULL) &&
2316: ((!xmlStrcmp(att, BAD_CAST"content")) ||
2317: (!xmlStrcmp(att, BAD_CAST"Content")) ||
2318: (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
2319: content = value;
2320: att = atts[i++];
2321: }
2322: if ((http) && (content != NULL))
2323: htmlCheckEncoding(ctxt, content);
2324:
2325: }
2326:
2327: /**
1.1 daniel 2328: * htmlParseStartTag:
2329: * @ctxt: an HTML parser context
2330: *
2331: * parse a start of tag either for rule element or
2332: * EmptyElement. In both case we don't parse the tag closing chars.
2333: *
2334: * [40] STag ::= '<' Name (S Attribute)* S? '>'
2335: *
2336: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2337: *
2338: * With namespace:
2339: *
2340: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2341: *
2342: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2343: *
2344: */
2345:
1.18 daniel 2346: void
1.1 daniel 2347: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2348: xmlChar *name;
2349: xmlChar *attname;
2350: xmlChar *attvalue;
2351: const xmlChar **atts = NULL;
1.1 daniel 2352: int nbatts = 0;
2353: int maxatts = 0;
1.47 daniel 2354: int meta = 0;
1.1 daniel 2355: int i;
2356:
1.18 daniel 2357: if (CUR != '<') return;
1.1 daniel 2358: NEXT;
2359:
1.19 daniel 2360: GROW;
1.1 daniel 2361: name = htmlParseHTMLName(ctxt);
2362: if (name == NULL) {
2363: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2364: ctxt->sax->error(ctxt->userData,
2365: "htmlParseStartTag: invalid element name\n");
2366: ctxt->wellFormed = 0;
1.18 daniel 2367: return;
1.1 daniel 2368: }
1.47 daniel 2369: if (!xmlStrcmp(name, BAD_CAST"meta"))
2370: meta = 1;
1.1 daniel 2371:
2372: /*
2373: * Check for auto-closure of HTML elements.
2374: */
2375: htmlAutoClose(ctxt, name);
1.43 daniel 2376:
2377: /*
2378: * Check for implied HTML elements.
2379: */
2380: htmlCheckImplied(ctxt, name);
1.1 daniel 2381:
2382: /*
2383: * Now parse the attributes, it ends up with the ending
2384: *
2385: * (S Attribute)* S?
2386: */
2387: SKIP_BLANKS;
2388: while ((IS_CHAR(CUR)) &&
2389: (CUR != '>') &&
2390: ((CUR != '/') || (NXT(1) != '>'))) {
1.26 daniel 2391: long cons = ctxt->nbChars;
1.1 daniel 2392:
1.19 daniel 2393: GROW;
1.1 daniel 2394: attname = htmlParseAttribute(ctxt, &attvalue);
1.31 daniel 2395: if (attname != NULL) {
1.47 daniel 2396:
1.1 daniel 2397: /*
2398: * Well formedness requires at most one declaration of an attribute
2399: */
2400: for (i = 0; i < nbatts;i += 2) {
2401: if (!xmlStrcmp(atts[i], attname)) {
2402: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.19 daniel 2403: ctxt->sax->error(ctxt->userData,
2404: "Attribute %s redefined\n",
2405: attname);
1.1 daniel 2406: ctxt->wellFormed = 0;
1.11 daniel 2407: xmlFree(attname);
1.31 daniel 2408: if (attvalue != NULL)
2409: xmlFree(attvalue);
1.19 daniel 2410: goto failed;
1.1 daniel 2411: }
2412: }
2413:
2414: /*
2415: * Add the pair to atts
2416: */
2417: if (atts == NULL) {
2418: maxatts = 10;
1.14 daniel 2419: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
1.1 daniel 2420: if (atts == NULL) {
2421: fprintf(stderr, "malloc of %ld byte failed\n",
1.14 daniel 2422: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2423: if (name != NULL) xmlFree(name);
2424: return;
1.1 daniel 2425: }
1.23 daniel 2426: } else if (nbatts + 4 > maxatts) {
1.1 daniel 2427: maxatts *= 2;
1.14 daniel 2428: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
1.1 daniel 2429: if (atts == NULL) {
2430: fprintf(stderr, "realloc of %ld byte failed\n",
1.14 daniel 2431: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2432: if (name != NULL) xmlFree(name);
2433: return;
1.1 daniel 2434: }
2435: }
2436: atts[nbatts++] = attname;
2437: atts[nbatts++] = attvalue;
2438: atts[nbatts] = NULL;
2439: atts[nbatts + 1] = NULL;
2440: }
2441:
1.19 daniel 2442: failed:
1.1 daniel 2443: SKIP_BLANKS;
1.26 daniel 2444: if (cons == ctxt->nbChars) {
1.1 daniel 2445: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2446: ctxt->sax->error(ctxt->userData,
2447: "htmlParseStartTag: problem parsing attributes\n");
2448: ctxt->wellFormed = 0;
2449: break;
2450: }
2451: }
2452:
2453: /*
1.47 daniel 2454: * Handle specific association to the META tag
2455: */
2456: if (meta)
2457: htmlCheckMeta(ctxt, atts);
2458:
2459: /*
1.1 daniel 2460: * SAX: Start of Element !
2461: */
1.15 daniel 2462: htmlnamePush(ctxt, xmlStrdup(name));
1.18 daniel 2463: #ifdef DEBUG
2464: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2465: #endif
1.1 daniel 2466: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2467: ctxt->sax->startElement(ctxt->userData, name, atts);
2468:
2469: if (atts != NULL) {
1.31 daniel 2470: for (i = 0;i < nbatts;i++) {
2471: if (atts[i] != NULL)
2472: xmlFree((xmlChar *) atts[i]);
2473: }
1.45 daniel 2474: xmlFree((void *) atts);
1.1 daniel 2475: }
1.18 daniel 2476: if (name != NULL) xmlFree(name);
1.1 daniel 2477: }
2478:
2479: /**
2480: * htmlParseEndTag:
2481: * @ctxt: an HTML parser context
2482: *
2483: * parse an end of tag
2484: *
2485: * [42] ETag ::= '</' Name S? '>'
2486: *
2487: * With namespace
2488: *
2489: * [NS 9] ETag ::= '</' QName S? '>'
2490: */
2491:
2492: void
1.18 daniel 2493: htmlParseEndTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2494: xmlChar *name;
1.15 daniel 2495: xmlChar *oldname;
1.1 daniel 2496: int i;
2497:
2498: if ((CUR != '<') || (NXT(1) != '/')) {
2499: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2500: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2501: ctxt->wellFormed = 0;
2502: return;
2503: }
2504: SKIP(2);
2505:
2506: name = htmlParseHTMLName(ctxt);
1.24 daniel 2507: if (name == NULL) return;
1.1 daniel 2508:
2509: /*
2510: * We should definitely be at the ending "S? '>'" part
2511: */
2512: SKIP_BLANKS;
2513: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2514: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2515: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2516: ctxt->wellFormed = 0;
2517: } else
2518: NEXT;
2519:
2520: /*
1.18 daniel 2521: * If the name read is not one of the element in the parsing stack
2522: * then return, it's just an error.
1.1 daniel 2523: */
1.18 daniel 2524: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2525: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
1.1 daniel 2526: }
2527: if (i < 0) {
2528: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.18 daniel 2529: ctxt->sax->error(ctxt->userData,
2530: "Unexpected end tag : %s\n", name);
1.11 daniel 2531: xmlFree(name);
1.1 daniel 2532: ctxt->wellFormed = 0;
2533: return;
2534: }
2535:
1.18 daniel 2536:
1.1 daniel 2537: /*
2538: * Check for auto-closure of HTML elements.
2539: */
1.18 daniel 2540:
1.1 daniel 2541: htmlAutoCloseOnClose(ctxt, name);
2542:
2543: /*
2544: * Well formedness constraints, opening and closing must match.
2545: * With the exception that the autoclose may have popped stuff out
2546: * of the stack.
2547: */
1.18 daniel 2548: if (xmlStrcmp(name, ctxt->name)) {
2549: #ifdef DEBUG
2550: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
2551: #endif
1.15 daniel 2552: if ((ctxt->name != NULL) &&
2553: (xmlStrcmp(ctxt->name, name))) {
1.1 daniel 2554: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2555: ctxt->sax->error(ctxt->userData,
2556: "Opening and ending tag mismatch: %s and %s\n",
1.15 daniel 2557: name, ctxt->name);
1.1 daniel 2558: ctxt->wellFormed = 0;
2559: }
2560: }
2561:
2562: /*
2563: * SAX: End of Tag
2564: */
1.15 daniel 2565: oldname = ctxt->name;
1.24 daniel 2566: if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
1.18 daniel 2567: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2568: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2569: oldname = htmlnamePop(ctxt);
1.18 daniel 2570: if (oldname != NULL) {
2571: #ifdef DEBUG
2572: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
2573: #endif
2574: xmlFree(oldname);
2575: #ifdef DEBUG
2576: } else {
2577: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
2578: #endif
2579: }
2580: }
1.1 daniel 2581:
2582: if (name != NULL)
1.11 daniel 2583: xmlFree(name);
1.1 daniel 2584:
2585: return;
2586: }
2587:
2588:
2589: /**
2590: * htmlParseReference:
2591: * @ctxt: an HTML parser context
2592: *
2593: * parse and handle entity references in content,
2594: * this will end-up in a call to character() since this is either a
2595: * CharRef, or a predefined entity.
2596: */
2597: void
2598: htmlParseReference(htmlParserCtxtPtr ctxt) {
2599: htmlEntityDescPtr ent;
1.52 ! veillard 2600: xmlChar out[6];
1.14 daniel 2601: xmlChar *name;
1.1 daniel 2602: if (CUR != '&') return;
2603:
2604: if (NXT(1) == '#') {
1.52 ! veillard 2605: unsigned int c;
! 2606: int bits, i = 0;
! 2607:
! 2608: c = htmlParseCharRef(ctxt);
! 2609: if (c < 0x80) { out[i++]= c; bits= -6; }
! 2610: else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 2611: else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 2612: else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 2613:
! 2614: for ( ; bits >= 0; bits-= 6) {
! 2615: out[i++]= ((c >> bits) & 0x3F) | 0x80;
! 2616: }
! 2617: out[i] = 0;
! 2618:
1.1 daniel 2619: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.52 ! veillard 2620: ctxt->sax->characters(ctxt->userData, out, i);
1.1 daniel 2621: } else {
2622: ent = htmlParseEntityRef(ctxt, &name);
1.32 daniel 2623: if (name == NULL) {
2624: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
2625: return;
2626: }
1.52 ! veillard 2627: if ((ent == NULL) || (ent->value <= 0)) {
1.1 daniel 2628: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 2629: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 2630: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.32 daniel 2631: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
1.1 daniel 2632: }
2633: } else {
1.52 ! veillard 2634: unsigned int c;
! 2635: int bits, i = 0;
! 2636:
! 2637: c = ent->value;
! 2638: if (c < 0x80)
! 2639: { out[i++]= c; bits= -6; }
! 2640: else if (c < 0x800)
! 2641: { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
! 2642: else if (c < 0x10000)
! 2643: { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
! 2644: else
! 2645: { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
! 2646:
! 2647: for ( ; bits >= 0; bits-= 6) {
! 2648: out[i++]= ((c >> bits) & 0x3F) | 0x80;
! 2649: }
! 2650: out[i] = 0;
! 2651:
1.1 daniel 2652: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.52 ! veillard 2653: ctxt->sax->characters(ctxt->userData, out, i);
1.1 daniel 2654: }
1.11 daniel 2655: xmlFree(name);
1.1 daniel 2656: }
2657: }
2658:
2659: /**
2660: * htmlParseContent:
2661: * @ctxt: an HTML parser context
2662: * @name: the node name
2663: *
2664: * Parse a content: comment, sub-element, reference or text.
2665: *
2666: */
2667:
2668: void
1.18 daniel 2669: htmlParseContent(htmlParserCtxtPtr ctxt) {
1.15 daniel 2670: xmlChar *currentNode;
1.18 daniel 2671: int depth;
1.1 daniel 2672:
1.26 daniel 2673: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2674: depth = ctxt->nameNr;
2675: while (1) {
1.26 daniel 2676: long cons = ctxt->nbChars;
1.1 daniel 2677:
1.18 daniel 2678: GROW;
2679: /*
2680: * Our tag or one of it's parent or children is ending.
2681: */
2682: if ((CUR == '<') && (NXT(1) == '/')) {
2683: htmlParseEndTag(ctxt);
1.26 daniel 2684: if (currentNode != NULL) xmlFree(currentNode);
1.18 daniel 2685: return;
2686: }
2687:
2688: /*
2689: * Has this node been popped out during parsing of
2690: * the next element
2691: */
1.26 daniel 2692: if ((xmlStrcmp(currentNode, ctxt->name)) &&
2693: (depth >= ctxt->nameNr)) {
2694: if (currentNode != NULL) xmlFree(currentNode);
2695: return;
2696: }
1.18 daniel 2697:
1.1 daniel 2698: /*
2699: * First case : a comment
2700: */
2701: if ((CUR == '<') && (NXT(1) == '!') &&
2702: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2703: htmlParseComment(ctxt);
1.1 daniel 2704: }
2705:
2706: /*
2707: * Second case : a sub-element.
2708: */
2709: else if (CUR == '<') {
2710: htmlParseElement(ctxt);
2711: }
2712:
2713: /*
2714: * Third case : a reference. If if has not been resolved,
2715: * parsing returns it's Name, create the node
2716: */
2717: else if (CUR == '&') {
2718: htmlParseReference(ctxt);
2719: }
2720:
2721: /*
1.47 daniel 2722: * Fourth : end of the resource
2723: */
2724: else if (CUR == 0) {
2725: htmlAutoClose(ctxt, NULL);
2726: }
2727:
2728: /*
1.1 daniel 2729: * Last case, text. Note that References are handled directly.
2730: */
2731: else {
2732: htmlParseCharData(ctxt, 0);
2733: }
2734:
1.26 daniel 2735: if (cons == ctxt->nbChars) {
1.22 daniel 2736: if (ctxt->node != NULL) {
2737: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2738: ctxt->sax->error(ctxt->userData,
2739: "detected an error in element content\n");
2740: ctxt->wellFormed = 0;
2741: }
1.1 daniel 2742: break;
2743: }
1.17 daniel 2744:
1.5 daniel 2745: GROW;
1.1 daniel 2746: }
1.26 daniel 2747: if (currentNode != NULL) xmlFree(currentNode);
1.1 daniel 2748: }
2749:
2750: /**
2751: * htmlParseElement:
2752: * @ctxt: an HTML parser context
2753: *
2754: * parse an HTML element, this is highly recursive
2755: *
2756: * [39] element ::= EmptyElemTag | STag content ETag
2757: *
2758: * [41] Attribute ::= Name Eq AttValue
2759: */
2760:
2761: void
2762: htmlParseElement(htmlParserCtxtPtr ctxt) {
1.14 daniel 2763: const xmlChar *openTag = CUR_PTR;
2764: xmlChar *name;
1.16 daniel 2765: xmlChar *currentNode = NULL;
1.1 daniel 2766: htmlElemDescPtr info;
1.10 daniel 2767: htmlParserNodeInfo node_info;
1.31 daniel 2768: xmlChar *oldname;
1.18 daniel 2769: int depth = ctxt->nameNr;
1.1 daniel 2770:
2771: /* Capture start position */
1.10 daniel 2772: if (ctxt->record_info) {
2773: node_info.begin_pos = ctxt->input->consumed +
2774: (CUR_PTR - ctxt->input->base);
2775: node_info.begin_line = ctxt->input->line;
2776: }
1.1 daniel 2777:
1.26 daniel 2778: oldname = xmlStrdup(ctxt->name);
1.18 daniel 2779: htmlParseStartTag(ctxt);
2780: name = ctxt->name;
1.19 daniel 2781: #ifdef DEBUG
2782: if (oldname == NULL)
2783: fprintf(stderr, "Start of element %s\n", name);
2784: else if (name == NULL)
2785: fprintf(stderr, "Start of element failed, was %s\n", oldname);
2786: else
2787: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
2788: #endif
1.26 daniel 2789: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
1.18 daniel 2790: (name == NULL)) {
1.19 daniel 2791: if (CUR == '>')
2792: NEXT;
1.26 daniel 2793: if (oldname != NULL)
2794: xmlFree(oldname);
1.1 daniel 2795: return;
2796: }
1.26 daniel 2797: if (oldname != NULL)
2798: xmlFree(oldname);
1.1 daniel 2799:
2800: /*
2801: * Lookup the info for that element.
2802: */
2803: info = htmlTagLookup(name);
2804: if (info == NULL) {
2805: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2806: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
2807: name);
2808: ctxt->wellFormed = 0;
2809: } else if (info->depr) {
2810: /***************************
2811: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
2812: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
2813: name);
2814: ***************************/
2815: }
2816:
2817: /*
2818: * Check for an Empty Element labelled the XML/SGML way
2819: */
2820: if ((CUR == '/') && (NXT(1) == '>')) {
2821: SKIP(2);
2822: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2823: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2824: oldname = htmlnamePop(ctxt);
1.18 daniel 2825: #ifdef DEBUG
2826: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
2827: #endif
1.17 daniel 2828: if (oldname != NULL)
2829: xmlFree(oldname);
1.1 daniel 2830: return;
2831: }
2832:
1.5 daniel 2833: if (CUR == '>') {
2834: NEXT;
2835: } else {
1.1 daniel 2836: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2837: ctxt->sax->error(ctxt->userData, "Couldn't find end of Start Tag\n%.30s\n",
2838: openTag);
2839: ctxt->wellFormed = 0;
2840:
2841: /*
2842: * end of parsing of this node.
2843: */
1.18 daniel 2844: if (!xmlStrcmp(name, ctxt->name)) {
2845: nodePop(ctxt);
1.24 daniel 2846: oldname = htmlnamePop(ctxt);
1.18 daniel 2847: #ifdef DEBUG
2848: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
2849: #endif
2850: if (oldname != NULL)
2851: xmlFree(oldname);
2852: }
1.10 daniel 2853:
2854: /*
2855: * Capture end position and add node
2856: */
2857: if ( currentNode != NULL && ctxt->record_info ) {
2858: node_info.end_pos = ctxt->input->consumed +
2859: (CUR_PTR - ctxt->input->base);
2860: node_info.end_line = ctxt->input->line;
1.15 daniel 2861: node_info.node = ctxt->node;
1.10 daniel 2862: xmlParserAddNodeInfo(ctxt, &node_info);
2863: }
1.1 daniel 2864: return;
2865: }
2866:
2867: /*
2868: * Check for an Empty Element from DTD definition
2869: */
2870: if ((info != NULL) && (info->empty)) {
2871: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
2872: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 2873: oldname = htmlnamePop(ctxt);
1.18 daniel 2874: #ifdef DEBUG
2875: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
2876: #endif
1.17 daniel 2877: if (oldname != NULL)
2878: xmlFree(oldname);
1.1 daniel 2879: return;
2880: }
2881:
2882: /*
2883: * Parse the content of the element:
2884: */
1.26 daniel 2885: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 2886: depth = ctxt->nameNr;
2887: while (IS_CHAR(CUR)) {
2888: htmlParseContent(ctxt);
2889: if (ctxt->nameNr < depth) break;
2890: }
1.1 daniel 2891:
2892: if (!IS_CHAR(CUR)) {
1.49 daniel 2893: /************
1.1 daniel 2894: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2895: ctxt->sax->error(ctxt->userData,
1.18 daniel 2896: "Premature end of data in tag %s\n", currentNode);
1.1 daniel 2897: ctxt->wellFormed = 0;
1.49 daniel 2898: *************/
1.1 daniel 2899:
2900: /*
2901: * end of parsing of this node.
2902: */
2903: nodePop(ctxt);
1.24 daniel 2904: oldname = htmlnamePop(ctxt);
1.18 daniel 2905: #ifdef DEBUG
2906: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
2907: #endif
1.17 daniel 2908: if (oldname != NULL)
2909: xmlFree(oldname);
1.26 daniel 2910: if (currentNode != NULL)
2911: xmlFree(currentNode);
1.1 daniel 2912: return;
2913: }
1.10 daniel 2914:
2915: /*
2916: * Capture end position and add node
2917: */
2918: if ( currentNode != NULL && ctxt->record_info ) {
2919: node_info.end_pos = ctxt->input->consumed +
2920: (CUR_PTR - ctxt->input->base);
2921: node_info.end_line = ctxt->input->line;
1.15 daniel 2922: node_info.node = ctxt->node;
1.10 daniel 2923: xmlParserAddNodeInfo(ctxt, &node_info);
2924: }
1.26 daniel 2925: if (currentNode != NULL)
2926: xmlFree(currentNode);
1.1 daniel 2927: }
2928:
2929: /**
2930: * htmlParseDocument :
2931: * @ctxt: an HTML parser context
2932: *
2933: * parse an HTML document (and build a tree if using the standard SAX
2934: * interface).
2935: *
2936: * Returns 0, -1 in case of error. the parser context is augmented
2937: * as a result of the parsing.
2938: */
2939:
2940: int
2941: htmlParseDocument(htmlParserCtxtPtr ctxt) {
2942: htmlDefaultSAXHandlerInit();
2943: ctxt->html = 1;
2944:
1.5 daniel 2945: GROW;
1.1 daniel 2946: /*
1.9 daniel 2947: * SAX: beginning of the document processing.
1.1 daniel 2948: */
2949: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
2950: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
2951:
2952: /*
2953: * Wipe out everything which is before the first '<'
2954: */
1.22 daniel 2955: SKIP_BLANKS;
1.1 daniel 2956: if (CUR == 0) {
2957: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2958: ctxt->sax->error(ctxt->userData, "Document is empty\n");
2959: ctxt->wellFormed = 0;
2960: }
2961:
1.40 daniel 2962: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
2963: ctxt->sax->startDocument(ctxt->userData);
2964:
2965:
1.22 daniel 2966: /*
2967: * Parse possible comments before any content
2968: */
2969: while ((CUR == '<') && (NXT(1) == '!') &&
2970: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 2971: htmlParseComment(ctxt);
1.22 daniel 2972: SKIP_BLANKS;
2973: }
2974:
1.1 daniel 2975:
2976: /*
2977: * Then possibly doc type declaration(s) and more Misc
2978: * (doctypedecl Misc*)?
2979: */
2980: if ((CUR == '<') && (NXT(1) == '!') &&
2981: (UPP(2) == 'D') && (UPP(3) == 'O') &&
2982: (UPP(4) == 'C') && (UPP(5) == 'T') &&
2983: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
2984: (UPP(8) == 'E')) {
2985: htmlParseDocTypeDecl(ctxt);
2986: }
2987: SKIP_BLANKS;
2988:
2989: /*
2990: * Time to start parsing the tree itself
2991: */
1.22 daniel 2992: htmlParseContent(ctxt);
1.1 daniel 2993:
2994: /*
1.47 daniel 2995: * autoclose
2996: */
2997: if (CUR == 0)
2998: htmlAutoClose(ctxt, NULL);
2999:
3000:
3001: /*
1.1 daniel 3002: * SAX: end of the document processing.
3003: */
3004: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3005: ctxt->sax->endDocument(ctxt->userData);
3006: if (! ctxt->wellFormed) return(-1);
3007: return(0);
3008: }
3009:
3010:
1.30 daniel 3011: /************************************************************************
3012: * *
3013: * Parser contexts handling *
3014: * *
3015: ************************************************************************/
1.1 daniel 3016:
3017: /**
3018: * xmlInitParserCtxt:
3019: * @ctxt: an HTML parser context
3020: *
3021: * Initialize a parser context
3022: */
3023:
3024: void
3025: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3026: {
3027: htmlSAXHandler *sax;
3028:
1.21 daniel 3029: if (ctxt == NULL) return;
3030: memset(ctxt, 0, sizeof(htmlParserCtxt));
3031:
1.11 daniel 3032: sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
1.1 daniel 3033: if (sax == NULL) {
3034: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3035: }
1.19 daniel 3036: memset(sax, 0, sizeof(htmlSAXHandler));
1.1 daniel 3037:
3038: /* Allocate the Input stack */
1.19 daniel 3039: ctxt->inputTab = (htmlParserInputPtr *)
3040: xmlMalloc(5 * sizeof(htmlParserInputPtr));
3041: if (ctxt->inputTab == NULL) {
3042: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3043: }
1.1 daniel 3044: ctxt->inputNr = 0;
3045: ctxt->inputMax = 5;
3046: ctxt->input = NULL;
3047: ctxt->version = NULL;
3048: ctxt->encoding = NULL;
3049: ctxt->standalone = -1;
1.30 daniel 3050: ctxt->instate = XML_PARSER_START;
1.1 daniel 3051:
3052: /* Allocate the Node stack */
1.11 daniel 3053: ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
1.1 daniel 3054: ctxt->nodeNr = 0;
3055: ctxt->nodeMax = 10;
3056: ctxt->node = NULL;
3057:
1.15 daniel 3058: /* Allocate the Name stack */
3059: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3060: ctxt->nameNr = 0;
3061: ctxt->nameMax = 10;
3062: ctxt->name = NULL;
3063:
1.1 daniel 3064: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3065: else {
3066: ctxt->sax = sax;
3067: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3068: }
3069: ctxt->userData = ctxt;
3070: ctxt->myDoc = NULL;
3071: ctxt->wellFormed = 1;
3072: ctxt->replaceEntities = 0;
3073: ctxt->html = 1;
3074: ctxt->record_info = 0;
1.21 daniel 3075: ctxt->validate = 0;
1.26 daniel 3076: ctxt->nbChars = 0;
1.30 daniel 3077: ctxt->checkIndex = 0;
1.1 daniel 3078: xmlInitNodeInfoSeq(&ctxt->node_seq);
3079: }
3080:
3081: /**
3082: * htmlFreeParserCtxt:
3083: * @ctxt: an HTML parser context
3084: *
3085: * Free all the memory used by a parser context. However the parsed
3086: * document in ctxt->myDoc is not freed.
3087: */
3088:
3089: void
3090: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3091: {
1.47 daniel 3092: xmlFreeParserCtxt(ctxt);
1.1 daniel 3093: }
3094:
3095: /**
3096: * htmlCreateDocParserCtxt :
1.14 daniel 3097: * @cur: a pointer to an array of xmlChar
1.1 daniel 3098: * @encoding: a free form C string describing the HTML document encoding, or NULL
3099: *
3100: * Create a parser context for an HTML document.
3101: *
3102: * Returns the new parser context or NULL
3103: */
3104: htmlParserCtxtPtr
1.14 daniel 3105: htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
1.1 daniel 3106: htmlParserCtxtPtr ctxt;
3107: htmlParserInputPtr input;
3108: /* htmlCharEncoding enc; */
3109:
1.11 daniel 3110: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3111: if (ctxt == NULL) {
3112: perror("malloc");
3113: return(NULL);
3114: }
3115: htmlInitParserCtxt(ctxt);
1.11 daniel 3116: input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3117: if (input == NULL) {
3118: perror("malloc");
1.11 daniel 3119: xmlFree(ctxt);
1.1 daniel 3120: return(NULL);
3121: }
1.19 daniel 3122: memset(input, 0, sizeof(htmlParserInput));
1.1 daniel 3123:
3124: input->line = 1;
3125: input->col = 1;
3126: input->base = cur;
3127: input->cur = cur;
3128:
3129: inputPush(ctxt, input);
3130: return(ctxt);
3131: }
3132:
1.31 daniel 3133: /************************************************************************
3134: * *
3135: * Progressive parsing interfaces *
3136: * *
3137: ************************************************************************/
3138:
3139: /**
3140: * htmlParseLookupSequence:
3141: * @ctxt: an HTML parser context
3142: * @first: the first char to lookup
3143: * @next: the next char to lookup or zero
3144: * @third: the next char to lookup or zero
3145: *
3146: * Try to find if a sequence (first, next, third) or just (first next) or
3147: * (first) is available in the input stream.
3148: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3149: * to avoid rescanning sequences of bytes, it DOES change the state of the
3150: * parser, do not use liberally.
3151: * This is basically similar to xmlParseLookupSequence()
3152: *
3153: * Returns the index to the current parsing point if the full sequence
3154: * is available, -1 otherwise.
3155: */
3156: int
3157: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3158: xmlChar next, xmlChar third) {
3159: int base, len;
3160: htmlParserInputPtr in;
3161: const xmlChar *buf;
3162:
3163: in = ctxt->input;
3164: if (in == NULL) return(-1);
3165: base = in->cur - in->base;
3166: if (base < 0) return(-1);
3167: if (ctxt->checkIndex > base)
3168: base = ctxt->checkIndex;
3169: if (in->buf == NULL) {
3170: buf = in->base;
3171: len = in->length;
3172: } else {
3173: buf = in->buf->buffer->content;
3174: len = in->buf->buffer->use;
3175: }
3176: /* take into account the sequence length */
3177: if (third) len -= 2;
3178: else if (next) len --;
3179: for (;base < len;base++) {
3180: if (buf[base] == first) {
3181: if (third != 0) {
3182: if ((buf[base + 1] != next) ||
3183: (buf[base + 2] != third)) continue;
3184: } else if (next != 0) {
3185: if (buf[base + 1] != next) continue;
3186: }
3187: ctxt->checkIndex = 0;
3188: #ifdef DEBUG_PUSH
3189: if (next == 0)
3190: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3191: first, base);
3192: else if (third == 0)
3193: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3194: first, next, base);
3195: else
3196: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3197: first, next, third, base);
3198: #endif
3199: return(base - (in->cur - in->base));
3200: }
3201: }
3202: ctxt->checkIndex = base;
3203: #ifdef DEBUG_PUSH
3204: if (next == 0)
3205: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3206: else if (third == 0)
3207: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3208: else
3209: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3210: #endif
3211: return(-1);
3212: }
3213:
3214: /**
1.32 daniel 3215: * htmlParseTryOrFinish:
1.31 daniel 3216: * @ctxt: an HTML parser context
1.32 daniel 3217: * @terminate: last chunk indicator
1.31 daniel 3218: *
3219: * Try to progress on parsing
3220: *
3221: * Returns zero if no parsing was possible
3222: */
3223: int
1.32 daniel 3224: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
1.31 daniel 3225: int ret = 0;
3226: htmlParserInputPtr in;
1.47 daniel 3227: int avail = 0;
1.31 daniel 3228: xmlChar cur, next;
3229:
3230: #ifdef DEBUG_PUSH
3231: switch (ctxt->instate) {
3232: case XML_PARSER_EOF:
3233: fprintf(stderr, "HPP: try EOF\n"); break;
3234: case XML_PARSER_START:
3235: fprintf(stderr, "HPP: try START\n"); break;
3236: case XML_PARSER_MISC:
3237: fprintf(stderr, "HPP: try MISC\n");break;
3238: case XML_PARSER_COMMENT:
3239: fprintf(stderr, "HPP: try COMMENT\n");break;
3240: case XML_PARSER_PROLOG:
3241: fprintf(stderr, "HPP: try PROLOG\n");break;
3242: case XML_PARSER_START_TAG:
3243: fprintf(stderr, "HPP: try START_TAG\n");break;
3244: case XML_PARSER_CONTENT:
3245: fprintf(stderr, "HPP: try CONTENT\n");break;
3246: case XML_PARSER_CDATA_SECTION:
3247: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3248: case XML_PARSER_END_TAG:
3249: fprintf(stderr, "HPP: try END_TAG\n");break;
3250: case XML_PARSER_ENTITY_DECL:
3251: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3252: case XML_PARSER_ENTITY_VALUE:
3253: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3254: case XML_PARSER_ATTRIBUTE_VALUE:
3255: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3256: case XML_PARSER_DTD:
3257: fprintf(stderr, "HPP: try DTD\n");break;
3258: case XML_PARSER_EPILOG:
3259: fprintf(stderr, "HPP: try EPILOG\n");break;
3260: case XML_PARSER_PI:
3261: fprintf(stderr, "HPP: try PI\n");break;
3262: }
3263: #endif
3264:
3265: while (1) {
3266:
3267: in = ctxt->input;
3268: if (in == NULL) break;
3269: if (in->buf == NULL)
3270: avail = in->length - (in->cur - in->base);
3271: else
3272: avail = in->buf->buffer->use - (in->cur - in->base);
1.47 daniel 3273: if ((avail == 0) && (terminate)) {
3274: htmlAutoClose(ctxt, NULL);
3275: if (ctxt->nameNr == 0)
3276: ctxt->instate = XML_PARSER_EOF;
3277: }
1.31 daniel 3278: if (avail < 1)
3279: goto done;
3280: switch (ctxt->instate) {
3281: case XML_PARSER_EOF:
3282: /*
3283: * Document parsing is done !
3284: */
3285: goto done;
3286: case XML_PARSER_START:
3287: /*
3288: * Very first chars read from the document flow.
3289: */
3290: cur = in->cur[0];
3291: if (IS_BLANK(cur)) {
3292: SKIP_BLANKS;
3293: if (in->buf == NULL)
3294: avail = in->length - (in->cur - in->base);
3295: else
3296: avail = in->buf->buffer->use - (in->cur - in->base);
3297: }
3298: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3299: ctxt->sax->setDocumentLocator(ctxt->userData,
3300: &xmlDefaultSAXLocator);
1.46 daniel 3301: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3302: (!ctxt->disableSAX))
3303: ctxt->sax->startDocument(ctxt->userData);
3304:
1.31 daniel 3305: cur = in->cur[0];
3306: next = in->cur[1];
3307: if ((cur == '<') && (next == '!') &&
3308: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3309: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3310: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3311: (UPP(8) == 'E')) {
1.32 daniel 3312: if ((!terminate) &&
3313: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3314: goto done;
3315: #ifdef DEBUG_PUSH
3316: fprintf(stderr, "HPP: Parsing internal subset\n");
3317: #endif
3318: htmlParseDocTypeDecl(ctxt);
3319: ctxt->instate = XML_PARSER_PROLOG;
3320: #ifdef DEBUG_PUSH
3321: fprintf(stderr, "HPP: entering PROLOG\n");
3322: #endif
3323: } else {
3324: ctxt->instate = XML_PARSER_MISC;
3325: }
3326: #ifdef DEBUG_PUSH
3327: fprintf(stderr, "HPP: entering MISC\n");
3328: #endif
3329: break;
3330: case XML_PARSER_MISC:
3331: SKIP_BLANKS;
3332: if (in->buf == NULL)
3333: avail = in->length - (in->cur - in->base);
3334: else
3335: avail = in->buf->buffer->use - (in->cur - in->base);
3336: if (avail < 2)
3337: goto done;
3338: cur = in->cur[0];
3339: next = in->cur[1];
3340: if ((cur == '<') && (next == '!') &&
3341: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3342: if ((!terminate) &&
3343: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3344: goto done;
3345: #ifdef DEBUG_PUSH
3346: fprintf(stderr, "HPP: Parsing Comment\n");
3347: #endif
3348: htmlParseComment(ctxt);
3349: ctxt->instate = XML_PARSER_MISC;
3350: } else if ((cur == '<') && (next == '!') &&
3351: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3352: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3353: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3354: (UPP(8) == 'E')) {
1.32 daniel 3355: if ((!terminate) &&
3356: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3357: goto done;
3358: #ifdef DEBUG_PUSH
3359: fprintf(stderr, "HPP: Parsing internal subset\n");
3360: #endif
3361: htmlParseDocTypeDecl(ctxt);
3362: ctxt->instate = XML_PARSER_PROLOG;
3363: #ifdef DEBUG_PUSH
3364: fprintf(stderr, "HPP: entering PROLOG\n");
3365: #endif
3366: } else if ((cur == '<') && (next == '!') &&
3367: (avail < 9)) {
3368: goto done;
3369: } else {
3370: ctxt->instate = XML_PARSER_START_TAG;
3371: #ifdef DEBUG_PUSH
3372: fprintf(stderr, "HPP: entering START_TAG\n");
3373: #endif
3374: }
3375: break;
3376: case XML_PARSER_PROLOG:
3377: SKIP_BLANKS;
3378: if (in->buf == NULL)
3379: avail = in->length - (in->cur - in->base);
3380: else
3381: avail = in->buf->buffer->use - (in->cur - in->base);
3382: if (avail < 2)
3383: goto done;
3384: cur = in->cur[0];
3385: next = in->cur[1];
3386: if ((cur == '<') && (next == '!') &&
3387: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3388: if ((!terminate) &&
3389: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3390: goto done;
3391: #ifdef DEBUG_PUSH
3392: fprintf(stderr, "HPP: Parsing Comment\n");
3393: #endif
3394: htmlParseComment(ctxt);
3395: ctxt->instate = XML_PARSER_PROLOG;
3396: } else if ((cur == '<') && (next == '!') &&
3397: (avail < 4)) {
3398: goto done;
3399: } else {
3400: ctxt->instate = XML_PARSER_START_TAG;
3401: #ifdef DEBUG_PUSH
3402: fprintf(stderr, "HPP: entering START_TAG\n");
3403: #endif
3404: }
3405: break;
3406: case XML_PARSER_EPILOG:
3407: SKIP_BLANKS;
3408: if (in->buf == NULL)
3409: avail = in->length - (in->cur - in->base);
3410: else
3411: avail = in->buf->buffer->use - (in->cur - in->base);
3412: if (avail < 2)
3413: goto done;
3414: cur = in->cur[0];
3415: next = in->cur[1];
3416: if ((cur == '<') && (next == '!') &&
3417: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3418: if ((!terminate) &&
3419: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3420: goto done;
3421: #ifdef DEBUG_PUSH
3422: fprintf(stderr, "HPP: Parsing Comment\n");
3423: #endif
3424: htmlParseComment(ctxt);
3425: ctxt->instate = XML_PARSER_EPILOG;
3426: } else if ((cur == '<') && (next == '!') &&
3427: (avail < 4)) {
3428: goto done;
3429: } else {
3430: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3431: ctxt->sax->error(ctxt->userData,
3432: "Extra content at the end of the document\n");
3433: ctxt->wellFormed = 0;
3434: ctxt->errNo = XML_ERR_DOCUMENT_END;
3435: ctxt->instate = XML_PARSER_EOF;
3436: #ifdef DEBUG_PUSH
3437: fprintf(stderr, "HPP: entering EOF\n");
3438: #endif
3439: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3440: ctxt->sax->endDocument(ctxt->userData);
3441: goto done;
3442: }
3443: break;
3444: case XML_PARSER_START_TAG: {
3445: xmlChar *name, *oldname;
3446: int depth = ctxt->nameNr;
3447: htmlElemDescPtr info;
3448:
3449: if (avail < 2)
3450: goto done;
3451: cur = in->cur[0];
3452: if (cur != '<') {
3453: ctxt->instate = XML_PARSER_CONTENT;
3454: #ifdef DEBUG_PUSH
3455: fprintf(stderr, "HPP: entering CONTENT\n");
3456: #endif
3457: break;
3458: }
1.32 daniel 3459: if ((!terminate) &&
3460: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3461: goto done;
3462:
3463: oldname = xmlStrdup(ctxt->name);
3464: htmlParseStartTag(ctxt);
3465: name = ctxt->name;
3466: #ifdef DEBUG
3467: if (oldname == NULL)
3468: fprintf(stderr, "Start of element %s\n", name);
3469: else if (name == NULL)
3470: fprintf(stderr, "Start of element failed, was %s\n",
3471: oldname);
3472: else
3473: fprintf(stderr, "Start of element %s, was %s\n",
3474: name, oldname);
3475: #endif
3476: if (((depth == ctxt->nameNr) &&
3477: (!xmlStrcmp(oldname, ctxt->name))) ||
3478: (name == NULL)) {
3479: if (CUR == '>')
3480: NEXT;
3481: if (oldname != NULL)
3482: xmlFree(oldname);
3483: break;
3484: }
3485: if (oldname != NULL)
3486: xmlFree(oldname);
3487:
3488: /*
3489: * Lookup the info for that element.
3490: */
3491: info = htmlTagLookup(name);
3492: if (info == NULL) {
3493: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3494: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3495: name);
3496: ctxt->wellFormed = 0;
3497: } else if (info->depr) {
3498: /***************************
3499: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3500: ctxt->sax->warning(ctxt->userData,
3501: "Tag %s is deprecated\n",
3502: name);
3503: ***************************/
3504: }
3505:
3506: /*
3507: * Check for an Empty Element labelled the XML/SGML way
3508: */
3509: if ((CUR == '/') && (NXT(1) == '>')) {
3510: SKIP(2);
3511: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3512: ctxt->sax->endElement(ctxt->userData, name);
3513: oldname = htmlnamePop(ctxt);
3514: #ifdef DEBUG
3515: fprintf(stderr,"End of tag the XML way: popping out %s\n",
3516: oldname);
3517: #endif
3518: if (oldname != NULL)
3519: xmlFree(oldname);
3520: ctxt->instate = XML_PARSER_CONTENT;
3521: #ifdef DEBUG_PUSH
3522: fprintf(stderr, "HPP: entering CONTENT\n");
3523: #endif
3524: break;
3525: }
3526:
3527: if (CUR == '>') {
3528: NEXT;
3529: } else {
3530: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3531: ctxt->sax->error(ctxt->userData,
3532: "Couldn't find end of Start Tag %s\n",
3533: name);
3534: ctxt->wellFormed = 0;
3535:
3536: /*
3537: * end of parsing of this node.
3538: */
3539: if (!xmlStrcmp(name, ctxt->name)) {
3540: nodePop(ctxt);
3541: oldname = htmlnamePop(ctxt);
3542: #ifdef DEBUG
3543: fprintf(stderr,
3544: "End of start tag problem: popping out %s\n", oldname);
3545: #endif
3546: if (oldname != NULL)
3547: xmlFree(oldname);
3548: }
3549:
3550: ctxt->instate = XML_PARSER_CONTENT;
3551: #ifdef DEBUG_PUSH
3552: fprintf(stderr, "HPP: entering CONTENT\n");
3553: #endif
3554: break;
3555: }
3556:
3557: /*
3558: * Check for an Empty Element from DTD definition
3559: */
3560: if ((info != NULL) && (info->empty)) {
3561: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3562: ctxt->sax->endElement(ctxt->userData, name);
3563: oldname = htmlnamePop(ctxt);
3564: #ifdef DEBUG
3565: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3566: #endif
3567: if (oldname != NULL)
3568: xmlFree(oldname);
3569: }
3570: ctxt->instate = XML_PARSER_CONTENT;
3571: #ifdef DEBUG_PUSH
3572: fprintf(stderr, "HPP: entering CONTENT\n");
3573: #endif
3574: break;
3575: }
3576: case XML_PARSER_CONTENT:
3577: /*
3578: * Handle preparsed entities and charRef
3579: */
3580: if (ctxt->token != 0) {
1.47 daniel 3581: xmlChar chr[2] = { 0 , 0 } ;
1.31 daniel 3582:
1.47 daniel 3583: chr[0] = (xmlChar) ctxt->token;
1.31 daniel 3584: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.47 daniel 3585: ctxt->sax->characters(ctxt->userData, chr, 1);
1.31 daniel 3586: ctxt->token = 0;
3587: ctxt->checkIndex = 0;
3588: }
1.47 daniel 3589: if ((avail == 1) && (terminate)) {
3590: cur = in->cur[0];
3591: if ((cur != '<') && (cur != '&')) {
1.48 daniel 3592: if (ctxt->sax != NULL) {
3593: if (IS_BLANK(cur)) {
3594: if (ctxt->sax->ignorableWhitespace != NULL)
3595: ctxt->sax->ignorableWhitespace(
3596: ctxt->userData, &cur, 1);
3597: } else {
3598: if (ctxt->sax->characters != NULL)
3599: ctxt->sax->characters(
3600: ctxt->userData, &cur, 1);
3601: }
3602: }
1.47 daniel 3603: ctxt->token = 0;
3604: ctxt->checkIndex = 0;
3605: NEXT;
3606: }
3607: break;
3608: }
1.31 daniel 3609: if (avail < 2)
3610: goto done;
3611: cur = in->cur[0];
3612: next = in->cur[1];
3613: if ((cur == '<') && (next == '!') &&
3614: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3615: if ((!terminate) &&
3616: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3617: goto done;
3618: #ifdef DEBUG_PUSH
3619: fprintf(stderr, "HPP: Parsing Comment\n");
3620: #endif
3621: htmlParseComment(ctxt);
3622: ctxt->instate = XML_PARSER_CONTENT;
3623: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
3624: goto done;
3625: } else if ((cur == '<') && (next == '/')) {
3626: ctxt->instate = XML_PARSER_END_TAG;
3627: ctxt->checkIndex = 0;
3628: #ifdef DEBUG_PUSH
3629: fprintf(stderr, "HPP: entering END_TAG\n");
3630: #endif
3631: break;
3632: } else if (cur == '<') {
3633: ctxt->instate = XML_PARSER_START_TAG;
3634: ctxt->checkIndex = 0;
3635: #ifdef DEBUG_PUSH
3636: fprintf(stderr, "HPP: entering START_TAG\n");
3637: #endif
3638: break;
3639: } else if (cur == '&') {
1.32 daniel 3640: if ((!terminate) &&
3641: (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
1.31 daniel 3642: goto done;
3643: #ifdef DEBUG_PUSH
3644: fprintf(stderr, "HPP: Parsing Reference\n");
3645: #endif
3646: /* TODO: check generation of subtrees if noent !!! */
3647: htmlParseReference(ctxt);
3648: } else {
3649: /* TODO Avoid the extra copy, handle directly !!!!!! */
3650: /*
3651: * Goal of the following test is :
3652: * - minimize calls to the SAX 'character' callback
3653: * when they are mergeable
3654: */
3655: if ((ctxt->inputNr == 1) &&
3656: (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
1.32 daniel 3657: if ((!terminate) &&
3658: (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
1.31 daniel 3659: goto done;
3660: }
3661: ctxt->checkIndex = 0;
3662: #ifdef DEBUG_PUSH
3663: fprintf(stderr, "HPP: Parsing char data\n");
3664: #endif
3665: htmlParseCharData(ctxt, 0);
3666: }
3667: break;
3668: case XML_PARSER_END_TAG:
3669: if (avail < 2)
3670: goto done;
1.32 daniel 3671: if ((!terminate) &&
3672: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3673: goto done;
3674: htmlParseEndTag(ctxt);
3675: if (ctxt->nameNr == 0) {
3676: ctxt->instate = XML_PARSER_EPILOG;
3677: } else {
3678: ctxt->instate = XML_PARSER_CONTENT;
3679: }
3680: ctxt->checkIndex = 0;
3681: #ifdef DEBUG_PUSH
3682: fprintf(stderr, "HPP: entering CONTENT\n");
3683: #endif
3684: break;
3685: case XML_PARSER_CDATA_SECTION:
3686: fprintf(stderr, "HPP: internal error, state == CDATA\n");
3687: ctxt->instate = XML_PARSER_CONTENT;
3688: ctxt->checkIndex = 0;
3689: #ifdef DEBUG_PUSH
3690: fprintf(stderr, "HPP: entering CONTENT\n");
3691: #endif
3692: break;
3693: case XML_PARSER_DTD:
3694: fprintf(stderr, "HPP: internal error, state == DTD\n");
3695: ctxt->instate = XML_PARSER_CONTENT;
3696: ctxt->checkIndex = 0;
3697: #ifdef DEBUG_PUSH
3698: fprintf(stderr, "HPP: entering CONTENT\n");
3699: #endif
3700: break;
3701: case XML_PARSER_COMMENT:
3702: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
3703: ctxt->instate = XML_PARSER_CONTENT;
3704: ctxt->checkIndex = 0;
3705: #ifdef DEBUG_PUSH
3706: fprintf(stderr, "HPP: entering CONTENT\n");
3707: #endif
3708: break;
3709: case XML_PARSER_PI:
3710: fprintf(stderr, "HPP: internal error, state == PI\n");
3711: ctxt->instate = XML_PARSER_CONTENT;
3712: ctxt->checkIndex = 0;
3713: #ifdef DEBUG_PUSH
3714: fprintf(stderr, "HPP: entering CONTENT\n");
3715: #endif
3716: break;
3717: case XML_PARSER_ENTITY_DECL:
3718: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
3719: ctxt->instate = XML_PARSER_CONTENT;
3720: ctxt->checkIndex = 0;
3721: #ifdef DEBUG_PUSH
3722: fprintf(stderr, "HPP: entering CONTENT\n");
3723: #endif
3724: break;
3725: case XML_PARSER_ENTITY_VALUE:
3726: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
3727: ctxt->instate = XML_PARSER_CONTENT;
3728: ctxt->checkIndex = 0;
3729: #ifdef DEBUG_PUSH
3730: fprintf(stderr, "HPP: entering DTD\n");
3731: #endif
3732: break;
3733: case XML_PARSER_ATTRIBUTE_VALUE:
3734: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
3735: ctxt->instate = XML_PARSER_START_TAG;
3736: ctxt->checkIndex = 0;
3737: #ifdef DEBUG_PUSH
3738: fprintf(stderr, "HPP: entering START_TAG\n");
3739: #endif
3740: break;
3741: }
3742: }
3743: done:
1.47 daniel 3744: if ((avail == 0) && (terminate)) {
3745: htmlAutoClose(ctxt, NULL);
3746: if (ctxt->nameNr == 0)
3747: ctxt->instate = XML_PARSER_EOF;
3748: }
1.31 daniel 3749: #ifdef DEBUG_PUSH
3750: fprintf(stderr, "HPP: done %d\n", ret);
3751: #endif
3752: return(ret);
3753: }
3754:
3755: /**
1.32 daniel 3756: * htmlParseTry:
3757: * @ctxt: an HTML parser context
3758: *
3759: * Try to progress on parsing
3760: *
3761: * Returns zero if no parsing was possible
3762: */
3763: int
3764: htmlParseTry(htmlParserCtxtPtr ctxt) {
3765: return(htmlParseTryOrFinish(ctxt, 0));
3766: }
3767:
3768: /**
1.31 daniel 3769: * htmlParseChunk:
3770: * @ctxt: an XML parser context
3771: * @chunk: an char array
3772: * @size: the size in byte of the chunk
3773: * @terminate: last chunk indicator
3774: *
3775: * Parse a Chunk of memory
3776: *
3777: * Returns zero if no error, the xmlParserErrors otherwise.
3778: */
3779: int
3780: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
3781: int terminate) {
3782: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3783: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
3784: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
3785: int cur = ctxt->input->cur - ctxt->input->base;
3786:
3787: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3788: ctxt->input->base = ctxt->input->buf->buffer->content + base;
3789: ctxt->input->cur = ctxt->input->base + cur;
3790: #ifdef DEBUG_PUSH
3791: fprintf(stderr, "HPP: pushed %d\n", size);
3792: #endif
3793:
1.34 daniel 3794: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
3795: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3796: } else if (ctxt->instate != XML_PARSER_EOF)
1.32 daniel 3797: htmlParseTryOrFinish(ctxt, terminate);
1.31 daniel 3798: if (terminate) {
3799: if ((ctxt->instate != XML_PARSER_EOF) &&
3800: (ctxt->instate != XML_PARSER_EPILOG) &&
3801: (ctxt->instate != XML_PARSER_MISC)) {
3802: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3803: ctxt->sax->error(ctxt->userData,
3804: "Extra content at the end of the document\n");
3805: ctxt->wellFormed = 0;
3806: ctxt->errNo = XML_ERR_DOCUMENT_END;
3807: }
3808: if (ctxt->instate != XML_PARSER_EOF) {
3809: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3810: ctxt->sax->endDocument(ctxt->userData);
3811: }
3812: ctxt->instate = XML_PARSER_EOF;
3813: }
3814: return((xmlParserErrors) ctxt->errNo);
3815: }
3816:
3817: /************************************************************************
3818: * *
3819: * User entry points *
3820: * *
3821: ************************************************************************/
3822:
3823: /**
3824: * htmlCreatePushParserCtxt :
3825: * @sax: a SAX handler
3826: * @user_data: The user data returned on SAX callbacks
3827: * @chunk: a pointer to an array of chars
3828: * @size: number of chars in the array
3829: * @filename: an optional file name or URI
3830: * @enc: an optional encoding
3831: *
3832: * Create a parser context for using the HTML parser in push mode
3833: * To allow content encoding detection, @size should be >= 4
3834: * The value of @filename is used for fetching external entities
3835: * and error/warning reports.
3836: *
3837: * Returns the new parser context or NULL
3838: */
3839: htmlParserCtxtPtr
3840: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
3841: const char *chunk, int size, const char *filename,
3842: xmlCharEncoding enc) {
3843: htmlParserCtxtPtr ctxt;
3844: htmlParserInputPtr inputStream;
3845: xmlParserInputBufferPtr buf;
3846:
3847: buf = xmlAllocParserInputBuffer(enc);
3848: if (buf == NULL) return(NULL);
3849:
3850: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
3851: if (ctxt == NULL) {
3852: xmlFree(buf);
3853: return(NULL);
3854: }
3855: memset(ctxt, 0, sizeof(htmlParserCtxt));
3856: htmlInitParserCtxt(ctxt);
3857: if (sax != NULL) {
3858: if (ctxt->sax != &htmlDefaultSAXHandler)
3859: xmlFree(ctxt->sax);
3860: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
3861: if (ctxt->sax == NULL) {
3862: xmlFree(buf);
3863: xmlFree(ctxt);
3864: return(NULL);
3865: }
3866: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
3867: if (user_data != NULL)
3868: ctxt->userData = user_data;
3869: }
3870: if (filename == NULL) {
3871: ctxt->directory = NULL;
3872: } else {
3873: ctxt->directory = xmlParserGetDirectory(filename);
3874: }
3875:
3876: inputStream = htmlNewInputStream(ctxt);
3877: if (inputStream == NULL) {
3878: xmlFreeParserCtxt(ctxt);
3879: return(NULL);
3880: }
3881:
3882: if (filename == NULL)
3883: inputStream->filename = NULL;
3884: else
3885: inputStream->filename = xmlMemStrdup(filename);
3886: inputStream->buf = buf;
3887: inputStream->base = inputStream->buf->buffer->content;
3888: inputStream->cur = inputStream->buf->buffer->content;
3889:
3890: inputPush(ctxt, inputStream);
3891:
3892: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
3893: (ctxt->input->buf != NULL)) {
3894: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
3895: #ifdef DEBUG_PUSH
3896: fprintf(stderr, "HPP: pushed %d\n", size);
3897: #endif
3898: }
3899:
3900: return(ctxt);
3901: }
1.1 daniel 3902:
3903: /**
3904: * htmlSAXParseDoc :
1.14 daniel 3905: * @cur: a pointer to an array of xmlChar
1.1 daniel 3906: * @encoding: a free form C string describing the HTML document encoding, or NULL
3907: * @sax: the SAX handler block
3908: * @userData: if using SAX, this pointer will be provided on callbacks.
3909: *
3910: * parse an HTML in-memory document and build a tree.
3911: * It use the given SAX function block to handle the parsing callback.
3912: * If sax is NULL, fallback to the default DOM tree building routines.
3913: *
3914: * Returns the resulting document tree
3915: */
3916:
3917: htmlDocPtr
1.14 daniel 3918: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
1.1 daniel 3919: htmlDocPtr ret;
3920: htmlParserCtxtPtr ctxt;
3921:
3922: if (cur == NULL) return(NULL);
3923:
3924:
3925: ctxt = htmlCreateDocParserCtxt(cur, encoding);
3926: if (ctxt == NULL) return(NULL);
3927: if (sax != NULL) {
3928: ctxt->sax = sax;
3929: ctxt->userData = userData;
3930: }
3931:
3932: htmlParseDocument(ctxt);
3933: ret = ctxt->myDoc;
3934: if (sax != NULL) {
3935: ctxt->sax = NULL;
3936: ctxt->userData = NULL;
3937: }
3938: htmlFreeParserCtxt(ctxt);
3939:
3940: return(ret);
3941: }
3942:
3943: /**
3944: * htmlParseDoc :
1.14 daniel 3945: * @cur: a pointer to an array of xmlChar
1.1 daniel 3946: * @encoding: a free form C string describing the HTML document encoding, or NULL
3947: *
3948: * parse an HTML in-memory document and build a tree.
3949: *
3950: * Returns the resulting document tree
3951: */
3952:
3953: htmlDocPtr
1.14 daniel 3954: htmlParseDoc(xmlChar *cur, const char *encoding) {
1.1 daniel 3955: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
3956: }
3957:
3958:
3959: /**
3960: * htmlCreateFileParserCtxt :
3961: * @filename: the filename
3962: * @encoding: a free form C string describing the HTML document encoding, or NULL
3963: *
3964: * Create a parser context for a file content.
3965: * Automatic support for ZLIB/Compress compressed document is provided
3966: * by default if found at compile-time.
3967: *
3968: * Returns the new parser context or NULL
3969: */
3970: htmlParserCtxtPtr
3971: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
3972: {
3973: htmlParserCtxtPtr ctxt;
3974: htmlParserInputPtr inputStream;
1.5 daniel 3975: xmlParserInputBufferPtr buf;
1.1 daniel 3976: /* htmlCharEncoding enc; */
3977:
1.5 daniel 3978: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
3979: if (buf == NULL) return(NULL);
1.1 daniel 3980:
1.11 daniel 3981: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3982: if (ctxt == NULL) {
3983: perror("malloc");
3984: return(NULL);
3985: }
1.19 daniel 3986: memset(ctxt, 0, sizeof(htmlParserCtxt));
1.1 daniel 3987: htmlInitParserCtxt(ctxt);
1.11 daniel 3988: inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3989: if (inputStream == NULL) {
3990: perror("malloc");
1.11 daniel 3991: xmlFree(ctxt);
1.1 daniel 3992: return(NULL);
3993: }
1.19 daniel 3994: memset(inputStream, 0, sizeof(htmlParserInput));
1.1 daniel 3995:
1.11 daniel 3996: inputStream->filename = xmlMemStrdup(filename);
1.1 daniel 3997: inputStream->line = 1;
3998: inputStream->col = 1;
1.5 daniel 3999: inputStream->buf = buf;
1.21 daniel 4000: inputStream->directory = NULL;
1.1 daniel 4001:
1.5 daniel 4002: inputStream->base = inputStream->buf->buffer->content;
4003: inputStream->cur = inputStream->buf->buffer->content;
4004: inputStream->free = NULL;
1.1 daniel 4005:
4006: inputPush(ctxt, inputStream);
4007: return(ctxt);
4008: }
4009:
4010: /**
4011: * htmlSAXParseFile :
4012: * @filename: the filename
4013: * @encoding: a free form C string describing the HTML document encoding, or NULL
4014: * @sax: the SAX handler block
4015: * @userData: if using SAX, this pointer will be provided on callbacks.
4016: *
4017: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4018: * compressed document is provided by default if found at compile-time.
4019: * It use the given SAX function block to handle the parsing callback.
4020: * If sax is NULL, fallback to the default DOM tree building routines.
4021: *
4022: * Returns the resulting document tree
4023: */
4024:
4025: htmlDocPtr
4026: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4027: void *userData) {
4028: htmlDocPtr ret;
4029: htmlParserCtxtPtr ctxt;
4030:
4031: ctxt = htmlCreateFileParserCtxt(filename, encoding);
4032: if (ctxt == NULL) return(NULL);
4033: if (sax != NULL) {
4034: ctxt->sax = sax;
4035: ctxt->userData = userData;
4036: }
4037:
4038: htmlParseDocument(ctxt);
4039:
4040: ret = ctxt->myDoc;
4041: if (sax != NULL) {
4042: ctxt->sax = NULL;
4043: ctxt->userData = NULL;
4044: }
4045: htmlFreeParserCtxt(ctxt);
4046:
4047: return(ret);
4048: }
4049:
4050: /**
4051: * htmlParseFile :
4052: * @filename: the filename
4053: * @encoding: a free form C string describing the HTML document encoding, or NULL
4054: *
4055: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4056: * compressed document is provided by default if found at compile-time.
4057: *
4058: * Returns the resulting document tree
4059: */
4060:
4061: htmlDocPtr
4062: htmlParseFile(const char *filename, const char *encoding) {
4063: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4064: }
1.39 daniel 4065:
4066: #endif /* LIBXML_HTML_ENABLED */
Webmaster