Annotation of XML/HTMLparser.c, revision 1.65
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
1.29 daniel 10: #include "win32config.h"
1.1 daniel 11: #else
1.13 daniel 12: #include "config.h"
1.1 daniel 13: #endif
1.13 daniel 14:
1.39 daniel 15: #include "xmlversion.h"
16: #ifdef LIBXML_HTML_ENABLED
17:
1.1 daniel 18: #include <stdio.h>
1.50 veillard 19: #include <string.h>
1.13 daniel 20: #ifdef HAVE_CTYPE_H
1.1 daniel 21: #include <ctype.h>
1.13 daniel 22: #endif
23: #ifdef HAVE_STDLIB_H
1.1 daniel 24: #include <stdlib.h>
1.13 daniel 25: #endif
26: #ifdef HAVE_SYS_STAT_H
1.1 daniel 27: #include <sys/stat.h>
1.13 daniel 28: #endif
1.1 daniel 29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
1.39 daniel 39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/HTMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
1.50 veillard 44: #include <libxml/parser.h>
1.39 daniel 45: #include <libxml/valid.h>
46: #include <libxml/parserInternals.h>
47: #include <libxml/xmlIO.h>
1.31 daniel 48: #include "xml-error.h"
1.5 daniel 49:
/*
 * Tunable sizes used by the HTML parser.
 *
 * INPUT_CHUNK is the amount requested from xmlParserInputGrow() each time
 * the parser needs more input.
 * NOTE(review): the other three constants are not referenced in the part
 * of the file reviewed here; judging by their names they bound element
 * name length and the sizes of temporary parsing buffers -- confirm
 * against their users.
 */
#define HTML_MAX_NAMELEN		1000
#define INPUT_CHUNK			50
#define HTML_PARSER_BIG_BUFFER_SIZE	1000
#define HTML_PARSER_BUFFER_SIZE		100

/* Uncomment to trace auto-close / implied-element decisions on stderr. */
/* #define DEBUG */
/* #define DEBUG_PUSH */
1.1 daniel 57:
58: /************************************************************************
59: * *
60: * Parser stacks related functions and macros *
61: * *
62: ************************************************************************/
63:
64: /*
65: * Generic function for accessing stacks in the Parser Context
66: */
67:
1.30 daniel 68: #define PUSH_AND_POP(scope, type, name) \
69: scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
1.1 daniel 70: if (ctxt->name##Nr >= ctxt->name##Max) { \
71: ctxt->name##Max *= 2; \
1.50 veillard 72: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
1.1 daniel 73: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
74: if (ctxt->name##Tab == NULL) { \
75: fprintf(stderr, "realloc failed !\n"); \
1.33 daniel 76: return(0); \
1.1 daniel 77: } \
78: } \
79: ctxt->name##Tab[ctxt->name##Nr] = value; \
80: ctxt->name = value; \
81: return(ctxt->name##Nr++); \
82: } \
1.30 daniel 83: scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
1.1 daniel 84: type ret; \
1.18 daniel 85: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 86: ctxt->name##Nr--; \
1.18 daniel 87: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 88: if (ctxt->name##Nr > 0) \
89: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
90: else \
91: ctxt->name = NULL; \
92: ret = ctxt->name##Tab[ctxt->name##Nr]; \
93: ctxt->name##Tab[ctxt->name##Nr] = 0; \
94: return(ret); \
95: } \
96:
1.30 daniel 97: PUSH_AND_POP(extern, xmlNodePtr, node)
98: PUSH_AND_POP(extern, xmlChar*, name)
1.1 daniel 99:
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.  All of them expect a variable named `ctxt' (the parser
 * context) to be in scope at the point of use.
 *
 * Every macro below expands either to a single expression or to a single
 * `do { ... } while (0)' statement, so each one is safe as the body of an
 * unbraced if/else.  Call sites must supply their own terminating `;'.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they look at raw bytes and must only be compared against ASCII:
 *
 *   CUR_PTR   the current pointer into the input (an lvalue).
 *   CUR       the current byte, as an int.
 *   CURRENT   same value as CUR.
 *   RAW       the current byte, or -1 when ctxt->token is set.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    the byte n positions ahead, passed through toupper().
 *   SKIP(n)   advance the input pointer and ctxt->nbChars by n.
 *
 * Input management:
 *
 *   SHRINK       calls xmlParserInputShrink() on the current input.
 *   GROW         calls xmlParserInputGrow() asking for INPUT_CHUNK more.
 *   SKIP_BLANKS  calls htmlSkipBlankChars(); yields the number skipped.
 *   NEXT         calls xmlNextChar() and counts one more char in nbChars.
 *
 * Character-at-a-time helpers imported from the XML parser:
 *
 *   CUR_CHAR(l)        htmlCurrentChar(); the char length is stored in l.
 *   CUR_SCHAR(s, l)    xmlStringCurrentChar() on string s; length in l.
 *   NEXTL(l)           step over a char of l bytes, maintaining line/col,
 *                      clearing ctxt->token and counting one char.
 *   COPY_BUF(l,b,i,v)  append char v (l bytes long) to b at index i,
 *                      advancing i by the number of bytes written.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

#define CUR ((int) (*ctxt->input->cur))

/* One expression, so `if (cond) NEXT;' no longer bumps nbChars unconditionally. */
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
} while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
} while (0)
180:
181: /**
182: * htmlCurrentChar:
183: * @ctxt: the HTML parser context
184: * @len: pointer to the length of the char read
185: *
186: * The current char value, if using UTF-8 this may actaully span multiple
187: * bytes in the input buffer. Implement the end of line normalization:
188: * 2.11 End-of-Line Handling
189: * If the encoding is unspecified, in the case we find an ISO-Latin-1
190: * char, then the encoding converter is plugged in automatically.
191: *
192: * Returns the current char value and its lenght
193: */
194:
195: int
196: htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
197: if (ctxt->instate == XML_PARSER_EOF)
198: return(0);
1.35 daniel 199:
1.53 veillard 200: if (ctxt->token != 0) {
201: *len = 0;
202: return(ctxt->token);
203: }
204: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
205: /*
206: * We are supposed to handle UTF8, check it's valid
207: * From rfc2044: encoding of the Unicode values on UTF-8:
208: *
209: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
210: * 0000 0000-0000 007F 0xxxxxxx
211: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
212: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
213: *
214: * Check for the 0x110000 limit too
215: */
216: const unsigned char *cur = ctxt->input->cur;
217: unsigned char c;
218: unsigned int val;
219:
220: c = *cur;
221: if (c & 0x80) {
222: if (cur[1] == 0)
223: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
224: if ((cur[1] & 0xc0) != 0x80)
225: goto encoding_error;
226: if ((c & 0xe0) == 0xe0) {
227:
228: if (cur[2] == 0)
229: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
230: if ((cur[2] & 0xc0) != 0x80)
231: goto encoding_error;
232: if ((c & 0xf0) == 0xf0) {
233: if (cur[3] == 0)
234: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
235: if (((c & 0xf8) != 0xf0) ||
236: ((cur[3] & 0xc0) != 0x80))
237: goto encoding_error;
238: /* 4-byte code */
239: *len = 4;
240: val = (cur[0] & 0x7) << 18;
241: val |= (cur[1] & 0x3f) << 12;
242: val |= (cur[2] & 0x3f) << 6;
243: val |= cur[3] & 0x3f;
244: } else {
245: /* 3-byte code */
246: *len = 3;
247: val = (cur[0] & 0xf) << 12;
248: val |= (cur[1] & 0x3f) << 6;
249: val |= cur[2] & 0x3f;
250: }
251: } else {
252: /* 2-byte code */
253: *len = 2;
254: val = (cur[0] & 0x1f) << 6;
255: val |= cur[1] & 0x3f;
256: }
257: if (!IS_CHAR(val)) {
258: if ((ctxt->sax != NULL) &&
259: (ctxt->sax->error != NULL))
260: ctxt->sax->error(ctxt->userData,
261: "Char 0x%X out of allowed range\n", val);
262: ctxt->errNo = XML_ERR_INVALID_ENCODING;
263: ctxt->wellFormed = 0;
264: ctxt->disableSAX = 1;
265: }
266: return(val);
267: } else {
268: /* 1-byte code */
269: *len = 1;
270: return((int) *ctxt->input->cur);
271: }
272: }
273: /*
274: * Assume it's a fixed lenght encoding (1) with
275: * a compatibke encoding for the ASCII set, since
276: * XML constructs only use < 128 chars
277: */
278: *len = 1;
279: if ((int) *ctxt->input->cur < 0x80)
280: return((int) *ctxt->input->cur);
281:
282: /*
283: * Humm this is bad, do an automatic flow conversion
284: */
285: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
286: ctxt->charset = XML_CHAR_ENCODING_UTF8;
287: return(xmlCurrentChar(ctxt, len));
288:
289: encoding_error:
290: /*
291: * If we detect an UTF8 error that probably mean that the
292: * input encoding didn't get properly advertized in the
293: * declaration header. Report the error and switch the encoding
294: * to ISO-Latin-1 (if you don't like this policy, just declare the
295: * encoding !)
296: */
297: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
298: ctxt->sax->error(ctxt->userData,
299: "Input is not proper UTF-8, indicate encoding !\n");
300: ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
301: ctxt->input->cur[0], ctxt->input->cur[1],
302: ctxt->input->cur[2], ctxt->input->cur[3]);
303: }
304: ctxt->errNo = XML_ERR_INVALID_ENCODING;
305:
306: ctxt->charset = XML_CHAR_ENCODING_8859_1;
307: *len = 1;
308: return((int) *ctxt->input->cur);
309: }
1.35 daniel 310:
311: /**
312: * htmlNextChar:
313: * @ctxt: the HTML parser context
314: *
315: * Skip to the next char input char.
316: */
317:
318: void
319: htmlNextChar(htmlParserCtxtPtr ctxt) {
1.44 daniel 320: if (ctxt->instate == XML_PARSER_EOF)
321: return;
1.35 daniel 322: if ((*ctxt->input->cur == 0) &&
323: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
324: xmlPopInput(ctxt);
325: } else {
326: if (*(ctxt->input->cur) == '\n') {
327: ctxt->input->line++; ctxt->input->col = 1;
328: } else ctxt->input->col++;
329: ctxt->input->cur++;
330: ctxt->nbChars++;
331: if (*ctxt->input->cur == 0)
332: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333: }
334: }
1.5 daniel 335:
1.36 daniel 336: /**
337: * htmlSkipBlankChars:
338: * @ctxt: the HTML parser context
339: *
340: * skip all blanks character found at that point in the input streams.
341: *
342: * Returns the number of space chars skipped
343: */
344:
345: int
346: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
347: int res = 0;
348:
349: while (IS_BLANK(*(ctxt->input->cur))) {
350: if ((*ctxt->input->cur == 0) &&
351: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
352: xmlPopInput(ctxt);
353: } else {
354: if (*(ctxt->input->cur) == '\n') {
355: ctxt->input->line++; ctxt->input->col = 1;
356: } else ctxt->input->col++;
357: ctxt->input->cur++;
358: ctxt->nbChars++;
359: if (*ctxt->input->cur == 0)
360: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
361: }
362: res++;
363: }
364: return(res);
365: }
1.1 daniel 366:
367:
1.5 daniel 368:
1.1 daniel 369: /************************************************************************
370: * *
371: * The list of HTML elements and their properties *
372: * *
373: ************************************************************************/
374:
375: /*
376: * Start Tag: 1 means the start tag can be ommited
377: * End Tag: 1 means the end tag can be ommited
378: * 2 means it's forbidden (empty elements)
379: * Depr: this element is deprecated
380: * DTD: 1 means that this element is valid only in the Loose DTD
381: * 2 means that this element is valid only in the Frameset DTD
382: *
383: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
384: */
385: htmlElemDesc html40ElementTable[] = {
1.26 daniel 386: { "a", 0, 0, 0, 0, 0, "anchor " },
387: { "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
388: { "acronym", 0, 0, 0, 0, 0, "" },
389: { "address", 0, 0, 0, 0, 0, "information on author " },
390: { "applet", 0, 0, 0, 1, 1, "java applet " },
391: { "area", 0, 2, 1, 0, 0, "client-side image map area " },
392: { "b", 0, 0, 0, 0, 0, "bold text style" },
393: { "base", 0, 2, 1, 0, 0, "document base uri " },
394: { "basefont", 0, 2, 1, 1, 1, "base font size " },
395: { "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
396: { "big", 0, 0, 0, 0, 0, "large text style" },
397: { "blockquote", 0, 0, 0, 0, 0, "long quotation " },
398: { "body", 1, 1, 0, 0, 0, "document body " },
399: { "br", 0, 2, 1, 0, 0, "forced line break " },
400: { "button", 0, 0, 0, 0, 0, "push button " },
401: { "caption", 0, 0, 0, 0, 0, "table caption " },
402: { "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
403: { "cite", 0, 0, 0, 0, 0, "citation" },
404: { "code", 0, 0, 0, 0, 0, "computer code fragment" },
405: { "col", 0, 2, 1, 0, 0, "table column " },
406: { "colgroup", 0, 1, 0, 0, 0, "table column group " },
407: { "dd", 0, 1, 0, 0, 0, "definition description " },
408: { "del", 0, 0, 0, 0, 0, "deleted text " },
409: { "dfn", 0, 0, 0, 0, 0, "instance definition" },
410: { "dir", 0, 0, 0, 1, 1, "directory list" },
411: { "div", 0, 0, 0, 0, 0, "generic language/style container"},
412: { "dl", 0, 0, 0, 0, 0, "definition list " },
413: { "dt", 0, 1, 0, 0, 0, "definition term " },
414: { "em", 0, 0, 0, 0, 0, "emphasis" },
415: { "fieldset", 0, 0, 0, 0, 0, "form control group " },
416: { "font", 0, 0, 0, 1, 1, "local change to font " },
417: { "form", 0, 0, 0, 0, 0, "interactive form " },
418: { "frame", 0, 2, 1, 0, 2, "subwindow " },
419: { "frameset", 0, 0, 0, 0, 2, "window subdivision" },
420: { "h1", 0, 0, 0, 0, 0, "heading " },
421: { "h2", 0, 0, 0, 0, 0, "heading " },
422: { "h3", 0, 0, 0, 0, 0, "heading " },
423: { "h4", 0, 0, 0, 0, 0, "heading " },
424: { "h5", 0, 0, 0, 0, 0, "heading " },
425: { "h6", 0, 0, 0, 0, 0, "heading " },
426: { "head", 1, 1, 0, 0, 0, "document head " },
427: { "hr", 0, 2, 1, 0, 0, "horizontal rule " },
428: { "html", 1, 1, 0, 0, 0, "document root element " },
429: { "i", 0, 0, 0, 0, 0, "italic text style" },
430: { "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
431: { "img", 0, 2, 1, 0, 0, "embedded image " },
432: { "input", 0, 2, 1, 0, 0, "form control " },
433: { "ins", 0, 0, 0, 0, 0, "inserted text" },
434: { "isindex", 0, 2, 1, 1, 1, "single line prompt " },
435: { "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
436: { "label", 0, 0, 0, 0, 0, "form field label text " },
437: { "legend", 0, 0, 0, 0, 0, "fieldset legend " },
438: { "li", 0, 1, 0, 0, 0, "list item " },
439: { "link", 0, 2, 1, 0, 0, "a media-independent link " },
440: { "map", 0, 0, 0, 0, 0, "client-side image map " },
441: { "menu", 0, 0, 0, 1, 1, "menu list " },
442: { "meta", 0, 2, 1, 0, 0, "generic metainformation " },
443: { "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
444: { "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
445: { "object", 0, 0, 0, 0, 0, "generic embedded object " },
446: { "ol", 0, 0, 0, 0, 0, "ordered list " },
447: { "optgroup", 0, 0, 0, 0, 0, "option group " },
448: { "option", 0, 1, 0, 0, 0, "selectable choice " },
449: { "p", 0, 1, 0, 0, 0, "paragraph " },
450: { "param", 0, 2, 1, 0, 0, "named property value " },
451: { "pre", 0, 0, 0, 0, 0, "preformatted text " },
452: { "q", 0, 0, 0, 0, 0, "short inline quotation " },
453: { "s", 0, 0, 0, 1, 1, "strike-through text style" },
454: { "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
455: { "script", 0, 0, 0, 0, 0, "script statements " },
456: { "select", 0, 0, 0, 0, 0, "option selector " },
457: { "small", 0, 0, 0, 0, 0, "small text style" },
458: { "span", 0, 0, 0, 0, 0, "generic language/style container " },
459: { "strike", 0, 0, 0, 1, 1, "strike-through text" },
460: { "strong", 0, 0, 0, 0, 0, "strong emphasis" },
461: { "style", 0, 0, 0, 0, 0, "style info " },
462: { "sub", 0, 0, 0, 0, 0, "subscript" },
463: { "sup", 0, 0, 0, 0, 0, "superscript " },
464: { "table", 0, 0, 0, 0, 0, " " },
465: { "tbody", 1, 1, 0, 0, 0, "table body " },
466: { "td", 0, 1, 0, 0, 0, "table data cell" },
467: { "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
468: { "tfoot", 0, 1, 0, 0, 0, "table footer " },
469: { "th", 0, 1, 0, 0, 0, "table header cell" },
470: { "thead", 0, 1, 0, 0, 0, "table header " },
471: { "title", 0, 0, 0, 0, 0, "document title " },
472: { "tr", 0, 1, 0, 0, 0, "table row " },
473: { "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
474: { "u", 0, 0, 0, 1, 1, "underlined text style" },
475: { "ul", 0, 0, 0, 0, 0, "unordered list " },
476: { "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
1.1 daniel 477: };
478:
/*
 * start tags that imply the end of a current element
 * any tag of each line implies the end of the current element if the type of
 * that element is in the same line
 *
 * Each group is terminated by a NULL and the whole list by one more NULL.
 */
char *htmlEquEnd[] = {
    "dt", "dd", "li", "option",
    NULL,

    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,

    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,

    NULL
};
/*
 * according to the HTML DTD, HR should be added to the 2nd line above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
495:
496: /*
497: * start tags that imply the end of current element
498: */
1.8 daniel 499: char *htmlStartClose[] = {
1.26 daniel 500: "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
501: "dl", "ul", "ol", "menu", "dir", "address", "pre",
502: "listing", "xmp", "head", NULL,
503: "head", "p", NULL,
504: "title", "p", NULL,
505: "body", "head", "style", "link", "title", "p", NULL,
506: "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
507: "pre", "listing", "xmp", "head", "li", NULL,
508: "hr", "p", "head", NULL,
509: "h1", "p", "head", NULL,
510: "h2", "p", "head", NULL,
511: "h3", "p", "head", NULL,
512: "h4", "p", "head", NULL,
513: "h5", "p", "head", NULL,
514: "h6", "p", "head", NULL,
515: "dir", "p", "head", NULL,
516: "address", "p", "head", "ul", NULL,
517: "pre", "p", "head", "ul", NULL,
518: "listing", "p", "head", NULL,
519: "xmp", "p", "head", NULL,
520: "blockquote", "p", "head", NULL,
521: "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
522: "xmp", "head", NULL,
523: "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
524: "head", "dd", NULL,
525: "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
526: "head", "dt", NULL,
527: "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
528: "listing", "xmp", NULL,
529: "ol", "p", "head", "ul", NULL,
530: "menu", "p", "head", "ul", NULL,
531: "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
532: "div", "p", "head", NULL,
533: "noscript", "p", "head", NULL,
534: "center", "font", "b", "i", "p", "head", NULL,
535: "a", "a", NULL,
536: "caption", "p", NULL,
537: "colgroup", "caption", "colgroup", "col", "p", NULL,
538: "col", "caption", "col", "p", NULL,
539: "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
540: "listing", "xmp", "a", NULL,
541: "th", "th", "td", NULL,
542: "td", "th", "td", "p", NULL,
543: "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
544: "thead", "caption", "col", "colgroup", NULL,
545: "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
546: "tbody", "p", NULL,
547: "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
548: "tfoot", "tbody", "p", NULL,
549: "optgroup", "option", NULL,
550: "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
551: "pre", "listing", "xmp", "a", NULL,
1.1 daniel 552: NULL
553: };
554:
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 * (NULL-terminated; consulted by htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
568:
1.43 daniel 569:
1.8 daniel 570: static char** htmlStartCloseIndex[100];
1.1 daniel 571: static int htmlStartCloseIndexinitialized = 0;
572:
573: /************************************************************************
574: * *
575: * functions to handle HTML specific data *
576: * *
577: ************************************************************************/
578:
579: /**
580: * htmlInitAutoClose:
581: *
582: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
583: *
584: */
585: void
586: htmlInitAutoClose(void) {
587: int index, i = 0;
588:
589: if (htmlStartCloseIndexinitialized) return;
590:
591: for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
592: index = 0;
593: while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
594: htmlStartCloseIndex[index++] = &htmlStartClose[i];
595: while (htmlStartClose[i] != NULL) i++;
596: i++;
597: }
598: }
599:
600: /**
601: * htmlTagLookup:
602: * @tag: The tag name
603: *
604: * Lookup the HTML tag in the ElementTable
605: *
606: * Returns the related htmlElemDescPtr or NULL if not found.
607: */
608: htmlElemDescPtr
1.14 daniel 609: htmlTagLookup(const xmlChar *tag) {
1.61 veillard 610: int i;
1.1 daniel 611:
612: for (i = 0; i < (sizeof(html40ElementTable) /
613: sizeof(html40ElementTable[0]));i++) {
1.8 daniel 614: if (!xmlStrcmp(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 615: return(&html40ElementTable[i]);
616: }
617: return(NULL);
618: }
619:
620: /**
621: * htmlCheckAutoClose:
1.50 veillard 622: * @newtag: The new tag name
623: * @oldtag: The old tag name
1.1 daniel 624: *
625: * Checks wether the new tag is one of the registered valid tags for closing old.
626: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
627: *
628: * Returns 0 if no, 1 if yes.
629: */
630: int
1.50 veillard 631: htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
1.1 daniel 632: int i, index;
1.64 veillard 633: char **close = NULL;
1.1 daniel 634:
635: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
636:
637: /* inefficient, but not a big deal */
638: for (index = 0; index < 100;index++) {
639: close = htmlStartCloseIndex[index];
640: if (close == NULL) return(0);
1.50 veillard 641: if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
1.1 daniel 642: }
643:
644: i = close - htmlStartClose;
645: i++;
646: while (htmlStartClose[i] != NULL) {
1.50 veillard 647: if (!xmlStrcmp(BAD_CAST htmlStartClose[i], oldtag)) {
1.1 daniel 648: return(1);
649: }
650: i++;
651: }
652: return(0);
653: }
654:
655: /**
1.50 veillard 656: * htmlAutoCloseOnClose:
657: * @ctxt: an HTML parser context
658: * @newtag: The new tag name
659: *
660: * The HTmL DtD allows an ending tag to implicitely close other tags.
661: */
662: void
663: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
664: htmlElemDescPtr info;
665: xmlChar *oldname;
666: int i;
667:
668: #ifdef DEBUG
669: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
670: for (i = 0;i < ctxt->nameNr;i++)
671: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
672: #endif
673:
674: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
675: if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
676: }
677: if (i < 0) return;
678:
679: while (xmlStrcmp(newtag, ctxt->name)) {
680: info = htmlTagLookup(ctxt->name);
681: if ((info == NULL) || (info->endTag == 1)) {
682: #ifdef DEBUG
683: fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
684: #endif
685: } else {
686: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
687: ctxt->sax->error(ctxt->userData,
688: "Opening and ending tag mismatch: %s and %s\n",
689: newtag, ctxt->name);
690: ctxt->wellFormed = 0;
691: }
692: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
693: ctxt->sax->endElement(ctxt->userData, ctxt->name);
694: oldname = htmlnamePop(ctxt);
695: if (oldname != NULL) {
696: #ifdef DEBUG
697: fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
698: #endif
699: xmlFree(oldname);
700: }
701: }
702: }
703:
704: /**
1.1 daniel 705: * htmlAutoClose:
706: * @ctxt: an HTML parser context
1.50 veillard 707: * @newtag: The new tag name or NULL
1.1 daniel 708: *
709: * The HTmL DtD allows a tag to implicitely close other tags.
710: * The list is kept in htmlStartClose array. This function is
711: * called when a new tag has been detected and generates the
712: * appropriates closes if possible/needed.
1.50 veillard 713: * If newtag is NULL this mean we are at the end of the resource
1.47 daniel 714: * and we should check
1.1 daniel 715: */
716: void
1.50 veillard 717: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1.15 daniel 718: xmlChar *oldname;
1.50 veillard 719: while ((newtag != NULL) && (ctxt->name != NULL) &&
720: (htmlCheckAutoClose(newtag, ctxt->name))) {
1.1 daniel 721: #ifdef DEBUG
1.50 veillard 722: fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1.1 daniel 723: #endif
724: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1.15 daniel 725: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 726: oldname = htmlnamePop(ctxt);
1.18 daniel 727: if (oldname != NULL) {
728: #ifdef DEBUG
729: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
730: #endif
1.17 daniel 731: xmlFree(oldname);
1.18 daniel 732: }
1.1 daniel 733: }
1.50 veillard 734: if (newtag == NULL) {
1.49 daniel 735: htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
736: htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
737: htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
738: }
1.50 veillard 739: while ((newtag == NULL) && (ctxt->name != NULL) &&
1.47 daniel 740: ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
741: (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
742: (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
743: #ifdef DEBUG
744: fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
745: #endif
746: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
747: ctxt->sax->endElement(ctxt->userData, ctxt->name);
748: oldname = htmlnamePop(ctxt);
749: if (oldname != NULL) {
750: #ifdef DEBUG
751: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
752: #endif
753: xmlFree(oldname);
754: }
755: }
756:
1.1 daniel 757: }
758:
759: /**
1.28 daniel 760: * htmlAutoCloseTag:
761: * @doc: the HTML document
762: * @name: The tag name
763: * @elem: the HTML element
764: *
765: * The HTmL DtD allows a tag to implicitely close other tags.
766: * The list is kept in htmlStartClose array. This function checks
767: * if the element or one of it's children would autoclose the
768: * given tag.
769: *
770: * Returns 1 if autoclose, 0 otherwise
771: */
772: int
773: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
774: htmlNodePtr child;
775:
776: if (elem == NULL) return(1);
777: if (!xmlStrcmp(name, elem->name)) return(0);
778: if (htmlCheckAutoClose(elem->name, name)) return(1);
1.37 daniel 779: child = elem->children;
1.28 daniel 780: while (child != NULL) {
781: if (htmlAutoCloseTag(doc, name, child)) return(1);
782: child = child->next;
783: }
784: return(0);
785: }
786:
787: /**
788: * htmlIsAutoClosed:
789: * @doc: the HTML document
790: * @elem: the HTML element
791: *
792: * The HTmL DtD allows a tag to implicitely close other tags.
793: * The list is kept in htmlStartClose array. This function checks
794: * if a tag is autoclosed by one of it's child
795: *
796: * Returns 1 if autoclosed, 0 otherwise
797: */
798: int
799: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
800: htmlNodePtr child;
801:
802: if (elem == NULL) return(1);
1.37 daniel 803: child = elem->children;
1.28 daniel 804: while (child != NULL) {
805: if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
806: child = child->next;
807: }
808: return(0);
809: }
810:
811: /**
1.43 daniel 812: * htmlCheckImplied:
813: * @ctxt: an HTML parser context
1.50 veillard 814: * @newtag: The new tag name
1.43 daniel 815: *
816: * The HTmL DtD allows a tag to exists only implicitely
817: * called when a new tag has been detected and generates the
818: * appropriates implicit tags if missing
819: */
820: void
1.50 veillard 821: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
822: if (!xmlStrcmp(newtag, BAD_CAST"html"))
1.43 daniel 823: return;
824: if (ctxt->nameNr <= 0) {
825: #ifdef DEBUG
826: fprintf(stderr,"Implied element html: pushed html\n");
827: #endif
828: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
829: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
830: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
831: }
1.50 veillard 832: if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
1.43 daniel 833: return;
834: if (ctxt->nameNr <= 1) {
1.50 veillard 835: if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
836: (!xmlStrcmp(newtag, BAD_CAST"style")) ||
837: (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
838: (!xmlStrcmp(newtag, BAD_CAST"link")) ||
839: (!xmlStrcmp(newtag, BAD_CAST"title")) ||
840: (!xmlStrcmp(newtag, BAD_CAST"base"))) {
1.43 daniel 841: /*
842: * dropped OBJECT ... i you put it first BODY will be
843: * assumed !
844: */
845: #ifdef DEBUG
846: fprintf(stderr,"Implied element head: pushed head\n");
847: #endif
848: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
849: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
850: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
851: } else {
852: #ifdef DEBUG
853: fprintf(stderr,"Implied element body: pushed body\n");
854: #endif
855: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
856: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
857: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
858: }
859: }
860: }
861:
1.59 veillard 862: /**
863: * htmlCheckParagraph
864: * @ctxt: an HTML parser context
865: *
866: * Check whether a p element need to be implied before inserting
867: * characters in the current element.
868: *
869: * Returns 1 if a paragraph has been inserted, 0 if not and -1
870: * in case of error.
871: */
872:
873: int
874: htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
875: const xmlChar *tag;
876: int i;
877:
878: if (ctxt == NULL)
879: return(-1);
880: tag = ctxt->name;
881: if (tag == NULL) {
882: htmlAutoClose(ctxt, BAD_CAST"p");
883: htmlCheckImplied(ctxt, BAD_CAST"p");
884: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
885: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
886: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
887: return(1);
888: }
889: for (i = 0; htmlNoContentElements[i] != NULL; i++) {
890: if (!xmlStrcmp(tag, BAD_CAST htmlNoContentElements[i])) {
891: #ifdef DEBUG
892: fprintf(stderr,"Implied element paragraph\n");
893: #endif
894: htmlAutoClose(ctxt, BAD_CAST"p");
895: htmlCheckImplied(ctxt, BAD_CAST"p");
896: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
897: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
898: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
899: return(1);
900: }
901: }
902: return(0);
903: }
904:
1.1 daniel 905: /************************************************************************
906: * *
907: * The list of HTML predefined entities *
908: * *
909: ************************************************************************/
910:
911:
912: htmlEntityDesc html40EntitiesTable[] = {
913: /*
1.61 veillard 914: * the 4 absolute ones, plus apostrophe.
1.1 daniel 915: */
916: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
917: { 38, "amp", "ampersand, U+0026 ISOnum" },
1.61 veillard 918: { 39, "apos", "single quote" },
1.1 daniel 919: { 60, "lt", "less-than sign, U+003C ISOnum" },
920: { 62, "gt", "greater-than sign, U+003E ISOnum" },
921:
922: /*
923: * A bunch still in the 128-255 range
924: * Replacing them depend really on the charset used.
925: */
926: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
927: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
928: { 162, "cent", "cent sign, U+00A2 ISOnum" },
929: { 163, "pound","pound sign, U+00A3 ISOnum" },
930: { 164, "curren","currency sign, U+00A4 ISOnum" },
931: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
932: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
933: { 167, "sect", "section sign, U+00A7 ISOnum" },
934: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
935: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
936: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
937: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
938: { 172, "not", "not sign, U+00AC ISOnum" },
939: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
940: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
941: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
942: { 176, "deg", "degree sign, U+00B0 ISOnum" },
943: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
944: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
945: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
946: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
947: { 181, "micro","micro sign, U+00B5 ISOnum" },
948: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 949: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 950: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
951: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
952: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 953: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 954: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
955: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
956: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
957: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
958: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
959: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
960: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
961: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
962: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
963: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
964: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
965: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
966: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
967: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
968: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
969: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
970: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
971: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
972: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
973: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
974: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
975: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
976: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
977: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
978: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
979: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
980: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
981: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 982: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 983: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
984: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
985: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
986: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
987: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
988: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
989: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
990: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
991: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
992: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
993: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
994: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
995: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
996: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
997: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
998: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
999: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1000: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1001: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1002: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1003: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1004: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1005: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1006: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1007: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1008: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1009: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1010: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1011: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1012: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1013: { 247, "divide","division sign, U+00F7 ISOnum" },
1014: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1015: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1016: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1017: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1018: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1019: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1020: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1021: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1022:
1.61 veillard 1023: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1024: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1025: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1026: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1027: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1028:
1.1 daniel 1029: /*
1030: * Anything below should really be kept as entities references
1031: */
1032: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1033:
1.61 veillard 1034: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1035: { 732, "tilde","small tilde, U+02DC ISOdia" },
1036:
1.1 daniel 1037: { 913, "Alpha","greek capital letter alpha, U+0391" },
1038: { 914, "Beta", "greek capital letter beta, U+0392" },
1039: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1040: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1041: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1042: { 918, "Zeta", "greek capital letter zeta, U+0396" },
1043: { 919, "Eta", "greek capital letter eta, U+0397" },
1044: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1045: { 921, "Iota", "greek capital letter iota, U+0399" },
1046: { 922, "Kappa","greek capital letter kappa, U+039A" },
1047: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1048: { 924, "Mu", "greek capital letter mu, U+039C" },
1049: { 925, "Nu", "greek capital letter nu, U+039D" },
1050: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1051: { 927, "Omicron","greek capital letter omicron, U+039F" },
1052: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1053: { 929, "Rho", "greek capital letter rho, U+03A1" },
1054: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1055: { 932, "Tau", "greek capital letter tau, U+03A4" },
1056: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1057: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1058: { 935, "Chi", "greek capital letter chi, U+03A7" },
1059: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1060: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1061:
1062: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1063: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1064: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1065: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1066: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1067: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1068: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1069: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1070: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1071: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1072: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1073: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1074: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1075: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1076: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1077: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1078: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1079: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1080: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1081: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1082: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1083: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1084: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1085: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1086: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1087: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1088: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1089: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1090:
1.61 veillard 1091: { 8194, "ensp", "en space, U+2002 ISOpub" },
1092: { 8195, "emsp", "em space, U+2003 ISOpub" },
1093: { 8201, "thinsp","thin space, U+2009 ISOpub" },
1094: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1095: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1096: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1097: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1098: { 8211, "ndash","en dash, U+2013 ISOpub" },
1099: { 8212, "mdash","em dash, U+2014 ISOpub" },
1100: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1101: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1102: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1103: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1104: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1105: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1106: { 8224, "dagger","dagger, U+2020 ISOpub" },
1107: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1108:
1.1 daniel 1109: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1110: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1.61 veillard 1111:
1112: { 8240, "permil","per mille sign, U+2030 ISOtech" },
1113:
1.1 daniel 1114: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1115: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1.61 veillard 1116:
1117: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1118: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1119:
1.1 daniel 1120: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1121: { 8260, "frasl","fraction slash, U+2044 NEW" },
1122:
1.61 veillard 1123: { 8364, "euro", "euro sign, U+20AC NEW" },
1124:
1125: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1.7 daniel 1126: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 1127: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1128: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1129: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1130: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1131: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1132: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1133: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1134: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1135: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1136: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1137: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1138: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1139: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1140: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1141:
1142: { 8704, "forall","for all, U+2200 ISOtech" },
1143: { 8706, "part", "partial differential, U+2202 ISOtech" },
1144: { 8707, "exist","there exists, U+2203 ISOtech" },
1145: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1146: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1147: { 8712, "isin", "element of, U+2208 ISOtech" },
1148: { 8713, "notin","not an element of, U+2209 ISOtech" },
1149: { 8715, "ni", "contains as member, U+220B ISOtech" },
1150: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1151: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1152: { 8722, "minus","minus sign, U+2212 ISOtech" },
1153: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1154: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1155: { 8733, "prop", "proportional to, U+221D ISOtech" },
1156: { 8734, "infin","infinity, U+221E ISOtech" },
1157: { 8736, "ang", "angle, U+2220 ISOamso" },
1158: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1159: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1160: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1161: { 8746, "cup", "union = cup, U+222A ISOtech" },
1162: { 8747, "int", "integral, U+222B ISOtech" },
1163: { 8756, "there4","therefore, U+2234 ISOtech" },
1164: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1165: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1166: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1167: { 8800, "ne", "not equal to, U+2260 ISOtech" },
1168: { 8801, "equiv","identical to, U+2261 ISOtech" },
1169: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1170: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1171: { 8834, "sub", "subset of, U+2282 ISOtech" },
1172: { 8835, "sup", "superset of, U+2283 ISOtech" },
1173: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1174: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1175: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1176: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1177: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1178: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1179: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1180: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1181: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1182: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1183: { 8971, "rfloor","right floor, U+230B ISOamsc" },
1184: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1185: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1186: { 9674, "loz", "lozenge, U+25CA ISOpub" },
1187:
1188: { 9824, "spades","black spade suit, U+2660 ISOpub" },
1189: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1190: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1191: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1192:
1193: };
1194:
1195: /************************************************************************
1196: * *
1197: * Commodity functions to handle entities *
1198: * *
1199: ************************************************************************/
1200:
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
 * The result of xmlRealloc() goes through a temporary so the previous
 * allocation is not lost on failure: in that case the old buffer is
 * released and the ENCLOSING FUNCTION returns NULL. The macro may
 * therefore only be used in functions returning a pointer, on a buffer
 * that the function owns.
 * NOTE(review): assumes xmlRealloc() leaves the original block valid
 * when it fails (realloc semantics) - confirm against xmlmemory.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown_##buffer;						\
    buffer##_size *= 2;							\
    grown_##buffer = (xmlChar *)					\
        xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));		\
    if (grown_##buffer == NULL) {					\
        perror("realloc failed");					\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = grown_##buffer;						\
}
1212:
1213: /**
1214: * htmlEntityLookup:
1215: * @name: the entity name
1216: *
1217: * Lookup the given entity in EntitiesTable
1218: *
1219: * TODO: the linear scan is really ugly, an hash table is really needed.
1220: *
1221: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1222: */
1223: htmlEntityDescPtr
1.14 daniel 1224: htmlEntityLookup(const xmlChar *name) {
1.1 daniel 1225: int i;
1226:
1227: for (i = 0;i < (sizeof(html40EntitiesTable)/
1228: sizeof(html40EntitiesTable[0]));i++) {
1.8 daniel 1229: if (!xmlStrcmp(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 1230: #ifdef DEBUG
1.18 daniel 1231: fprintf(stderr,"Found entity %s\n", name);
1.1 daniel 1232: #endif
1233: return(&html40EntitiesTable[i]);
1234: }
1235: }
1236: return(NULL);
1237: }
1238:
1.53 veillard 1239: /**
1.61 veillard 1240: * htmlEntityValueLookup:
1241: * @value: the entity's unicode value
1242: *
1243: * Lookup the given entity in EntitiesTable
1244: *
1245: * TODO: the linear scan is really ugly, an hash table is really needed.
1246: *
1247: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1248: */
1249: htmlEntityDescPtr
1250: htmlEntityValueLookup(int value) {
1251: int i;
1252: #ifdef DEBUG
1253: int lv = 0;
1254: #endif
1255:
1256: for (i = 0;i < (sizeof(html40EntitiesTable)/
1257: sizeof(html40EntitiesTable[0]));i++) {
1258: if (html40EntitiesTable[i].value >= value) {
1259: if (html40EntitiesTable[i].value > value)
1260: break;
1261: #ifdef DEBUG
1262: fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name);
1263: #endif
1264: return(&html40EntitiesTable[i]);
1265: }
1266: #ifdef DEBUG
1267: if (lv > html40EntitiesTable[i].value) {
1268: fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1269: lv, html40EntitiesTable[i].value);
1270: }
1271: lv = html40EntitiesTable[i].value;
1272: #endif
1273: }
1274: return(NULL);
1275: }
1276:
1277: /**
1.53 veillard 1278: * UTF8ToHtml:
1279: * @out: a pointer to an array of bytes to store the result
1280: * @outlen: the length of @out
1281: * @in: a pointer to an array of UTF-8 chars
1282: * @inlen: the length of @in
1283: *
1284: * Take a block of UTF-8 chars in and try to convert it to an ASCII
1285: * plus HTML entities block of chars out.
1286: *
1287: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1288: * The value of @inlen after return is the number of octets consumed
1289: * as the return value is positive, else unpredictiable.
1290: * The value of @outlen after return is the number of octets consumed.
1291: */
1292: int
1293: UTF8ToHtml(unsigned char* out, int *outlen,
1294: const unsigned char* in, int *inlen) {
1295: const unsigned char* processed = in;
1296: const unsigned char* outend;
1297: const unsigned char* outstart = out;
1298: const unsigned char* instart = in;
1299: const unsigned char* inend;
1300: unsigned int c, d;
1301: int trailing;
1302:
1303: if (in == NULL) {
1304: /*
1305: * initialization nothing to do
1306: */
1307: *outlen = 0;
1308: *inlen = 0;
1309: return(0);
1310: }
1311: inend = in + (*inlen);
1312: outend = out + (*outlen);
1313: while (in < inend) {
1314: d = *in++;
1315: if (d < 0x80) { c= d; trailing= 0; }
1316: else if (d < 0xC0) {
1317: /* trailing byte in leading position */
1318: *outlen = out - outstart;
1319: *inlen = processed - instart;
1320: return(-2);
1321: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1322: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1323: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1324: else {
1325: /* no chance for this in Ascii */
1326: *outlen = out - outstart;
1327: *inlen = processed - instart;
1328: return(-2);
1329: }
1330:
1331: if (inend - in < trailing) {
1332: break;
1333: }
1334:
1335: for ( ; trailing; trailing--) {
1336: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1337: break;
1338: c <<= 6;
1339: c |= d & 0x3F;
1340: }
1341:
1342: /* assertion: c is a single UTF-4 value */
1343: if (c < 0x80) {
1.62 veillard 1344: if (out + 1 >= outend)
1.53 veillard 1345: break;
1346: *out++ = c;
1347: } else {
1.61 veillard 1348: int len;
1349: htmlEntityDescPtr ent;
1350:
1.53 veillard 1351: /*
1352: * Try to lookup a predefined HTML entity for it
1353: */
1354:
1.61 veillard 1355: ent = htmlEntityValueLookup(c);
1356: if (ent == NULL) {
1357: /* no chance for this in Ascii */
1358: *outlen = out - outstart;
1359: *inlen = processed - instart;
1360: return(-2);
1.53 veillard 1361: }
1.61 veillard 1362: len = strlen(ent->name);
1.62 veillard 1363: if (out + 2 + len >= outend)
1.53 veillard 1364: break;
1365: *out++ = '&';
1.61 veillard 1366: memcpy(out, ent->name, len);
1367: out += len;
1.53 veillard 1368: *out++ = ';';
1369: }
1370: processed = in;
1371: }
1372: *outlen = out - outstart;
1373: *inlen = processed - instart;
1374: return(0);
1375: }
1376:
1.62 veillard 1377: /**
1378: * htmlEncodeEntities:
1379: * @out: a pointer to an array of bytes to store the result
1380: * @outlen: the length of @out
1381: * @in: a pointer to an array of UTF-8 chars
1382: * @inlen: the length of @in
1383: * @quoteChar: the quote character to escape (' or ") or zero.
1384: *
1385: * Take a block of UTF-8 chars in and try to convert it to an ASCII
1386: * plus HTML entities block of chars out.
1387: *
1388: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1389: * The value of @inlen after return is the number of octets consumed
1390: * as the return value is positive, else unpredictiable.
1391: * The value of @outlen after return is the number of octets consumed.
1392: */
1393: int
1394: htmlEncodeEntities(unsigned char* out, int *outlen,
1395: const unsigned char* in, int *inlen, int quoteChar) {
1396: const unsigned char* processed = in;
1397: const unsigned char* outend = out + (*outlen);
1398: const unsigned char* outstart = out;
1399: const unsigned char* instart = in;
1400: const unsigned char* inend = in + (*inlen);
1401: unsigned int c, d;
1402: int trailing;
1403:
1404: while (in < inend) {
1405: d = *in++;
1406: if (d < 0x80) { c= d; trailing= 0; }
1407: else if (d < 0xC0) {
1408: /* trailing byte in leading position */
1409: *outlen = out - outstart;
1410: *inlen = processed - instart;
1411: return(-2);
1412: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1413: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1414: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1415: else {
1416: /* no chance for this in Ascii */
1417: *outlen = out - outstart;
1418: *inlen = processed - instart;
1419: return(-2);
1420: }
1421:
1422: if (inend - in < trailing)
1423: break;
1424:
1425: while (trailing--) {
1426: if (((d= *in++) & 0xC0) != 0x80) {
1427: *outlen = out - outstart;
1428: *inlen = processed - instart;
1429: return(-2);
1430: }
1431: c <<= 6;
1432: c |= d & 0x3F;
1433: }
1434:
1435: /* assertion: c is a single UTF-4 value */
1436: if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1437: if (out >= outend)
1438: break;
1439: *out++ = c;
1440: } else {
1441: htmlEntityDescPtr ent;
1442: const char *cp;
1443: char nbuf[16];
1444: int len;
1445:
1446: /*
1447: * Try to lookup a predefined HTML entity for it
1448: */
1449: ent = htmlEntityValueLookup(c);
1450: if (ent == NULL) {
1451: sprintf(nbuf, "#%u", c);
1452: cp = nbuf;
1453: }
1454: else
1455: cp = ent->name;
1456: len = strlen(cp);
1457: if (out + 2 + len > outend)
1458: break;
1459: *out++ = '&';
1460: memcpy(out, cp, len);
1461: out += len;
1462: *out++ = ';';
1463: }
1464: processed = in;
1465: }
1466: *outlen = out - outstart;
1467: *inlen = processed - instart;
1468: return(0);
1469: }
1.1 daniel 1470:
1471: /**
1472: * htmlDecodeEntities:
1473: * @ctxt: the parser context
1474: * @len: the len to decode (in bytes !), -1 for no size limit
1.14 daniel 1475: * @end: an end marker xmlChar, 0 if none
1476: * @end2: an end marker xmlChar, 0 if none
1477: * @end3: an end marker xmlChar, 0 if none
1.1 daniel 1478: *
1479: * Subtitute the HTML entities by their value
1480: *
1.19 daniel 1481: * DEPRECATED !!!!
1.1 daniel 1482: *
1483: * Returns A newly allocated string with the substitution done. The caller
1484: * must deallocate it !
1485: */
1.14 daniel 1486: xmlChar *
1.1 daniel 1487: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1.14 daniel 1488: xmlChar end, xmlChar end2, xmlChar end3) {
1.53 veillard 1489: xmlChar *name = NULL;
1.14 daniel 1490: xmlChar *buffer = NULL;
1.53 veillard 1491: unsigned int buffer_size = 0;
1492: unsigned int nbchars = 0;
1.1 daniel 1493: htmlEntityDescPtr ent;
1494: unsigned int max = (unsigned int) len;
1.53 veillard 1495: int c,l;
1496:
1497: if (ctxt->depth > 40) {
1498: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1499: ctxt->sax->error(ctxt->userData,
1500: "Detected entity reference loop\n");
1501: ctxt->wellFormed = 0;
1502: ctxt->disableSAX = 1;
1503: ctxt->errNo = XML_ERR_ENTITY_LOOP;
1504: return(NULL);
1505: }
1.1 daniel 1506:
1507: /*
1508: * allocate a translation buffer.
1509: */
1.31 daniel 1510: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1.14 daniel 1511: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1.1 daniel 1512: if (buffer == NULL) {
1.53 veillard 1513: perror("xmlDecodeEntities: malloc failed");
1.1 daniel 1514: return(NULL);
1515: }
1516:
1517: /*
1518: * Ok loop until we reach one of the ending char or a size limit.
1519: */
1.53 veillard 1520: c = CUR_CHAR(l);
1521: while ((nbchars < max) && (c != end) &&
1522: (c != end2) && (c != end3)) {
1523:
1524: if (c == 0) break;
1525: if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1526: int val = htmlParseCharRef(ctxt);
1527: COPY_BUF(0,buffer,nbchars,val);
1528: NEXTL(l);
1529: } else if ((c == '&') && (ctxt->token != '&')) {
1530: ent = htmlParseEntityRef(ctxt, &name);
1531: if (name != NULL) {
1532: if (ent != NULL) {
1533: int val = ent->value;
1534: COPY_BUF(0,buffer,nbchars,val);
1535: NEXTL(l);
1536: } else {
1537: const xmlChar *cur = name;
1.1 daniel 1538:
1.53 veillard 1539: buffer[nbchars++] = '&';
1540: if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1541: growBuffer(buffer);
1542: }
1543: while (*cur != 0) {
1544: buffer[nbchars++] = *cur++;
1.1 daniel 1545: }
1.53 veillard 1546: buffer[nbchars++] = ';';
1.1 daniel 1547: }
1548: }
1549: } else {
1.53 veillard 1550: COPY_BUF(l,buffer,nbchars,c);
1551: NEXTL(l);
1552: if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1553: growBuffer(buffer);
1.1 daniel 1554: }
1555: }
1.53 veillard 1556: c = CUR_CHAR(l);
1.1 daniel 1557: }
1.53 veillard 1558: buffer[nbchars++] = 0;
1.1 daniel 1559: return(buffer);
1560: }
1561:
1.31 daniel 1562: /************************************************************************
1563: * *
1564: * Commodity functions to handle streams *
1565: * *
1566: ************************************************************************/
1567:
1568: /**
1569: * htmlFreeInputStream:
1570: * @input: an htmlParserInputPtr
1571: *
1572: * Free up an input stream.
1573: */
1574: void
1575: htmlFreeInputStream(htmlParserInputPtr input) {
1576: if (input == NULL) return;
1577:
1578: if (input->filename != NULL) xmlFree((char *) input->filename);
1579: if (input->directory != NULL) xmlFree((char *) input->directory);
1580: if ((input->free != NULL) && (input->base != NULL))
1581: input->free((xmlChar *) input->base);
1582: if (input->buf != NULL)
1583: xmlFreeParserInputBuffer(input->buf);
1584: memset(input, -1, sizeof(htmlParserInput));
1585: xmlFree(input);
1586: }
1587:
1588: /**
1589: * htmlNewInputStream:
1590: * @ctxt: an HTML parser context
1591: *
1592: * Create a new input stream structure
1593: * Returns the new input stream or NULL
1594: */
1595: htmlParserInputPtr
1596: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1597: htmlParserInputPtr input;
1598:
1599: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1600: if (input == NULL) {
1601: ctxt->errNo = XML_ERR_NO_MEMORY;
1602: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1603: ctxt->sax->error(ctxt->userData,
1604: "malloc: couldn't allocate a new input stream\n");
1605: ctxt->errNo = XML_ERR_NO_MEMORY;
1606: return(NULL);
1607: }
1.51 veillard 1608: memset(input, 0, sizeof(htmlParserInput));
1.31 daniel 1609: input->filename = NULL;
1610: input->directory = NULL;
1611: input->base = NULL;
1612: input->cur = NULL;
1613: input->buf = NULL;
1614: input->line = 1;
1615: input->col = 1;
1616: input->buf = NULL;
1617: input->free = NULL;
1.51 veillard 1618: input->version = NULL;
1.31 daniel 1619: input->consumed = 0;
1620: input->length = 0;
1621: return(input);
1622: }
1623:
1.1 daniel 1624:
1625: /************************************************************************
1626: * *
1627: * Commodity functions, cleanup needed ? *
1628: * *
1629: ************************************************************************/
1630:
1631: /**
1632: * areBlanks:
1633: * @ctxt: an HTML parser context
1.14 daniel 1634: * @str: a xmlChar *
1.1 daniel 1635: * @len: the size of @str
1636: *
1637: * Is this a sequence of blank chars that one can ignore ?
1638: *
1639: * Returns 1 if ignorable 0 otherwise.
1640: */
1641:
1.14 daniel 1642: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1.1 daniel 1643: int i;
1644: xmlNodePtr lastChild;
1645:
1646: for (i = 0;i < len;i++)
1647: if (!(IS_BLANK(str[i]))) return(0);
1648:
1.48 daniel 1649: if (CUR == 0) return(1);
1.1 daniel 1650: if (CUR != '<') return(0);
1.62 veillard 1651: if (ctxt->name == NULL)
1652: return(1);
1.63 veillard 1653: if (!xmlStrcmp(ctxt->name, BAD_CAST"html"))
1654: return(1);
1.62 veillard 1655: if (!xmlStrcmp(ctxt->name, BAD_CAST"head"))
1656: return(1);
1657: if (!xmlStrcmp(ctxt->name, BAD_CAST"body"))
1658: return(1);
1.1 daniel 1659: if (ctxt->node == NULL) return(0);
1660: lastChild = xmlGetLastChild(ctxt->node);
1661: if (lastChild == NULL) {
1662: if (ctxt->node->content != NULL) return(0);
1663: } else if (xmlNodeIsText(lastChild))
1664: return(0);
1665: return(1);
1666: }
1667:
1668: /**
1669: * htmlHandleEntity:
1670: * @ctxt: an HTML parser context
1671: * @entity: an XML entity pointer.
1672: *
1673: * Default handling of an HTML entity, call the parser with the
1674: * substitution string
1675: */
1676:
1677: void
1678: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1679: int len;
1680:
1681: if (entity->content == NULL) {
1682: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1683: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1684: entity->name);
1685: ctxt->wellFormed = 0;
1686: return;
1687: }
1688: len = xmlStrlen(entity->content);
1689:
1690: /*
1691: * Just handle the content as a set of chars.
1692: */
1.59 veillard 1693: htmlCheckParagraph(ctxt);
1.1 daniel 1694: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1695: ctxt->sax->characters(ctxt->userData, entity->content, len);
1696:
1697: }
1698:
1699: /**
1.59 veillard 1700: * htmlNewDocNoDtD:
1.1 daniel 1701: * @URI: URI for the dtd, or NULL
1702: * @ExternalID: the external ID of the DTD, or NULL
1703: *
1.59 veillard 1704: * Returns a new document, do not intialize the DTD if not provided
1.1 daniel 1705: */
1706: htmlDocPtr
1.59 veillard 1707: htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1.1 daniel 1708: xmlDocPtr cur;
1709:
1710: /*
1711: * Allocate a new document and fill the fields.
1712: */
1.11 daniel 1713: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1.1 daniel 1714: if (cur == NULL) {
1715: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1716: return(NULL);
1717: }
1.10 daniel 1718: memset(cur, 0, sizeof(xmlDoc));
1.1 daniel 1719:
1.20 daniel 1720: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 1721: cur->version = NULL;
1722: cur->intSubset = NULL;
1.59 veillard 1723: if ((ExternalID != NULL) ||
1724: (URI != NULL))
1.28 daniel 1725: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.41 daniel 1726: cur->doc = cur;
1.1 daniel 1727: cur->name = NULL;
1.37 daniel 1728: cur->children = NULL;
1.1 daniel 1729: cur->extSubset = NULL;
1730: cur->oldNs = NULL;
1731: cur->encoding = NULL;
1732: cur->standalone = 1;
1733: cur->compression = 0;
1.12 daniel 1734: cur->ids = NULL;
1735: cur->refs = NULL;
1.1 daniel 1736: #ifndef XML_WITHOUT_CORBA
1737: cur->_private = NULL;
1738: #endif
1739: return(cur);
1740: }
1741:
1.59 veillard 1742: /**
1743: * htmlNewDoc:
1744: * @URI: URI for the dtd, or NULL
1745: * @ExternalID: the external ID of the DTD, or NULL
1746: *
1747: * Returns a new document
1748: */
1749: htmlDocPtr
1750: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1751: if ((URI == NULL) && (ExternalID == NULL))
1752: return(htmlNewDocNoDtD(
1753: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1754: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1755:
1756: return(htmlNewDocNoDtD(URI, ExternalID));
1757: }
1758:
1.1 daniel 1759:
1760: /************************************************************************
1761: * *
1762: * The parser itself *
1763: * Relates to http://www.w3.org/TR/html40 *
1764: * *
1765: ************************************************************************/
1766:
1767: /************************************************************************
1768: * *
1769: * The parser itself *
1770: * *
1771: ************************************************************************/
1772:
1773: /**
1774: * htmlParseHTMLName:
1775: * @ctxt: an HTML parser context
1776: *
1.26 daniel 1777: * parse an HTML tag or attribute name, note that we convert it to lowercase
1.1 daniel 1778: * since HTML names are not case-sensitive.
1779: *
1780: * Returns the Tag Name parsed or NULL
1781: */
1782:
1.14 daniel 1783: xmlChar *
1.1 daniel 1784: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1785: xmlChar *ret = NULL;
1.1 daniel 1786: int i = 0;
1.31 daniel 1787: xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1.1 daniel 1788:
1789: if (!IS_LETTER(CUR) && (CUR != '_') &&
1790: (CUR != ':')) return(NULL);
1791:
1.31 daniel 1792: while ((i < HTML_PARSER_BUFFER_SIZE) &&
1.45 daniel 1793: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1794: (CUR == ':') || (CUR == '_'))) {
1.26 daniel 1795: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1.1 daniel 1796: else loc[i] = CUR;
1797: i++;
1798:
1799: NEXT;
1800: }
1801:
1802: ret = xmlStrndup(loc, i);
1803:
1804: return(ret);
1805: }
1806:
1807: /**
1808: * htmlParseName:
1809: * @ctxt: an HTML parser context
1810: *
1811: * parse an HTML name, this routine is case sensistive.
1812: *
1813: * Returns the Name parsed or NULL
1814: */
1815:
1.14 daniel 1816: xmlChar *
1.1 daniel 1817: htmlParseName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1818: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1819: int len = 0;
1.1 daniel 1820:
1.5 daniel 1821: GROW;
1822: if (!IS_LETTER(CUR) && (CUR != '_')) {
1823: return(NULL);
1824: }
1.1 daniel 1825:
1826: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1827: (CUR == '.') || (CUR == '-') ||
1828: (CUR == '_') || (CUR == ':') ||
1829: (IS_COMBINING(CUR)) ||
1.5 daniel 1830: (IS_EXTENDER(CUR))) {
1831: buf[len++] = CUR;
1.1 daniel 1832: NEXT;
1.5 daniel 1833: if (len >= HTML_MAX_NAMELEN) {
1834: fprintf(stderr,
1835: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1836: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1837: (CUR == '.') || (CUR == '-') ||
1838: (CUR == '_') || (CUR == ':') ||
1839: (IS_COMBINING(CUR)) ||
1840: (IS_EXTENDER(CUR)))
1841: NEXT;
1842: break;
1843: }
1844: }
1845: return(xmlStrndup(buf, len));
1.1 daniel 1846: }
1847:
1848: /**
1849: * htmlParseHTMLAttribute:
1850: * @ctxt: an HTML parser context
1.19 daniel 1851: * @stop: a char stop value
1.1 daniel 1852: *
1.19 daniel 1853: * parse an HTML attribute value till the stop (quote), if
1854: * stop is 0 then it stops at the first space
1.1 daniel 1855: *
1.19 daniel 1856: * Returns the attribute parsed or NULL
1.1 daniel 1857: */
1858:
1.14 daniel 1859: xmlChar *
1.19 daniel 1860: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1.32 daniel 1861: #if 0
1.14 daniel 1862: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1863: int len = 0;
1.1 daniel 1864:
1.5 daniel 1865: GROW;
1.19 daniel 1866: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1867: if ((stop == 0) && (IS_BLANK(CUR))) break;
1.5 daniel 1868: buf[len++] = CUR;
1.1 daniel 1869: NEXT;
1.5 daniel 1870: if (len >= HTML_MAX_NAMELEN) {
1871: fprintf(stderr,
1872: "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n");
1873: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
1.19 daniel 1874: (CUR != '>') &&
1.5 daniel 1875: (CUR != '\'') && (CUR != '"'))
1876: NEXT;
1877: break;
1878: }
1879: }
1880: return(xmlStrndup(buf, len));
1.32 daniel 1881: #else
1882: xmlChar *buffer = NULL;
1883: int buffer_size = 0;
1884: xmlChar *out = NULL;
1885: xmlChar *name = NULL;
1886:
1887: xmlChar *cur = NULL;
1888: htmlEntityDescPtr ent;
1889:
1890: /*
1891: * allocate a translation buffer.
1892: */
1893: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1894: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1895: if (buffer == NULL) {
1896: perror("htmlParseHTMLAttribute: malloc failed");
1897: return(NULL);
1898: }
1899: out = buffer;
1900:
1901: /*
1902: * Ok loop until we reach one of the ending chars
1903: */
1904: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1905: if ((stop == 0) && (IS_BLANK(CUR))) break;
1906: if (CUR == '&') {
1907: if (NXT(1) == '#') {
1.52 veillard 1908: unsigned int c;
1909: int bits;
1910:
1911: c = htmlParseCharRef(ctxt);
1912: if (c < 0x80)
1913: { *out++ = c; bits= -6; }
1914: else if (c < 0x800)
1915: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1916: else if (c < 0x10000)
1917: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1918: else
1919: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1920:
1921: for ( ; bits >= 0; bits-= 6) {
1922: *out++ = ((c >> bits) & 0x3F) | 0x80;
1923: }
1.32 daniel 1924: } else {
1925: ent = htmlParseEntityRef(ctxt, &name);
1926: if (name == NULL) {
1927: *out++ = '&';
1928: if (out - buffer > buffer_size - 100) {
1929: int index = out - buffer;
1930:
1931: growBuffer(buffer);
1932: out = &buffer[index];
1933: }
1.52 veillard 1934: } else if (ent == NULL) {
1.32 daniel 1935: *out++ = '&';
1936: cur = name;
1937: while (*cur != 0) {
1938: if (out - buffer > buffer_size - 100) {
1939: int index = out - buffer;
1940:
1941: growBuffer(buffer);
1942: out = &buffer[index];
1943: }
1944: *out++ = *cur++;
1945: }
1946: xmlFree(name);
1947: } else {
1.52 veillard 1948: unsigned int c;
1949: int bits;
1950:
1.32 daniel 1951: if (out - buffer > buffer_size - 100) {
1952: int index = out - buffer;
1953:
1954: growBuffer(buffer);
1955: out = &buffer[index];
1956: }
1.52 veillard 1957: c = (xmlChar)ent->value;
1958: if (c < 0x80)
1959: { *out++ = c; bits= -6; }
1960: else if (c < 0x800)
1961: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1962: else if (c < 0x10000)
1963: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1964: else
1965: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1966:
1967: for ( ; bits >= 0; bits-= 6) {
1968: *out++ = ((c >> bits) & 0x3F) | 0x80;
1969: }
1.32 daniel 1970: xmlFree(name);
1971: }
1972: }
1973: } else {
1.52 veillard 1974: unsigned int c;
1975: int bits;
1976:
1.32 daniel 1977: if (out - buffer > buffer_size - 100) {
1.52 veillard 1978: int index = out - buffer;
1979:
1980: growBuffer(buffer);
1981: out = &buffer[index];
1982: }
1983: c = CUR;
1984: if (c < 0x80)
1985: { *out++ = c; bits= -6; }
1986: else if (c < 0x800)
1987: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1988: else if (c < 0x10000)
1989: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1990: else
1991: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1992:
1993: for ( ; bits >= 0; bits-= 6) {
1994: *out++ = ((c >> bits) & 0x3F) | 0x80;
1.32 daniel 1995: }
1996: NEXT;
1997: }
1998: }
1999: *out++ = 0;
2000: return(buffer);
2001: #endif
1.1 daniel 2002: }
2003:
2004: /**
2005: * htmlParseNmtoken:
2006: * @ctxt: an HTML parser context
2007: *
2008: * parse an HTML Nmtoken.
2009: *
2010: * Returns the Nmtoken parsed or NULL
2011: */
2012:
1.14 daniel 2013: xmlChar *
1.1 daniel 2014: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.14 daniel 2015: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 2016: int len = 0;
1.1 daniel 2017:
1.5 daniel 2018: GROW;
1.1 daniel 2019: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2020: (CUR == '.') || (CUR == '-') ||
2021: (CUR == '_') || (CUR == ':') ||
2022: (IS_COMBINING(CUR)) ||
1.5 daniel 2023: (IS_EXTENDER(CUR))) {
2024: buf[len++] = CUR;
1.1 daniel 2025: NEXT;
1.5 daniel 2026: if (len >= HTML_MAX_NAMELEN) {
2027: fprintf(stderr,
2028: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2029: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2030: (CUR == '.') || (CUR == '-') ||
2031: (CUR == '_') || (CUR == ':') ||
2032: (IS_COMBINING(CUR)) ||
2033: (IS_EXTENDER(CUR)))
2034: NEXT;
2035: break;
2036: }
2037: }
2038: return(xmlStrndup(buf, len));
1.1 daniel 2039: }
2040:
2041: /**
2042: * htmlParseEntityRef:
2043: * @ctxt: an HTML parser context
2044: * @str: location to store the entity name
2045: *
2046: * parse an HTML ENTITY references
2047: *
2048: * [68] EntityRef ::= '&' Name ';'
2049: *
2050: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2051: * if non-NULL *str will have to be freed by the caller.
2052: */
2053: htmlEntityDescPtr
1.14 daniel 2054: htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2055: xmlChar *name;
1.1 daniel 2056: htmlEntityDescPtr ent = NULL;
2057: *str = NULL;
2058:
2059: if (CUR == '&') {
2060: NEXT;
2061: name = htmlParseName(ctxt);
2062: if (name == NULL) {
2063: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2064: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2065: ctxt->wellFormed = 0;
2066: } else {
1.5 daniel 2067: GROW;
1.1 daniel 2068: if (CUR == ';') {
2069: *str = name;
2070:
2071: /*
2072: * Lookup the entity in the table.
2073: */
2074: ent = htmlEntityLookup(name);
1.32 daniel 2075: if (ent != NULL) /* OK that's ugly !!! */
2076: NEXT;
1.1 daniel 2077: } else {
2078: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2079: ctxt->sax->error(ctxt->userData,
2080: "htmlParseEntityRef: expecting ';'\n");
1.32 daniel 2081: *str = name;
1.1 daniel 2082: }
2083: }
2084: }
2085: return(ent);
2086: }
2087:
2088: /**
2089: * htmlParseAttValue:
2090: * @ctxt: an HTML parser context
2091: *
2092: * parse a value for an attribute
2093: * Note: the parser won't do substitution of entities here, this
2094: * will be handled later in xmlStringGetNodeList, unless it was
2095: * asked for ctxt->replaceEntities != 0
2096: *
2097: * Returns the AttValue parsed or NULL.
2098: */
2099:
1.14 daniel 2100: xmlChar *
1.1 daniel 2101: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1.14 daniel 2102: xmlChar *ret = NULL;
1.1 daniel 2103:
2104: if (CUR == '"') {
2105: NEXT;
1.19 daniel 2106: ret = htmlParseHTMLAttribute(ctxt, '"');
1.1 daniel 2107: if (CUR != '"') {
2108: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2109: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2110: ctxt->wellFormed = 0;
2111: } else
2112: NEXT;
2113: } else if (CUR == '\'') {
2114: NEXT;
1.19 daniel 2115: ret = htmlParseHTMLAttribute(ctxt, '\'');
1.1 daniel 2116: if (CUR != '\'') {
2117: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2118: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2119: ctxt->wellFormed = 0;
2120: } else
2121: NEXT;
2122: } else {
2123: /*
2124: * That's an HTMLism, the attribute value may not be quoted
2125: */
1.19 daniel 2126: ret = htmlParseHTMLAttribute(ctxt, 0);
1.1 daniel 2127: if (ret == NULL) {
2128: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2129: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2130: ctxt->wellFormed = 0;
2131: }
2132: }
2133: return(ret);
2134: }
2135:
2136: /**
2137: * htmlParseSystemLiteral:
2138: * @ctxt: an HTML parser context
2139: *
2140: * parse an HTML Literal
2141: *
2142: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2143: *
2144: * Returns the SystemLiteral parsed or NULL
2145: */
2146:
1.14 daniel 2147: xmlChar *
1.1 daniel 2148: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 2149: const xmlChar *q;
2150: xmlChar *ret = NULL;
1.1 daniel 2151:
2152: if (CUR == '"') {
2153: NEXT;
2154: q = CUR_PTR;
2155: while ((IS_CHAR(CUR)) && (CUR != '"'))
2156: NEXT;
2157: if (!IS_CHAR(CUR)) {
2158: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2159: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2160: ctxt->wellFormed = 0;
2161: } else {
2162: ret = xmlStrndup(q, CUR_PTR - q);
2163: NEXT;
2164: }
2165: } else if (CUR == '\'') {
2166: NEXT;
2167: q = CUR_PTR;
2168: while ((IS_CHAR(CUR)) && (CUR != '\''))
2169: NEXT;
2170: if (!IS_CHAR(CUR)) {
2171: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2172: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2173: ctxt->wellFormed = 0;
2174: } else {
2175: ret = xmlStrndup(q, CUR_PTR - q);
2176: NEXT;
2177: }
2178: } else {
2179: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.38 daniel 2180: ctxt->sax->error(ctxt->userData,
2181: "SystemLiteral \" or ' expected\n");
1.1 daniel 2182: ctxt->wellFormed = 0;
2183: }
2184:
2185: return(ret);
2186: }
2187:
2188: /**
2189: * htmlParsePubidLiteral:
2190: * @ctxt: an HTML parser context
2191: *
2192: * parse an HTML public literal
2193: *
2194: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2195: *
2196: * Returns the PubidLiteral parsed or NULL.
2197: */
2198:
1.14 daniel 2199: xmlChar *
1.1 daniel 2200: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 2201: const xmlChar *q;
2202: xmlChar *ret = NULL;
1.1 daniel 2203: /*
2204: * Name ::= (Letter | '_') (NameChar)*
2205: */
2206: if (CUR == '"') {
2207: NEXT;
2208: q = CUR_PTR;
2209: while (IS_PUBIDCHAR(CUR)) NEXT;
2210: if (CUR != '"') {
2211: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2212: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2213: ctxt->wellFormed = 0;
2214: } else {
2215: ret = xmlStrndup(q, CUR_PTR - q);
2216: NEXT;
2217: }
2218: } else if (CUR == '\'') {
2219: NEXT;
2220: q = CUR_PTR;
2221: while ((IS_LETTER(CUR)) && (CUR != '\''))
2222: NEXT;
2223: if (!IS_LETTER(CUR)) {
2224: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2225: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2226: ctxt->wellFormed = 0;
2227: } else {
2228: ret = xmlStrndup(q, CUR_PTR - q);
2229: NEXT;
2230: }
2231: } else {
2232: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2233: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2234: ctxt->wellFormed = 0;
2235: }
2236:
2237: return(ret);
2238: }
2239:
2240: /**
2241: * htmlParseCharData:
2242: * @ctxt: an HTML parser context
2243: * @cdata: int indicating whether we are within a CDATA section
2244: *
2245: * parse a CharData section.
2246: * if we are within a CDATA section ']]>' marks an end of section.
2247: *
2248: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2249: */
2250:
2251: void
2252: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1.53 veillard 2253: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2254: int nbchar = 0;
2255: int cur, l;
2256:
2257: SHRINK;
2258: cur = CUR_CHAR(l);
2259: while (((cur != '<') || (ctxt->token == '<')) &&
2260: ((cur != '&') || (ctxt->token == '&')) &&
2261: (IS_CHAR(cur))) {
2262: COPY_BUF(l,buf,nbchar,cur);
2263: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2264: /*
2265: * Ok the segment is to be consumed as chars.
2266: */
2267: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2268: if (areBlanks(ctxt, buf, nbchar)) {
2269: if (ctxt->sax->ignorableWhitespace != NULL)
2270: ctxt->sax->ignorableWhitespace(ctxt->userData,
2271: buf, nbchar);
2272: } else {
1.59 veillard 2273: htmlCheckParagraph(ctxt);
1.53 veillard 2274: if (ctxt->sax->characters != NULL)
2275: ctxt->sax->characters(ctxt->userData, buf, nbchar);
2276: }
1.1 daniel 2277: }
1.53 veillard 2278: nbchar = 0;
1.1 daniel 2279: }
1.53 veillard 2280: NEXTL(l);
2281: cur = CUR_CHAR(l);
2282: }
2283: if (nbchar != 0) {
2284: /*
2285: * Ok the segment is to be consumed as chars.
2286: */
2287: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2288: if (areBlanks(ctxt, buf, nbchar)) {
2289: if (ctxt->sax->ignorableWhitespace != NULL)
2290: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2291: } else {
1.59 veillard 2292: htmlCheckParagraph(ctxt);
1.53 veillard 2293: if (ctxt->sax->characters != NULL)
2294: ctxt->sax->characters(ctxt->userData, buf, nbchar);
1.25 daniel 2295: }
2296: }
1.1 daniel 2297: }
2298: }
2299:
2300: /**
2301: * htmlParseExternalID:
2302: * @ctxt: an HTML parser context
1.14 daniel 2303: * @publicID: a xmlChar** receiving PubidLiteral
1.1 daniel 2304: * @strict: indicate whether we should restrict parsing to only
2305: * production [75], see NOTE below
2306: *
2307: * Parse an External ID or a Public ID
2308: *
2309: * NOTE: Productions [75] and [83] interract badly since [75] can generate
2310: * 'PUBLIC' S PubidLiteral S SystemLiteral
2311: *
2312: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2313: * | 'PUBLIC' S PubidLiteral S SystemLiteral
2314: *
2315: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2316: *
2317: * Returns the function returns SystemLiteral and in the second
2318: * case publicID receives PubidLiteral, is strict is off
2319: * it is possible to return NULL and have publicID set.
2320: */
2321:
1.14 daniel 2322: xmlChar *
2323: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2324: xmlChar *URI = NULL;
1.1 daniel 2325:
2326: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2327: (UPP(2) == 'S') && (UPP(3) == 'T') &&
2328: (UPP(4) == 'E') && (UPP(5) == 'M')) {
2329: SKIP(6);
2330: if (!IS_BLANK(CUR)) {
2331: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2332: ctxt->sax->error(ctxt->userData,
2333: "Space required after 'SYSTEM'\n");
2334: ctxt->wellFormed = 0;
2335: }
2336: SKIP_BLANKS;
2337: URI = htmlParseSystemLiteral(ctxt);
2338: if (URI == NULL) {
2339: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2340: ctxt->sax->error(ctxt->userData,
2341: "htmlParseExternalID: SYSTEM, no URI\n");
2342: ctxt->wellFormed = 0;
2343: }
2344: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2345: (UPP(2) == 'B') && (UPP(3) == 'L') &&
2346: (UPP(4) == 'I') && (UPP(5) == 'C')) {
2347: SKIP(6);
2348: if (!IS_BLANK(CUR)) {
2349: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2350: ctxt->sax->error(ctxt->userData,
2351: "Space required after 'PUBLIC'\n");
2352: ctxt->wellFormed = 0;
2353: }
2354: SKIP_BLANKS;
2355: *publicID = htmlParsePubidLiteral(ctxt);
2356: if (*publicID == NULL) {
2357: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2358: ctxt->sax->error(ctxt->userData,
2359: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2360: ctxt->wellFormed = 0;
2361: }
1.5 daniel 2362: SKIP_BLANKS;
2363: if ((CUR == '"') || (CUR == '\'')) {
2364: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 2365: }
2366: }
2367: return(URI);
2368: }
2369:
2370: /**
2371: * htmlParseComment:
2372: * @ctxt: an HTML parser context
2373: *
2374: * Parse an XML (SGML) comment <!-- .... -->
2375: *
2376: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2377: */
2378: void
1.31 daniel 2379: htmlParseComment(htmlParserCtxtPtr ctxt) {
1.25 daniel 2380: xmlChar *buf = NULL;
1.56 veillard 2381: int len;
1.31 daniel 2382: int size = HTML_PARSER_BUFFER_SIZE;
1.56 veillard 2383: int q, ql;
2384: int r, rl;
2385: int cur, l;
2386: xmlParserInputState state;
1.1 daniel 2387:
2388: /*
2389: * Check that there is a comment right here.
2390: */
1.56 veillard 2391: if ((RAW != '<') || (NXT(1) != '!') ||
1.1 daniel 2392: (NXT(2) != '-') || (NXT(3) != '-')) return;
2393:
1.56 veillard 2394: state = ctxt->instate;
2395: ctxt->instate = XML_PARSER_COMMENT;
2396: SHRINK;
2397: SKIP(4);
1.25 daniel 2398: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2399: if (buf == NULL) {
2400: fprintf(stderr, "malloc of %d byte failed\n", size);
1.56 veillard 2401: ctxt->instate = state;
1.25 daniel 2402: return;
2403: }
1.56 veillard 2404: q = CUR_CHAR(ql);
2405: NEXTL(ql);
2406: r = CUR_CHAR(rl);
2407: NEXTL(rl);
2408: cur = CUR_CHAR(l);
2409: len = 0;
2410: while (IS_CHAR(cur) &&
2411: ((cur != '>') ||
2412: (r != '-') || (q != '-'))) {
2413: if (len + 5 >= size) {
1.25 daniel 2414: size *= 2;
1.50 veillard 2415: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
1.25 daniel 2416: if (buf == NULL) {
2417: fprintf(stderr, "realloc of %d byte failed\n", size);
1.56 veillard 2418: ctxt->instate = state;
1.25 daniel 2419: return;
2420: }
2421: }
1.56 veillard 2422: COPY_BUF(ql,buf,len,q);
1.25 daniel 2423: q = r;
1.56 veillard 2424: ql = rl;
2425: r = cur;
2426: rl = l;
2427: NEXTL(l);
2428: cur = CUR_CHAR(l);
2429: if (cur == 0) {
2430: SHRINK;
2431: GROW;
2432: cur = CUR_CHAR(l);
2433: }
1.1 daniel 2434: }
1.56 veillard 2435: buf[len] = 0;
2436: if (!IS_CHAR(cur)) {
1.1 daniel 2437: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.56 veillard 2438: ctxt->sax->error(ctxt->userData,
2439: "Comment not terminated \n<!--%.50s\n", buf);
2440: ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
1.1 daniel 2441: ctxt->wellFormed = 0;
1.56 veillard 2442: xmlFree(buf);
1.1 daniel 2443: } else {
2444: NEXT;
1.56 veillard 2445: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2446: (!ctxt->disableSAX))
1.31 daniel 2447: ctxt->sax->comment(ctxt->userData, buf);
1.56 veillard 2448: xmlFree(buf);
1.1 daniel 2449: }
1.56 veillard 2450: ctxt->instate = state;
1.1 daniel 2451: }
2452:
2453: /**
2454: * htmlParseCharRef:
2455: * @ctxt: an HTML parser context
2456: *
2457: * parse Reference declarations
2458: *
2459: * [66] CharRef ::= '&#' [0-9]+ ';' |
2460: * '&#x' [0-9a-fA-F]+ ';'
2461: *
2462: * Returns the value parsed (as an int)
2463: */
2464: int
2465: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2466: int val = 0;
2467:
2468: if ((CUR == '&') && (NXT(1) == '#') &&
2469: (NXT(2) == 'x')) {
2470: SKIP(3);
2471: while (CUR != ';') {
2472: if ((CUR >= '0') && (CUR <= '9'))
2473: val = val * 16 + (CUR - '0');
2474: else if ((CUR >= 'a') && (CUR <= 'f'))
2475: val = val * 16 + (CUR - 'a') + 10;
2476: else if ((CUR >= 'A') && (CUR <= 'F'))
2477: val = val * 16 + (CUR - 'A') + 10;
2478: else {
2479: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2480: ctxt->sax->error(ctxt->userData,
2481: "htmlParseCharRef: invalid hexadecimal value\n");
2482: ctxt->wellFormed = 0;
2483: val = 0;
2484: break;
2485: }
2486: NEXT;
2487: }
2488: if (CUR == ';')
2489: NEXT;
2490: } else if ((CUR == '&') && (NXT(1) == '#')) {
2491: SKIP(2);
2492: while (CUR != ';') {
2493: if ((CUR >= '0') && (CUR <= '9'))
2494: val = val * 10 + (CUR - '0');
2495: else {
2496: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2497: ctxt->sax->error(ctxt->userData,
2498: "htmlParseCharRef: invalid decimal value\n");
2499: ctxt->wellFormed = 0;
2500: val = 0;
2501: break;
2502: }
2503: NEXT;
2504: }
2505: if (CUR == ';')
2506: NEXT;
2507: } else {
2508: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2509: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2510: ctxt->wellFormed = 0;
2511: }
2512: /*
2513: * Check the value IS_CHAR ...
2514: */
2515: if (IS_CHAR(val)) {
2516: return(val);
2517: } else {
2518: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.14 daniel 2519: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
1.1 daniel 2520: val);
2521: ctxt->wellFormed = 0;
2522: }
2523: return(0);
2524: }
2525:
2526:
2527: /**
2528: * htmlParseDocTypeDecl :
2529: * @ctxt: an HTML parser context
2530: *
2531: * parse a DOCTYPE declaration
2532: *
2533: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2534: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2535: */
2536:
2537: void
2538: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1.14 daniel 2539: xmlChar *name;
2540: xmlChar *ExternalID = NULL;
2541: xmlChar *URI = NULL;
1.1 daniel 2542:
2543: /*
2544: * We know that '<!DOCTYPE' has been detected.
2545: */
2546: SKIP(9);
2547:
2548: SKIP_BLANKS;
2549:
2550: /*
2551: * Parse the DOCTYPE name.
2552: */
2553: name = htmlParseName(ctxt);
2554: if (name == NULL) {
2555: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2556: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2557: ctxt->wellFormed = 0;
2558: }
2559: /*
2560: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2561: */
2562:
2563: SKIP_BLANKS;
2564:
2565: /*
2566: * Check for SystemID and ExternalID
2567: */
1.5 daniel 2568: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 2569: SKIP_BLANKS;
2570:
2571: /*
2572: * We should be at the end of the DOCTYPE declaration.
2573: */
2574: if (CUR != '>') {
2575: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2576: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2577: ctxt->wellFormed = 0;
2578: /* We shouldn't try to resynchronize ... */
2579: }
2580: NEXT;
2581:
2582: /*
1.46 daniel 2583: * Create or update the document accordingly to the DOCTYPE
1.1 daniel 2584: */
1.46 daniel 2585: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2586: (!ctxt->disableSAX))
2587: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
1.1 daniel 2588:
2589: /*
2590: * Cleanup, since we don't use all those identifiers
2591: */
1.11 daniel 2592: if (URI != NULL) xmlFree(URI);
2593: if (ExternalID != NULL) xmlFree(ExternalID);
2594: if (name != NULL) xmlFree(name);
1.1 daniel 2595: }
2596:
2597: /**
2598: * htmlParseAttribute:
2599: * @ctxt: an HTML parser context
1.14 daniel 2600: * @value: a xmlChar ** used to store the value of the attribute
1.1 daniel 2601: *
2602: * parse an attribute
2603: *
2604: * [41] Attribute ::= Name Eq AttValue
2605: *
2606: * [25] Eq ::= S? '=' S?
2607: *
2608: * With namespace:
2609: *
2610: * [NS 11] Attribute ::= QName Eq AttValue
2611: *
2612: * Also the case QName == xmlns:??? is handled independently as a namespace
2613: * definition.
2614: *
2615: * Returns the attribute name, and the value in *value.
2616: */
2617:
1.14 daniel 2618: xmlChar *
2619: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1.31 daniel 2620: xmlChar *name, *val = NULL;
1.1 daniel 2621:
2622: *value = NULL;
2623: name = htmlParseName(ctxt);
2624: if (name == NULL) {
2625: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2626: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2627: ctxt->wellFormed = 0;
2628: return(NULL);
2629: }
2630:
2631: /*
2632: * read the value
2633: */
2634: SKIP_BLANKS;
2635: if (CUR == '=') {
2636: NEXT;
2637: SKIP_BLANKS;
2638: val = htmlParseAttValue(ctxt);
1.42 daniel 2639: /******
1.1 daniel 2640: } else {
1.42 daniel 2641: * TODO : some attribute must have values, some may not
1.1 daniel 2642: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.31 daniel 2643: ctxt->sax->warning(ctxt->userData,
1.42 daniel 2644: "No value for attribute %s\n", name); */
1.1 daniel 2645: }
2646:
2647: *value = val;
2648: return(name);
2649: }
2650:
2651: /**
1.47 daniel 2652: * htmlCheckEncoding:
2653: * @ctxt: an HTML parser context
2654: * @attvalue: the attribute value
2655: *
2656: * Checks an http-equiv attribute from a Meta tag to detect
2657: * the encoding
2658: * If a new encoding is detected the parser is switched to decode
2659: * it and pass UTF8
2660: */
2661: void
2662: htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2663: const xmlChar *encoding;
2664:
2665: if ((ctxt == NULL) || (attvalue == NULL))
2666: return;
2667:
2668: encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
2669: if (encoding == NULL)
2670: encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
2671: if (encoding == NULL)
2672: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
2673: if (encoding != NULL) {
2674: encoding += 8;
2675: } else {
2676: encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
2677: if (encoding == NULL)
2678: encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
2679: if (encoding == NULL)
2680: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
2681: if (encoding != NULL)
2682: encoding += 9;
2683: }
2684: if (encoding != NULL) {
2685: xmlCharEncoding enc;
2686: xmlCharEncodingHandlerPtr handler;
2687:
2688: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2689:
2690: if (ctxt->input->encoding != NULL)
2691: xmlFree((xmlChar *) ctxt->input->encoding);
2692: ctxt->input->encoding = xmlStrdup(encoding);
2693:
2694: enc = xmlParseCharEncoding((const char *) encoding);
2695: /*
2696: * registered set of known encodings
2697: */
2698: if (enc != XML_CHAR_ENCODING_ERROR) {
2699: xmlSwitchEncoding(ctxt, enc);
1.53 veillard 2700: ctxt->charset = XML_CHAR_ENCODING_UTF8;
1.47 daniel 2701: } else {
2702: /*
2703: * fallback for unknown encodings
2704: */
2705: handler = xmlFindCharEncodingHandler((const char *) encoding);
2706: if (handler != NULL) {
2707: xmlSwitchToEncoding(ctxt, handler);
1.54 veillard 2708: ctxt->charset = XML_CHAR_ENCODING_UTF8;
1.47 daniel 2709: } else {
2710: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2711: }
2712: }
1.54 veillard 2713:
2714: if ((ctxt->input->buf != NULL) &&
2715: (ctxt->input->buf->encoder != NULL) &&
2716: (ctxt->input->buf->raw != NULL) &&
2717: (ctxt->input->buf->buffer != NULL)) {
2718: int nbchars;
1.56 veillard 2719: int processed;
1.54 veillard 2720:
2721: /*
2722: * convert as much as possible to the parser reading buffer.
2723: */
1.56 veillard 2724: processed = ctxt->input->cur - ctxt->input->base;
2725: xmlBufferShrink(ctxt->input->buf->buffer, processed);
1.54 veillard 2726: nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2727: ctxt->input->buf->buffer,
2728: ctxt->input->buf->raw);
2729: if (nbchars < 0) {
2730: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2731: ctxt->sax->error(ctxt->userData,
2732: "htmlCheckEncoding: encoder error\n");
2733: ctxt->errNo = XML_ERR_INVALID_ENCODING;
2734: }
1.56 veillard 2735: ctxt->input->base =
2736: ctxt->input->cur = ctxt->input->buf->buffer->content;
1.54 veillard 2737: }
1.47 daniel 2738: }
2739: }
2740:
2741: /**
2742: * htmlCheckMeta:
2743: * @ctxt: an HTML parser context
2744: * @atts: the attributes values
2745: *
2746: * Checks an attributes from a Meta tag
2747: */
2748: void
2749: htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2750: int i;
2751: const xmlChar *att, *value;
2752: int http = 0;
2753: const xmlChar *content = NULL;
2754:
2755: if ((ctxt == NULL) || (atts == NULL))
2756: return;
2757:
2758: i = 0;
2759: att = atts[i++];
2760: while (att != NULL) {
2761: value = atts[i++];
2762: if ((value != NULL) &&
2763: ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
2764: (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
2765: (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
2766: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
2767: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
2768: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
2769: http = 1;
2770: else if ((value != NULL) &&
2771: ((!xmlStrcmp(att, BAD_CAST"content")) ||
2772: (!xmlStrcmp(att, BAD_CAST"Content")) ||
2773: (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
2774: content = value;
2775: att = atts[i++];
2776: }
2777: if ((http) && (content != NULL))
2778: htmlCheckEncoding(ctxt, content);
2779:
2780: }
2781:
2782: /**
1.1 daniel 2783: * htmlParseStartTag:
2784: * @ctxt: an HTML parser context
2785: *
2786: * parse a start of tag either for rule element or
2787: * EmptyElement. In both case we don't parse the tag closing chars.
2788: *
2789: * [40] STag ::= '<' Name (S Attribute)* S? '>'
2790: *
2791: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2792: *
2793: * With namespace:
2794: *
2795: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2796: *
2797: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2798: *
2799: */
2800:
1.18 daniel 2801: void
1.1 daniel 2802: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2803: xmlChar *name;
2804: xmlChar *attname;
2805: xmlChar *attvalue;
2806: const xmlChar **atts = NULL;
1.1 daniel 2807: int nbatts = 0;
2808: int maxatts = 0;
1.47 daniel 2809: int meta = 0;
1.1 daniel 2810: int i;
2811:
1.18 daniel 2812: if (CUR != '<') return;
1.1 daniel 2813: NEXT;
2814:
1.19 daniel 2815: GROW;
1.1 daniel 2816: name = htmlParseHTMLName(ctxt);
2817: if (name == NULL) {
2818: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2819: ctxt->sax->error(ctxt->userData,
2820: "htmlParseStartTag: invalid element name\n");
2821: ctxt->wellFormed = 0;
1.18 daniel 2822: return;
1.1 daniel 2823: }
1.47 daniel 2824: if (!xmlStrcmp(name, BAD_CAST"meta"))
2825: meta = 1;
1.1 daniel 2826:
2827: /*
2828: * Check for auto-closure of HTML elements.
2829: */
2830: htmlAutoClose(ctxt, name);
1.43 daniel 2831:
2832: /*
2833: * Check for implied HTML elements.
2834: */
2835: htmlCheckImplied(ctxt, name);
1.1 daniel 2836:
2837: /*
2838: * Now parse the attributes, it ends up with the ending
2839: *
2840: * (S Attribute)* S?
2841: */
2842: SKIP_BLANKS;
2843: while ((IS_CHAR(CUR)) &&
2844: (CUR != '>') &&
2845: ((CUR != '/') || (NXT(1) != '>'))) {
1.26 daniel 2846: long cons = ctxt->nbChars;
1.1 daniel 2847:
1.19 daniel 2848: GROW;
1.1 daniel 2849: attname = htmlParseAttribute(ctxt, &attvalue);
1.31 daniel 2850: if (attname != NULL) {
1.47 daniel 2851:
1.1 daniel 2852: /*
2853: * Well formedness requires at most one declaration of an attribute
2854: */
2855: for (i = 0; i < nbatts;i += 2) {
2856: if (!xmlStrcmp(atts[i], attname)) {
2857: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.19 daniel 2858: ctxt->sax->error(ctxt->userData,
2859: "Attribute %s redefined\n",
2860: attname);
1.1 daniel 2861: ctxt->wellFormed = 0;
1.11 daniel 2862: xmlFree(attname);
1.31 daniel 2863: if (attvalue != NULL)
2864: xmlFree(attvalue);
1.19 daniel 2865: goto failed;
1.1 daniel 2866: }
2867: }
2868:
2869: /*
2870: * Add the pair to atts
2871: */
2872: if (atts == NULL) {
2873: maxatts = 10;
1.14 daniel 2874: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
1.1 daniel 2875: if (atts == NULL) {
2876: fprintf(stderr, "malloc of %ld byte failed\n",
1.14 daniel 2877: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2878: if (name != NULL) xmlFree(name);
2879: return;
1.1 daniel 2880: }
1.23 daniel 2881: } else if (nbatts + 4 > maxatts) {
1.1 daniel 2882: maxatts *= 2;
1.14 daniel 2883: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
1.1 daniel 2884: if (atts == NULL) {
2885: fprintf(stderr, "realloc of %ld byte failed\n",
1.14 daniel 2886: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2887: if (name != NULL) xmlFree(name);
2888: return;
1.1 daniel 2889: }
2890: }
2891: atts[nbatts++] = attname;
2892: atts[nbatts++] = attvalue;
2893: atts[nbatts] = NULL;
2894: atts[nbatts + 1] = NULL;
2895: }
2896:
1.19 daniel 2897: failed:
1.1 daniel 2898: SKIP_BLANKS;
1.26 daniel 2899: if (cons == ctxt->nbChars) {
1.1 daniel 2900: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2901: ctxt->sax->error(ctxt->userData,
2902: "htmlParseStartTag: problem parsing attributes\n");
2903: ctxt->wellFormed = 0;
2904: break;
2905: }
2906: }
2907:
2908: /*
1.47 daniel 2909: * Handle specific association to the META tag
2910: */
2911: if (meta)
2912: htmlCheckMeta(ctxt, atts);
2913:
2914: /*
1.1 daniel 2915: * SAX: Start of Element !
2916: */
1.15 daniel 2917: htmlnamePush(ctxt, xmlStrdup(name));
1.18 daniel 2918: #ifdef DEBUG
2919: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2920: #endif
1.1 daniel 2921: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2922: ctxt->sax->startElement(ctxt->userData, name, atts);
2923:
2924: if (atts != NULL) {
1.31 daniel 2925: for (i = 0;i < nbatts;i++) {
2926: if (atts[i] != NULL)
2927: xmlFree((xmlChar *) atts[i]);
2928: }
1.45 daniel 2929: xmlFree((void *) atts);
1.1 daniel 2930: }
1.18 daniel 2931: if (name != NULL) xmlFree(name);
1.1 daniel 2932: }
2933:
2934: /**
2935: * htmlParseEndTag:
2936: * @ctxt: an HTML parser context
2937: *
2938: * parse an end of tag
2939: *
2940: * [42] ETag ::= '</' Name S? '>'
2941: *
2942: * With namespace
2943: *
2944: * [NS 9] ETag ::= '</' QName S? '>'
2945: */
2946:
2947: void
1.18 daniel 2948: htmlParseEndTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2949: xmlChar *name;
1.15 daniel 2950: xmlChar *oldname;
1.1 daniel 2951: int i;
2952:
2953: if ((CUR != '<') || (NXT(1) != '/')) {
2954: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2955: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
2956: ctxt->wellFormed = 0;
2957: return;
2958: }
2959: SKIP(2);
2960:
2961: name = htmlParseHTMLName(ctxt);
1.24 daniel 2962: if (name == NULL) return;
1.1 daniel 2963:
2964: /*
2965: * We should definitely be at the ending "S? '>'" part
2966: */
2967: SKIP_BLANKS;
2968: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
2969: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2970: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
2971: ctxt->wellFormed = 0;
2972: } else
2973: NEXT;
2974:
2975: /*
1.18 daniel 2976: * If the name read is not one of the element in the parsing stack
2977: * then return, it's just an error.
1.1 daniel 2978: */
1.18 daniel 2979: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
2980: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
1.1 daniel 2981: }
2982: if (i < 0) {
2983: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.18 daniel 2984: ctxt->sax->error(ctxt->userData,
2985: "Unexpected end tag : %s\n", name);
1.11 daniel 2986: xmlFree(name);
1.1 daniel 2987: ctxt->wellFormed = 0;
2988: return;
2989: }
2990:
1.18 daniel 2991:
1.1 daniel 2992: /*
2993: * Check for auto-closure of HTML elements.
2994: */
1.18 daniel 2995:
1.1 daniel 2996: htmlAutoCloseOnClose(ctxt, name);
2997:
2998: /*
2999: * Well formedness constraints, opening and closing must match.
3000: * With the exception that the autoclose may have popped stuff out
3001: * of the stack.
3002: */
1.18 daniel 3003: if (xmlStrcmp(name, ctxt->name)) {
3004: #ifdef DEBUG
3005: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3006: #endif
1.15 daniel 3007: if ((ctxt->name != NULL) &&
3008: (xmlStrcmp(ctxt->name, name))) {
1.1 daniel 3009: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3010: ctxt->sax->error(ctxt->userData,
3011: "Opening and ending tag mismatch: %s and %s\n",
1.15 daniel 3012: name, ctxt->name);
1.1 daniel 3013: ctxt->wellFormed = 0;
3014: }
3015: }
3016:
3017: /*
3018: * SAX: End of Tag
3019: */
1.15 daniel 3020: oldname = ctxt->name;
1.24 daniel 3021: if ((oldname != NULL) && (!xmlStrcmp(oldname, name))) {
1.18 daniel 3022: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3023: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 3024: oldname = htmlnamePop(ctxt);
1.18 daniel 3025: if (oldname != NULL) {
3026: #ifdef DEBUG
3027: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
3028: #endif
3029: xmlFree(oldname);
3030: #ifdef DEBUG
3031: } else {
3032: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
3033: #endif
3034: }
3035: }
1.1 daniel 3036:
3037: if (name != NULL)
1.11 daniel 3038: xmlFree(name);
1.1 daniel 3039:
3040: return;
3041: }
3042:
3043:
3044: /**
3045: * htmlParseReference:
3046: * @ctxt: an HTML parser context
3047: *
3048: * parse and handle entity references in content,
3049: * this will end-up in a call to character() since this is either a
3050: * CharRef, or a predefined entity.
3051: */
3052: void
3053: htmlParseReference(htmlParserCtxtPtr ctxt) {
3054: htmlEntityDescPtr ent;
1.52 veillard 3055: xmlChar out[6];
1.14 daniel 3056: xmlChar *name;
1.1 daniel 3057: if (CUR != '&') return;
3058:
3059: if (NXT(1) == '#') {
1.52 veillard 3060: unsigned int c;
3061: int bits, i = 0;
3062:
3063: c = htmlParseCharRef(ctxt);
3064: if (c < 0x80) { out[i++]= c; bits= -6; }
3065: else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3066: else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3067: else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3068:
3069: for ( ; bits >= 0; bits-= 6) {
3070: out[i++]= ((c >> bits) & 0x3F) | 0x80;
3071: }
3072: out[i] = 0;
3073:
1.59 veillard 3074: htmlCheckParagraph(ctxt);
1.1 daniel 3075: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.52 veillard 3076: ctxt->sax->characters(ctxt->userData, out, i);
1.1 daniel 3077: } else {
3078: ent = htmlParseEntityRef(ctxt, &name);
1.32 daniel 3079: if (name == NULL) {
1.59 veillard 3080: htmlCheckParagraph(ctxt);
1.58 veillard 3081: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3082: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.32 daniel 3083: return;
3084: }
1.52 veillard 3085: if ((ent == NULL) || (ent->value <= 0)) {
1.59 veillard 3086: htmlCheckParagraph(ctxt);
1.1 daniel 3087: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 3088: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 3089: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.32 daniel 3090: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
1.1 daniel 3091: }
3092: } else {
1.52 veillard 3093: unsigned int c;
3094: int bits, i = 0;
3095:
3096: c = ent->value;
3097: if (c < 0x80)
3098: { out[i++]= c; bits= -6; }
3099: else if (c < 0x800)
3100: { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3101: else if (c < 0x10000)
3102: { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3103: else
3104: { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3105:
3106: for ( ; bits >= 0; bits-= 6) {
3107: out[i++]= ((c >> bits) & 0x3F) | 0x80;
3108: }
3109: out[i] = 0;
3110:
1.59 veillard 3111: htmlCheckParagraph(ctxt);
1.1 daniel 3112: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.52 veillard 3113: ctxt->sax->characters(ctxt->userData, out, i);
1.1 daniel 3114: }
1.11 daniel 3115: xmlFree(name);
1.1 daniel 3116: }
3117: }
3118:
3119: /**
3120: * htmlParseContent:
3121: * @ctxt: an HTML parser context
3122: * @name: the node name
3123: *
3124: * Parse a content: comment, sub-element, reference or text.
3125: *
3126: */
3127:
3128: void
1.18 daniel 3129: htmlParseContent(htmlParserCtxtPtr ctxt) {
1.15 daniel 3130: xmlChar *currentNode;
1.18 daniel 3131: int depth;
1.1 daniel 3132:
1.26 daniel 3133: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 3134: depth = ctxt->nameNr;
3135: while (1) {
1.26 daniel 3136: long cons = ctxt->nbChars;
1.1 daniel 3137:
1.18 daniel 3138: GROW;
3139: /*
3140: * Our tag or one of it's parent or children is ending.
3141: */
3142: if ((CUR == '<') && (NXT(1) == '/')) {
3143: htmlParseEndTag(ctxt);
1.26 daniel 3144: if (currentNode != NULL) xmlFree(currentNode);
1.18 daniel 3145: return;
3146: }
3147:
3148: /*
3149: * Has this node been popped out during parsing of
3150: * the next element
3151: */
1.26 daniel 3152: if ((xmlStrcmp(currentNode, ctxt->name)) &&
3153: (depth >= ctxt->nameNr)) {
3154: if (currentNode != NULL) xmlFree(currentNode);
3155: return;
3156: }
1.18 daniel 3157:
1.1 daniel 3158: /*
1.59 veillard 3159: * Sometimes DOCTYPE arrives in the middle of the document
3160: */
3161: if ((CUR == '<') && (NXT(1) == '!') &&
3162: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3163: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3164: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3165: (UPP(8) == 'E')) {
3166: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3167: ctxt->sax->error(ctxt->userData,
3168: "Misplaced DOCTYPE declaration\n");
3169: ctxt->wellFormed = 0;
3170: htmlParseDocTypeDecl(ctxt);
3171: }
3172:
3173: /*
1.1 daniel 3174: * First case : a comment
3175: */
3176: if ((CUR == '<') && (NXT(1) == '!') &&
3177: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 3178: htmlParseComment(ctxt);
1.1 daniel 3179: }
3180:
3181: /*
3182: * Second case : a sub-element.
3183: */
3184: else if (CUR == '<') {
3185: htmlParseElement(ctxt);
3186: }
3187:
3188: /*
3189: * Third case : a reference. If if has not been resolved,
3190: * parsing returns it's Name, create the node
3191: */
3192: else if (CUR == '&') {
3193: htmlParseReference(ctxt);
3194: }
3195:
3196: /*
1.47 daniel 3197: * Fourth : end of the resource
3198: */
3199: else if (CUR == 0) {
3200: htmlAutoClose(ctxt, NULL);
3201: }
3202:
3203: /*
1.1 daniel 3204: * Last case, text. Note that References are handled directly.
3205: */
3206: else {
3207: htmlParseCharData(ctxt, 0);
3208: }
3209:
1.26 daniel 3210: if (cons == ctxt->nbChars) {
1.22 daniel 3211: if (ctxt->node != NULL) {
3212: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3213: ctxt->sax->error(ctxt->userData,
3214: "detected an error in element content\n");
3215: ctxt->wellFormed = 0;
3216: }
1.1 daniel 3217: break;
3218: }
1.17 daniel 3219:
1.5 daniel 3220: GROW;
1.1 daniel 3221: }
1.26 daniel 3222: if (currentNode != NULL) xmlFree(currentNode);
1.1 daniel 3223: }
3224:
3225: /**
3226: * htmlParseElement:
3227: * @ctxt: an HTML parser context
3228: *
3229: * parse an HTML element, this is highly recursive
3230: *
3231: * [39] element ::= EmptyElemTag | STag content ETag
3232: *
3233: * [41] Attribute ::= Name Eq AttValue
3234: */
3235:
3236: void
3237: htmlParseElement(htmlParserCtxtPtr ctxt) {
1.14 daniel 3238: xmlChar *name;
1.16 daniel 3239: xmlChar *currentNode = NULL;
1.1 daniel 3240: htmlElemDescPtr info;
1.10 daniel 3241: htmlParserNodeInfo node_info;
1.31 daniel 3242: xmlChar *oldname;
1.18 daniel 3243: int depth = ctxt->nameNr;
1.1 daniel 3244:
3245: /* Capture start position */
1.10 daniel 3246: if (ctxt->record_info) {
3247: node_info.begin_pos = ctxt->input->consumed +
3248: (CUR_PTR - ctxt->input->base);
3249: node_info.begin_line = ctxt->input->line;
3250: }
1.1 daniel 3251:
1.26 daniel 3252: oldname = xmlStrdup(ctxt->name);
1.18 daniel 3253: htmlParseStartTag(ctxt);
3254: name = ctxt->name;
1.19 daniel 3255: #ifdef DEBUG
3256: if (oldname == NULL)
3257: fprintf(stderr, "Start of element %s\n", name);
3258: else if (name == NULL)
3259: fprintf(stderr, "Start of element failed, was %s\n", oldname);
3260: else
3261: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
3262: #endif
1.26 daniel 3263: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
1.18 daniel 3264: (name == NULL)) {
1.19 daniel 3265: if (CUR == '>')
3266: NEXT;
1.26 daniel 3267: if (oldname != NULL)
3268: xmlFree(oldname);
1.1 daniel 3269: return;
3270: }
1.26 daniel 3271: if (oldname != NULL)
3272: xmlFree(oldname);
1.1 daniel 3273:
3274: /*
3275: * Lookup the info for that element.
3276: */
3277: info = htmlTagLookup(name);
3278: if (info == NULL) {
3279: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3280: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3281: name);
3282: ctxt->wellFormed = 0;
3283: } else if (info->depr) {
3284: /***************************
3285: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3286: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3287: name);
3288: ***************************/
3289: }
3290:
3291: /*
3292: * Check for an Empty Element labelled the XML/SGML way
3293: */
3294: if ((CUR == '/') && (NXT(1) == '>')) {
3295: SKIP(2);
3296: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3297: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 3298: oldname = htmlnamePop(ctxt);
1.18 daniel 3299: #ifdef DEBUG
3300: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
3301: #endif
1.17 daniel 3302: if (oldname != NULL)
3303: xmlFree(oldname);
1.1 daniel 3304: return;
3305: }
3306:
1.5 daniel 3307: if (CUR == '>') {
3308: NEXT;
3309: } else {
1.1 daniel 3310: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.56 veillard 3311: ctxt->sax->error(ctxt->userData,
3312: "Couldn't find end of Start Tag %s\n",
3313: name);
1.1 daniel 3314: ctxt->wellFormed = 0;
3315:
3316: /*
3317: * end of parsing of this node.
3318: */
1.18 daniel 3319: if (!xmlStrcmp(name, ctxt->name)) {
3320: nodePop(ctxt);
1.24 daniel 3321: oldname = htmlnamePop(ctxt);
1.18 daniel 3322: #ifdef DEBUG
3323: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
3324: #endif
3325: if (oldname != NULL)
3326: xmlFree(oldname);
3327: }
1.10 daniel 3328:
3329: /*
3330: * Capture end position and add node
3331: */
3332: if ( currentNode != NULL && ctxt->record_info ) {
3333: node_info.end_pos = ctxt->input->consumed +
3334: (CUR_PTR - ctxt->input->base);
3335: node_info.end_line = ctxt->input->line;
1.15 daniel 3336: node_info.node = ctxt->node;
1.10 daniel 3337: xmlParserAddNodeInfo(ctxt, &node_info);
3338: }
1.1 daniel 3339: return;
3340: }
3341:
3342: /*
3343: * Check for an Empty Element from DTD definition
3344: */
3345: if ((info != NULL) && (info->empty)) {
3346: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3347: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 3348: oldname = htmlnamePop(ctxt);
1.18 daniel 3349: #ifdef DEBUG
3350: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3351: #endif
1.17 daniel 3352: if (oldname != NULL)
3353: xmlFree(oldname);
1.1 daniel 3354: return;
3355: }
3356:
3357: /*
3358: * Parse the content of the element:
3359: */
1.26 daniel 3360: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 3361: depth = ctxt->nameNr;
3362: while (IS_CHAR(CUR)) {
3363: htmlParseContent(ctxt);
3364: if (ctxt->nameNr < depth) break;
3365: }
1.1 daniel 3366:
3367: if (!IS_CHAR(CUR)) {
1.49 daniel 3368: /************
1.1 daniel 3369: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3370: ctxt->sax->error(ctxt->userData,
1.18 daniel 3371: "Premature end of data in tag %s\n", currentNode);
1.1 daniel 3372: ctxt->wellFormed = 0;
1.49 daniel 3373: *************/
1.1 daniel 3374:
3375: /*
3376: * end of parsing of this node.
3377: */
3378: nodePop(ctxt);
1.24 daniel 3379: oldname = htmlnamePop(ctxt);
1.18 daniel 3380: #ifdef DEBUG
3381: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
3382: #endif
1.17 daniel 3383: if (oldname != NULL)
3384: xmlFree(oldname);
1.26 daniel 3385: if (currentNode != NULL)
3386: xmlFree(currentNode);
1.1 daniel 3387: return;
3388: }
1.10 daniel 3389:
3390: /*
3391: * Capture end position and add node
3392: */
3393: if ( currentNode != NULL && ctxt->record_info ) {
3394: node_info.end_pos = ctxt->input->consumed +
3395: (CUR_PTR - ctxt->input->base);
3396: node_info.end_line = ctxt->input->line;
1.15 daniel 3397: node_info.node = ctxt->node;
1.10 daniel 3398: xmlParserAddNodeInfo(ctxt, &node_info);
3399: }
1.26 daniel 3400: if (currentNode != NULL)
3401: xmlFree(currentNode);
1.1 daniel 3402: }
3403:
3404: /**
3405: * htmlParseDocument :
3406: * @ctxt: an HTML parser context
3407: *
3408: * parse an HTML document (and build a tree if using the standard SAX
3409: * interface).
3410: *
3411: * Returns 0, -1 in case of error. the parser context is augmented
3412: * as a result of the parsing.
3413: */
3414:
3415: int
3416: htmlParseDocument(htmlParserCtxtPtr ctxt) {
1.59 veillard 3417: xmlDtdPtr dtd;
3418:
1.1 daniel 3419: htmlDefaultSAXHandlerInit();
3420: ctxt->html = 1;
3421:
1.5 daniel 3422: GROW;
1.1 daniel 3423: /*
1.9 daniel 3424: * SAX: beginning of the document processing.
1.1 daniel 3425: */
3426: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3427: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3428:
3429: /*
3430: * Wipe out everything which is before the first '<'
3431: */
1.22 daniel 3432: SKIP_BLANKS;
1.1 daniel 3433: if (CUR == 0) {
3434: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3435: ctxt->sax->error(ctxt->userData, "Document is empty\n");
3436: ctxt->wellFormed = 0;
3437: }
3438:
1.40 daniel 3439: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3440: ctxt->sax->startDocument(ctxt->userData);
3441:
3442:
1.22 daniel 3443: /*
3444: * Parse possible comments before any content
3445: */
3446: while ((CUR == '<') && (NXT(1) == '!') &&
3447: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 3448: htmlParseComment(ctxt);
1.22 daniel 3449: SKIP_BLANKS;
3450: }
3451:
1.1 daniel 3452:
3453: /*
3454: * Then possibly doc type declaration(s) and more Misc
3455: * (doctypedecl Misc*)?
3456: */
3457: if ((CUR == '<') && (NXT(1) == '!') &&
3458: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3459: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3460: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3461: (UPP(8) == 'E')) {
3462: htmlParseDocTypeDecl(ctxt);
3463: }
3464: SKIP_BLANKS;
3465:
3466: /*
1.55 veillard 3467: * Parse possible comments before any content
3468: */
3469: while ((CUR == '<') && (NXT(1) == '!') &&
3470: (NXT(2) == '-') && (NXT(3) == '-')) {
3471: htmlParseComment(ctxt);
3472: SKIP_BLANKS;
3473: }
3474:
3475: /*
1.1 daniel 3476: * Time to start parsing the tree itself
3477: */
1.22 daniel 3478: htmlParseContent(ctxt);
1.1 daniel 3479:
3480: /*
1.47 daniel 3481: * autoclose
3482: */
3483: if (CUR == 0)
3484: htmlAutoClose(ctxt, NULL);
3485:
3486:
3487: /*
1.1 daniel 3488: * SAX: end of the document processing.
3489: */
3490: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3491: ctxt->sax->endDocument(ctxt->userData);
1.59 veillard 3492:
3493: if (ctxt->myDoc != NULL) {
3494: dtd = xmlGetIntSubset(ctxt->myDoc);
3495: if (dtd == NULL)
3496: ctxt->myDoc->intSubset =
3497: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3498: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3499: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3500: }
1.1 daniel 3501: if (! ctxt->wellFormed) return(-1);
3502: return(0);
3503: }
3504:
3505:
1.30 daniel 3506: /************************************************************************
3507: * *
3508: * Parser contexts handling *
3509: * *
3510: ************************************************************************/
1.1 daniel 3511:
3512: /**
3513: * xmlInitParserCtxt:
3514: * @ctxt: an HTML parser context
3515: *
3516: * Initialize a parser context
3517: */
3518:
3519: void
3520: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3521: {
3522: htmlSAXHandler *sax;
3523:
1.21 daniel 3524: if (ctxt == NULL) return;
3525: memset(ctxt, 0, sizeof(htmlParserCtxt));
3526:
1.11 daniel 3527: sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
1.1 daniel 3528: if (sax == NULL) {
3529: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3530: }
1.19 daniel 3531: memset(sax, 0, sizeof(htmlSAXHandler));
1.1 daniel 3532:
3533: /* Allocate the Input stack */
1.19 daniel 3534: ctxt->inputTab = (htmlParserInputPtr *)
3535: xmlMalloc(5 * sizeof(htmlParserInputPtr));
3536: if (ctxt->inputTab == NULL) {
3537: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
1.65 ! veillard 3538: ctxt->inputNr = 0;
! 3539: ctxt->inputMax = 0;
! 3540: ctxt->input = NULL;
! 3541: return;
1.19 daniel 3542: }
1.1 daniel 3543: ctxt->inputNr = 0;
3544: ctxt->inputMax = 5;
3545: ctxt->input = NULL;
3546: ctxt->version = NULL;
3547: ctxt->encoding = NULL;
3548: ctxt->standalone = -1;
1.30 daniel 3549: ctxt->instate = XML_PARSER_START;
1.1 daniel 3550:
3551: /* Allocate the Node stack */
1.11 daniel 3552: ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
1.65 ! veillard 3553: if (ctxt->nodeTab == NULL) {
! 3554: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
! 3555: ctxt->nodeNr = 0;
! 3556: ctxt->nodeMax = 0;
! 3557: ctxt->node = NULL;
! 3558: ctxt->inputNr = 0;
! 3559: ctxt->inputMax = 0;
! 3560: ctxt->input = NULL;
! 3561: return;
! 3562: }
1.1 daniel 3563: ctxt->nodeNr = 0;
3564: ctxt->nodeMax = 10;
3565: ctxt->node = NULL;
3566:
1.15 daniel 3567: /* Allocate the Name stack */
3568: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
1.65 ! veillard 3569: if (ctxt->nameTab == NULL) {
! 3570: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
! 3571: ctxt->nameNr = 0;
! 3572: ctxt->nameMax = 10;
! 3573: ctxt->name = NULL;
! 3574: ctxt->nodeNr = 0;
! 3575: ctxt->nodeMax = 0;
! 3576: ctxt->node = NULL;
! 3577: ctxt->inputNr = 0;
! 3578: ctxt->inputMax = 0;
! 3579: ctxt->input = NULL;
! 3580: return;
! 3581: }
1.15 daniel 3582: ctxt->nameNr = 0;
3583: ctxt->nameMax = 10;
3584: ctxt->name = NULL;
3585:
1.1 daniel 3586: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3587: else {
3588: ctxt->sax = sax;
3589: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3590: }
3591: ctxt->userData = ctxt;
3592: ctxt->myDoc = NULL;
3593: ctxt->wellFormed = 1;
3594: ctxt->replaceEntities = 0;
3595: ctxt->html = 1;
3596: ctxt->record_info = 0;
1.21 daniel 3597: ctxt->validate = 0;
1.26 daniel 3598: ctxt->nbChars = 0;
1.30 daniel 3599: ctxt->checkIndex = 0;
1.1 daniel 3600: xmlInitNodeInfoSeq(&ctxt->node_seq);
3601: }
3602:
3603: /**
3604: * htmlFreeParserCtxt:
3605: * @ctxt: an HTML parser context
3606: *
3607: * Free all the memory used by a parser context. However the parsed
3608: * document in ctxt->myDoc is not freed.
3609: */
3610:
3611: void
3612: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3613: {
1.47 daniel 3614: xmlFreeParserCtxt(ctxt);
1.1 daniel 3615: }
3616:
3617: /**
3618: * htmlCreateDocParserCtxt :
1.14 daniel 3619: * @cur: a pointer to an array of xmlChar
1.1 daniel 3620: * @encoding: a free form C string describing the HTML document encoding, or NULL
3621: *
3622: * Create a parser context for an HTML document.
3623: *
3624: * Returns the new parser context or NULL
3625: */
3626: htmlParserCtxtPtr
1.14 daniel 3627: htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
1.1 daniel 3628: htmlParserCtxtPtr ctxt;
3629: htmlParserInputPtr input;
3630: /* htmlCharEncoding enc; */
3631:
1.11 daniel 3632: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3633: if (ctxt == NULL) {
3634: perror("malloc");
3635: return(NULL);
3636: }
3637: htmlInitParserCtxt(ctxt);
1.11 daniel 3638: input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3639: if (input == NULL) {
3640: perror("malloc");
1.11 daniel 3641: xmlFree(ctxt);
1.1 daniel 3642: return(NULL);
3643: }
1.19 daniel 3644: memset(input, 0, sizeof(htmlParserInput));
1.1 daniel 3645:
3646: input->line = 1;
3647: input->col = 1;
3648: input->base = cur;
3649: input->cur = cur;
3650:
3651: inputPush(ctxt, input);
3652: return(ctxt);
3653: }
3654:
1.31 daniel 3655: /************************************************************************
3656: * *
3657: * Progressive parsing interfaces *
3658: * *
3659: ************************************************************************/
3660:
3661: /**
3662: * htmlParseLookupSequence:
3663: * @ctxt: an HTML parser context
3664: * @first: the first char to lookup
3665: * @next: the next char to lookup or zero
3666: * @third: the next char to lookup or zero
3667: *
3668: * Try to find if a sequence (first, next, third) or just (first next) or
3669: * (first) is available in the input stream.
3670: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3671: * to avoid rescanning sequences of bytes, it DOES change the state of the
3672: * parser, do not use liberally.
3673: * This is basically similar to xmlParseLookupSequence()
3674: *
3675: * Returns the index to the current parsing point if the full sequence
3676: * is available, -1 otherwise.
3677: */
3678: int
3679: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3680: xmlChar next, xmlChar third) {
3681: int base, len;
3682: htmlParserInputPtr in;
3683: const xmlChar *buf;
3684:
3685: in = ctxt->input;
3686: if (in == NULL) return(-1);
3687: base = in->cur - in->base;
3688: if (base < 0) return(-1);
3689: if (ctxt->checkIndex > base)
3690: base = ctxt->checkIndex;
3691: if (in->buf == NULL) {
3692: buf = in->base;
3693: len = in->length;
3694: } else {
3695: buf = in->buf->buffer->content;
3696: len = in->buf->buffer->use;
3697: }
3698: /* take into account the sequence length */
3699: if (third) len -= 2;
3700: else if (next) len --;
3701: for (;base < len;base++) {
3702: if (buf[base] == first) {
3703: if (third != 0) {
3704: if ((buf[base + 1] != next) ||
3705: (buf[base + 2] != third)) continue;
3706: } else if (next != 0) {
3707: if (buf[base + 1] != next) continue;
3708: }
3709: ctxt->checkIndex = 0;
3710: #ifdef DEBUG_PUSH
3711: if (next == 0)
3712: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3713: first, base);
3714: else if (third == 0)
3715: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3716: first, next, base);
3717: else
3718: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3719: first, next, third, base);
3720: #endif
3721: return(base - (in->cur - in->base));
3722: }
3723: }
3724: ctxt->checkIndex = base;
3725: #ifdef DEBUG_PUSH
3726: if (next == 0)
3727: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3728: else if (third == 0)
3729: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3730: else
3731: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3732: #endif
3733: return(-1);
3734: }
3735:
3736: /**
1.32 daniel 3737: * htmlParseTryOrFinish:
1.31 daniel 3738: * @ctxt: an HTML parser context
1.32 daniel 3739: * @terminate: last chunk indicator
1.31 daniel 3740: *
3741: * Try to progress on parsing
 *
 * Push-mode driver: loops over the state stored in ctxt->instate and, for
 * each state, either parses one construct from the current input or jumps
 * to the exit when not enough data is available yet. When @terminate is
 * zero, most constructs are only parsed once htmlParseLookupSequence()
 * has found their closing delimiter in the data received so far.
3742: *
3743: * Returns zero if no parsing was possible
 *
 * NOTE(review): the local `ret` is initialised to 0 and never assigned in
 * this function, so the value returned is always 0.
3744: */
3745: int
1.32 daniel 3746: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
1.31 daniel 3747: int ret = 0;
3748: htmlParserInputPtr in;
1.47 daniel 3749: int avail = 0;
1.31 daniel 3750: xmlChar cur, next;
3751:
/* Debug trace of the state found on entry. */
3752: #ifdef DEBUG_PUSH
3753: switch (ctxt->instate) {
3754: case XML_PARSER_EOF:
3755: fprintf(stderr, "HPP: try EOF\n"); break;
3756: case XML_PARSER_START:
3757: fprintf(stderr, "HPP: try START\n"); break;
3758: case XML_PARSER_MISC:
3759: fprintf(stderr, "HPP: try MISC\n");break;
3760: case XML_PARSER_COMMENT:
3761: fprintf(stderr, "HPP: try COMMENT\n");break;
3762: case XML_PARSER_PROLOG:
3763: fprintf(stderr, "HPP: try PROLOG\n");break;
3764: case XML_PARSER_START_TAG:
3765: fprintf(stderr, "HPP: try START_TAG\n");break;
3766: case XML_PARSER_CONTENT:
3767: fprintf(stderr, "HPP: try CONTENT\n");break;
3768: case XML_PARSER_CDATA_SECTION:
3769: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3770: case XML_PARSER_END_TAG:
3771: fprintf(stderr, "HPP: try END_TAG\n");break;
3772: case XML_PARSER_ENTITY_DECL:
3773: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3774: case XML_PARSER_ENTITY_VALUE:
3775: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3776: case XML_PARSER_ATTRIBUTE_VALUE:
3777: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3778: case XML_PARSER_DTD:
3779: fprintf(stderr, "HPP: try DTD\n");break;
3780: case XML_PARSER_EPILOG:
3781: fprintf(stderr, "HPP: try EPILOG\n");break;
3782: case XML_PARSER_PI:
3783: fprintf(stderr, "HPP: try PI\n");break;
3784: }
3785: #endif
3786:
/*
 * State machine loop: every pass re-reads ctxt->input, recomputes the
 * number of unread bytes and dispatches on ctxt->instate. A `break` out
 * of the switch starts a new pass; `goto done` leaves the loop.
 */
3787: while (1) {
3788:
3789: in = ctxt->input;
3790: if (in == NULL) break;
/*
 * Unread bytes: total size (in->length without an input buffer,
 * buffer->use with one) minus the offset of the current position.
 */
3791: if (in->buf == NULL)
3792: avail = in->length - (in->cur - in->base);
3793: else
3794: avail = in->buf->buffer->use - (in->cur - in->base);
/*
 * Last chunk and nothing left to read: call htmlAutoClose() and, if the
 * name stack is then empty and EOF was not reached yet, switch to EOF
 * and fire the endDocument callback.
 */
1.47 daniel 3795: if ((avail == 0) && (terminate)) {
3796: htmlAutoClose(ctxt, NULL);
1.54 veillard 3797: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3798: /*
3799: * SAX: end of the document processing.
3800: */
1.47 daniel 3801: ctxt->instate = XML_PARSER_EOF;
1.54 veillard 3802: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3803: ctxt->sax->endDocument(ctxt->userData);
3804: }
1.47 daniel 3805: }
1.31 daniel 3806: if (avail < 1)
3807: goto done;
3808: switch (ctxt->instate) {
3809: case XML_PARSER_EOF:
3810: /*
3811: * Document parsing is done !
3812: */
3813: goto done;
3814: case XML_PARSER_START:
3815: /*
3816: * Very first chars read from the document flow.
3817: */
3818: cur = in->cur[0];
3819: if (IS_BLANK(cur)) {
3820: SKIP_BLANKS;
/* Blanks were skipped, so the unread count has to be refreshed. */
3821: if (in->buf == NULL)
3822: avail = in->length - (in->cur - in->base);
3823: else
3824: avail = in->buf->buffer->use - (in->cur - in->base);
3825: }
3826: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3827: ctxt->sax->setDocumentLocator(ctxt->userData,
3828: &xmlDefaultSAXLocator);
1.46 daniel 3829: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3830: (!ctxt->disableSAX))
3831: ctxt->sax->startDocument(ctxt->userData);
3832:
/*
 * NOTE(review): in->cur[1] and UPP(2)..UPP(8) are read below without any
 * check of `avail` beyond the `avail < 1` test at the top of the loop;
 * presumably this relies on the buffer being NUL-terminated - confirm.
 */
1.31 daniel 3833: cur = in->cur[0];
3834: next = in->cur[1];
3835: if ((cur == '<') && (next == '!') &&
3836: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3837: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3838: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3839: (UPP(8) == 'E')) {
/* Unless terminating, wait until the closing '>' has been received. */
1.32 daniel 3840: if ((!terminate) &&
3841: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3842: goto done;
3843: #ifdef DEBUG_PUSH
3844: fprintf(stderr, "HPP: Parsing internal subset\n");
3845: #endif
3846: htmlParseDocTypeDecl(ctxt);
3847: ctxt->instate = XML_PARSER_PROLOG;
3848: #ifdef DEBUG_PUSH
3849: fprintf(stderr, "HPP: entering PROLOG\n");
3850: #endif
3851: } else {
3852: ctxt->instate = XML_PARSER_MISC;
3853: }
/*
 * NOTE(review): this trace is emitted on both branches, i.e. also when
 * the state just set is PROLOG rather than MISC.
 */
3854: #ifdef DEBUG_PUSH
3855: fprintf(stderr, "HPP: entering MISC\n");
3856: #endif
3857: break;
/* MISC: before any DOCTYPE; accepts comments and a DOCTYPE declaration. */
3858: case XML_PARSER_MISC:
3859: SKIP_BLANKS;
3860: if (in->buf == NULL)
3861: avail = in->length - (in->cur - in->base);
3862: else
3863: avail = in->buf->buffer->use - (in->cur - in->base);
3864: if (avail < 2)
3865: goto done;
3866: cur = in->cur[0];
3867: next = in->cur[1];
3868: if ((cur == '<') && (next == '!') &&
3869: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3870: if ((!terminate) &&
3871: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3872: goto done;
3873: #ifdef DEBUG_PUSH
3874: fprintf(stderr, "HPP: Parsing Comment\n");
3875: #endif
3876: htmlParseComment(ctxt);
3877: ctxt->instate = XML_PARSER_MISC;
3878: } else if ((cur == '<') && (next == '!') &&
3879: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3880: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3881: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3882: (UPP(8) == 'E')) {
1.32 daniel 3883: if ((!terminate) &&
3884: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3885: goto done;
3886: #ifdef DEBUG_PUSH
3887: fprintf(stderr, "HPP: Parsing internal subset\n");
3888: #endif
3889: htmlParseDocTypeDecl(ctxt);
3890: ctxt->instate = XML_PARSER_PROLOG;
3891: #ifdef DEBUG_PUSH
3892: fprintf(stderr, "HPP: entering PROLOG\n");
3893: #endif
/*
 * "<!" seen but fewer than 9 bytes (the length of "<!DOCTYPE") are
 * available: wait for more data before deciding.
 */
3894: } else if ((cur == '<') && (next == '!') &&
3895: (avail < 9)) {
3896: goto done;
3897: } else {
3898: ctxt->instate = XML_PARSER_START_TAG;
3899: #ifdef DEBUG_PUSH
3900: fprintf(stderr, "HPP: entering START_TAG\n");
3901: #endif
3902: }
3903: break;
/* PROLOG: after the DOCTYPE; only comments are handled before the first tag. */
3904: case XML_PARSER_PROLOG:
3905: SKIP_BLANKS;
3906: if (in->buf == NULL)
3907: avail = in->length - (in->cur - in->base);
3908: else
3909: avail = in->buf->buffer->use - (in->cur - in->base);
3910: if (avail < 2)
3911: goto done;
3912: cur = in->cur[0];
3913: next = in->cur[1];
3914: if ((cur == '<') && (next == '!') &&
3915: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3916: if ((!terminate) &&
3917: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3918: goto done;
3919: #ifdef DEBUG_PUSH
3920: fprintf(stderr, "HPP: Parsing Comment\n");
3921: #endif
3922: htmlParseComment(ctxt);
3923: ctxt->instate = XML_PARSER_PROLOG;
/* "<!" seen but fewer than 4 bytes (the length of "<!--"): wait for more. */
3924: } else if ((cur == '<') && (next == '!') &&
3925: (avail < 4)) {
3926: goto done;
3927: } else {
3928: ctxt->instate = XML_PARSER_START_TAG;
3929: #ifdef DEBUG_PUSH
3930: fprintf(stderr, "HPP: entering START_TAG\n");
3931: #endif
3932: }
3933: break;
/*
 * EPILOG: entered from END_TAG once the name stack is empty. Blanks and
 * comments are consumed; anything else is reported as extra content and
 * ends the document.
 */
3934: case XML_PARSER_EPILOG:
3935: if (in->buf == NULL)
3936: avail = in->length - (in->cur - in->base);
3937: else
3938: avail = in->buf->buffer->use - (in->cur - in->base);
1.55 veillard 3939: if (avail < 1)
3940: goto done;
3941: cur = in->cur[0];
3942: if (IS_BLANK(cur)) {
3943: htmlParseCharData(ctxt, 0);
3944: goto done;
3945: }
1.31 daniel 3946: if (avail < 2)
3947: goto done;
3948: next = in->cur[1];
3949: if ((cur == '<') && (next == '!') &&
3950: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3951: if ((!terminate) &&
3952: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3953: goto done;
3954: #ifdef DEBUG_PUSH
3955: fprintf(stderr, "HPP: Parsing Comment\n");
3956: #endif
3957: htmlParseComment(ctxt);
3958: ctxt->instate = XML_PARSER_EPILOG;
3959: } else if ((cur == '<') && (next == '!') &&
3960: (avail < 4)) {
3961: goto done;
3962: } else {
3963: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3964: ctxt->sax->error(ctxt->userData,
3965: "Extra content at the end of the document\n");
3966: ctxt->wellFormed = 0;
3967: ctxt->errNo = XML_ERR_DOCUMENT_END;
3968: ctxt->instate = XML_PARSER_EOF;
3969: #ifdef DEBUG_PUSH
3970: fprintf(stderr, "HPP: entering EOF\n");
3971: #endif
3972: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3973: ctxt->sax->endDocument(ctxt->userData);
3974: goto done;
3975: }
3976: break;
3977: case XML_PARSER_START_TAG: {
3978: xmlChar *name, *oldname;
/* Name stack depth before the tag is parsed, compared again below. */
3979: int depth = ctxt->nameNr;
3980: htmlElemDescPtr info;
3981:
3982: if (avail < 2)
3983: goto done;
3984: cur = in->cur[0];
/* Not a tag after all: hand the data over to the CONTENT state. */
3985: if (cur != '<') {
3986: ctxt->instate = XML_PARSER_CONTENT;
3987: #ifdef DEBUG_PUSH
3988: fprintf(stderr, "HPP: entering CONTENT\n");
3989: #endif
3990: break;
3991: }
1.32 daniel 3992: if ((!terminate) &&
3993: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3994: goto done;
3995:
/* Private copy of the current element name, released on every path below. */
3996: oldname = xmlStrdup(ctxt->name);
3997: htmlParseStartTag(ctxt);
3998: name = ctxt->name;
3999: #ifdef DEBUG
4000: if (oldname == NULL)
4001: fprintf(stderr, "Start of element %s\n", name);
4002: else if (name == NULL)
4003: fprintf(stderr, "Start of element failed, was %s\n",
4004: oldname);
4005: else
4006: fprintf(stderr, "Start of element %s, was %s\n",
4007: name, oldname);
4008: #endif
/*
 * Same stack depth and same top name as before the call, or no current
 * name at all: no new element was pushed. Skip a pending '>' and start
 * a new pass of the loop.
 */
4009: if (((depth == ctxt->nameNr) &&
4010: (!xmlStrcmp(oldname, ctxt->name))) ||
4011: (name == NULL)) {
4012: if (CUR == '>')
4013: NEXT;
4014: if (oldname != NULL)
4015: xmlFree(oldname);
4016: break;
4017: }
4018: if (oldname != NULL)
4019: xmlFree(oldname);
4020:
4021: /*
4022: * Lookup the info for that element.
4023: */
4024: info = htmlTagLookup(name);
4025: if (info == NULL) {
4026: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4027: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4028: name);
4029: ctxt->wellFormed = 0;
4030: } else if (info->depr) {
4031: /***************************
4032: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4033: ctxt->sax->warning(ctxt->userData,
4034: "Tag %s is deprecated\n",
4035: name);
4036: ***************************/
4037: }
4038:
4039: /*
4040: * Check for an Empty Element labelled the XML/SGML way
4041: */
4042: if ((CUR == '/') && (NXT(1) == '>')) {
4043: SKIP(2);
4044: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4045: ctxt->sax->endElement(ctxt->userData, name);
4046: oldname = htmlnamePop(ctxt);
4047: #ifdef DEBUG
4048: fprintf(stderr,"End of tag the XML way: popping out %s\n",
4049: oldname);
4050: #endif
4051: if (oldname != NULL)
4052: xmlFree(oldname);
4053: ctxt->instate = XML_PARSER_CONTENT;
4054: #ifdef DEBUG_PUSH
4055: fprintf(stderr, "HPP: entering CONTENT\n");
4056: #endif
4057: break;
4058: }
4059:
4060: if (CUR == '>') {
4061: NEXT;
4062: } else {
4063: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4064: ctxt->sax->error(ctxt->userData,
4065: "Couldn't find end of Start Tag %s\n",
4066: name);
4067: ctxt->wellFormed = 0;
4068:
4069: /*
4070: * end of parsing of this node.
4071: */
4072: if (!xmlStrcmp(name, ctxt->name)) {
4073: nodePop(ctxt);
4074: oldname = htmlnamePop(ctxt);
4075: #ifdef DEBUG
4076: fprintf(stderr,
4077: "End of start tag problem: popping out %s\n", oldname);
4078: #endif
4079: if (oldname != NULL)
4080: xmlFree(oldname);
4081: }
4082:
4083: ctxt->instate = XML_PARSER_CONTENT;
4084: #ifdef DEBUG_PUSH
4085: fprintf(stderr, "HPP: entering CONTENT\n");
4086: #endif
4087: break;
4088: }
4089:
4090: /*
4091: * Check for an Empty Element from DTD definition
4092: */
4093: if ((info != NULL) && (info->empty)) {
4094: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4095: ctxt->sax->endElement(ctxt->userData, name);
4096: oldname = htmlnamePop(ctxt);
4097: #ifdef DEBUG
4098: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4099: #endif
4100: if (oldname != NULL)
4101: xmlFree(oldname);
4102: }
4103: ctxt->instate = XML_PARSER_CONTENT;
4104: #ifdef DEBUG_PUSH
4105: fprintf(stderr, "HPP: entering CONTENT\n");
4106: #endif
4107: break;
4108: }
1.56 veillard 4109: case XML_PARSER_CONTENT: {
/* Value of ctxt->nbChars before parsing, compared again at the end of the case. */
4110: long cons;
1.31 daniel 4111: /*
4112: * Handle preparsed entities and charRef
4113: */
4114: if (ctxt->token != 0) {
1.47 daniel 4115: xmlChar chr[2] = { 0 , 0 } ;
1.31 daniel 4116:
1.47 daniel 4117: chr[0] = (xmlChar) ctxt->token;
1.59 veillard 4118: htmlCheckParagraph(ctxt);
1.31 daniel 4119: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.47 daniel 4120: ctxt->sax->characters(ctxt->userData, chr, 1);
1.31 daniel 4121: ctxt->token = 0;
4122: ctxt->checkIndex = 0;
4123: }
/*
 * Exactly one byte left on the last chunk: unless it is '<' or '&',
 * deliver it directly (ignorableWhitespace for a blank, characters
 * otherwise) and consume it.
 */
1.47 daniel 4124: if ((avail == 1) && (terminate)) {
4125: cur = in->cur[0];
4126: if ((cur != '<') && (cur != '&')) {
1.48 daniel 4127: if (ctxt->sax != NULL) {
4128: if (IS_BLANK(cur)) {
4129: if (ctxt->sax->ignorableWhitespace != NULL)
4130: ctxt->sax->ignorableWhitespace(
4131: ctxt->userData, &cur, 1);
4132: } else {
1.59 veillard 4133: htmlCheckParagraph(ctxt);
1.48 daniel 4134: if (ctxt->sax->characters != NULL)
4135: ctxt->sax->characters(
4136: ctxt->userData, &cur, 1);
4137: }
4138: }
1.47 daniel 4139: ctxt->token = 0;
4140: ctxt->checkIndex = 0;
4141: NEXT;
4142: }
4143: break;
4144: }
1.31 daniel 4145: if (avail < 2)
4146: goto done;
4147: cur = in->cur[0];
4148: next = in->cur[1];
1.56 veillard 4149: cons = ctxt->nbChars;
1.59 veillard 4150: /*
4151: * Sometimes DOCTYPE arrives in the middle of the document
4152: */
4153: if ((cur == '<') && (next == '!') &&
4154: (UPP(2) == 'D') && (UPP(3) == 'O') &&
4155: (UPP(4) == 'C') && (UPP(5) == 'T') &&
4156: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4157: (UPP(8) == 'E')) {
4158: if ((!terminate) &&
4159: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4160: goto done;
4161: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4162: ctxt->sax->error(ctxt->userData,
4163: "Misplaced DOCTYPE declaration\n");
4164: ctxt->wellFormed = 0;
4165: htmlParseDocTypeDecl(ctxt);
4166: } else if ((cur == '<') && (next == '!') &&
1.31 daniel 4167: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 4168: if ((!terminate) &&
4169: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 4170: goto done;
4171: #ifdef DEBUG_PUSH
4172: fprintf(stderr, "HPP: Parsing Comment\n");
4173: #endif
4174: htmlParseComment(ctxt);
4175: ctxt->instate = XML_PARSER_CONTENT;
4176: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4177: goto done;
/* Tags are not parsed here: switch state and let the next pass handle them. */
4178: } else if ((cur == '<') && (next == '/')) {
4179: ctxt->instate = XML_PARSER_END_TAG;
4180: ctxt->checkIndex = 0;
4181: #ifdef DEBUG_PUSH
4182: fprintf(stderr, "HPP: entering END_TAG\n");
4183: #endif
4184: break;
4185: } else if (cur == '<') {
4186: ctxt->instate = XML_PARSER_START_TAG;
4187: ctxt->checkIndex = 0;
4188: #ifdef DEBUG_PUSH
4189: fprintf(stderr, "HPP: entering START_TAG\n");
4190: #endif
4191: break;
4192: } else if (cur == '&') {
1.32 daniel 4193: if ((!terminate) &&
4194: (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
1.31 daniel 4195: goto done;
4196: #ifdef DEBUG_PUSH
4197: fprintf(stderr, "HPP: Parsing Reference\n");
4198: #endif
4199: /* TODO: check generation of subtrees if noent !!! */
4200: htmlParseReference(ctxt);
4201: } else {
4202: /* TODO Avoid the extra copy, handle directly !!!!!! */
4203: /*
4204: * Goal of the following test is :
4205: * - minimize calls to the SAX 'character' callback
4206: * when they are mergeable
4207: */
/*
 * With a single input and less than HTML_PARSER_BIG_BUFFER_SIZE bytes
 * pending, wait until a '<' has arrived (unless terminating).
 */
4208: if ((ctxt->inputNr == 1) &&
4209: (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
1.32 daniel 4210: if ((!terminate) &&
4211: (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
1.31 daniel 4212: goto done;
4213: }
4214: ctxt->checkIndex = 0;
4215: #ifdef DEBUG_PUSH
4216: fprintf(stderr, "HPP: Parsing char data\n");
4217: #endif
4218: htmlParseCharData(ctxt, 0);
4219: }
/*
 * ctxt->nbChars did not change during this pass: when ctxt->node is set,
 * report an error and step over one character - presumably so the loop
 * cannot stall on the same input; confirm.
 */
1.56 veillard 4220: if (cons == ctxt->nbChars) {
4221: if (ctxt->node != NULL) {
4222: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4223: ctxt->sax->error(ctxt->userData,
4224: "detected an error in element content\n");
4225: ctxt->wellFormed = 0;
4226: NEXT;
4227: }
4228: break;
4229: }
4230:
1.31 daniel 4231: break;
1.56 veillard 4232: }
1.31 daniel 4233: case XML_PARSER_END_TAG:
4234: if (avail < 2)
4235: goto done;
1.32 daniel 4236: if ((!terminate) &&
4237: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 4238: goto done;
4239: htmlParseEndTag(ctxt);
/* Empty name stack after the end tag: only epilog material may follow. */
4240: if (ctxt->nameNr == 0) {
4241: ctxt->instate = XML_PARSER_EPILOG;
4242: } else {
4243: ctxt->instate = XML_PARSER_CONTENT;
4244: }
4245: ctxt->checkIndex = 0;
/* NOTE(review): the trace says CONTENT even when EPILOG was just entered. */
4246: #ifdef DEBUG_PUSH
4247: fprintf(stderr, "HPP: entering CONTENT\n");
4248: #endif
4249: break;
/*
 * This function itself never sets any of the states handled below. If
 * one of them is found, an internal error is printed on stderr and the
 * parser falls back to CONTENT (START_TAG for ATTRIBUTE_VALUE).
 */
4250: case XML_PARSER_CDATA_SECTION:
4251: fprintf(stderr, "HPP: internal error, state == CDATA\n");
4252: ctxt->instate = XML_PARSER_CONTENT;
4253: ctxt->checkIndex = 0;
4254: #ifdef DEBUG_PUSH
4255: fprintf(stderr, "HPP: entering CONTENT\n");
4256: #endif
4257: break;
4258: case XML_PARSER_DTD:
4259: fprintf(stderr, "HPP: internal error, state == DTD\n");
4260: ctxt->instate = XML_PARSER_CONTENT;
4261: ctxt->checkIndex = 0;
4262: #ifdef DEBUG_PUSH
4263: fprintf(stderr, "HPP: entering CONTENT\n");
4264: #endif
4265: break;
4266: case XML_PARSER_COMMENT:
4267: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
4268: ctxt->instate = XML_PARSER_CONTENT;
4269: ctxt->checkIndex = 0;
4270: #ifdef DEBUG_PUSH
4271: fprintf(stderr, "HPP: entering CONTENT\n");
4272: #endif
4273: break;
4274: case XML_PARSER_PI:
4275: fprintf(stderr, "HPP: internal error, state == PI\n");
4276: ctxt->instate = XML_PARSER_CONTENT;
4277: ctxt->checkIndex = 0;
4278: #ifdef DEBUG_PUSH
4279: fprintf(stderr, "HPP: entering CONTENT\n");
4280: #endif
4281: break;
4282: case XML_PARSER_ENTITY_DECL:
4283: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
4284: ctxt->instate = XML_PARSER_CONTENT;
4285: ctxt->checkIndex = 0;
4286: #ifdef DEBUG_PUSH
4287: fprintf(stderr, "HPP: entering CONTENT\n");
4288: #endif
4289: break;
4290: case XML_PARSER_ENTITY_VALUE:
4291: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
4292: ctxt->instate = XML_PARSER_CONTENT;
4293: ctxt->checkIndex = 0;
/* NOTE(review): the trace says DTD although the state set above is CONTENT. */
4294: #ifdef DEBUG_PUSH
4295: fprintf(stderr, "HPP: entering DTD\n");
4296: #endif
4297: break;
4298: case XML_PARSER_ATTRIBUTE_VALUE:
4299: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4300: ctxt->instate = XML_PARSER_START_TAG;
4301: ctxt->checkIndex = 0;
4302: #ifdef DEBUG_PUSH
4303: fprintf(stderr, "HPP: entering START_TAG\n");
1.53 veillard 4304: #endif
4305: break;
4306: case XML_PARSER_SYSTEM_LITERAL:
4307: fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4308: ctxt->instate = XML_PARSER_CONTENT;
4309: ctxt->checkIndex = 0;
4310: #ifdef DEBUG_PUSH
4311: fprintf(stderr, "HPP: entering CONTENT\n");
1.31 daniel 4312: #endif
4313: break;
4314: }
4315: }
4316: done:
/*
 * Same end-of-input handling as at the top of the loop, applied to the
 * value of `avail` computed last before leaving it.
 */
1.47 daniel 4317: if ((avail == 0) && (terminate)) {
4318: htmlAutoClose(ctxt, NULL);
1.54 veillard 4319: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4320: /*
4321: * SAX: end of the document processing.
4322: */
1.47 daniel 4323: ctxt->instate = XML_PARSER_EOF;
1.54 veillard 4324: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4325: ctxt->sax->endDocument(ctxt->userData);
4326: }
1.59 veillard 4327: }
/*
 * A document exists and parsing is ending (terminate, EOF or EPILOG):
 * if it has no internal subset yet, create the HTML 4.0 Transitional one.
 */
4328: if ((ctxt->myDoc != NULL) &&
4329: ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4330: (ctxt->instate == XML_PARSER_EPILOG))) {
4331: xmlDtdPtr dtd;
4332: dtd = xmlGetIntSubset(ctxt->myDoc);
4333: if (dtd == NULL)
4334: ctxt->myDoc->intSubset =
4335: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4336: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4337: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1.47 daniel 4338: }
1.31 daniel 4339: #ifdef DEBUG_PUSH
4340: fprintf(stderr, "HPP: done %d\n", ret);
4341: #endif
4342: return(ret);
4343: }
4344:
4345: /**
1.32 daniel 4346: * htmlParseTry:
4347: * @ctxt: an HTML parser context
4348: *
4349: * Try to progress on parsing
4350: *
4351: * Returns zero if no parsing was possible
4352: */
4353: int
4354: htmlParseTry(htmlParserCtxtPtr ctxt) {
4355: return(htmlParseTryOrFinish(ctxt, 0));
4356: }
4357:
4358: /**
1.31 daniel 4359: * htmlParseChunk:
4360: * @ctxt: an XML parser context
4361: * @chunk: an char array
4362: * @size: the size in byte of the chunk
4363: * @terminate: last chunk indicator
4364: *
4365: * Parse a Chunk of memory
4366: *
4367: * Returns zero if no error, the xmlParserErrors otherwise.
4368: */
4369: int
4370: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4371: int terminate) {
4372: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4373: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4374: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4375: int cur = ctxt->input->cur - ctxt->input->base;
4376:
4377: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4378: ctxt->input->base = ctxt->input->buf->buffer->content + base;
4379: ctxt->input->cur = ctxt->input->base + cur;
4380: #ifdef DEBUG_PUSH
4381: fprintf(stderr, "HPP: pushed %d\n", size);
4382: #endif
4383:
1.34 daniel 4384: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4385: htmlParseTryOrFinish(ctxt, terminate);
1.60 veillard 4386: } else if (ctxt->instate != XML_PARSER_EOF) {
4387: xmlParserInputBufferPush(ctxt->input->buf, 0, "");
1.32 daniel 4388: htmlParseTryOrFinish(ctxt, terminate);
1.60 veillard 4389: }
1.31 daniel 4390: if (terminate) {
4391: if ((ctxt->instate != XML_PARSER_EOF) &&
4392: (ctxt->instate != XML_PARSER_EPILOG) &&
4393: (ctxt->instate != XML_PARSER_MISC)) {
4394: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4395: ctxt->sax->error(ctxt->userData,
4396: "Extra content at the end of the document\n");
4397: ctxt->wellFormed = 0;
4398: ctxt->errNo = XML_ERR_DOCUMENT_END;
4399: }
4400: if (ctxt->instate != XML_PARSER_EOF) {
4401: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4402: ctxt->sax->endDocument(ctxt->userData);
4403: }
4404: ctxt->instate = XML_PARSER_EOF;
4405: }
4406: return((xmlParserErrors) ctxt->errNo);
4407: }
4408:
4409: /************************************************************************
4410: * *
4411: * User entry points *
4412: * *
4413: ************************************************************************/
4414:
4415: /**
4416: * htmlCreatePushParserCtxt :
4417: * @sax: a SAX handler
4418: * @user_data: The user data returned on SAX callbacks
4419: * @chunk: a pointer to an array of chars
4420: * @size: number of chars in the array
4421: * @filename: an optional file name or URI
4422: * @enc: an optional encoding
4423: *
4424: * Create a parser context for using the HTML parser in push mode
4425: * To allow content encoding detection, @size should be >= 4
4426: * The value of @filename is used for fetching external entities
4427: * and error/warning reports.
4428: *
4429: * Returns the new parser context or NULL
4430: */
4431: htmlParserCtxtPtr
4432: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4433: const char *chunk, int size, const char *filename,
4434: xmlCharEncoding enc) {
4435: htmlParserCtxtPtr ctxt;
4436: htmlParserInputPtr inputStream;
4437: xmlParserInputBufferPtr buf;
4438:
4439: buf = xmlAllocParserInputBuffer(enc);
4440: if (buf == NULL) return(NULL);
4441:
4442: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4443: if (ctxt == NULL) {
4444: xmlFree(buf);
4445: return(NULL);
4446: }
4447: memset(ctxt, 0, sizeof(htmlParserCtxt));
4448: htmlInitParserCtxt(ctxt);
4449: if (sax != NULL) {
4450: if (ctxt->sax != &htmlDefaultSAXHandler)
4451: xmlFree(ctxt->sax);
4452: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4453: if (ctxt->sax == NULL) {
4454: xmlFree(buf);
4455: xmlFree(ctxt);
4456: return(NULL);
4457: }
4458: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4459: if (user_data != NULL)
4460: ctxt->userData = user_data;
4461: }
4462: if (filename == NULL) {
4463: ctxt->directory = NULL;
4464: } else {
4465: ctxt->directory = xmlParserGetDirectory(filename);
4466: }
4467:
4468: inputStream = htmlNewInputStream(ctxt);
4469: if (inputStream == NULL) {
4470: xmlFreeParserCtxt(ctxt);
4471: return(NULL);
4472: }
4473:
4474: if (filename == NULL)
4475: inputStream->filename = NULL;
4476: else
4477: inputStream->filename = xmlMemStrdup(filename);
4478: inputStream->buf = buf;
4479: inputStream->base = inputStream->buf->buffer->content;
4480: inputStream->cur = inputStream->buf->buffer->content;
4481:
4482: inputPush(ctxt, inputStream);
4483:
4484: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4485: (ctxt->input->buf != NULL)) {
4486: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4487: #ifdef DEBUG_PUSH
4488: fprintf(stderr, "HPP: pushed %d\n", size);
4489: #endif
4490: }
4491:
4492: return(ctxt);
4493: }
1.1 daniel 4494:
4495: /**
4496: * htmlSAXParseDoc :
1.14 daniel 4497: * @cur: a pointer to an array of xmlChar
1.1 daniel 4498: * @encoding: a free form C string describing the HTML document encoding, or NULL
4499: * @sax: the SAX handler block
4500: * @userData: if using SAX, this pointer will be provided on callbacks.
4501: *
4502: * parse an HTML in-memory document and build a tree.
4503: * It use the given SAX function block to handle the parsing callback.
4504: * If sax is NULL, fallback to the default DOM tree building routines.
4505: *
4506: * Returns the resulting document tree
4507: */
4508:
4509: htmlDocPtr
1.14 daniel 4510: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
1.1 daniel 4511: htmlDocPtr ret;
4512: htmlParserCtxtPtr ctxt;
4513:
4514: if (cur == NULL) return(NULL);
4515:
4516:
4517: ctxt = htmlCreateDocParserCtxt(cur, encoding);
4518: if (ctxt == NULL) return(NULL);
4519: if (sax != NULL) {
4520: ctxt->sax = sax;
4521: ctxt->userData = userData;
4522: }
4523:
4524: htmlParseDocument(ctxt);
4525: ret = ctxt->myDoc;
4526: if (sax != NULL) {
4527: ctxt->sax = NULL;
4528: ctxt->userData = NULL;
4529: }
4530: htmlFreeParserCtxt(ctxt);
4531:
4532: return(ret);
4533: }
4534:
4535: /**
4536: * htmlParseDoc :
1.14 daniel 4537: * @cur: a pointer to an array of xmlChar
1.1 daniel 4538: * @encoding: a free form C string describing the HTML document encoding, or NULL
4539: *
4540: * parse an HTML in-memory document and build a tree.
4541: *
4542: * Returns the resulting document tree
4543: */
4544:
4545: htmlDocPtr
1.14 daniel 4546: htmlParseDoc(xmlChar *cur, const char *encoding) {
1.1 daniel 4547: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4548: }
4549:
4550:
4551: /**
4552: * htmlCreateFileParserCtxt :
4553: * @filename: the filename
4554: * @encoding: a free form C string describing the HTML document encoding, or NULL
4555: *
4556: * Create a parser context for a file content.
4557: * Automatic support for ZLIB/Compress compressed document is provided
4558: * by default if found at compile-time.
4559: *
4560: * Returns the new parser context or NULL
4561: */
4562: htmlParserCtxtPtr
4563: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4564: {
4565: htmlParserCtxtPtr ctxt;
4566: htmlParserInputPtr inputStream;
1.5 daniel 4567: xmlParserInputBufferPtr buf;
1.1 daniel 4568: /* htmlCharEncoding enc; */
4569:
1.5 daniel 4570: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4571: if (buf == NULL) return(NULL);
1.1 daniel 4572:
1.11 daniel 4573: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 4574: if (ctxt == NULL) {
4575: perror("malloc");
4576: return(NULL);
4577: }
1.19 daniel 4578: memset(ctxt, 0, sizeof(htmlParserCtxt));
1.1 daniel 4579: htmlInitParserCtxt(ctxt);
1.11 daniel 4580: inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 4581: if (inputStream == NULL) {
4582: perror("malloc");
1.11 daniel 4583: xmlFree(ctxt);
1.1 daniel 4584: return(NULL);
4585: }
1.19 daniel 4586: memset(inputStream, 0, sizeof(htmlParserInput));
1.1 daniel 4587:
1.11 daniel 4588: inputStream->filename = xmlMemStrdup(filename);
1.1 daniel 4589: inputStream->line = 1;
4590: inputStream->col = 1;
1.5 daniel 4591: inputStream->buf = buf;
1.21 daniel 4592: inputStream->directory = NULL;
1.1 daniel 4593:
1.5 daniel 4594: inputStream->base = inputStream->buf->buffer->content;
4595: inputStream->cur = inputStream->buf->buffer->content;
4596: inputStream->free = NULL;
1.1 daniel 4597:
4598: inputPush(ctxt, inputStream);
4599: return(ctxt);
4600: }
4601:
4602: /**
4603: * htmlSAXParseFile :
4604: * @filename: the filename
4605: * @encoding: a free form C string describing the HTML document encoding, or NULL
4606: * @sax: the SAX handler block
4607: * @userData: if using SAX, this pointer will be provided on callbacks.
4608: *
4609: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4610: * compressed document is provided by default if found at compile-time.
4611: * It use the given SAX function block to handle the parsing callback.
4612: * If sax is NULL, fallback to the default DOM tree building routines.
4613: *
4614: * Returns the resulting document tree
4615: */
4616:
4617: htmlDocPtr
4618: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4619: void *userData) {
4620: htmlDocPtr ret;
4621: htmlParserCtxtPtr ctxt;
1.57 veillard 4622: htmlSAXHandlerPtr oldsax = NULL;
1.1 daniel 4623:
4624: ctxt = htmlCreateFileParserCtxt(filename, encoding);
4625: if (ctxt == NULL) return(NULL);
4626: if (sax != NULL) {
1.55 veillard 4627: oldsax = ctxt->sax;
1.1 daniel 4628: ctxt->sax = sax;
4629: ctxt->userData = userData;
4630: }
4631:
4632: htmlParseDocument(ctxt);
4633:
4634: ret = ctxt->myDoc;
4635: if (sax != NULL) {
1.55 veillard 4636: ctxt->sax = oldsax;
1.1 daniel 4637: ctxt->userData = NULL;
4638: }
4639: htmlFreeParserCtxt(ctxt);
4640:
4641: return(ret);
4642: }
4643:
4644: /**
4645: * htmlParseFile :
4646: * @filename: the filename
4647: * @encoding: a free form C string describing the HTML document encoding, or NULL
4648: *
4649: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4650: * compressed document is provided by default if found at compile-time.
4651: *
4652: * Returns the resulting document tree
4653: */
4654:
4655: htmlDocPtr
4656: htmlParseFile(const char *filename, const char *encoding) {
4657: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4658: }
1.39 daniel 4659:
4660: #endif /* LIBXML_HTML_ENABLED */
Webmaster