Annotation of XML/HTMLparser.c, revision 1.79
1.1 daniel 1: /*
2: * HTMLparser.c : an HTML 4.0 non-verifying parser
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
1.29 daniel 10: #include "win32config.h"
1.1 daniel 11: #else
1.13 daniel 12: #include "config.h"
1.1 daniel 13: #endif
1.13 daniel 14:
1.75 veillard 15: #include <libxml/xmlversion.h>
1.39 daniel 16: #ifdef LIBXML_HTML_ENABLED
1.1 daniel 17: #include <stdio.h>
1.50 veillard 18: #include <string.h>
1.13 daniel 19: #ifdef HAVE_CTYPE_H
1.1 daniel 20: #include <ctype.h>
1.13 daniel 21: #endif
22: #ifdef HAVE_STDLIB_H
1.1 daniel 23: #include <stdlib.h>
1.13 daniel 24: #endif
25: #ifdef HAVE_SYS_STAT_H
1.1 daniel 26: #include <sys/stat.h>
1.13 daniel 27: #endif
1.1 daniel 28: #ifdef HAVE_FCNTL_H
29: #include <fcntl.h>
30: #endif
31: #ifdef HAVE_UNISTD_H
32: #include <unistd.h>
33: #endif
34: #ifdef HAVE_ZLIB_H
35: #include <zlib.h>
36: #endif
37:
1.39 daniel 38: #include <libxml/xmlmemory.h>
39: #include <libxml/tree.h>
1.75 veillard 40: #include <libxml/parser.h>
41: #include <libxml/parserInternals.h>
42: #include <libxml/xmlerror.h>
1.39 daniel 43: #include <libxml/HTMLparser.h>
44: #include <libxml/entities.h>
45: #include <libxml/encoding.h>
46: #include <libxml/valid.h>
47: #include <libxml/xmlIO.h>
1.5 daniel 48:
49: #define HTML_MAX_NAMELEN 1000
1.53 veillard 50: #define HTML_PARSER_BIG_BUFFER_SIZE 1000
1.31 daniel 51: #define HTML_PARSER_BUFFER_SIZE 100
1.1 daniel 52:
53: /* #define DEBUG */
1.31 daniel 54: /* #define DEBUG_PUSH */
1.1 daniel 55:
56: /************************************************************************
57: * *
58: * Parser stacks related functions and macros *
59: * *
60: ************************************************************************/
61:
62: /*
63: * Generic function for accessing stacks in the Parser Context
64: */
65:
1.30 daniel 66: #define PUSH_AND_POP(scope, type, name) \
67: scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
1.1 daniel 68: if (ctxt->name##Nr >= ctxt->name##Max) { \
69: ctxt->name##Max *= 2; \
1.50 veillard 70: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
1.1 daniel 71: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
72: if (ctxt->name##Tab == NULL) { \
73: fprintf(stderr, "realloc failed !\n"); \
1.33 daniel 74: return(0); \
1.1 daniel 75: } \
76: } \
77: ctxt->name##Tab[ctxt->name##Nr] = value; \
78: ctxt->name = value; \
79: return(ctxt->name##Nr++); \
80: } \
1.30 daniel 81: scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
1.1 daniel 82: type ret; \
1.18 daniel 83: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 84: ctxt->name##Nr--; \
1.18 daniel 85: if (ctxt->name##Nr < 0) return(0); \
1.1 daniel 86: if (ctxt->name##Nr > 0) \
87: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
88: else \
89: ctxt->name = NULL; \
90: ret = ctxt->name##Tab[ctxt->name##Nr]; \
91: ctxt->name##Tab[ctxt->name##Nr] = 0; \
92: return(ret); \
93: } \
94:
1.30 daniel 95: PUSH_AND_POP(extern, xmlNodePtr, node)
96: PUSH_AND_POP(extern, xmlChar*, name)
1.1 daniel 97:
98: /*
99: * Macros for accessing the content. Those should be used only by the parser,
100: * and not exported.
101: *
102: * Dirty macros, i.e. one need to make assumption on the context to use them
103: *
1.14 daniel 104: * CUR_PTR return the current pointer to the xmlChar to be parsed.
105: * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
1.1 daniel 106: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
107: * in UNICODE mode. This should be used internally by the parser
108: * only to compare to ASCII values otherwise it would break when
109: * running with UTF-8 encoding.
1.14 daniel 110: * NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
1.1 daniel 111: * to compare on ASCII based substring.
1.14 daniel 112: * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
1.1 daniel 113: * it should be used only to compare on ASCII based substring.
1.14 daniel 114: * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
1.1 daniel 115: * strings within the parser.
116: *
117: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
118: *
119: * CURRENT Returns the current char value, with the full decoding of
120: * UTF-8 if we are using this mode. It returns an int.
121: * NEXT Skip to the next character, this does the proper decoding
122: * in UTF-8 mode. It also pop-up unfinished entities on the fly.
123: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
124: */
125:
/*
 * "Dirty" accessors: they all assume a variable named ctxt is in scope and
 * look at the raw bytes under the input cursor.
 */
#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

/* NOTE: the expansion carries its own trailing ';' */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt);

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))

/*
 * NOTE: NEXT and NEXTL expand to several statements and are not wrapped
 * in do { } while (0); do not use them as the unbraced body of an
 * if/else or of a loop.
 */
#define NEXT xmlNextChar(ctxt);ctxt->nbChars++;

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/* Advance the cursor by l bytes, keeping the line/column counters updated */
#define NEXTL(l)                                                        \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;

/* NOTE: both expansions carry their own trailing ';' */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l));
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l));

/*
 * Append the char v, whose encoded length is l, to buffer b at index i
 * and advance i.  Expands to an unbraced if/else statement.
 */
#define COPY_BUF(l,b,i,v)                                               \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));
173:
174: /**
175: * htmlCurrentChar:
176: * @ctxt: the HTML parser context
177: * @len: pointer to the length of the char read
178: *
179: * The current char value, if using UTF-8 this may actaully span multiple
180: * bytes in the input buffer. Implement the end of line normalization:
181: * 2.11 End-of-Line Handling
182: * If the encoding is unspecified, in the case we find an ISO-Latin-1
183: * char, then the encoding converter is plugged in automatically.
184: *
185: * Returns the current char value and its lenght
186: */
187:
188: int
189: htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
190: if (ctxt->instate == XML_PARSER_EOF)
191: return(0);
1.35 daniel 192:
1.53 veillard 193: if (ctxt->token != 0) {
194: *len = 0;
195: return(ctxt->token);
196: }
197: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
198: /*
199: * We are supposed to handle UTF8, check it's valid
200: * From rfc2044: encoding of the Unicode values on UTF-8:
201: *
202: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
203: * 0000 0000-0000 007F 0xxxxxxx
204: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
205: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
206: *
207: * Check for the 0x110000 limit too
208: */
209: const unsigned char *cur = ctxt->input->cur;
210: unsigned char c;
211: unsigned int val;
212:
213: c = *cur;
214: if (c & 0x80) {
215: if (cur[1] == 0)
216: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
217: if ((cur[1] & 0xc0) != 0x80)
218: goto encoding_error;
219: if ((c & 0xe0) == 0xe0) {
220:
221: if (cur[2] == 0)
222: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
223: if ((cur[2] & 0xc0) != 0x80)
224: goto encoding_error;
225: if ((c & 0xf0) == 0xf0) {
226: if (cur[3] == 0)
227: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228: if (((c & 0xf8) != 0xf0) ||
229: ((cur[3] & 0xc0) != 0x80))
230: goto encoding_error;
231: /* 4-byte code */
232: *len = 4;
233: val = (cur[0] & 0x7) << 18;
234: val |= (cur[1] & 0x3f) << 12;
235: val |= (cur[2] & 0x3f) << 6;
236: val |= cur[3] & 0x3f;
237: } else {
238: /* 3-byte code */
239: *len = 3;
240: val = (cur[0] & 0xf) << 12;
241: val |= (cur[1] & 0x3f) << 6;
242: val |= cur[2] & 0x3f;
243: }
244: } else {
245: /* 2-byte code */
246: *len = 2;
247: val = (cur[0] & 0x1f) << 6;
248: val |= cur[1] & 0x3f;
249: }
250: if (!IS_CHAR(val)) {
1.67 veillard 251: ctxt->errNo = XML_ERR_INVALID_ENCODING;
1.53 veillard 252: if ((ctxt->sax != NULL) &&
253: (ctxt->sax->error != NULL))
254: ctxt->sax->error(ctxt->userData,
255: "Char 0x%X out of allowed range\n", val);
256: ctxt->wellFormed = 0;
257: ctxt->disableSAX = 1;
258: }
259: return(val);
260: } else {
261: /* 1-byte code */
262: *len = 1;
263: return((int) *ctxt->input->cur);
264: }
265: }
266: /*
267: * Assume it's a fixed lenght encoding (1) with
268: * a compatibke encoding for the ASCII set, since
269: * XML constructs only use < 128 chars
270: */
271: *len = 1;
272: if ((int) *ctxt->input->cur < 0x80)
273: return((int) *ctxt->input->cur);
274:
275: /*
276: * Humm this is bad, do an automatic flow conversion
277: */
278: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
279: ctxt->charset = XML_CHAR_ENCODING_UTF8;
280: return(xmlCurrentChar(ctxt, len));
281:
282: encoding_error:
283: /*
284: * If we detect an UTF8 error that probably mean that the
285: * input encoding didn't get properly advertized in the
286: * declaration header. Report the error and switch the encoding
287: * to ISO-Latin-1 (if you don't like this policy, just declare the
288: * encoding !)
289: */
1.67 veillard 290: ctxt->errNo = XML_ERR_INVALID_ENCODING;
1.53 veillard 291: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
292: ctxt->sax->error(ctxt->userData,
293: "Input is not proper UTF-8, indicate encoding !\n");
294: ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
295: ctxt->input->cur[0], ctxt->input->cur[1],
296: ctxt->input->cur[2], ctxt->input->cur[3]);
297: }
298:
299: ctxt->charset = XML_CHAR_ENCODING_8859_1;
300: *len = 1;
301: return((int) *ctxt->input->cur);
302: }
1.35 daniel 303:
304: /**
305: * htmlNextChar:
306: * @ctxt: the HTML parser context
307: *
308: * Skip to the next char input char.
309: */
310:
311: void
312: htmlNextChar(htmlParserCtxtPtr ctxt) {
1.44 daniel 313: if (ctxt->instate == XML_PARSER_EOF)
314: return;
1.35 daniel 315: if ((*ctxt->input->cur == 0) &&
316: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
317: xmlPopInput(ctxt);
318: } else {
319: if (*(ctxt->input->cur) == '\n') {
320: ctxt->input->line++; ctxt->input->col = 1;
321: } else ctxt->input->col++;
322: ctxt->input->cur++;
323: ctxt->nbChars++;
324: if (*ctxt->input->cur == 0)
325: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
326: }
327: }
1.5 daniel 328:
1.36 daniel 329: /**
330: * htmlSkipBlankChars:
331: * @ctxt: the HTML parser context
332: *
333: * skip all blanks character found at that point in the input streams.
334: *
335: * Returns the number of space chars skipped
336: */
337:
338: int
339: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
340: int res = 0;
341:
342: while (IS_BLANK(*(ctxt->input->cur))) {
343: if ((*ctxt->input->cur == 0) &&
344: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
345: xmlPopInput(ctxt);
346: } else {
347: if (*(ctxt->input->cur) == '\n') {
348: ctxt->input->line++; ctxt->input->col = 1;
349: } else ctxt->input->col++;
350: ctxt->input->cur++;
351: ctxt->nbChars++;
352: if (*ctxt->input->cur == 0)
353: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
354: }
355: res++;
356: }
357: return(res);
358: }
1.1 daniel 359:
360:
1.5 daniel 361:
1.1 daniel 362: /************************************************************************
363: * *
364: * The list of HTML elements and their properties *
365: * *
366: ************************************************************************/
367:
368: /*
369: * Start Tag: 1 means the start tag can be omitted
370: * End Tag: 1 means the end tag can be omitted
371: * 2 means it's forbidden (empty elements)
372: * Depr: this element is deprecated
373: * DTD: 1 means that this element is valid only in the Loose DTD
374: * 2 means that this element is valid only in the Frameset DTD
375: *
376: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
377: */
/*
 * One descriptor per HTML 4.0 element; the entries are listed in
 * alphabetical order of element name.  htmlTagLookup() walks this table
 * linearly and compares the name field of each entry.
 */
378: htmlElemDesc html40ElementTable[] = {
1.26 daniel 379: { "a", 0, 0, 0, 0, 0, "anchor " },
380: { "abbr", 0, 0, 0, 0, 0, "abbreviated form" },
381: { "acronym", 0, 0, 0, 0, 0, "" },
382: { "address", 0, 0, 0, 0, 0, "information on author " },
383: { "applet", 0, 0, 0, 1, 1, "java applet " },
384: { "area", 0, 2, 1, 0, 0, "client-side image map area " },
385: { "b", 0, 0, 0, 0, 0, "bold text style" },
386: { "base", 0, 2, 1, 0, 0, "document base uri " },
387: { "basefont", 0, 2, 1, 1, 1, "base font size " },
388: { "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },
389: { "big", 0, 0, 0, 0, 0, "large text style" },
390: { "blockquote", 0, 0, 0, 0, 0, "long quotation " },
391: { "body", 1, 1, 0, 0, 0, "document body " },
392: { "br", 0, 2, 1, 0, 0, "forced line break " },
393: { "button", 0, 0, 0, 0, 0, "push button " },
394: { "caption", 0, 0, 0, 0, 0, "table caption " },
395: { "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },
396: { "cite", 0, 0, 0, 0, 0, "citation" },
397: { "code", 0, 0, 0, 0, 0, "computer code fragment" },
398: { "col", 0, 2, 1, 0, 0, "table column " },
399: { "colgroup", 0, 1, 0, 0, 0, "table column group " },
400: { "dd", 0, 1, 0, 0, 0, "definition description " },
401: { "del", 0, 0, 0, 0, 0, "deleted text " },
402: { "dfn", 0, 0, 0, 0, 0, "instance definition" },
403: { "dir", 0, 0, 0, 1, 1, "directory list" },
404: { "div", 0, 0, 0, 0, 0, "generic language/style container"},
405: { "dl", 0, 0, 0, 0, 0, "definition list " },
406: { "dt", 0, 1, 0, 0, 0, "definition term " },
407: { "em", 0, 0, 0, 0, 0, "emphasis" },
408: { "fieldset", 0, 0, 0, 0, 0, "form control group " },
409: { "font", 0, 0, 0, 1, 1, "local change to font " },
410: { "form", 0, 0, 0, 0, 0, "interactive form " },
411: { "frame", 0, 2, 1, 0, 2, "subwindow " },
412: { "frameset", 0, 0, 0, 0, 2, "window subdivision" },
413: { "h1", 0, 0, 0, 0, 0, "heading " },
414: { "h2", 0, 0, 0, 0, 0, "heading " },
415: { "h3", 0, 0, 0, 0, 0, "heading " },
416: { "h4", 0, 0, 0, 0, 0, "heading " },
417: { "h5", 0, 0, 0, 0, 0, "heading " },
418: { "h6", 0, 0, 0, 0, 0, "heading " },
419: { "head", 1, 1, 0, 0, 0, "document head " },
420: { "hr", 0, 2, 1, 0, 0, "horizontal rule " },
421: { "html", 1, 1, 0, 0, 0, "document root element " },
422: { "i", 0, 0, 0, 0, 0, "italic text style" },
423: { "iframe", 0, 0, 0, 0, 1, "inline subwindow " },
424: { "img", 0, 2, 1, 0, 0, "embedded image " },
425: { "input", 0, 2, 1, 0, 0, "form control " },
426: { "ins", 0, 0, 0, 0, 0, "inserted text" },
427: { "isindex", 0, 2, 1, 1, 1, "single line prompt " },
428: { "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },
429: { "label", 0, 0, 0, 0, 0, "form field label text " },
430: { "legend", 0, 0, 0, 0, 0, "fieldset legend " },
431: { "li", 0, 1, 0, 0, 0, "list item " },
432: { "link", 0, 2, 1, 0, 0, "a media-independent link " },
433: { "map", 0, 0, 0, 0, 0, "client-side image map " },
434: { "menu", 0, 0, 0, 1, 1, "menu list " },
435: { "meta", 0, 2, 1, 0, 0, "generic metainformation " },
436: { "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },
437: { "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
438: { "object", 0, 0, 0, 0, 0, "generic embedded object " },
439: { "ol", 0, 0, 0, 0, 0, "ordered list " },
440: { "optgroup", 0, 0, 0, 0, 0, "option group " },
441: { "option", 0, 1, 0, 0, 0, "selectable choice " },
442: { "p", 0, 1, 0, 0, 0, "paragraph " },
443: { "param", 0, 2, 1, 0, 0, "named property value " },
444: { "pre", 0, 0, 0, 0, 0, "preformatted text " },
445: { "q", 0, 0, 0, 0, 0, "short inline quotation " },
446: { "s", 0, 0, 0, 1, 1, "strike-through text style" },
447: { "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },
448: { "script", 0, 0, 0, 0, 0, "script statements " },
449: { "select", 0, 0, 0, 0, 0, "option selector " },
450: { "small", 0, 0, 0, 0, 0, "small text style" },
451: { "span", 0, 0, 0, 0, 0, "generic language/style container " },
452: { "strike", 0, 0, 0, 1, 1, "strike-through text" },
453: { "strong", 0, 0, 0, 0, 0, "strong emphasis" },
454: { "style", 0, 0, 0, 0, 0, "style info " },
455: { "sub", 0, 0, 0, 0, 0, "subscript" },
456: { "sup", 0, 0, 0, 0, 0, "superscript " },
457: { "table", 0, 0, 0, 0, 0, " " },
458: { "tbody", 1, 1, 0, 0, 0, "table body " },
459: { "td", 0, 1, 0, 0, 0, "table data cell" },
460: { "textarea", 0, 0, 0, 0, 0, "multi-line text field " },
461: { "tfoot", 0, 1, 0, 0, 0, "table footer " },
462: { "th", 0, 1, 0, 0, 0, "table header cell" },
463: { "thead", 0, 1, 0, 0, 0, "table header " },
464: { "title", 0, 0, 0, 0, 0, "document title " },
465: { "tr", 0, 1, 0, 0, 0, "table row " },
466: { "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },
467: { "u", 0, 0, 0, 1, 1, "underlined text style" },
468: { "ul", 0, 0, 0, 0, 0, "unordered list " },
469: { "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },
1.1 daniel 470: };
471:
/*
 * Start tags that imply the end of a current element.
 *
 * The table is a sequence of NULL-terminated groups followed by one
 * extra NULL: any tag of a group implies the end of the current element
 * if the type of that element belongs to the same group.
 */
char *htmlEquEnd[] = {
    /* list and option items */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* block containers */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
483: /*
484: * according to the HTML DTD, HR should be added to the 2nd line above, as it
485: * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
486: * because many documents contain rules in headings...
487: */
488:
489: /*
490: * start tags that imply the end of current element
491: */
/*
 * Layout: a flat sequence of NULL-terminated groups, closed by one extra
 * NULL.  The first string of a group is a start tag; the strings that
 * follow it, up to the NULL, are the open elements which that start tag
 * implicitly closes (see htmlCheckAutoClose()).  htmlInitAutoClose()
 * records the address of the first string of every group in
 * htmlStartCloseIndex.
 */
1.8 daniel 492: char *htmlStartClose[] = {
1.26 daniel 493: "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
494: "dl", "ul", "ol", "menu", "dir", "address", "pre",
495: "listing", "xmp", "head", NULL,
496: "head", "p", NULL,
497: "title", "p", NULL,
498: "body", "head", "style", "link", "title", "p", NULL,
499: "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
500: "pre", "listing", "xmp", "head", "li", NULL,
501: "hr", "p", "head", NULL,
502: "h1", "p", "head", NULL,
503: "h2", "p", "head", NULL,
504: "h3", "p", "head", NULL,
505: "h4", "p", "head", NULL,
506: "h5", "p", "head", NULL,
507: "h6", "p", "head", NULL,
508: "dir", "p", "head", NULL,
509: "address", "p", "head", "ul", NULL,
510: "pre", "p", "head", "ul", NULL,
511: "listing", "p", "head", NULL,
512: "xmp", "p", "head", NULL,
513: "blockquote", "p", "head", NULL,
514: "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
515: "xmp", "head", NULL,
516: "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
517: "head", "dd", NULL,
518: "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
519: "head", "dt", NULL,
520: "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
521: "listing", "xmp", NULL,
522: "ol", "p", "head", "ul", NULL,
523: "menu", "p", "head", "ul", NULL,
524: "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
525: "div", "p", "head", NULL,
526: "noscript", "p", "head", NULL,
527: "center", "font", "b", "i", "p", "head", NULL,
528: "a", "a", NULL,
529: "caption", "p", NULL,
530: "colgroup", "caption", "colgroup", "col", "p", NULL,
531: "col", "caption", "col", "p", NULL,
532: "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
533: "listing", "xmp", "a", NULL,
534: "th", "th", "td", NULL,
535: "td", "th", "td", "p", NULL,
536: "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
537: "thead", "caption", "col", "colgroup", NULL,
538: "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
539: "tbody", "p", NULL,
540: "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
541: "tfoot", "tbody", "p", NULL,
542: "optgroup", "option", NULL,
1.79 ! veillard 543: "option", "option", NULL,
1.26 daniel 544: "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
545: "pre", "listing", "xmp", "a", NULL,
1.1 daniel 546: NULL
547: };
548:
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly: when character data shows up inside one
 * of them, a p element is implied (see htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
562:
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* the form reset event */
    "onchange",
    "onselect"
};
588:
589:
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group of that table, unused slots are NULL.  Filled by
 * htmlInitAutoClose().
 */
1.8 daniel 590: static char** htmlStartCloseIndex[100];
/* Non-zero once htmlInitAutoClose() has filled htmlStartCloseIndex. */
1.1 daniel 591: static int htmlStartCloseIndexinitialized = 0;
592:
593: /************************************************************************
594: * *
595: * functions to handle HTML specific data *
596: * *
597: ************************************************************************/
598:
599: /**
600: * htmlInitAutoClose:
601: *
602: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1.72 veillard 603: * This is not reentrant. Call xmlInitParser() once before processing in
604: * case of use in multithreaded programs.
1.1 daniel 605: */
606: void
607: htmlInitAutoClose(void) {
608: int index, i = 0;
609:
610: if (htmlStartCloseIndexinitialized) return;
611:
612: for (index = 0;index < 100;index ++) htmlStartCloseIndex[index] = NULL;
613: index = 0;
614: while ((htmlStartClose[i] != NULL) && (index < 100 - 1)) {
615: htmlStartCloseIndex[index++] = &htmlStartClose[i];
616: while (htmlStartClose[i] != NULL) i++;
617: i++;
618: }
1.72 veillard 619: htmlStartCloseIndexinitialized = 1;
1.1 daniel 620: }
621:
622: /**
623: * htmlTagLookup:
1.69 veillard 624: * @tag: The tag name in lowercase
1.1 daniel 625: *
626: * Lookup the HTML tag in the ElementTable
627: *
628: * Returns the related htmlElemDescPtr or NULL if not found.
629: */
630: htmlElemDescPtr
1.14 daniel 631: htmlTagLookup(const xmlChar *tag) {
1.61 veillard 632: int i;
1.1 daniel 633:
634: for (i = 0; i < (sizeof(html40ElementTable) /
635: sizeof(html40ElementTable[0]));i++) {
1.73 veillard 636: if (xmlStrEqual(tag, BAD_CAST html40ElementTable[i].name))
1.1 daniel 637: return(&html40ElementTable[i]);
638: }
639: return(NULL);
640: }
641:
642: /**
643: * htmlCheckAutoClose:
1.50 veillard 644: * @newtag: The new tag name
645: * @oldtag: The old tag name
1.1 daniel 646: *
647: * Checks wether the new tag is one of the registered valid tags for closing old.
648: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
649: *
650: * Returns 0 if no, 1 if yes.
651: */
652: int
1.50 veillard 653: htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
1.1 daniel 654: int i, index;
1.64 veillard 655: char **close = NULL;
1.1 daniel 656:
657: if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
658:
659: /* inefficient, but not a big deal */
660: for (index = 0; index < 100;index++) {
661: close = htmlStartCloseIndex[index];
662: if (close == NULL) return(0);
1.73 veillard 663: if (xmlStrEqual(BAD_CAST *close, newtag)) break;
1.1 daniel 664: }
665:
666: i = close - htmlStartClose;
667: i++;
668: while (htmlStartClose[i] != NULL) {
1.73 veillard 669: if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1.1 daniel 670: return(1);
671: }
672: i++;
673: }
674: return(0);
675: }
676:
677: /**
1.50 veillard 678: * htmlAutoCloseOnClose:
679: * @ctxt: an HTML parser context
680: * @newtag: The new tag name
681: *
682: * The HTmL DtD allows an ending tag to implicitely close other tags.
683: */
684: void
685: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
686: htmlElemDescPtr info;
687: xmlChar *oldname;
688: int i;
689:
690: #ifdef DEBUG
691: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
692: for (i = 0;i < ctxt->nameNr;i++)
693: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
694: #endif
695:
696: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1.73 veillard 697: if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
1.50 veillard 698: }
699: if (i < 0) return;
700:
1.73 veillard 701: while (!xmlStrEqual(newtag, ctxt->name)) {
1.50 veillard 702: info = htmlTagLookup(ctxt->name);
703: if ((info == NULL) || (info->endTag == 1)) {
704: #ifdef DEBUG
705: fprintf(stderr,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
706: #endif
707: } else {
708: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
709: ctxt->sax->error(ctxt->userData,
710: "Opening and ending tag mismatch: %s and %s\n",
711: newtag, ctxt->name);
712: ctxt->wellFormed = 0;
713: }
714: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
715: ctxt->sax->endElement(ctxt->userData, ctxt->name);
716: oldname = htmlnamePop(ctxt);
717: if (oldname != NULL) {
718: #ifdef DEBUG
719: fprintf(stderr,"htmlAutoCloseOnClose: popped %s\n", oldname);
720: #endif
721: xmlFree(oldname);
722: }
723: }
724: }
725:
726: /**
1.1 daniel 727: * htmlAutoClose:
728: * @ctxt: an HTML parser context
1.50 veillard 729: * @newtag: The new tag name or NULL
1.1 daniel 730: *
731: * The HTmL DtD allows a tag to implicitely close other tags.
732: * The list is kept in htmlStartClose array. This function is
733: * called when a new tag has been detected and generates the
734: * appropriates closes if possible/needed.
1.50 veillard 735: * If newtag is NULL this mean we are at the end of the resource
1.47 daniel 736: * and we should check
1.1 daniel 737: */
738: void
1.50 veillard 739: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1.15 daniel 740: xmlChar *oldname;
1.50 veillard 741: while ((newtag != NULL) && (ctxt->name != NULL) &&
742: (htmlCheckAutoClose(newtag, ctxt->name))) {
1.1 daniel 743: #ifdef DEBUG
1.50 veillard 744: fprintf(stderr,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
1.1 daniel 745: #endif
746: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1.15 daniel 747: ctxt->sax->endElement(ctxt->userData, ctxt->name);
1.24 daniel 748: oldname = htmlnamePop(ctxt);
1.18 daniel 749: if (oldname != NULL) {
750: #ifdef DEBUG
751: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
752: #endif
1.17 daniel 753: xmlFree(oldname);
1.18 daniel 754: }
1.1 daniel 755: }
1.50 veillard 756: if (newtag == NULL) {
1.49 daniel 757: htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
758: htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
759: htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
760: }
1.50 veillard 761: while ((newtag == NULL) && (ctxt->name != NULL) &&
1.73 veillard 762: ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
763: (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
764: (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
1.47 daniel 765: #ifdef DEBUG
766: fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
767: #endif
768: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
769: ctxt->sax->endElement(ctxt->userData, ctxt->name);
770: oldname = htmlnamePop(ctxt);
771: if (oldname != NULL) {
772: #ifdef DEBUG
773: fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
774: #endif
775: xmlFree(oldname);
776: }
777: }
778:
1.1 daniel 779: }
780:
781: /**
1.28 daniel 782: * htmlAutoCloseTag:
783: * @doc: the HTML document
784: * @name: The tag name
785: * @elem: the HTML element
786: *
787: * The HTmL DtD allows a tag to implicitely close other tags.
788: * The list is kept in htmlStartClose array. This function checks
789: * if the element or one of it's children would autoclose the
790: * given tag.
791: *
792: * Returns 1 if autoclose, 0 otherwise
793: */
794: int
795: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
796: htmlNodePtr child;
797:
798: if (elem == NULL) return(1);
1.73 veillard 799: if (xmlStrEqual(name, elem->name)) return(0);
1.28 daniel 800: if (htmlCheckAutoClose(elem->name, name)) return(1);
1.37 daniel 801: child = elem->children;
1.28 daniel 802: while (child != NULL) {
803: if (htmlAutoCloseTag(doc, name, child)) return(1);
804: child = child->next;
805: }
806: return(0);
807: }
808:
809: /**
810: * htmlIsAutoClosed:
811: * @doc: the HTML document
812: * @elem: the HTML element
813: *
814: * The HTmL DtD allows a tag to implicitely close other tags.
815: * The list is kept in htmlStartClose array. This function checks
816: * if a tag is autoclosed by one of it's child
817: *
818: * Returns 1 if autoclosed, 0 otherwise
819: */
820: int
821: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
822: htmlNodePtr child;
823:
824: if (elem == NULL) return(1);
1.37 daniel 825: child = elem->children;
1.28 daniel 826: while (child != NULL) {
827: if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
828: child = child->next;
829: }
830: return(0);
831: }
832:
833: /**
1.43 daniel 834: * htmlCheckImplied:
835: * @ctxt: an HTML parser context
1.50 veillard 836: * @newtag: The new tag name
1.43 daniel 837: *
838: * The HTmL DtD allows a tag to exists only implicitely
839: * called when a new tag has been detected and generates the
840: * appropriates implicit tags if missing
841: */
842: void
1.50 veillard 843: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1.73 veillard 844: if (xmlStrEqual(newtag, BAD_CAST"html"))
1.43 daniel 845: return;
846: if (ctxt->nameNr <= 0) {
847: #ifdef DEBUG
848: fprintf(stderr,"Implied element html: pushed html\n");
849: #endif
850: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
851: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
852: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
853: }
1.73 veillard 854: if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1.43 daniel 855: return;
856: if (ctxt->nameNr <= 1) {
1.73 veillard 857: if ((xmlStrEqual(newtag, BAD_CAST"script")) ||
858: (xmlStrEqual(newtag, BAD_CAST"style")) ||
859: (xmlStrEqual(newtag, BAD_CAST"meta")) ||
860: (xmlStrEqual(newtag, BAD_CAST"link")) ||
861: (xmlStrEqual(newtag, BAD_CAST"title")) ||
862: (xmlStrEqual(newtag, BAD_CAST"base"))) {
1.43 daniel 863: /*
864: * dropped OBJECT ... i you put it first BODY will be
865: * assumed !
866: */
867: #ifdef DEBUG
868: fprintf(stderr,"Implied element head: pushed head\n");
869: #endif
870: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
871: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
872: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
873: } else {
874: #ifdef DEBUG
875: fprintf(stderr,"Implied element body: pushed body\n");
876: #endif
877: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
878: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
879: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
880: }
881: }
882: }
883:
1.59 veillard 884: /**
885: * htmlCheckParagraph
886: * @ctxt: an HTML parser context
887: *
888: * Check whether a p element need to be implied before inserting
889: * characters in the current element.
890: *
891: * Returns 1 if a paragraph has been inserted, 0 if not and -1
892: * in case of error.
893: */
894:
895: int
896: htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
897: const xmlChar *tag;
898: int i;
899:
900: if (ctxt == NULL)
901: return(-1);
902: tag = ctxt->name;
903: if (tag == NULL) {
904: htmlAutoClose(ctxt, BAD_CAST"p");
905: htmlCheckImplied(ctxt, BAD_CAST"p");
906: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
907: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
908: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
909: return(1);
910: }
911: for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1.73 veillard 912: if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1.59 veillard 913: #ifdef DEBUG
914: fprintf(stderr,"Implied element paragraph\n");
915: #endif
916: htmlAutoClose(ctxt, BAD_CAST"p");
917: htmlCheckImplied(ctxt, BAD_CAST"p");
918: htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
919: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
920: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
921: return(1);
922: }
1.78 veillard 923: }
924: return(0);
925: }
926:
927: /**
928: * htmlIsScriptAttribute:
929: * @name: an attribute name
930: *
931: * Check if an attribute is of content type Script
932: *
933: * Returns 1 is the attribute is a script 0 otherwise
934: */
935: int
936: htmlIsScriptAttribute(const xmlChar *name) {
937: int i;
938:
939: if (name == NULL)
940: return(0);
941: /*
942: * all script attributes start with 'on'
943: */
944: if ((name[0] != 'o') || (name[1] != 'n'))
945: return(0);
946: for (i = 0;
947: i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
948: i++) {
949: if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
950: return(1);
1.59 veillard 951: }
952: return(0);
953: }
954:
1.1 daniel 955: /************************************************************************
956: * *
957: * The list of HTML predefined entities *
958: * *
959: ************************************************************************/
960:
961:
962: htmlEntityDesc html40EntitiesTable[] = {
963: /*
1.61 veillard 964: * the 4 absolute ones, plus apostrophe.
1.1 daniel 965: */
966: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
967: { 38, "amp", "ampersand, U+0026 ISOnum" },
1.61 veillard 968: { 39, "apos", "single quote" },
1.1 daniel 969: { 60, "lt", "less-than sign, U+003C ISOnum" },
970: { 62, "gt", "greater-than sign, U+003E ISOnum" },
971:
972: /*
973: * A bunch still in the 128-255 range
974: * Replacing them depend really on the charset used.
975: */
976: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
977: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
978: { 162, "cent", "cent sign, U+00A2 ISOnum" },
979: { 163, "pound","pound sign, U+00A3 ISOnum" },
980: { 164, "curren","currency sign, U+00A4 ISOnum" },
981: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
982: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
983: { 167, "sect", "section sign, U+00A7 ISOnum" },
984: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
985: { 169, "copy", "copyright sign, U+00A9 ISOnum" },
986: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
987: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
988: { 172, "not", "not sign, U+00AC ISOnum" },
989: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
990: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
991: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
992: { 176, "deg", "degree sign, U+00B0 ISOnum" },
993: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
994: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
995: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
996: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
997: { 181, "micro","micro sign, U+00B5 ISOnum" },
998: { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1.7 daniel 999: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1.1 daniel 1000: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1001: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1002: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1.7 daniel 1003: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1.1 daniel 1004: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1005: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1006: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1007: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1008: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1009: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1010: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1011: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1012: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1013: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1014: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1015: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1016: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1017: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1018: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1019: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1020: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1021: { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1022: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1023: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1024: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1025: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1026: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1027: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1028: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1029: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1030: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1031: { 215, "times","multiplication sign, U+00D7 ISOnum" },
1.7 daniel 1032: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1.1 daniel 1033: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1034: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1035: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1036: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1037: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1038: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1039: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1040: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1041: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1042: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1043: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1044: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1045: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1046: { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1047: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1048: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1049: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1050: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1051: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1052: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1053: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1054: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1055: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1056: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1057: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1058: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1059: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1060: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1061: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1062: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1063: { 247, "divide","division sign, U+00F7 ISOnum" },
1064: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1065: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1066: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1067: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1068: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1069: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1070: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1071: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1072:
1.61 veillard 1073: { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1074: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1075: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1076: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1077: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1078:
1.1 daniel 1079: /*
1080: * Anything below should really be kept as entities references
1081: */
1082: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1083:
1.61 veillard 1084: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1085: { 732, "tilde","small tilde, U+02DC ISOdia" },
1086:
1.1 daniel 1087: { 913, "Alpha","greek capital letter alpha, U+0391" },
1088: { 914, "Beta", "greek capital letter beta, U+0392" },
1089: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1090: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1091: { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1092: { 918, "Zeta", "greek capital letter zeta, U+0396" },
1093: { 919, "Eta", "greek capital letter eta, U+0397" },
1094: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1095: { 921, "Iota", "greek capital letter iota, U+0399" },
1096: { 922, "Kappa","greek capital letter kappa, U+039A" },
1097: { 923, "Lambda""greek capital letter lambda, U+039B ISOgrk3" },
1098: { 924, "Mu", "greek capital letter mu, U+039C" },
1099: { 925, "Nu", "greek capital letter nu, U+039D" },
1100: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1101: { 927, "Omicron","greek capital letter omicron, U+039F" },
1102: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1103: { 929, "Rho", "greek capital letter rho, U+03A1" },
1104: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1105: { 932, "Tau", "greek capital letter tau, U+03A4" },
1106: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1107: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1108: { 935, "Chi", "greek capital letter chi, U+03A7" },
1109: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1110: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1111:
1112: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1113: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1114: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1115: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1116: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1117: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1118: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1119: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1120: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1121: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1122: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1123: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1124: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1125: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1126: { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1127: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1128: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1129: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1130: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1131: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1132: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1133: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1134: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1135: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1136: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1137: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1138: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1139: { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1140:
1.61 veillard 1141: { 8194, "ensp", "en space, U+2002 ISOpub" },
1142: { 8195, "emsp", "em space, U+2003 ISOpub" },
1143: { 8201, "thinsp","thin space, U+2009 ISOpub" },
1144: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1145: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1146: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1147: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1148: { 8211, "ndash","en dash, U+2013 ISOpub" },
1149: { 8212, "mdash","em dash, U+2014 ISOpub" },
1150: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1151: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1152: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1153: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1154: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1155: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1156: { 8224, "dagger","dagger, U+2020 ISOpub" },
1157: { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1158:
1.1 daniel 1159: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1160: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1.61 veillard 1161:
1162: { 8240, "permil","per mille sign, U+2030 ISOtech" },
1163:
1.1 daniel 1164: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1165: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1.61 veillard 1166:
1167: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1168: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1169:
1.1 daniel 1170: { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1171: { 8260, "frasl","fraction slash, U+2044 NEW" },
1172:
1.61 veillard 1173: { 8364, "euro", "euro sign, U+20AC NEW" },
1174:
1175: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1.7 daniel 1176: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1.1 daniel 1177: { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1178: { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1179: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1180: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1181: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1182: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1183: { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1184: { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1185: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1186: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1187: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1188: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1189: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1190: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1191:
1192: { 8704, "forall","for all, U+2200 ISOtech" },
1193: { 8706, "part", "partial differential, U+2202 ISOtech" },
1194: { 8707, "exist","there exists, U+2203 ISOtech" },
1195: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1196: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1197: { 8712, "isin", "element of, U+2208 ISOtech" },
1198: { 8713, "notin","not an element of, U+2209 ISOtech" },
1199: { 8715, "ni", "contains as member, U+220B ISOtech" },
1200: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1201: { 8721, "sum", "n-ary sumation, U+2211 ISOamsb" },
1202: { 8722, "minus","minus sign, U+2212 ISOtech" },
1203: { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1204: { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1205: { 8733, "prop", "proportional to, U+221D ISOtech" },
1206: { 8734, "infin","infinity, U+221E ISOtech" },
1207: { 8736, "ang", "angle, U+2220 ISOamso" },
1208: { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1209: { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1210: { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1211: { 8746, "cup", "union = cup, U+222A ISOtech" },
1212: { 8747, "int", "integral, U+222B ISOtech" },
1213: { 8756, "there4","therefore, U+2234 ISOtech" },
1214: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1215: { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1216: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1217: { 8800, "ne", "not equal to, U+2260 ISOtech" },
1218: { 8801, "equiv","identical to, U+2261 ISOtech" },
1219: { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1220: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1221: { 8834, "sub", "subset of, U+2282 ISOtech" },
1222: { 8835, "sup", "superset of, U+2283 ISOtech" },
1223: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1224: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1225: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1226: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1227: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1228: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1229: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1230: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1231: { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1232: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1233: { 8971, "rfloor","right floor, U+230B ISOamsc" },
1234: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1235: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1236: { 9674, "loz", "lozenge, U+25CA ISOpub" },
1237:
1238: { 9824, "spades","black spade suit, U+2660 ISOpub" },
1239: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1240: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1241: { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1242:
1243: };
1244:
1245: /************************************************************************
1246: * *
1247: * Commodity functions to handle entities *
1248: * *
1249: ************************************************************************/
1250:
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates buffer accordingly. The result of
 * xmlRealloc() goes through a temporary so that the old block is not lost
 * when the reallocation fails: in that case the old block is freed and
 * the enclosing function returns NULL.
 *
 * NOTE(review): this assumes buffer is a variable owned by the enclosing
 * function, with a matching buffer##_size variable in scope - confirm for
 * every user of the macro.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
				buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
	perror("realloc failed");					\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1262:
1263: /**
1264: * htmlEntityLookup:
1265: * @name: the entity name
1266: *
1267: * Lookup the given entity in EntitiesTable
1268: *
1269: * TODO: the linear scan is really ugly, an hash table is really needed.
1270: *
1271: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1272: */
1273: htmlEntityDescPtr
1.14 daniel 1274: htmlEntityLookup(const xmlChar *name) {
1.1 daniel 1275: int i;
1276:
1277: for (i = 0;i < (sizeof(html40EntitiesTable)/
1278: sizeof(html40EntitiesTable[0]));i++) {
1.73 veillard 1279: if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1.1 daniel 1280: #ifdef DEBUG
1.18 daniel 1281: fprintf(stderr,"Found entity %s\n", name);
1.1 daniel 1282: #endif
1283: return(&html40EntitiesTable[i]);
1284: }
1285: }
1286: return(NULL);
1287: }
1288:
1.53 veillard 1289: /**
1.61 veillard 1290: * htmlEntityValueLookup:
1291: * @value: the entity's unicode value
1292: *
1293: * Lookup the given entity in EntitiesTable
1294: *
1295: * TODO: the linear scan is really ugly, an hash table is really needed.
1296: *
1297: * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1298: */
1299: htmlEntityDescPtr
1300: htmlEntityValueLookup(int value) {
1301: int i;
1302: #ifdef DEBUG
1303: int lv = 0;
1304: #endif
1305:
1306: for (i = 0;i < (sizeof(html40EntitiesTable)/
1307: sizeof(html40EntitiesTable[0]));i++) {
1.71 veillard 1308: if ((unsigned int) html40EntitiesTable[i].value >= value) {
1309: if ((unsigned int) html40EntitiesTable[i].value > value)
1.61 veillard 1310: break;
1311: #ifdef DEBUG
1312: fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name);
1313: #endif
1314: return(&html40EntitiesTable[i]);
1315: }
1316: #ifdef DEBUG
1317: if (lv > html40EntitiesTable[i].value) {
1318: fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1319: lv, html40EntitiesTable[i].value);
1320: }
1321: lv = html40EntitiesTable[i].value;
1322: #endif
1323: }
1324: return(NULL);
1325: }
1326:
1327: /**
1.53 veillard 1328: * UTF8ToHtml:
1329: * @out: a pointer to an array of bytes to store the result
1330: * @outlen: the length of @out
1331: * @in: a pointer to an array of UTF-8 chars
1332: * @inlen: the length of @in
1333: *
1334: * Take a block of UTF-8 chars in and try to convert it to an ASCII
1335: * plus HTML entities block of chars out.
1336: *
1337: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1338: * The value of @inlen after return is the number of octets consumed
1339: * as the return value is positive, else unpredictiable.
1340: * The value of @outlen after return is the number of octets consumed.
1341: */
1342: int
1343: UTF8ToHtml(unsigned char* out, int *outlen,
1344: const unsigned char* in, int *inlen) {
1345: const unsigned char* processed = in;
1346: const unsigned char* outend;
1347: const unsigned char* outstart = out;
1348: const unsigned char* instart = in;
1349: const unsigned char* inend;
1350: unsigned int c, d;
1351: int trailing;
1352:
1353: if (in == NULL) {
1354: /*
1355: * initialization nothing to do
1356: */
1357: *outlen = 0;
1358: *inlen = 0;
1359: return(0);
1360: }
1361: inend = in + (*inlen);
1362: outend = out + (*outlen);
1363: while (in < inend) {
1364: d = *in++;
1365: if (d < 0x80) { c= d; trailing= 0; }
1366: else if (d < 0xC0) {
1367: /* trailing byte in leading position */
1368: *outlen = out - outstart;
1369: *inlen = processed - instart;
1370: return(-2);
1371: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1372: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1373: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1374: else {
1375: /* no chance for this in Ascii */
1376: *outlen = out - outstart;
1377: *inlen = processed - instart;
1378: return(-2);
1379: }
1380:
1381: if (inend - in < trailing) {
1382: break;
1383: }
1384:
1385: for ( ; trailing; trailing--) {
1386: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1387: break;
1388: c <<= 6;
1389: c |= d & 0x3F;
1390: }
1391:
1392: /* assertion: c is a single UTF-4 value */
1393: if (c < 0x80) {
1.62 veillard 1394: if (out + 1 >= outend)
1.53 veillard 1395: break;
1396: *out++ = c;
1397: } else {
1.61 veillard 1398: int len;
1399: htmlEntityDescPtr ent;
1400:
1.53 veillard 1401: /*
1402: * Try to lookup a predefined HTML entity for it
1403: */
1404:
1.61 veillard 1405: ent = htmlEntityValueLookup(c);
1406: if (ent == NULL) {
1407: /* no chance for this in Ascii */
1408: *outlen = out - outstart;
1409: *inlen = processed - instart;
1410: return(-2);
1.53 veillard 1411: }
1.61 veillard 1412: len = strlen(ent->name);
1.62 veillard 1413: if (out + 2 + len >= outend)
1.53 veillard 1414: break;
1415: *out++ = '&';
1.61 veillard 1416: memcpy(out, ent->name, len);
1417: out += len;
1.53 veillard 1418: *out++ = ';';
1419: }
1420: processed = in;
1421: }
1422: *outlen = out - outstart;
1423: *inlen = processed - instart;
1424: return(0);
1425: }
1426:
1.62 veillard 1427: /**
1428: * htmlEncodeEntities:
1429: * @out: a pointer to an array of bytes to store the result
1430: * @outlen: the length of @out
1431: * @in: a pointer to an array of UTF-8 chars
1432: * @inlen: the length of @in
1433: * @quoteChar: the quote character to escape (' or ") or zero.
1434: *
1435: * Take a block of UTF-8 chars in and try to convert it to an ASCII
1436: * plus HTML entities block of chars out.
1437: *
1438: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1439: * The value of @inlen after return is the number of octets consumed
1440: * as the return value is positive, else unpredictiable.
1441: * The value of @outlen after return is the number of octets consumed.
1442: */
1443: int
1444: htmlEncodeEntities(unsigned char* out, int *outlen,
1445: const unsigned char* in, int *inlen, int quoteChar) {
1446: const unsigned char* processed = in;
1447: const unsigned char* outend = out + (*outlen);
1448: const unsigned char* outstart = out;
1449: const unsigned char* instart = in;
1450: const unsigned char* inend = in + (*inlen);
1451: unsigned int c, d;
1452: int trailing;
1453:
1454: while (in < inend) {
1455: d = *in++;
1456: if (d < 0x80) { c= d; trailing= 0; }
1457: else if (d < 0xC0) {
1458: /* trailing byte in leading position */
1459: *outlen = out - outstart;
1460: *inlen = processed - instart;
1461: return(-2);
1462: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1463: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1464: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1465: else {
1466: /* no chance for this in Ascii */
1467: *outlen = out - outstart;
1468: *inlen = processed - instart;
1469: return(-2);
1470: }
1471:
1472: if (inend - in < trailing)
1473: break;
1474:
1475: while (trailing--) {
1476: if (((d= *in++) & 0xC0) != 0x80) {
1477: *outlen = out - outstart;
1478: *inlen = processed - instart;
1479: return(-2);
1480: }
1481: c <<= 6;
1482: c |= d & 0x3F;
1483: }
1484:
1485: /* assertion: c is a single UTF-4 value */
1486: if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
1487: if (out >= outend)
1488: break;
1489: *out++ = c;
1490: } else {
1491: htmlEntityDescPtr ent;
1492: const char *cp;
1493: char nbuf[16];
1494: int len;
1495:
1496: /*
1497: * Try to lookup a predefined HTML entity for it
1498: */
1499: ent = htmlEntityValueLookup(c);
1500: if (ent == NULL) {
1501: sprintf(nbuf, "#%u", c);
1502: cp = nbuf;
1503: }
1504: else
1505: cp = ent->name;
1506: len = strlen(cp);
1507: if (out + 2 + len > outend)
1508: break;
1509: *out++ = '&';
1510: memcpy(out, cp, len);
1511: out += len;
1512: *out++ = ';';
1513: }
1514: processed = in;
1515: }
1516: *outlen = out - outstart;
1517: *inlen = processed - instart;
1518: return(0);
1519: }
1.1 daniel 1520:
1521: /**
1522: * htmlDecodeEntities:
1523: * @ctxt: the parser context
1524: * @len: the len to decode (in bytes !), -1 for no size limit
1.14 daniel 1525: * @end: an end marker xmlChar, 0 if none
1526: * @end2: an end marker xmlChar, 0 if none
1527: * @end3: an end marker xmlChar, 0 if none
1.1 daniel 1528: *
1529: * Subtitute the HTML entities by their value
1530: *
1.19 daniel 1531: * DEPRECATED !!!!
1.1 daniel 1532: *
1533: * Returns A newly allocated string with the substitution done. The caller
1534: * must deallocate it !
1535: */
1.14 daniel 1536: xmlChar *
1.1 daniel 1537: htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
1.14 daniel 1538: xmlChar end, xmlChar end2, xmlChar end3) {
1.53 veillard 1539: xmlChar *name = NULL;
1.14 daniel 1540: xmlChar *buffer = NULL;
1.53 veillard 1541: unsigned int buffer_size = 0;
1542: unsigned int nbchars = 0;
1.1 daniel 1543: htmlEntityDescPtr ent;
1544: unsigned int max = (unsigned int) len;
1.53 veillard 1545: int c,l;
1546:
1547: if (ctxt->depth > 40) {
1.67 veillard 1548: ctxt->errNo = XML_ERR_ENTITY_LOOP;
1.53 veillard 1549: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1550: ctxt->sax->error(ctxt->userData,
1551: "Detected entity reference loop\n");
1552: ctxt->wellFormed = 0;
1553: ctxt->disableSAX = 1;
1554: return(NULL);
1555: }
1.1 daniel 1556:
1557: /*
1558: * allocate a translation buffer.
1559: */
1.31 daniel 1560: buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
1.14 daniel 1561: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1.1 daniel 1562: if (buffer == NULL) {
1.53 veillard 1563: perror("xmlDecodeEntities: malloc failed");
1.1 daniel 1564: return(NULL);
1565: }
1566:
1567: /*
1568: * Ok loop until we reach one of the ending char or a size limit.
1569: */
1.53 veillard 1570: c = CUR_CHAR(l);
1571: while ((nbchars < max) && (c != end) &&
1572: (c != end2) && (c != end3)) {
1573:
1574: if (c == 0) break;
1575: if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
1576: int val = htmlParseCharRef(ctxt);
1577: COPY_BUF(0,buffer,nbchars,val);
1578: NEXTL(l);
1579: } else if ((c == '&') && (ctxt->token != '&')) {
1580: ent = htmlParseEntityRef(ctxt, &name);
1581: if (name != NULL) {
1582: if (ent != NULL) {
1583: int val = ent->value;
1584: COPY_BUF(0,buffer,nbchars,val);
1585: NEXTL(l);
1586: } else {
1587: const xmlChar *cur = name;
1.1 daniel 1588:
1.53 veillard 1589: buffer[nbchars++] = '&';
1590: if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1591: growBuffer(buffer);
1592: }
1593: while (*cur != 0) {
1594: buffer[nbchars++] = *cur++;
1.1 daniel 1595: }
1.53 veillard 1596: buffer[nbchars++] = ';';
1.1 daniel 1597: }
1598: }
1599: } else {
1.53 veillard 1600: COPY_BUF(l,buffer,nbchars,c);
1601: NEXTL(l);
1602: if (nbchars > buffer_size - HTML_PARSER_BUFFER_SIZE) {
1603: growBuffer(buffer);
1.1 daniel 1604: }
1605: }
1.53 veillard 1606: c = CUR_CHAR(l);
1.1 daniel 1607: }
1.53 veillard 1608: buffer[nbchars++] = 0;
1.1 daniel 1609: return(buffer);
1610: }
1611:
1.31 daniel 1612: /************************************************************************
1613: * *
1614: * Commodity functions to handle streams *
1615: * *
1616: ************************************************************************/
1617:
1618: /**
1619: * htmlFreeInputStream:
1620: * @input: an htmlParserInputPtr
1621: *
1622: * Free up an input stream.
1623: */
1624: void
1625: htmlFreeInputStream(htmlParserInputPtr input) {
1626: if (input == NULL) return;
1627:
1628: if (input->filename != NULL) xmlFree((char *) input->filename);
1629: if (input->directory != NULL) xmlFree((char *) input->directory);
1630: if ((input->free != NULL) && (input->base != NULL))
1631: input->free((xmlChar *) input->base);
1632: if (input->buf != NULL)
1633: xmlFreeParserInputBuffer(input->buf);
1634: memset(input, -1, sizeof(htmlParserInput));
1635: xmlFree(input);
1636: }
1637:
1638: /**
1639: * htmlNewInputStream:
1640: * @ctxt: an HTML parser context
1641: *
1642: * Create a new input stream structure
1643: * Returns the new input stream or NULL
1644: */
1645: htmlParserInputPtr
1646: htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1647: htmlParserInputPtr input;
1648:
1649: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1650: if (input == NULL) {
1651: ctxt->errNo = XML_ERR_NO_MEMORY;
1652: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1653: ctxt->sax->error(ctxt->userData,
1654: "malloc: couldn't allocate a new input stream\n");
1655: return(NULL);
1656: }
1.51 veillard 1657: memset(input, 0, sizeof(htmlParserInput));
1.31 daniel 1658: input->filename = NULL;
1659: input->directory = NULL;
1660: input->base = NULL;
1661: input->cur = NULL;
1662: input->buf = NULL;
1663: input->line = 1;
1664: input->col = 1;
1665: input->buf = NULL;
1666: input->free = NULL;
1.51 veillard 1667: input->version = NULL;
1.31 daniel 1668: input->consumed = 0;
1669: input->length = 0;
1670: return(input);
1671: }
1672:
1.1 daniel 1673:
1674: /************************************************************************
1675: * *
1676: * Commodity functions, cleanup needed ? *
1677: * *
1678: ************************************************************************/
1679:
1680: /**
1681: * areBlanks:
1682: * @ctxt: an HTML parser context
1.14 daniel 1683: * @str: a xmlChar *
1.1 daniel 1684: * @len: the size of @str
1685: *
1686: * Is this a sequence of blank chars that one can ignore ?
1687: *
1688: * Returns 1 if ignorable 0 otherwise.
1689: */
1690:
1.14 daniel 1691: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1.1 daniel 1692: int i;
1693: xmlNodePtr lastChild;
1694:
1695: for (i = 0;i < len;i++)
1696: if (!(IS_BLANK(str[i]))) return(0);
1697:
1.48 daniel 1698: if (CUR == 0) return(1);
1.1 daniel 1699: if (CUR != '<') return(0);
1.62 veillard 1700: if (ctxt->name == NULL)
1701: return(1);
1.73 veillard 1702: if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1.63 veillard 1703: return(1);
1.73 veillard 1704: if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1.62 veillard 1705: return(1);
1.73 veillard 1706: if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1.62 veillard 1707: return(1);
1.1 daniel 1708: if (ctxt->node == NULL) return(0);
1709: lastChild = xmlGetLastChild(ctxt->node);
1710: if (lastChild == NULL) {
1711: if (ctxt->node->content != NULL) return(0);
1712: } else if (xmlNodeIsText(lastChild))
1713: return(0);
1714: return(1);
1715: }
1716:
1717: /**
1718: * htmlHandleEntity:
1719: * @ctxt: an HTML parser context
1720: * @entity: an XML entity pointer.
1721: *
1722: * Default handling of an HTML entity, call the parser with the
1723: * substitution string
1724: */
1725:
1726: void
1727: htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1728: int len;
1729:
1730: if (entity->content == NULL) {
1731: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1732: ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n",
1733: entity->name);
1734: ctxt->wellFormed = 0;
1735: return;
1736: }
1737: len = xmlStrlen(entity->content);
1738:
1739: /*
1740: * Just handle the content as a set of chars.
1741: */
1.59 veillard 1742: htmlCheckParagraph(ctxt);
1.1 daniel 1743: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1744: ctxt->sax->characters(ctxt->userData, entity->content, len);
1745:
1746: }
1747:
1748: /**
1.59 veillard 1749: * htmlNewDocNoDtD:
1.1 daniel 1750: * @URI: URI for the dtd, or NULL
1751: * @ExternalID: the external ID of the DTD, or NULL
1752: *
1.59 veillard 1753: * Returns a new document, do not intialize the DTD if not provided
1.1 daniel 1754: */
1755: htmlDocPtr
1.59 veillard 1756: htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1.1 daniel 1757: xmlDocPtr cur;
1758:
1759: /*
1760: * Allocate a new document and fill the fields.
1761: */
1.11 daniel 1762: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1.1 daniel 1763: if (cur == NULL) {
1764: fprintf(stderr, "xmlNewDoc : malloc failed\n");
1765: return(NULL);
1766: }
1.10 daniel 1767: memset(cur, 0, sizeof(xmlDoc));
1.1 daniel 1768:
1.20 daniel 1769: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 1770: cur->version = NULL;
1771: cur->intSubset = NULL;
1.59 veillard 1772: if ((ExternalID != NULL) ||
1773: (URI != NULL))
1.28 daniel 1774: xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1.41 daniel 1775: cur->doc = cur;
1.1 daniel 1776: cur->name = NULL;
1.37 daniel 1777: cur->children = NULL;
1.1 daniel 1778: cur->extSubset = NULL;
1779: cur->oldNs = NULL;
1780: cur->encoding = NULL;
1781: cur->standalone = 1;
1782: cur->compression = 0;
1.12 daniel 1783: cur->ids = NULL;
1784: cur->refs = NULL;
1.1 daniel 1785: #ifndef XML_WITHOUT_CORBA
1786: cur->_private = NULL;
1787: #endif
1788: return(cur);
1789: }
1790:
1.59 veillard 1791: /**
1792: * htmlNewDoc:
1793: * @URI: URI for the dtd, or NULL
1794: * @ExternalID: the external ID of the DTD, or NULL
1795: *
1796: * Returns a new document
1797: */
1798: htmlDocPtr
1799: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1800: if ((URI == NULL) && (ExternalID == NULL))
1801: return(htmlNewDocNoDtD(
1802: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
1803: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
1804:
1805: return(htmlNewDocNoDtD(URI, ExternalID));
1806: }
1807:
1.1 daniel 1808:
1809: /************************************************************************
1810: * *
1811: * The parser itself *
1812: * Relates to http://www.w3.org/TR/html40 *
1813: * *
1814: ************************************************************************/
1815:
1816: /************************************************************************
1817: * *
1818: * The parser itself *
1819: * *
1820: ************************************************************************/
1821:
1822: /**
1823: * htmlParseHTMLName:
1824: * @ctxt: an HTML parser context
1825: *
1.26 daniel 1826: * parse an HTML tag or attribute name, note that we convert it to lowercase
1.1 daniel 1827: * since HTML names are not case-sensitive.
1828: *
1829: * Returns the Tag Name parsed or NULL
1830: */
1831:
1.14 daniel 1832: xmlChar *
1.1 daniel 1833: htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1834: xmlChar *ret = NULL;
1.1 daniel 1835: int i = 0;
1.31 daniel 1836: xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1.1 daniel 1837:
1838: if (!IS_LETTER(CUR) && (CUR != '_') &&
1839: (CUR != ':')) return(NULL);
1840:
1.31 daniel 1841: while ((i < HTML_PARSER_BUFFER_SIZE) &&
1.45 daniel 1842: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1.76 veillard 1843: (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1.26 daniel 1844: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1.1 daniel 1845: else loc[i] = CUR;
1846: i++;
1847:
1848: NEXT;
1849: }
1850:
1851: ret = xmlStrndup(loc, i);
1852:
1853: return(ret);
1854: }
1855:
1856: /**
1857: * htmlParseName:
1858: * @ctxt: an HTML parser context
1859: *
1860: * parse an HTML name, this routine is case sensistive.
1861: *
1862: * Returns the Name parsed or NULL
1863: */
1864:
1.14 daniel 1865: xmlChar *
1.1 daniel 1866: htmlParseName(htmlParserCtxtPtr ctxt) {
1.14 daniel 1867: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 1868: int len = 0;
1.1 daniel 1869:
1.5 daniel 1870: GROW;
1871: if (!IS_LETTER(CUR) && (CUR != '_')) {
1872: return(NULL);
1873: }
1.1 daniel 1874:
1875: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1876: (CUR == '.') || (CUR == '-') ||
1877: (CUR == '_') || (CUR == ':') ||
1878: (IS_COMBINING(CUR)) ||
1.5 daniel 1879: (IS_EXTENDER(CUR))) {
1880: buf[len++] = CUR;
1.1 daniel 1881: NEXT;
1.5 daniel 1882: if (len >= HTML_MAX_NAMELEN) {
1883: fprintf(stderr,
1884: "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1885: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1886: (CUR == '.') || (CUR == '-') ||
1887: (CUR == '_') || (CUR == ':') ||
1888: (IS_COMBINING(CUR)) ||
1889: (IS_EXTENDER(CUR)))
1890: NEXT;
1891: break;
1892: }
1893: }
1894: return(xmlStrndup(buf, len));
1.1 daniel 1895: }
1896:
1897: /**
1898: * htmlParseHTMLAttribute:
1899: * @ctxt: an HTML parser context
1.19 daniel 1900: * @stop: a char stop value
1.1 daniel 1901: *
1.19 daniel 1902: * parse an HTML attribute value till the stop (quote), if
1903: * stop is 0 then it stops at the first space
1.1 daniel 1904: *
1.19 daniel 1905: * Returns the attribute parsed or NULL
1.1 daniel 1906: */
1907:
1.14 daniel 1908: xmlChar *
1.19 daniel 1909: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1.32 daniel 1910: xmlChar *buffer = NULL;
1911: int buffer_size = 0;
1912: xmlChar *out = NULL;
1913: xmlChar *name = NULL;
1914:
1915: xmlChar *cur = NULL;
1916: htmlEntityDescPtr ent;
1917:
1918: /*
1919: * allocate a translation buffer.
1920: */
1.77 veillard 1921: buffer_size = HTML_PARSER_BUFFER_SIZE;
1.32 daniel 1922: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1923: if (buffer == NULL) {
1924: perror("htmlParseHTMLAttribute: malloc failed");
1925: return(NULL);
1926: }
1927: out = buffer;
1928:
1929: /*
1930: * Ok loop until we reach one of the ending chars
1931: */
1932: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
1933: if ((stop == 0) && (IS_BLANK(CUR))) break;
1934: if (CUR == '&') {
1935: if (NXT(1) == '#') {
1.52 veillard 1936: unsigned int c;
1937: int bits;
1938:
1939: c = htmlParseCharRef(ctxt);
1940: if (c < 0x80)
1941: { *out++ = c; bits= -6; }
1942: else if (c < 0x800)
1943: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1944: else if (c < 0x10000)
1945: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1946: else
1947: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1948:
1949: for ( ; bits >= 0; bits-= 6) {
1950: *out++ = ((c >> bits) & 0x3F) | 0x80;
1951: }
1.32 daniel 1952: } else {
1953: ent = htmlParseEntityRef(ctxt, &name);
1954: if (name == NULL) {
1955: *out++ = '&';
1956: if (out - buffer > buffer_size - 100) {
1957: int index = out - buffer;
1958:
1959: growBuffer(buffer);
1960: out = &buffer[index];
1961: }
1.52 veillard 1962: } else if (ent == NULL) {
1.32 daniel 1963: *out++ = '&';
1964: cur = name;
1965: while (*cur != 0) {
1966: if (out - buffer > buffer_size - 100) {
1967: int index = out - buffer;
1968:
1969: growBuffer(buffer);
1970: out = &buffer[index];
1971: }
1972: *out++ = *cur++;
1973: }
1974: xmlFree(name);
1975: } else {
1.52 veillard 1976: unsigned int c;
1977: int bits;
1978:
1.32 daniel 1979: if (out - buffer > buffer_size - 100) {
1980: int index = out - buffer;
1981:
1982: growBuffer(buffer);
1983: out = &buffer[index];
1984: }
1.52 veillard 1985: c = (xmlChar)ent->value;
1986: if (c < 0x80)
1987: { *out++ = c; bits= -6; }
1988: else if (c < 0x800)
1989: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1990: else if (c < 0x10000)
1991: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1992: else
1993: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1994:
1995: for ( ; bits >= 0; bits-= 6) {
1996: *out++ = ((c >> bits) & 0x3F) | 0x80;
1997: }
1.32 daniel 1998: xmlFree(name);
1999: }
2000: }
2001: } else {
1.52 veillard 2002: unsigned int c;
1.68 veillard 2003: int bits, l;
1.52 veillard 2004:
1.32 daniel 2005: if (out - buffer > buffer_size - 100) {
1.52 veillard 2006: int index = out - buffer;
2007:
2008: growBuffer(buffer);
2009: out = &buffer[index];
2010: }
1.68 veillard 2011: c = CUR_CHAR(l);
1.52 veillard 2012: if (c < 0x80)
2013: { *out++ = c; bits= -6; }
2014: else if (c < 0x800)
2015: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2016: else if (c < 0x10000)
2017: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2018: else
2019: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2020:
2021: for ( ; bits >= 0; bits-= 6) {
2022: *out++ = ((c >> bits) & 0x3F) | 0x80;
1.32 daniel 2023: }
2024: NEXT;
2025: }
2026: }
2027: *out++ = 0;
2028: return(buffer);
1.1 daniel 2029: }
2030:
2031: /**
2032: * htmlParseNmtoken:
2033: * @ctxt: an HTML parser context
2034: *
2035: * parse an HTML Nmtoken.
2036: *
2037: * Returns the Nmtoken parsed or NULL
2038: */
2039:
1.14 daniel 2040: xmlChar *
1.1 daniel 2041: htmlParseNmtoken(htmlParserCtxtPtr ctxt) {
1.14 daniel 2042: xmlChar buf[HTML_MAX_NAMELEN];
1.5 daniel 2043: int len = 0;
1.1 daniel 2044:
1.5 daniel 2045: GROW;
1.1 daniel 2046: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2047: (CUR == '.') || (CUR == '-') ||
2048: (CUR == '_') || (CUR == ':') ||
2049: (IS_COMBINING(CUR)) ||
1.5 daniel 2050: (IS_EXTENDER(CUR))) {
2051: buf[len++] = CUR;
1.1 daniel 2052: NEXT;
1.5 daniel 2053: if (len >= HTML_MAX_NAMELEN) {
2054: fprintf(stderr,
2055: "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n");
2056: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2057: (CUR == '.') || (CUR == '-') ||
2058: (CUR == '_') || (CUR == ':') ||
2059: (IS_COMBINING(CUR)) ||
2060: (IS_EXTENDER(CUR)))
2061: NEXT;
2062: break;
2063: }
2064: }
2065: return(xmlStrndup(buf, len));
1.1 daniel 2066: }
2067:
2068: /**
2069: * htmlParseEntityRef:
2070: * @ctxt: an HTML parser context
2071: * @str: location to store the entity name
2072: *
2073: * parse an HTML ENTITY references
2074: *
2075: * [68] EntityRef ::= '&' Name ';'
2076: *
2077: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2078: * if non-NULL *str will have to be freed by the caller.
2079: */
2080: htmlEntityDescPtr
1.14 daniel 2081: htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2082: xmlChar *name;
1.1 daniel 2083: htmlEntityDescPtr ent = NULL;
2084: *str = NULL;
2085:
2086: if (CUR == '&') {
2087: NEXT;
2088: name = htmlParseName(ctxt);
2089: if (name == NULL) {
2090: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2091: ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2092: ctxt->wellFormed = 0;
2093: } else {
1.5 daniel 2094: GROW;
1.1 daniel 2095: if (CUR == ';') {
2096: *str = name;
2097:
2098: /*
2099: * Lookup the entity in the table.
2100: */
2101: ent = htmlEntityLookup(name);
1.32 daniel 2102: if (ent != NULL) /* OK that's ugly !!! */
2103: NEXT;
1.1 daniel 2104: } else {
2105: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2106: ctxt->sax->error(ctxt->userData,
2107: "htmlParseEntityRef: expecting ';'\n");
1.32 daniel 2108: *str = name;
1.1 daniel 2109: }
2110: }
2111: }
2112: return(ent);
2113: }
2114:
2115: /**
2116: * htmlParseAttValue:
2117: * @ctxt: an HTML parser context
2118: *
2119: * parse a value for an attribute
2120: * Note: the parser won't do substitution of entities here, this
2121: * will be handled later in xmlStringGetNodeList, unless it was
2122: * asked for ctxt->replaceEntities != 0
2123: *
2124: * Returns the AttValue parsed or NULL.
2125: */
2126:
1.14 daniel 2127: xmlChar *
1.1 daniel 2128: htmlParseAttValue(htmlParserCtxtPtr ctxt) {
1.14 daniel 2129: xmlChar *ret = NULL;
1.1 daniel 2130:
2131: if (CUR == '"') {
2132: NEXT;
1.19 daniel 2133: ret = htmlParseHTMLAttribute(ctxt, '"');
1.1 daniel 2134: if (CUR != '"') {
2135: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2136: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2137: ctxt->wellFormed = 0;
2138: } else
2139: NEXT;
2140: } else if (CUR == '\'') {
2141: NEXT;
1.19 daniel 2142: ret = htmlParseHTMLAttribute(ctxt, '\'');
1.1 daniel 2143: if (CUR != '\'') {
2144: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2145: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2146: ctxt->wellFormed = 0;
2147: } else
2148: NEXT;
2149: } else {
2150: /*
2151: * That's an HTMLism, the attribute value may not be quoted
2152: */
1.19 daniel 2153: ret = htmlParseHTMLAttribute(ctxt, 0);
1.1 daniel 2154: if (ret == NULL) {
2155: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2156: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2157: ctxt->wellFormed = 0;
2158: }
2159: }
2160: return(ret);
2161: }
2162:
2163: /**
2164: * htmlParseSystemLiteral:
2165: * @ctxt: an HTML parser context
2166: *
2167: * parse an HTML Literal
2168: *
2169: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2170: *
2171: * Returns the SystemLiteral parsed or NULL
2172: */
2173:
1.14 daniel 2174: xmlChar *
1.1 daniel 2175: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 2176: const xmlChar *q;
2177: xmlChar *ret = NULL;
1.1 daniel 2178:
2179: if (CUR == '"') {
2180: NEXT;
2181: q = CUR_PTR;
2182: while ((IS_CHAR(CUR)) && (CUR != '"'))
2183: NEXT;
2184: if (!IS_CHAR(CUR)) {
2185: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2186: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2187: ctxt->wellFormed = 0;
2188: } else {
2189: ret = xmlStrndup(q, CUR_PTR - q);
2190: NEXT;
2191: }
2192: } else if (CUR == '\'') {
2193: NEXT;
2194: q = CUR_PTR;
2195: while ((IS_CHAR(CUR)) && (CUR != '\''))
2196: NEXT;
2197: if (!IS_CHAR(CUR)) {
2198: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2199: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2200: ctxt->wellFormed = 0;
2201: } else {
2202: ret = xmlStrndup(q, CUR_PTR - q);
2203: NEXT;
2204: }
2205: } else {
2206: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.38 daniel 2207: ctxt->sax->error(ctxt->userData,
2208: "SystemLiteral \" or ' expected\n");
1.1 daniel 2209: ctxt->wellFormed = 0;
2210: }
2211:
2212: return(ret);
2213: }
2214:
2215: /**
2216: * htmlParsePubidLiteral:
2217: * @ctxt: an HTML parser context
2218: *
2219: * parse an HTML public literal
2220: *
2221: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2222: *
2223: * Returns the PubidLiteral parsed or NULL.
2224: */
2225:
1.14 daniel 2226: xmlChar *
1.1 daniel 2227: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
1.14 daniel 2228: const xmlChar *q;
2229: xmlChar *ret = NULL;
1.1 daniel 2230: /*
2231: * Name ::= (Letter | '_') (NameChar)*
2232: */
2233: if (CUR == '"') {
2234: NEXT;
2235: q = CUR_PTR;
2236: while (IS_PUBIDCHAR(CUR)) NEXT;
2237: if (CUR != '"') {
2238: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2239: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2240: ctxt->wellFormed = 0;
2241: } else {
2242: ret = xmlStrndup(q, CUR_PTR - q);
2243: NEXT;
2244: }
2245: } else if (CUR == '\'') {
2246: NEXT;
2247: q = CUR_PTR;
2248: while ((IS_LETTER(CUR)) && (CUR != '\''))
2249: NEXT;
2250: if (!IS_LETTER(CUR)) {
2251: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2252: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2253: ctxt->wellFormed = 0;
2254: } else {
2255: ret = xmlStrndup(q, CUR_PTR - q);
2256: NEXT;
2257: }
2258: } else {
2259: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2260: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2261: ctxt->wellFormed = 0;
2262: }
2263:
2264: return(ret);
2265: }
2266:
2267: /**
1.77 veillard 2268: * htmlParseScript:
2269: * @ctxt: an HTML parser context
2270: *
2271: * parse the content of an HTML SCRIPT or STYLE element
2272: * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2273: * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2274: * http://www.w3.org/TR/html4/types.html#type-script
2275: * http://www.w3.org/TR/html4/types.html#h-6.15
2276: * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2277: *
2278: * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2279: * element and the value of intrinsic event attributes. User agents must
2280: * not evaluate script data as HTML markup but instead must pass it on as
2281: * data to a script engine.
2282: * NOTES:
2283: * - The content is passed like CDATA
2284: * - the attributes for style and scripting "onXXX" are also described
2285: * as CDATA but SGML allows entities references in attributes so their
2286: * processing is identical as other attributes
2287: */
2288: void
2289: htmlParseScript(htmlParserCtxtPtr ctxt) {
2290: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2291: int nbchar = 0;
2292: xmlChar cur;
2293:
2294: SHRINK;
2295: cur = CUR;
2296: while (IS_CHAR(cur)) {
2297: if ((cur == '<') && (NXT(1) == '/')) {
2298: /*
2299: * One should break here, the specification is clear:
2300: * Authors should therefore escape "</" within the content.
2301: * Escape mechanisms are specific to each scripting or
2302: * style sheet language.
2303: */
2304: if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2305: ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2306: break; /* while */
2307: }
2308: buf[nbchar++] = cur;
2309: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2310: if (ctxt->sax->cdataBlock!= NULL) {
2311: /*
2312: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2313: */
2314: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2315: }
2316: nbchar = 0;
2317: }
2318: NEXT;
2319: cur = CUR;
2320: }
2321: if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2322: if (ctxt->sax->cdataBlock!= NULL) {
2323: /*
2324: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2325: */
2326: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2327: }
2328: }
2329: }
2330:
2331:
2332: /**
1.1 daniel 2333: * htmlParseCharData:
2334: * @ctxt: an HTML parser context
2335: * @cdata: int indicating whether we are within a CDATA section
2336: *
2337: * parse a CharData section.
2338: * if we are within a CDATA section ']]>' marks an end of section.
2339: *
2340: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2341: */
2342:
2343: void
2344: htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
1.53 veillard 2345: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2346: int nbchar = 0;
2347: int cur, l;
2348:
2349: SHRINK;
2350: cur = CUR_CHAR(l);
2351: while (((cur != '<') || (ctxt->token == '<')) &&
2352: ((cur != '&') || (ctxt->token == '&')) &&
2353: (IS_CHAR(cur))) {
2354: COPY_BUF(l,buf,nbchar,cur);
2355: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2356: /*
2357: * Ok the segment is to be consumed as chars.
2358: */
2359: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2360: if (areBlanks(ctxt, buf, nbchar)) {
2361: if (ctxt->sax->ignorableWhitespace != NULL)
2362: ctxt->sax->ignorableWhitespace(ctxt->userData,
2363: buf, nbchar);
2364: } else {
1.59 veillard 2365: htmlCheckParagraph(ctxt);
1.53 veillard 2366: if (ctxt->sax->characters != NULL)
2367: ctxt->sax->characters(ctxt->userData, buf, nbchar);
2368: }
1.1 daniel 2369: }
1.53 veillard 2370: nbchar = 0;
1.1 daniel 2371: }
1.53 veillard 2372: NEXTL(l);
2373: cur = CUR_CHAR(l);
2374: }
2375: if (nbchar != 0) {
2376: /*
2377: * Ok the segment is to be consumed as chars.
2378: */
2379: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2380: if (areBlanks(ctxt, buf, nbchar)) {
2381: if (ctxt->sax->ignorableWhitespace != NULL)
2382: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2383: } else {
1.59 veillard 2384: htmlCheckParagraph(ctxt);
1.53 veillard 2385: if (ctxt->sax->characters != NULL)
2386: ctxt->sax->characters(ctxt->userData, buf, nbchar);
1.25 daniel 2387: }
2388: }
1.1 daniel 2389: }
2390: }
2391:
2392: /**
2393: * htmlParseExternalID:
2394: * @ctxt: an HTML parser context
1.14 daniel 2395: * @publicID: a xmlChar** receiving PubidLiteral
1.1 daniel 2396: * @strict: indicate whether we should restrict parsing to only
2397: * production [75], see NOTE below
2398: *
2399: * Parse an External ID or a Public ID
2400: *
2401: * NOTE: Productions [75] and [83] interract badly since [75] can generate
2402: * 'PUBLIC' S PubidLiteral S SystemLiteral
2403: *
2404: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2405: * | 'PUBLIC' S PubidLiteral S SystemLiteral
2406: *
2407: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2408: *
2409: * Returns the function returns SystemLiteral and in the second
2410: * case publicID receives PubidLiteral, is strict is off
2411: * it is possible to return NULL and have publicID set.
2412: */
2413:
1.14 daniel 2414: xmlChar *
2415: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
2416: xmlChar *URI = NULL;
1.1 daniel 2417:
2418: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2419: (UPP(2) == 'S') && (UPP(3) == 'T') &&
2420: (UPP(4) == 'E') && (UPP(5) == 'M')) {
2421: SKIP(6);
2422: if (!IS_BLANK(CUR)) {
2423: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2424: ctxt->sax->error(ctxt->userData,
2425: "Space required after 'SYSTEM'\n");
2426: ctxt->wellFormed = 0;
2427: }
2428: SKIP_BLANKS;
2429: URI = htmlParseSystemLiteral(ctxt);
2430: if (URI == NULL) {
2431: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2432: ctxt->sax->error(ctxt->userData,
2433: "htmlParseExternalID: SYSTEM, no URI\n");
2434: ctxt->wellFormed = 0;
2435: }
2436: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2437: (UPP(2) == 'B') && (UPP(3) == 'L') &&
2438: (UPP(4) == 'I') && (UPP(5) == 'C')) {
2439: SKIP(6);
2440: if (!IS_BLANK(CUR)) {
2441: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2442: ctxt->sax->error(ctxt->userData,
2443: "Space required after 'PUBLIC'\n");
2444: ctxt->wellFormed = 0;
2445: }
2446: SKIP_BLANKS;
2447: *publicID = htmlParsePubidLiteral(ctxt);
2448: if (*publicID == NULL) {
2449: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2450: ctxt->sax->error(ctxt->userData,
2451: "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2452: ctxt->wellFormed = 0;
2453: }
1.5 daniel 2454: SKIP_BLANKS;
2455: if ((CUR == '"') || (CUR == '\'')) {
2456: URI = htmlParseSystemLiteral(ctxt);
1.1 daniel 2457: }
2458: }
2459: return(URI);
2460: }
2461:
2462: /**
2463: * htmlParseComment:
2464: * @ctxt: an HTML parser context
2465: *
2466: * Parse an XML (SGML) comment <!-- .... -->
2467: *
2468: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2469: */
2470: void
1.31 daniel 2471: htmlParseComment(htmlParserCtxtPtr ctxt) {
1.25 daniel 2472: xmlChar *buf = NULL;
1.56 veillard 2473: int len;
1.31 daniel 2474: int size = HTML_PARSER_BUFFER_SIZE;
1.56 veillard 2475: int q, ql;
2476: int r, rl;
2477: int cur, l;
2478: xmlParserInputState state;
1.1 daniel 2479:
2480: /*
2481: * Check that there is a comment right here.
2482: */
1.56 veillard 2483: if ((RAW != '<') || (NXT(1) != '!') ||
1.1 daniel 2484: (NXT(2) != '-') || (NXT(3) != '-')) return;
2485:
1.56 veillard 2486: state = ctxt->instate;
2487: ctxt->instate = XML_PARSER_COMMENT;
2488: SHRINK;
2489: SKIP(4);
1.25 daniel 2490: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2491: if (buf == NULL) {
2492: fprintf(stderr, "malloc of %d byte failed\n", size);
1.56 veillard 2493: ctxt->instate = state;
1.25 daniel 2494: return;
2495: }
1.56 veillard 2496: q = CUR_CHAR(ql);
2497: NEXTL(ql);
2498: r = CUR_CHAR(rl);
2499: NEXTL(rl);
2500: cur = CUR_CHAR(l);
2501: len = 0;
2502: while (IS_CHAR(cur) &&
2503: ((cur != '>') ||
2504: (r != '-') || (q != '-'))) {
2505: if (len + 5 >= size) {
1.25 daniel 2506: size *= 2;
1.50 veillard 2507: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
1.25 daniel 2508: if (buf == NULL) {
2509: fprintf(stderr, "realloc of %d byte failed\n", size);
1.56 veillard 2510: ctxt->instate = state;
1.25 daniel 2511: return;
2512: }
2513: }
1.56 veillard 2514: COPY_BUF(ql,buf,len,q);
1.25 daniel 2515: q = r;
1.56 veillard 2516: ql = rl;
2517: r = cur;
2518: rl = l;
2519: NEXTL(l);
2520: cur = CUR_CHAR(l);
2521: if (cur == 0) {
2522: SHRINK;
2523: GROW;
2524: cur = CUR_CHAR(l);
2525: }
1.1 daniel 2526: }
1.56 veillard 2527: buf[len] = 0;
2528: if (!IS_CHAR(cur)) {
1.67 veillard 2529: ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
1.1 daniel 2530: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.56 veillard 2531: ctxt->sax->error(ctxt->userData,
2532: "Comment not terminated \n<!--%.50s\n", buf);
1.1 daniel 2533: ctxt->wellFormed = 0;
1.56 veillard 2534: xmlFree(buf);
1.1 daniel 2535: } else {
2536: NEXT;
1.56 veillard 2537: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2538: (!ctxt->disableSAX))
1.31 daniel 2539: ctxt->sax->comment(ctxt->userData, buf);
1.56 veillard 2540: xmlFree(buf);
1.1 daniel 2541: }
1.56 veillard 2542: ctxt->instate = state;
1.1 daniel 2543: }
2544:
2545: /**
2546: * htmlParseCharRef:
2547: * @ctxt: an HTML parser context
2548: *
2549: * parse Reference declarations
2550: *
2551: * [66] CharRef ::= '&#' [0-9]+ ';' |
2552: * '&#x' [0-9a-fA-F]+ ';'
2553: *
2554: * Returns the value parsed (as an int)
2555: */
2556: int
2557: htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2558: int val = 0;
2559:
2560: if ((CUR == '&') && (NXT(1) == '#') &&
2561: (NXT(2) == 'x')) {
2562: SKIP(3);
2563: while (CUR != ';') {
2564: if ((CUR >= '0') && (CUR <= '9'))
2565: val = val * 16 + (CUR - '0');
2566: else if ((CUR >= 'a') && (CUR <= 'f'))
2567: val = val * 16 + (CUR - 'a') + 10;
2568: else if ((CUR >= 'A') && (CUR <= 'F'))
2569: val = val * 16 + (CUR - 'A') + 10;
2570: else {
2571: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2572: ctxt->sax->error(ctxt->userData,
2573: "htmlParseCharRef: invalid hexadecimal value\n");
2574: ctxt->wellFormed = 0;
2575: val = 0;
2576: break;
2577: }
2578: NEXT;
2579: }
2580: if (CUR == ';')
2581: NEXT;
2582: } else if ((CUR == '&') && (NXT(1) == '#')) {
2583: SKIP(2);
2584: while (CUR != ';') {
2585: if ((CUR >= '0') && (CUR <= '9'))
2586: val = val * 10 + (CUR - '0');
2587: else {
2588: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2589: ctxt->sax->error(ctxt->userData,
2590: "htmlParseCharRef: invalid decimal value\n");
2591: ctxt->wellFormed = 0;
2592: val = 0;
2593: break;
2594: }
2595: NEXT;
2596: }
2597: if (CUR == ';')
2598: NEXT;
2599: } else {
2600: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2601: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2602: ctxt->wellFormed = 0;
2603: }
2604: /*
2605: * Check the value IS_CHAR ...
2606: */
2607: if (IS_CHAR(val)) {
2608: return(val);
2609: } else {
2610: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.14 daniel 2611: ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
1.1 daniel 2612: val);
2613: ctxt->wellFormed = 0;
2614: }
2615: return(0);
2616: }
2617:
2618:
2619: /**
2620: * htmlParseDocTypeDecl :
2621: * @ctxt: an HTML parser context
2622: *
2623: * parse a DOCTYPE declaration
2624: *
2625: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2626: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2627: */
2628:
2629: void
2630: htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
1.14 daniel 2631: xmlChar *name;
2632: xmlChar *ExternalID = NULL;
2633: xmlChar *URI = NULL;
1.1 daniel 2634:
2635: /*
2636: * We know that '<!DOCTYPE' has been detected.
2637: */
2638: SKIP(9);
2639:
2640: SKIP_BLANKS;
2641:
2642: /*
2643: * Parse the DOCTYPE name.
2644: */
2645: name = htmlParseName(ctxt);
2646: if (name == NULL) {
2647: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2648: ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2649: ctxt->wellFormed = 0;
2650: }
2651: /*
2652: * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2653: */
2654:
2655: SKIP_BLANKS;
2656:
2657: /*
2658: * Check for SystemID and ExternalID
2659: */
1.5 daniel 2660: URI = htmlParseExternalID(ctxt, &ExternalID, 0);
1.1 daniel 2661: SKIP_BLANKS;
2662:
2663: /*
2664: * We should be at the end of the DOCTYPE declaration.
2665: */
2666: if (CUR != '>') {
2667: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2668: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
2669: ctxt->wellFormed = 0;
2670: /* We shouldn't try to resynchronize ... */
2671: }
2672: NEXT;
2673:
2674: /*
1.46 daniel 2675: * Create or update the document accordingly to the DOCTYPE
1.1 daniel 2676: */
1.46 daniel 2677: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2678: (!ctxt->disableSAX))
2679: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
1.1 daniel 2680:
2681: /*
2682: * Cleanup, since we don't use all those identifiers
2683: */
1.11 daniel 2684: if (URI != NULL) xmlFree(URI);
2685: if (ExternalID != NULL) xmlFree(ExternalID);
2686: if (name != NULL) xmlFree(name);
1.1 daniel 2687: }
2688:
2689: /**
2690: * htmlParseAttribute:
2691: * @ctxt: an HTML parser context
1.14 daniel 2692: * @value: a xmlChar ** used to store the value of the attribute
1.1 daniel 2693: *
2694: * parse an attribute
2695: *
2696: * [41] Attribute ::= Name Eq AttValue
2697: *
2698: * [25] Eq ::= S? '=' S?
2699: *
2700: * With namespace:
2701: *
2702: * [NS 11] Attribute ::= QName Eq AttValue
2703: *
2704: * Also the case QName == xmlns:??? is handled independently as a namespace
2705: * definition.
2706: *
2707: * Returns the attribute name, and the value in *value.
2708: */
2709:
1.14 daniel 2710: xmlChar *
2711: htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
1.31 daniel 2712: xmlChar *name, *val = NULL;
1.1 daniel 2713:
2714: *value = NULL;
1.74 veillard 2715: name = htmlParseHTMLName(ctxt);
1.1 daniel 2716: if (name == NULL) {
2717: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2718: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2719: ctxt->wellFormed = 0;
2720: return(NULL);
2721: }
2722:
2723: /*
2724: * read the value
2725: */
2726: SKIP_BLANKS;
2727: if (CUR == '=') {
2728: NEXT;
2729: SKIP_BLANKS;
2730: val = htmlParseAttValue(ctxt);
1.42 daniel 2731: /******
1.1 daniel 2732: } else {
1.42 daniel 2733: * TODO : some attribute must have values, some may not
1.1 daniel 2734: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.31 daniel 2735: ctxt->sax->warning(ctxt->userData,
1.42 daniel 2736: "No value for attribute %s\n", name); */
1.1 daniel 2737: }
2738:
2739: *value = val;
2740: return(name);
2741: }
2742:
2743: /**
1.47 daniel 2744: * htmlCheckEncoding:
2745: * @ctxt: an HTML parser context
2746: * @attvalue: the attribute value
2747: *
2748: * Checks an http-equiv attribute from a Meta tag to detect
2749: * the encoding
2750: * If a new encoding is detected the parser is switched to decode
2751: * it and pass UTF8
2752: */
2753: void
2754: htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2755: const xmlChar *encoding;
2756:
2757: if ((ctxt == NULL) || (attvalue == NULL))
2758: return;
2759:
1.69 veillard 2760: encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
1.47 daniel 2761: if (encoding != NULL) {
2762: encoding += 8;
2763: } else {
1.69 veillard 2764: encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
1.47 daniel 2765: if (encoding != NULL)
2766: encoding += 9;
2767: }
2768: if (encoding != NULL) {
2769: xmlCharEncoding enc;
2770: xmlCharEncodingHandlerPtr handler;
2771:
2772: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2773:
2774: if (ctxt->input->encoding != NULL)
2775: xmlFree((xmlChar *) ctxt->input->encoding);
2776: ctxt->input->encoding = xmlStrdup(encoding);
2777:
2778: enc = xmlParseCharEncoding((const char *) encoding);
2779: /*
2780: * registered set of known encodings
2781: */
2782: if (enc != XML_CHAR_ENCODING_ERROR) {
2783: xmlSwitchEncoding(ctxt, enc);
1.53 veillard 2784: ctxt->charset = XML_CHAR_ENCODING_UTF8;
1.47 daniel 2785: } else {
2786: /*
2787: * fallback for unknown encodings
2788: */
2789: handler = xmlFindCharEncodingHandler((const char *) encoding);
2790: if (handler != NULL) {
2791: xmlSwitchToEncoding(ctxt, handler);
1.54 veillard 2792: ctxt->charset = XML_CHAR_ENCODING_UTF8;
1.47 daniel 2793: } else {
2794: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2795: }
2796: }
1.54 veillard 2797:
2798: if ((ctxt->input->buf != NULL) &&
2799: (ctxt->input->buf->encoder != NULL) &&
2800: (ctxt->input->buf->raw != NULL) &&
2801: (ctxt->input->buf->buffer != NULL)) {
2802: int nbchars;
1.56 veillard 2803: int processed;
1.54 veillard 2804:
2805: /*
2806: * convert as much as possible to the parser reading buffer.
2807: */
1.56 veillard 2808: processed = ctxt->input->cur - ctxt->input->base;
2809: xmlBufferShrink(ctxt->input->buf->buffer, processed);
1.54 veillard 2810: nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2811: ctxt->input->buf->buffer,
2812: ctxt->input->buf->raw);
2813: if (nbchars < 0) {
1.67 veillard 2814: ctxt->errNo = XML_ERR_INVALID_ENCODING;
1.54 veillard 2815: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2816: ctxt->sax->error(ctxt->userData,
2817: "htmlCheckEncoding: encoder error\n");
2818: }
1.56 veillard 2819: ctxt->input->base =
2820: ctxt->input->cur = ctxt->input->buf->buffer->content;
1.54 veillard 2821: }
1.47 daniel 2822: }
2823: }
2824:
2825: /**
2826: * htmlCheckMeta:
2827: * @ctxt: an HTML parser context
2828: * @atts: the attributes values
2829: *
2830: * Checks an attributes from a Meta tag
2831: */
2832: void
2833: htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2834: int i;
2835: const xmlChar *att, *value;
2836: int http = 0;
2837: const xmlChar *content = NULL;
2838:
2839: if ((ctxt == NULL) || (atts == NULL))
2840: return;
2841:
2842: i = 0;
2843: att = atts[i++];
2844: while (att != NULL) {
2845: value = atts[i++];
1.69 veillard 2846: if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2847: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.47 daniel 2848: http = 1;
1.69 veillard 2849: else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
1.47 daniel 2850: content = value;
2851: att = atts[i++];
2852: }
2853: if ((http) && (content != NULL))
2854: htmlCheckEncoding(ctxt, content);
2855:
2856: }
2857:
2858: /**
1.1 daniel 2859: * htmlParseStartTag:
2860: * @ctxt: an HTML parser context
2861: *
2862: * parse a start of tag either for rule element or
2863: * EmptyElement. In both case we don't parse the tag closing chars.
2864: *
2865: * [40] STag ::= '<' Name (S Attribute)* S? '>'
2866: *
2867: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2868: *
2869: * With namespace:
2870: *
2871: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2872: *
2873: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2874: *
2875: */
2876:
1.18 daniel 2877: void
1.1 daniel 2878: htmlParseStartTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 2879: xmlChar *name;
2880: xmlChar *attname;
2881: xmlChar *attvalue;
2882: const xmlChar **atts = NULL;
1.1 daniel 2883: int nbatts = 0;
2884: int maxatts = 0;
1.47 daniel 2885: int meta = 0;
1.1 daniel 2886: int i;
2887:
1.18 daniel 2888: if (CUR != '<') return;
1.1 daniel 2889: NEXT;
2890:
1.19 daniel 2891: GROW;
1.1 daniel 2892: name = htmlParseHTMLName(ctxt);
2893: if (name == NULL) {
2894: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2895: ctxt->sax->error(ctxt->userData,
2896: "htmlParseStartTag: invalid element name\n");
2897: ctxt->wellFormed = 0;
1.18 daniel 2898: return;
1.1 daniel 2899: }
1.73 veillard 2900: if (xmlStrEqual(name, BAD_CAST"meta"))
1.47 daniel 2901: meta = 1;
1.1 daniel 2902:
2903: /*
2904: * Check for auto-closure of HTML elements.
2905: */
2906: htmlAutoClose(ctxt, name);
1.43 daniel 2907:
2908: /*
2909: * Check for implied HTML elements.
2910: */
2911: htmlCheckImplied(ctxt, name);
1.1 daniel 2912:
2913: /*
2914: * Now parse the attributes, it ends up with the ending
2915: *
2916: * (S Attribute)* S?
2917: */
2918: SKIP_BLANKS;
2919: while ((IS_CHAR(CUR)) &&
2920: (CUR != '>') &&
2921: ((CUR != '/') || (NXT(1) != '>'))) {
1.26 daniel 2922: long cons = ctxt->nbChars;
1.1 daniel 2923:
1.19 daniel 2924: GROW;
1.1 daniel 2925: attname = htmlParseAttribute(ctxt, &attvalue);
1.31 daniel 2926: if (attname != NULL) {
1.47 daniel 2927:
1.1 daniel 2928: /*
2929: * Well formedness requires at most one declaration of an attribute
2930: */
2931: for (i = 0; i < nbatts;i += 2) {
1.73 veillard 2932: if (xmlStrEqual(atts[i], attname)) {
1.1 daniel 2933: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.19 daniel 2934: ctxt->sax->error(ctxt->userData,
2935: "Attribute %s redefined\n",
2936: attname);
1.1 daniel 2937: ctxt->wellFormed = 0;
1.11 daniel 2938: xmlFree(attname);
1.31 daniel 2939: if (attvalue != NULL)
2940: xmlFree(attvalue);
1.19 daniel 2941: goto failed;
1.1 daniel 2942: }
2943: }
2944:
2945: /*
2946: * Add the pair to atts
2947: */
2948: if (atts == NULL) {
2949: maxatts = 10;
1.14 daniel 2950: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
1.1 daniel 2951: if (atts == NULL) {
2952: fprintf(stderr, "malloc of %ld byte failed\n",
1.14 daniel 2953: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2954: if (name != NULL) xmlFree(name);
2955: return;
1.1 daniel 2956: }
1.23 daniel 2957: } else if (nbatts + 4 > maxatts) {
1.1 daniel 2958: maxatts *= 2;
1.71 veillard 2959: atts = (const xmlChar **) xmlRealloc((void *) atts,
2960: maxatts * sizeof(xmlChar *));
1.1 daniel 2961: if (atts == NULL) {
2962: fprintf(stderr, "realloc of %ld byte failed\n",
1.14 daniel 2963: maxatts * (long)sizeof(xmlChar *));
1.18 daniel 2964: if (name != NULL) xmlFree(name);
2965: return;
1.1 daniel 2966: }
2967: }
2968: atts[nbatts++] = attname;
2969: atts[nbatts++] = attvalue;
2970: atts[nbatts] = NULL;
2971: atts[nbatts + 1] = NULL;
2972: }
2973:
1.19 daniel 2974: failed:
1.1 daniel 2975: SKIP_BLANKS;
1.26 daniel 2976: if (cons == ctxt->nbChars) {
1.1 daniel 2977: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2978: ctxt->sax->error(ctxt->userData,
2979: "htmlParseStartTag: problem parsing attributes\n");
2980: ctxt->wellFormed = 0;
2981: break;
2982: }
2983: }
2984:
2985: /*
1.47 daniel 2986: * Handle specific association to the META tag
2987: */
2988: if (meta)
2989: htmlCheckMeta(ctxt, atts);
2990:
2991: /*
1.1 daniel 2992: * SAX: Start of Element !
2993: */
1.15 daniel 2994: htmlnamePush(ctxt, xmlStrdup(name));
1.18 daniel 2995: #ifdef DEBUG
2996: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
2997: #endif
1.1 daniel 2998: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
2999: ctxt->sax->startElement(ctxt->userData, name, atts);
3000:
3001: if (atts != NULL) {
1.31 daniel 3002: for (i = 0;i < nbatts;i++) {
3003: if (atts[i] != NULL)
3004: xmlFree((xmlChar *) atts[i]);
3005: }
1.45 daniel 3006: xmlFree((void *) atts);
1.1 daniel 3007: }
1.18 daniel 3008: if (name != NULL) xmlFree(name);
1.1 daniel 3009: }
3010:
3011: /**
3012: * htmlParseEndTag:
3013: * @ctxt: an HTML parser context
3014: *
3015: * parse an end of tag
3016: *
3017: * [42] ETag ::= '</' Name S? '>'
3018: *
3019: * With namespace
3020: *
3021: * [NS 9] ETag ::= '</' QName S? '>'
3022: */
3023:
3024: void
1.18 daniel 3025: htmlParseEndTag(htmlParserCtxtPtr ctxt) {
1.14 daniel 3026: xmlChar *name;
1.15 daniel 3027: xmlChar *oldname;
1.1 daniel 3028: int i;
3029:
3030: if ((CUR != '<') || (NXT(1) != '/')) {
3031: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3032: ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3033: ctxt->wellFormed = 0;
3034: return;
3035: }
3036: SKIP(2);
3037:
3038: name = htmlParseHTMLName(ctxt);
1.24 daniel 3039: if (name == NULL) return;
1.1 daniel 3040:
3041: /*
3042: * We should definitely be at the ending "S? '>'" part
3043: */
3044: SKIP_BLANKS;
3045: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3046: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3047: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3048: ctxt->wellFormed = 0;
3049: } else
3050: NEXT;
3051:
3052: /*
1.18 daniel 3053: * If the name read is not one of the element in the parsing stack
3054: * then return, it's just an error.
1.1 daniel 3055: */
1.18 daniel 3056: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1.73 veillard 3057: if (xmlStrEqual(name, ctxt->nameTab[i])) break;
1.1 daniel 3058: }
3059: if (i < 0) {
3060: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.18 daniel 3061: ctxt->sax->error(ctxt->userData,
3062: "Unexpected end tag : %s\n", name);
1.11 daniel 3063: xmlFree(name);
1.1 daniel 3064: ctxt->wellFormed = 0;
3065: return;
3066: }
3067:
1.18 daniel 3068:
1.1 daniel 3069: /*
3070: * Check for auto-closure of HTML elements.
3071: */
1.18 daniel 3072:
1.1 daniel 3073: htmlAutoCloseOnClose(ctxt, name);
3074:
3075: /*
3076: * Well formedness constraints, opening and closing must match.
3077: * With the exception that the autoclose may have popped stuff out
3078: * of the stack.
3079: */
1.73 veillard 3080: if (!xmlStrEqual(name, ctxt->name)) {
1.18 daniel 3081: #ifdef DEBUG
3082: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3083: #endif
1.15 daniel 3084: if ((ctxt->name != NULL) &&
1.73 veillard 3085: (!xmlStrEqual(ctxt->name, name))) {
1.1 daniel 3086: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3087: ctxt->sax->error(ctxt->userData,
3088: "Opening and ending tag mismatch: %s and %s\n",
1.15 daniel 3089: name, ctxt->name);
1.1 daniel 3090: ctxt->wellFormed = 0;
3091: }
3092: }
3093:
3094: /*
3095: * SAX: End of Tag
3096: */
1.15 daniel 3097: oldname = ctxt->name;
1.73 veillard 3098: if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
1.18 daniel 3099: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3100: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 3101: oldname = htmlnamePop(ctxt);
1.18 daniel 3102: if (oldname != NULL) {
3103: #ifdef DEBUG
3104: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
3105: #endif
3106: xmlFree(oldname);
3107: #ifdef DEBUG
3108: } else {
3109: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
3110: #endif
3111: }
3112: }
1.1 daniel 3113:
3114: if (name != NULL)
1.11 daniel 3115: xmlFree(name);
1.1 daniel 3116:
3117: return;
3118: }
3119:
3120:
3121: /**
3122: * htmlParseReference:
3123: * @ctxt: an HTML parser context
3124: *
3125: * parse and handle entity references in content,
3126: * this will end-up in a call to character() since this is either a
3127: * CharRef, or a predefined entity.
3128: */
3129: void
3130: htmlParseReference(htmlParserCtxtPtr ctxt) {
3131: htmlEntityDescPtr ent;
1.52 veillard 3132: xmlChar out[6];
1.14 daniel 3133: xmlChar *name;
1.1 daniel 3134: if (CUR != '&') return;
3135:
3136: if (NXT(1) == '#') {
1.52 veillard 3137: unsigned int c;
3138: int bits, i = 0;
3139:
3140: c = htmlParseCharRef(ctxt);
3141: if (c < 0x80) { out[i++]= c; bits= -6; }
3142: else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3143: else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3144: else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3145:
3146: for ( ; bits >= 0; bits-= 6) {
3147: out[i++]= ((c >> bits) & 0x3F) | 0x80;
3148: }
3149: out[i] = 0;
3150:
1.59 veillard 3151: htmlCheckParagraph(ctxt);
1.1 daniel 3152: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.52 veillard 3153: ctxt->sax->characters(ctxt->userData, out, i);
1.1 daniel 3154: } else {
3155: ent = htmlParseEntityRef(ctxt, &name);
1.32 daniel 3156: if (name == NULL) {
1.59 veillard 3157: htmlCheckParagraph(ctxt);
1.58 veillard 3158: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3159: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.32 daniel 3160: return;
3161: }
1.52 veillard 3162: if ((ent == NULL) || (ent->value <= 0)) {
1.59 veillard 3163: htmlCheckParagraph(ctxt);
1.1 daniel 3164: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
1.8 daniel 3165: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
1.1 daniel 3166: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
1.32 daniel 3167: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
1.1 daniel 3168: }
3169: } else {
1.52 veillard 3170: unsigned int c;
3171: int bits, i = 0;
3172:
3173: c = ent->value;
3174: if (c < 0x80)
3175: { out[i++]= c; bits= -6; }
3176: else if (c < 0x800)
3177: { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3178: else if (c < 0x10000)
3179: { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3180: else
3181: { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3182:
3183: for ( ; bits >= 0; bits-= 6) {
3184: out[i++]= ((c >> bits) & 0x3F) | 0x80;
3185: }
3186: out[i] = 0;
3187:
1.59 veillard 3188: htmlCheckParagraph(ctxt);
1.1 daniel 3189: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.52 veillard 3190: ctxt->sax->characters(ctxt->userData, out, i);
1.1 daniel 3191: }
1.11 daniel 3192: xmlFree(name);
1.1 daniel 3193: }
3194: }
3195:
3196: /**
3197: * htmlParseContent:
3198: * @ctxt: an HTML parser context
3199: * @name: the node name
3200: *
3201: * Parse a content: comment, sub-element, reference or text.
3202: *
3203: */
3204:
3205: void
1.18 daniel 3206: htmlParseContent(htmlParserCtxtPtr ctxt) {
1.15 daniel 3207: xmlChar *currentNode;
1.18 daniel 3208: int depth;
1.1 daniel 3209:
1.26 daniel 3210: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 3211: depth = ctxt->nameNr;
3212: while (1) {
1.26 daniel 3213: long cons = ctxt->nbChars;
1.1 daniel 3214:
1.18 daniel 3215: GROW;
3216: /*
3217: * Our tag or one of it's parent or children is ending.
3218: */
3219: if ((CUR == '<') && (NXT(1) == '/')) {
3220: htmlParseEndTag(ctxt);
1.26 daniel 3221: if (currentNode != NULL) xmlFree(currentNode);
1.18 daniel 3222: return;
3223: }
3224:
3225: /*
3226: * Has this node been popped out during parsing of
3227: * the next element
3228: */
1.73 veillard 3229: if ((!xmlStrEqual(currentNode, ctxt->name)) &&
1.26 daniel 3230: (depth >= ctxt->nameNr)) {
3231: if (currentNode != NULL) xmlFree(currentNode);
3232: return;
3233: }
1.18 daniel 3234:
1.77 veillard 3235: if ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3236: (xmlStrEqual(currentNode, BAD_CAST"style"))) {
3237: /*
3238: * Handle SCRIPT/STYLE separately
3239: */
3240: htmlParseScript(ctxt);
3241: } else {
3242: /*
3243: * Sometimes DOCTYPE arrives in the middle of the document
3244: */
3245: if ((CUR == '<') && (NXT(1) == '!') &&
3246: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3247: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3248: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3249: (UPP(8) == 'E')) {
3250: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3251: ctxt->sax->error(ctxt->userData,
3252: "Misplaced DOCTYPE declaration\n");
3253: ctxt->wellFormed = 0;
3254: htmlParseDocTypeDecl(ctxt);
3255: }
1.59 veillard 3256:
1.77 veillard 3257: /*
3258: * First case : a comment
3259: */
3260: if ((CUR == '<') && (NXT(1) == '!') &&
3261: (NXT(2) == '-') && (NXT(3) == '-')) {
3262: htmlParseComment(ctxt);
3263: }
1.1 daniel 3264:
1.77 veillard 3265: /*
3266: * Second case : a sub-element.
3267: */
3268: else if (CUR == '<') {
3269: htmlParseElement(ctxt);
3270: }
1.1 daniel 3271:
1.77 veillard 3272: /*
3273: * Third case : a reference. If if has not been resolved,
3274: * parsing returns it's Name, create the node
3275: */
3276: else if (CUR == '&') {
3277: htmlParseReference(ctxt);
3278: }
1.1 daniel 3279:
1.77 veillard 3280: /*
3281: * Fourth : end of the resource
3282: */
3283: else if (CUR == 0) {
3284: htmlAutoClose(ctxt, NULL);
3285: }
1.47 daniel 3286:
1.77 veillard 3287: /*
3288: * Last case, text. Note that References are handled directly.
3289: */
3290: else {
3291: htmlParseCharData(ctxt, 0);
3292: }
1.1 daniel 3293:
1.77 veillard 3294: if (cons == ctxt->nbChars) {
3295: if (ctxt->node != NULL) {
3296: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3297: ctxt->sax->error(ctxt->userData,
3298: "detected an error in element content\n");
3299: ctxt->wellFormed = 0;
3300: }
3301: break;
1.22 daniel 3302: }
1.1 daniel 3303: }
1.5 daniel 3304: GROW;
1.1 daniel 3305: }
1.26 daniel 3306: if (currentNode != NULL) xmlFree(currentNode);
1.1 daniel 3307: }
3308:
3309: /**
3310: * htmlParseElement:
3311: * @ctxt: an HTML parser context
3312: *
3313: * parse an HTML element, this is highly recursive
3314: *
3315: * [39] element ::= EmptyElemTag | STag content ETag
3316: *
3317: * [41] Attribute ::= Name Eq AttValue
3318: */
3319:
3320: void
3321: htmlParseElement(htmlParserCtxtPtr ctxt) {
1.14 daniel 3322: xmlChar *name;
1.16 daniel 3323: xmlChar *currentNode = NULL;
1.1 daniel 3324: htmlElemDescPtr info;
1.10 daniel 3325: htmlParserNodeInfo node_info;
1.31 daniel 3326: xmlChar *oldname;
1.18 daniel 3327: int depth = ctxt->nameNr;
1.1 daniel 3328:
3329: /* Capture start position */
1.10 daniel 3330: if (ctxt->record_info) {
3331: node_info.begin_pos = ctxt->input->consumed +
3332: (CUR_PTR - ctxt->input->base);
3333: node_info.begin_line = ctxt->input->line;
3334: }
1.1 daniel 3335:
1.26 daniel 3336: oldname = xmlStrdup(ctxt->name);
1.18 daniel 3337: htmlParseStartTag(ctxt);
3338: name = ctxt->name;
1.19 daniel 3339: #ifdef DEBUG
3340: if (oldname == NULL)
3341: fprintf(stderr, "Start of element %s\n", name);
3342: else if (name == NULL)
3343: fprintf(stderr, "Start of element failed, was %s\n", oldname);
3344: else
3345: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
3346: #endif
1.73 veillard 3347: if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
1.18 daniel 3348: (name == NULL)) {
1.19 daniel 3349: if (CUR == '>')
3350: NEXT;
1.26 daniel 3351: if (oldname != NULL)
3352: xmlFree(oldname);
1.1 daniel 3353: return;
3354: }
1.26 daniel 3355: if (oldname != NULL)
3356: xmlFree(oldname);
1.1 daniel 3357:
3358: /*
3359: * Lookup the info for that element.
3360: */
3361: info = htmlTagLookup(name);
3362: if (info == NULL) {
3363: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3364: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3365: name);
3366: ctxt->wellFormed = 0;
3367: } else if (info->depr) {
3368: /***************************
3369: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3370: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3371: name);
3372: ***************************/
3373: }
3374:
3375: /*
3376: * Check for an Empty Element labelled the XML/SGML way
3377: */
3378: if ((CUR == '/') && (NXT(1) == '>')) {
3379: SKIP(2);
3380: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3381: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 3382: oldname = htmlnamePop(ctxt);
1.18 daniel 3383: #ifdef DEBUG
3384: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
3385: #endif
1.17 daniel 3386: if (oldname != NULL)
3387: xmlFree(oldname);
1.1 daniel 3388: return;
3389: }
3390:
1.5 daniel 3391: if (CUR == '>') {
3392: NEXT;
3393: } else {
1.1 daniel 3394: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.56 veillard 3395: ctxt->sax->error(ctxt->userData,
3396: "Couldn't find end of Start Tag %s\n",
3397: name);
1.1 daniel 3398: ctxt->wellFormed = 0;
3399:
3400: /*
3401: * end of parsing of this node.
3402: */
1.73 veillard 3403: if (xmlStrEqual(name, ctxt->name)) {
1.18 daniel 3404: nodePop(ctxt);
1.24 daniel 3405: oldname = htmlnamePop(ctxt);
1.18 daniel 3406: #ifdef DEBUG
3407: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
3408: #endif
3409: if (oldname != NULL)
3410: xmlFree(oldname);
3411: }
1.10 daniel 3412:
3413: /*
3414: * Capture end position and add node
3415: */
3416: if ( currentNode != NULL && ctxt->record_info ) {
3417: node_info.end_pos = ctxt->input->consumed +
3418: (CUR_PTR - ctxt->input->base);
3419: node_info.end_line = ctxt->input->line;
1.15 daniel 3420: node_info.node = ctxt->node;
1.10 daniel 3421: xmlParserAddNodeInfo(ctxt, &node_info);
3422: }
1.1 daniel 3423: return;
3424: }
3425:
3426: /*
3427: * Check for an Empty Element from DTD definition
3428: */
3429: if ((info != NULL) && (info->empty)) {
3430: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3431: ctxt->sax->endElement(ctxt->userData, name);
1.24 daniel 3432: oldname = htmlnamePop(ctxt);
1.18 daniel 3433: #ifdef DEBUG
3434: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
3435: #endif
1.17 daniel 3436: if (oldname != NULL)
3437: xmlFree(oldname);
1.1 daniel 3438: return;
3439: }
3440:
3441: /*
3442: * Parse the content of the element:
3443: */
1.26 daniel 3444: currentNode = xmlStrdup(ctxt->name);
1.18 daniel 3445: depth = ctxt->nameNr;
3446: while (IS_CHAR(CUR)) {
3447: htmlParseContent(ctxt);
3448: if (ctxt->nameNr < depth) break;
3449: }
1.1 daniel 3450:
3451: if (!IS_CHAR(CUR)) {
1.49 daniel 3452: /************
1.1 daniel 3453: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3454: ctxt->sax->error(ctxt->userData,
1.18 daniel 3455: "Premature end of data in tag %s\n", currentNode);
1.1 daniel 3456: ctxt->wellFormed = 0;
1.49 daniel 3457: *************/
1.1 daniel 3458:
3459: /*
3460: * end of parsing of this node.
3461: */
3462: nodePop(ctxt);
1.24 daniel 3463: oldname = htmlnamePop(ctxt);
1.18 daniel 3464: #ifdef DEBUG
3465: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
3466: #endif
1.17 daniel 3467: if (oldname != NULL)
3468: xmlFree(oldname);
1.26 daniel 3469: if (currentNode != NULL)
3470: xmlFree(currentNode);
1.1 daniel 3471: return;
3472: }
1.10 daniel 3473:
3474: /*
3475: * Capture end position and add node
3476: */
3477: if ( currentNode != NULL && ctxt->record_info ) {
3478: node_info.end_pos = ctxt->input->consumed +
3479: (CUR_PTR - ctxt->input->base);
3480: node_info.end_line = ctxt->input->line;
1.15 daniel 3481: node_info.node = ctxt->node;
1.10 daniel 3482: xmlParserAddNodeInfo(ctxt, &node_info);
3483: }
1.26 daniel 3484: if (currentNode != NULL)
3485: xmlFree(currentNode);
1.1 daniel 3486: }
3487:
3488: /**
3489: * htmlParseDocument :
3490: * @ctxt: an HTML parser context
3491: *
3492: * parse an HTML document (and build a tree if using the standard SAX
3493: * interface).
3494: *
3495: * Returns 0, -1 in case of error. the parser context is augmented
3496: * as a result of the parsing.
3497: */
3498:
3499: int
3500: htmlParseDocument(htmlParserCtxtPtr ctxt) {
1.59 veillard 3501: xmlDtdPtr dtd;
3502:
1.1 daniel 3503: htmlDefaultSAXHandlerInit();
3504: ctxt->html = 1;
3505:
1.5 daniel 3506: GROW;
1.1 daniel 3507: /*
1.9 daniel 3508: * SAX: beginning of the document processing.
1.1 daniel 3509: */
3510: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3511: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3512:
3513: /*
3514: * Wipe out everything which is before the first '<'
3515: */
1.22 daniel 3516: SKIP_BLANKS;
1.1 daniel 3517: if (CUR == 0) {
3518: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3519: ctxt->sax->error(ctxt->userData, "Document is empty\n");
3520: ctxt->wellFormed = 0;
3521: }
3522:
1.40 daniel 3523: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3524: ctxt->sax->startDocument(ctxt->userData);
3525:
3526:
1.22 daniel 3527: /*
3528: * Parse possible comments before any content
3529: */
3530: while ((CUR == '<') && (NXT(1) == '!') &&
3531: (NXT(2) == '-') && (NXT(3) == '-')) {
1.31 daniel 3532: htmlParseComment(ctxt);
1.22 daniel 3533: SKIP_BLANKS;
3534: }
3535:
1.1 daniel 3536:
3537: /*
3538: * Then possibly doc type declaration(s) and more Misc
3539: * (doctypedecl Misc*)?
3540: */
3541: if ((CUR == '<') && (NXT(1) == '!') &&
3542: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3543: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3544: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3545: (UPP(8) == 'E')) {
3546: htmlParseDocTypeDecl(ctxt);
3547: }
3548: SKIP_BLANKS;
3549:
3550: /*
1.55 veillard 3551: * Parse possible comments before any content
3552: */
3553: while ((CUR == '<') && (NXT(1) == '!') &&
3554: (NXT(2) == '-') && (NXT(3) == '-')) {
3555: htmlParseComment(ctxt);
3556: SKIP_BLANKS;
3557: }
3558:
3559: /*
1.1 daniel 3560: * Time to start parsing the tree itself
3561: */
1.22 daniel 3562: htmlParseContent(ctxt);
1.1 daniel 3563:
3564: /*
1.47 daniel 3565: * autoclose
3566: */
3567: if (CUR == 0)
3568: htmlAutoClose(ctxt, NULL);
3569:
3570:
3571: /*
1.1 daniel 3572: * SAX: end of the document processing.
3573: */
3574: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3575: ctxt->sax->endDocument(ctxt->userData);
1.59 veillard 3576:
3577: if (ctxt->myDoc != NULL) {
3578: dtd = xmlGetIntSubset(ctxt->myDoc);
3579: if (dtd == NULL)
3580: ctxt->myDoc->intSubset =
3581: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3582: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3583: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3584: }
1.1 daniel 3585: if (! ctxt->wellFormed) return(-1);
3586: return(0);
3587: }
3588:
3589:
1.30 daniel 3590: /************************************************************************
3591: * *
3592: * Parser contexts handling *
3593: * *
3594: ************************************************************************/
1.1 daniel 3595:
3596: /**
3597: * xmlInitParserCtxt:
3598: * @ctxt: an HTML parser context
3599: *
3600: * Initialize a parser context
3601: */
3602:
3603: void
3604: htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3605: {
3606: htmlSAXHandler *sax;
3607:
1.21 daniel 3608: if (ctxt == NULL) return;
3609: memset(ctxt, 0, sizeof(htmlParserCtxt));
3610:
1.11 daniel 3611: sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
1.1 daniel 3612: if (sax == NULL) {
3613: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3614: }
1.68 veillard 3615: else
3616: memset(sax, 0, sizeof(htmlSAXHandler));
1.1 daniel 3617:
3618: /* Allocate the Input stack */
1.19 daniel 3619: ctxt->inputTab = (htmlParserInputPtr *)
3620: xmlMalloc(5 * sizeof(htmlParserInputPtr));
3621: if (ctxt->inputTab == NULL) {
3622: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
1.65 veillard 3623: ctxt->inputNr = 0;
3624: ctxt->inputMax = 0;
3625: ctxt->input = NULL;
3626: return;
1.19 daniel 3627: }
1.1 daniel 3628: ctxt->inputNr = 0;
3629: ctxt->inputMax = 5;
3630: ctxt->input = NULL;
3631: ctxt->version = NULL;
3632: ctxt->encoding = NULL;
3633: ctxt->standalone = -1;
1.30 daniel 3634: ctxt->instate = XML_PARSER_START;
1.1 daniel 3635:
3636: /* Allocate the Node stack */
1.11 daniel 3637: ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
1.65 veillard 3638: if (ctxt->nodeTab == NULL) {
3639: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3640: ctxt->nodeNr = 0;
3641: ctxt->nodeMax = 0;
3642: ctxt->node = NULL;
3643: ctxt->inputNr = 0;
3644: ctxt->inputMax = 0;
3645: ctxt->input = NULL;
3646: return;
3647: }
1.1 daniel 3648: ctxt->nodeNr = 0;
3649: ctxt->nodeMax = 10;
3650: ctxt->node = NULL;
3651:
1.15 daniel 3652: /* Allocate the Name stack */
3653: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
1.65 veillard 3654: if (ctxt->nameTab == NULL) {
3655: fprintf(stderr, "htmlInitParserCtxt: out of memory\n");
3656: ctxt->nameNr = 0;
3657: ctxt->nameMax = 10;
3658: ctxt->name = NULL;
3659: ctxt->nodeNr = 0;
3660: ctxt->nodeMax = 0;
3661: ctxt->node = NULL;
3662: ctxt->inputNr = 0;
3663: ctxt->inputMax = 0;
3664: ctxt->input = NULL;
3665: return;
3666: }
1.15 daniel 3667: ctxt->nameNr = 0;
3668: ctxt->nameMax = 10;
3669: ctxt->name = NULL;
3670:
1.1 daniel 3671: if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3672: else {
3673: ctxt->sax = sax;
3674: memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3675: }
3676: ctxt->userData = ctxt;
3677: ctxt->myDoc = NULL;
3678: ctxt->wellFormed = 1;
3679: ctxt->replaceEntities = 0;
3680: ctxt->html = 1;
3681: ctxt->record_info = 0;
1.21 daniel 3682: ctxt->validate = 0;
1.26 daniel 3683: ctxt->nbChars = 0;
1.30 daniel 3684: ctxt->checkIndex = 0;
1.1 daniel 3685: xmlInitNodeInfoSeq(&ctxt->node_seq);
3686: }
3687:
3688: /**
3689: * htmlFreeParserCtxt:
3690: * @ctxt: an HTML parser context
3691: *
3692: * Free all the memory used by a parser context. However the parsed
3693: * document in ctxt->myDoc is not freed.
3694: */
3695:
3696: void
3697: htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3698: {
1.47 daniel 3699: xmlFreeParserCtxt(ctxt);
1.1 daniel 3700: }
3701:
3702: /**
3703: * htmlCreateDocParserCtxt :
1.14 daniel 3704: * @cur: a pointer to an array of xmlChar
1.1 daniel 3705: * @encoding: a free form C string describing the HTML document encoding, or NULL
3706: *
3707: * Create a parser context for an HTML document.
3708: *
3709: * Returns the new parser context or NULL
3710: */
3711: htmlParserCtxtPtr
1.14 daniel 3712: htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
1.1 daniel 3713: htmlParserCtxtPtr ctxt;
3714: htmlParserInputPtr input;
3715: /* htmlCharEncoding enc; */
3716:
1.11 daniel 3717: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 3718: if (ctxt == NULL) {
3719: perror("malloc");
3720: return(NULL);
3721: }
3722: htmlInitParserCtxt(ctxt);
1.11 daniel 3723: input = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 3724: if (input == NULL) {
3725: perror("malloc");
1.11 daniel 3726: xmlFree(ctxt);
1.1 daniel 3727: return(NULL);
3728: }
1.19 daniel 3729: memset(input, 0, sizeof(htmlParserInput));
1.1 daniel 3730:
3731: input->line = 1;
3732: input->col = 1;
3733: input->base = cur;
3734: input->cur = cur;
3735:
3736: inputPush(ctxt, input);
3737: return(ctxt);
3738: }
3739:
1.31 daniel 3740: /************************************************************************
3741: * *
3742: * Progressive parsing interfaces *
3743: * *
3744: ************************************************************************/
3745:
3746: /**
3747: * htmlParseLookupSequence:
3748: * @ctxt: an HTML parser context
3749: * @first: the first char to lookup
3750: * @next: the next char to lookup or zero
3751: * @third: the next char to lookup or zero
3752: *
3753: * Try to find if a sequence (first, next, third) or just (first next) or
3754: * (first) is available in the input stream.
3755: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3756: * to avoid rescanning sequences of bytes, it DOES change the state of the
3757: * parser, do not use liberally.
3758: * This is basically similar to xmlParseLookupSequence()
3759: *
3760: * Returns the index to the current parsing point if the full sequence
3761: * is available, -1 otherwise.
3762: */
3763: int
3764: htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3765: xmlChar next, xmlChar third) {
3766: int base, len;
3767: htmlParserInputPtr in;
3768: const xmlChar *buf;
3769:
3770: in = ctxt->input;
3771: if (in == NULL) return(-1);
3772: base = in->cur - in->base;
3773: if (base < 0) return(-1);
3774: if (ctxt->checkIndex > base)
3775: base = ctxt->checkIndex;
3776: if (in->buf == NULL) {
3777: buf = in->base;
3778: len = in->length;
3779: } else {
3780: buf = in->buf->buffer->content;
3781: len = in->buf->buffer->use;
3782: }
3783: /* take into account the sequence length */
3784: if (third) len -= 2;
3785: else if (next) len --;
3786: for (;base < len;base++) {
3787: if (buf[base] == first) {
3788: if (third != 0) {
3789: if ((buf[base + 1] != next) ||
3790: (buf[base + 2] != third)) continue;
3791: } else if (next != 0) {
3792: if (buf[base + 1] != next) continue;
3793: }
3794: ctxt->checkIndex = 0;
3795: #ifdef DEBUG_PUSH
3796: if (next == 0)
3797: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
3798: first, base);
3799: else if (third == 0)
3800: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
3801: first, next, base);
3802: else
3803: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
3804: first, next, third, base);
3805: #endif
3806: return(base - (in->cur - in->base));
3807: }
3808: }
3809: ctxt->checkIndex = base;
3810: #ifdef DEBUG_PUSH
3811: if (next == 0)
3812: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
3813: else if (third == 0)
3814: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
3815: else
3816: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
3817: #endif
3818: return(-1);
3819: }
3820:
3821: /**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Core of the push (progressive) parser: loop over the data currently
 * available in ctxt->input and dispatch on ctxt->instate, parsing one
 * construct per iteration and updating ctxt->instate accordingly.
 *
 * When @terminate is zero a construct is only handed to the matching
 * htmlParseXxx() routine once htmlParseLookupSequence() has found its
 * closing sequence in the buffer; otherwise the function jumps to the
 * "done" label and waits for more data. When @terminate is non-zero
 * those lookups are skipped.
 *
 * On exit, if a document exists and parsing is finishing (terminate set,
 * or state EOF/EPILOG) and the document has no internal subset, an
 * HTML 4.0 Transitional internal subset is created for it.
 *
 * Returns the local "ret", which is initialised to zero and never
 * assigned afterwards in this function, so the value is currently
 * always zero.
 */
3830: int
1.32 daniel 3831: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
1.31 daniel 3832: int ret = 0;
/* NOTE(review): ret is only read (debug trace and return), never updated */
3833: htmlParserInputPtr in;
1.47 daniel 3834: int avail = 0;
1.31 daniel 3835: xmlChar cur, next;
3836:
3837: #ifdef DEBUG_PUSH
3838: switch (ctxt->instate) {
3839: case XML_PARSER_EOF:
3840: fprintf(stderr, "HPP: try EOF\n"); break;
3841: case XML_PARSER_START:
3842: fprintf(stderr, "HPP: try START\n"); break;
3843: case XML_PARSER_MISC:
3844: fprintf(stderr, "HPP: try MISC\n");break;
3845: case XML_PARSER_COMMENT:
3846: fprintf(stderr, "HPP: try COMMENT\n");break;
3847: case XML_PARSER_PROLOG:
3848: fprintf(stderr, "HPP: try PROLOG\n");break;
3849: case XML_PARSER_START_TAG:
3850: fprintf(stderr, "HPP: try START_TAG\n");break;
3851: case XML_PARSER_CONTENT:
3852: fprintf(stderr, "HPP: try CONTENT\n");break;
3853: case XML_PARSER_CDATA_SECTION:
3854: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
3855: case XML_PARSER_END_TAG:
3856: fprintf(stderr, "HPP: try END_TAG\n");break;
3857: case XML_PARSER_ENTITY_DECL:
3858: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
3859: case XML_PARSER_ENTITY_VALUE:
3860: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
3861: case XML_PARSER_ATTRIBUTE_VALUE:
3862: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
3863: case XML_PARSER_DTD:
3864: fprintf(stderr, "HPP: try DTD\n");break;
3865: case XML_PARSER_EPILOG:
3866: fprintf(stderr, "HPP: try EPILOG\n");break;
3867: case XML_PARSER_PI:
3868: fprintf(stderr, "HPP: try PI\n");break;
1.77 veillard 3869: case XML_PARSER_SYSTEM_LITERAL:
3870: fprintf(stderr, "HPP: try SYSTEM_LITERAL\n");break;
1.31 daniel 3871: }
3872: #endif
3873:
3874: while (1) {
3875:
3876: in = ctxt->input;
3877: if (in == NULL) break;
/*
 * avail = bytes between the current position and the end of the data:
 * in->length for a plain memory input, the buffer fill level otherwise.
 */
3878: if (in->buf == NULL)
3879: avail = in->length - (in->cur - in->base);
3880: else
3881: avail = in->buf->buffer->use - (in->cur - in->base);
/* input exhausted on the last chunk: close open elements, signal the end */
1.47 daniel 3882: if ((avail == 0) && (terminate)) {
3883: htmlAutoClose(ctxt, NULL);
1.54 veillard 3884: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
3885: /*
3886: * SAX: end of the document processing.
3887: */
1.47 daniel 3888: ctxt->instate = XML_PARSER_EOF;
1.54 veillard 3889: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3890: ctxt->sax->endDocument(ctxt->userData);
3891: }
1.47 daniel 3892: }
1.31 daniel 3893: if (avail < 1)
3894: goto done;
3895: switch (ctxt->instate) {
3896: case XML_PARSER_EOF:
3897: /*
3898: * Document parsing is done !
3899: */
3900: goto done;
3901: case XML_PARSER_START:
3902: /*
3903: * Very first chars read from the document flow.
3904: */
3905: cur = in->cur[0];
3906: if (IS_BLANK(cur)) {
3907: SKIP_BLANKS;
3908: if (in->buf == NULL)
3909: avail = in->length - (in->cur - in->base);
3910: else
3911: avail = in->buf->buffer->use - (in->cur - in->base);
3912: }
3913: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3914: ctxt->sax->setDocumentLocator(ctxt->userData,
3915: &xmlDefaultSAXLocator);
1.46 daniel 3916: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
3917: (!ctxt->disableSAX))
3918: ctxt->sax->startDocument(ctxt->userData);
3919:
/*
 * NOTE(review): the look-ahead below reads up to 9 bytes while only
 * avail >= 1 has been checked in this state; presumably relies on the
 * buffer being zero terminated -- confirm.
 */
1.31 daniel 3920: cur = in->cur[0];
3921: next = in->cur[1];
3922: if ((cur == '<') && (next == '!') &&
3923: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3924: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3925: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3926: (UPP(8) == 'E')) {
1.32 daniel 3927: if ((!terminate) &&
3928: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3929: goto done;
3930: #ifdef DEBUG_PUSH
3931: fprintf(stderr, "HPP: Parsing internal subset\n");
3932: #endif
3933: htmlParseDocTypeDecl(ctxt);
3934: ctxt->instate = XML_PARSER_PROLOG;
3935: #ifdef DEBUG_PUSH
3936: fprintf(stderr, "HPP: entering PROLOG\n");
3937: #endif
3938: } else {
3939: ctxt->instate = XML_PARSER_MISC;
3940: }
/* NOTE(review): this trace is also emitted when the state became PROLOG */
3941: #ifdef DEBUG_PUSH
3942: fprintf(stderr, "HPP: entering MISC\n");
3943: #endif
3944: break;
3945: case XML_PARSER_MISC:
3946: SKIP_BLANKS;
3947: if (in->buf == NULL)
3948: avail = in->length - (in->cur - in->base);
3949: else
3950: avail = in->buf->buffer->use - (in->cur - in->base);
3951: if (avail < 2)
3952: goto done;
3953: cur = in->cur[0];
3954: next = in->cur[1];
3955: if ((cur == '<') && (next == '!') &&
3956: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 3957: if ((!terminate) &&
3958: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 3959: goto done;
3960: #ifdef DEBUG_PUSH
3961: fprintf(stderr, "HPP: Parsing Comment\n");
3962: #endif
3963: htmlParseComment(ctxt);
3964: ctxt->instate = XML_PARSER_MISC;
3965: } else if ((cur == '<') && (next == '!') &&
3966: (UPP(2) == 'D') && (UPP(3) == 'O') &&
3967: (UPP(4) == 'C') && (UPP(5) == 'T') &&
3968: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3969: (UPP(8) == 'E')) {
1.32 daniel 3970: if ((!terminate) &&
3971: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 3972: goto done;
3973: #ifdef DEBUG_PUSH
3974: fprintf(stderr, "HPP: Parsing internal subset\n");
3975: #endif
3976: htmlParseDocTypeDecl(ctxt);
3977: ctxt->instate = XML_PARSER_PROLOG;
3978: #ifdef DEBUG_PUSH
3979: fprintf(stderr, "HPP: entering PROLOG\n");
3980: #endif
/* "<!" seen but too few bytes to tell comment from DOCTYPE: wait for more */
3981: } else if ((cur == '<') && (next == '!') &&
3982: (avail < 9)) {
3983: goto done;
3984: } else {
3985: ctxt->instate = XML_PARSER_START_TAG;
3986: #ifdef DEBUG_PUSH
3987: fprintf(stderr, "HPP: entering START_TAG\n");
3988: #endif
3989: }
3990: break;
3991: case XML_PARSER_PROLOG:
3992: SKIP_BLANKS;
3993: if (in->buf == NULL)
3994: avail = in->length - (in->cur - in->base);
3995: else
3996: avail = in->buf->buffer->use - (in->cur - in->base);
3997: if (avail < 2)
3998: goto done;
3999: cur = in->cur[0];
4000: next = in->cur[1];
4001: if ((cur == '<') && (next == '!') &&
4002: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 4003: if ((!terminate) &&
4004: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 4005: goto done;
4006: #ifdef DEBUG_PUSH
4007: fprintf(stderr, "HPP: Parsing Comment\n");
4008: #endif
4009: htmlParseComment(ctxt);
4010: ctxt->instate = XML_PARSER_PROLOG;
4011: } else if ((cur == '<') && (next == '!') &&
4012: (avail < 4)) {
4013: goto done;
4014: } else {
4015: ctxt->instate = XML_PARSER_START_TAG;
4016: #ifdef DEBUG_PUSH
4017: fprintf(stderr, "HPP: entering START_TAG\n");
4018: #endif
4019: }
4020: break;
4021: case XML_PARSER_EPILOG:
4022: if (in->buf == NULL)
4023: avail = in->length - (in->cur - in->base);
4024: else
4025: avail = in->buf->buffer->use - (in->cur - in->base);
1.55 veillard 4026: if (avail < 1)
4027: goto done;
4028: cur = in->cur[0];
4029: if (IS_BLANK(cur)) {
4030: htmlParseCharData(ctxt, 0);
4031: goto done;
4032: }
1.31 daniel 4033: if (avail < 2)
4034: goto done;
4035: next = in->cur[1];
4036: if ((cur == '<') && (next == '!') &&
4037: (in->cur[2] == '-') && (in->cur[3] == '-')) {
1.32 daniel 4038: if ((!terminate) &&
4039: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
1.31 daniel 4040: goto done;
4041: #ifdef DEBUG_PUSH
4042: fprintf(stderr, "HPP: Parsing Comment\n");
4043: #endif
4044: htmlParseComment(ctxt);
4045: ctxt->instate = XML_PARSER_EPILOG;
4046: } else if ((cur == '<') && (next == '!') &&
4047: (avail < 4)) {
4048: goto done;
4049: } else {
/* anything but blanks or comments after the root is reported as an error */
1.67 veillard 4050: ctxt->errNo = XML_ERR_DOCUMENT_END;
1.31 daniel 4051: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4052: ctxt->sax->error(ctxt->userData,
4053: "Extra content at the end of the document\n");
4054: ctxt->wellFormed = 0;
4055: ctxt->instate = XML_PARSER_EOF;
4056: #ifdef DEBUG_PUSH
4057: fprintf(stderr, "HPP: entering EOF\n");
4058: #endif
4059: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4060: ctxt->sax->endDocument(ctxt->userData);
4061: goto done;
4062: }
4063: break;
4064: case XML_PARSER_START_TAG: {
4065: xmlChar *name, *oldname;
4066: int depth = ctxt->nameNr;
4067: htmlElemDescPtr info;
4068:
4069: if (avail < 2)
4070: goto done;
4071: cur = in->cur[0];
4072: if (cur != '<') {
4073: ctxt->instate = XML_PARSER_CONTENT;
4074: #ifdef DEBUG_PUSH
4075: fprintf(stderr, "HPP: entering CONTENT\n");
4076: #endif
4077: break;
4078: }
1.32 daniel 4079: if ((!terminate) &&
4080: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 4081: goto done;
4082:
/*
 * Keep a private copy of the current element name so it can be compared
 * with ctxt->name after htmlParseStartTag() to detect a failed parse.
 */
4083: oldname = xmlStrdup(ctxt->name);
4084: htmlParseStartTag(ctxt);
4085: name = ctxt->name;
4086: #ifdef DEBUG
4087: if (oldname == NULL)
4088: fprintf(stderr, "Start of element %s\n", name);
4089: else if (name == NULL)
4090: fprintf(stderr, "Start of element failed, was %s\n",
4091: oldname);
4092: else
4093: fprintf(stderr, "Start of element %s, was %s\n",
4094: name, oldname);
4095: #endif
/* name stack depth and top unchanged (or no name at all): nothing opened */
4096: if (((depth == ctxt->nameNr) &&
1.73 veillard 4097: (xmlStrEqual(oldname, ctxt->name))) ||
1.31 daniel 4098: (name == NULL)) {
4099: if (CUR == '>')
4100: NEXT;
4101: if (oldname != NULL)
4102: xmlFree(oldname);
4103: break;
4104: }
4105: if (oldname != NULL)
4106: xmlFree(oldname);
4107:
4108: /*
4109: * Lookup the info for that element.
4110: */
4111: info = htmlTagLookup(name);
4112: if (info == NULL) {
4113: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4114: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4115: name);
4116: ctxt->wellFormed = 0;
4117: } else if (info->depr) {
4118: /***************************
4119: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4120: ctxt->sax->warning(ctxt->userData,
4121: "Tag %s is deprecated\n",
4122: name);
4123: ***************************/
4124: }
4125:
4126: /*
4127: * Check for an Empty Element labelled the XML/SGML way
4128: */
4129: if ((CUR == '/') && (NXT(1) == '>')) {
4130: SKIP(2);
4131: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4132: ctxt->sax->endElement(ctxt->userData, name);
4133: oldname = htmlnamePop(ctxt);
4134: #ifdef DEBUG
4135: fprintf(stderr,"End of tag the XML way: popping out %s\n",
4136: oldname);
4137: #endif
4138: if (oldname != NULL)
4139: xmlFree(oldname);
4140: ctxt->instate = XML_PARSER_CONTENT;
4141: #ifdef DEBUG_PUSH
4142: fprintf(stderr, "HPP: entering CONTENT\n");
4143: #endif
4144: break;
4145: }
4146:
4147: if (CUR == '>') {
4148: NEXT;
4149: } else {
4150: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4151: ctxt->sax->error(ctxt->userData,
4152: "Couldn't find end of Start Tag %s\n",
4153: name);
4154: ctxt->wellFormed = 0;
4155:
4156: /*
4157: * end of parsing of this node.
4158: */
1.73 veillard 4159: if (xmlStrEqual(name, ctxt->name)) {
1.31 daniel 4160: nodePop(ctxt);
4161: oldname = htmlnamePop(ctxt);
4162: #ifdef DEBUG
4163: fprintf(stderr,
4164: "End of start tag problem: popping out %s\n", oldname);
4165: #endif
4166: if (oldname != NULL)
4167: xmlFree(oldname);
4168: }
4169:
4170: ctxt->instate = XML_PARSER_CONTENT;
4171: #ifdef DEBUG_PUSH
4172: fprintf(stderr, "HPP: entering CONTENT\n");
4173: #endif
4174: break;
4175: }
4176:
4177: /*
4178: * Check for an Empty Element from DTD definition
4179: */
4180: if ((info != NULL) && (info->empty)) {
4181: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4182: ctxt->sax->endElement(ctxt->userData, name);
4183: oldname = htmlnamePop(ctxt);
4184: #ifdef DEBUG
4185: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4186: #endif
4187: if (oldname != NULL)
4188: xmlFree(oldname);
4189: }
4190: ctxt->instate = XML_PARSER_CONTENT;
4191: #ifdef DEBUG_PUSH
4192: fprintf(stderr, "HPP: entering CONTENT\n");
4193: #endif
4194: break;
4195: }
1.56 veillard 4196: case XML_PARSER_CONTENT: {
/* cons: snapshot of ctxt->nbChars used below to detect lack of progress */
4197: long cons;
1.31 daniel 4198: /*
4199: * Handle preparsed entities and charRef
4200: */
4201: if (ctxt->token != 0) {
1.47 daniel 4202: xmlChar chr[2] = { 0 , 0 } ;
1.31 daniel 4203:
1.47 daniel 4204: chr[0] = (xmlChar) ctxt->token;
1.59 veillard 4205: htmlCheckParagraph(ctxt);
1.31 daniel 4206: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
1.47 daniel 4207: ctxt->sax->characters(ctxt->userData, chr, 1);
1.31 daniel 4208: ctxt->token = 0;
4209: ctxt->checkIndex = 0;
4210: }
/* exactly one byte left on the last chunk: deliver it directly via SAX */
1.47 daniel 4211: if ((avail == 1) && (terminate)) {
4212: cur = in->cur[0];
4213: if ((cur != '<') && (cur != '&')) {
1.48 daniel 4214: if (ctxt->sax != NULL) {
4215: if (IS_BLANK(cur)) {
4216: if (ctxt->sax->ignorableWhitespace != NULL)
4217: ctxt->sax->ignorableWhitespace(
4218: ctxt->userData, &cur, 1);
4219: } else {
1.59 veillard 4220: htmlCheckParagraph(ctxt);
1.48 daniel 4221: if (ctxt->sax->characters != NULL)
4222: ctxt->sax->characters(
4223: ctxt->userData, &cur, 1);
4224: }
4225: }
1.47 daniel 4226: ctxt->token = 0;
4227: ctxt->checkIndex = 0;
4228: NEXT;
4229: }
4230: break;
4231: }
1.31 daniel 4232: if (avail < 2)
4233: goto done;
4234: cur = in->cur[0];
4235: next = in->cur[1];
1.56 veillard 4236: cons = ctxt->nbChars;
1.77 veillard 4237: if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4238: (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4239: /*
4240: * Handle SCRIPT/STYLE separately
4241: */
1.59 veillard 4242: if ((!terminate) &&
1.77 veillard 4243: (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
1.31 daniel 4244: goto done;
1.77 veillard 4245: htmlParseScript(ctxt);
/*
 * NOTE(review): cur and next were read before htmlParseScript() ran, so
 * this test looks at the bytes that preceded the script content, not at
 * the current position -- confirm this is intended.
 */
4246: if ((cur == '<') && (next == '/')) {
4247: ctxt->instate = XML_PARSER_END_TAG;
4248: ctxt->checkIndex = 0;
1.31 daniel 4249: #ifdef DEBUG_PUSH
1.77 veillard 4250: fprintf(stderr, "HPP: entering END_TAG\n");
1.31 daniel 4251: #endif
1.77 veillard 4252: break;
4253: }
4254: } else {
4255: /*
4256: * Sometimes DOCTYPE arrives in the middle of the document
4257: */
4258: if ((cur == '<') && (next == '!') &&
4259: (UPP(2) == 'D') && (UPP(3) == 'O') &&
4260: (UPP(4) == 'C') && (UPP(5) == 'T') &&
4261: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4262: (UPP(8) == 'E')) {
4263: if ((!terminate) &&
4264: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4265: goto done;
4266: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4267: ctxt->sax->error(ctxt->userData,
4268: "Misplaced DOCTYPE declaration\n");
4269: ctxt->wellFormed = 0;
4270: htmlParseDocTypeDecl(ctxt);
4271: } else if ((cur == '<') && (next == '!') &&
4272: (in->cur[2] == '-') && (in->cur[3] == '-')) {
4273: if ((!terminate) &&
4274: (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4275: goto done;
1.31 daniel 4276: #ifdef DEBUG_PUSH
1.77 veillard 4277: fprintf(stderr, "HPP: Parsing Comment\n");
1.31 daniel 4278: #endif
1.77 veillard 4279: htmlParseComment(ctxt);
4280: ctxt->instate = XML_PARSER_CONTENT;
4281: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4282: goto done;
4283: } else if ((cur == '<') && (next == '/')) {
4284: ctxt->instate = XML_PARSER_END_TAG;
4285: ctxt->checkIndex = 0;
1.31 daniel 4286: #ifdef DEBUG_PUSH
1.77 veillard 4287: fprintf(stderr, "HPP: entering END_TAG\n");
1.31 daniel 4288: #endif
1.77 veillard 4289: break;
4290: } else if (cur == '<') {
4291: ctxt->instate = XML_PARSER_START_TAG;
4292: ctxt->checkIndex = 0;
1.31 daniel 4293: #ifdef DEBUG_PUSH
1.77 veillard 4294: fprintf(stderr, "HPP: entering START_TAG\n");
1.31 daniel 4295: #endif
1.77 veillard 4296: break;
4297: } else if (cur == '&') {
1.32 daniel 4298: if ((!terminate) &&
1.77 veillard 4299: (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
1.31 daniel 4300: goto done;
4301: #ifdef DEBUG_PUSH
1.77 veillard 4302: fprintf(stderr, "HPP: Parsing Reference\n");
4303: #endif
4304: /* TODO: check generation of subtrees if noent !!! */
4305: htmlParseReference(ctxt);
4306: } else {
4307: /* TODO Avoid the extra copy, handle directly !!!!!! */
4308: /*
4309: * Goal of the following test is :
4310: * - minimize calls to the SAX 'character' callback
4311: * when they are mergeable
4312: */
4313: if ((ctxt->inputNr == 1) &&
4314: (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4315: if ((!terminate) &&
4316: (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4317: goto done;
4318: }
4319: ctxt->checkIndex = 0;
4320: #ifdef DEBUG_PUSH
4321: fprintf(stderr, "HPP: Parsing char data\n");
1.31 daniel 4322: #endif
1.77 veillard 4323: htmlParseCharData(ctxt, 0);
4324: }
1.31 daniel 4325: }
/*
 * ctxt->nbChars unchanged since the snapshot: the branch above did not
 * advance it. Report an error when inside a node, then invoke NEXT
 * (presumably skips one char to guarantee progress -- confirm).
 */
1.56 veillard 4326: if (cons == ctxt->nbChars) {
4327: if (ctxt->node != NULL) {
4328: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4329: ctxt->sax->error(ctxt->userData,
4330: "detected an error in element content\n");
4331: ctxt->wellFormed = 0;
4332: }
1.70 veillard 4333: NEXT;
1.56 veillard 4334: break;
4335: }
4336:
1.31 daniel 4337: break;
1.56 veillard 4338: }
1.31 daniel 4339: case XML_PARSER_END_TAG:
4340: if (avail < 2)
4341: goto done;
1.32 daniel 4342: if ((!terminate) &&
4343: (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
1.31 daniel 4344: goto done;
4345: htmlParseEndTag(ctxt);
/* empty name stack after the end tag: only epilog content may follow */
4346: if (ctxt->nameNr == 0) {
4347: ctxt->instate = XML_PARSER_EPILOG;
4348: } else {
4349: ctxt->instate = XML_PARSER_CONTENT;
4350: }
4351: ctxt->checkIndex = 0;
4352: #ifdef DEBUG_PUSH
4353: fprintf(stderr, "HPP: entering CONTENT\n");
4354: #endif
4355: break;
/*
 * The remaining states are not expected in the HTML push parser: each one
 * logs an internal error and falls back to CONTENT (or START_TAG).
 */
4356: case XML_PARSER_CDATA_SECTION:
4357: fprintf(stderr, "HPP: internal error, state == CDATA\n");
4358: ctxt->instate = XML_PARSER_CONTENT;
4359: ctxt->checkIndex = 0;
4360: #ifdef DEBUG_PUSH
4361: fprintf(stderr, "HPP: entering CONTENT\n");
4362: #endif
4363: break;
4364: case XML_PARSER_DTD:
4365: fprintf(stderr, "HPP: internal error, state == DTD\n");
4366: ctxt->instate = XML_PARSER_CONTENT;
4367: ctxt->checkIndex = 0;
4368: #ifdef DEBUG_PUSH
4369: fprintf(stderr, "HPP: entering CONTENT\n");
4370: #endif
4371: break;
4372: case XML_PARSER_COMMENT:
4373: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
4374: ctxt->instate = XML_PARSER_CONTENT;
4375: ctxt->checkIndex = 0;
4376: #ifdef DEBUG_PUSH
4377: fprintf(stderr, "HPP: entering CONTENT\n");
4378: #endif
4379: break;
4380: case XML_PARSER_PI:
4381: fprintf(stderr, "HPP: internal error, state == PI\n");
4382: ctxt->instate = XML_PARSER_CONTENT;
4383: ctxt->checkIndex = 0;
4384: #ifdef DEBUG_PUSH
4385: fprintf(stderr, "HPP: entering CONTENT\n");
4386: #endif
4387: break;
4388: case XML_PARSER_ENTITY_DECL:
4389: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
4390: ctxt->instate = XML_PARSER_CONTENT;
4391: ctxt->checkIndex = 0;
4392: #ifdef DEBUG_PUSH
4393: fprintf(stderr, "HPP: entering CONTENT\n");
4394: #endif
4395: break;
4396: case XML_PARSER_ENTITY_VALUE:
4397: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
4398: ctxt->instate = XML_PARSER_CONTENT;
4399: ctxt->checkIndex = 0;
/* NOTE(review): trace says DTD but the state was just set to CONTENT */
4400: #ifdef DEBUG_PUSH
4401: fprintf(stderr, "HPP: entering DTD\n");
4402: #endif
4403: break;
4404: case XML_PARSER_ATTRIBUTE_VALUE:
4405: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4406: ctxt->instate = XML_PARSER_START_TAG;
4407: ctxt->checkIndex = 0;
4408: #ifdef DEBUG_PUSH
4409: fprintf(stderr, "HPP: entering START_TAG\n");
1.53 veillard 4410: #endif
4411: break;
4412: case XML_PARSER_SYSTEM_LITERAL:
4413: fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4414: ctxt->instate = XML_PARSER_CONTENT;
4415: ctxt->checkIndex = 0;
4416: #ifdef DEBUG_PUSH
4417: fprintf(stderr, "HPP: entering CONTENT\n");
1.31 daniel 4418: #endif
4419: break;
4420: }
4421: }
4422: done:
/* same end-of-input handling as at the top of the loop */
1.47 daniel 4423: if ((avail == 0) && (terminate)) {
4424: htmlAutoClose(ctxt, NULL);
1.54 veillard 4425: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4426: /*
4427: * SAX: end of the document processing.
4428: */
1.47 daniel 4429: ctxt->instate = XML_PARSER_EOF;
1.54 veillard 4430: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4431: ctxt->sax->endDocument(ctxt->userData);
4432: }
1.59 veillard 4433: }
/* finishing with a document that has no internal subset: add a default one */
4434: if ((ctxt->myDoc != NULL) &&
4435: ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4436: (ctxt->instate == XML_PARSER_EPILOG))) {
4437: xmlDtdPtr dtd;
4438: dtd = xmlGetIntSubset(ctxt->myDoc);
4439: if (dtd == NULL)
4440: ctxt->myDoc->intSubset =
4441: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4442: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4443: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
1.47 daniel 4444: }
1.31 daniel 4445: #ifdef DEBUG_PUSH
4446: fprintf(stderr, "HPP: done %d\n", ret);
4447: #endif
4448: return(ret);
4449: }
4450:
4451: /**
1.32 daniel 4452: * htmlParseTry:
4453: * @ctxt: an HTML parser context
4454: *
4455: * Try to progress on parsing
4456: *
4457: * Returns zero if no parsing was possible
4458: */
4459: int
4460: htmlParseTry(htmlParserCtxtPtr ctxt) {
4461: return(htmlParseTryOrFinish(ctxt, 0));
4462: }
4463:
4464: /**
1.31 daniel 4465: * htmlParseChunk:
4466: * @ctxt: an XML parser context
4467: * @chunk: an char array
4468: * @size: the size in byte of the chunk
4469: * @terminate: last chunk indicator
4470: *
4471: * Parse a Chunk of memory
4472: *
4473: * Returns zero if no error, the xmlParserErrors otherwise.
4474: */
4475: int
4476: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4477: int terminate) {
4478: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4479: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4480: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4481: int cur = ctxt->input->cur - ctxt->input->base;
4482:
4483: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4484: ctxt->input->base = ctxt->input->buf->buffer->content + base;
4485: ctxt->input->cur = ctxt->input->base + cur;
4486: #ifdef DEBUG_PUSH
4487: fprintf(stderr, "HPP: pushed %d\n", size);
4488: #endif
4489:
1.34 daniel 4490: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4491: htmlParseTryOrFinish(ctxt, terminate);
1.60 veillard 4492: } else if (ctxt->instate != XML_PARSER_EOF) {
4493: xmlParserInputBufferPush(ctxt->input->buf, 0, "");
1.32 daniel 4494: htmlParseTryOrFinish(ctxt, terminate);
1.60 veillard 4495: }
1.31 daniel 4496: if (terminate) {
4497: if ((ctxt->instate != XML_PARSER_EOF) &&
4498: (ctxt->instate != XML_PARSER_EPILOG) &&
4499: (ctxt->instate != XML_PARSER_MISC)) {
1.67 veillard 4500: ctxt->errNo = XML_ERR_DOCUMENT_END;
1.31 daniel 4501: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4502: ctxt->sax->error(ctxt->userData,
4503: "Extra content at the end of the document\n");
4504: ctxt->wellFormed = 0;
4505: }
4506: if (ctxt->instate != XML_PARSER_EOF) {
4507: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4508: ctxt->sax->endDocument(ctxt->userData);
4509: }
4510: ctxt->instate = XML_PARSER_EOF;
4511: }
4512: return((xmlParserErrors) ctxt->errNo);
4513: }
4514:
4515: /************************************************************************
4516: * *
4517: * User entry points *
4518: * *
4519: ************************************************************************/
4520:
4521: /**
4522: * htmlCreatePushParserCtxt :
4523: * @sax: a SAX handler
4524: * @user_data: The user data returned on SAX callbacks
4525: * @chunk: a pointer to an array of chars
4526: * @size: number of chars in the array
4527: * @filename: an optional file name or URI
4528: * @enc: an optional encoding
4529: *
4530: * Create a parser context for using the HTML parser in push mode
4531: * To allow content encoding detection, @size should be >= 4
4532: * The value of @filename is used for fetching external entities
4533: * and error/warning reports.
4534: *
4535: * Returns the new parser context or NULL
4536: */
4537: htmlParserCtxtPtr
4538: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4539: const char *chunk, int size, const char *filename,
4540: xmlCharEncoding enc) {
4541: htmlParserCtxtPtr ctxt;
4542: htmlParserInputPtr inputStream;
4543: xmlParserInputBufferPtr buf;
4544:
4545: buf = xmlAllocParserInputBuffer(enc);
4546: if (buf == NULL) return(NULL);
4547:
4548: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4549: if (ctxt == NULL) {
4550: xmlFree(buf);
4551: return(NULL);
4552: }
4553: memset(ctxt, 0, sizeof(htmlParserCtxt));
4554: htmlInitParserCtxt(ctxt);
4555: if (sax != NULL) {
4556: if (ctxt->sax != &htmlDefaultSAXHandler)
4557: xmlFree(ctxt->sax);
4558: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4559: if (ctxt->sax == NULL) {
4560: xmlFree(buf);
4561: xmlFree(ctxt);
4562: return(NULL);
4563: }
4564: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4565: if (user_data != NULL)
4566: ctxt->userData = user_data;
4567: }
4568: if (filename == NULL) {
4569: ctxt->directory = NULL;
4570: } else {
4571: ctxt->directory = xmlParserGetDirectory(filename);
4572: }
4573:
4574: inputStream = htmlNewInputStream(ctxt);
4575: if (inputStream == NULL) {
4576: xmlFreeParserCtxt(ctxt);
4577: return(NULL);
4578: }
4579:
4580: if (filename == NULL)
4581: inputStream->filename = NULL;
4582: else
4583: inputStream->filename = xmlMemStrdup(filename);
4584: inputStream->buf = buf;
4585: inputStream->base = inputStream->buf->buffer->content;
4586: inputStream->cur = inputStream->buf->buffer->content;
4587:
4588: inputPush(ctxt, inputStream);
4589:
4590: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4591: (ctxt->input->buf != NULL)) {
4592: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4593: #ifdef DEBUG_PUSH
4594: fprintf(stderr, "HPP: pushed %d\n", size);
4595: #endif
4596: }
4597:
4598: return(ctxt);
4599: }
1.1 daniel 4600:
4601: /**
4602: * htmlSAXParseDoc :
1.14 daniel 4603: * @cur: a pointer to an array of xmlChar
1.1 daniel 4604: * @encoding: a free form C string describing the HTML document encoding, or NULL
4605: * @sax: the SAX handler block
4606: * @userData: if using SAX, this pointer will be provided on callbacks.
4607: *
4608: * parse an HTML in-memory document and build a tree.
4609: * It use the given SAX function block to handle the parsing callback.
4610: * If sax is NULL, fallback to the default DOM tree building routines.
4611: *
4612: * Returns the resulting document tree
4613: */
4614:
4615: htmlDocPtr
1.14 daniel 4616: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
1.1 daniel 4617: htmlDocPtr ret;
4618: htmlParserCtxtPtr ctxt;
4619:
4620: if (cur == NULL) return(NULL);
4621:
4622:
4623: ctxt = htmlCreateDocParserCtxt(cur, encoding);
4624: if (ctxt == NULL) return(NULL);
4625: if (sax != NULL) {
4626: ctxt->sax = sax;
4627: ctxt->userData = userData;
4628: }
4629:
4630: htmlParseDocument(ctxt);
4631: ret = ctxt->myDoc;
4632: if (sax != NULL) {
4633: ctxt->sax = NULL;
4634: ctxt->userData = NULL;
4635: }
4636: htmlFreeParserCtxt(ctxt);
4637:
4638: return(ret);
4639: }
4640:
4641: /**
4642: * htmlParseDoc :
1.14 daniel 4643: * @cur: a pointer to an array of xmlChar
1.1 daniel 4644: * @encoding: a free form C string describing the HTML document encoding, or NULL
4645: *
4646: * parse an HTML in-memory document and build a tree.
4647: *
4648: * Returns the resulting document tree
4649: */
4650:
4651: htmlDocPtr
1.14 daniel 4652: htmlParseDoc(xmlChar *cur, const char *encoding) {
1.1 daniel 4653: return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4654: }
4655:
4656:
4657: /**
4658: * htmlCreateFileParserCtxt :
4659: * @filename: the filename
4660: * @encoding: a free form C string describing the HTML document encoding, or NULL
4661: *
4662: * Create a parser context for a file content.
4663: * Automatic support for ZLIB/Compress compressed document is provided
4664: * by default if found at compile-time.
4665: *
4666: * Returns the new parser context or NULL
4667: */
4668: htmlParserCtxtPtr
4669: htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4670: {
4671: htmlParserCtxtPtr ctxt;
4672: htmlParserInputPtr inputStream;
1.5 daniel 4673: xmlParserInputBufferPtr buf;
1.1 daniel 4674: /* htmlCharEncoding enc; */
4675:
1.5 daniel 4676: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4677: if (buf == NULL) return(NULL);
1.1 daniel 4678:
1.11 daniel 4679: ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
1.1 daniel 4680: if (ctxt == NULL) {
4681: perror("malloc");
4682: return(NULL);
4683: }
1.19 daniel 4684: memset(ctxt, 0, sizeof(htmlParserCtxt));
1.1 daniel 4685: htmlInitParserCtxt(ctxt);
1.11 daniel 4686: inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1.1 daniel 4687: if (inputStream == NULL) {
4688: perror("malloc");
1.11 daniel 4689: xmlFree(ctxt);
1.1 daniel 4690: return(NULL);
4691: }
1.19 daniel 4692: memset(inputStream, 0, sizeof(htmlParserInput));
1.1 daniel 4693:
1.11 daniel 4694: inputStream->filename = xmlMemStrdup(filename);
1.1 daniel 4695: inputStream->line = 1;
4696: inputStream->col = 1;
1.5 daniel 4697: inputStream->buf = buf;
1.21 daniel 4698: inputStream->directory = NULL;
1.1 daniel 4699:
1.5 daniel 4700: inputStream->base = inputStream->buf->buffer->content;
4701: inputStream->cur = inputStream->buf->buffer->content;
4702: inputStream->free = NULL;
1.1 daniel 4703:
4704: inputPush(ctxt, inputStream);
4705: return(ctxt);
4706: }
4707:
4708: /**
4709: * htmlSAXParseFile :
4710: * @filename: the filename
4711: * @encoding: a free form C string describing the HTML document encoding, or NULL
4712: * @sax: the SAX handler block
4713: * @userData: if using SAX, this pointer will be provided on callbacks.
4714: *
4715: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4716: * compressed document is provided by default if found at compile-time.
4717: * It use the given SAX function block to handle the parsing callback.
4718: * If sax is NULL, fallback to the default DOM tree building routines.
4719: *
4720: * Returns the resulting document tree
4721: */
4722:
4723: htmlDocPtr
4724: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4725: void *userData) {
4726: htmlDocPtr ret;
4727: htmlParserCtxtPtr ctxt;
1.57 veillard 4728: htmlSAXHandlerPtr oldsax = NULL;
1.1 daniel 4729:
4730: ctxt = htmlCreateFileParserCtxt(filename, encoding);
4731: if (ctxt == NULL) return(NULL);
4732: if (sax != NULL) {
1.55 veillard 4733: oldsax = ctxt->sax;
1.1 daniel 4734: ctxt->sax = sax;
4735: ctxt->userData = userData;
4736: }
4737:
4738: htmlParseDocument(ctxt);
4739:
4740: ret = ctxt->myDoc;
4741: if (sax != NULL) {
1.55 veillard 4742: ctxt->sax = oldsax;
1.1 daniel 4743: ctxt->userData = NULL;
4744: }
4745: htmlFreeParserCtxt(ctxt);
4746:
4747: return(ret);
4748: }
4749:
4750: /**
4751: * htmlParseFile :
4752: * @filename: the filename
4753: * @encoding: a free form C string describing the HTML document encoding, or NULL
4754: *
4755: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4756: * compressed document is provided by default if found at compile-time.
4757: *
4758: * Returns the resulting document tree
4759: */
4760:
4761: htmlDocPtr
4762: htmlParseFile(const char *filename, const char *encoding) {
4763: return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4764: }
1.39 daniel 4765:
4766: #endif /* LIBXML_HTML_ENABLED */
Webmaster