Annotation of XML/HTMLtree.c, revision 1.26
1.1 daniel 1: /*
2: * HTMLtree.c : implementation of access functions for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
16: #include "xmlversion.h"
17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.1 daniel 34:
1.21 veillard 35: /************************************************************************
36: * *
1.23 veillard 37: * Getting/Setting encoding meta tags *
38: * *
39: ************************************************************************/
40:
41: /**
42: * htmlGetMetaEncoding:
43: * @doc: the document
44: *
45: * Encoding definition lookup in the Meta tags
46: *
47: * Returns the current encoding as flagged in the HTML source
48: */
49: const xmlChar *
50: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 51: htmlNodePtr cur;
52: const xmlChar *content;
53: const xmlChar *encoding;
54:
55: if (doc == NULL)
56: return(NULL);
57: cur = doc->children;
58:
59: /*
60: * Search the html
61: */
62: while (cur != NULL) {
63: if (cur->name != NULL) {
64: if (!xmlStrcmp(cur->name, BAD_CAST"html"))
65: break;
66: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
67: goto found_head;
68: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
69: goto found_meta;
70: }
71: cur = cur->next;
72: }
73: if (cur == NULL)
74: return(NULL);
75: cur = cur->children;
76:
77: /*
78: * Search the head
79: */
80: while (cur != NULL) {
81: if (cur->name != NULL) {
82: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
83: break;
84: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
85: goto found_meta;
86: }
87: cur = cur->next;
88: }
89: if (cur == NULL)
90: return(NULL);
91: found_head:
92: cur = cur->children;
93:
94: /*
95: * Search the meta elements
96: */
97: found_meta:
98: while (cur != NULL) {
99: if (cur->name != NULL) {
100: if (!xmlStrcmp(cur->name, BAD_CAST"meta")) {
101: xmlAttrPtr attr = cur->properties;
102: int http;
103: const xmlChar *value;
104:
105: content = NULL;
106: http = 0;
107: while (attr != NULL) {
108: if ((attr->children != NULL) &&
109: (attr->children->type == XML_TEXT_NODE) &&
110: (attr->children->next == NULL)) {
111: #ifndef XML_USE_BUFFER_CONTENT
112: value = attr->children->content;
113: #else
114: value = xmlBufferContent(attr->children->content);
115: #endif
116: if (((!xmlStrcmp(attr->name, BAD_CAST"http-equiv")) ||
117: (!xmlStrcmp(attr->name, BAD_CAST"Http-Equiv")) ||
118: (!xmlStrcmp(attr->name, BAD_CAST"HTTP-EQUIV"))) &&
119: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
120: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
121: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
122: http = 1;
123: else if ((value != NULL) &&
124: ((!xmlStrcmp(attr->name, BAD_CAST"content")) ||
125: (!xmlStrcmp(attr->name, BAD_CAST"Content")) ||
126: (!xmlStrcmp(attr->name, BAD_CAST"CONTENT"))))
127: content = value;
128: if ((http != 0) && (content != NULL))
129: goto found_content;
130: }
131: attr = attr->next;
132: }
133: }
134: }
135: cur = cur->next;
136: }
137: return(NULL);
138:
139: found_content:
140: encoding = xmlStrstr(content, BAD_CAST"charset=");
141: if (encoding == NULL)
142: encoding = xmlStrstr(content, BAD_CAST"Charset=");
143: if (encoding == NULL)
144: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
145: if (encoding != NULL) {
146: encoding += 8;
147: } else {
148: encoding = xmlStrstr(content, BAD_CAST"charset =");
149: if (encoding == NULL)
150: encoding = xmlStrstr(content, BAD_CAST"Charset =");
151: if (encoding == NULL)
152: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
153: if (encoding != NULL)
154: encoding += 9;
155: }
156: if (encoding != NULL) {
157: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
158: }
159: return(encoding);
1.23 veillard 160: }
161:
162: /**
163: * htmlSetMetaEncoding:
164: * @doc: the document
165: * @encoding: the encoding string
166: *
167: * Sets the current encoding in the Meta tags
168: * NOTE: this will not change the document content encoding, just
169: * the META flag associated.
170: *
171: * Returns 0 in case of success and -1 in case of error
172: */
173: int
174: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 ! veillard 175: htmlNodePtr cur, meta;
! 176: const xmlChar *content;
! 177: char newcontent[100];
! 178:
! 179:
! 180: if (doc == NULL)
! 181: return(-1);
! 182:
! 183: if (encoding != NULL) {
! 184: #ifndef HAVE_SNPRINTF
! 185: sprintf(newcontent, "text/html; charset=%s", encoding);
! 186: #else /* HAVE_SNPRINTF */
! 187: snprintf(newcontent, 99, "text/html; charset=%s", encoding);
! 188: #endif /* HAVE_SNPRINTF */
! 189: newcontent[99] = 0;
! 190: }
! 191:
! 192: cur = doc->children;
! 193:
! 194: /*
! 195: * Search the html
! 196: */
! 197: while (cur != NULL) {
! 198: if (cur->name != NULL) {
! 199: if (!xmlStrcmp(cur->name, BAD_CAST"html"))
! 200: break;
! 201: if (!xmlStrcmp(cur->name, BAD_CAST"body")) {
! 202: if (encoding == NULL)
! 203: return(0);
! 204: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
! 205: xmlAddPrevSibling(cur, meta);
! 206: cur = meta;
! 207: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
! 208: xmlAddChild(cur, meta);
! 209: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
! 210: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
! 211: return(0);
! 212: }
! 213: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
! 214: goto found_head;
! 215: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
! 216: goto found_meta;
! 217: }
! 218: cur = cur->next;
! 219: }
! 220: if (cur == NULL)
! 221: return(-1);
! 222: cur = cur->children;
! 223:
! 224: /*
! 225: * Search the head
! 226: */
! 227: while (cur != NULL) {
! 228: if (cur->name != NULL) {
! 229: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
! 230: break;
! 231: if (!xmlStrcmp(cur->name, BAD_CAST"body")) {
! 232: if (encoding == NULL)
! 233: return(0);
! 234: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
! 235: xmlAddPrevSibling(cur, meta);
! 236: cur = meta;
! 237: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
! 238: xmlAddChild(cur, meta);
! 239: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
! 240: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
! 241: return(0);
! 242: }
! 243: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
! 244: goto found_meta;
! 245: }
! 246: cur = cur->next;
! 247: }
! 248: if (cur == NULL)
! 249: return(-1);
! 250: found_head:
! 251: if (cur->children == NULL) {
! 252: if (encoding == NULL)
! 253: return(0);
! 254: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
! 255: xmlAddChild(cur, meta);
! 256: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
! 257: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
! 258: return(0);
! 259: }
! 260: cur = cur->children;
! 261:
! 262: found_meta:
! 263: if (encoding != NULL) {
! 264: /*
! 265: * Create a new Meta element with the right aatributes
! 266: */
! 267:
! 268: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
! 269: xmlAddPrevSibling(cur, meta);
! 270: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
! 271: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
! 272: }
! 273:
! 274: /*
! 275: * Search and destroy all the remaining the meta elements carrying
! 276: * encoding informations
! 277: */
! 278: while (cur != NULL) {
! 279: if (cur->name != NULL) {
! 280: if (!xmlStrcmp(cur->name, BAD_CAST"meta")) {
! 281: xmlAttrPtr attr = cur->properties;
! 282: int http;
! 283: const xmlChar *value;
! 284:
! 285: content = NULL;
! 286: http = 0;
! 287: while (attr != NULL) {
! 288: if ((attr->children != NULL) &&
! 289: (attr->children->type == XML_TEXT_NODE) &&
! 290: (attr->children->next == NULL)) {
! 291: #ifndef XML_USE_BUFFER_CONTENT
! 292: value = attr->children->content;
! 293: #else
! 294: value = xmlBufferContent(attr->children->content);
! 295: #endif
! 296: if (((!xmlStrcmp(attr->name, BAD_CAST"http-equiv")) ||
! 297: (!xmlStrcmp(attr->name, BAD_CAST"Http-Equiv")) ||
! 298: (!xmlStrcmp(attr->name, BAD_CAST"HTTP-EQUIV"))) &&
! 299: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
! 300: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
! 301: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
! 302: http = 1;
! 303: else if ((value != NULL) &&
! 304: ((!xmlStrcmp(attr->name, BAD_CAST"content")) ||
! 305: (!xmlStrcmp(attr->name, BAD_CAST"Content")) ||
! 306: (!xmlStrcmp(attr->name, BAD_CAST"CONTENT"))))
! 307: content = value;
! 308: if ((http != 0) && (content != NULL))
! 309: break;
! 310: }
! 311: attr = attr->next;
! 312: }
! 313: if ((http != 0) && (content != NULL)) {
! 314: meta = cur;
! 315: cur = cur->next;
! 316: xmlUnlinkNode(meta);
! 317: xmlFreeNode(meta);
! 318: continue;
! 319: }
! 320:
! 321: }
! 322: }
! 323: cur = cur->next;
! 324: }
! 325: return(0);
1.23 veillard 326: }
327:
328: /************************************************************************
329: * *
1.21 veillard 330: * Dumping HTML tree content to a simple buffer *
331: * *
332: ************************************************************************/
333:
1.14 daniel 334: static void
335: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
336:
1.1 daniel 337: /**
338: * htmlDtdDump:
339: * @buf: the HTML buffer output
340: * @doc: the document
341: *
342: * Dump the HTML document DTD, if any.
343: */
344: static void
345: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
346: xmlDtdPtr cur = doc->intSubset;
347:
348: if (cur == NULL) {
349: fprintf(stderr, "htmlDtdDump : no internal subset\n");
350: return;
351: }
352: xmlBufferWriteChar(buf, "<!DOCTYPE ");
353: xmlBufferWriteCHAR(buf, cur->name);
354: if (cur->ExternalID != NULL) {
355: xmlBufferWriteChar(buf, " PUBLIC ");
356: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 357: if (cur->SystemID != NULL) {
358: xmlBufferWriteChar(buf, " ");
359: xmlBufferWriteQuotedString(buf, cur->SystemID);
360: }
1.1 daniel 361: } else if (cur->SystemID != NULL) {
362: xmlBufferWriteChar(buf, " SYSTEM ");
363: xmlBufferWriteQuotedString(buf, cur->SystemID);
364: }
365: xmlBufferWriteChar(buf, ">\n");
366: }
367:
368: /**
369: * htmlAttrDump:
370: * @buf: the HTML buffer output
371: * @doc: the document
372: * @cur: the attribute pointer
373: *
374: * Dump an HTML attribute
375: */
376: static void
377: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 378: xmlChar *value;
1.1 daniel 379:
380: if (cur == NULL) {
381: fprintf(stderr, "htmlAttrDump : property == NULL\n");
382: return;
383: }
384: xmlBufferWriteChar(buf, " ");
385: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 386: if (cur->children != NULL) {
387: value = xmlNodeListGetString(doc, cur->children, 0);
388: if (value) {
389: xmlBufferWriteChar(buf, "=");
390: xmlBufferWriteQuotedString(buf, value);
391: xmlFree(value);
392: } else {
393: xmlBufferWriteChar(buf, "=\"\"");
394: }
1.1 daniel 395: }
396: }
397:
398: /**
399: * htmlAttrListDump:
400: * @buf: the HTML buffer output
401: * @doc: the document
402: * @cur: the first attribute pointer
403: *
404: * Dump a list of HTML attributes
405: */
406: static void
407: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
408: if (cur == NULL) {
409: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
410: return;
411: }
412: while (cur != NULL) {
413: htmlAttrDump(buf, doc, cur);
414: cur = cur->next;
415: }
416: }
417:
418:
1.14 daniel 419: void
1.1 daniel 420: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
421: /**
422: * htmlNodeListDump:
423: * @buf: the HTML buffer output
424: * @doc: the document
425: * @cur: the first node
426: *
427: * Dump an HTML node list, recursive behaviour,children are printed too.
428: */
429: static void
430: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
431: if (cur == NULL) {
432: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
433: return;
434: }
435: while (cur != NULL) {
436: htmlNodeDump(buf, doc, cur);
437: cur = cur->next;
438: }
439: }
440:
441: /**
442: * htmlNodeDump:
443: * @buf: the HTML buffer output
444: * @doc: the document
445: * @cur: the current node
446: *
447: * Dump an HTML node, recursive behaviour,children are printed too.
448: */
1.14 daniel 449: void
1.1 daniel 450: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
451: htmlElemDescPtr info;
452:
453: if (cur == NULL) {
454: fprintf(stderr, "htmlNodeDump : node == NULL\n");
455: return;
456: }
457: /*
458: * Special cases.
459: */
1.20 daniel 460: if (cur->type == XML_DTD_NODE)
461: return;
1.14 daniel 462: if (cur->type == XML_HTML_DOCUMENT_NODE) {
463: htmlDocContentDump(buf, (xmlDocPtr) cur);
464: return;
465: }
1.1 daniel 466: if (cur->type == HTML_TEXT_NODE) {
467: if (cur->content != NULL) {
1.6 daniel 468: xmlChar *buffer;
1.1 daniel 469:
1.9 daniel 470: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 471: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 472: #else
473: buffer = xmlEncodeEntitiesReentrant(doc,
474: xmlBufferContent(cur->content));
475: #endif
1.1 daniel 476: if (buffer != NULL) {
477: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 478: xmlFree(buffer);
1.1 daniel 479: }
480: }
481: return;
482: }
483: if (cur->type == HTML_COMMENT_NODE) {
484: if (cur->content != NULL) {
485: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 486: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 487: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 488: #else
489: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
490: #endif
1.1 daniel 491: xmlBufferWriteChar(buf, "-->");
492: }
493: return;
494: }
495: if (cur->type == HTML_ENTITY_REF_NODE) {
496: xmlBufferWriteChar(buf, "&");
497: xmlBufferWriteCHAR(buf, cur->name);
498: xmlBufferWriteChar(buf, ";");
499: return;
500: }
501:
502: /*
503: * Get specific HTmL info for taht node.
504: */
505: info = htmlTagLookup(cur->name);
506:
507: xmlBufferWriteChar(buf, "<");
508: xmlBufferWriteCHAR(buf, cur->name);
509: if (cur->properties != NULL)
510: htmlAttrListDump(buf, doc, cur->properties);
511:
1.7 daniel 512: if ((info != NULL) && (info->empty)) {
1.1 daniel 513: xmlBufferWriteChar(buf, ">");
514: if (cur->next != NULL) {
515: if ((cur->next->type != HTML_TEXT_NODE) &&
516: (cur->next->type != HTML_ENTITY_REF_NODE))
517: xmlBufferWriteChar(buf, "\n");
518: }
519: return;
520: }
1.17 daniel 521: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 522: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 523: xmlBufferWriteChar(buf, ">");
524: else {
525: xmlBufferWriteChar(buf, "></");
526: xmlBufferWriteCHAR(buf, cur->name);
527: xmlBufferWriteChar(buf, ">");
528: }
529: if (cur->next != NULL) {
530: if ((cur->next->type != HTML_TEXT_NODE) &&
531: (cur->next->type != HTML_ENTITY_REF_NODE))
532: xmlBufferWriteChar(buf, "\n");
533: }
534: return;
535: }
536: xmlBufferWriteChar(buf, ">");
537: if (cur->content != NULL) {
1.6 daniel 538: xmlChar *buffer;
1.1 daniel 539:
1.9 daniel 540: #ifndef XML_USE_BUFFER_CONTENT
541: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
542: #else
543: buffer = xmlEncodeEntitiesReentrant(doc,
544: xmlBufferContent(cur->content));
545: #endif
1.1 daniel 546: if (buffer != NULL) {
547: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 548: xmlFree(buffer);
1.1 daniel 549: }
550: }
1.17 daniel 551: if (cur->children != NULL) {
552: if ((cur->children->type != HTML_TEXT_NODE) &&
553: (cur->children->type != HTML_ENTITY_REF_NODE) &&
554: (cur->children != cur->last))
1.1 daniel 555: xmlBufferWriteChar(buf, "\n");
1.17 daniel 556: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 557: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 558: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 559: (cur->children != cur->last))
1.1 daniel 560: xmlBufferWriteChar(buf, "\n");
561: }
1.11 daniel 562: if (!htmlIsAutoClosed(doc, cur)) {
563: xmlBufferWriteChar(buf, "</");
564: xmlBufferWriteCHAR(buf, cur->name);
565: xmlBufferWriteChar(buf, ">");
566: }
1.1 daniel 567: if (cur->next != NULL) {
568: if ((cur->next->type != HTML_TEXT_NODE) &&
569: (cur->next->type != HTML_ENTITY_REF_NODE))
570: xmlBufferWriteChar(buf, "\n");
571: }
572: }
573:
574: /**
1.16 daniel 575: * htmlNodeDumpFile:
576: * @out: the FILE pointer
577: * @doc: the document
578: * @cur: the current node
579: *
580: * Dump an HTML node, recursive behaviour,children are printed too.
581: */
582: void
583: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
584: xmlBufferPtr buf;
585:
586: buf = xmlBufferCreate();
587: if (buf == NULL) return;
588: htmlNodeDump(buf, doc, cur);
589: xmlBufferDump(out, buf);
590: xmlBufferFree(buf);
591: }
592:
593: /**
1.1 daniel 594: * htmlDocContentDump:
595: * @buf: the HTML buffer output
596: * @cur: the document
597: *
598: * Dump an HTML document.
599: */
600: static void
601: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 602: int type;
603:
604: /*
605: * force to output the stuff as HTML, especially for entities
606: */
607: type = cur->type;
608: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 609: if (cur->intSubset != NULL)
610: htmlDtdDump(buf, cur);
1.11 daniel 611: else {
612: /* Default to HTML-4.0 transitionnal @@@@ */
613: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
614:
615: }
1.17 daniel 616: if (cur->children != NULL) {
617: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 618: }
619: xmlBufferWriteChar(buf, "\n");
1.22 veillard 620: cur->type = (xmlElementType) type;
1.1 daniel 621: }
622:
623: /**
624: * htmlDocDumpMemory:
625: * @cur: the document
626: * @mem: OUT: the memory pointer
627: * @size: OUT: the memory lenght
628: *
1.6 daniel 629: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 630: * It's up to the caller to free the memory.
631: */
632: void
1.6 daniel 633: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 634: xmlBufferPtr buf;
635:
636: if (cur == NULL) {
637: #ifdef DEBUG_TREE
1.15 daniel 638: fprintf(stderr, "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 639: #endif
640: *mem = NULL;
641: *size = 0;
642: return;
643: }
644: buf = xmlBufferCreate();
645: if (buf == NULL) {
646: *mem = NULL;
647: *size = 0;
648: return;
649: }
650: htmlDocContentDump(buf, cur);
651: *mem = buf->content;
652: *size = buf->use;
653: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 654: xmlFree(buf);
1.1 daniel 655: }
656:
657:
1.21 veillard 658: /************************************************************************
659: * *
660: * Dumping HTML tree content to an I/O output buffer *
661: * *
662: ************************************************************************/
663:
664: static void
665: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
666:
667: /**
668: * htmlDtdDump:
669: * @buf: the HTML buffer output
670: * @doc: the document
671: *
672: * Dump the HTML document DTD, if any.
673: */
674: static void
675: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
676: xmlDtdPtr cur = doc->intSubset;
677:
678: if (cur == NULL) {
679: fprintf(stderr, "htmlDtdDump : no internal subset\n");
680: return;
681: }
682: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
683: xmlOutputBufferWriteString(buf, (const char *)cur->name);
684: if (cur->ExternalID != NULL) {
685: xmlOutputBufferWriteString(buf, " PUBLIC ");
686: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
687: if (cur->SystemID != NULL) {
688: xmlOutputBufferWriteString(buf, " ");
689: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
690: }
691: } else if (cur->SystemID != NULL) {
692: xmlOutputBufferWriteString(buf, " SYSTEM ");
693: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
694: }
695: xmlOutputBufferWriteString(buf, ">\n");
696: }
697:
698: /**
699: * htmlAttrDump:
700: * @buf: the HTML buffer output
701: * @doc: the document
702: * @cur: the attribute pointer
703: *
704: * Dump an HTML attribute
705: */
706: static void
707: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
708: xmlChar *value;
709:
710: if (cur == NULL) {
711: fprintf(stderr, "htmlAttrDump : property == NULL\n");
712: return;
713: }
714: xmlOutputBufferWriteString(buf, " ");
715: xmlOutputBufferWriteString(buf, (const char *)cur->name);
716: if (cur->children != NULL) {
717: value = xmlNodeListGetString(doc, cur->children, 0);
718: if (value) {
719: xmlOutputBufferWriteString(buf, "=");
720: xmlBufferWriteQuotedString(buf->buffer, value);
721: xmlFree(value);
722: } else {
723: xmlOutputBufferWriteString(buf, "=\"\"");
724: }
725: }
726: }
727:
728: /**
729: * htmlAttrListDump:
730: * @buf: the HTML buffer output
731: * @doc: the document
732: * @cur: the first attribute pointer
733: *
734: * Dump a list of HTML attributes
735: */
736: static void
737: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
738: if (cur == NULL) {
739: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
740: return;
741: }
742: while (cur != NULL) {
743: htmlAttrDumpOutput(buf, doc, cur, encoding);
744: cur = cur->next;
745: }
746: }
747:
748:
749: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
750: xmlNodePtr cur, const char *encoding);
751:
752: /**
753: * htmlNodeListDump:
754: * @buf: the HTML buffer output
755: * @doc: the document
756: * @cur: the first node
757: *
758: * Dump an HTML node list, recursive behaviour,children are printed too.
759: */
760: static void
761: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
762: if (cur == NULL) {
763: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
764: return;
765: }
766: while (cur != NULL) {
767: htmlNodeDumpOutput(buf, doc, cur, encoding);
768: cur = cur->next;
769: }
770: }
771:
772: /**
773: * htmlNodeDump:
774: * @buf: the HTML buffer output
775: * @doc: the document
776: * @cur: the current node
777: *
778: * Dump an HTML node, recursive behaviour,children are printed too.
779: */
780: void
781: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
782: htmlElemDescPtr info;
783:
784: if (cur == NULL) {
785: fprintf(stderr, "htmlNodeDump : node == NULL\n");
786: return;
787: }
788: /*
789: * Special cases.
790: */
791: if (cur->type == XML_DTD_NODE)
792: return;
793: if (cur->type == XML_HTML_DOCUMENT_NODE) {
794: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
795: return;
796: }
797: if (cur->type == HTML_TEXT_NODE) {
798: if (cur->content != NULL) {
799: xmlChar *buffer;
800:
801: #ifndef XML_USE_BUFFER_CONTENT
802: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
803: #else
804: buffer = xmlEncodeEntitiesReentrant(doc,
805: xmlBufferContent(cur->content));
806: #endif
807: if (buffer != NULL) {
1.25 veillard 808: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 809: xmlFree(buffer);
810: }
811: }
812: return;
813: }
814: if (cur->type == HTML_COMMENT_NODE) {
815: if (cur->content != NULL) {
816: xmlOutputBufferWriteString(buf, "<!--");
817: #ifndef XML_USE_BUFFER_CONTENT
818: xmlOutputBufferWriteString(buf, (const char *)cur->content);
819: #else
820: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
821: #endif
822: xmlOutputBufferWriteString(buf, "-->");
823: }
824: return;
825: }
826: if (cur->type == HTML_ENTITY_REF_NODE) {
827: xmlOutputBufferWriteString(buf, "&");
828: xmlOutputBufferWriteString(buf, (const char *)cur->name);
829: xmlOutputBufferWriteString(buf, ";");
830: return;
831: }
832:
833: /*
834: * Get specific HTmL info for taht node.
835: */
836: info = htmlTagLookup(cur->name);
837:
838: xmlOutputBufferWriteString(buf, "<");
839: xmlOutputBufferWriteString(buf, (const char *)cur->name);
840: if (cur->properties != NULL)
841: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
842:
843: if ((info != NULL) && (info->empty)) {
844: xmlOutputBufferWriteString(buf, ">");
845: if (cur->next != NULL) {
846: if ((cur->next->type != HTML_TEXT_NODE) &&
847: (cur->next->type != HTML_ENTITY_REF_NODE))
848: xmlOutputBufferWriteString(buf, "\n");
849: }
850: return;
851: }
852: if ((cur->content == NULL) && (cur->children == NULL)) {
853: if ((info != NULL) && (info->endTag != 0))
854: xmlOutputBufferWriteString(buf, ">");
855: else {
856: xmlOutputBufferWriteString(buf, "></");
857: xmlOutputBufferWriteString(buf, (const char *)cur->name);
858: xmlOutputBufferWriteString(buf, ">");
859: }
860: if (cur->next != NULL) {
861: if ((cur->next->type != HTML_TEXT_NODE) &&
862: (cur->next->type != HTML_ENTITY_REF_NODE))
863: xmlOutputBufferWriteString(buf, "\n");
864: }
865: return;
866: }
867: xmlOutputBufferWriteString(buf, ">");
868: if (cur->content != NULL) {
869: #if 0
870: xmlChar *buffer;
871:
872: #ifndef XML_USE_BUFFER_CONTENT
873: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
874: #else
875: buffer = xmlEncodeEntitiesReentrant(doc,
876: xmlBufferContent(cur->content));
877: #endif
878: if (buffer != NULL) {
879: xmlOutputBufferWriteString(buf, buffer);
880: xmlFree(buffer);
881: }
882: #else
883: /*
884: * Uses the OutputBuffer property to automatically convert
885: * invalids to charrefs
886: */
887:
888: #ifndef XML_USE_BUFFER_CONTENT
889: xmlOutputBufferWriteString(buf, (const char *) cur->content);
890: #else
891: xmlOutputBufferWriteString(buf,
892: (const char *) xmlBufferContent(cur->content));
893: #endif
894: #endif
895: }
896: if (cur->children != NULL) {
897: if ((cur->children->type != HTML_TEXT_NODE) &&
898: (cur->children->type != HTML_ENTITY_REF_NODE) &&
899: (cur->children != cur->last))
900: xmlOutputBufferWriteString(buf, "\n");
901: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
902: if ((cur->last->type != HTML_TEXT_NODE) &&
903: (cur->last->type != HTML_ENTITY_REF_NODE) &&
904: (cur->children != cur->last))
905: xmlOutputBufferWriteString(buf, "\n");
906: }
907: if (!htmlIsAutoClosed(doc, cur)) {
908: xmlOutputBufferWriteString(buf, "</");
909: xmlOutputBufferWriteString(buf, (const char *)cur->name);
910: xmlOutputBufferWriteString(buf, ">");
911: }
912: if (cur->next != NULL) {
913: if ((cur->next->type != HTML_TEXT_NODE) &&
914: (cur->next->type != HTML_ENTITY_REF_NODE))
915: xmlOutputBufferWriteString(buf, "\n");
916: }
917: }
918:
919: /**
920: * htmlDocContentDump:
921: * @buf: the HTML buffer output
922: * @cur: the document
923: *
924: * Dump an HTML document.
925: */
926: static void
927: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
928: int type;
929:
930: /*
931: * force to output the stuff as HTML, especially for entities
932: */
933: type = cur->type;
934: cur->type = XML_HTML_DOCUMENT_NODE;
935: if (cur->intSubset != NULL)
936: htmlDtdDumpOutput(buf, cur, NULL);
937: else {
938: /* Default to HTML-4.0 transitionnal @@@@ */
939: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
940:
941: }
942: if (cur->children != NULL) {
943: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
944: }
945: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 946: cur->type = (xmlElementType) type;
1.21 veillard 947: }
948:
949:
950: /************************************************************************
951: * *
952: * Saving functions front-ends *
953: * *
954: ************************************************************************/
955:
1.1 daniel 956: /**
957: * htmlDocDump:
958: * @f: the FILE*
959: * @cur: the document
960: *
961: * Dump an HTML document to an open FILE.
1.21 veillard 962: *
963: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 964: */
1.21 veillard 965: int
1.1 daniel 966: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 967: xmlOutputBufferPtr buf;
1.24 veillard 968: xmlCharEncodingHandlerPtr handler = NULL;
969: const char *encoding;
1.21 veillard 970: int ret;
1.1 daniel 971:
972: if (cur == NULL) {
973: #ifdef DEBUG_TREE
1.15 daniel 974: fprintf(stderr, "htmlDocDump : document == NULL\n");
1.1 daniel 975: #endif
1.21 veillard 976: return(-1);
1.1 daniel 977: }
1.24 veillard 978:
979: encoding = (const char *) htmlGetMetaEncoding(cur);
980:
981: if (encoding != NULL) {
982: xmlCharEncoding enc;
983:
984: enc = xmlParseCharEncoding(encoding);
985: if (enc != cur->charset) {
986: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
987: /*
988: * Not supported yet
989: */
990: return(-1);
991: }
992:
993: handler = xmlFindCharEncodingHandler(encoding);
994: if (handler == NULL)
995: return(-1);
996: }
997: }
998:
999: /*
1.25 veillard 1000: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1001: */
1002: if (handler == NULL)
1.25 veillard 1003: handler = xmlFindCharEncodingHandler("HTML");
1004: if (handler == NULL)
1.24 veillard 1005: handler = xmlFindCharEncodingHandler("ascii");
1006:
1007: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 1008: if (buf == NULL) return(-1);
1009: htmlDocContentDumpOutput(buf, cur, NULL);
1010:
1011: ret = xmlOutputBufferClose(buf);
1012: return(ret);
1013: }
1014:
1015: /**
1016: * htmlSaveFile:
1017: * @filename: the filename (or URL)
1018: * @cur: the document
1019: *
1020: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1021: * used.
1022: * returns: the number of byte written or -1 in case of failure.
1023: */
1024: int
1025: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1026: xmlOutputBufferPtr buf;
1.24 veillard 1027: xmlCharEncodingHandlerPtr handler = NULL;
1028: const char *encoding;
1.21 veillard 1029: int ret;
1030:
1.24 veillard 1031: encoding = (const char *) htmlGetMetaEncoding(cur);
1032:
1033: if (encoding != NULL) {
1034: xmlCharEncoding enc;
1035:
1036: enc = xmlParseCharEncoding(encoding);
1037: if (enc != cur->charset) {
1038: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1039: /*
1040: * Not supported yet
1041: */
1042: return(-1);
1043: }
1044:
1045: handler = xmlFindCharEncodingHandler(encoding);
1046: if (handler == NULL)
1047: return(-1);
1048: }
1049: }
1050:
1051: /*
1.25 veillard 1052: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1053: */
1054: if (handler == NULL)
1.25 veillard 1055: handler = xmlFindCharEncodingHandler("HTML");
1056: if (handler == NULL)
1.24 veillard 1057: handler = xmlFindCharEncodingHandler("ascii");
1058:
1.21 veillard 1059: /*
1060: * save the content to a temp buffer.
1061: */
1.24 veillard 1062: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1063: if (buf == NULL) return(0);
1064:
1065: htmlDocContentDumpOutput(buf, cur, NULL);
1066:
1067: ret = xmlOutputBufferClose(buf);
1068: return(ret);
1.1 daniel 1069: }
1070:
1071: /**
1.26 ! veillard 1072: * htmlSaveFileEnc:
1.1 daniel 1073: * @filename: the filename
1074: * @cur: the document
1075: *
1.26 ! veillard 1076: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1077: *
1078: * returns: the number of byte written or -1 in case of failure.
1079: */
1080: int
1.21 veillard 1081: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1082: xmlOutputBufferPtr buf;
1083: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1084: int ret;
1085:
1.21 veillard 1086: if (encoding != NULL) {
1087: xmlCharEncoding enc;
1088:
1089: enc = xmlParseCharEncoding(encoding);
1090: if (enc != cur->charset) {
1091: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1092: /*
1093: * Not supported yet
1094: */
1095: return(-1);
1096: }
1097:
1098: handler = xmlFindCharEncodingHandler(encoding);
1099: if (handler == NULL)
1100: return(-1);
1.26 ! veillard 1101: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1102: }
1103: }
1.24 veillard 1104:
1105: /*
1.25 veillard 1106: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1107: */
1.25 veillard 1108: if (handler == NULL)
1109: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1110: if (handler == NULL)
1111: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1112:
1.1 daniel 1113: /*
1114: * save the content to a temp buffer.
1115: */
1.21 veillard 1116: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1117: if (buf == NULL) return(0);
1118:
1.21 veillard 1119: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1120:
1.21 veillard 1121: ret = xmlOutputBufferClose(buf);
1122: return(ret);
1.1 daniel 1123: }
1.18 daniel 1124: #endif /* LIBXML_HTML_ENABLED */
Webmaster