Annotation of XML/HTMLtree.c, revision 1.27
1.1 daniel 1: /*
2: * HTMLtree.c : implementation of access functions for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
16: #include "xmlversion.h"
17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.1 daniel 34:
1.21 veillard 35: /************************************************************************
36: * *
1.23 veillard 37: * Getting/Setting encoding meta tags *
38: * *
39: ************************************************************************/
40:
41: /**
42: * htmlGetMetaEncoding:
43: * @doc: the document
44: *
45: * Encoding definition lookup in the Meta tags
46: *
47: * Returns the current encoding as flagged in the HTML source
48: */
49: const xmlChar *
50: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 51: htmlNodePtr cur;
52: const xmlChar *content;
53: const xmlChar *encoding;
54:
55: if (doc == NULL)
56: return(NULL);
57: cur = doc->children;
58:
59: /*
60: * Search the html
61: */
62: while (cur != NULL) {
63: if (cur->name != NULL) {
64: if (!xmlStrcmp(cur->name, BAD_CAST"html"))
65: break;
66: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
67: goto found_head;
68: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
69: goto found_meta;
70: }
71: cur = cur->next;
72: }
73: if (cur == NULL)
74: return(NULL);
75: cur = cur->children;
76:
77: /*
78: * Search the head
79: */
80: while (cur != NULL) {
81: if (cur->name != NULL) {
82: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
83: break;
84: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
85: goto found_meta;
86: }
87: cur = cur->next;
88: }
89: if (cur == NULL)
90: return(NULL);
91: found_head:
92: cur = cur->children;
93:
94: /*
95: * Search the meta elements
96: */
97: found_meta:
98: while (cur != NULL) {
99: if (cur->name != NULL) {
100: if (!xmlStrcmp(cur->name, BAD_CAST"meta")) {
101: xmlAttrPtr attr = cur->properties;
102: int http;
103: const xmlChar *value;
104:
105: content = NULL;
106: http = 0;
107: while (attr != NULL) {
108: if ((attr->children != NULL) &&
109: (attr->children->type == XML_TEXT_NODE) &&
110: (attr->children->next == NULL)) {
111: #ifndef XML_USE_BUFFER_CONTENT
112: value = attr->children->content;
113: #else
114: value = xmlBufferContent(attr->children->content);
115: #endif
116: if (((!xmlStrcmp(attr->name, BAD_CAST"http-equiv")) ||
117: (!xmlStrcmp(attr->name, BAD_CAST"Http-Equiv")) ||
118: (!xmlStrcmp(attr->name, BAD_CAST"HTTP-EQUIV"))) &&
119: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
120: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
121: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
122: http = 1;
123: else if ((value != NULL) &&
124: ((!xmlStrcmp(attr->name, BAD_CAST"content")) ||
125: (!xmlStrcmp(attr->name, BAD_CAST"Content")) ||
126: (!xmlStrcmp(attr->name, BAD_CAST"CONTENT"))))
127: content = value;
128: if ((http != 0) && (content != NULL))
129: goto found_content;
130: }
131: attr = attr->next;
132: }
133: }
134: }
135: cur = cur->next;
136: }
137: return(NULL);
138:
139: found_content:
140: encoding = xmlStrstr(content, BAD_CAST"charset=");
141: if (encoding == NULL)
142: encoding = xmlStrstr(content, BAD_CAST"Charset=");
143: if (encoding == NULL)
144: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
145: if (encoding != NULL) {
146: encoding += 8;
147: } else {
148: encoding = xmlStrstr(content, BAD_CAST"charset =");
149: if (encoding == NULL)
150: encoding = xmlStrstr(content, BAD_CAST"Charset =");
151: if (encoding == NULL)
152: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
153: if (encoding != NULL)
154: encoding += 9;
155: }
156: if (encoding != NULL) {
157: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
158: }
159: return(encoding);
1.23 veillard 160: }
161:
162: /**
163: * htmlSetMetaEncoding:
164: * @doc: the document
165: * @encoding: the encoding string
166: *
167: * Sets the current encoding in the Meta tags
168: * NOTE: this will not change the document content encoding, just
169: * the META flag associated.
170: *
171: * Returns 0 in case of success and -1 in case of error
172: */
173: int
174: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 veillard 175: htmlNodePtr cur, meta;
176: const xmlChar *content;
177: char newcontent[100];
178:
179:
180: if (doc == NULL)
181: return(-1);
182:
183: if (encoding != NULL) {
1.27 ! veillard 184: #ifdef HAVE_SNPRINTF
! 185: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
! 186: encoding);
! 187: #else
1.26 veillard 188: sprintf(newcontent, "text/html; charset=%s", encoding);
1.27 ! veillard 189: #endif
! 190: newcontent[sizeof(newcontent) - 1] = 0;
1.26 veillard 191: }
192:
193: cur = doc->children;
194:
195: /*
196: * Search the html
197: */
198: while (cur != NULL) {
199: if (cur->name != NULL) {
200: if (!xmlStrcmp(cur->name, BAD_CAST"html"))
201: break;
202: if (!xmlStrcmp(cur->name, BAD_CAST"body")) {
203: if (encoding == NULL)
204: return(0);
205: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
206: xmlAddPrevSibling(cur, meta);
207: cur = meta;
208: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
209: xmlAddChild(cur, meta);
210: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
211: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
212: return(0);
213: }
214: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
215: goto found_head;
216: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
217: goto found_meta;
218: }
219: cur = cur->next;
220: }
221: if (cur == NULL)
222: return(-1);
223: cur = cur->children;
224:
225: /*
226: * Search the head
227: */
228: while (cur != NULL) {
229: if (cur->name != NULL) {
230: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
231: break;
232: if (!xmlStrcmp(cur->name, BAD_CAST"body")) {
233: if (encoding == NULL)
234: return(0);
235: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
236: xmlAddPrevSibling(cur, meta);
237: cur = meta;
238: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
239: xmlAddChild(cur, meta);
240: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
241: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
242: return(0);
243: }
244: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
245: goto found_meta;
246: }
247: cur = cur->next;
248: }
249: if (cur == NULL)
250: return(-1);
251: found_head:
252: if (cur->children == NULL) {
253: if (encoding == NULL)
254: return(0);
255: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
256: xmlAddChild(cur, meta);
257: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
258: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
259: return(0);
260: }
261: cur = cur->children;
262:
263: found_meta:
264: if (encoding != NULL) {
265: /*
266: * Create a new Meta element with the right aatributes
267: */
268:
269: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
270: xmlAddPrevSibling(cur, meta);
271: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
272: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
273: }
274:
275: /*
276: * Search and destroy all the remaining the meta elements carrying
277: * encoding informations
278: */
279: while (cur != NULL) {
280: if (cur->name != NULL) {
281: if (!xmlStrcmp(cur->name, BAD_CAST"meta")) {
282: xmlAttrPtr attr = cur->properties;
283: int http;
284: const xmlChar *value;
285:
286: content = NULL;
287: http = 0;
288: while (attr != NULL) {
289: if ((attr->children != NULL) &&
290: (attr->children->type == XML_TEXT_NODE) &&
291: (attr->children->next == NULL)) {
292: #ifndef XML_USE_BUFFER_CONTENT
293: value = attr->children->content;
294: #else
295: value = xmlBufferContent(attr->children->content);
296: #endif
297: if (((!xmlStrcmp(attr->name, BAD_CAST"http-equiv")) ||
298: (!xmlStrcmp(attr->name, BAD_CAST"Http-Equiv")) ||
299: (!xmlStrcmp(attr->name, BAD_CAST"HTTP-EQUIV"))) &&
300: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
301: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
302: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
303: http = 1;
304: else if ((value != NULL) &&
305: ((!xmlStrcmp(attr->name, BAD_CAST"content")) ||
306: (!xmlStrcmp(attr->name, BAD_CAST"Content")) ||
307: (!xmlStrcmp(attr->name, BAD_CAST"CONTENT"))))
308: content = value;
309: if ((http != 0) && (content != NULL))
310: break;
311: }
312: attr = attr->next;
313: }
314: if ((http != 0) && (content != NULL)) {
315: meta = cur;
316: cur = cur->next;
317: xmlUnlinkNode(meta);
318: xmlFreeNode(meta);
319: continue;
320: }
321:
322: }
323: }
324: cur = cur->next;
325: }
326: return(0);
1.23 veillard 327: }
328:
329: /************************************************************************
330: * *
1.21 veillard 331: * Dumping HTML tree content to a simple buffer *
332: * *
333: ************************************************************************/
334:
1.14 daniel 335: static void
336: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
337:
1.1 daniel 338: /**
339: * htmlDtdDump:
340: * @buf: the HTML buffer output
341: * @doc: the document
342: *
343: * Dump the HTML document DTD, if any.
344: */
345: static void
346: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
347: xmlDtdPtr cur = doc->intSubset;
348:
349: if (cur == NULL) {
350: fprintf(stderr, "htmlDtdDump : no internal subset\n");
351: return;
352: }
353: xmlBufferWriteChar(buf, "<!DOCTYPE ");
354: xmlBufferWriteCHAR(buf, cur->name);
355: if (cur->ExternalID != NULL) {
356: xmlBufferWriteChar(buf, " PUBLIC ");
357: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 358: if (cur->SystemID != NULL) {
359: xmlBufferWriteChar(buf, " ");
360: xmlBufferWriteQuotedString(buf, cur->SystemID);
361: }
1.1 daniel 362: } else if (cur->SystemID != NULL) {
363: xmlBufferWriteChar(buf, " SYSTEM ");
364: xmlBufferWriteQuotedString(buf, cur->SystemID);
365: }
366: xmlBufferWriteChar(buf, ">\n");
367: }
368:
369: /**
370: * htmlAttrDump:
371: * @buf: the HTML buffer output
372: * @doc: the document
373: * @cur: the attribute pointer
374: *
375: * Dump an HTML attribute
376: */
377: static void
378: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 379: xmlChar *value;
1.1 daniel 380:
381: if (cur == NULL) {
382: fprintf(stderr, "htmlAttrDump : property == NULL\n");
383: return;
384: }
385: xmlBufferWriteChar(buf, " ");
386: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 387: if (cur->children != NULL) {
388: value = xmlNodeListGetString(doc, cur->children, 0);
389: if (value) {
390: xmlBufferWriteChar(buf, "=");
391: xmlBufferWriteQuotedString(buf, value);
392: xmlFree(value);
393: } else {
394: xmlBufferWriteChar(buf, "=\"\"");
395: }
1.1 daniel 396: }
397: }
398:
399: /**
400: * htmlAttrListDump:
401: * @buf: the HTML buffer output
402: * @doc: the document
403: * @cur: the first attribute pointer
404: *
405: * Dump a list of HTML attributes
406: */
407: static void
408: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
409: if (cur == NULL) {
410: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
411: return;
412: }
413: while (cur != NULL) {
414: htmlAttrDump(buf, doc, cur);
415: cur = cur->next;
416: }
417: }
418:
419:
1.14 daniel 420: void
1.1 daniel 421: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
422: /**
423: * htmlNodeListDump:
424: * @buf: the HTML buffer output
425: * @doc: the document
426: * @cur: the first node
427: *
428: * Dump an HTML node list, recursive behaviour,children are printed too.
429: */
430: static void
431: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
432: if (cur == NULL) {
433: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
434: return;
435: }
436: while (cur != NULL) {
437: htmlNodeDump(buf, doc, cur);
438: cur = cur->next;
439: }
440: }
441:
442: /**
443: * htmlNodeDump:
444: * @buf: the HTML buffer output
445: * @doc: the document
446: * @cur: the current node
447: *
448: * Dump an HTML node, recursive behaviour,children are printed too.
449: */
1.14 daniel 450: void
1.1 daniel 451: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
452: htmlElemDescPtr info;
453:
454: if (cur == NULL) {
455: fprintf(stderr, "htmlNodeDump : node == NULL\n");
456: return;
457: }
458: /*
459: * Special cases.
460: */
1.20 daniel 461: if (cur->type == XML_DTD_NODE)
462: return;
1.14 daniel 463: if (cur->type == XML_HTML_DOCUMENT_NODE) {
464: htmlDocContentDump(buf, (xmlDocPtr) cur);
465: return;
466: }
1.1 daniel 467: if (cur->type == HTML_TEXT_NODE) {
468: if (cur->content != NULL) {
1.6 daniel 469: xmlChar *buffer;
1.1 daniel 470:
1.9 daniel 471: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 472: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 473: #else
474: buffer = xmlEncodeEntitiesReentrant(doc,
475: xmlBufferContent(cur->content));
476: #endif
1.1 daniel 477: if (buffer != NULL) {
478: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 479: xmlFree(buffer);
1.1 daniel 480: }
481: }
482: return;
483: }
484: if (cur->type == HTML_COMMENT_NODE) {
485: if (cur->content != NULL) {
486: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 487: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 488: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 489: #else
490: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
491: #endif
1.1 daniel 492: xmlBufferWriteChar(buf, "-->");
493: }
494: return;
495: }
496: if (cur->type == HTML_ENTITY_REF_NODE) {
497: xmlBufferWriteChar(buf, "&");
498: xmlBufferWriteCHAR(buf, cur->name);
499: xmlBufferWriteChar(buf, ";");
500: return;
501: }
502:
503: /*
504: * Get specific HTmL info for taht node.
505: */
506: info = htmlTagLookup(cur->name);
507:
508: xmlBufferWriteChar(buf, "<");
509: xmlBufferWriteCHAR(buf, cur->name);
510: if (cur->properties != NULL)
511: htmlAttrListDump(buf, doc, cur->properties);
512:
1.7 daniel 513: if ((info != NULL) && (info->empty)) {
1.1 daniel 514: xmlBufferWriteChar(buf, ">");
515: if (cur->next != NULL) {
516: if ((cur->next->type != HTML_TEXT_NODE) &&
517: (cur->next->type != HTML_ENTITY_REF_NODE))
518: xmlBufferWriteChar(buf, "\n");
519: }
520: return;
521: }
1.17 daniel 522: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 523: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 524: xmlBufferWriteChar(buf, ">");
525: else {
526: xmlBufferWriteChar(buf, "></");
527: xmlBufferWriteCHAR(buf, cur->name);
528: xmlBufferWriteChar(buf, ">");
529: }
530: if (cur->next != NULL) {
531: if ((cur->next->type != HTML_TEXT_NODE) &&
532: (cur->next->type != HTML_ENTITY_REF_NODE))
533: xmlBufferWriteChar(buf, "\n");
534: }
535: return;
536: }
537: xmlBufferWriteChar(buf, ">");
538: if (cur->content != NULL) {
1.6 daniel 539: xmlChar *buffer;
1.1 daniel 540:
1.9 daniel 541: #ifndef XML_USE_BUFFER_CONTENT
542: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
543: #else
544: buffer = xmlEncodeEntitiesReentrant(doc,
545: xmlBufferContent(cur->content));
546: #endif
1.1 daniel 547: if (buffer != NULL) {
548: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 549: xmlFree(buffer);
1.1 daniel 550: }
551: }
1.17 daniel 552: if (cur->children != NULL) {
553: if ((cur->children->type != HTML_TEXT_NODE) &&
554: (cur->children->type != HTML_ENTITY_REF_NODE) &&
555: (cur->children != cur->last))
1.1 daniel 556: xmlBufferWriteChar(buf, "\n");
1.17 daniel 557: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 558: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 559: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 560: (cur->children != cur->last))
1.1 daniel 561: xmlBufferWriteChar(buf, "\n");
562: }
1.11 daniel 563: if (!htmlIsAutoClosed(doc, cur)) {
564: xmlBufferWriteChar(buf, "</");
565: xmlBufferWriteCHAR(buf, cur->name);
566: xmlBufferWriteChar(buf, ">");
567: }
1.1 daniel 568: if (cur->next != NULL) {
569: if ((cur->next->type != HTML_TEXT_NODE) &&
570: (cur->next->type != HTML_ENTITY_REF_NODE))
571: xmlBufferWriteChar(buf, "\n");
572: }
573: }
574:
575: /**
1.16 daniel 576: * htmlNodeDumpFile:
577: * @out: the FILE pointer
578: * @doc: the document
579: * @cur: the current node
580: *
581: * Dump an HTML node, recursive behaviour,children are printed too.
582: */
583: void
584: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
585: xmlBufferPtr buf;
586:
587: buf = xmlBufferCreate();
588: if (buf == NULL) return;
589: htmlNodeDump(buf, doc, cur);
590: xmlBufferDump(out, buf);
591: xmlBufferFree(buf);
592: }
593:
594: /**
1.1 daniel 595: * htmlDocContentDump:
596: * @buf: the HTML buffer output
597: * @cur: the document
598: *
599: * Dump an HTML document.
600: */
601: static void
602: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 603: int type;
604:
605: /*
606: * force to output the stuff as HTML, especially for entities
607: */
608: type = cur->type;
609: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 610: if (cur->intSubset != NULL)
611: htmlDtdDump(buf, cur);
1.11 daniel 612: else {
613: /* Default to HTML-4.0 transitionnal @@@@ */
614: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
615:
616: }
1.17 daniel 617: if (cur->children != NULL) {
618: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 619: }
620: xmlBufferWriteChar(buf, "\n");
1.22 veillard 621: cur->type = (xmlElementType) type;
1.1 daniel 622: }
623:
624: /**
625: * htmlDocDumpMemory:
626: * @cur: the document
627: * @mem: OUT: the memory pointer
628: * @size: OUT: the memory lenght
629: *
1.6 daniel 630: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 631: * It's up to the caller to free the memory.
632: */
633: void
1.6 daniel 634: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 635: xmlBufferPtr buf;
636:
637: if (cur == NULL) {
638: #ifdef DEBUG_TREE
1.15 daniel 639: fprintf(stderr, "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 640: #endif
641: *mem = NULL;
642: *size = 0;
643: return;
644: }
645: buf = xmlBufferCreate();
646: if (buf == NULL) {
647: *mem = NULL;
648: *size = 0;
649: return;
650: }
651: htmlDocContentDump(buf, cur);
652: *mem = buf->content;
653: *size = buf->use;
654: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 655: xmlFree(buf);
1.1 daniel 656: }
657:
658:
1.21 veillard 659: /************************************************************************
660: * *
661: * Dumping HTML tree content to an I/O output buffer *
662: * *
663: ************************************************************************/
664:
665: static void
666: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
667:
668: /**
669: * htmlDtdDump:
670: * @buf: the HTML buffer output
671: * @doc: the document
672: *
673: * Dump the HTML document DTD, if any.
674: */
675: static void
676: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
677: xmlDtdPtr cur = doc->intSubset;
678:
679: if (cur == NULL) {
680: fprintf(stderr, "htmlDtdDump : no internal subset\n");
681: return;
682: }
683: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
684: xmlOutputBufferWriteString(buf, (const char *)cur->name);
685: if (cur->ExternalID != NULL) {
686: xmlOutputBufferWriteString(buf, " PUBLIC ");
687: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
688: if (cur->SystemID != NULL) {
689: xmlOutputBufferWriteString(buf, " ");
690: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
691: }
692: } else if (cur->SystemID != NULL) {
693: xmlOutputBufferWriteString(buf, " SYSTEM ");
694: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
695: }
696: xmlOutputBufferWriteString(buf, ">\n");
697: }
698:
699: /**
700: * htmlAttrDump:
701: * @buf: the HTML buffer output
702: * @doc: the document
703: * @cur: the attribute pointer
704: *
705: * Dump an HTML attribute
706: */
707: static void
708: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
709: xmlChar *value;
710:
711: if (cur == NULL) {
712: fprintf(stderr, "htmlAttrDump : property == NULL\n");
713: return;
714: }
715: xmlOutputBufferWriteString(buf, " ");
716: xmlOutputBufferWriteString(buf, (const char *)cur->name);
717: if (cur->children != NULL) {
718: value = xmlNodeListGetString(doc, cur->children, 0);
719: if (value) {
720: xmlOutputBufferWriteString(buf, "=");
721: xmlBufferWriteQuotedString(buf->buffer, value);
722: xmlFree(value);
723: } else {
724: xmlOutputBufferWriteString(buf, "=\"\"");
725: }
726: }
727: }
728:
729: /**
730: * htmlAttrListDump:
731: * @buf: the HTML buffer output
732: * @doc: the document
733: * @cur: the first attribute pointer
734: *
735: * Dump a list of HTML attributes
736: */
737: static void
738: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
739: if (cur == NULL) {
740: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
741: return;
742: }
743: while (cur != NULL) {
744: htmlAttrDumpOutput(buf, doc, cur, encoding);
745: cur = cur->next;
746: }
747: }
748:
749:
750: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
751: xmlNodePtr cur, const char *encoding);
752:
753: /**
754: * htmlNodeListDump:
755: * @buf: the HTML buffer output
756: * @doc: the document
757: * @cur: the first node
758: *
759: * Dump an HTML node list, recursive behaviour,children are printed too.
760: */
761: static void
762: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
763: if (cur == NULL) {
764: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
765: return;
766: }
767: while (cur != NULL) {
768: htmlNodeDumpOutput(buf, doc, cur, encoding);
769: cur = cur->next;
770: }
771: }
772:
773: /**
774: * htmlNodeDump:
775: * @buf: the HTML buffer output
776: * @doc: the document
777: * @cur: the current node
778: *
779: * Dump an HTML node, recursive behaviour,children are printed too.
780: */
781: void
782: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
783: htmlElemDescPtr info;
784:
785: if (cur == NULL) {
786: fprintf(stderr, "htmlNodeDump : node == NULL\n");
787: return;
788: }
789: /*
790: * Special cases.
791: */
792: if (cur->type == XML_DTD_NODE)
793: return;
794: if (cur->type == XML_HTML_DOCUMENT_NODE) {
795: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
796: return;
797: }
798: if (cur->type == HTML_TEXT_NODE) {
799: if (cur->content != NULL) {
800: xmlChar *buffer;
801:
802: #ifndef XML_USE_BUFFER_CONTENT
803: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
804: #else
805: buffer = xmlEncodeEntitiesReentrant(doc,
806: xmlBufferContent(cur->content));
807: #endif
808: if (buffer != NULL) {
1.25 veillard 809: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 810: xmlFree(buffer);
811: }
812: }
813: return;
814: }
815: if (cur->type == HTML_COMMENT_NODE) {
816: if (cur->content != NULL) {
817: xmlOutputBufferWriteString(buf, "<!--");
818: #ifndef XML_USE_BUFFER_CONTENT
819: xmlOutputBufferWriteString(buf, (const char *)cur->content);
820: #else
821: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
822: #endif
823: xmlOutputBufferWriteString(buf, "-->");
824: }
825: return;
826: }
827: if (cur->type == HTML_ENTITY_REF_NODE) {
828: xmlOutputBufferWriteString(buf, "&");
829: xmlOutputBufferWriteString(buf, (const char *)cur->name);
830: xmlOutputBufferWriteString(buf, ";");
831: return;
832: }
833:
834: /*
835: * Get specific HTmL info for taht node.
836: */
837: info = htmlTagLookup(cur->name);
838:
839: xmlOutputBufferWriteString(buf, "<");
840: xmlOutputBufferWriteString(buf, (const char *)cur->name);
841: if (cur->properties != NULL)
842: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
843:
844: if ((info != NULL) && (info->empty)) {
845: xmlOutputBufferWriteString(buf, ">");
846: if (cur->next != NULL) {
847: if ((cur->next->type != HTML_TEXT_NODE) &&
848: (cur->next->type != HTML_ENTITY_REF_NODE))
849: xmlOutputBufferWriteString(buf, "\n");
850: }
851: return;
852: }
853: if ((cur->content == NULL) && (cur->children == NULL)) {
854: if ((info != NULL) && (info->endTag != 0))
855: xmlOutputBufferWriteString(buf, ">");
856: else {
857: xmlOutputBufferWriteString(buf, "></");
858: xmlOutputBufferWriteString(buf, (const char *)cur->name);
859: xmlOutputBufferWriteString(buf, ">");
860: }
861: if (cur->next != NULL) {
862: if ((cur->next->type != HTML_TEXT_NODE) &&
863: (cur->next->type != HTML_ENTITY_REF_NODE))
864: xmlOutputBufferWriteString(buf, "\n");
865: }
866: return;
867: }
868: xmlOutputBufferWriteString(buf, ">");
869: if (cur->content != NULL) {
870: #if 0
871: xmlChar *buffer;
872:
873: #ifndef XML_USE_BUFFER_CONTENT
874: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
875: #else
876: buffer = xmlEncodeEntitiesReentrant(doc,
877: xmlBufferContent(cur->content));
878: #endif
879: if (buffer != NULL) {
880: xmlOutputBufferWriteString(buf, buffer);
881: xmlFree(buffer);
882: }
883: #else
884: /*
885: * Uses the OutputBuffer property to automatically convert
886: * invalids to charrefs
887: */
888:
889: #ifndef XML_USE_BUFFER_CONTENT
890: xmlOutputBufferWriteString(buf, (const char *) cur->content);
891: #else
892: xmlOutputBufferWriteString(buf,
893: (const char *) xmlBufferContent(cur->content));
894: #endif
895: #endif
896: }
897: if (cur->children != NULL) {
898: if ((cur->children->type != HTML_TEXT_NODE) &&
899: (cur->children->type != HTML_ENTITY_REF_NODE) &&
900: (cur->children != cur->last))
901: xmlOutputBufferWriteString(buf, "\n");
902: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
903: if ((cur->last->type != HTML_TEXT_NODE) &&
904: (cur->last->type != HTML_ENTITY_REF_NODE) &&
905: (cur->children != cur->last))
906: xmlOutputBufferWriteString(buf, "\n");
907: }
908: if (!htmlIsAutoClosed(doc, cur)) {
909: xmlOutputBufferWriteString(buf, "</");
910: xmlOutputBufferWriteString(buf, (const char *)cur->name);
911: xmlOutputBufferWriteString(buf, ">");
912: }
913: if (cur->next != NULL) {
914: if ((cur->next->type != HTML_TEXT_NODE) &&
915: (cur->next->type != HTML_ENTITY_REF_NODE))
916: xmlOutputBufferWriteString(buf, "\n");
917: }
918: }
919:
920: /**
921: * htmlDocContentDump:
922: * @buf: the HTML buffer output
923: * @cur: the document
924: *
925: * Dump an HTML document.
926: */
927: static void
928: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
929: int type;
930:
931: /*
932: * force to output the stuff as HTML, especially for entities
933: */
934: type = cur->type;
935: cur->type = XML_HTML_DOCUMENT_NODE;
936: if (cur->intSubset != NULL)
937: htmlDtdDumpOutput(buf, cur, NULL);
938: else {
939: /* Default to HTML-4.0 transitionnal @@@@ */
940: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
941:
942: }
943: if (cur->children != NULL) {
944: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
945: }
946: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 947: cur->type = (xmlElementType) type;
1.21 veillard 948: }
949:
950:
951: /************************************************************************
952: * *
953: * Saving functions front-ends *
954: * *
955: ************************************************************************/
956:
1.1 daniel 957: /**
958: * htmlDocDump:
959: * @f: the FILE*
960: * @cur: the document
961: *
962: * Dump an HTML document to an open FILE.
1.21 veillard 963: *
964: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 965: */
1.21 veillard 966: int
1.1 daniel 967: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 968: xmlOutputBufferPtr buf;
1.24 veillard 969: xmlCharEncodingHandlerPtr handler = NULL;
970: const char *encoding;
1.21 veillard 971: int ret;
1.1 daniel 972:
973: if (cur == NULL) {
974: #ifdef DEBUG_TREE
1.15 daniel 975: fprintf(stderr, "htmlDocDump : document == NULL\n");
1.1 daniel 976: #endif
1.21 veillard 977: return(-1);
1.1 daniel 978: }
1.24 veillard 979:
980: encoding = (const char *) htmlGetMetaEncoding(cur);
981:
982: if (encoding != NULL) {
983: xmlCharEncoding enc;
984:
985: enc = xmlParseCharEncoding(encoding);
986: if (enc != cur->charset) {
987: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
988: /*
989: * Not supported yet
990: */
991: return(-1);
992: }
993:
994: handler = xmlFindCharEncodingHandler(encoding);
995: if (handler == NULL)
996: return(-1);
997: }
998: }
999:
1000: /*
1.25 veillard 1001: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1002: */
1003: if (handler == NULL)
1.25 veillard 1004: handler = xmlFindCharEncodingHandler("HTML");
1005: if (handler == NULL)
1.24 veillard 1006: handler = xmlFindCharEncodingHandler("ascii");
1007:
1008: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 1009: if (buf == NULL) return(-1);
1010: htmlDocContentDumpOutput(buf, cur, NULL);
1011:
1012: ret = xmlOutputBufferClose(buf);
1013: return(ret);
1014: }
1015:
1016: /**
1017: * htmlSaveFile:
1018: * @filename: the filename (or URL)
1019: * @cur: the document
1020: *
1021: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1022: * used.
1023: * returns: the number of byte written or -1 in case of failure.
1024: */
1025: int
1026: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1027: xmlOutputBufferPtr buf;
1.24 veillard 1028: xmlCharEncodingHandlerPtr handler = NULL;
1029: const char *encoding;
1.21 veillard 1030: int ret;
1031:
1.24 veillard 1032: encoding = (const char *) htmlGetMetaEncoding(cur);
1033:
1034: if (encoding != NULL) {
1035: xmlCharEncoding enc;
1036:
1037: enc = xmlParseCharEncoding(encoding);
1038: if (enc != cur->charset) {
1039: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1040: /*
1041: * Not supported yet
1042: */
1043: return(-1);
1044: }
1045:
1046: handler = xmlFindCharEncodingHandler(encoding);
1047: if (handler == NULL)
1048: return(-1);
1049: }
1050: }
1051:
1052: /*
1.25 veillard 1053: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1054: */
1055: if (handler == NULL)
1.25 veillard 1056: handler = xmlFindCharEncodingHandler("HTML");
1057: if (handler == NULL)
1.24 veillard 1058: handler = xmlFindCharEncodingHandler("ascii");
1059:
1.21 veillard 1060: /*
1061: * save the content to a temp buffer.
1062: */
1.24 veillard 1063: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1064: if (buf == NULL) return(0);
1065:
1066: htmlDocContentDumpOutput(buf, cur, NULL);
1067:
1068: ret = xmlOutputBufferClose(buf);
1069: return(ret);
1.1 daniel 1070: }
1071:
1072: /**
1.26 veillard 1073: * htmlSaveFileEnc:
1.1 daniel 1074: * @filename: the filename
1075: * @cur: the document
1076: *
1.26 veillard 1077: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1078: *
1079: * returns: the number of byte written or -1 in case of failure.
1080: */
1081: int
1.21 veillard 1082: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1083: xmlOutputBufferPtr buf;
1084: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1085: int ret;
1086:
1.21 veillard 1087: if (encoding != NULL) {
1088: xmlCharEncoding enc;
1089:
1090: enc = xmlParseCharEncoding(encoding);
1091: if (enc != cur->charset) {
1092: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1093: /*
1094: * Not supported yet
1095: */
1096: return(-1);
1097: }
1098:
1099: handler = xmlFindCharEncodingHandler(encoding);
1100: if (handler == NULL)
1101: return(-1);
1.26 veillard 1102: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1103: }
1104: }
1.24 veillard 1105:
1106: /*
1.25 veillard 1107: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1108: */
1.25 veillard 1109: if (handler == NULL)
1110: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1111: if (handler == NULL)
1112: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1113:
1.1 daniel 1114: /*
1115: * save the content to a temp buffer.
1116: */
1.21 veillard 1117: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1118: if (buf == NULL) return(0);
1119:
1.21 veillard 1120: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1121:
1.21 veillard 1122: ret = xmlOutputBufferClose(buf);
1123: return(ret);
1.1 daniel 1124: }
1.18 daniel 1125: #endif /* LIBXML_HTML_ENABLED */
Webmaster