Annotation of XML/encoding.c, revision 1.35
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
1.21 daniel 22: #ifdef WIN32
23: #include "win32config.h"
24: #else
1.14 daniel 25: #include "config.h"
1.17 daniel 26: #endif
27:
28: #include <stdio.h>
29: #include <string.h>
30:
31: #ifdef HAVE_CTYPE_H
1.7 daniel 32: #include <ctype.h>
1.17 daniel 33: #endif
1.20 daniel 34: #ifdef HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
1.30 daniel 37: #include <libxml/xmlversion.h>
38: #ifdef LIBXML_ICONV_ENABLED
39: #ifdef HAVE_ERRNO_H
40: #include <errno.h>
41: #endif
42: #endif
1.29 daniel 43: #include <libxml/encoding.h>
44: #include <libxml/xmlmemory.h>
1.3 daniel 45:
/*
 * Handlers for the two UTF-16 byte orders. Both start out NULL and are
 * assigned by xmlInitCharEncodingHandlers().
 */
1.25 daniel 46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
48:
1.30 daniel 49: #ifdef LIBXML_ICONV_ENABLED
1.33 daniel 50: #if 0
1.30 daniel 51: #define DEBUG_ENCODING /* Define this to get encoding traces (currently disabled by the enclosing "#if 0") */
52: #endif
1.33 daniel 53: #endif
1.30 daniel 54:
/*
 * Byte order of the host: non-zero for little endian, zero for big endian.
 * Defaults to 1 and is overwritten by the run-time probe in
 * xmlInitCharEncodingHandlers().
 */
1.34 daniel 55: static int xmlLittleEndian = 1;
56:
1.3 daniel 57: /*
58: * From rfc2044: encoding of the Unicode values on UTF-8:
59: *
60: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
61: * 0000 0000-0000 007F 0xxxxxxx
62: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
63: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
64: *
65: * I hope we won't use values > 0xFFFF anytime soon !
66: */
1.1 daniel 67:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * The lead byte selects the sequence length (1 to 4 bytes); a
 * continuation byte (10xxxxxx) or a byte >= 0xF8 in lead position is
 * rejected. A NULL @utf is reported as invalid.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix = 0;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The continuation bytes are tested left to right with short-circuit
     * evaluation: a terminating 0 never matches 10xxxxxx, so the scan
     * never reads past the end of the string.
     */
    while ((c = utf[ix]) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte or a lead byte >= 0xF8 */
            return(0);
        }
    }
    return(1);
}
111:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Conversion stops early, without error, when the
 * next character does not fit in the remaining output space.
 *
 * Returns 0 if success, or -1 if one of the arguments is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned char c;

    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + *inlen;

    /*
     * @in is only advanced once the character has been fully written, so
     * (in - instart) is always the exact number of octets consumed.
     */
    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* 0x80..0xFF need two UTF-8 octets */
            if (outend - out < 2)
                break;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - instart;

    return(0);
}
154:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only one and two byte UTF-8 sequences encoding
 * values up to 0xFF can be converted. A non-ASCII lead byte sitting at
 * the very end of @in is left unconsumed so the caller can supply the
 * rest of the sequence later.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns 0 if success, -2 if the transcoding fails, or -1 if @out is
 * too small (or an argument is NULL).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are updated on every return path except for NULL arguments.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned char c;
    int ret = 0;

    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + *inlen;

    /* @in always points at the first octet not yet converted */
    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = c;
            in++;
        } else if (inend - in < 2) {
            /* non-ASCII byte at the end of the block: wait for more input */
            break;
        } else if (((c & 0xFC) == 0xC0) && ((in[1] & 0xC0) == 0x80)) {
            /* a two byte utf-8 that can be encoded as ISO Latin 1 */
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = ((c & 0x03) << 6) | (in[1] & 0x3F);
            in += 2;
        } else {
            /* not representable in ISO Latin 1, or malformed UTF-8 */
            ret = -2;
            break;
        }
    }
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
204:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read octet by octet, so the
 * result does not depend on the byte order or alignment rules of the
 * host. A trailing odd octet is ignored. Conversion stops early,
 * without error, when the next character does not fit completely in
 * @out or when a surrogate pair is split by the end of @inb; such a
 * character is neither written nor counted as consumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned char lead;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        /* low octet first: little endian */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        next = in + 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                break;                   /* second half not available yet */
            d = (unsigned int) next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = in - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /*
         * c is a single UCS-4 value: pick the lead octet; 'bits' is the
         * shift of the first continuation octet (-6 means there is none).
         */
        if (c < 0x80) {
            lead = (unsigned char) c;
            bits = -6;
        } else if (c < 0x800) {
            lead = ((c >> 6) & 0x1F) | 0xC0;
            bits = 0;
        } else if (c < 0x10000) {
            lead = ((c >> 12) & 0x0F) | 0xE0;
            bits = 6;
        } else {
            lead = ((c >> 18) & 0x07) | 0xF0;
            bits = 12;
        }

        /* the whole sequence (bits / 6 + 2 octets) must fit */
        if (outend - out < (bits / 6) + 2)
            break;

        *out++ = lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(0);
}
292:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written octet by octet, so the
 * result does not depend on the byte order or alignment rules of the
 * host. Conversion stops early, without error, when the next character
 * does not fit in @outb or when a multi-byte sequence is cut by the end
 * of @in; such a character is neither written nor counted as consumed.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns 0 if success, or -2 if the transcoding failed (invalid lead
 * or continuation byte, or a value above 0x10FFFF).
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    /* only whole 16-bit units can be stored */
    unsigned char* outend = outb + (*outlen / 2) * 2;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    const unsigned char* next;
    unsigned int c, d, trailing, i;
    unsigned int hi, lo;
    int ret = 0;

    while (in < inend) {
        next = in;
        d = *next++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the block: wait for more input */
        if ((unsigned int) (inend - next) < trailing)
            break;

        for (i = 0; i < trailing; i++) {
            d = next[i];
            if ((d & 0xC0) != 0x80)
                break;
            c = (c << 6) | (d & 0x3F);
        }
        if (i < trailing) {
            /* lead byte not followed by enough continuation bytes */
            ret = -2;
            break;
        }
        next += trailing;

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        } else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        in = next;
    }
    *outlen = out - outb;
    *inlen = in - instart;
    return(ret);
}
392:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read octet by octet, so the
 * result does not depend on the byte order or alignment rules of the
 * host. A trailing odd octet is ignored. Conversion stops early,
 * without error, when the next character does not fit completely in
 * @out; such a character is neither written nor counted as consumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate, including when it is
 * the last code unit of @inb).
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned char lead;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        /* high octet first: big endian */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        next = in + 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            /*
             * NOTE(review): a pair split by the end of the block is
             * reported as an error here, whereas UTF16LEToUTF8 just
             * stops; kept as is to preserve the existing contract.
             */
            if (inend - next < 2) {
                *outlen = out - outstart;
                *inlenb = in - inb;
                return(-2);
            }
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = in - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /*
         * c is a single UCS-4 value: pick the lead octet; 'bits' is the
         * shift of the first continuation octet (-6 means there is none).
         */
        if (c < 0x80) {
            lead = (unsigned char) c;
            bits = -6;
        } else if (c < 0x800) {
            lead = ((c >> 6) & 0x1F) | 0xC0;
            bits = 0;
        } else if (c < 0x10000) {
            lead = ((c >> 12) & 0x0F) | 0xE0;
            bits = 6;
        } else {
            lead = ((c >> 18) & 0x07) | 0xF0;
            bits = 12;
        }

        /* the whole sequence (bits / 6 + 2 octets) must fit */
        if (outend - out < (bits / 6) + 2)
            break;

        *out++ = lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(0);
}
484:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written octet by octet, so the
 * result does not depend on the byte order or alignment rules of the
 * host. Conversion stops early, without error, when the next character
 * does not fit in @outb or when a multi-byte sequence is cut by the end
 * of @in; such a character is neither written nor counted as consumed.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Returns 0 if success, or -2 if the transcoding failed (invalid lead
 * or continuation byte, or a value above 0x10FFFF).
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    /* only whole 16-bit units can be stored */
    unsigned char* outend = outb + (*outlen / 2) * 2;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    const unsigned char* next;
    unsigned int c, d, trailing, i;
    unsigned int hi, lo;
    int ret = 0;

    while (in < inend) {
        next = in;
        d = *next++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the block: wait for more input */
        if ((unsigned int) (inend - next) < trailing)
            break;

        for (i = 0; i < trailing; i++) {
            d = next[i];
            if ((d & 0xC0) != 0x80)
                break;
            c = (c << 6) | (d & 0x3F);
        }
        if (i < trailing) {
            /* lead byte not followed by enough continuation bytes */
            ret = -2;
            break;
        }
        next += trailing;

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        } else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        in = next;
    }
    *outlen = out - outb;
    *inlen = in - instart;
    return(ret);
}
581:
1.7 daniel 582: /**
583: * xmlDetectCharEncoding:
584: * @in: a pointer to the first bytes of the XML entity, must be at least
585: * 4 bytes long.
1.25 daniel 586: * @len: pointer to the length of the buffer
1.7 daniel 587: *
588: * Guess the encoding of the entity using the first bytes of the entity content
589: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
590: *
591: * Returns one of the XML_CHAR_ENCODING_... values.
592: */
593: xmlCharEncoding
1.25 daniel 594: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 595: {
1.25 daniel 596: if (len >= 4) {
597: if ((in[0] == 0x00) && (in[1] == 0x00) &&
598: (in[2] == 0x00) && (in[3] == 0x3C))
599: return(XML_CHAR_ENCODING_UCS4BE);
600: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
601: (in[2] == 0x00) && (in[3] == 0x00))
602: return(XML_CHAR_ENCODING_UCS4LE);
603: if ((in[0] == 0x00) && (in[1] == 0x00) &&
604: (in[2] == 0x3C) && (in[3] == 0x00))
605: return(XML_CHAR_ENCODING_UCS4_2143);
606: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
607: (in[2] == 0x00) && (in[3] == 0x00))
608: return(XML_CHAR_ENCODING_UCS4_3412);
609: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
610: (in[2] == 0xA7) && (in[3] == 0x94))
611: return(XML_CHAR_ENCODING_EBCDIC);
612: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
613: (in[2] == 0x78) && (in[3] == 0x6D))
614: return(XML_CHAR_ENCODING_UTF8);
615: }
616: if (len >= 2) {
617: if ((in[0] == 0xFE) && (in[1] == 0xFF))
618: return(XML_CHAR_ENCODING_UTF16BE);
619: if ((in[0] == 0xFF) && (in[1] == 0xFE))
620: return(XML_CHAR_ENCODING_UTF16LE);
621: }
1.7 daniel 622: return(XML_CHAR_ENCODING_NONE);
623: }
624:
625: /**
626: * xmlParseCharEncoding:
1.18 daniel 627: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 628: *
629: * Conpare the string to the known encoding schemes already known. Note
630: * that the comparison is case insensitive accordingly to the section
631: * [XML] 4.3.3 Character Encoding in Entities.
632: *
633: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
634: * if not recognized.
635: */
636: xmlCharEncoding
1.8 daniel 637: xmlParseCharEncoding(const char* name)
1.7 daniel 638: {
639: char upper[500];
640: int i;
641:
642: for (i = 0;i < 499;i++) {
643: upper[i] = toupper(name[i]);
644: if (upper[i] == 0) break;
645: }
646: upper[i] = 0;
647:
648: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
649: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
650: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
651:
652: /*
653: * NOTE: if we were able to parse this, the endianness of UTF16 is
654: * already found and in use
655: */
656: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
657: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
658:
659: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
660: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
661: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
662:
663: /*
664: * NOTE: if we were able to parse this, the endianness of UCS4 is
665: * already found and in use
666: */
667: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
668: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
669: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
670:
671:
672: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
673: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
674: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
675:
676: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
677: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
678: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
679:
680: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
681: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
682: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
683: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
684: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
685: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
686: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
687:
688: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 689: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 690: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 691:
692: #ifdef DEBUG_ENCODING
693: fprintf(stderr, "Unknown encoding %s\n", name);
694: #endif
1.7 daniel 695: return(XML_CHAR_ENCODING_ERROR);
696: }
1.9 daniel 697:
698: /****************************************************************
699: * *
700: * Char encoding handlers *
701: * *
702: ****************************************************************/
703:
704: /* Capacity of the handler table: the size should be growable, but it's not a big deal ... */
705: #define MAX_ENCODING_HANDLERS 50
/*
 * Table of registered handlers, allocated by xmlInitCharEncodingHandlers()
 * and released by xmlCleanupCharEncodingHandlers(); NULL until then.
 */
706: static xmlCharEncodingHandlerPtr *handlers = NULL;
/* Number of entries of the table currently in use */
707: static int nbCharEncodingHandler = 0;
708:
709: /*
710: * The default is UTF-8 for XML, that's also the default used for the
711: * parser internals, so the default encoding handler is NULL.
712: */
713:
714: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
715:
716: /**
717: * xmlNewCharEncodingHandler:
1.18 daniel 718: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 719: * @input: the xmlCharEncodingInputFunc to read that encoding
720: * @output: the xmlCharEncodingOutputFunc to write that encoding
721: *
722: * Create and registers an xmlCharEncodingHandler.
723: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
724: */
725: xmlCharEncodingHandlerPtr
1.25 daniel 726: xmlNewCharEncodingHandler(const char *name,
727: xmlCharEncodingInputFunc input,
1.9 daniel 728: xmlCharEncodingOutputFunc output) {
729: xmlCharEncodingHandlerPtr handler;
730: char upper[500];
731: int i;
732: char *up = 0;
733:
734: /*
735: * Keep only the uppercase version of the encoding.
736: */
737: if (name == NULL) {
738: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
739: return(NULL);
740: }
741: for (i = 0;i < 499;i++) {
742: upper[i] = toupper(name[i]);
743: if (upper[i] == 0) break;
744: }
745: upper[i] = 0;
1.16 daniel 746: up = xmlMemStrdup(upper);
1.9 daniel 747: if (up == NULL) {
748: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
749: return(NULL);
750: }
751:
752: /*
753: * allocate and fill-up an handler block.
754: */
755: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 756: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 757: if (handler == NULL) {
758: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
759: return(NULL);
760: }
761: handler->input = input;
762: handler->output = output;
763: handler->name = up;
764:
765: /*
766: * registers and returns the handler.
767: */
768: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 769: #ifdef DEBUG_ENCODING
770: fprintf(stderr, "Registered encoding handler for %s\n", name);
771: #endif
1.9 daniel 772: return(handler);
773: }
774:
775: /**
776: * xmlInitCharEncodingHandlers:
777: *
778: * Initialize the char encoding support, it registers the default
779: * encoding supported.
1.18 daniel 780: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 781: * in normal processing.
782: */
783: void
784: xmlInitCharEncodingHandlers(void) {
1.34 daniel 785: unsigned short int tst = 0x1234;
786: unsigned char *ptr = (unsigned char *) &tst;
787:
1.9 daniel 788: if (handlers != NULL) return;
789:
790: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 791: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 792:
793: if (*ptr == 0x12) xmlLittleEndian = 0;
794: else if (*ptr == 0x34) xmlLittleEndian = 1;
795: else fprintf(stderr, "Odd problem at endianness detection\n");
1.9 daniel 796:
797: if (handlers == NULL) {
798: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
799: return;
800: }
1.10 daniel 801: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 802: xmlUTF16LEHandler =
1.28 daniel 803: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
804: xmlUTF16BEHandler =
805: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 806: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 807: }
808:
809: /**
1.19 daniel 810: * xmlCleanupCharEncodingHandlers:
811: *
812: * Cleanup the memory allocated for the char encoding support, it
813: * unregisters all the encoding handlers.
814: */
815: void
816: xmlCleanupCharEncodingHandlers(void) {
817: if (handlers == NULL) return;
818:
819: for (;nbCharEncodingHandler > 0;) {
820: nbCharEncodingHandler--;
821: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 822: if (handlers[nbCharEncodingHandler]->name != NULL)
823: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 824: xmlFree(handlers[nbCharEncodingHandler]);
825: }
826: }
827: xmlFree(handlers);
828: handlers = NULL;
829: nbCharEncodingHandler = 0;
830: xmlDefaultCharEncodingHandler = NULL;
831: }
832:
833: /**
1.9 daniel 834: * xmlRegisterCharEncodingHandler:
835: * @handler: the xmlCharEncodingHandlerPtr handler block
836: *
837: * Register the char encoding handler, surprizing, isn't it ?
838: */
839: void
840: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
841: if (handlers == NULL) xmlInitCharEncodingHandlers();
842: if (handler == NULL) {
843: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
844: return;
845: }
846:
847: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
848: fprintf(stderr,
849: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
850: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
851: return;
852: }
853: handlers[nbCharEncodingHandler++] = handler;
854: }
855:
856: /**
857: * xmlGetCharEncodingHandler:
858: * @enc: an xmlCharEncoding value.
859: *
860: * Search in the registrered set the handler able to read/write that encoding.
861: *
862: * Returns the handler or NULL if not found
863: */
864: xmlCharEncodingHandlerPtr
865: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 866: xmlCharEncodingHandlerPtr handler;
867:
1.9 daniel 868: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 869: switch (enc) {
870: case XML_CHAR_ENCODING_ERROR:
871: return(NULL);
872: case XML_CHAR_ENCODING_NONE:
873: return(NULL);
874: case XML_CHAR_ENCODING_UTF8:
875: return(NULL);
876: case XML_CHAR_ENCODING_UTF16LE:
877: return(xmlUTF16LEHandler);
878: case XML_CHAR_ENCODING_UTF16BE:
879: return(xmlUTF16BEHandler);
880: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 881: handler = xmlFindCharEncodingHandler("EBCDIC");
882: if (handler != NULL) return(handler);
883: handler = xmlFindCharEncodingHandler("ebcdic");
884: if (handler != NULL) return(handler);
885: break;
1.25 daniel 886: case XML_CHAR_ENCODING_UCS4LE:
1.30 daniel 887: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
888: if (handler != NULL) return(handler);
889: handler = xmlFindCharEncodingHandler("UCS-4");
890: if (handler != NULL) return(handler);
891: handler = xmlFindCharEncodingHandler("UCS4");
892: if (handler != NULL) return(handler);
893: break;
1.25 daniel 894: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 895: handler = xmlFindCharEncodingHandler("UCS4BE");
896: if (handler != NULL) return(handler);
897: break;
1.25 daniel 898: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 899: break;
1.25 daniel 900: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 901: break;
1.25 daniel 902: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 903: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
904: if (handler != NULL) return(handler);
905: handler = xmlFindCharEncodingHandler("UCS-2");
906: if (handler != NULL) return(handler);
907: handler = xmlFindCharEncodingHandler("UCS2");
908: if (handler != NULL) return(handler);
909: break;
1.25 daniel 910: case XML_CHAR_ENCODING_8859_1:
911: case XML_CHAR_ENCODING_8859_2:
912: case XML_CHAR_ENCODING_8859_3:
913: case XML_CHAR_ENCODING_8859_4:
914: case XML_CHAR_ENCODING_8859_5:
915: case XML_CHAR_ENCODING_8859_6:
916: case XML_CHAR_ENCODING_8859_7:
917: case XML_CHAR_ENCODING_8859_8:
918: case XML_CHAR_ENCODING_8859_9:
919: return(NULL);
920: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 921: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
922: if (handler != NULL) return(handler);
923: break;
1.25 daniel 924: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 925: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
926: if (handler != NULL) return(handler);
927: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
928: if (handler != NULL) return(handler);
929: handler = xmlFindCharEncodingHandler("Shift_JIS");
930: if (handler != NULL) return(handler);
931: break;
1.25 daniel 932: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 933: handler = xmlFindCharEncodingHandler("EUC-JP");
934: if (handler != NULL) return(handler);
935: break;
936: default:
937: break;
1.25 daniel 938: }
1.30 daniel 939:
940: #ifdef DEBUG_ENCODING
941: fprintf(stderr, "No handler found for encoding %d\n", enc);
942: #endif
1.9 daniel 943: return(NULL);
944: }
945:
946: /**
947: * xmlGetCharEncodingHandler:
948: * @enc: a string describing the char encoding.
949: *
950: * Search in the registrered set the handler able to read/write that encoding.
951: *
952: * Returns the handler or NULL if not found
953: */
954: xmlCharEncodingHandlerPtr
955: xmlFindCharEncodingHandler(const char *name) {
1.30 daniel 956: #ifdef LIBXML_ICONV_ENABLED
957: iconv_t icv_in, icv_out;
958: xmlCharEncodingHandlerPtr enc;
959: #endif /* LIBXML_ICONV_ENABLED */
960: char upper[100];
1.9 daniel 961: int i;
962:
963: if (handlers == NULL) xmlInitCharEncodingHandlers();
964: if (name == NULL) return(xmlDefaultCharEncodingHandler);
965: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
966:
1.30 daniel 967: for (i = 0;i < 99;i++) {
1.9 daniel 968: upper[i] = toupper(name[i]);
969: if (upper[i] == 0) break;
970: }
971: upper[i] = 0;
972:
973: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 974: if (!strcmp(upper, handlers[i]->name)) {
975: #ifdef DEBUG_ENCODING
976: fprintf(stderr, "Found registered handler for encoding %s\n", name);
977: #endif
1.9 daniel 978: return(handlers[i]);
1.30 daniel 979: }
1.9 daniel 980:
1.30 daniel 981: #ifdef LIBXML_ICONV_ENABLED
982: /* check whether iconv can handle this */
1.31 daniel 983: icv_in = iconv_open("UTF-8", name);
984: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 985: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31 daniel 986: enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 987: if (enc == NULL) {
988: iconv_close(icv_in);
989: iconv_close(icv_out);
990: return(NULL);
991: }
992: enc->name = NULL;
1.30 daniel 993: enc->input = NULL;
994: enc->output = NULL;
995: enc->iconv_in = icv_in;
996: enc->iconv_out = icv_out;
997: #ifdef DEBUG_ENCODING
998: fprintf(stderr, "Found iconv handler for encoding %s\n", name);
999: #endif
1000: return enc;
1001: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1002: fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1003: }
1004: #endif /* LIBXML_ICONV_ENABLED */
1005: #ifdef DEBUG_ENCODING
1006: fprintf(stderr, "No handler found for encoding %s\n", name);
1007: #endif
1.9 daniel 1008: return(NULL);
1.30 daniel 1009: }
1010:
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, or NULL
 * @inlen:  the length of @in
 *
 * Run one iconv() call over the given input and output areas and translate
 * the outcome into the error codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails (EILSEQ: the input is not valid or
 *        the result of transformation can't fit into the encoding we want), or
 *     -3 if the last bytes of the input can't form a complete character
 *        (EINVAL) or for any other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * When @in is NULL both are set to 0.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;
    int err = 0;

    /* POSIX declares the input buffer argument as char **, hence the cast */
    ret = iconv(cd,
                (char **) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    /* errno is only meaningful when iconv() reported a failure */
    if (ret == (size_t) -1)
        err = errno;

    if (in != NULL) {
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }

    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (err == EILSEQ)
            return -2;
#endif
#ifdef E2BIG
        if (err == E2BIG)
            return -1;
#endif
        /* EINVAL (truncated input sequence) and anything else */
        return -3;
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1073:
1074: /**
1075: * xmlCharEncInFunc:
1076: * @handler: char enconding transformation data structure
1.31 daniel 1077: * @out: an xmlBuffer for the output.
1078: * @in: an xmlBuffer for the input
1.30 daniel 1079: *
1080: * Generic front-end for the encoding handler input function
1081: *
1.31 daniel 1082: * Returns the number of byte written if success, or
1083: * -1 general error
1.30 daniel 1084: * -2 if the transcoding fails (for *in is not valid utf8 string or
1085: * the result of transformation can't fit into the encoding we want), or
1086: */
1087: int
1.31 daniel 1088: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1089: xmlBufferPtr in) {
1.30 daniel 1090: int ret = -2;
1.31 daniel 1091: int written;
1092: int toconv;
1.30 daniel 1093:
1.31 daniel 1094: if (handler == NULL) return(-1);
1095: if (out == NULL) return(-1);
1096: if (in == NULL) return(-1);
1097:
1098: written = out->size - out->use;
1099: toconv = in->use;
1100: if (toconv * 2 >= written) {
1101: xmlBufferGrow(out, toconv * 2);
1.33 daniel 1102: written = out->size - out->use - 1;
1.31 daniel 1103: }
1.30 daniel 1104: if (handler->input != NULL) {
1.32 daniel 1105: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1106: in->content, &toconv);
1107: xmlBufferShrink(in, toconv);
1108: out->use += written;
1.33 daniel 1109: out->content[out->use] = 0;
1.30 daniel 1110: }
1111: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1112: else if (handler->iconv_in != NULL) {
1113: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1114: &written, in->content, &toconv);
1115: xmlBufferShrink(in, toconv);
1116: out->use += written;
1.33 daniel 1117: out->content[out->use] = 0;
1118: if (ret == -1) ret = -3;
1.30 daniel 1119: }
1120: #endif /* LIBXML_ICONV_ENABLED */
1121: #ifdef DEBUG_ENCODING
1122: switch (ret) {
1123: case 0:
1124: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31 daniel 1125: toconv, written);
1.30 daniel 1126: break;
1127: case -1:
1.31 daniel 1128: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1129: toconv, written, in->use);
1.30 daniel 1130: break;
1131: case -2:
1132: fprintf(stderr, "input conversion failed due to input error\n");
1133: break;
1134: case -3:
1.31 daniel 1135: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1136: toconv, written, in->use);
1.30 daniel 1137: break;
1138: default:
1139: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1140: }
1141: #endif
1.33 daniel 1142: /*
1143: * Ignore when input buffer is not on a boundary
1144: */
1145: if (ret == -3) ret = 0;
1.30 daniel 1146: return(ret);
1147: }
1148:
1149: /**
1150: * xmlCharEncOutFunc:
1151: * @handler: char enconding transformation data structure
1.31 daniel 1152: * @out: an xmlBuffer for the output.
1153: * @in: an xmlBuffer for the input
1154: *
1155: * Generic front-end for the encoding handler output function
1.35 ! daniel 1156: * a first call with @in == NULL has to be made firs to initiate the
! 1157: * output in case of non-stateless encoding needing to initiate their
! 1158: * state or the output (like the BOM in UTF16).
1.30 daniel 1159: *
1.31 daniel 1160: * Returns the number of byte written if success, or
1161: * -1 general error
1.30 daniel 1162: * -2 if the transcoding fails (for *in is not valid utf8 string or
1163: * the result of transformation can't fit into the encoding we want), or
1164: */
1165: int
1.31 daniel 1166: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1167: xmlBufferPtr in) {
1.30 daniel 1168: int ret = -2;
1.31 daniel 1169: int written;
1170: int toconv;
1171:
1172: if (handler == NULL) return(-1);
1173: if (out == NULL) return(-1);
1.35 ! daniel 1174: written = out->size - out->use;
! 1175:
! 1176: if (in == NULL) {
! 1177: toconv = 0;
! 1178: if (handler->output != NULL) {
! 1179: ret = handler->output(&out->content[out->use], &written,
! 1180: NULL, &toconv);
! 1181: out->use += written;
! 1182: out->content[out->use] = 0;
! 1183: }
! 1184: #ifdef LIBXML_ICONV_ENABLED
! 1185: else if (handler->iconv_out != NULL) {
! 1186: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
! 1187: &written, NULL, &toconv);
! 1188: out->use += written;
! 1189: out->content[out->use] = 0;
! 1190: }
! 1191: #endif /* LIBXML_ICONV_ENABLED */
! 1192: #ifdef DEBUG_ENCODING
! 1193: fprintf(stderr, "initialized encoder\n");
! 1194: #endif
! 1195: return(0);
! 1196: }
1.30 daniel 1197:
1.33 daniel 1198: toconv = in->use;
1199: if (toconv * 2 >= written) {
1200: xmlBufferGrow(out, toconv * 2);
1201: written = out->size - out->use - 1;
1202: }
1.30 daniel 1203: if (handler->output != NULL) {
1.33 daniel 1204: ret = handler->output(&out->content[out->use], &written,
1.35 ! daniel 1205: in->content, &toconv);
1.31 daniel 1206: xmlBufferShrink(in, toconv);
1207: out->use += written;
1.33 daniel 1208: out->content[out->use] = 0;
1.30 daniel 1209: }
1210: #ifdef LIBXML_ICONV_ENABLED
1211: else if (handler->iconv_out != NULL) {
1.31 daniel 1212: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1213: &written, in->content, &toconv);
1214: xmlBufferShrink(in, toconv);
1215: out->use += written;
1.33 daniel 1216: out->content[out->use] = 0;
1217: if (ret == -1) ret = -3;
1.30 daniel 1218: }
1219: #endif /* LIBXML_ICONV_ENABLED */
1220: #ifdef DEBUG_ENCODING
1221: switch (ret) {
1222: case 0:
1223: fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31 daniel 1224: toconv, written);
1.30 daniel 1225: break;
1226: case -1:
1227: fprintf(stderr, "output conversion failed by lack of space\n");
1228: break;
1229: case -2:
1230: fprintf(stderr, "output conversion failed due to output error\n");
1231: break;
1232: case -3:
1.31 daniel 1233: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1234: toconv, written, in->use);
1.30 daniel 1235: break;
1236: default:
1237: fprintf(stderr,"Unknown output conversion failed %d\n", ret);
1238: }
1239: #endif
1240: return(ret);
1241: }
1242:
1243: /**
1244: * xmlCharEncCloseFunc:
1245: * @handler: char enconding transformation data structure
1246: *
1247: * Generic front-end for hencoding handler close function
1248: *
1249: * Returns 0 if success, or -1 in case of error
1250: */
1251: int
1252: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1253: int ret = 0;
1.31 daniel 1254: if (handler == NULL) return(-1);
1255: if (handler->name == NULL) return(-1);
1.30 daniel 1256: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1257: /*
1258: * Iconv handlers can be oused only once, free the whole block.
1259: * and the associated icon resources.
1260: */
1.32 daniel 1261: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1262: if (handler->name != NULL)
1263: xmlFree(handler->name);
1264: handler->name = NULL;
1265: if (handler->iconv_out != NULL) {
1266: if (iconv_close(handler->iconv_out))
1267: ret = -1;
1268: handler->iconv_out = NULL;
1269: }
1270: if (handler->iconv_in != NULL) {
1271: if (iconv_close(handler->iconv_in))
1272: ret = -1;
1273: handler->iconv_in = NULL;
1274: }
1275: xmlFree(handler);
1.30 daniel 1276: }
1277: #endif /* LIBXML_ICONV_ENABLED */
1278: #ifdef DEBUG_ENCODING
1279: if (ret)
1280: fprintf(stderr, "failed to close the encoding handler\n");
1281: else
1282: fprintf(stderr, "closed the encoding handler\n");
1283:
1284: #endif
1285: return(ret);
1.9 daniel 1286: }
1287:
Webmaster