Annotation of XML/encoding.c, revision 1.38
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
1.21 daniel 22: #ifdef WIN32
23: #include "win32config.h"
24: #else
1.14 daniel 25: #include "config.h"
1.17 daniel 26: #endif
27:
28: #include <stdio.h>
29: #include <string.h>
30:
31: #ifdef HAVE_CTYPE_H
1.7 daniel 32: #include <ctype.h>
1.17 daniel 33: #endif
1.20 daniel 34: #ifdef HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
1.30 daniel 37: #include <libxml/xmlversion.h>
38: #ifdef LIBXML_ICONV_ENABLED
39: #ifdef HAVE_ERRNO_H
40: #include <errno.h>
41: #endif
42: #endif
1.29 daniel 43: #include <libxml/encoding.h>
44: #include <libxml/xmlmemory.h>
1.3 daniel 45:
/*
 * Shared handlers for the two UTF-16 byte orders. Both are assigned by
 * xmlInitCharEncodingHandlers() and returned by xmlGetCharEncodingHandler()
 * for XML_CHAR_ENCODING_UTF16LE / XML_CHAR_ENCODING_UTF16BE.
 */
1.25 daniel 46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
48:
/*
 * DEBUG_ENCODING turns on the fprintf(stderr, ...) traces guarded by
 * "#ifdef DEBUG_ENCODING" in this file. It is currently compiled out by
 * the "#if 0" below, and can only be enabled when LIBXML_ICONV_ENABLED
 * is defined.
 */
1.30 daniel 49: #ifdef LIBXML_ICONV_ENABLED
1.37 daniel 50: #if 0
1.30 daniel 51: #define DEBUG_ENCODING /* Define this to get encoding traces */
52: #endif
1.33 daniel 53: #endif
1.30 daniel 54:
/*
 * Host byte order flag used by the UTF-16 converters: non-zero means the
 * machine is little-endian. It defaults to 1 and is only set to the
 * detected value once xmlInitCharEncodingHandlers() has run.
 */
1.34 daniel 55: static int xmlLittleEndian = 1;
56:
1.3 daniel 57: /*
58: * From rfc2044: encoding of the Unicode values on UTF-8:
59: *
60: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
61: * 0000 0000-0000 007F 0xxxxxxx
62: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
63: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
64: *
65: * I hope we won't use values > 0xFFFF anytime soon !
66: */
1.1 daniel 67:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * the 0x10ffff maximum value.
 *
 * A continuation byte (10xxxxxx) found where a lead byte is expected is
 * rejected, and a NULL @utf is reported as invalid.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx
             * The bytes are tested in order so that the terminating 0
             * (which is not a continuation byte) stops the scan before
             * anything beyond the string is read.
             */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((utf[ix + 2] & 0xc0) != 0x80)
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((utf[ix + 2] & 0xc0) != 0x80)
                return(0);
            if ((utf[ix + 3] & 0xc0) != 0x80)
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte, or a lead byte for 5+ byte forms */
            return(0);
        }
    }
    return(1);
}
111:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Conversion stops early, without error, when the
 * next character no longer fits in @out.
 *
 * Returns 0.
 * The value of @inlen after return is the number of octets consumed
 * from @in.
 * The value of @outlen after return is the number of octets written
 * to @out.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* 0x80..0xFF always expands to exactly two UTF-8 bytes */
            if (outend - out < 2)
                break;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
        /* only step past the input byte once it has been fully emitted */
        in++;
    }
    *outlen = (int) (out - outstart);
    /* measured from the start of the input, not from the moving cursor */
    *inlen = (int) (in - instart);

    return(0);
}
154:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A multi-byte sequence cut off at the end of @in is
 * left unconsumed so the caller can supply the rest later.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns 0 if success, -2 if the transcoding fails (the input holds a
 * character that cannot be represented in ISO Latin 1 or is malformed),
 * or -1 if @out is too small.
 * The value of @inlen after return is the number of octets consumed
 * from @in.
 * The value of @outlen after return is the number of octets written
 * to @out.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend = in + *inlen;
    unsigned char c;
    int ret = 0;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = c;
        } else if (in == inend) {
            /* lead byte without its continuation: wait for more input */
            break;
        } else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 sequence that fits in ISO Latin 1 */
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        } else {
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outstart);
    /* count only fully converted characters, from the start of the input */
    *inlen = (int) (processed - instart);
    return(ret);
}
204:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * A trailing odd byte is ignored. Conversion stops early, without error,
 * when the next character does not fit entirely in @out or when a high
 * surrogate is the last unit of the input.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed
 * from @inb.
 * The value of *outlen after return is the number of octets written
 * to @out.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    const unsigned char* processed = inb;
    unsigned int c, d;
    int bits, needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        /* little-endian unit: low byte first */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)
                break;                   /* pair split across blocks */
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* c is now a single UCS-4 value: size its UTF-8 form up front */
        if (c < 0x80)         { needed = 1; bits = -6; }
        else if (c < 0x800)   { needed = 2; bits =  0; }
        else if (c < 0x10000) { needed = 3; bits =  6; }
        else                  { needed = 4; bits = 12; }

        /* never emit a truncated multi-byte sequence */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:  *out++ = c; break;
            case 2:  *out++ = ((c >>  6) & 0x1F) | 0xC0; break;
            case 3:  *out++ = ((c >> 12) & 0x0F) | 0xE0; break;
            default: *out++ = ((c >> 18) & 0x07) | 0xF0; break;
        }
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
292:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * When @in is NULL the call is an initialization request: the FF FE Byte
 * Order Mark is written if @outb has room for it.
 * Conversion stops early, without error, when @outb is full or when the
 * last sequence of @in is incomplete.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns 0 on success, 2 when the Byte Order Mark was written, or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 * The value of *inlen after return is the number of octets consumed
 * from @in.
 * The value of *outlen after return is the number of octets written
 * to @outb.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, trailing;
    unsigned int hi, lo;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80)      { c = d;        trailing = 0; }
        else if (d < 0xC0) { ret = -2; break; }  /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else               { ret = -2; break; }  /* no chance for this in UTF-16 */

        /* sequence cut off by the end of the block: wait for more input */
        if ((unsigned int) (inend - in) < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                break;
            c <<= 6;
            c |= d & 0x3F;
        }
        if (trailing != 0) {
            /* a continuation byte was missing: malformed input */
            ret = -2;
            break;
        }

        /* c is now a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        } else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
410:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * A trailing odd byte is ignored. Conversion stops early, without error,
 * when the next character does not fit entirely in @out.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 * is the last unit of the input or is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed
 * from @inb.
 * The value of *outlen after return is the number of octets written
 * to @out.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    const unsigned char* processed = inb;
    unsigned int c, d;
    int bits, needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        /* big-endian unit: high byte first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            /*
             * NOTE(review): a pair split at the end of the block is
             * reported as an error here, whereas UTF16LEToUTF8 simply
             * stops; kept as is for callers relying on it.
             */
            if (in >= inend) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* c is now a single UCS-4 value: size its UTF-8 form up front */
        if (c < 0x80)         { needed = 1; bits = -6; }
        else if (c < 0x800)   { needed = 2; bits =  0; }
        else if (c < 0x10000) { needed = 3; bits =  6; }
        else                  { needed = 4; bits = 12; }

        /* never emit a truncated multi-byte sequence */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:  *out++ = c; break;
            case 2:  *out++ = ((c >>  6) & 0x1F) | 0xC0; break;
            case 3:  *out++ = ((c >> 12) & 0x0F) | 0xE0; break;
            default: *out++ = ((c >> 18) & 0x07) | 0xF0; break;
        }
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
502:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * When @in is NULL the call is an initialization request: the FE FF Byte
 * Order Mark is written if @outb has room for it.
 * Conversion stops early, without error, when @outb is full or when the
 * last sequence of @in is incomplete.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Returns 0 on success, 2 when the Byte Order Mark was written, or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 * The value of *inlen after return is the number of octets consumed
 * from @in.
 * The value of *outlen after return is the number of octets written
 * to @outb.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, trailing;
    unsigned int hi, lo;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80)      { c = d;        trailing = 0; }
        else if (d < 0xC0) { ret = -2; break; }  /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else               { ret = -2; break; }  /* no chance for this in UTF-16 */

        /* sequence cut off by the end of the block: wait for more input */
        if ((unsigned int) (inend - in) < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                break;
            c <<= 6;
            c |= d & 0x3F;
        }
        if (trailing != 0) {
            /* a continuation byte was missing: malformed input */
            ret = -2;
            break;
        }

        /* c is now a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        } else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
617:
1.7 daniel 618: /**
619: * xmlDetectCharEncoding:
620: * @in: a pointer to the first bytes of the XML entity, must be at least
621: * 4 bytes long.
1.25 daniel 622: * @len: pointer to the length of the buffer
1.7 daniel 623: *
624: * Guess the encoding of the entity using the first bytes of the entity content
625: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
626: *
627: * Returns one of the XML_CHAR_ENCODING_... values.
628: */
629: xmlCharEncoding
1.25 daniel 630: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 631: {
1.25 daniel 632: if (len >= 4) {
633: if ((in[0] == 0x00) && (in[1] == 0x00) &&
634: (in[2] == 0x00) && (in[3] == 0x3C))
635: return(XML_CHAR_ENCODING_UCS4BE);
636: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
637: (in[2] == 0x00) && (in[3] == 0x00))
638: return(XML_CHAR_ENCODING_UCS4LE);
639: if ((in[0] == 0x00) && (in[1] == 0x00) &&
640: (in[2] == 0x3C) && (in[3] == 0x00))
641: return(XML_CHAR_ENCODING_UCS4_2143);
642: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
643: (in[2] == 0x00) && (in[3] == 0x00))
644: return(XML_CHAR_ENCODING_UCS4_3412);
645: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
646: (in[2] == 0xA7) && (in[3] == 0x94))
647: return(XML_CHAR_ENCODING_EBCDIC);
648: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
649: (in[2] == 0x78) && (in[3] == 0x6D))
650: return(XML_CHAR_ENCODING_UTF8);
651: }
652: if (len >= 2) {
653: if ((in[0] == 0xFE) && (in[1] == 0xFF))
654: return(XML_CHAR_ENCODING_UTF16BE);
655: if ((in[0] == 0xFF) && (in[1] == 0xFE))
656: return(XML_CHAR_ENCODING_UTF16LE);
657: }
1.7 daniel 658: return(XML_CHAR_ENCODING_NONE);
659: }
660:
661: /**
662: * xmlParseCharEncoding:
1.18 daniel 663: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 664: *
665: * Conpare the string to the known encoding schemes already known. Note
666: * that the comparison is case insensitive accordingly to the section
667: * [XML] 4.3.3 Character Encoding in Entities.
668: *
669: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
670: * if not recognized.
671: */
672: xmlCharEncoding
1.8 daniel 673: xmlParseCharEncoding(const char* name)
1.7 daniel 674: {
675: char upper[500];
676: int i;
677:
678: for (i = 0;i < 499;i++) {
679: upper[i] = toupper(name[i]);
680: if (upper[i] == 0) break;
681: }
682: upper[i] = 0;
683:
684: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
685: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
686: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
687:
688: /*
689: * NOTE: if we were able to parse this, the endianness of UTF16 is
690: * already found and in use
691: */
692: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
693: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
694:
695: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
696: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
697: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
698:
699: /*
700: * NOTE: if we were able to parse this, the endianness of UCS4 is
701: * already found and in use
702: */
703: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
704: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
705: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
706:
707:
708: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
709: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
710: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
711:
712: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
713: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
714: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
715:
716: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
717: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
718: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
719: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
720: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
721: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
722: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
723:
724: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 725: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 726: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 727:
728: #ifdef DEBUG_ENCODING
729: fprintf(stderr, "Unknown encoding %s\n", name);
730: #endif
1.7 daniel 731: return(XML_CHAR_ENCODING_ERROR);
732: }
1.9 daniel 733:
1.38 ! daniel 734: /**
! 735: * xmlGetCharEncodingName:
! 736: * @enc: the encoding
! 737: *
! 738: * The "canonical" name for XML encoding.
! 739: * C.f. http://www.w3.org/TR/REC-xml#charencoding
! 740: * Section 4.3.3 Character Encoding in Entities
! 741: *
! 742: * Returns the canonical name for the given encoding
! 743: */
! 744:
! 745: const char*
! 746: xmlGetCharEncodingName(xmlCharEncoding enc) {
! 747: switch (enc) {
! 748: case XML_CHAR_ENCODING_ERROR:
! 749: return(NULL);
! 750: case XML_CHAR_ENCODING_NONE:
! 751: return(NULL);
! 752: case XML_CHAR_ENCODING_UTF8:
! 753: return("UTF-8");
! 754: case XML_CHAR_ENCODING_UTF16LE:
! 755: return("UTF-16");
! 756: case XML_CHAR_ENCODING_UTF16BE:
! 757: return("UTF-16");
! 758: case XML_CHAR_ENCODING_EBCDIC:
! 759: return("EBCDIC");
! 760: case XML_CHAR_ENCODING_UCS4LE:
! 761: return("ISO-10646-UCS-4");
! 762: case XML_CHAR_ENCODING_UCS4BE:
! 763: return("ISO-10646-UCS-4");
! 764: case XML_CHAR_ENCODING_UCS4_2143:
! 765: return("ISO-10646-UCS-4");
! 766: case XML_CHAR_ENCODING_UCS4_3412:
! 767: return("ISO-10646-UCS-4");
! 768: case XML_CHAR_ENCODING_UCS2:
! 769: return("ISO-10646-UCS-2");
! 770: case XML_CHAR_ENCODING_8859_1:
! 771: return("ISO-8859-1");
! 772: case XML_CHAR_ENCODING_8859_2:
! 773: return("ISO-8859-2");
! 774: case XML_CHAR_ENCODING_8859_3:
! 775: return("ISO-8859-3");
! 776: case XML_CHAR_ENCODING_8859_4:
! 777: return("ISO-8859-4");
! 778: case XML_CHAR_ENCODING_8859_5:
! 779: return("ISO-8859-5");
! 780: case XML_CHAR_ENCODING_8859_6:
! 781: return("ISO-8859-6");
! 782: case XML_CHAR_ENCODING_8859_7:
! 783: return("ISO-8859-7");
! 784: case XML_CHAR_ENCODING_8859_8:
! 785: return("ISO-8859-8");
! 786: case XML_CHAR_ENCODING_8859_9:
! 787: return("ISO-8859-9");
! 788: case XML_CHAR_ENCODING_2022_JP:
! 789: return("ISO-2022-JP");
! 790: case XML_CHAR_ENCODING_SHIFT_JIS:
! 791: return("Shift-JIS");
! 792: case XML_CHAR_ENCODING_EUC_JP:
! 793: return("EUC-JP");
! 794: }
! 795: return(NULL);
! 796: }
! 797:
1.9 daniel 798: /****************************************************************
799: * *
800: * Char encoding handlers *
801: * *
802: ****************************************************************/
803:
/*
 * Registry of encoding handlers: a table with MAX_ENCODING_HANDLERS slots
 * allocated by xmlInitCharEncodingHandlers() and released by
 * xmlCleanupCharEncodingHandlers(). nbCharEncodingHandler counts the
 * slots filled so far by xmlRegisterCharEncodingHandler().
 */
804: /* the size should be growable, but it's not a big deal ... */
805: #define MAX_ENCODING_HANDLERS 50
806: static xmlCharEncodingHandlerPtr *handlers = NULL;
807: static int nbCharEncodingHandler = 0;
808:
809: /*
810: * The default is UTF-8 for XML, that's also the default used for the
811: * parser internals, so the default encoding handler is NULL
812: */
813:
/* Reset to NULL by xmlCleanupCharEncodingHandlers(). */
814: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
815:
816: /**
817: * xmlNewCharEncodingHandler:
1.18 daniel 818: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 819: * @input: the xmlCharEncodingInputFunc to read that encoding
820: * @output: the xmlCharEncodingOutputFunc to write that encoding
821: *
822: * Create and registers an xmlCharEncodingHandler.
823: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
824: */
825: xmlCharEncodingHandlerPtr
1.25 daniel 826: xmlNewCharEncodingHandler(const char *name,
827: xmlCharEncodingInputFunc input,
1.9 daniel 828: xmlCharEncodingOutputFunc output) {
829: xmlCharEncodingHandlerPtr handler;
830: char upper[500];
831: int i;
832: char *up = 0;
833:
834: /*
835: * Keep only the uppercase version of the encoding.
836: */
837: if (name == NULL) {
838: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
839: return(NULL);
840: }
841: for (i = 0;i < 499;i++) {
842: upper[i] = toupper(name[i]);
843: if (upper[i] == 0) break;
844: }
845: upper[i] = 0;
1.16 daniel 846: up = xmlMemStrdup(upper);
1.9 daniel 847: if (up == NULL) {
848: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
849: return(NULL);
850: }
851:
852: /*
853: * allocate and fill-up an handler block.
854: */
855: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 856: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 857: if (handler == NULL) {
858: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
859: return(NULL);
860: }
861: handler->input = input;
862: handler->output = output;
863: handler->name = up;
864:
865: /*
866: * registers and returns the handler.
867: */
868: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 869: #ifdef DEBUG_ENCODING
870: fprintf(stderr, "Registered encoding handler for %s\n", name);
871: #endif
1.9 daniel 872: return(handler);
873: }
874:
875: /**
876: * xmlInitCharEncodingHandlers:
877: *
878: * Initialize the char encoding support, it registers the default
879: * encoding supported.
1.18 daniel 880: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 881: * in normal processing.
882: */
883: void
884: xmlInitCharEncodingHandlers(void) {
1.34 daniel 885: unsigned short int tst = 0x1234;
886: unsigned char *ptr = (unsigned char *) &tst;
887:
1.9 daniel 888: if (handlers != NULL) return;
889:
890: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 891: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 892:
893: if (*ptr == 0x12) xmlLittleEndian = 0;
894: else if (*ptr == 0x34) xmlLittleEndian = 1;
895: else fprintf(stderr, "Odd problem at endianness detection\n");
1.9 daniel 896:
897: if (handlers == NULL) {
898: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
899: return;
900: }
1.10 daniel 901: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 902: xmlUTF16LEHandler =
1.28 daniel 903: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
904: xmlUTF16BEHandler =
905: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 906: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 907: }
908:
909: /**
1.19 daniel 910: * xmlCleanupCharEncodingHandlers:
911: *
912: * Cleanup the memory allocated for the char encoding support, it
913: * unregisters all the encoding handlers.
914: */
915: void
916: xmlCleanupCharEncodingHandlers(void) {
917: if (handlers == NULL) return;
918:
919: for (;nbCharEncodingHandler > 0;) {
920: nbCharEncodingHandler--;
921: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 922: if (handlers[nbCharEncodingHandler]->name != NULL)
923: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 924: xmlFree(handlers[nbCharEncodingHandler]);
925: }
926: }
927: xmlFree(handlers);
928: handlers = NULL;
929: nbCharEncodingHandler = 0;
930: xmlDefaultCharEncodingHandler = NULL;
931: }
932:
933: /**
1.9 daniel 934: * xmlRegisterCharEncodingHandler:
935: * @handler: the xmlCharEncodingHandlerPtr handler block
936: *
937: * Register the char encoding handler, surprizing, isn't it ?
938: */
939: void
940: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
941: if (handlers == NULL) xmlInitCharEncodingHandlers();
942: if (handler == NULL) {
943: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
944: return;
945: }
946:
947: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
948: fprintf(stderr,
949: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
950: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
951: return;
952: }
953: handlers[nbCharEncodingHandler++] = handler;
954: }
955:
956: /**
957: * xmlGetCharEncodingHandler:
958: * @enc: an xmlCharEncoding value.
959: *
960: * Search in the registrered set the handler able to read/write that encoding.
961: *
962: * Returns the handler or NULL if not found
963: */
964: xmlCharEncodingHandlerPtr
965: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 966: xmlCharEncodingHandlerPtr handler;
967:
1.9 daniel 968: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 969: switch (enc) {
970: case XML_CHAR_ENCODING_ERROR:
971: return(NULL);
972: case XML_CHAR_ENCODING_NONE:
973: return(NULL);
974: case XML_CHAR_ENCODING_UTF8:
975: return(NULL);
976: case XML_CHAR_ENCODING_UTF16LE:
977: return(xmlUTF16LEHandler);
978: case XML_CHAR_ENCODING_UTF16BE:
979: return(xmlUTF16BEHandler);
980: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 981: handler = xmlFindCharEncodingHandler("EBCDIC");
982: if (handler != NULL) return(handler);
983: handler = xmlFindCharEncodingHandler("ebcdic");
984: if (handler != NULL) return(handler);
985: break;
1.38 ! daniel 986: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 987: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
988: if (handler != NULL) return(handler);
989: handler = xmlFindCharEncodingHandler("UCS-4");
990: if (handler != NULL) return(handler);
991: handler = xmlFindCharEncodingHandler("UCS4");
992: if (handler != NULL) return(handler);
993: break;
1.38 ! daniel 994: case XML_CHAR_ENCODING_UCS4LE:
! 995: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
! 996: if (handler != NULL) return(handler);
! 997: handler = xmlFindCharEncodingHandler("UCS-4");
! 998: if (handler != NULL) return(handler);
! 999: handler = xmlFindCharEncodingHandler("UCS4");
1.30 daniel 1000: if (handler != NULL) return(handler);
1001: break;
1.25 daniel 1002: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 1003: break;
1.25 daniel 1004: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 1005: break;
1.25 daniel 1006: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 1007: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1008: if (handler != NULL) return(handler);
1009: handler = xmlFindCharEncodingHandler("UCS-2");
1010: if (handler != NULL) return(handler);
1011: handler = xmlFindCharEncodingHandler("UCS2");
1012: if (handler != NULL) return(handler);
1013: break;
1.25 daniel 1014: case XML_CHAR_ENCODING_8859_1:
1015: case XML_CHAR_ENCODING_8859_2:
1016: case XML_CHAR_ENCODING_8859_3:
1017: case XML_CHAR_ENCODING_8859_4:
1018: case XML_CHAR_ENCODING_8859_5:
1019: case XML_CHAR_ENCODING_8859_6:
1020: case XML_CHAR_ENCODING_8859_7:
1021: case XML_CHAR_ENCODING_8859_8:
1022: case XML_CHAR_ENCODING_8859_9:
1023: return(NULL);
1024: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 1025: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1026: if (handler != NULL) return(handler);
1027: break;
1.25 daniel 1028: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 1029: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1030: if (handler != NULL) return(handler);
1031: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1032: if (handler != NULL) return(handler);
1033: handler = xmlFindCharEncodingHandler("Shift_JIS");
1034: if (handler != NULL) return(handler);
1035: break;
1.25 daniel 1036: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 1037: handler = xmlFindCharEncodingHandler("EUC-JP");
1038: if (handler != NULL) return(handler);
1039: break;
1040: default:
1041: break;
1.25 daniel 1042: }
1.30 daniel 1043:
1044: #ifdef DEBUG_ENCODING
1045: fprintf(stderr, "No handler found for encoding %d\n", enc);
1046: #endif
1.9 daniel 1047: return(NULL);
1048: }
1049:
1050: /**
1051: * xmlGetCharEncodingHandler:
1052: * @enc: a string describing the char encoding.
1053: *
1054: * Search in the registrered set the handler able to read/write that encoding.
1055: *
1056: * Returns the handler or NULL if not found
1057: */
1058: xmlCharEncodingHandlerPtr
1059: xmlFindCharEncodingHandler(const char *name) {
1.36 daniel 1060: xmlCharEncodingHandlerPtr enc;
1061: xmlCharEncoding alias;
1.30 daniel 1062: #ifdef LIBXML_ICONV_ENABLED
1063: iconv_t icv_in, icv_out;
1064: #endif /* LIBXML_ICONV_ENABLED */
1065: char upper[100];
1.9 daniel 1066: int i;
1067:
1068: if (handlers == NULL) xmlInitCharEncodingHandlers();
1069: if (name == NULL) return(xmlDefaultCharEncodingHandler);
1070: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1071:
1.36 daniel 1072: /*
1073: * Check first for directly registered encoding names
1074: */
1.30 daniel 1075: for (i = 0;i < 99;i++) {
1.9 daniel 1076: upper[i] = toupper(name[i]);
1077: if (upper[i] == 0) break;
1078: }
1079: upper[i] = 0;
1080:
1081: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 1082: if (!strcmp(upper, handlers[i]->name)) {
1083: #ifdef DEBUG_ENCODING
1084: fprintf(stderr, "Found registered handler for encoding %s\n", name);
1085: #endif
1.9 daniel 1086: return(handlers[i]);
1.30 daniel 1087: }
1.9 daniel 1088:
1.30 daniel 1089: #ifdef LIBXML_ICONV_ENABLED
1090: /* check whether iconv can handle this */
1.31 daniel 1091: icv_in = iconv_open("UTF-8", name);
1092: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 1093: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31 daniel 1094: enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 1095: if (enc == NULL) {
1096: iconv_close(icv_in);
1097: iconv_close(icv_out);
1098: return(NULL);
1099: }
1100: enc->name = NULL;
1.30 daniel 1101: enc->input = NULL;
1102: enc->output = NULL;
1103: enc->iconv_in = icv_in;
1104: enc->iconv_out = icv_out;
1105: #ifdef DEBUG_ENCODING
1106: fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1107: #endif
1108: return enc;
1109: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1110: fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1111: }
1112: #endif /* LIBXML_ICONV_ENABLED */
1.38 ! daniel 1113:
1.30 daniel 1114: #ifdef DEBUG_ENCODING
1115: fprintf(stderr, "No handler found for encoding %s\n", name);
1116: #endif
1.38 ! daniel 1117:
! 1118: /*
! 1119: * Fallback using the canonical names
! 1120: */
! 1121: alias = xmlParseCharEncoding(name);
! 1122: if (alias != XML_CHAR_ENCODING_ERROR) {
! 1123: const char* canon;
! 1124: canon = xmlGetCharEncodingName(alias);
! 1125: if ((canon != NULL) && (strcmp(name, canon))) {
! 1126: return(xmlFindCharEncodingHandler(canon));
! 1127: }
! 1128: }
! 1129:
1.9 daniel 1130: return(NULL);
1.30 daniel 1131: }
1132:
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes, or NULL to only reset
 *       the converter
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion and translate its outcome into the return
 * codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid in the source
 *        encoding or cannot be represented in the target encoding), or
 *     -3 if the last bytes of the input do not form a complete character
 *        (also used for any other iconv failure).
 *
 * The value of @inlen after return is the number of octets consumed
 * (0 when @in is NULL).
 * The value of @outlen after return is the number of octets produced.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;     /* iconv() returns a size_t, (size_t) -1 on error */

    ret = iconv(cd,
                &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    if (in != NULL) {
        *inlen -= (int) icv_inlen;
    } else {
        *inlen = 0;
    }
    /*
     * Always report what was really stored in @out: a reset call
     * (@in == NULL) may emit a shift sequence that must not be dropped.
     */
    *outlen -= (int) icv_outlen;

    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ)
            return -2;
#endif
#ifdef E2BIG
        if (errno == E2BIG)
            return -1;
#endif
        /* EINVAL (truncated input sequence) and anything else */
        return -3;
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1.38 ! daniel 1195:
! 1196: /**
! 1197: * xmlCharEncFirstLine:
! 1198: * @handler: char enconding transformation data structure
! 1199: * @out: an xmlBuffer for the output.
! 1200: * @in: an xmlBuffer for the input
! 1201: *
! 1202: * Front-end for the encoding handler input function, but handle only
! 1203: * the very first line, i.e. limit itself to 45 chars.
! 1204: *
! 1205: * Returns the number of byte written if success, or
! 1206: * -1 general error
! 1207: * -2 if the transcoding fails (for *in is not valid utf8 string or
! 1208: * the result of transformation can't fit into the encoding we want), or
! 1209: */
! 1210: int
! 1211: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
! 1212: xmlBufferPtr in) {
! 1213: int ret = -2;
! 1214: int written;
! 1215: int toconv;
! 1216:
! 1217: if (handler == NULL) return(-1);
! 1218: if (out == NULL) return(-1);
! 1219: if (in == NULL) return(-1);
! 1220:
! 1221: written = out->size - out->use;
! 1222: toconv = in->use;
! 1223: if (toconv * 2 >= written) {
! 1224: xmlBufferGrow(out, toconv * 2);
! 1225: written = out->size - out->use - 1;
! 1226: }
! 1227: /*
! 1228: * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
! 1229: * 45 chars should be sufficient to reach the end of the encoding
! 1230: * decalration without going too far inside the document content.
! 1231: */
! 1232: written = 45;
! 1233:
! 1234: if (handler->input != NULL) {
! 1235: ret = handler->input(&out->content[out->use], &written,
! 1236: in->content, &toconv);
! 1237: xmlBufferShrink(in, toconv);
! 1238: out->use += written;
! 1239: out->content[out->use] = 0;
! 1240: }
! 1241: #ifdef LIBXML_ICONV_ENABLED
! 1242: else if (handler->iconv_in != NULL) {
! 1243: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
! 1244: &written, in->content, &toconv);
! 1245: xmlBufferShrink(in, toconv);
! 1246: out->use += written;
! 1247: out->content[out->use] = 0;
! 1248: if (ret == -1) ret = -3;
! 1249: }
! 1250: #endif /* LIBXML_ICONV_ENABLED */
! 1251: #ifdef DEBUG_ENCODING
! 1252: switch (ret) {
! 1253: case 0:
! 1254: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
! 1255: toconv, written);
! 1256: break;
! 1257: case -1:
! 1258: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
! 1259: toconv, written, in->use);
! 1260: break;
! 1261: case -2:
! 1262: fprintf(stderr, "input conversion failed due to input error\n");
! 1263: break;
! 1264: case -3:
! 1265: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
! 1266: toconv, written, in->use);
! 1267: break;
! 1268: default:
! 1269: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
! 1270: }
! 1271: #endif
! 1272: /*
! 1273: * Ignore when input buffer is not on a boundary
! 1274: */
! 1275: if (ret == -3) ret = 0;
! 1276: if (ret == -1) ret = 0;
! 1277: return(ret);
! 1278: }
1.30 daniel 1279:
1280: /**
1281: * xmlCharEncInFunc:
1282: * @handler: char enconding transformation data structure
1.31 daniel 1283: * @out: an xmlBuffer for the output.
1284: * @in: an xmlBuffer for the input
1.30 daniel 1285: *
1286: * Generic front-end for the encoding handler input function
1287: *
1.31 daniel 1288: * Returns the number of byte written if success, or
1289: * -1 general error
1.30 daniel 1290: * -2 if the transcoding fails (for *in is not valid utf8 string or
1291: * the result of transformation can't fit into the encoding we want), or
1292: */
1293: int
1.31 daniel 1294: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1295: xmlBufferPtr in) {
1.30 daniel 1296: int ret = -2;
1.31 daniel 1297: int written;
1298: int toconv;
1.30 daniel 1299:
1.31 daniel 1300: if (handler == NULL) return(-1);
1301: if (out == NULL) return(-1);
1302: if (in == NULL) return(-1);
1303:
1304: written = out->size - out->use;
1305: toconv = in->use;
1306: if (toconv * 2 >= written) {
1307: xmlBufferGrow(out, toconv * 2);
1.33 daniel 1308: written = out->size - out->use - 1;
1.31 daniel 1309: }
1.30 daniel 1310: if (handler->input != NULL) {
1.32 daniel 1311: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1312: in->content, &toconv);
1313: xmlBufferShrink(in, toconv);
1314: out->use += written;
1.33 daniel 1315: out->content[out->use] = 0;
1.30 daniel 1316: }
1317: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1318: else if (handler->iconv_in != NULL) {
1319: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1320: &written, in->content, &toconv);
1321: xmlBufferShrink(in, toconv);
1322: out->use += written;
1.33 daniel 1323: out->content[out->use] = 0;
1324: if (ret == -1) ret = -3;
1.30 daniel 1325: }
1326: #endif /* LIBXML_ICONV_ENABLED */
1327: #ifdef DEBUG_ENCODING
1328: switch (ret) {
1329: case 0:
1330: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31 daniel 1331: toconv, written);
1.30 daniel 1332: break;
1333: case -1:
1.31 daniel 1334: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1335: toconv, written, in->use);
1.30 daniel 1336: break;
1337: case -2:
1338: fprintf(stderr, "input conversion failed due to input error\n");
1339: break;
1340: case -3:
1.31 daniel 1341: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1342: toconv, written, in->use);
1.30 daniel 1343: break;
1344: default:
1345: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1346: }
1347: #endif
1.33 daniel 1348: /*
1349: * Ignore when input buffer is not on a boundary
1350: */
1351: if (ret == -3) ret = 0;
1.30 daniel 1352: return(ret);
1353: }
1354:
1355: /**
1356: * xmlCharEncOutFunc:
1357: * @handler: char enconding transformation data structure
1.31 daniel 1358: * @out: an xmlBuffer for the output.
1359: * @in: an xmlBuffer for the input
1360: *
1361: * Generic front-end for the encoding handler output function
1.35 daniel 1362: * a first call with @in == NULL has to be made firs to initiate the
1363: * output in case of non-stateless encoding needing to initiate their
1364: * state or the output (like the BOM in UTF16).
1.30 daniel 1365: *
1.31 daniel 1366: * Returns the number of byte written if success, or
1367: * -1 general error
1.30 daniel 1368: * -2 if the transcoding fails (for *in is not valid utf8 string or
1369: * the result of transformation can't fit into the encoding we want), or
1370: */
1371: int
1.31 daniel 1372: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1373: xmlBufferPtr in) {
1.30 daniel 1374: int ret = -2;
1.31 daniel 1375: int written;
1376: int toconv;
1377:
1378: if (handler == NULL) return(-1);
1379: if (out == NULL) return(-1);
1.35 daniel 1380: written = out->size - out->use;
1381:
1382: if (in == NULL) {
1383: toconv = 0;
1384: if (handler->output != NULL) {
1385: ret = handler->output(&out->content[out->use], &written,
1386: NULL, &toconv);
1387: out->use += written;
1388: out->content[out->use] = 0;
1389: }
1390: #ifdef LIBXML_ICONV_ENABLED
1391: else if (handler->iconv_out != NULL) {
1392: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1393: &written, NULL, &toconv);
1394: out->use += written;
1395: out->content[out->use] = 0;
1396: }
1397: #endif /* LIBXML_ICONV_ENABLED */
1398: #ifdef DEBUG_ENCODING
1399: fprintf(stderr, "initialized encoder\n");
1400: #endif
1401: return(0);
1402: }
1.30 daniel 1403:
1.33 daniel 1404: toconv = in->use;
1405: if (toconv * 2 >= written) {
1406: xmlBufferGrow(out, toconv * 2);
1407: written = out->size - out->use - 1;
1408: }
1.30 daniel 1409: if (handler->output != NULL) {
1.33 daniel 1410: ret = handler->output(&out->content[out->use], &written,
1.35 daniel 1411: in->content, &toconv);
1.31 daniel 1412: xmlBufferShrink(in, toconv);
1413: out->use += written;
1.33 daniel 1414: out->content[out->use] = 0;
1.30 daniel 1415: }
1416: #ifdef LIBXML_ICONV_ENABLED
1417: else if (handler->iconv_out != NULL) {
1.31 daniel 1418: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1419: &written, in->content, &toconv);
1420: xmlBufferShrink(in, toconv);
1421: out->use += written;
1.33 daniel 1422: out->content[out->use] = 0;
1423: if (ret == -1) ret = -3;
1.30 daniel 1424: }
1425: #endif /* LIBXML_ICONV_ENABLED */
1426: #ifdef DEBUG_ENCODING
1427: switch (ret) {
1428: case 0:
1429: fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31 daniel 1430: toconv, written);
1.30 daniel 1431: break;
1432: case -1:
1433: fprintf(stderr, "output conversion failed by lack of space\n");
1434: break;
1435: case -2:
1436: fprintf(stderr, "output conversion failed due to output error\n");
1437: break;
1438: case -3:
1.31 daniel 1439: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1440: toconv, written, in->use);
1.30 daniel 1441: break;
1442: default:
1443: fprintf(stderr,"Unknown output conversion failed %d\n", ret);
1444: }
1445: #endif
1446: return(ret);
1447: }
1448:
1449: /**
1450: * xmlCharEncCloseFunc:
1451: * @handler: char enconding transformation data structure
1452: *
1453: * Generic front-end for hencoding handler close function
1454: *
1455: * Returns 0 if success, or -1 in case of error
1456: */
1457: int
1458: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1459: int ret = 0;
1.31 daniel 1460: if (handler == NULL) return(-1);
1461: if (handler->name == NULL) return(-1);
1.30 daniel 1462: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1463: /*
1464: * Iconv handlers can be oused only once, free the whole block.
1465: * and the associated icon resources.
1466: */
1.32 daniel 1467: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1468: if (handler->name != NULL)
1469: xmlFree(handler->name);
1470: handler->name = NULL;
1471: if (handler->iconv_out != NULL) {
1472: if (iconv_close(handler->iconv_out))
1473: ret = -1;
1474: handler->iconv_out = NULL;
1475: }
1476: if (handler->iconv_in != NULL) {
1477: if (iconv_close(handler->iconv_in))
1478: ret = -1;
1479: handler->iconv_in = NULL;
1480: }
1481: xmlFree(handler);
1.30 daniel 1482: }
1483: #endif /* LIBXML_ICONV_ENABLED */
1484: #ifdef DEBUG_ENCODING
1485: if (ret)
1486: fprintf(stderr, "failed to close the encoding handler\n");
1487: else
1488: fprintf(stderr, "closed the encoding handler\n");
1489:
1490: #endif
1491: return(ret);
1.9 daniel 1492: }
1493:
Webmaster