Annotation of XML/encoding.c, revision 1.39
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
1.39 ! daniel 6: * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
1.1 daniel 7: * [ISO-10646] UTF-8 and UTF-16 in Annexes
8: * [ISO-8859-1] ISO Latin-1 characters codes.
9: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10: * Worldwide Character Encoding -- Version 1.0", Addison-
11: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12: * described in Unicode Technical Report #4.
13: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14: * Information Interchange, ANSI X3.4-1986.
15: *
1.9 daniel 16: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 17: *
18: * See Copyright for the status of this software.
19: *
20: * Daniel.Veillard@w3.org
21: */
22:
1.21 daniel 23: #ifdef WIN32
24: #include "win32config.h"
25: #else
1.14 daniel 26: #include "config.h"
1.17 daniel 27: #endif
28:
29: #include <stdio.h>
30: #include <string.h>
31:
32: #ifdef HAVE_CTYPE_H
1.7 daniel 33: #include <ctype.h>
1.17 daniel 34: #endif
1.20 daniel 35: #ifdef HAVE_STDLIB_H
36: #include <stdlib.h>
37: #endif
1.30 daniel 38: #include <libxml/xmlversion.h>
39: #ifdef LIBXML_ICONV_ENABLED
40: #ifdef HAVE_ERRNO_H
41: #include <errno.h>
42: #endif
43: #endif
1.29 daniel 44: #include <libxml/encoding.h>
45: #include <libxml/xmlmemory.h>
1.3 daniel 46:
/*
 * Built-in UTF-16 handlers. Both stay NULL until
 * xmlInitCharEncodingHandlers() registers the "UTF-16LE" and "UTF-16BE"
 * handlers and stores the results here.
 */
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
49:
1.30 daniel 50: #ifdef LIBXML_ICONV_ENABLED
1.37 daniel 51: #if 0
1.30 daniel 52: #define DEBUG_ENCODING /* Define this to get encoding traces */
53: #endif
1.33 daniel 54: #endif
1.30 daniel 55:
/*
 * Host byte order flag: 1 for a little-endian host, 0 for a big-endian one.
 * Defaults to 1; xmlInitCharEncodingHandlers() probes the host and
 * overwrites it.
 */
static int xmlLittleEndian = 1;
57:
1.3 daniel 58: /*
59: * From rfc2044: encoding of the Unicode values on UTF-8:
60: *
61: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
62: * 0000 0000-0000 007F 0xxxxxxx
63: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
64: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
65: *
66: * I hope we won't use values > 0xFFFF anytime soon !
67: */
1.1 daniel 68:
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: number of bytes available in @utf;
 *        out: number of bytes used by the decoded char (0 on error)
 *
 * Read one UTF-8 encoded char (1 to 4 bytes) from @utf.
 * Over-long encodings are not rejected, but a continuation byte
 * (0x80-0xBF) in leading position is.
 *
 * Returns the char value, or -1 in case of error (NULL argument, truncated
 * or malformed sequence). @len is updated unless it is NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /* Without @len there is nothing to read and nowhere to report to. */
    if (len == NULL)
        return(-1);
    if ((utf == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /* 10xxxxxx can only be a trailing byte, never a lead byte. */
    if ((c & 0xc0) == 0x80)
        goto error;
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xe0) == 0xc0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xf0) == 0xe0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    /* Only 11110xxx lead bytes are left as valid; utf[3] is read last. */
    if ((*len < 4) || ((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    /* 4-byte code */
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    *len = 0;
    return(-1);
}
! 135:
/**
 * xmlCheckUTF8:
 * @utf:  pointer to a null-terminated, putatively UTF-8 encoded string
 *
 * Walks @utf one encoded char at a time and verifies that every lead byte
 * is followed by the expected number of 10xxxxxx trailing bytes.
 * The check is deliberately lenient: over-long encodings are accepted
 * (Java can emit them), sequences are limited to 4 bytes, and the
 * 0x10FFFF upper bound is not enforced.
 *
 * Returns 1 if @utf passes the check, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur = utf;
    unsigned char lead;

    while ((lead = *cur) != 0) {
        if ((lead & 0x80) == 0) {
            /* 1-byte code */
            cur++;
            continue;
        }
        if ((cur[1] & 0xc0) != 0x80)
            return(0);
        if ((lead & 0xe0) != 0xe0) {
            /* 2-byte code */
            cur += 2;
            continue;
        }
        if ((cur[2] & 0xc0) != 0x80)
            return(0);
        if ((lead & 0xf0) != 0xf0) {
            /* 3-byte code */
            cur += 3;
            continue;
        }
        if (((lead & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
            return(0);
        /* 4-byte code */
        cur += 4;
    }
    return(1);
}
179:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out; out: the number of bytes written
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  in: the length of @in; out: the number of bytes consumed
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Conversion stops, without error, as soon as the
 * next char does not fit entirely in @out; that char is left unconsumed.
 * A NULL @in is treated as an empty input.
 *
 * Returns 0.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned char c;

    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    outend = out + *outlen;
    inend = in + *inlen;

    /* @in only advances once the char has been fully written out. */
    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* 0x80..0xFF always needs exactly two UTF-8 bytes */
            if (outend - out < 2)
                break;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);

    return(0);
}
222:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out; out: the number of bytes written
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  in: the length of @in; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. An incomplete sequence at the end of @in is left
 * unconsumed. A NULL @in is treated as an empty input.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns 0 if success, -2 if the transcoding fails (char not
 * representable in ISO Latin 1 or malformed input), or -1 if @out is full.
 * In every case @inlen is set to the number of octets consumed and
 * @outlen to the number of octets produced before stopping.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned char c;
    int ret = 0;

    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    outend = out + *outlen;
    inend = in + *inlen;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = c;
        } else if (in == inend) {
            /* multi-byte sequence cut by the end of the block */
            break;
        } else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 which can be encoded as isolat1 */
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        } else {
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(ret);
}
272:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out; out: the number of bytes written
 * @inb:  a pointer to an array of UTF-16LE units passed as a byte array
 * @inlenb:  in: the length of @inb in bytes; out: the number of bytes
 *           consumed
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * Conversion stops, without error, when the next char does not fit
 * entirely in @out, or when @inb ends in the middle of a code unit or of
 * a surrogate pair; the incomplete data is left unconsumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * not followed by a low surrogate).
 * In both cases @inlenb and @outlen are updated as described above.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int needed, bits;

    inend = inb + ((*inlenb > 0) ? *inlenb : 0);

    /* A trailing odd byte never satisfies the condition and is ignored. */
    while (inend - in >= 2) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                break;                   /* low half not available yet */
            d = (unsigned int) next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (in - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) needed = 1;
        else if (c < 0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else needed = 4;

        /* never emit a partial UTF-8 sequence */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:
                *out++ = c;
                break;
            case 2:
                *out++ = ((c >> 6) & 0x1F) | 0xC0;
                break;
            case 3:
                *out++ = ((c >> 12) & 0x0F) | 0xE0;
                break;
            default:
                *out++ = ((c >> 18) & 0x07) | 0xF0;
                break;
        }
        for (bits = (needed - 2) * 6; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(0);
}
360:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @outb in bytes; out: the number of bytes
 *           written
 * @in:  a pointer to an array of UTF-8 chars, or NULL to initialize
 * @inlen:  in: the length of @in; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is stored byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * When @in is NULL the call is an initialization: the FF FE Byte Order
 * Mark is written if @outb has room for it.
 * Conversion stops, without error, when the next char does not fit in
 * @outb or when @in ends in the middle of a sequence; the incomplete data
 * is left unconsumed.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 * In all cases @outlen and @inlen are updated as described above.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be produced */
    outend = outb + ((*outlen > 0) ? (*outlen / 2) * 2 : 0);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the block: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                ret = -2;
                break;
            }
            c <<= 6;
            c |= d & 0x3F;
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        }
        else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        }
        else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
478:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out; out: the number of bytes written
 * @inb:  a pointer to an array of UTF-16BE units passed as a byte array
 * @inlenb:  in: the length of @inb in bytes; out: the number of bytes
 *           consumed
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * Conversion stops, without error, when the next char does not fit
 * entirely in @out, or when @inb ends in the middle of a code unit or of
 * a surrogate pair (same policy as UTF16LEToUTF8); the incomplete data is
 * left unconsumed and no partial UTF-8 sequence is ever written.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * not followed by a low surrogate).
 * In both cases @inlenb and @outlen are updated as described above.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int needed, bits;

    inend = inb + ((*inlenb > 0) ? *inlenb : 0);

    /* A trailing odd byte never satisfies the condition and is ignored. */
    while (inend - in >= 2) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                break;                   /* low half not available yet */
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (in - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) needed = 1;
        else if (c < 0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else needed = 4;

        /* never emit a partial UTF-8 sequence */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:
                *out++ = c;
                break;
            case 2:
                *out++ = ((c >> 6) & 0x1F) | 0xC0;
                break;
            case 3:
                *out++ = ((c >> 12) & 0x0F) | 0xE0;
                break;
            default:
                *out++ = ((c >> 18) & 0x07) | 0xF0;
                break;
        }
        for (bits = (needed - 2) * 6; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(0);
}
570:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @outb in bytes; out: the number of bytes
 *           written
 * @in:  a pointer to an array of UTF-8 chars, or NULL to initialize
 * @inlen:  in: the length of @in; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is stored byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * When @in is NULL the call is an initialization: the FE FF Byte Order
 * Mark is written if @outb has room for it.
 * Conversion stops, without error, when the next char does not fit in
 * @outb or when @in ends in the middle of a sequence; the incomplete data
 * is left unconsumed.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 * In all cases @outlen and @inlen are updated as described above.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be produced */
    outend = outb + ((*outlen > 0) ? (*outlen / 2) * 2 : 0);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the block: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                ret = -2;
                break;
            }
            c <<= 6;
            c |= d & 0x3F;
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        }
        else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        }
        else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
685:
1.7 daniel 686: /**
687: * xmlDetectCharEncoding:
688: * @in: a pointer to the first bytes of the XML entity, must be at least
689: * 4 bytes long.
1.25 daniel 690: * @len: pointer to the length of the buffer
1.7 daniel 691: *
692: * Guess the encoding of the entity using the first bytes of the entity content
693: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
694: *
695: * Returns one of the XML_CHAR_ENCODING_... values.
696: */
697: xmlCharEncoding
1.25 daniel 698: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 699: {
1.25 daniel 700: if (len >= 4) {
701: if ((in[0] == 0x00) && (in[1] == 0x00) &&
702: (in[2] == 0x00) && (in[3] == 0x3C))
703: return(XML_CHAR_ENCODING_UCS4BE);
704: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
705: (in[2] == 0x00) && (in[3] == 0x00))
706: return(XML_CHAR_ENCODING_UCS4LE);
707: if ((in[0] == 0x00) && (in[1] == 0x00) &&
708: (in[2] == 0x3C) && (in[3] == 0x00))
709: return(XML_CHAR_ENCODING_UCS4_2143);
710: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
711: (in[2] == 0x00) && (in[3] == 0x00))
712: return(XML_CHAR_ENCODING_UCS4_3412);
713: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
714: (in[2] == 0xA7) && (in[3] == 0x94))
715: return(XML_CHAR_ENCODING_EBCDIC);
716: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
717: (in[2] == 0x78) && (in[3] == 0x6D))
718: return(XML_CHAR_ENCODING_UTF8);
719: }
720: if (len >= 2) {
721: if ((in[0] == 0xFE) && (in[1] == 0xFF))
722: return(XML_CHAR_ENCODING_UTF16BE);
723: if ((in[0] == 0xFF) && (in[1] == 0xFE))
724: return(XML_CHAR_ENCODING_UTF16LE);
725: }
1.7 daniel 726: return(XML_CHAR_ENCODING_NONE);
727: }
728:
729: /**
730: * xmlParseCharEncoding:
1.18 daniel 731: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 732: *
733: * Conpare the string to the known encoding schemes already known. Note
734: * that the comparison is case insensitive accordingly to the section
735: * [XML] 4.3.3 Character Encoding in Entities.
736: *
737: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
738: * if not recognized.
739: */
740: xmlCharEncoding
1.8 daniel 741: xmlParseCharEncoding(const char* name)
1.7 daniel 742: {
743: char upper[500];
744: int i;
745:
746: for (i = 0;i < 499;i++) {
747: upper[i] = toupper(name[i]);
748: if (upper[i] == 0) break;
749: }
750: upper[i] = 0;
751:
752: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
753: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
754: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
755:
756: /*
757: * NOTE: if we were able to parse this, the endianness of UTF16 is
758: * already found and in use
759: */
760: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
761: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
762:
763: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
764: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
765: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
766:
767: /*
768: * NOTE: if we were able to parse this, the endianness of UCS4 is
769: * already found and in use
770: */
771: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
772: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
773: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
774:
775:
776: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
777: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
778: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
779:
780: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
781: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
782: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
783:
784: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
785: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
786: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
787: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
788: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
789: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
790: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
791:
792: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 793: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 794: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 795:
796: #ifdef DEBUG_ENCODING
797: fprintf(stderr, "Unknown encoding %s\n", name);
798: #endif
1.7 daniel 799: return(XML_CHAR_ENCODING_ERROR);
800: }
1.9 daniel 801:
1.38 daniel 802: /**
803: * xmlGetCharEncodingName:
804: * @enc: the encoding
805: *
806: * The "canonical" name for XML encoding.
807: * C.f. http://www.w3.org/TR/REC-xml#charencoding
808: * Section 4.3.3 Character Encoding in Entities
809: *
810: * Returns the canonical name for the given encoding
811: */
812:
813: const char*
814: xmlGetCharEncodingName(xmlCharEncoding enc) {
815: switch (enc) {
816: case XML_CHAR_ENCODING_ERROR:
817: return(NULL);
818: case XML_CHAR_ENCODING_NONE:
819: return(NULL);
820: case XML_CHAR_ENCODING_UTF8:
821: return("UTF-8");
822: case XML_CHAR_ENCODING_UTF16LE:
823: return("UTF-16");
824: case XML_CHAR_ENCODING_UTF16BE:
825: return("UTF-16");
826: case XML_CHAR_ENCODING_EBCDIC:
827: return("EBCDIC");
828: case XML_CHAR_ENCODING_UCS4LE:
829: return("ISO-10646-UCS-4");
830: case XML_CHAR_ENCODING_UCS4BE:
831: return("ISO-10646-UCS-4");
832: case XML_CHAR_ENCODING_UCS4_2143:
833: return("ISO-10646-UCS-4");
834: case XML_CHAR_ENCODING_UCS4_3412:
835: return("ISO-10646-UCS-4");
836: case XML_CHAR_ENCODING_UCS2:
837: return("ISO-10646-UCS-2");
838: case XML_CHAR_ENCODING_8859_1:
839: return("ISO-8859-1");
840: case XML_CHAR_ENCODING_8859_2:
841: return("ISO-8859-2");
842: case XML_CHAR_ENCODING_8859_3:
843: return("ISO-8859-3");
844: case XML_CHAR_ENCODING_8859_4:
845: return("ISO-8859-4");
846: case XML_CHAR_ENCODING_8859_5:
847: return("ISO-8859-5");
848: case XML_CHAR_ENCODING_8859_6:
849: return("ISO-8859-6");
850: case XML_CHAR_ENCODING_8859_7:
851: return("ISO-8859-7");
852: case XML_CHAR_ENCODING_8859_8:
853: return("ISO-8859-8");
854: case XML_CHAR_ENCODING_8859_9:
855: return("ISO-8859-9");
856: case XML_CHAR_ENCODING_2022_JP:
857: return("ISO-2022-JP");
858: case XML_CHAR_ENCODING_SHIFT_JIS:
859: return("Shift-JIS");
860: case XML_CHAR_ENCODING_EUC_JP:
861: return("EUC-JP");
862: }
863: return(NULL);
864: }
865:
1.9 daniel 866: /****************************************************************
867: * *
868: * Char encoding handlers *
869: * *
870: ****************************************************************/
871:
/* the size should be growable, but it's not a big deal ... */
/* Capacity of the handlers table allocated by xmlInitCharEncodingHandlers(). */
#define MAX_ENCODING_HANDLERS 50
/* Table of registered handlers; NULL until the first initialization. */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* Number of slots of the handlers table currently in use. */
static int nbCharEncodingHandler = 0;
876:
/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * xmlCleanupCharEncodingHandlers() resets it to NULL as well.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
883:
884: /**
885: * xmlNewCharEncodingHandler:
1.18 daniel 886: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 887: * @input: the xmlCharEncodingInputFunc to read that encoding
888: * @output: the xmlCharEncodingOutputFunc to write that encoding
889: *
890: * Create and registers an xmlCharEncodingHandler.
891: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
892: */
893: xmlCharEncodingHandlerPtr
1.25 daniel 894: xmlNewCharEncodingHandler(const char *name,
895: xmlCharEncodingInputFunc input,
1.9 daniel 896: xmlCharEncodingOutputFunc output) {
897: xmlCharEncodingHandlerPtr handler;
898: char upper[500];
899: int i;
900: char *up = 0;
901:
902: /*
903: * Keep only the uppercase version of the encoding.
904: */
905: if (name == NULL) {
906: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
907: return(NULL);
908: }
909: for (i = 0;i < 499;i++) {
910: upper[i] = toupper(name[i]);
911: if (upper[i] == 0) break;
912: }
913: upper[i] = 0;
1.16 daniel 914: up = xmlMemStrdup(upper);
1.9 daniel 915: if (up == NULL) {
916: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
917: return(NULL);
918: }
919:
920: /*
921: * allocate and fill-up an handler block.
922: */
923: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 924: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 925: if (handler == NULL) {
926: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
927: return(NULL);
928: }
929: handler->input = input;
930: handler->output = output;
931: handler->name = up;
932:
933: /*
934: * registers and returns the handler.
935: */
936: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 937: #ifdef DEBUG_ENCODING
938: fprintf(stderr, "Registered encoding handler for %s\n", name);
939: #endif
1.9 daniel 940: return(handler);
941: }
942:
943: /**
944: * xmlInitCharEncodingHandlers:
945: *
946: * Initialize the char encoding support, it registers the default
947: * encoding supported.
1.18 daniel 948: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 949: * in normal processing.
950: */
951: void
952: xmlInitCharEncodingHandlers(void) {
1.34 daniel 953: unsigned short int tst = 0x1234;
954: unsigned char *ptr = (unsigned char *) &tst;
955:
1.9 daniel 956: if (handlers != NULL) return;
957:
958: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 959: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 960:
961: if (*ptr == 0x12) xmlLittleEndian = 0;
962: else if (*ptr == 0x34) xmlLittleEndian = 1;
963: else fprintf(stderr, "Odd problem at endianness detection\n");
1.9 daniel 964:
965: if (handlers == NULL) {
966: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
967: return;
968: }
1.10 daniel 969: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 970: xmlUTF16LEHandler =
1.28 daniel 971: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
972: xmlUTF16BEHandler =
973: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 974: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 975: }
976:
977: /**
1.19 daniel 978: * xmlCleanupCharEncodingHandlers:
979: *
980: * Cleanup the memory allocated for the char encoding support, it
981: * unregisters all the encoding handlers.
982: */
983: void
984: xmlCleanupCharEncodingHandlers(void) {
985: if (handlers == NULL) return;
986:
987: for (;nbCharEncodingHandler > 0;) {
988: nbCharEncodingHandler--;
989: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 990: if (handlers[nbCharEncodingHandler]->name != NULL)
991: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 992: xmlFree(handlers[nbCharEncodingHandler]);
993: }
994: }
995: xmlFree(handlers);
996: handlers = NULL;
997: nbCharEncodingHandler = 0;
998: xmlDefaultCharEncodingHandler = NULL;
999: }
1000:
1001: /**
1.9 daniel 1002: * xmlRegisterCharEncodingHandler:
1003: * @handler: the xmlCharEncodingHandlerPtr handler block
1004: *
1005: * Register the char encoding handler, surprizing, isn't it ?
1006: */
1007: void
1008: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1009: if (handlers == NULL) xmlInitCharEncodingHandlers();
1010: if (handler == NULL) {
1011: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
1012: return;
1013: }
1014:
1015: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1016: fprintf(stderr,
1017: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1018: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1019: return;
1020: }
1021: handlers[nbCharEncodingHandler++] = handler;
1022: }
1023:
1024: /**
1025: * xmlGetCharEncodingHandler:
1026: * @enc: an xmlCharEncoding value.
1027: *
1028: * Search in the registrered set the handler able to read/write that encoding.
1029: *
1030: * Returns the handler or NULL if not found
1031: */
1032: xmlCharEncodingHandlerPtr
1033: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 1034: xmlCharEncodingHandlerPtr handler;
1035:
1.9 daniel 1036: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 1037: switch (enc) {
1038: case XML_CHAR_ENCODING_ERROR:
1039: return(NULL);
1040: case XML_CHAR_ENCODING_NONE:
1041: return(NULL);
1042: case XML_CHAR_ENCODING_UTF8:
1043: return(NULL);
1044: case XML_CHAR_ENCODING_UTF16LE:
1045: return(xmlUTF16LEHandler);
1046: case XML_CHAR_ENCODING_UTF16BE:
1047: return(xmlUTF16BEHandler);
1048: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 1049: handler = xmlFindCharEncodingHandler("EBCDIC");
1050: if (handler != NULL) return(handler);
1051: handler = xmlFindCharEncodingHandler("ebcdic");
1052: if (handler != NULL) return(handler);
1053: break;
1.38 daniel 1054: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 1055: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1056: if (handler != NULL) return(handler);
1057: handler = xmlFindCharEncodingHandler("UCS-4");
1058: if (handler != NULL) return(handler);
1059: handler = xmlFindCharEncodingHandler("UCS4");
1060: if (handler != NULL) return(handler);
1061: break;
1.38 daniel 1062: case XML_CHAR_ENCODING_UCS4LE:
1063: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1064: if (handler != NULL) return(handler);
1065: handler = xmlFindCharEncodingHandler("UCS-4");
1066: if (handler != NULL) return(handler);
1067: handler = xmlFindCharEncodingHandler("UCS4");
1.30 daniel 1068: if (handler != NULL) return(handler);
1069: break;
1.25 daniel 1070: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 1071: break;
1.25 daniel 1072: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 1073: break;
1.25 daniel 1074: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 1075: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1076: if (handler != NULL) return(handler);
1077: handler = xmlFindCharEncodingHandler("UCS-2");
1078: if (handler != NULL) return(handler);
1079: handler = xmlFindCharEncodingHandler("UCS2");
1080: if (handler != NULL) return(handler);
1081: break;
1.25 daniel 1082: case XML_CHAR_ENCODING_8859_1:
1083: case XML_CHAR_ENCODING_8859_2:
1084: case XML_CHAR_ENCODING_8859_3:
1085: case XML_CHAR_ENCODING_8859_4:
1086: case XML_CHAR_ENCODING_8859_5:
1087: case XML_CHAR_ENCODING_8859_6:
1088: case XML_CHAR_ENCODING_8859_7:
1089: case XML_CHAR_ENCODING_8859_8:
1090: case XML_CHAR_ENCODING_8859_9:
1091: return(NULL);
1092: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 1093: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1094: if (handler != NULL) return(handler);
1095: break;
1.25 daniel 1096: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 1097: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1098: if (handler != NULL) return(handler);
1099: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1100: if (handler != NULL) return(handler);
1101: handler = xmlFindCharEncodingHandler("Shift_JIS");
1102: if (handler != NULL) return(handler);
1103: break;
1.25 daniel 1104: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 1105: handler = xmlFindCharEncodingHandler("EUC-JP");
1106: if (handler != NULL) return(handler);
1107: break;
1108: default:
1109: break;
1.25 daniel 1110: }
1.30 daniel 1111:
1112: #ifdef DEBUG_ENCODING
1113: fprintf(stderr, "No handler found for encoding %d\n", enc);
1114: #endif
1.9 daniel 1115: return(NULL);
1116: }
1117:
1118: /**
1119: * xmlGetCharEncodingHandler:
1120: * @enc: a string describing the char encoding.
1121: *
1122: * Search in the registrered set the handler able to read/write that encoding.
1123: *
1124: * Returns the handler or NULL if not found
1125: */
1126: xmlCharEncodingHandlerPtr
1127: xmlFindCharEncodingHandler(const char *name) {
1.36 daniel 1128: xmlCharEncodingHandlerPtr enc;
1129: xmlCharEncoding alias;
1.30 daniel 1130: #ifdef LIBXML_ICONV_ENABLED
1131: iconv_t icv_in, icv_out;
1132: #endif /* LIBXML_ICONV_ENABLED */
1133: char upper[100];
1.9 daniel 1134: int i;
1135:
1136: if (handlers == NULL) xmlInitCharEncodingHandlers();
1137: if (name == NULL) return(xmlDefaultCharEncodingHandler);
1138: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1139:
1.36 daniel 1140: /*
1141: * Check first for directly registered encoding names
1142: */
1.30 daniel 1143: for (i = 0;i < 99;i++) {
1.9 daniel 1144: upper[i] = toupper(name[i]);
1145: if (upper[i] == 0) break;
1146: }
1147: upper[i] = 0;
1148:
1149: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 1150: if (!strcmp(upper, handlers[i]->name)) {
1151: #ifdef DEBUG_ENCODING
1152: fprintf(stderr, "Found registered handler for encoding %s\n", name);
1153: #endif
1.9 daniel 1154: return(handlers[i]);
1.30 daniel 1155: }
1.9 daniel 1156:
1.30 daniel 1157: #ifdef LIBXML_ICONV_ENABLED
1158: /* check whether iconv can handle this */
1.31 daniel 1159: icv_in = iconv_open("UTF-8", name);
1160: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 1161: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31 daniel 1162: enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 1163: if (enc == NULL) {
1164: iconv_close(icv_in);
1165: iconv_close(icv_out);
1166: return(NULL);
1167: }
1168: enc->name = NULL;
1.30 daniel 1169: enc->input = NULL;
1170: enc->output = NULL;
1171: enc->iconv_in = icv_in;
1172: enc->iconv_out = icv_out;
1173: #ifdef DEBUG_ENCODING
1174: fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1175: #endif
1176: return enc;
1177: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1178: fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1179: }
1180: #endif /* LIBXML_ICONV_ENABLED */
1.38 daniel 1181:
1.30 daniel 1182: #ifdef DEBUG_ENCODING
1183: fprintf(stderr, "No handler found for encoding %s\n", name);
1184: #endif
1.38 daniel 1185:
1186: /*
1187: * Fallback using the canonical names
1188: */
1189: alias = xmlParseCharEncoding(name);
1190: if (alias != XML_CHAR_ENCODING_ERROR) {
1191: const char* canon;
1192: canon = xmlGetCharEncodingName(alias);
1193: if ((canon != NULL) && (strcmp(name, canon))) {
1194: return(xmlFindCharEncodingHandler(canon));
1195: }
1196: }
1197:
1.9 daniel 1198: return(NULL);
1.30 daniel 1199: }
1200:
1201: #ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:		iconv converter data structure
 * @out:	a pointer to an array of bytes to store the result
 * @outlen:	the length of @out
 * @in:		a pointer to an array of bytes to convert, or NULL
 * @inlen:	the length of @in
 *
 * Wrapper around iconv() working with int lengths and mapping the
 * errno values set by iconv() to the return codes used by the
 * encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid for the source
 *        encoding or the result of transformation can't fit into the
 *        encoding we want), or
 *     -3 if there the last byte can't form a single output char.
 *
 * When @in is not NULL, the value of @inlen after return is the number
 * of octets consumed and the value of @outlen is the number of octets
 * produced. When @in is NULL both are set to 0.
 */
static int
xmlIconvWrapper(iconv_t cd,
        unsigned char *out, int *outlen,
        const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;

    /*
     * The second argument of iconv() is declared "char **" by POSIX
     * and "const char **" by some older implementations; going through
     * "void *" is accepted by both prototypes.
     */
    ret = iconv(cd,
                (void *) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() leaves the remaining sizes, turn them into counts. */
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
        /*
         * Each test is independent so that the function still compiles
         * when one of the errno macros is not provided by the platform.
         */
#ifdef EILSEQ
        if (errno == EILSEQ)
            return(-2);
#endif
#ifdef E2BIG
        if (errno == E2BIG)
            return(-1);
#endif
        /* EINVAL (incomplete trailing sequence) or any other error. */
        return(-3);
    }
    return(0);
}
1262: #endif /* LIBXML_ICONV_ENABLED */
1.38 daniel 1263:
1264: /**
1265: * xmlCharEncFirstLine:
1266: * @handler: char enconding transformation data structure
1267: * @out: an xmlBuffer for the output.
1268: * @in: an xmlBuffer for the input
1269: *
1270: * Front-end for the encoding handler input function, but handle only
1271: * the very first line, i.e. limit itself to 45 chars.
1272: *
1273: * Returns the number of byte written if success, or
1274: * -1 general error
1275: * -2 if the transcoding fails (for *in is not valid utf8 string or
1276: * the result of transformation can't fit into the encoding we want), or
1277: */
1278: int
1279: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1280: xmlBufferPtr in) {
1281: int ret = -2;
1282: int written;
1283: int toconv;
1284:
1285: if (handler == NULL) return(-1);
1286: if (out == NULL) return(-1);
1287: if (in == NULL) return(-1);
1288:
1289: written = out->size - out->use;
1290: toconv = in->use;
1291: if (toconv * 2 >= written) {
1.39 ! daniel 1292: xmlBufferGrow(out, toconv);
1.38 daniel 1293: written = out->size - out->use - 1;
1294: }
1.39 ! daniel 1295:
1.38 daniel 1296: /*
1297: * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1298: * 45 chars should be sufficient to reach the end of the encoding
1299: * decalration without going too far inside the document content.
1300: */
1301: written = 45;
1302:
1303: if (handler->input != NULL) {
1304: ret = handler->input(&out->content[out->use], &written,
1305: in->content, &toconv);
1306: xmlBufferShrink(in, toconv);
1307: out->use += written;
1308: out->content[out->use] = 0;
1309: }
1310: #ifdef LIBXML_ICONV_ENABLED
1311: else if (handler->iconv_in != NULL) {
1312: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1313: &written, in->content, &toconv);
1314: xmlBufferShrink(in, toconv);
1315: out->use += written;
1316: out->content[out->use] = 0;
1317: if (ret == -1) ret = -3;
1318: }
1319: #endif /* LIBXML_ICONV_ENABLED */
1320: #ifdef DEBUG_ENCODING
1321: switch (ret) {
1322: case 0:
1323: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1324: toconv, written);
1325: break;
1326: case -1:
1327: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1328: toconv, written, in->use);
1329: break;
1330: case -2:
1331: fprintf(stderr, "input conversion failed due to input error\n");
1332: break;
1333: case -3:
1334: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1335: toconv, written, in->use);
1336: break;
1337: default:
1338: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1339: }
1340: #endif
1341: /*
1342: * Ignore when input buffer is not on a boundary
1343: */
1344: if (ret == -3) ret = 0;
1345: if (ret == -1) ret = 0;
1346: return(ret);
1347: }
1.30 daniel 1348:
1349: /**
1350: * xmlCharEncInFunc:
1351: * @handler: char enconding transformation data structure
1.31 daniel 1352: * @out: an xmlBuffer for the output.
1353: * @in: an xmlBuffer for the input
1.30 daniel 1354: *
1355: * Generic front-end for the encoding handler input function
1356: *
1.31 daniel 1357: * Returns the number of byte written if success, or
1358: * -1 general error
1.30 daniel 1359: * -2 if the transcoding fails (for *in is not valid utf8 string or
1360: * the result of transformation can't fit into the encoding we want), or
1361: */
1362: int
1.31 daniel 1363: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1364: xmlBufferPtr in) {
1.30 daniel 1365: int ret = -2;
1.31 daniel 1366: int written;
1367: int toconv;
1.30 daniel 1368:
1.31 daniel 1369: if (handler == NULL) return(-1);
1370: if (out == NULL) return(-1);
1371: if (in == NULL) return(-1);
1372:
1373: written = out->size - out->use;
1374: toconv = in->use;
1375: if (toconv * 2 >= written) {
1376: xmlBufferGrow(out, toconv * 2);
1.33 daniel 1377: written = out->size - out->use - 1;
1.31 daniel 1378: }
1.30 daniel 1379: if (handler->input != NULL) {
1.32 daniel 1380: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1381: in->content, &toconv);
1382: xmlBufferShrink(in, toconv);
1383: out->use += written;
1.33 daniel 1384: out->content[out->use] = 0;
1.30 daniel 1385: }
1386: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1387: else if (handler->iconv_in != NULL) {
1388: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1389: &written, in->content, &toconv);
1390: xmlBufferShrink(in, toconv);
1391: out->use += written;
1.33 daniel 1392: out->content[out->use] = 0;
1393: if (ret == -1) ret = -3;
1.30 daniel 1394: }
1395: #endif /* LIBXML_ICONV_ENABLED */
1.39 ! daniel 1396: switch (ret) {
1.30 daniel 1397: #ifdef DEBUG_ENCODING
1398: case 0:
1399: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31 daniel 1400: toconv, written);
1.30 daniel 1401: break;
1402: case -1:
1.31 daniel 1403: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1404: toconv, written, in->use);
1.30 daniel 1405: break;
1406: case -3:
1.31 daniel 1407: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1408: toconv, written, in->use);
1.30 daniel 1409: break;
1.39 ! daniel 1410: #endif
! 1411: case -2:
! 1412: fprintf(stderr, "input conversion failed due to input error\n");
! 1413: fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
! 1414: in->content[0], in->content[1],
! 1415: in->content[2], in->content[3]);
1.30 daniel 1416: }
1.33 daniel 1417: /*
1418: * Ignore when input buffer is not on a boundary
1419: */
1420: if (ret == -3) ret = 0;
1.30 daniel 1421: return(ret);
1422: }
1423:
1424: /**
1425: * xmlCharEncOutFunc:
1426: * @handler: char enconding transformation data structure
1.31 daniel 1427: * @out: an xmlBuffer for the output.
1428: * @in: an xmlBuffer for the input
1429: *
1430: * Generic front-end for the encoding handler output function
1.35 daniel 1431: * a first call with @in == NULL has to be made firs to initiate the
1432: * output in case of non-stateless encoding needing to initiate their
1433: * state or the output (like the BOM in UTF16).
1.39 ! daniel 1434: * In case of UTF8 sequence conversion errors for the given encoder,
! 1435: * the content will be automatically remapped to a CharRef sequence.
1.30 daniel 1436: *
1.31 daniel 1437: * Returns the number of byte written if success, or
1438: * -1 general error
1.30 daniel 1439: * -2 if the transcoding fails (for *in is not valid utf8 string or
1440: * the result of transformation can't fit into the encoding we want), or
1441: */
1442: int
1.31 daniel 1443: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1444: xmlBufferPtr in) {
1.30 daniel 1445: int ret = -2;
1.31 daniel 1446: int written;
1447: int toconv;
1.39 ! daniel 1448: int output = 0;
1.31 daniel 1449:
1450: if (handler == NULL) return(-1);
1451: if (out == NULL) return(-1);
1.39 ! daniel 1452:
! 1453: retry:
! 1454:
1.35 daniel 1455: written = out->size - out->use;
1456:
1.39 ! daniel 1457: /*
! 1458: * First specific handling of in = NULL, i.e. the initialization call
! 1459: */
1.35 daniel 1460: if (in == NULL) {
1461: toconv = 0;
1462: if (handler->output != NULL) {
1463: ret = handler->output(&out->content[out->use], &written,
1464: NULL, &toconv);
1465: out->use += written;
1466: out->content[out->use] = 0;
1467: }
1468: #ifdef LIBXML_ICONV_ENABLED
1469: else if (handler->iconv_out != NULL) {
1470: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1471: &written, NULL, &toconv);
1472: out->use += written;
1473: out->content[out->use] = 0;
1474: }
1475: #endif /* LIBXML_ICONV_ENABLED */
1476: #ifdef DEBUG_ENCODING
1477: fprintf(stderr, "initialized encoder\n");
1478: #endif
1479: return(0);
1480: }
1.30 daniel 1481:
1.39 ! daniel 1482: /*
! 1483: * Convertion itself.
! 1484: */
1.33 daniel 1485: toconv = in->use;
1486: if (toconv * 2 >= written) {
1487: xmlBufferGrow(out, toconv * 2);
1488: written = out->size - out->use - 1;
1489: }
1.30 daniel 1490: if (handler->output != NULL) {
1.33 daniel 1491: ret = handler->output(&out->content[out->use], &written,
1.35 daniel 1492: in->content, &toconv);
1.31 daniel 1493: xmlBufferShrink(in, toconv);
1494: out->use += written;
1.33 daniel 1495: out->content[out->use] = 0;
1.30 daniel 1496: }
1497: #ifdef LIBXML_ICONV_ENABLED
1498: else if (handler->iconv_out != NULL) {
1.31 daniel 1499: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1500: &written, in->content, &toconv);
1501: xmlBufferShrink(in, toconv);
1502: out->use += written;
1.33 daniel 1503: out->content[out->use] = 0;
1504: if (ret == -1) ret = -3;
1.30 daniel 1505: }
1506: #endif /* LIBXML_ICONV_ENABLED */
1.39 ! daniel 1507:
! 1508: if (ret >= 0) output += ret;
! 1509:
! 1510: /*
! 1511: * Attempt to handle error cases
! 1512: */
! 1513: switch (ret) {
1.30 daniel 1514: #ifdef DEBUG_ENCODING
1515: case 0:
1516: fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31 daniel 1517: toconv, written);
1.30 daniel 1518: break;
1519: case -1:
1520: fprintf(stderr, "output conversion failed by lack of space\n");
1521: break;
1522: case -3:
1.31 daniel 1523: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1524: toconv, written, in->use);
1.30 daniel 1525: break;
1.39 ! daniel 1526: #endif
! 1527: case -2: {
! 1528: int len = in->use;
! 1529: const char *utf = (const char *) in->content;
! 1530: int cur;
! 1531:
! 1532: cur = xmlGetUTF8Char(utf, &len);
! 1533: if (cur > 0) {
! 1534: xmlChar charref[20];
! 1535:
! 1536: #ifdef DEBUG_ENCODING
! 1537: fprintf(stderr, "handling output conversion error\n");
! 1538: fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
! 1539: in->content[0], in->content[1],
! 1540: in->content[2], in->content[3]);
! 1541: #endif
! 1542: /*
! 1543: * Removes the UTF8 sequence, and replace it by a charref
! 1544: * and continue the transcoding phase, hoping the error
! 1545: * did not mangle the encoder state.
! 1546: */
! 1547: sprintf(charref, "&#x%X;", cur);
! 1548: xmlBufferShrink(in, len);
! 1549: xmlBufferAddHead(in, charref, -1);
! 1550:
! 1551: goto retry;
! 1552: } else {
! 1553: fprintf(stderr, "output conversion failed due to conv error\n");
! 1554: fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
! 1555: in->content[0], in->content[1],
! 1556: in->content[2], in->content[3]);
! 1557: }
! 1558: break;
! 1559: }
1.30 daniel 1560: }
1561: return(ret);
1562: }
1563:
1564: /**
1565: * xmlCharEncCloseFunc:
1566: * @handler: char enconding transformation data structure
1567: *
1568: * Generic front-end for hencoding handler close function
1569: *
1570: * Returns 0 if success, or -1 in case of error
1571: */
1572: int
1573: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1574: int ret = 0;
1.31 daniel 1575: if (handler == NULL) return(-1);
1576: if (handler->name == NULL) return(-1);
1.30 daniel 1577: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1578: /*
1579: * Iconv handlers can be oused only once, free the whole block.
1580: * and the associated icon resources.
1581: */
1.32 daniel 1582: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1583: if (handler->name != NULL)
1584: xmlFree(handler->name);
1585: handler->name = NULL;
1586: if (handler->iconv_out != NULL) {
1587: if (iconv_close(handler->iconv_out))
1588: ret = -1;
1589: handler->iconv_out = NULL;
1590: }
1591: if (handler->iconv_in != NULL) {
1592: if (iconv_close(handler->iconv_in))
1593: ret = -1;
1594: handler->iconv_in = NULL;
1595: }
1596: xmlFree(handler);
1.30 daniel 1597: }
1598: #endif /* LIBXML_ICONV_ENABLED */
1599: #ifdef DEBUG_ENCODING
1600: if (ret)
1601: fprintf(stderr, "failed to close the encoding handler\n");
1602: else
1603: fprintf(stderr, "closed the encoding handler\n");
1604:
1605: #endif
1606: return(ret);
1.9 daniel 1607: }
1608:
Webmaster