Annotation of XML/encoding.c, revision 1.29
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
1.21 daniel 22: #ifdef WIN32
23: #include "win32config.h"
24: #else
1.14 daniel 25: #include "config.h"
1.17 daniel 26: #endif
27:
28: #include <stdio.h>
29: #include <string.h>
30:
31: #ifdef HAVE_CTYPE_H
1.7 daniel 32: #include <ctype.h>
1.17 daniel 33: #endif
1.20 daniel 34: #ifdef HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
1.29 ! daniel 37: #include <libxml/encoding.h>
! 38: #include <libxml/xmlmemory.h>
1.3 daniel 39:
1.25 daniel 40: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
41: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
42:
1.3 daniel 43: /*
44: * From rfc2044: encoding of the Unicode values on UTF-8:
45: *
46: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
47: * 0000 0000-0000 007F 0xxxxxxx
48: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
49: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
50: *
51: * I hope we won't use values > 0xFFFF anytime soon !
52: */
1.1 daniel 53:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being structurally valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict: it allows longer
 * (overlong) utf-8 sequences than necessary (Java is capable of producing
 * these if provoked), and while it enforces the 4-byte maximum size it
 * does not check for the 0x10ffff maximum value.
 * A continuation byte (10xxxxxx) found where a lead byte is expected is
 * rejected, and a NULL @utf is reported as invalid.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix = 0;
    unsigned char c;

    if (utf == NULL)
        return(0);

    while ((c = utf[ix]) != 0) {
        if ((c & 0x80) == 0) {
            /* 1-byte code */
            ix++;
            continue;
        }
        /* a continuation byte can never start a sequence */
        if ((c & 0xc0) == 0x80)
            return(0);
        /*
         * Each trailing byte is inspected only after the previous one was
         * accepted as a continuation byte, so the terminating NUL is never
         * stepped over.
         */
        if ((utf[ix + 1] & 0xc0) != 0x80)
            return(0);
        if ((c & 0xe0) != 0xe0) {
            /* 2-byte code */
            ix += 2;
            continue;
        }
        if ((utf[ix + 2] & 0xc0) != 0x80)
            return(0);
        if ((c & 0xf0) != 0xf0) {
            /* 3-byte code */
            ix += 3;
            continue;
        }
        /* 4-byte code: lead must be 11110xxx, longer forms are refused */
        if (((c & 0xf8) != 0xf0) || ((utf[ix + 3] & 0xc0) != 0x80))
            return(0);
        ix += 4;
    }
    return(1);
}
97:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in (read only, never updated by this routine)
 *
 * Convert a block of ISO Latin 1 chars to UTF-8. Bytes below 0x80 are
 * copied unchanged; every other byte expands to a two-byte sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    unsigned char *const limit = out + outlen;
    const unsigned char *const stop = in + *inlen;
    const unsigned char *src;

    for (src = in; src < stop; src++) {
        const unsigned char ch = *src;

        /* both the ASCII and the two-byte case need at least one slot */
        if (dst >= limit)
            return(-1);
        if (ch < 0x80) {
            *dst++ = ch;
            continue;
        }
        *dst++ = 0xC0 | (ch >> 6);
        if (dst >= limit)
            return(-1);
        *dst++ = 0x80 | (ch & 0x3F);
    }
    return(dst - out);
}
132:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only single-byte sequences and two-byte sequences
 * whose lead byte is 0xC0..0xC3 can be represented.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf8 string or
 *     the result of transformation can't fit into the encoding we want).
 * When the input ends on a lead byte with no continuation, @inlen is
 * decremented by one so that byte is not counted as consumed. On a
 * negative return the value of @inlen is unpredictable.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return(-1);
            *out++ = c;
        }
        else if (in == inend) {
            /* multi-byte sequence cut by the end of the block */
            *inlen -= 1;
            break;
        }
        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 which can be encoded as ISO Latin 1 */
            if (out >= outend) return(-1); /* was missing: overflowed @out */
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        }
        else
            return(-2);
        /* TODO : some should be represent as "&#x____;" */
    }
    return(out - outstart);
}
178:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read one byte at a time (low
 * byte first), so the result does not depend on the byte order or the
 * alignment requirements of the host.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding fails (a high surrogate is not followed by a
 *     low surrogate).
 * An odd *@inlenb is rounded down to an even value, and a high surrogate
 * cut by the end of the block is left unconsumed (*@inlenb is reduced
 * by 2). On a negative return the value of *@inlenb is unpredictable.
 */
int
UTF16LEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;
    while (in < inend) {
        /* little endian: low byte first */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happen */
                (*inlenb) -= 2;
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else
                return(-2);
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            return(-1);
        /* emit the lead byte; bits is the shift of the first trailer */
        if      (c <    0x80) {  *out++ =  c;                        bits = -6; }
        else if (c <   0x800) {  *out++ = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) {  *out++ = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  {  *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend)
                return(-1);
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return(out - outstart);
}
260:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is stored one byte at a time (low byte
 * first), so the result does not depend on the byte order or the
 * alignment requirements of the host.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (malformed UTF-8 or a value which can't
 *     be expressed in UTF-16).
 * When the input ends in the middle of a multi-byte sequence the whole
 * sequence, lead byte included, is left unconsumed and *@inlen is lowered
 * to the number of bytes actually converted.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    /* only whole 16-bit units can be stored */
    unsigned char* outend = outb + (outlen / 2) * 2;
    const unsigned char* inend = in + *inlen;
    const unsigned char* lead;
    unsigned int c, d, trailing;
    unsigned int hi, lo;

    while (in < inend) {
        lead = in;
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if ((int) (inend - in) < (int) trailing) {
            /* incomplete sequence: do not count it as consumed */
            *inlen -= (int) (inend - lead);
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);    /* malformed continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        }
        else if (c < 0x110000) {
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        }
        else
            return(-2);    /* beyond the UTF-16 range */
    }
    return(out - outb);
}
353:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read one byte at a time (high
 * byte first), so the result does not depend on the byte order or the
 * alignment requirements of the host.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding fails (a high surrogate is not followed by a
 *     low surrogate).
 * An odd *@inlenb is rounded down to an even value, and a high surrogate
 * cut by the end of the block is left unconsumed (*@inlenb is reduced
 * by 2). On a negative return the value of *@inlenb is unpredictable.
 */
int
UTF16BEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;
    while (in < inend) {
        /* big endian: high byte first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happen */
                (*inlenb) -= 2;
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else
                return(-2);
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            return(-1);
        /* emit the lead byte; bits is the shift of the first trailer */
        if      (c <    0x80) {  *out++ =  c;                        bits = -6; }
        else if (c <   0x800) {  *out++ = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) {  *out++ = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  {  *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend)
                return(-1);
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return(out - outstart);
}
441:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is stored one byte at a time (high byte
 * first), so the result does not depend on the byte order or the
 * alignment requirements of the host.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (malformed UTF-8 or a value which can't
 *     be expressed in UTF-16).
 * When the input ends in the middle of a multi-byte sequence the whole
 * sequence, lead byte included, is left unconsumed and *@inlen is lowered
 * to the number of bytes actually converted.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    /* only whole 16-bit units can be stored */
    unsigned char* outend = outb + (outlen / 2) * 2;
    const unsigned char* inend = in + *inlen;
    const unsigned char* lead;
    unsigned int c, d, trailing;
    unsigned int hi, lo;

    while (in < inend) {
        lead = in;
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if ((int) (inend - in) < (int) trailing) {
            /* incomplete sequence: do not count it as consumed */
            *inlen -= (int) (inend - lead);
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);    /* malformed continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        }
        else if (c < 0x110000) {
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        }
        else
            return(-2);    /* beyond the UTF-16 range */
    }
    return(out - outb);
}
531:
1.7 daniel 532: /**
533: * xmlDetectCharEncoding:
534: * @in: a pointer to the first bytes of the XML entity, must be at least
535: * 4 bytes long.
1.25 daniel 536: * @len: pointer to the length of the buffer
1.7 daniel 537: *
538: * Guess the encoding of the entity using the first bytes of the entity content
539: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
540: *
541: * Returns one of the XML_CHAR_ENCODING_... values.
542: */
543: xmlCharEncoding
1.25 daniel 544: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 545: {
1.25 daniel 546: if (len >= 4) {
547: if ((in[0] == 0x00) && (in[1] == 0x00) &&
548: (in[2] == 0x00) && (in[3] == 0x3C))
549: return(XML_CHAR_ENCODING_UCS4BE);
550: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
551: (in[2] == 0x00) && (in[3] == 0x00))
552: return(XML_CHAR_ENCODING_UCS4LE);
553: if ((in[0] == 0x00) && (in[1] == 0x00) &&
554: (in[2] == 0x3C) && (in[3] == 0x00))
555: return(XML_CHAR_ENCODING_UCS4_2143);
556: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
557: (in[2] == 0x00) && (in[3] == 0x00))
558: return(XML_CHAR_ENCODING_UCS4_3412);
559: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
560: (in[2] == 0xA7) && (in[3] == 0x94))
561: return(XML_CHAR_ENCODING_EBCDIC);
562: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
563: (in[2] == 0x78) && (in[3] == 0x6D))
564: return(XML_CHAR_ENCODING_UTF8);
565: }
566: if (len >= 2) {
567: if ((in[0] == 0xFE) && (in[1] == 0xFF))
568: return(XML_CHAR_ENCODING_UTF16BE);
569: if ((in[0] == 0xFF) && (in[1] == 0xFE))
570: return(XML_CHAR_ENCODING_UTF16LE);
571: }
1.7 daniel 572: return(XML_CHAR_ENCODING_NONE);
573: }
574:
575: /**
576: * xmlParseCharEncoding:
1.18 daniel 577: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 578: *
579: * Conpare the string to the known encoding schemes already known. Note
580: * that the comparison is case insensitive accordingly to the section
581: * [XML] 4.3.3 Character Encoding in Entities.
582: *
583: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
584: * if not recognized.
585: */
586: xmlCharEncoding
1.8 daniel 587: xmlParseCharEncoding(const char* name)
1.7 daniel 588: {
589: char upper[500];
590: int i;
591:
592: for (i = 0;i < 499;i++) {
593: upper[i] = toupper(name[i]);
594: if (upper[i] == 0) break;
595: }
596: upper[i] = 0;
597:
598: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
599: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
600: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
601:
602: /*
603: * NOTE: if we were able to parse this, the endianness of UTF16 is
604: * already found and in use
605: */
606: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
607: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
608:
609: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
610: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
611: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
612:
613: /*
614: * NOTE: if we were able to parse this, the endianness of UCS4 is
615: * already found and in use
616: */
617: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
618: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
619: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
620:
621:
622: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
623: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
624: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
625:
626: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
627: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
628: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
629:
630: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
631: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
632: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
633: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
634: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
635: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
636: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
637:
638: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
639: if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
640: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
641: return(XML_CHAR_ENCODING_ERROR);
642: }
1.9 daniel 643:
644: /****************************************************************
645: * *
646: * Char encoding handlers *
647: * *
648: ****************************************************************/
649:
650: /* the size should be growable, but it's not a big deal ... */
651: #define MAX_ENCODING_HANDLERS 50
652: static xmlCharEncodingHandlerPtr *handlers = NULL;
653: static int nbCharEncodingHandler = 0;
654:
655: /*
656: * The default is UTF-8 for XML, that's also the default used for the
657: * parser internals, so the default encoding handler is NULL
658: */
659:
660: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
661:
662: /**
663: * xmlNewCharEncodingHandler:
1.18 daniel 664: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 665: * @input: the xmlCharEncodingInputFunc to read that encoding
666: * @output: the xmlCharEncodingOutputFunc to write that encoding
667: *
668: * Create and registers an xmlCharEncodingHandler.
669: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
670: */
671: xmlCharEncodingHandlerPtr
1.25 daniel 672: xmlNewCharEncodingHandler(const char *name,
673: xmlCharEncodingInputFunc input,
1.9 daniel 674: xmlCharEncodingOutputFunc output) {
675: xmlCharEncodingHandlerPtr handler;
676: char upper[500];
677: int i;
678: char *up = 0;
679:
680: /*
681: * Keep only the uppercase version of the encoding.
682: */
683: if (name == NULL) {
684: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
685: return(NULL);
686: }
687: for (i = 0;i < 499;i++) {
688: upper[i] = toupper(name[i]);
689: if (upper[i] == 0) break;
690: }
691: upper[i] = 0;
1.16 daniel 692: up = xmlMemStrdup(upper);
1.9 daniel 693: if (up == NULL) {
694: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
695: return(NULL);
696: }
697:
698: /*
699: * allocate and fill-up an handler block.
700: */
701: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 702: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 703: if (handler == NULL) {
704: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
705: return(NULL);
706: }
707: handler->input = input;
708: handler->output = output;
709: handler->name = up;
710:
711: /*
712: * registers and returns the handler.
713: */
714: xmlRegisterCharEncodingHandler(handler);
715: return(handler);
716: }
717:
718: /**
719: * xmlInitCharEncodingHandlers:
720: *
721: * Initialize the char encoding support, it registers the default
722: * encoding supported.
1.18 daniel 723: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 724: * in normal processing.
725: */
726: void
727: xmlInitCharEncodingHandlers(void) {
728: if (handlers != NULL) return;
729:
730: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 731: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.9 daniel 732:
733: if (handlers == NULL) {
734: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
735: return;
736: }
1.10 daniel 737: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 738: xmlUTF16LEHandler =
1.28 daniel 739: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
740: xmlUTF16BEHandler =
741: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 742: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 743: }
744:
745: /**
1.19 daniel 746: * xmlCleanupCharEncodingHandlers:
747: *
748: * Cleanup the memory allocated for the char encoding support, it
749: * unregisters all the encoding handlers.
750: */
751: void
752: xmlCleanupCharEncodingHandlers(void) {
753: if (handlers == NULL) return;
754:
755: for (;nbCharEncodingHandler > 0;) {
756: nbCharEncodingHandler--;
757: if (handlers[nbCharEncodingHandler] != NULL) {
758: xmlFree(handlers[nbCharEncodingHandler]->name);
759: xmlFree(handlers[nbCharEncodingHandler]);
760: }
761: }
762: xmlFree(handlers);
763: handlers = NULL;
764: nbCharEncodingHandler = 0;
765: xmlDefaultCharEncodingHandler = NULL;
766: }
767:
768: /**
1.9 daniel 769: * xmlRegisterCharEncodingHandler:
770: * @handler: the xmlCharEncodingHandlerPtr handler block
771: *
772: * Register the char encoding handler, surprizing, isn't it ?
773: */
774: void
775: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
776: if (handlers == NULL) xmlInitCharEncodingHandlers();
777: if (handler == NULL) {
778: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
779: return;
780: }
781:
782: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
783: fprintf(stderr,
784: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
785: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
786: return;
787: }
788: handlers[nbCharEncodingHandler++] = handler;
789: }
790:
791: /**
792: * xmlGetCharEncodingHandler:
793: * @enc: an xmlCharEncoding value.
794: *
795: * Search in the registrered set the handler able to read/write that encoding.
796: *
797: * Returns the handler or NULL if not found
798: */
799: xmlCharEncodingHandlerPtr
800: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
801: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 802: switch (enc) {
803: case XML_CHAR_ENCODING_ERROR:
804: return(NULL);
805: case XML_CHAR_ENCODING_NONE:
806: return(NULL);
807: case XML_CHAR_ENCODING_UTF8:
808: return(NULL);
809: case XML_CHAR_ENCODING_UTF16LE:
810: return(xmlUTF16LEHandler);
811: case XML_CHAR_ENCODING_UTF16BE:
812: return(xmlUTF16BEHandler);
813: case XML_CHAR_ENCODING_EBCDIC:
814: return(NULL);
815: case XML_CHAR_ENCODING_UCS4LE:
816: return(NULL);
817: case XML_CHAR_ENCODING_UCS4BE:
818: return(NULL);
819: case XML_CHAR_ENCODING_UCS4_2143:
820: return(NULL);
821: case XML_CHAR_ENCODING_UCS4_3412:
822: return(NULL);
823: case XML_CHAR_ENCODING_UCS2:
824: return(NULL);
825: case XML_CHAR_ENCODING_8859_1:
826: return(NULL);
827: case XML_CHAR_ENCODING_8859_2:
828: return(NULL);
829: case XML_CHAR_ENCODING_8859_3:
830: return(NULL);
831: case XML_CHAR_ENCODING_8859_4:
832: return(NULL);
833: case XML_CHAR_ENCODING_8859_5:
834: return(NULL);
835: case XML_CHAR_ENCODING_8859_6:
836: return(NULL);
837: case XML_CHAR_ENCODING_8859_7:
838: return(NULL);
839: case XML_CHAR_ENCODING_8859_8:
840: return(NULL);
841: case XML_CHAR_ENCODING_8859_9:
842: return(NULL);
843: case XML_CHAR_ENCODING_2022_JP:
844: case XML_CHAR_ENCODING_SHIFT_JIS:
845: case XML_CHAR_ENCODING_EUC_JP:
846: return(NULL);
847: }
1.9 daniel 848: return(NULL);
849: }
850:
851: /**
852: * xmlGetCharEncodingHandler:
853: * @enc: a string describing the char encoding.
854: *
855: * Search in the registrered set the handler able to read/write that encoding.
856: *
857: * Returns the handler or NULL if not found
858: */
859: xmlCharEncodingHandlerPtr
860: xmlFindCharEncodingHandler(const char *name) {
861: char upper[500];
862: int i;
863:
864: if (handlers == NULL) xmlInitCharEncodingHandlers();
865: if (name == NULL) return(xmlDefaultCharEncodingHandler);
866: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
867:
868: for (i = 0;i < 499;i++) {
869: upper[i] = toupper(name[i]);
870: if (upper[i] == 0) break;
871: }
872: upper[i] = 0;
873:
874: for (i = 0;i < nbCharEncodingHandler; i++)
875: if (!strcmp(name, handlers[i]->name))
876: return(handlers[i]);
877:
878: return(NULL);
879: }
880:
Webmaster