XML/encoding.c - annotate

Return to encoding.c CVS log
Up to [Public] / XML
Annotation of XML/encoding.c, revision 1.27

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
                      6:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      7:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      8:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                      9:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     10:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     11:  *                described in Unicode Technical Report #4.
                     12:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     13:  *                Information Interchange, ANSI X3.4-1986.
                     14:  *
1.9       daniel     15:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     16:  *
                     17:  * See Copyright for the status of this software.
                     18:  *
                     19:  * Daniel.Veillard@w3.org
                     20:  */
                     21: 
1.21      daniel     22: #ifdef WIN32
                     23: #include "win32config.h"
                     24: #else
1.14      daniel     25: #include "config.h"
1.17      daniel     26: #endif
                     27: 
                     28: #include <stdio.h>
                     29: #include <string.h>
                     30: 
                     31: #ifdef HAVE_CTYPE_H
1.7       daniel     32: #include <ctype.h>
1.17      daniel     33: #endif
1.20      daniel     34: #ifdef HAVE_STDLIB_H
                     35: #include <stdlib.h>
                     36: #endif
1.1       daniel     37: #include "encoding.h"
1.16      daniel     38: #include "xmlmemory.h"
1.3       daniel     39: 
/*
 * Handlers for the two UTF-16 byte orders. They are non-static globals and
 * both start out NULL. In the part of the file visible here only
 * xmlUTF16LEHandler is ever assigned (by xmlInitCharEncodingHandlers());
 * NOTE(review): confirm where, if anywhere, xmlUTF16BEHandler gets set.
 */
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
                     42: 
1.3       daniel     43: /*
                     44:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     45:  *
                     46:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     47:  * 0000 0000-0000 007F   0xxxxxxx
                     48:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     49:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     50:  *
                     51:  * I hope we won't use values > 0xFFFF anytime soon !
                     52:  */
1.1       daniel     53: 
                     54: /**
1.22      daniel     55:  * xmlCheckUTF8: Check utf-8 string for legality.
                     56:  * @utf: Pointer to putative utf-8 encoded string.
                     57:  *
                     58:  * Checks @utf for being valid utf-8. @utf is assumed to be
                     59:  * null-terminated. This function is not super-strict, as it will
                     60:  * allow longer utf-8 sequences than necessary. Note that Java is
                     61:  * capable of producing these sequences if provoked. Also note, this
                     62:  * routine checks for the 4-byte maxiumum size, but does not check for
                     63:  * 0x10ffff maximum value.
                     64:  *
                     65:  * Return value: true if @utf is valid.
                     66:  **/
                     67: int
                     68: xmlCheckUTF8(const unsigned char *utf)
                     69: {
                     70:     int ix;
                     71:     unsigned char c;
                     72: 
                     73:     for (ix = 0; (c = utf[ix]);) {
                     74:         if (c & 0x80) {
                     75:            if ((utf[ix + 1] & 0xc0) != 0x80)
                     76:                return(0);
                     77:            if ((c & 0xe0) == 0xe0) {
                     78:                if ((utf[ix + 2] & 0xc0) != 0x80)
                     79:                    return(0);
                     80:                if ((c & 0xf0) == 0xf0) {
                     81:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                     82:                        return(0);
                     83:                    ix += 4;
                     84:                    /* 4-byte code */
                     85:                } else
                     86:                  /* 3-byte code */
                     87:                    ix += 3;
                     88:            } else
                     89:              /* 2-byte code */
                     90:                ix += 2;
                     91:        } else
                     92:            /* 1-byte code */
                     93:            ix++;
                     94:       }
                     95:       return(1);
                     96: }
                     97: 
                     98: /**
1.1       daniel     99:  * isolat1ToUTF8:
1.18      daniel    100:  * @out:  a pointer to an array of bytes to store the result
                    101:  * @outlen:  the length of @out
                    102:  * @in:  a pointer to an array of ISO Latin 1 chars
                    103:  * @inlen:  the length of @in
1.1       daniel    104:  *
                    105:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    106:  * block of chars out.
1.6       daniel    107:  * Returns the number of byte written, or -1 by lack of space.
1.1       daniel    108:  */
                    109: int
1.25      daniel    110: isolat1ToUTF8(unsigned char* out, int outlen,
                    111:               const unsigned char* in, int *inlen) {
1.1       daniel    112:     unsigned char* outstart= out;
                    113:     unsigned char* outend= out+outlen;
1.25      daniel    114:     const unsigned char* inend= in+*inlen;
1.1       daniel    115:     unsigned char c;
                    116: 
                    117:     while (in < inend) {
                    118:         c= *in++;
                    119:         if (c < 0x80) {
                    120:             if (out >= outend)  return -1;
                    121:             *out++ = c;
                    122:         }
                    123:         else {
                    124:             if (out >= outend)  return -1;
                    125:             *out++ = 0xC0 | (c >> 6);
                    126:             if (out >= outend)  return -1;
                    127:             *out++ = 0x80 | (0x3F & c);
                    128:         }
                    129:     }
                    130:     return out-outstart;
                    131: }
                    132: 
                    133: /**
                    134:  * UTF8Toisolat1:
1.18      daniel    135:  * @out:  a pointer to an array of bytes to store the result
                    136:  * @outlen:  the length of @out
                    137:  * @in:  a pointer to an array of UTF-8 chars
                    138:  * @inlen:  the length of @in
1.1       daniel    139:  *
                    140:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    141:  * block of chars out.
1.15      daniel    142:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    143:  *
1.6       daniel    144:  * Returns the number of byte written, or -1 by lack of space, or -2
1.23      daniel    145:  *     if the transcoding faile (for *in is not valid utf8 string or
                    146:  *     the result of transformation can't fit into the encoding we want)
1.1       daniel    147:  */
                    148: int
1.25      daniel    149: UTF8Toisolat1(unsigned char* out, int outlen,
                    150:               const unsigned char* in, int *inlen) {
1.1       daniel    151:     unsigned char* outstart= out;
                    152:     unsigned char* outend= out+outlen;
1.25      daniel    153:     const unsigned char* inend= in+*inlen;
1.1       daniel    154:     unsigned char c;
                    155: 
                    156:     while (in < inend) {
                    157:         c= *in++;
                    158:         if (c < 0x80) {
                    159:             if (out >= outend)  return -1;
                    160:             *out++= c;
                    161:         }
1.23      daniel    162:        else if (in == inend) {
                    163:             *inlen -= 1;
                    164:             break;
                    165:        }
                    166:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    167:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    168:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    169:        }
                    170:        else return -2;
                    171:        /* TODO : some should be represent as "&#x____;" */
1.1       daniel    172:     }
                    173:     return out-outstart;
                    174: }
                    175: 
                    176: /**
                    177:  * UTF16ToUTF8:
1.18      daniel    178:  * @out:  a pointer to an array of bytes to store the result
                    179:  * @outlen:  the length of @out
1.25      daniel    180:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    181:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    182:  *
                    183:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
                    184:  * block of chars out.
1.25      daniel    185:  *
1.6       daniel    186:  * Returns the number of byte written, or -1 by lack of space.
1.1       daniel    187:  */
                    188: int
1.25      daniel    189: UTF16ToUTF8(unsigned char* out, int outlen,
                    190:             const unsigned char* inb, int *inlenb)
1.1       daniel    191: {
                    192:     unsigned char* outstart= out;
                    193:     unsigned char* outend= out+outlen;
1.25      daniel    194:     unsigned short* in = (unsigned short*) inb;
                    195:     unsigned short* inend;
                    196:     unsigned int c, d, inlen;
1.1       daniel    197:     int bits;
                    198: 
1.25      daniel    199:     inlen = *inlenb / 2;
                    200:     inend= in + inlen;
1.1       daniel    201:     while (in < inend) {
                    202:         c= *in++;
                    203:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                    204:             if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                    205:                 c &= 0x03FF;
                    206:                 c <<= 10;
                    207:                 c |= d & 0x03FF;
                    208:                 c += 0x10000;
                    209:             }
1.27    ! daniel    210:             else 
        !           211:                return -1;
1.1       daniel    212:         }
                    213: 
1.25      daniel    214:        /* assertion: c is a single UTF-4 value */
1.27    ! daniel    215:         if (out >= outend) 
        !           216:            return -1;
1.1       daniel    217:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    218:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    219:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    220:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    221:  
1.26      daniel    222:         for ( ; bits >= 0; bits-= 6) {
1.27    ! daniel    223:             if (out >= outend) 
        !           224:                return -1;
1.26      daniel    225:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    226:         }
                    227:     }
                    228:     return out-outstart;
                    229: }
                    230: 
                    231: /**
                    232:  * UTF8ToUTF16:
1.25      daniel    233:  * @outb:  a pointer to an array of bytes to store the result
                    234:  * @outlen:  the length of @outb
1.18      daniel    235:  * @in:  a pointer to an array of UTF-8 chars
                    236:  * @inlen:  the length of @in
1.1       daniel    237:  *
                    238:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16
                    239:  * block of chars out.
1.15      daniel    240:  * TODO: UTF8ToUTF16 need a fallback mechanism ...
                    241:  *
1.6       daniel    242:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    243:  *     if the transcoding failed. 
1.1       daniel    244:  */
                    245: int
1.25      daniel    246: UTF8ToUTF16(unsigned char* outb, int outlen,
                    247:             const unsigned char* in, int *inlen)
1.1       daniel    248: {
1.25      daniel    249:     unsigned short* out = (unsigned short*) outb;
1.1       daniel    250:     unsigned short* outstart= out;
                    251:     unsigned short* outend= out+outlen;
1.25      daniel    252:     const unsigned char* inend= in+*inlen;
1.1       daniel    253:     unsigned int c, d, trailing;
                    254: 
1.25      daniel    255:     outlen /= 2; /* convert in short length */
1.1       daniel    256:     while (in < inend) {
                    257:       d= *in++;
                    258:       if      (d < 0x80)  { c= d; trailing= 0; }
                    259:       else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
                    260:       else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                    261:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    262:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                    263:       else return -2;    /* no chance for this in UTF-16 */
                    264: 
                    265:       for ( ; trailing; trailing--) {
                    266:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  return -1;
                    267:           c <<= 6;
                    268:           c |= d & 0x3F;
                    269:       }
                    270: 
                    271:       /* assertion: c is a single UTF-4 value */
                    272:         if (c < 0x10000) {
                    273:             if (out >= outend)  return -1;
                    274:             *out++ = c;
                    275:         }
                    276:         else if (c < 0x110000) {
                    277:             if (out+1 >= outend)  return -1;
                    278:             c -= 0x10000;
                    279:             *out++ = 0xD800 | (c >> 10);
                    280:             *out++ = 0xDC00 | (c & 0x03FF);
                    281:         }
                    282:         else  return -1;
                    283:     }
                    284:     return out-outstart;
                    285: }
                    286: 
1.7       daniel    287: /**
                    288:  * xmlDetectCharEncoding:
                    289:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    290:  *       4 bytes long.
1.25      daniel    291:  * @len:  pointer to the length of the buffer
1.7       daniel    292:  *
                    293:  * Guess the encoding of the entity using the first bytes of the entity content
                    294:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    295:  * 
                    296:  * Returns one of the XML_CHAR_ENCODING_... values.
                    297:  */
                    298: xmlCharEncoding
1.25      daniel    299: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    300: {
1.25      daniel    301:     if (len >= 4) {
                    302:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    303:            (in[2] == 0x00) && (in[3] == 0x3C))
                    304:            return(XML_CHAR_ENCODING_UCS4BE);
                    305:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    306:            (in[2] == 0x00) && (in[3] == 0x00))
                    307:            return(XML_CHAR_ENCODING_UCS4LE);
                    308:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    309:            (in[2] == 0x3C) && (in[3] == 0x00))
                    310:            return(XML_CHAR_ENCODING_UCS4_2143);
                    311:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    312:            (in[2] == 0x00) && (in[3] == 0x00))
                    313:            return(XML_CHAR_ENCODING_UCS4_3412);
                    314:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    315:            (in[2] == 0xA7) && (in[3] == 0x94))
                    316:            return(XML_CHAR_ENCODING_EBCDIC);
                    317:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    318:            (in[2] == 0x78) && (in[3] == 0x6D))
                    319:            return(XML_CHAR_ENCODING_UTF8);
                    320:     }
                    321:     if (len >= 2) {
                    322:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    323:            return(XML_CHAR_ENCODING_UTF16BE);
                    324:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    325:            return(XML_CHAR_ENCODING_UTF16LE);
                    326:     }
1.7       daniel    327:     return(XML_CHAR_ENCODING_NONE);
                    328: }
                    329: 
                    330: /**
                    331:  * xmlParseCharEncoding:
1.18      daniel    332:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    333:  *
                    334:  * Conpare the string to the known encoding schemes already known. Note
                    335:  * that the comparison is case insensitive accordingly to the section
                    336:  * [XML] 4.3.3 Character Encoding in Entities.
                    337:  * 
                    338:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    339:  * if not recognized.
                    340:  */
                    341: xmlCharEncoding
1.8       daniel    342: xmlParseCharEncoding(const char* name)
1.7       daniel    343: {
                    344:     char upper[500];
                    345:     int i;
                    346: 
                    347:     for (i = 0;i < 499;i++) {
                    348:         upper[i] = toupper(name[i]);
                    349:        if (upper[i] == 0) break;
                    350:     }
                    351:     upper[i] = 0;
                    352: 
                    353:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    354:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    355:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    356: 
                    357:     /*
                    358:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    359:      *       already found and in use
                    360:      */
                    361:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    362:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    363:     
                    364:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    365:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    366:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    367: 
                    368:     /*
                    369:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    370:      *       already found and in use
                    371:      */
                    372:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    373:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    374:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    375: 
                    376:     
                    377:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    378:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    379:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    380: 
                    381:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    382:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    383:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    384: 
                    385:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    386:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    387:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    388:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    389:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    390:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    391:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    392: 
                    393:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
                    394:     if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
                    395:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
                    396:     return(XML_CHAR_ENCODING_ERROR);
                    397: }
1.9       daniel    398: 
                    399: /****************************************************************
                    400:  *                                                             *
                    401:  *             Char encoding handlers                          *
                    402:  *                                                             *
                    403:  ****************************************************************/
                    404: 
/*
 * Table of the registered encoding handlers.
 * The size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
/* array of MAX_ENCODING_HANDLERS slots, allocated by
 * xmlInitCharEncodingHandlers(); NULL until then and after cleanup */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of slots of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                    416: 
                    417: /**
                    418:  * xmlNewCharEncodingHandler:
1.18      daniel    419:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    420:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    421:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    422:  *
                    423:  * Create and registers an xmlCharEncodingHandler.
                    424:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    425:  */
                    426: xmlCharEncodingHandlerPtr
1.25      daniel    427: xmlNewCharEncodingHandler(const char *name, 
                    428:                           xmlCharEncodingInputFunc input,
1.9       daniel    429:                           xmlCharEncodingOutputFunc output) {
                    430:     xmlCharEncodingHandlerPtr handler;
                    431:     char upper[500];
                    432:     int i;
                    433:     char *up = 0;
                    434: 
                    435:     /*
                    436:      * Keep only the uppercase version of the encoding.
                    437:      */
                    438:     if (name == NULL) {
                    439:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    440:        return(NULL);
                    441:     }
                    442:     for (i = 0;i < 499;i++) {
                    443:         upper[i] = toupper(name[i]);
                    444:        if (upper[i] == 0) break;
                    445:     }
                    446:     upper[i] = 0;
1.16      daniel    447:     up = xmlMemStrdup(upper);
1.9       daniel    448:     if (up == NULL) {
                    449:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    450:        return(NULL);
                    451:     }
                    452: 
                    453:     /*
                    454:      * allocate and fill-up an handler block.
                    455:      */
                    456:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    457:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    458:     if (handler == NULL) {
                    459:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    460:        return(NULL);
                    461:     }
                    462:     handler->input = input;
                    463:     handler->output = output;
                    464:     handler->name = up;
                    465: 
                    466:     /*
                    467:      * registers and returns the handler.
                    468:      */
                    469:     xmlRegisterCharEncodingHandler(handler);
                    470:     return(handler);
                    471: }
                    472: 
                    473: /**
                    474:  * xmlInitCharEncodingHandlers:
                    475:  *
                    476:  * Initialize the char encoding support, it registers the default
                    477:  * encoding supported.
1.18      daniel    478:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    479:  *       in normal processing.
                    480:  */
                    481: void
                    482: xmlInitCharEncodingHandlers(void) {
                    483:     if (handlers != NULL) return;
                    484: 
                    485:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    486:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.9       daniel    487: 
                    488:     if (handlers == NULL) {
                    489:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    490:        return;
                    491:     }
1.10      daniel    492:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel    493:     xmlUTF16LEHandler = 
                    494:           xmlNewCharEncodingHandler("UTF-16LE", UTF16ToUTF8, UTF8ToUTF16);
1.10      daniel    495:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    496: }
                    497: 
                    498: /**
1.19      daniel    499:  * xmlCleanupCharEncodingHandlers:
                    500:  *
                    501:  * Cleanup the memory allocated for the char encoding support, it
                    502:  * unregisters all the encoding handlers.
                    503:  */
                    504: void
                    505: xmlCleanupCharEncodingHandlers(void) {
                    506:     if (handlers == NULL) return;
                    507: 
                    508:     for (;nbCharEncodingHandler > 0;) {
                    509:         nbCharEncodingHandler--;
                    510:        if (handlers[nbCharEncodingHandler] != NULL) {
                    511:            xmlFree(handlers[nbCharEncodingHandler]->name);
                    512:            xmlFree(handlers[nbCharEncodingHandler]);
                    513:        }
                    514:     }
                    515:     xmlFree(handlers);
                    516:     handlers = NULL;
                    517:     nbCharEncodingHandler = 0;
                    518:     xmlDefaultCharEncodingHandler = NULL;
                    519: }
                    520: 
                    521: /**
1.9       daniel    522:  * xmlRegisterCharEncodingHandler:
                    523:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                    524:  *
                    525:  * Register the char encoding handler, surprizing, isn't it ?
                    526:  */
                    527: void
                    528: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                    529:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    530:     if (handler == NULL) {
                    531:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                    532:        return;
                    533:     }
                    534: 
                    535:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                    536:         fprintf(stderr, 
                    537:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                    538:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                    539:        return;
                    540:     }
                    541:     handlers[nbCharEncodingHandler++] = handler;
                    542: }
                    543: 
                    544: /**
                    545:  * xmlGetCharEncodingHandler:
                    546:  * @enc:  an xmlCharEncoding value.
                    547:  *
                    548:  * Search in the registrered set the handler able to read/write that encoding.
                    549:  *
                    550:  * Returns the handler or NULL if not found
                    551:  */
                    552: xmlCharEncodingHandlerPtr
                    553: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
                    554:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel    555:     switch (enc) {
                    556:         case XML_CHAR_ENCODING_ERROR:
                    557:            return(NULL);
                    558:         case XML_CHAR_ENCODING_NONE:
                    559:            return(NULL);
                    560:         case XML_CHAR_ENCODING_UTF8:
                    561:            return(NULL);
                    562:         case XML_CHAR_ENCODING_UTF16LE:
                    563:            return(xmlUTF16LEHandler);
                    564:         case XML_CHAR_ENCODING_UTF16BE:
                    565:            return(xmlUTF16BEHandler);
                    566:         case XML_CHAR_ENCODING_EBCDIC:
                    567:            return(NULL);
                    568:         case XML_CHAR_ENCODING_UCS4LE:
                    569:            return(NULL);
                    570:         case XML_CHAR_ENCODING_UCS4BE:
                    571:            return(NULL);
                    572:         case XML_CHAR_ENCODING_UCS4_2143:
                    573:            return(NULL);
                    574:         case XML_CHAR_ENCODING_UCS4_3412:
                    575:            return(NULL);
                    576:         case XML_CHAR_ENCODING_UCS2:
                    577:            return(NULL);
                    578:         case XML_CHAR_ENCODING_8859_1:
                    579:            return(NULL);
                    580:         case XML_CHAR_ENCODING_8859_2:
                    581:            return(NULL);
                    582:         case XML_CHAR_ENCODING_8859_3:
                    583:            return(NULL);
                    584:         case XML_CHAR_ENCODING_8859_4:
                    585:            return(NULL);
                    586:         case XML_CHAR_ENCODING_8859_5:
                    587:            return(NULL);
                    588:         case XML_CHAR_ENCODING_8859_6:
                    589:            return(NULL);
                    590:         case XML_CHAR_ENCODING_8859_7:
                    591:            return(NULL);
                    592:         case XML_CHAR_ENCODING_8859_8:
                    593:            return(NULL);
                    594:         case XML_CHAR_ENCODING_8859_9:
                    595:            return(NULL);
                    596:         case XML_CHAR_ENCODING_2022_JP:
                    597:         case XML_CHAR_ENCODING_SHIFT_JIS:
                    598:         case XML_CHAR_ENCODING_EUC_JP:
                    599:            return(NULL);
                    600:     }
1.9       daniel    601:     return(NULL);
                    602: }
                    603: 
                    604: /**
                    605:  * xmlGetCharEncodingHandler:
                    606:  * @enc:  a string describing the char encoding.
                    607:  *
                    608:  * Search in the registrered set the handler able to read/write that encoding.
                    609:  *
                    610:  * Returns the handler or NULL if not found
                    611:  */
                    612: xmlCharEncodingHandlerPtr
                    613: xmlFindCharEncodingHandler(const char *name) {
                    614:     char upper[500];
                    615:     int i;
                    616: 
                    617:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    618:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                    619:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                    620: 
                    621:     for (i = 0;i < 499;i++) {
                    622:         upper[i] = toupper(name[i]);
                    623:        if (upper[i] == 0) break;
                    624:     }
                    625:     upper[i] = 0;
                    626: 
                    627:     for (i = 0;i < nbCharEncodingHandler; i++)
                    628:         if (!strcmp(name, handlers[i]->name))
                    629:            return(handlers[i]);
                    630: 
                    631:     return(NULL);
                    632: }
                    633:
Webmaster