Annotation of XML/encoding.c, revision 1.29

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
                      6:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      7:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      8:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                      9:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     10:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     11:  *                described in Unicode Technical Report #4.
                     12:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     13:  *                Information Interchange, ANSI X3.4-1986.
                     14:  *
1.9       daniel     15:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     16:  *
                     17:  * See Copyright for the status of this software.
                     18:  *
                     19:  * Daniel.Veillard@w3.org
                     20:  */
                     21: 
1.21      daniel     22: #ifdef WIN32
                     23: #include "win32config.h"
                     24: #else
1.14      daniel     25: #include "config.h"
1.17      daniel     26: #endif
                     27: 
                     28: #include <stdio.h>
                     29: #include <string.h>
                     30: 
                     31: #ifdef HAVE_CTYPE_H
1.7       daniel     32: #include <ctype.h>
1.17      daniel     33: #endif
1.20      daniel     34: #ifdef HAVE_STDLIB_H
                     35: #include <stdlib.h>
                     36: #endif
1.29    ! daniel     37: #include <libxml/encoding.h>
        !            38: #include <libxml/xmlmemory.h>
1.3       daniel     39: 
/*
 * Global handler pointers for the two UTF-16 byte orders (little-endian and
 * big-endian). Both are defined here and start out as NULL.
 * NOTE(review): presumably assigned when the encoding handlers are
 * registered; that code is not visible in this part of the file -- confirm.
 */
1.25      daniel     40: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
                     41: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
                     42: 
1.3       daniel     43: /*
                     44:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     45:  *
                     46:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     47:  * 0000 0000-0000 007F   0xxxxxxx
                     48:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     49:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     50:  *
                     51:  * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (4-octet form, handled by the converters in this file)
                     52:  */
1.1       daniel     53: 
                     54: /**
1.22      daniel     55:  * xmlCheckUTF8: Check utf-8 string for legality.
                     56:  * @utf: Pointer to putative utf-8 encoded string.
                     57:  *
                     58:  * Checks @utf for being valid utf-8. @utf is assumed to be
                     59:  * null-terminated. This function is not super-strict, as it will
                     60:  * allow longer utf-8 sequences than necessary. Note that Java is
                     61:  * capable of producing these sequences if provoked. Also note, this
                     62:  * routine checks for the 4-byte maxiumum size, but does not check for
                     63:  * 0x10ffff maximum value.
                     64:  *
                     65:  * Return value: true if @utf is valid.
                     66:  **/
                     67: int
                     68: xmlCheckUTF8(const unsigned char *utf)
                     69: {
                     70:     int ix;
                     71:     unsigned char c;
                     72: 
                     73:     for (ix = 0; (c = utf[ix]);) {
                     74:         if (c & 0x80) {
                     75:            if ((utf[ix + 1] & 0xc0) != 0x80)
                     76:                return(0);
                     77:            if ((c & 0xe0) == 0xe0) {
                     78:                if ((utf[ix + 2] & 0xc0) != 0x80)
                     79:                    return(0);
                     80:                if ((c & 0xf0) == 0xf0) {
                     81:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                     82:                        return(0);
                     83:                    ix += 4;
                     84:                    /* 4-byte code */
                     85:                } else
                     86:                  /* 3-byte code */
                     87:                    ix += 3;
                     88:            } else
                     89:              /* 2-byte code */
                     90:                ix += 2;
                     91:        } else
                     92:            /* 1-byte code */
                     93:            ix++;
                     94:       }
                     95:       return(1);
                     96: }
                     97: 
                     98: /**
1.1       daniel     99:  * isolat1ToUTF8:
1.18      daniel    100:  * @out:  a pointer to an array of bytes to store the result
                    101:  * @outlen:  the length of @out
                    102:  * @in:  a pointer to an array of ISO Latin 1 chars
                    103:  * @inlen:  the length of @in
1.1       daniel    104:  *
                    105:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    106:  * block of chars out.
1.6       daniel    107:  * Returns the number of byte written, or -1 by lack of space.
1.1       daniel    108:  */
                    109: int
1.25      daniel    110: isolat1ToUTF8(unsigned char* out, int outlen,
                    111:               const unsigned char* in, int *inlen) {
1.1       daniel    112:     unsigned char* outstart= out;
                    113:     unsigned char* outend= out+outlen;
1.25      daniel    114:     const unsigned char* inend= in+*inlen;
1.1       daniel    115:     unsigned char c;
                    116: 
                    117:     while (in < inend) {
                    118:         c= *in++;
                    119:         if (c < 0x80) {
1.28      daniel    120:             if (out >= outend)  return(-1);
1.1       daniel    121:             *out++ = c;
                    122:         }
                    123:         else {
1.28      daniel    124:             if (out >= outend)  return(-1);
1.1       daniel    125:             *out++ = 0xC0 | (c >> 6);
1.28      daniel    126:             if (out >= outend)  return(-1);
1.1       daniel    127:             *out++ = 0x80 | (0x3F & c);
                    128:         }
                    129:     }
1.28      daniel    130:     return(out-outstart);
1.1       daniel    131: }
                    132: 
                    133: /**
                    134:  * UTF8Toisolat1:
1.18      daniel    135:  * @out:  a pointer to an array of bytes to store the result
                    136:  * @outlen:  the length of @out
                    137:  * @in:  a pointer to an array of UTF-8 chars
                    138:  * @inlen:  the length of @in
1.1       daniel    139:  *
                    140:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    141:  * block of chars out.
1.15      daniel    142:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    143:  *
1.6       daniel    144:  * Returns the number of byte written, or -1 by lack of space, or -2
1.28      daniel    145:  *     if the transcoding fails (for *in is not valid utf8 string or
1.23      daniel    146:  *     the result of transformation can't fit into the encoding we want)
1.28      daniel    147:  * The value of @inlen after return is the number of octets consumed
                    148:  *     as the return value is positive, else unpredictiable.
1.1       daniel    149:  */
                    150: int
1.25      daniel    151: UTF8Toisolat1(unsigned char* out, int outlen,
                    152:               const unsigned char* in, int *inlen) {
1.1       daniel    153:     unsigned char* outstart= out;
                    154:     unsigned char* outend= out+outlen;
1.25      daniel    155:     const unsigned char* inend= in+*inlen;
1.1       daniel    156:     unsigned char c;
                    157: 
                    158:     while (in < inend) {
                    159:         c= *in++;
                    160:         if (c < 0x80) {
1.28      daniel    161:             if (out >= outend)  return(-1);
1.1       daniel    162:             *out++= c;
                    163:         }
1.23      daniel    164:        else if (in == inend) {
                    165:             *inlen -= 1;
                    166:             break;
                    167:        }
                    168:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    169:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    170:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    171:        }
1.28      daniel    172:        else
                    173:            return(-2);
1.23      daniel    174:        /* TODO : some should be represent as "&#x____;" */
1.1       daniel    175:     }
1.28      daniel    176:     return(out-outstart);
1.1       daniel    177: }
                    178: 
                    179: /**
1.28      daniel    180:  * UTF16LEToUTF8:
                    181:  * @out:  a pointer to an array of bytes to store the result
                    182:  * @outlen:  the length of @out
                    183:  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
                    184:  * @inlenb:  the length of @in in UTF-16LE chars
                    185:  *
                    186:  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
                    187:  * block of chars out. This function assume the endian properity
                    188:  * is the same between the native type of this machine and the
                    189:  * inputed one.
                    190:  *
                    191:  * Returns the number of byte written, or -1 by lack of space, or -2
                    192:  *     if the transcoding fails (for *in is not valid utf16 string)
                    193:  *     The value of *inlen after return is the number of octets consumed
                    194:  *     as the return value is positive, else unpredictiable.
                    195:  */
                    196: int
                    197: UTF16LEToUTF8(unsigned char* out, int outlen,
                    198:             const unsigned char* inb, int *inlenb)
                    199: {
                    200:     unsigned char* outstart= out;
                    201:     unsigned char* outend= out+outlen;
                    202:     unsigned short* in = (unsigned short*) inb;
                    203:     unsigned short* inend;
                    204:     unsigned int c, d, inlen;
                    205:     unsigned char *tmp;
                    206:     int bits;
                    207: 
                    208:     if ((*inlenb % 2) == 1)
                    209:         (*inlenb)--;
                    210:     inlen = *inlenb / 2;
                    211:     inend= in + inlen;
                    212:     while (in < inend) {
                    213: #ifdef BIG_ENDIAN
                    214:        tmp = (unsigned char *) in;
                    215:        c = *tmp++;
                    216:        c = c | (((unsigned int)*tmp) << 8);
                    217:        in++;
                    218: #else /* BIG_ENDIAN */
                    219:         c= *in++;
                    220: #endif /* BIG_ENDIAN */
                    221:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                    222:             if (in >= inend) {           /* (in > inend) shouldn't happens */
                    223:                 (*inlenb) -= 2;
                    224:                 break;
                    225:             }
                    226: #ifdef BIG_ENDIAN
                    227:             tmp = (unsigned char *) in;
                    228:             d = *tmp++;
                    229:            d = d | (((unsigned int)*tmp) << 8);
                    230:            in++;
                    231: #else /* BIG_ENDIAN */
                    232:             d = *in++;
                    233: #endif /* BIG_ENDIAN */
                    234:             if ((d & 0xFC00) == 0xDC00) {
                    235:                 c &= 0x03FF;
                    236:                 c <<= 10;
                    237:                 c |= d & 0x03FF;
                    238:                 c += 0x10000;
                    239:             }
                    240:             else
                    241:                return(-2);
                    242:         }
                    243: 
                    244:        /* assertion: c is a single UTF-4 value */
                    245:         if (out >= outend)
                    246:            return(-1);
                    247:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    248:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    249:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    250:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                    251:  
                    252:         for ( ; bits >= 0; bits-= 6) {
                    253:             if (out >= outend)
                    254:                return(-1);
                    255:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    256:         }
                    257:     }
                    258:     return(out-outstart);
                    259: }
                    260: 
                    261: /**
                    262:  * UTF8ToUTF16LE:
                    263:  * @outb:  a pointer to an array of bytes to store the result
                    264:  * @outlen:  the length of @outb
                    265:  * @in:  a pointer to an array of UTF-8 chars
                    266:  * @inlen:  the length of @in
                    267:  *
                    268:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
                    269:  * block of chars out.
                    270:  * TODO: UTF8ToUTF16LE need a fallback mechanism ...
                    271:  *
                    272:  * Returns the number of byte written, or -1 by lack of space, or -2
                    273:  *     if the transcoding failed. 
                    274:  */
                    275: int
                    276: UTF8ToUTF16LE(unsigned char* outb, int outlen,
                    277:             const unsigned char* in, int *inlen)
                    278: {
                    279:     unsigned short* out = (unsigned short*) outb;
                    280:     unsigned short* outstart= out;
                    281:     unsigned short* outend;
                    282:     const unsigned char* inend= in+*inlen;
                    283:     unsigned int c, d, trailing;
                    284: #ifdef BIG_ENDIAN
                    285:     unsigned char *tmp;
                    286:     unsigned short tmp1, tmp2;
                    287: #endif /* BIG_ENDIAN */
                    288: 
                    289:     outlen /= 2; /* convert in short length */
                    290:     outend = out + outlen;
                    291:     while (in < inend) {
                    292:       d= *in++;
                    293:       if      (d < 0x80)  { c= d; trailing= 0; }
                    294:       else if (d < 0xC0)
                    295:           return(-2);    /* trailing byte in leading position */
                    296:       else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                    297:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    298:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                    299:       else
                    300:           return(-2);    /* no chance for this in UTF-16 */
                    301: 
                    302:       if (inend - in < trailing) {
                    303:           *inlen -= (inend - in);
                    304:           break;
                    305:       } 
                    306: 
                    307:       for ( ; trailing; trailing--) {
                    308:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
                    309:              return(-1);
                    310:           c <<= 6;
                    311:           c |= d & 0x3F;
                    312:       }
                    313: 
                    314:       /* assertion: c is a single UTF-4 value */
                    315:         if (c < 0x10000) {
                    316:             if (out >= outend)
                    317:                return(-1);
                    318: #ifdef BIG_ENDIAN
                    319:             tmp = (unsigned char *) out;
                    320:             *tmp = c ;
                    321:             *(tmp + 1) = c >> 8 ;
                    322:             out++;
                    323: #else /* BIG_ENDIAN */
                    324:             *out++ = c;
                    325: #endif /* BIG_ENDIAN */
                    326:         }
                    327:         else if (c < 0x110000) {
                    328:             if (out+1 >= outend)
                    329:                return(-1);
                    330:             c -= 0x10000;
                    331: #ifdef BIG_ENDIAN
                    332:             tmp1 = 0xD800 | (c >> 10);
                    333:             tmp = (unsigned char *) out;
                    334:             *tmp = tmp1;
                    335:             *(tmp + 1) = tmp1 >> 8;
                    336:             out++;
                    337: 
                    338:             tmp2 = 0xDC00 | (c & 0x03FF);
                    339:             tmp = (unsigned char *) out;
                    340:             *tmp  = tmp2;
                    341:             *(tmp + 1) = tmp2 >> 8;
                    342:             out++;
                    343: #else /* BIG_ENDIAN */
                    344:             *out++ = 0xD800 | (c >> 10);
                    345:             *out++ = 0xDC00 | (c & 0x03FF);
                    346: #endif /* BIG_ENDIAN */
                    347:         }
                    348:         else
                    349:            return(-1);
                    350:     }
                    351:     return(out-outstart);
                    352: }
                    353: 
                    354: /**
                    355:  * UTF16BEToUTF8:
1.18      daniel    356:  * @out:  a pointer to an array of bytes to store the result
                    357:  * @outlen:  the length of @out
1.25      daniel    358:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    359:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    360:  *
                    361:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
1.28      daniel    362:  * block of chars out. This function assume the endian properity
                    363:  * is the same between the native type of this machine and the
                    364:  * inputed one.
1.25      daniel    365:  *
1.28      daniel    366:  * Returns the number of byte written, or -1 by lack of space, or -2
                    367:  *     if the transcoding fails (for *in is not valid utf16 string)
                    368:  * The value of *inlen after return is the number of octets consumed
                    369:  *     as the return value is positive, else unpredictiable.
1.1       daniel    370:  */
                    371: int
1.28      daniel    372: UTF16BEToUTF8(unsigned char* out, int outlen,
1.25      daniel    373:             const unsigned char* inb, int *inlenb)
1.1       daniel    374: {
                    375:     unsigned char* outstart= out;
                    376:     unsigned char* outend= out+outlen;
1.25      daniel    377:     unsigned short* in = (unsigned short*) inb;
                    378:     unsigned short* inend;
                    379:     unsigned int c, d, inlen;
1.28      daniel    380: #ifdef BIG_ENDIAN
                    381: #else /* BIG_ENDIAN */
                    382:     unsigned char *tmp;
                    383: #endif /* BIG_ENDIAN */    
1.1       daniel    384:     int bits;
                    385: 
1.28      daniel    386:     if ((*inlenb % 2) == 1)
                    387:         (*inlenb)--;
1.25      daniel    388:     inlen = *inlenb / 2;
                    389:     inend= in + inlen;
1.1       daniel    390:     while (in < inend) {
1.28      daniel    391: #ifdef BIG_ENDIAN    
1.1       daniel    392:         c= *in++;
1.28      daniel    393: #else
                    394:         tmp = (unsigned char *) in;
                    395:        c = *tmp++;
                    396:        c = c << 8;
                    397:        c = c | (unsigned int) *tmp;
                    398:        in++;
                    399: #endif 
1.1       daniel    400:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.28      daniel    401:            if (in >= inend) {           /* (in > inend) shouldn't happens */
                    402:                (*inlenb) -= 2;
                    403:                break;
                    404:            }
                    405: 
                    406: #ifdef BIG_ENDIAN
                    407:             d= *in++;
                    408: #else
                    409:             tmp = (unsigned char *) in;
                    410:            d = *tmp++;
                    411:            d = d << 8;
                    412:            d = d | (unsigned int) *tmp;
                    413:            in++;
                    414: #endif     
                    415:             if ((d & 0xFC00) == 0xDC00) {
1.1       daniel    416:                 c &= 0x03FF;
                    417:                 c <<= 10;
                    418:                 c |= d & 0x03FF;
                    419:                 c += 0x10000;
                    420:             }
1.27      daniel    421:             else 
1.28      daniel    422:                return(-2);
1.1       daniel    423:         }
                    424: 
1.25      daniel    425:        /* assertion: c is a single UTF-4 value */
1.27      daniel    426:         if (out >= outend) 
1.28      daniel    427:            return(-1);
1.1       daniel    428:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    429:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    430:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    431:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    432:  
1.26      daniel    433:         for ( ; bits >= 0; bits-= 6) {
1.27      daniel    434:             if (out >= outend) 
1.28      daniel    435:                return(-1);
1.26      daniel    436:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    437:         }
                    438:     }
1.28      daniel    439:     return(out-outstart);
1.1       daniel    440: }
                    441: 
                    442: /**
1.28      daniel    443:  * UTF8ToUTF16BE:
1.25      daniel    444:  * @outb:  a pointer to an array of bytes to store the result
                    445:  * @outlen:  the length of @outb
1.18      daniel    446:  * @in:  a pointer to an array of UTF-8 chars
                    447:  * @inlen:  the length of @in
1.1       daniel    448:  *
1.28      daniel    449:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
1.1       daniel    450:  * block of chars out.
1.28      daniel    451:  * TODO: UTF8ToUTF16BE need a fallback mechanism ...
1.15      daniel    452:  *
1.6       daniel    453:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    454:  *     if the transcoding failed. 
1.1       daniel    455:  */
                    456: int
1.28      daniel    457: UTF8ToUTF16BE(unsigned char* outb, int outlen,
1.25      daniel    458:             const unsigned char* in, int *inlen)
1.1       daniel    459: {
1.25      daniel    460:     unsigned short* out = (unsigned short*) outb;
1.1       daniel    461:     unsigned short* outstart= out;
1.28      daniel    462:     unsigned short* outend;
1.25      daniel    463:     const unsigned char* inend= in+*inlen;
1.1       daniel    464:     unsigned int c, d, trailing;
1.28      daniel    465: #ifdef BIG_ENDIAN
                    466: #else
                    467:     unsigned char *tmp;
                    468:     unsigned short tmp1, tmp2;
                    469: #endif /* BIG_ENDIAN */    
1.1       daniel    470: 
1.25      daniel    471:     outlen /= 2; /* convert in short length */
1.28      daniel    472:     outend = out + outlen;
1.1       daniel    473:     while (in < inend) {
                    474:       d= *in++;
                    475:       if      (d < 0x80)  { c= d; trailing= 0; }
1.28      daniel    476:       else if (d < 0xC0)
                    477:           return(-2);    /* trailing byte in leading position */
1.1       daniel    478:       else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                    479:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    480:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.28      daniel    481:       else
                    482:           return(-2);    /* no chance for this in UTF-16 */
                    483: 
                    484:       if (inend - in < trailing) {
                    485:           *inlen -= (inend - in);
                    486:           break;
                    487:       } 
1.1       daniel    488: 
                    489:       for ( ; trailing; trailing--) {
1.28      daniel    490:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  return(-1);
1.1       daniel    491:           c <<= 6;
                    492:           c |= d & 0x3F;
                    493:       }
                    494: 
                    495:       /* assertion: c is a single UTF-4 value */
                    496:         if (c < 0x10000) {
1.28      daniel    497:             if (out >= outend)  return(-1);
                    498: #ifdef BIG_ENDIAN
1.1       daniel    499:             *out++ = c;
1.28      daniel    500: #else
                    501:             tmp = (unsigned char *) out;
                    502:             *tmp = c >> 8;
                    503:             *(tmp + 1) = c;
                    504:             out++;
                    505: #endif /* BIG_ENDIAN */
1.1       daniel    506:         }
                    507:         else if (c < 0x110000) {
1.28      daniel    508:             if (out+1 >= outend)  return(-1);
1.1       daniel    509:             c -= 0x10000;
1.28      daniel    510: #ifdef BIG_ENDIAN
1.1       daniel    511:             *out++ = 0xD800 | (c >> 10);
                    512:             *out++ = 0xDC00 | (c & 0x03FF);
1.28      daniel    513: #else
                    514:             tmp1 = 0xD800 | (c >> 10);
                    515:             tmp = (unsigned char *) out;
                    516:             *tmp = tmp1 >> 8;
                    517:             *(tmp + 1) = tmp1;
                    518:             out++;
                    519: 
                    520:             tmp2 = 0xDC00 | (c & 0x03FF);
                    521:             tmp = (unsigned char *) out;
                    522:             *tmp = tmp2 >> 8;
                    523:             *(tmp + 1) = tmp2;
                    524:             out++;
                    525: #endif
1.1       daniel    526:         }
1.28      daniel    527:         else  return(-1);
1.1       daniel    528:     }
1.28      daniel    529:     return(out-outstart);
1.1       daniel    530: }
                    531: 
1.7       daniel    532: /**
                    533:  * xmlDetectCharEncoding:
                    534:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    535:  *       4 bytes long.
1.25      daniel    536:  * @len:  pointer to the length of the buffer
1.7       daniel    537:  *
                    538:  * Guess the encoding of the entity using the first bytes of the entity content
                    539:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    540:  * 
                    541:  * Returns one of the XML_CHAR_ENCODING_... values.
                    542:  */
                    543: xmlCharEncoding
1.25      daniel    544: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    545: {
1.25      daniel    546:     if (len >= 4) {
                    547:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    548:            (in[2] == 0x00) && (in[3] == 0x3C))
                    549:            return(XML_CHAR_ENCODING_UCS4BE);
                    550:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    551:            (in[2] == 0x00) && (in[3] == 0x00))
                    552:            return(XML_CHAR_ENCODING_UCS4LE);
                    553:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    554:            (in[2] == 0x3C) && (in[3] == 0x00))
                    555:            return(XML_CHAR_ENCODING_UCS4_2143);
                    556:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    557:            (in[2] == 0x00) && (in[3] == 0x00))
                    558:            return(XML_CHAR_ENCODING_UCS4_3412);
                    559:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    560:            (in[2] == 0xA7) && (in[3] == 0x94))
                    561:            return(XML_CHAR_ENCODING_EBCDIC);
                    562:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    563:            (in[2] == 0x78) && (in[3] == 0x6D))
                    564:            return(XML_CHAR_ENCODING_UTF8);
                    565:     }
                    566:     if (len >= 2) {
                    567:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    568:            return(XML_CHAR_ENCODING_UTF16BE);
                    569:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    570:            return(XML_CHAR_ENCODING_UTF16LE);
                    571:     }
1.7       daniel    572:     return(XML_CHAR_ENCODING_NONE);
                    573: }
                    574: 
                    575: /**
                    576:  * xmlParseCharEncoding:
1.18      daniel    577:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    578:  *
                    579:  * Compare the string to the known encoding schemes. Note
                    580:  * that the comparison is case insensitive accordingly to the section
                    581:  * [XML] 4.3.3 Character Encoding in Entities.
                    582:  * 
                    583:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    584:  * if not recognized.
                    585:  */
                    586: xmlCharEncoding
1.8       daniel    587: xmlParseCharEncoding(const char* name)
1.7       daniel    588: {
                    589:     char upper[500];
                    590:     int i;
                    591: 
                    592:     for (i = 0;i < 499;i++) {
                    593:         upper[i] = toupper(name[i]);
                    594:        if (upper[i] == 0) break;
                    595:     }
                    596:     upper[i] = 0;
                    597: 
                    598:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    599:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    600:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    601: 
                    602:     /*
                    603:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    604:      *       already found and in use
                    605:      */
                    606:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    607:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    608:     
                    609:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    610:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    611:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    612: 
                    613:     /*
                    614:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    615:      *       already found and in use
                    616:      */
                    617:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    618:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    619:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    620: 
                    621:     
                    622:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    623:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    624:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    625: 
                    626:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    627:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    628:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    629: 
                    630:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    631:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    632:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    633:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    634:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    635:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    636:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    637: 
                    638:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
                    639:     if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
                    640:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
                    641:     return(XML_CHAR_ENCODING_ERROR);
                    642: }
1.9       daniel    643: 
                    644: /****************************************************************
                    645:  *                                                             *
                    646:  *             Char encoding handlers                          *
                    647:  *                                                             *
                    648:  ****************************************************************/
                    649: 
/*
 * Table of registered encoding handlers.
 * The table is allocated by xmlInitCharEncodingHandlers() with room for
 * MAX_ENCODING_HANDLERS entries; nbCharEncodingHandler is the number of
 * slots currently in use.
 * The size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns when it is given
 * a NULL or empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                    661: 
                    662: /**
                    663:  * xmlNewCharEncodingHandler:
1.18      daniel    664:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    665:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    666:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    667:  *
                    668:  * Create and registers an xmlCharEncodingHandler.
                    669:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    670:  */
                    671: xmlCharEncodingHandlerPtr
1.25      daniel    672: xmlNewCharEncodingHandler(const char *name, 
                    673:                           xmlCharEncodingInputFunc input,
1.9       daniel    674:                           xmlCharEncodingOutputFunc output) {
                    675:     xmlCharEncodingHandlerPtr handler;
                    676:     char upper[500];
                    677:     int i;
                    678:     char *up = 0;
                    679: 
                    680:     /*
                    681:      * Keep only the uppercase version of the encoding.
                    682:      */
                    683:     if (name == NULL) {
                    684:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    685:        return(NULL);
                    686:     }
                    687:     for (i = 0;i < 499;i++) {
                    688:         upper[i] = toupper(name[i]);
                    689:        if (upper[i] == 0) break;
                    690:     }
                    691:     upper[i] = 0;
1.16      daniel    692:     up = xmlMemStrdup(upper);
1.9       daniel    693:     if (up == NULL) {
                    694:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    695:        return(NULL);
                    696:     }
                    697: 
                    698:     /*
                    699:      * allocate and fill-up an handler block.
                    700:      */
                    701:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    702:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    703:     if (handler == NULL) {
                    704:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    705:        return(NULL);
                    706:     }
                    707:     handler->input = input;
                    708:     handler->output = output;
                    709:     handler->name = up;
                    710: 
                    711:     /*
                    712:      * registers and returns the handler.
                    713:      */
                    714:     xmlRegisterCharEncodingHandler(handler);
                    715:     return(handler);
                    716: }
                    717: 
                    718: /**
                    719:  * xmlInitCharEncodingHandlers:
                    720:  *
                    721:  * Initialize the char encoding support, it registers the default
                    722:  * encoding supported.
1.18      daniel    723:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    724:  *       in normal processing.
                    725:  */
                    726: void
                    727: xmlInitCharEncodingHandlers(void) {
                    728:     if (handlers != NULL) return;
                    729: 
                    730:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    731:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.9       daniel    732: 
                    733:     if (handlers == NULL) {
                    734:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    735:        return;
                    736:     }
1.10      daniel    737:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel    738:     xmlUTF16LEHandler = 
1.28      daniel    739:           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
                    740:     xmlUTF16BEHandler = 
                    741:           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10      daniel    742:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    743: }
                    744: 
                    745: /**
1.19      daniel    746:  * xmlCleanupCharEncodingHandlers:
                    747:  *
                    748:  * Cleanup the memory allocated for the char encoding support, it
                    749:  * unregisters all the encoding handlers.
                    750:  */
                    751: void
                    752: xmlCleanupCharEncodingHandlers(void) {
                    753:     if (handlers == NULL) return;
                    754: 
                    755:     for (;nbCharEncodingHandler > 0;) {
                    756:         nbCharEncodingHandler--;
                    757:        if (handlers[nbCharEncodingHandler] != NULL) {
                    758:            xmlFree(handlers[nbCharEncodingHandler]->name);
                    759:            xmlFree(handlers[nbCharEncodingHandler]);
                    760:        }
                    761:     }
                    762:     xmlFree(handlers);
                    763:     handlers = NULL;
                    764:     nbCharEncodingHandler = 0;
                    765:     xmlDefaultCharEncodingHandler = NULL;
                    766: }
                    767: 
                    768: /**
1.9       daniel    769:  * xmlRegisterCharEncodingHandler:
                    770:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                    771:  *
                    772:  * Register the char encoding handler, surprizing, isn't it ?
                    773:  */
                    774: void
                    775: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                    776:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    777:     if (handler == NULL) {
                    778:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                    779:        return;
                    780:     }
                    781: 
                    782:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                    783:         fprintf(stderr, 
                    784:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                    785:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                    786:        return;
                    787:     }
                    788:     handlers[nbCharEncodingHandler++] = handler;
                    789: }
                    790: 
                    791: /**
                    792:  * xmlGetCharEncodingHandler:
                    793:  * @enc:  an xmlCharEncoding value.
                    794:  *
                    795:  * Search in the registrered set the handler able to read/write that encoding.
                    796:  *
                    797:  * Returns the handler or NULL if not found
                    798:  */
                    799: xmlCharEncodingHandlerPtr
                    800: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
                    801:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel    802:     switch (enc) {
                    803:         case XML_CHAR_ENCODING_ERROR:
                    804:            return(NULL);
                    805:         case XML_CHAR_ENCODING_NONE:
                    806:            return(NULL);
                    807:         case XML_CHAR_ENCODING_UTF8:
                    808:            return(NULL);
                    809:         case XML_CHAR_ENCODING_UTF16LE:
                    810:            return(xmlUTF16LEHandler);
                    811:         case XML_CHAR_ENCODING_UTF16BE:
                    812:            return(xmlUTF16BEHandler);
                    813:         case XML_CHAR_ENCODING_EBCDIC:
                    814:            return(NULL);
                    815:         case XML_CHAR_ENCODING_UCS4LE:
                    816:            return(NULL);
                    817:         case XML_CHAR_ENCODING_UCS4BE:
                    818:            return(NULL);
                    819:         case XML_CHAR_ENCODING_UCS4_2143:
                    820:            return(NULL);
                    821:         case XML_CHAR_ENCODING_UCS4_3412:
                    822:            return(NULL);
                    823:         case XML_CHAR_ENCODING_UCS2:
                    824:            return(NULL);
                    825:         case XML_CHAR_ENCODING_8859_1:
                    826:            return(NULL);
                    827:         case XML_CHAR_ENCODING_8859_2:
                    828:            return(NULL);
                    829:         case XML_CHAR_ENCODING_8859_3:
                    830:            return(NULL);
                    831:         case XML_CHAR_ENCODING_8859_4:
                    832:            return(NULL);
                    833:         case XML_CHAR_ENCODING_8859_5:
                    834:            return(NULL);
                    835:         case XML_CHAR_ENCODING_8859_6:
                    836:            return(NULL);
                    837:         case XML_CHAR_ENCODING_8859_7:
                    838:            return(NULL);
                    839:         case XML_CHAR_ENCODING_8859_8:
                    840:            return(NULL);
                    841:         case XML_CHAR_ENCODING_8859_9:
                    842:            return(NULL);
                    843:         case XML_CHAR_ENCODING_2022_JP:
                    844:         case XML_CHAR_ENCODING_SHIFT_JIS:
                    845:         case XML_CHAR_ENCODING_EUC_JP:
                    846:            return(NULL);
                    847:     }
1.9       daniel    848:     return(NULL);
                    849: }
                    850: 
                    851: /**
                    852:  * xmlGetCharEncodingHandler:
                    853:  * @enc:  a string describing the char encoding.
                    854:  *
                    855:  * Search in the registrered set the handler able to read/write that encoding.
                    856:  *
                    857:  * Returns the handler or NULL if not found
                    858:  */
                    859: xmlCharEncodingHandlerPtr
                    860: xmlFindCharEncodingHandler(const char *name) {
                    861:     char upper[500];
                    862:     int i;
                    863: 
                    864:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    865:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                    866:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                    867: 
                    868:     for (i = 0;i < 499;i++) {
                    869:         upper[i] = toupper(name[i]);
                    870:        if (upper[i] == 0) break;
                    871:     }
                    872:     upper[i] = 0;
                    873: 
                    874:     for (i = 0;i < nbCharEncodingHandler; i++)
                    875:         if (!strcmp(name, handlers[i]->name))
                    876:            return(handlers[i]);
                    877: 
                    878:     return(NULL);
                    879: }
                    880: 

Webmaster