Annotation of XML/encoding.c, revision 1.38

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
                      6:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      7:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      8:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                      9:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     10:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     11:  *                described in Unicode Technical Report #4.
                     12:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     13:  *                Information Interchange, ANSI X3.4-1986.
                     14:  *
1.9       daniel     15:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     16:  *
                     17:  * See Copyright for the status of this software.
                     18:  *
                     19:  * Daniel.Veillard@w3.org
                     20:  */
                     21: 
1.21      daniel     22: #ifdef WIN32
                     23: #include "win32config.h"
                     24: #else
1.14      daniel     25: #include "config.h"
1.17      daniel     26: #endif
                     27: 
                     28: #include <stdio.h>
                     29: #include <string.h>
                     30: 
                     31: #ifdef HAVE_CTYPE_H
1.7       daniel     32: #include <ctype.h>
1.17      daniel     33: #endif
1.20      daniel     34: #ifdef HAVE_STDLIB_H
                     35: #include <stdlib.h>
                     36: #endif
1.30      daniel     37: #include <libxml/xmlversion.h>
                     38: #ifdef LIBXML_ICONV_ENABLED
                     39: #ifdef HAVE_ERRNO_H
                     40: #include <errno.h>
                     41: #endif
                     42: #endif
1.29      daniel     43: #include <libxml/encoding.h>
                     44: #include <libxml/xmlmemory.h>
1.3       daniel     45: 
/* Global handler pointers for the two UTF-16 byte orders; both start out NULL. */
/* Where they get assigned is not visible in this section of the file. */
1.25      daniel     46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
                     47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
                     48: 
/* DEBUG_ENCODING is only considered when iconv support is compiled in, and */
/* is switched off by the "#if 0" guard; change it to "#if 1" to get traces. */
1.30      daniel     49: #ifdef LIBXML_ICONV_ENABLED
1.37      daniel     50: #if 0
1.30      daniel     51: #define DEBUG_ENCODING  /* Define this to get encoding traces */
                     52: #endif
1.33      daniel     53: #endif
1.30      daniel     54: 
/* Host byte-order flag, defaulting to 1 (little endian). */
/* NOTE(review): presumably corrected at library initialisation - confirm. */
1.34      daniel     55: static int xmlLittleEndian = 1;
                     56: 
1.3       daniel     57: /*
                     58:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     59:  *
                     60:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     61:  * 0000 0000-0000 007F   0xxxxxxx
                     62:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     63:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     64:  *
                     65:  * I hope we won't use values > 0xFFFF anytime soon !
                     66:  */
1.1       daniel     67: 
                     68: /**
1.22      daniel     69:  * xmlCheckUTF8: Check utf-8 string for legality.
                     70:  * @utf: Pointer to putative utf-8 encoded string.
                     71:  *
                     72:  * Checks @utf for being valid utf-8. @utf is assumed to be
                     73:  * null-terminated. This function is not super-strict, as it will
                     74:  * allow longer utf-8 sequences than necessary. Note that Java is
                     75:  * capable of producing these sequences if provoked. Also note, this
                     76:  * routine checks for the 4-byte maxiumum size, but does not check for
                     77:  * 0x10ffff maximum value.
                     78:  *
                     79:  * Return value: true if @utf is valid.
                     80:  **/
                     81: int
                     82: xmlCheckUTF8(const unsigned char *utf)
                     83: {
                     84:     int ix;
                     85:     unsigned char c;
                     86: 
                     87:     for (ix = 0; (c = utf[ix]);) {
                     88:         if (c & 0x80) {
                     89:            if ((utf[ix + 1] & 0xc0) != 0x80)
                     90:                return(0);
                     91:            if ((c & 0xe0) == 0xe0) {
                     92:                if ((utf[ix + 2] & 0xc0) != 0x80)
                     93:                    return(0);
                     94:                if ((c & 0xf0) == 0xf0) {
                     95:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                     96:                        return(0);
                     97:                    ix += 4;
                     98:                    /* 4-byte code */
                     99:                } else
                    100:                  /* 3-byte code */
                    101:                    ix += 3;
                    102:            } else
                    103:              /* 2-byte code */
                    104:                ix += 2;
                    105:        } else
                    106:            /* 1-byte code */
                    107:            ix++;
                    108:       }
                    109:       return(1);
                    110: }
                    111: 
                    112: /**
1.1       daniel    113:  * isolat1ToUTF8:
1.18      daniel    114:  * @out:  a pointer to an array of bytes to store the result
                    115:  * @outlen:  the length of @out
                    116:  * @in:  a pointer to an array of ISO Latin 1 chars
                    117:  * @inlen:  the length of @in
1.1       daniel    118:  *
                    119:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    120:  * block of chars out.
1.33      daniel    121:  * Returns 0 if success, or -1 otherwise
                    122:  * The value of @inlen after return is the number of octets consumed
                    123:  *     as the return value is positive, else unpredictiable.
                    124:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    125:  */
                    126: int
1.33      daniel    127: isolat1ToUTF8(unsigned char* out, int *outlen,
1.25      daniel    128:               const unsigned char* in, int *inlen) {
1.33      daniel    129:     unsigned char* outstart = out;
                    130:     const unsigned char* processed = in;
                    131:     unsigned char* outend = out + *outlen;
                    132:     const unsigned char* inend = in + *inlen;
1.1       daniel    133:     unsigned char c;
                    134: 
                    135:     while (in < inend) {
                    136:         c= *in++;
                    137:         if (c < 0x80) {
1.33      daniel    138:             if (out >= outend)
                    139:                break;
1.1       daniel    140:             *out++ = c;
                    141:         }
                    142:         else {
1.33      daniel    143:             if (out + 1 >= outend)  break;
1.1       daniel    144:             *out++ = 0xC0 | (c >> 6);
                    145:             *out++ = 0x80 | (0x3F & c);
                    146:         }
1.33      daniel    147:        processed = in;
1.1       daniel    148:     }
1.33      daniel    149:     *outlen = out - outstart;
                    150:     *inlen = processed - in;
                    151: 
                    152:     return(0);
1.1       daniel    153: }
                    154: 
                    155: /**
                    156:  * UTF8Toisolat1:
1.18      daniel    157:  * @out:  a pointer to an array of bytes to store the result
                    158:  * @outlen:  the length of @out
                    159:  * @in:  a pointer to an array of UTF-8 chars
                    160:  * @inlen:  the length of @in
1.1       daniel    161:  *
                    162:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    163:  * block of chars out.
1.15      daniel    164:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    165:  *
1.33      daniel    166:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1.28      daniel    167:  * The value of @inlen after return is the number of octets consumed
                    168:  *     as the return value is positive, else unpredictiable.
1.33      daniel    169:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    170:  */
                    171: int
1.33      daniel    172: UTF8Toisolat1(unsigned char* out, int *outlen,
1.25      daniel    173:               const unsigned char* in, int *inlen) {
1.33      daniel    174:     unsigned char* outstart = out;
                    175:     const unsigned char* processed = in;
                    176:     unsigned char* outend = out + *outlen;
                    177:     const unsigned char* inend = in + *inlen;
1.1       daniel    178:     unsigned char c;
                    179: 
                    180:     while (in < inend) {
                    181:         c= *in++;
                    182:         if (c < 0x80) {
1.28      daniel    183:             if (out >= outend)  return(-1);
1.1       daniel    184:             *out++= c;
                    185:         }
1.23      daniel    186:        else if (in == inend) {
                    187:             break;
                    188:        }
                    189:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    190:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    191:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    192:        }
1.33      daniel    193:        else {
                    194:            *outlen = out - outstart;
                    195:            *inlen = processed - in;
1.28      daniel    196:            return(-2);
1.33      daniel    197:        }
                    198:        processed = in;
1.1       daniel    199:     }
1.33      daniel    200:     *outlen = out - outstart;
                    201:     *inlen = processed - in;
                    202:     return(0);
1.1       daniel    203: }
                    204: 
                    205: /**
1.28      daniel    206:  * UTF16LEToUTF8:
                    207:  * @out:  a pointer to an array of bytes to store the result
                    208:  * @outlen:  the length of @out
                    209:  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
                    210:  * @inlenb:  the length of @in in UTF-16LE chars
                    211:  *
                    212:  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
                    213:  * block of chars out. This function assume the endian properity
                    214:  * is the same between the native type of this machine and the
                    215:  * inputed one.
                    216:  *
                    217:  * Returns the number of byte written, or -1 by lack of space, or -2
                    218:  *     if the transcoding fails (for *in is not valid utf16 string)
                    219:  *     The value of *inlen after return is the number of octets consumed
                    220:  *     as the return value is positive, else unpredictiable.
                    221:  */
                    222: int
1.33      daniel    223: UTF16LEToUTF8(unsigned char* out, int *outlen,
1.28      daniel    224:             const unsigned char* inb, int *inlenb)
                    225: {
1.33      daniel    226:     unsigned char* outstart = out;
                    227:     const unsigned char* processed = inb;
                    228:     unsigned char* outend = out + *outlen;
1.28      daniel    229:     unsigned short* in = (unsigned short*) inb;
                    230:     unsigned short* inend;
                    231:     unsigned int c, d, inlen;
                    232:     unsigned char *tmp;
                    233:     int bits;
                    234: 
                    235:     if ((*inlenb % 2) == 1)
                    236:         (*inlenb)--;
                    237:     inlen = *inlenb / 2;
1.33      daniel    238:     inend = in + inlen;
1.28      daniel    239:     while (in < inend) {
1.34      daniel    240:         if (xmlLittleEndian) {
                    241:            c= *in++;
                    242:        } else {
                    243:            tmp = (unsigned char *) in;
                    244:            c = *tmp++;
                    245:            c = c | (((unsigned int)*tmp) << 8);
                    246:            in++;
                    247:        }
1.28      daniel    248:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                    249:             if (in >= inend) {           /* (in > inend) shouldn't happens */
                    250:                 break;
                    251:             }
1.34      daniel    252:            if (xmlLittleEndian) {
                    253:                d = *in++;
                    254:            } else {
                    255:                tmp = (unsigned char *) in;
                    256:                d = *tmp++;
                    257:                d = d | (((unsigned int)*tmp) << 8);
                    258:                in++;
                    259:            }
1.28      daniel    260:             if ((d & 0xFC00) == 0xDC00) {
                    261:                 c &= 0x03FF;
                    262:                 c <<= 10;
                    263:                 c |= d & 0x03FF;
                    264:                 c += 0x10000;
                    265:             }
1.33      daniel    266:             else {
                    267:                *outlen = out - outstart;
                    268:                *inlenb = processed - inb;
1.28      daniel    269:                return(-2);
1.33      daniel    270:            }
1.28      daniel    271:         }
                    272: 
                    273:        /* assertion: c is a single UTF-4 value */
                    274:         if (out >= outend)
1.33      daniel    275:            break;
1.28      daniel    276:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    277:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    278:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    279:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                    280:  
                    281:         for ( ; bits >= 0; bits-= 6) {
                    282:             if (out >= outend)
1.33      daniel    283:                break;
1.28      daniel    284:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    285:         }
1.33      daniel    286:        processed = (const unsigned char*) in;
1.28      daniel    287:     }
1.33      daniel    288:     *outlen = out - outstart;
                    289:     *inlenb = processed - inb;
                    290:     return(0);
1.28      daniel    291: }
                    292: 
                    293: /**
                    294:  * UTF8ToUTF16LE:
                    295:  * @outb:  a pointer to an array of bytes to store the result
                    296:  * @outlen:  the length of @outb
                    297:  * @in:  a pointer to an array of UTF-8 chars
                    298:  * @inlen:  the length of @in
                    299:  *
                    300:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
                    301:  * block of chars out.
                    302:  * TODO: UTF8ToUTF16LE need a fallback mechanism ...
                    303:  *
                    304:  * Returns the number of byte written, or -1 by lack of space, or -2
                    305:  *     if the transcoding failed. 
                    306:  */
                    307: int
1.33      daniel    308: UTF8ToUTF16LE(unsigned char* outb, int *outlen,
1.28      daniel    309:             const unsigned char* in, int *inlen)
                    310: {
                    311:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    312:     const unsigned char* processed = in;
1.28      daniel    313:     unsigned short* outstart= out;
                    314:     unsigned short* outend;
                    315:     const unsigned char* inend= in+*inlen;
                    316:     unsigned int c, d, trailing;
                    317:     unsigned char *tmp;
                    318:     unsigned short tmp1, tmp2;
                    319: 
1.37      daniel    320:     if (in == NULL) {
                    321:         /*
                    322:         * initialization, add the Byte Order Mark
                    323:         */
                    324:         if (*outlen >= 2) {
                    325:            outb[0] = 0xFF;
                    326:            outb[1] = 0xFE;
                    327:            *outlen = 2;
                    328:            *inlen = 0;
                    329: #ifdef DEBUG_ENCODING
                    330:             fprintf(stderr, "Added FFFE Byte Order Mark\n");
                    331: #endif
                    332:            return(2);
                    333:        }
                    334:        *outlen = 0;
                    335:        *inlen = 0;
                    336:        return(0);
                    337:     }
1.33      daniel    338:     outend = out + (*outlen / 2);
1.28      daniel    339:     while (in < inend) {
                    340:       d= *in++;
                    341:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    342:       else if (d < 0xC0) {
                    343:           /* trailing byte in leading position */
                    344:          *outlen = out - outstart;
                    345:          *inlen = processed - in;
                    346:          return(-2);
                    347:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.28      daniel    348:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    349:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    350:       else {
                    351:        /* no chance for this in UTF-16 */
                    352:        *outlen = out - outstart;
                    353:        *inlen = processed - in;
                    354:        return(-2);
                    355:       }
1.28      daniel    356: 
                    357:       if (inend - in < trailing) {
                    358:           break;
                    359:       } 
                    360: 
                    361:       for ( ; trailing; trailing--) {
                    362:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1.33      daniel    363:              break;
1.28      daniel    364:           c <<= 6;
                    365:           c |= d & 0x3F;
                    366:       }
                    367: 
                    368:       /* assertion: c is a single UTF-4 value */
                    369:         if (c < 0x10000) {
                    370:             if (out >= outend)
1.33      daniel    371:                break;
1.34      daniel    372:            if (xmlLittleEndian) {
                    373:                *out++ = c;
                    374:            } else {
                    375:                tmp = (unsigned char *) out;
                    376:                *tmp = c ;
                    377:                *(tmp + 1) = c >> 8 ;
                    378:                out++;
                    379:            }
1.28      daniel    380:         }
                    381:         else if (c < 0x110000) {
                    382:             if (out+1 >= outend)
1.33      daniel    383:                break;
1.28      daniel    384:             c -= 0x10000;
1.34      daniel    385:            if (xmlLittleEndian) {
                    386:                *out++ = 0xD800 | (c >> 10);
                    387:                *out++ = 0xDC00 | (c & 0x03FF);
                    388:            } else {
                    389:                tmp1 = 0xD800 | (c >> 10);
                    390:                tmp = (unsigned char *) out;
                    391:                *tmp = tmp1;
                    392:                *(tmp + 1) = tmp1 >> 8;
                    393:                out++;
                    394: 
                    395:                tmp2 = 0xDC00 | (c & 0x03FF);
                    396:                tmp = (unsigned char *) out;
                    397:                *tmp  = tmp2;
                    398:                *(tmp + 1) = tmp2 >> 8;
                    399:                out++;
                    400:            }
1.28      daniel    401:         }
                    402:         else
1.33      daniel    403:            break;
                    404:        processed = in;
1.28      daniel    405:     }
1.36      daniel    406:     *outlen = (out - outstart) * 2;
1.33      daniel    407:     *inlen = processed - in;
                    408:     return(0);
1.28      daniel    409: }
                    410: 
                    411: /**
                    412:  * UTF16BEToUTF8:
1.18      daniel    413:  * @out:  a pointer to an array of bytes to store the result
                    414:  * @outlen:  the length of @out
1.25      daniel    415:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    416:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    417:  *
                    418:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
1.28      daniel    419:  * block of chars out. This function assume the endian properity
                    420:  * is the same between the native type of this machine and the
                    421:  * inputed one.
1.25      daniel    422:  *
1.28      daniel    423:  * Returns the number of byte written, or -1 by lack of space, or -2
                    424:  *     if the transcoding fails (for *in is not valid utf16 string)
                    425:  * The value of *inlen after return is the number of octets consumed
                    426:  *     as the return value is positive, else unpredictiable.
1.1       daniel    427:  */
                    428: int
1.33      daniel    429: UTF16BEToUTF8(unsigned char* out, int *outlen,
1.25      daniel    430:             const unsigned char* inb, int *inlenb)
1.1       daniel    431: {
1.33      daniel    432:     unsigned char* outstart = out;
                    433:     const unsigned char* processed = inb;
                    434:     unsigned char* outend = out + *outlen;
1.25      daniel    435:     unsigned short* in = (unsigned short*) inb;
                    436:     unsigned short* inend;
                    437:     unsigned int c, d, inlen;
1.28      daniel    438:     unsigned char *tmp;
1.1       daniel    439:     int bits;
                    440: 
1.28      daniel    441:     if ((*inlenb % 2) == 1)
                    442:         (*inlenb)--;
1.25      daniel    443:     inlen = *inlenb / 2;
                    444:     inend= in + inlen;
1.1       daniel    445:     while (in < inend) {
1.34      daniel    446:        if (xmlLittleEndian) {
                    447:            tmp = (unsigned char *) in;
                    448:            c = *tmp++;
                    449:            c = c << 8;
                    450:            c = c | (unsigned int) *tmp;
                    451:            in++;
                    452:        } else {
                    453:            c= *in++;
                    454:        } 
1.1       daniel    455:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.28      daniel    456:            if (in >= inend) {           /* (in > inend) shouldn't happens */
1.33      daniel    457:                *outlen = out - outstart;
                    458:                *inlenb = processed - inb;
                    459:                return(-2);
1.28      daniel    460:            }
1.34      daniel    461:            if (xmlLittleEndian) {
                    462:                tmp = (unsigned char *) in;
                    463:                d = *tmp++;
                    464:                d = d << 8;
                    465:                d = d | (unsigned int) *tmp;
                    466:                in++;
                    467:            } else {
                    468:                d= *in++;
                    469:            }
1.28      daniel    470:             if ((d & 0xFC00) == 0xDC00) {
1.1       daniel    471:                 c &= 0x03FF;
                    472:                 c <<= 10;
                    473:                 c |= d & 0x03FF;
                    474:                 c += 0x10000;
                    475:             }
1.33      daniel    476:             else {
                    477:                *outlen = out - outstart;
                    478:                *inlenb = processed - inb;
1.28      daniel    479:                return(-2);
1.33      daniel    480:            }
1.1       daniel    481:         }
                    482: 
1.25      daniel    483:        /* assertion: c is a single UTF-4 value */
1.27      daniel    484:         if (out >= outend) 
1.33      daniel    485:            break;
1.1       daniel    486:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    487:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    488:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    489:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    490:  
1.26      daniel    491:         for ( ; bits >= 0; bits-= 6) {
1.27      daniel    492:             if (out >= outend) 
1.33      daniel    493:                break;
1.26      daniel    494:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    495:         }
1.33      daniel    496:        processed = (const unsigned char*) in;
1.1       daniel    497:     }
1.33      daniel    498:     *outlen = out - outstart;
                    499:     *inlenb = processed - inb;
                    500:     return(0);
1.1       daniel    501: }
                    502: 
                    503: /**
1.28      daniel    504:  * UTF8ToUTF16BE:
1.25      daniel    505:  * @outb:  a pointer to an array of bytes to store the result
                    506:  * @outlen:  the length of @outb
1.18      daniel    507:  * @in:  a pointer to an array of UTF-8 chars
                    508:  * @inlen:  the length of @in
1.1       daniel    509:  *
1.28      daniel    510:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
1.1       daniel    511:  * block of chars out.
1.28      daniel    512:  * TODO: UTF8ToUTF16BE need a fallback mechanism ...
1.15      daniel    513:  *
1.6       daniel    514:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    515:  *     if the transcoding failed. 
1.1       daniel    516:  */
                    517: int
1.33      daniel    518: UTF8ToUTF16BE(unsigned char* outb, int *outlen,
1.25      daniel    519:             const unsigned char* in, int *inlen)
1.1       daniel    520: {
1.25      daniel    521:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    522:     const unsigned char* processed = in;
1.1       daniel    523:     unsigned short* outstart= out;
1.28      daniel    524:     unsigned short* outend;
1.25      daniel    525:     const unsigned char* inend= in+*inlen;
1.1       daniel    526:     unsigned int c, d, trailing;
1.28      daniel    527:     unsigned char *tmp;
                    528:     unsigned short tmp1, tmp2;
1.1       daniel    529: 
1.37      daniel    530:     if (in == NULL) {
                    531:         /*
                    532:         * initialization, add the Byte Order Mark
                    533:         */
                    534:         if (*outlen >= 2) {
                    535:            outb[0] = 0xFE;
                    536:            outb[1] = 0xFF;
                    537:            *outlen = 2;
                    538:            *inlen = 0;
                    539: #ifdef DEBUG_ENCODING
                    540:             fprintf(stderr, "Added FEFF Byte Order Mark\n");
                    541: #endif
                    542:            return(2);
                    543:        }
                    544:        *outlen = 0;
                    545:        *inlen = 0;
                    546:        return(0);
                    547:     }
1.33      daniel    548:     outend = out + (*outlen / 2);
1.1       daniel    549:     while (in < inend) {
                    550:       d= *in++;
                    551:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    552:       else if (d < 0xC0)  {
                    553:           /* trailing byte in leading position */
                    554:          *outlen = out - outstart;
                    555:          *inlen = processed - in;
                    556:          return(-2);
                    557:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.1       daniel    558:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    559:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    560:       else {
                    561:           /* no chance for this in UTF-16 */
                    562:          *outlen = out - outstart;
                    563:          *inlen = processed - in;
                    564:          return(-2);
                    565:       }
1.28      daniel    566: 
                    567:       if (inend - in < trailing) {
                    568:           break;
                    569:       } 
1.1       daniel    570: 
                    571:       for ( ; trailing; trailing--) {
1.33      daniel    572:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1.1       daniel    573:           c <<= 6;
                    574:           c |= d & 0x3F;
                    575:       }
                    576: 
                    577:       /* assertion: c is a single UTF-4 value */
                    578:         if (c < 0x10000) {
1.33      daniel    579:             if (out >= outend)  break;
1.34      daniel    580:            if (xmlLittleEndian) {
                    581:                tmp = (unsigned char *) out;
                    582:                *tmp = c >> 8;
                    583:                *(tmp + 1) = c;
                    584:                out++;
                    585:            } else {
                    586:                *out++ = c;
                    587:            }
1.1       daniel    588:         }
                    589:         else if (c < 0x110000) {
1.33      daniel    590:             if (out+1 >= outend)  break;
1.1       daniel    591:             c -= 0x10000;
1.34      daniel    592:            if (xmlLittleEndian) {
                    593:                tmp1 = 0xD800 | (c >> 10);
                    594:                tmp = (unsigned char *) out;
                    595:                *tmp = tmp1 >> 8;
                    596:                *(tmp + 1) = tmp1;
                    597:                out++;
                    598: 
                    599:                tmp2 = 0xDC00 | (c & 0x03FF);
                    600:                tmp = (unsigned char *) out;
                    601:                *tmp = tmp2 >> 8;
                    602:                *(tmp + 1) = tmp2;
                    603:                out++;
                    604:            } else {
                    605:                *out++ = 0xD800 | (c >> 10);
                    606:                *out++ = 0xDC00 | (c & 0x03FF);
                    607:            }
1.1       daniel    608:         }
1.33      daniel    609:         else
                    610:            break;
                    611:        processed = in;
1.1       daniel    612:     }
1.36      daniel    613:     *outlen = (out - outstart) * 2;
1.33      daniel    614:     *inlen = processed - in;
                    615:     return(0);
1.1       daniel    616: }
                    617: 
1.7       daniel    618: /**
                    619:  * xmlDetectCharEncoding:
                    620:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    621:  *       4 bytes long.
1.25      daniel    622:  * @len:  pointer to the length of the buffer
1.7       daniel    623:  *
                    624:  * Guess the encoding of the entity using the first bytes of the entity content
                    625:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    626:  * 
                    627:  * Returns one of the XML_CHAR_ENCODING_... values.
                    628:  */
                    629: xmlCharEncoding
1.25      daniel    630: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    631: {
1.25      daniel    632:     if (len >= 4) {
                    633:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    634:            (in[2] == 0x00) && (in[3] == 0x3C))
                    635:            return(XML_CHAR_ENCODING_UCS4BE);
                    636:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    637:            (in[2] == 0x00) && (in[3] == 0x00))
                    638:            return(XML_CHAR_ENCODING_UCS4LE);
                    639:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    640:            (in[2] == 0x3C) && (in[3] == 0x00))
                    641:            return(XML_CHAR_ENCODING_UCS4_2143);
                    642:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    643:            (in[2] == 0x00) && (in[3] == 0x00))
                    644:            return(XML_CHAR_ENCODING_UCS4_3412);
                    645:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    646:            (in[2] == 0xA7) && (in[3] == 0x94))
                    647:            return(XML_CHAR_ENCODING_EBCDIC);
                    648:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    649:            (in[2] == 0x78) && (in[3] == 0x6D))
                    650:            return(XML_CHAR_ENCODING_UTF8);
                    651:     }
                    652:     if (len >= 2) {
                    653:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    654:            return(XML_CHAR_ENCODING_UTF16BE);
                    655:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    656:            return(XML_CHAR_ENCODING_UTF16LE);
                    657:     }
1.7       daniel    658:     return(XML_CHAR_ENCODING_NONE);
                    659: }
                    660: 
                    661: /**
                    662:  * xmlParseCharEncoding:
1.18      daniel    663:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    664:  *
                    665:  * Conpare the string to the known encoding schemes already known. Note
                    666:  * that the comparison is case insensitive accordingly to the section
                    667:  * [XML] 4.3.3 Character Encoding in Entities.
                    668:  * 
                    669:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    670:  * if not recognized.
                    671:  */
                    672: xmlCharEncoding
1.8       daniel    673: xmlParseCharEncoding(const char* name)
1.7       daniel    674: {
                    675:     char upper[500];
                    676:     int i;
                    677: 
                    678:     for (i = 0;i < 499;i++) {
                    679:         upper[i] = toupper(name[i]);
                    680:        if (upper[i] == 0) break;
                    681:     }
                    682:     upper[i] = 0;
                    683: 
                    684:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    685:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    686:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    687: 
                    688:     /*
                    689:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    690:      *       already found and in use
                    691:      */
                    692:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    693:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    694:     
                    695:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    696:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    697:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    698: 
                    699:     /*
                    700:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    701:      *       already found and in use
                    702:      */
                    703:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    704:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    705:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    706: 
                    707:     
                    708:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    709:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    710:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    711: 
                    712:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    713:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    714:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    715: 
                    716:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    717:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    718:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    719:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    720:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    721:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    722:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    723: 
                    724:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30      daniel    725:     if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7       daniel    726:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30      daniel    727: 
                    728: #ifdef DEBUG_ENCODING
                    729:     fprintf(stderr, "Unknown encoding %s\n", name);
                    730: #endif
1.7       daniel    731:     return(XML_CHAR_ENCODING_ERROR);
                    732: }
1.9       daniel    733: 
1.38    ! daniel    734: /**
        !           735:  * xmlGetCharEncodingName:
        !           736:  * @enc:  the encoding
        !           737:  *
        !           738:  * The "canonical" name for XML encoding.
        !           739:  * C.f. http://www.w3.org/TR/REC-xml#charencoding
        !           740:  * Section 4.3.3  Character Encoding in Entities
        !           741:  *
        !           742:  * Returns the canonical name for the given encoding
        !           743:  */
        !           744: 
        !           745: const char*
        !           746: xmlGetCharEncodingName(xmlCharEncoding enc) {
        !           747:     switch (enc) {
        !           748:         case XML_CHAR_ENCODING_ERROR:
        !           749:            return(NULL);
        !           750:         case XML_CHAR_ENCODING_NONE:
        !           751:            return(NULL);
        !           752:         case XML_CHAR_ENCODING_UTF8:
        !           753:            return("UTF-8");
        !           754:         case XML_CHAR_ENCODING_UTF16LE:
        !           755:            return("UTF-16");
        !           756:         case XML_CHAR_ENCODING_UTF16BE:
        !           757:            return("UTF-16");
        !           758:         case XML_CHAR_ENCODING_EBCDIC:
        !           759:             return("EBCDIC");
        !           760:         case XML_CHAR_ENCODING_UCS4LE:
        !           761:             return("ISO-10646-UCS-4");
        !           762:         case XML_CHAR_ENCODING_UCS4BE:
        !           763:             return("ISO-10646-UCS-4");
        !           764:         case XML_CHAR_ENCODING_UCS4_2143:
        !           765:             return("ISO-10646-UCS-4");
        !           766:         case XML_CHAR_ENCODING_UCS4_3412:
        !           767:             return("ISO-10646-UCS-4");
        !           768:         case XML_CHAR_ENCODING_UCS2:
        !           769:             return("ISO-10646-UCS-2");
        !           770:         case XML_CHAR_ENCODING_8859_1:
        !           771:            return("ISO-8859-1");
        !           772:         case XML_CHAR_ENCODING_8859_2:
        !           773:            return("ISO-8859-2");
        !           774:         case XML_CHAR_ENCODING_8859_3:
        !           775:            return("ISO-8859-3");
        !           776:         case XML_CHAR_ENCODING_8859_4:
        !           777:            return("ISO-8859-4");
        !           778:         case XML_CHAR_ENCODING_8859_5:
        !           779:            return("ISO-8859-5");
        !           780:         case XML_CHAR_ENCODING_8859_6:
        !           781:            return("ISO-8859-6");
        !           782:         case XML_CHAR_ENCODING_8859_7:
        !           783:            return("ISO-8859-7");
        !           784:         case XML_CHAR_ENCODING_8859_8:
        !           785:            return("ISO-8859-8");
        !           786:         case XML_CHAR_ENCODING_8859_9:
        !           787:            return("ISO-8859-9");
        !           788:         case XML_CHAR_ENCODING_2022_JP:
        !           789:             return("ISO-2022-JP");
        !           790:         case XML_CHAR_ENCODING_SHIFT_JIS:
        !           791:             return("Shift-JIS");
        !           792:         case XML_CHAR_ENCODING_EUC_JP:
        !           793:             return("EUC-JP");
        !           794:     }
        !           795:     return(NULL);
        !           796: }
        !           797: 
/****************************************************************
 *                                                             *
 *             Char encoding handlers                          *
 *                                                             *
 ****************************************************************/

/*
 * Registry of the known encoding handlers.
 *
 * `handlers` is a table with room for MAX_ENCODING_HANDLERS entries,
 * allocated by xmlInitCharEncodingHandlers() and released by
 * xmlCleanupCharEncodingHandlers(); `nbCharEncodingHandler` is the
 * number of slots currently in use.
 *
 * the size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                    815: 
                    816: /**
                    817:  * xmlNewCharEncodingHandler:
1.18      daniel    818:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    819:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    820:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    821:  *
                    822:  * Create and registers an xmlCharEncodingHandler.
                    823:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    824:  */
                    825: xmlCharEncodingHandlerPtr
1.25      daniel    826: xmlNewCharEncodingHandler(const char *name, 
                    827:                           xmlCharEncodingInputFunc input,
1.9       daniel    828:                           xmlCharEncodingOutputFunc output) {
                    829:     xmlCharEncodingHandlerPtr handler;
                    830:     char upper[500];
                    831:     int i;
                    832:     char *up = 0;
                    833: 
                    834:     /*
                    835:      * Keep only the uppercase version of the encoding.
                    836:      */
                    837:     if (name == NULL) {
                    838:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    839:        return(NULL);
                    840:     }
                    841:     for (i = 0;i < 499;i++) {
                    842:         upper[i] = toupper(name[i]);
                    843:        if (upper[i] == 0) break;
                    844:     }
                    845:     upper[i] = 0;
1.16      daniel    846:     up = xmlMemStrdup(upper);
1.9       daniel    847:     if (up == NULL) {
                    848:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    849:        return(NULL);
                    850:     }
                    851: 
                    852:     /*
                    853:      * allocate and fill-up an handler block.
                    854:      */
                    855:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    856:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    857:     if (handler == NULL) {
                    858:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    859:        return(NULL);
                    860:     }
                    861:     handler->input = input;
                    862:     handler->output = output;
                    863:     handler->name = up;
                    864: 
                    865:     /*
                    866:      * registers and returns the handler.
                    867:      */
                    868:     xmlRegisterCharEncodingHandler(handler);
1.30      daniel    869: #ifdef DEBUG_ENCODING
                    870:     fprintf(stderr, "Registered encoding handler for %s\n", name);
                    871: #endif
1.9       daniel    872:     return(handler);
                    873: }
                    874: 
                    875: /**
                    876:  * xmlInitCharEncodingHandlers:
                    877:  *
                    878:  * Initialize the char encoding support, it registers the default
                    879:  * encoding supported.
1.18      daniel    880:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    881:  *       in normal processing.
                    882:  */
                    883: void
                    884: xmlInitCharEncodingHandlers(void) {
1.34      daniel    885:     unsigned short int tst = 0x1234;
                    886:     unsigned char *ptr = (unsigned char *) &tst; 
                    887: 
1.9       daniel    888:     if (handlers != NULL) return;
                    889: 
                    890:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    891:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34      daniel    892: 
                    893:     if (*ptr == 0x12) xmlLittleEndian = 0;
                    894:     else if (*ptr == 0x34) xmlLittleEndian = 1;
                    895:     else fprintf(stderr, "Odd problem at endianness detection\n");
1.9       daniel    896: 
                    897:     if (handlers == NULL) {
                    898:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    899:        return;
                    900:     }
1.10      daniel    901:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel    902:     xmlUTF16LEHandler = 
1.28      daniel    903:           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
                    904:     xmlUTF16BEHandler = 
                    905:           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10      daniel    906:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    907: }
                    908: 
                    909: /**
1.19      daniel    910:  * xmlCleanupCharEncodingHandlers:
                    911:  *
                    912:  * Cleanup the memory allocated for the char encoding support, it
                    913:  * unregisters all the encoding handlers.
                    914:  */
                    915: void
                    916: xmlCleanupCharEncodingHandlers(void) {
                    917:     if (handlers == NULL) return;
                    918: 
                    919:     for (;nbCharEncodingHandler > 0;) {
                    920:         nbCharEncodingHandler--;
                    921:        if (handlers[nbCharEncodingHandler] != NULL) {
1.31      daniel    922:            if (handlers[nbCharEncodingHandler]->name != NULL)
                    923:                xmlFree(handlers[nbCharEncodingHandler]->name);
1.19      daniel    924:            xmlFree(handlers[nbCharEncodingHandler]);
                    925:        }
                    926:     }
                    927:     xmlFree(handlers);
                    928:     handlers = NULL;
                    929:     nbCharEncodingHandler = 0;
                    930:     xmlDefaultCharEncodingHandler = NULL;
                    931: }
                    932: 
                    933: /**
1.9       daniel    934:  * xmlRegisterCharEncodingHandler:
                    935:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                    936:  *
                    937:  * Register the char encoding handler, surprizing, isn't it ?
                    938:  */
                    939: void
                    940: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                    941:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    942:     if (handler == NULL) {
                    943:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                    944:        return;
                    945:     }
                    946: 
                    947:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                    948:         fprintf(stderr, 
                    949:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                    950:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                    951:        return;
                    952:     }
                    953:     handlers[nbCharEncodingHandler++] = handler;
                    954: }
                    955: 
                    956: /**
                    957:  * xmlGetCharEncodingHandler:
                    958:  * @enc:  an xmlCharEncoding value.
                    959:  *
                    960:  * Search in the registrered set the handler able to read/write that encoding.
                    961:  *
                    962:  * Returns the handler or NULL if not found
                    963:  */
                    964: xmlCharEncodingHandlerPtr
                    965: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30      daniel    966:     xmlCharEncodingHandlerPtr handler;
                    967: 
1.9       daniel    968:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel    969:     switch (enc) {
                    970:         case XML_CHAR_ENCODING_ERROR:
                    971:            return(NULL);
                    972:         case XML_CHAR_ENCODING_NONE:
                    973:            return(NULL);
                    974:         case XML_CHAR_ENCODING_UTF8:
                    975:            return(NULL);
                    976:         case XML_CHAR_ENCODING_UTF16LE:
                    977:            return(xmlUTF16LEHandler);
                    978:         case XML_CHAR_ENCODING_UTF16BE:
                    979:            return(xmlUTF16BEHandler);
                    980:         case XML_CHAR_ENCODING_EBCDIC:
1.30      daniel    981:             handler = xmlFindCharEncodingHandler("EBCDIC");
                    982:             if (handler != NULL) return(handler);
                    983:             handler = xmlFindCharEncodingHandler("ebcdic");
                    984:             if (handler != NULL) return(handler);
                    985:            break;
1.38    ! daniel    986:         case XML_CHAR_ENCODING_UCS4BE:
1.30      daniel    987:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                    988:             if (handler != NULL) return(handler);
                    989:             handler = xmlFindCharEncodingHandler("UCS-4");
                    990:             if (handler != NULL) return(handler);
                    991:             handler = xmlFindCharEncodingHandler("UCS4");
                    992:             if (handler != NULL) return(handler);
                    993:            break;
1.38    ! daniel    994:         case XML_CHAR_ENCODING_UCS4LE:
        !           995:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
        !           996:             if (handler != NULL) return(handler);
        !           997:             handler = xmlFindCharEncodingHandler("UCS-4");
        !           998:             if (handler != NULL) return(handler);
        !           999:             handler = xmlFindCharEncodingHandler("UCS4");
1.30      daniel   1000:             if (handler != NULL) return(handler);
                   1001:            break;
1.25      daniel   1002:         case XML_CHAR_ENCODING_UCS4_2143:
1.30      daniel   1003:            break;
1.25      daniel   1004:         case XML_CHAR_ENCODING_UCS4_3412:
1.30      daniel   1005:            break;
1.25      daniel   1006:         case XML_CHAR_ENCODING_UCS2:
1.30      daniel   1007:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
                   1008:             if (handler != NULL) return(handler);
                   1009:             handler = xmlFindCharEncodingHandler("UCS-2");
                   1010:             if (handler != NULL) return(handler);
                   1011:             handler = xmlFindCharEncodingHandler("UCS2");
                   1012:             if (handler != NULL) return(handler);
                   1013:            break;
1.25      daniel   1014:         case XML_CHAR_ENCODING_8859_1:
                   1015:         case XML_CHAR_ENCODING_8859_2:
                   1016:         case XML_CHAR_ENCODING_8859_3:
                   1017:         case XML_CHAR_ENCODING_8859_4:
                   1018:         case XML_CHAR_ENCODING_8859_5:
                   1019:         case XML_CHAR_ENCODING_8859_6:
                   1020:         case XML_CHAR_ENCODING_8859_7:
                   1021:         case XML_CHAR_ENCODING_8859_8:
                   1022:         case XML_CHAR_ENCODING_8859_9:
                   1023:            return(NULL);
                   1024:         case XML_CHAR_ENCODING_2022_JP:
1.30      daniel   1025:             handler = xmlFindCharEncodingHandler("ISO-2022-JP");
                   1026:             if (handler != NULL) return(handler);
                   1027:            break;
1.25      daniel   1028:         case XML_CHAR_ENCODING_SHIFT_JIS:
1.30      daniel   1029:             handler = xmlFindCharEncodingHandler("SHIFT-JIS");
                   1030:             if (handler != NULL) return(handler);
                   1031:             handler = xmlFindCharEncodingHandler("SHIFT_JIS");
                   1032:             if (handler != NULL) return(handler);
                   1033:             handler = xmlFindCharEncodingHandler("Shift_JIS");
                   1034:             if (handler != NULL) return(handler);
                   1035:            break;
1.25      daniel   1036:         case XML_CHAR_ENCODING_EUC_JP:
1.30      daniel   1037:             handler = xmlFindCharEncodingHandler("EUC-JP");
                   1038:             if (handler != NULL) return(handler);
                   1039:            break;
                   1040:        default: 
                   1041:            break;
1.25      daniel   1042:     }
1.30      daniel   1043:     
                   1044: #ifdef DEBUG_ENCODING
                   1045:     fprintf(stderr, "No handler found for encoding %d\n", enc);
                   1046: #endif
1.9       daniel   1047:     return(NULL);
                   1048: }
                   1049: 
                   1050: /**
                   1051:  * xmlGetCharEncodingHandler:
                   1052:  * @enc:  a string describing the char encoding.
                   1053:  *
                   1054:  * Search in the registrered set the handler able to read/write that encoding.
                   1055:  *
                   1056:  * Returns the handler or NULL if not found
                   1057:  */
                   1058: xmlCharEncodingHandlerPtr
                   1059: xmlFindCharEncodingHandler(const char *name) {
1.36      daniel   1060:     xmlCharEncodingHandlerPtr enc;
                   1061:     xmlCharEncoding alias;
1.30      daniel   1062: #ifdef LIBXML_ICONV_ENABLED
                   1063:     iconv_t icv_in, icv_out;
                   1064: #endif /* LIBXML_ICONV_ENABLED */
                   1065:     char upper[100];
1.9       daniel   1066:     int i;
                   1067: 
                   1068:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                   1069:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                   1070:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                   1071: 
1.36      daniel   1072:     /*
                   1073:      * Check first for directly registered encoding names
                   1074:      */
1.30      daniel   1075:     for (i = 0;i < 99;i++) {
1.9       daniel   1076:         upper[i] = toupper(name[i]);
                   1077:        if (upper[i] == 0) break;
                   1078:     }
                   1079:     upper[i] = 0;
                   1080: 
                   1081:     for (i = 0;i < nbCharEncodingHandler; i++)
1.30      daniel   1082:         if (!strcmp(upper, handlers[i]->name)) {
                   1083: #ifdef DEBUG_ENCODING
                   1084:             fprintf(stderr, "Found registered handler for encoding %s\n", name);
                   1085: #endif
1.9       daniel   1086:            return(handlers[i]);
1.30      daniel   1087:        }
1.9       daniel   1088: 
1.30      daniel   1089: #ifdef LIBXML_ICONV_ENABLED
                   1090:     /* check whether iconv can handle this */
1.31      daniel   1091:     icv_in = iconv_open("UTF-8", name);
                   1092:     icv_out = iconv_open(name, "UTF-8");
1.30      daniel   1093:     if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31      daniel   1094:            enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32      daniel   1095:            if (enc == NULL) {
                   1096:                iconv_close(icv_in);
                   1097:                iconv_close(icv_out);
                   1098:                return(NULL);
                   1099:            }
                   1100:            enc->name = NULL;
1.30      daniel   1101:            enc->input = NULL;
                   1102:            enc->output = NULL;
                   1103:            enc->iconv_in = icv_in;
                   1104:            enc->iconv_out = icv_out;
                   1105: #ifdef DEBUG_ENCODING
                   1106:             fprintf(stderr, "Found iconv handler for encoding %s\n", name);
                   1107: #endif
                   1108:            return enc;
                   1109:     } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
                   1110:            fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
                   1111:     }
                   1112: #endif /* LIBXML_ICONV_ENABLED */
1.38    ! daniel   1113: 
1.30      daniel   1114: #ifdef DEBUG_ENCODING
                   1115:     fprintf(stderr, "No handler found for encoding %s\n", name);
                   1116: #endif
1.38    ! daniel   1117: 
        !          1118:     /*
        !          1119:      * Fallback using the canonical names
        !          1120:      */
        !          1121:     alias = xmlParseCharEncoding(name);
        !          1122:     if (alias != XML_CHAR_ENCODING_ERROR) {
        !          1123:         const char* canon;
        !          1124:         canon = xmlGetCharEncodingName(alias);
        !          1125:         if ((canon != NULL) && (strcmp(name, canon))) {
        !          1126:            return(xmlFindCharEncodingHandler(canon));
        !          1127:         }
        !          1128:     }
        !          1129: 
1.9       daniel   1130:     return(NULL);
1.30      daniel   1131: }
                   1132: 
                   1133: #ifdef LIBXML_ICONV_ENABLED
                   1134: /**
                   1135:  * xmlIconvWrapper:
                   1136:  * @cd:                iconv converter data structure
                   1137:  * @out:  a pointer to an array of bytes to store the result
                   1138:  * @outlen:  the length of @out
                   1139:  * @in:  a pointer to an array of ISO Latin 1 chars
                   1140:  * @inlen:  the length of @in
                   1141:  *
                   1142:  * Returns 0 if success, or 
                   1143:  *     -1 by lack of space, or
                   1144:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1145:  *        the result of transformation can't fit into the encoding we want), or
                   1146:  *     -3 if there the last byte can't form a single output char.
                   1147:  *     
                   1148:  * The value of @inlen after return is the number of octets consumed
                   1149:  *     as the return value is positive, else unpredictiable.
                   1150:  * The value of @outlen after return is the number of ocetes consumed.
                   1151:  */
                   1152: static int
                   1153: xmlIconvWrapper(iconv_t cd,
                   1154:        unsigned char *out, int *outlen,
                   1155:        const unsigned char *in, int *inlen) {
                   1156: 
                   1157:        size_t icv_inlen = *inlen, icv_outlen = *outlen;
                   1158:        const char *icv_in = (const char *) in;
                   1159:        char *icv_out = (char *) out;
                   1160:        int ret;
                   1161: 
                   1162:        ret = iconv(cd,
                   1163:                &icv_in, &icv_inlen,
                   1164:                &icv_out, &icv_outlen);
1.35      daniel   1165:        if (in != NULL) {
                   1166:            *inlen -= icv_inlen;
                   1167:            *outlen -= icv_outlen;
                   1168:        } else {
                   1169:            *inlen = 0;
                   1170:            *outlen = 0;
                   1171:        }
1.30      daniel   1172:        if (icv_inlen != 0 || ret == (size_t) -1) {
                   1173: #ifdef EILSEQ
                   1174:                if (errno == EILSEQ) {
1.31      daniel   1175:                        return -2;
1.30      daniel   1176:                } else
                   1177: #endif
                   1178: #ifdef E2BIG
                   1179:                if (errno == E2BIG) {
                   1180:                        return -1;
                   1181:                } else
                   1182: #endif
                   1183: #ifdef EINVAL
                   1184:                if (errno == EINVAL) {
1.31      daniel   1185:                        return -3;
1.30      daniel   1186:                }
                   1187: #endif
                   1188:                else {
                   1189:                        return -3;
                   1190:                }
                   1191:        }
                   1192:        return 0;
                   1193: }
                   1194: #endif /* LIBXML_ICONV_ENABLED */
1.38    ! daniel   1195: 
        !          1196: /**
        !          1197:  * xmlCharEncFirstLine:
        !          1198:  * @handler:   char enconding transformation data structure
        !          1199:  * @out:  an xmlBuffer for the output.
        !          1200:  * @in:  an xmlBuffer for the input
        !          1201:  *     
        !          1202:  * Front-end for the encoding handler input function, but handle only
        !          1203:  * the very first line, i.e. limit itself to 45 chars.
        !          1204:  *     
        !          1205:  * Returns the number of byte written if success, or 
        !          1206:  *     -1 general error
        !          1207:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
        !          1208:  *        the result of transformation can't fit into the encoding we want), or
        !          1209:  */
        !          1210: int
        !          1211: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
        !          1212:                  xmlBufferPtr in) {
        !          1213:     int ret = -2;
        !          1214:     int written;
        !          1215:     int toconv;
        !          1216: 
        !          1217:     if (handler == NULL) return(-1);
        !          1218:     if (out == NULL) return(-1);
        !          1219:     if (in == NULL) return(-1);
        !          1220: 
        !          1221:     written = out->size - out->use;
        !          1222:     toconv = in->use;
        !          1223:     if (toconv * 2 >= written) {
        !          1224:         xmlBufferGrow(out, toconv * 2);
        !          1225:        written = out->size - out->use - 1;
        !          1226:     }
        !          1227:     /*
        !          1228:      * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
        !          1229:      * 45 chars should be sufficient to reach the end of the encoding
        !          1230:      * decalration without going too far inside the document content.
        !          1231:      */
        !          1232:     written = 45;
        !          1233: 
        !          1234:     if (handler->input != NULL) {
        !          1235:        ret = handler->input(&out->content[out->use], &written,
        !          1236:                             in->content, &toconv);
        !          1237:        xmlBufferShrink(in, toconv);
        !          1238:        out->use += written;
        !          1239:        out->content[out->use] = 0;
        !          1240:     }
        !          1241: #ifdef LIBXML_ICONV_ENABLED
        !          1242:     else if (handler->iconv_in != NULL) {
        !          1243:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
        !          1244:                              &written, in->content, &toconv);
        !          1245:        xmlBufferShrink(in, toconv);
        !          1246:        out->use += written;
        !          1247:        out->content[out->use] = 0;
        !          1248:        if (ret == -1) ret = -3;
        !          1249:     }
        !          1250: #endif /* LIBXML_ICONV_ENABLED */
        !          1251: #ifdef DEBUG_ENCODING
        !          1252:     switch (ret) {
        !          1253:         case 0:
        !          1254:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
        !          1255:                    toconv, written);
        !          1256:            break;
        !          1257:         case -1:
        !          1258:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
        !          1259:                    toconv, written, in->use);
        !          1260:            break;
        !          1261:         case -2:
        !          1262:            fprintf(stderr, "input conversion failed due to input error\n");
        !          1263:            break;
        !          1264:         case -3:
        !          1265:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
        !          1266:                    toconv, written, in->use);
        !          1267:            break;
        !          1268:        default:
        !          1269:            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
        !          1270:     }
        !          1271: #endif
        !          1272:     /*
        !          1273:      * Ignore when input buffer is not on a boundary
        !          1274:      */
        !          1275:     if (ret == -3) ret = 0;
        !          1276:     if (ret == -1) ret = 0;
        !          1277:     return(ret);
        !          1278: }
1.30      daniel   1279: 
                   1280: /**
                   1281:  * xmlCharEncInFunc:
                   1282:  * @handler:   char enconding transformation data structure
1.31      daniel   1283:  * @out:  an xmlBuffer for the output.
                   1284:  * @in:  an xmlBuffer for the input
1.30      daniel   1285:  *     
                   1286:  * Generic front-end for the encoding handler input function
                   1287:  *     
1.31      daniel   1288:  * Returns the number of byte written if success, or 
                   1289:  *     -1 general error
1.30      daniel   1290:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1291:  *        the result of transformation can't fit into the encoding we want), or
                   1292:  */
                   1293: int
1.31      daniel   1294: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1295:                  xmlBufferPtr in) {
1.30      daniel   1296:     int ret = -2;
1.31      daniel   1297:     int written;
                   1298:     int toconv;
1.30      daniel   1299: 
1.31      daniel   1300:     if (handler == NULL) return(-1);
                   1301:     if (out == NULL) return(-1);
                   1302:     if (in == NULL) return(-1);
                   1303: 
                   1304:     written = out->size - out->use;
                   1305:     toconv = in->use;
                   1306:     if (toconv * 2 >= written) {
                   1307:         xmlBufferGrow(out, toconv * 2);
1.33      daniel   1308:        written = out->size - out->use - 1;
1.31      daniel   1309:     }
1.30      daniel   1310:     if (handler->input != NULL) {
1.32      daniel   1311:        ret = handler->input(&out->content[out->use], &written,
1.31      daniel   1312:                             in->content, &toconv);
                   1313:        xmlBufferShrink(in, toconv);
                   1314:        out->use += written;
1.33      daniel   1315:        out->content[out->use] = 0;
1.30      daniel   1316:     }
                   1317: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1318:     else if (handler->iconv_in != NULL) {
                   1319:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1320:                              &written, in->content, &toconv);
                   1321:        xmlBufferShrink(in, toconv);
                   1322:        out->use += written;
1.33      daniel   1323:        out->content[out->use] = 0;
                   1324:        if (ret == -1) ret = -3;
1.30      daniel   1325:     }
                   1326: #endif /* LIBXML_ICONV_ENABLED */
                   1327: #ifdef DEBUG_ENCODING
                   1328:     switch (ret) {
                   1329:         case 0:
                   1330:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31      daniel   1331:                    toconv, written);
1.30      daniel   1332:            break;
                   1333:         case -1:
1.31      daniel   1334:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1335:                    toconv, written, in->use);
1.30      daniel   1336:            break;
                   1337:         case -2:
                   1338:            fprintf(stderr, "input conversion failed due to input error\n");
                   1339:            break;
                   1340:         case -3:
1.31      daniel   1341:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1342:                    toconv, written, in->use);
1.30      daniel   1343:            break;
                   1344:        default:
                   1345:            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
                   1346:     }
                   1347: #endif
1.33      daniel   1348:     /*
                   1349:      * Ignore when input buffer is not on a boundary
                   1350:      */
                   1351:     if (ret == -3) ret = 0;
1.30      daniel   1352:     return(ret);
                   1353: }
                   1354: 
                   1355: /**
                   1356:  * xmlCharEncOutFunc:
                   1357:  * @handler:   char enconding transformation data structure
1.31      daniel   1358:  * @out:  an xmlBuffer for the output.
                   1359:  * @in:  an xmlBuffer for the input
                   1360:  *     
                   1361:  * Generic front-end for the encoding handler output function
1.35      daniel   1362:  * a first call with @in == NULL has to be made firs to initiate the 
                   1363:  * output in case of non-stateless encoding needing to initiate their
                   1364:  * state or the output (like the BOM in UTF16).
1.30      daniel   1365:  *     
1.31      daniel   1366:  * Returns the number of byte written if success, or 
                   1367:  *     -1 general error
1.30      daniel   1368:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1369:  *        the result of transformation can't fit into the encoding we want), or
                   1370:  */
                   1371: int
1.31      daniel   1372: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1373:                   xmlBufferPtr in) {
1.30      daniel   1374:     int ret = -2;
1.31      daniel   1375:     int written;
                   1376:     int toconv;
                   1377: 
                   1378:     if (handler == NULL) return(-1);
                   1379:     if (out == NULL) return(-1);
1.35      daniel   1380:     written = out->size - out->use;
                   1381: 
                   1382:     if (in == NULL) {
                   1383:         toconv = 0;
                   1384:        if (handler->output != NULL) {
                   1385:            ret = handler->output(&out->content[out->use], &written,
                   1386:                                  NULL, &toconv);
                   1387:            out->use += written;
                   1388:            out->content[out->use] = 0;
                   1389:        }
                   1390: #ifdef LIBXML_ICONV_ENABLED
                   1391:        else if (handler->iconv_out != NULL) {
                   1392:            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1393:                                  &written, NULL, &toconv);
                   1394:            out->use += written;
                   1395:            out->content[out->use] = 0;
                   1396:        }
                   1397: #endif /* LIBXML_ICONV_ENABLED */
                   1398: #ifdef DEBUG_ENCODING
                   1399:        fprintf(stderr, "initialized encoder\n");
                   1400: #endif
                   1401:         return(0);
                   1402:     }
1.30      daniel   1403: 
1.33      daniel   1404:     toconv = in->use;
                   1405:     if (toconv * 2 >= written) {
                   1406:         xmlBufferGrow(out, toconv * 2);
                   1407:        written = out->size - out->use - 1;
                   1408:     }
1.30      daniel   1409:     if (handler->output != NULL) {
1.33      daniel   1410:        ret = handler->output(&out->content[out->use], &written,
1.35      daniel   1411:                              in->content, &toconv);
1.31      daniel   1412:        xmlBufferShrink(in, toconv);
                   1413:        out->use += written;
1.33      daniel   1414:        out->content[out->use] = 0;
1.30      daniel   1415:     }
                   1416: #ifdef LIBXML_ICONV_ENABLED
                   1417:     else if (handler->iconv_out != NULL) {
1.31      daniel   1418:        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1419:                              &written, in->content, &toconv);
                   1420:        xmlBufferShrink(in, toconv);
                   1421:        out->use += written;
1.33      daniel   1422:        out->content[out->use] = 0;
                   1423:        if (ret == -1) ret = -3;
1.30      daniel   1424:     }
                   1425: #endif /* LIBXML_ICONV_ENABLED */
                   1426: #ifdef DEBUG_ENCODING
                   1427:     switch (ret) {
                   1428:         case 0:
                   1429:            fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31      daniel   1430:                    toconv, written);
1.30      daniel   1431:            break;
                   1432:         case -1:
                   1433:            fprintf(stderr, "output conversion failed by lack of space\n");
                   1434:            break;
                   1435:         case -2:
                   1436:            fprintf(stderr, "output conversion failed due to output error\n");
                   1437:            break;
                   1438:         case -3:
1.31      daniel   1439:            fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
                   1440:                    toconv, written, in->use);
1.30      daniel   1441:            break;
                   1442:        default:
                   1443:            fprintf(stderr,"Unknown output conversion failed %d\n", ret);
                   1444:     }
                   1445: #endif
                   1446:     return(ret);
                   1447: }
                   1448: 
                   1449: /**
                   1450:  * xmlCharEncCloseFunc:
                   1451:  * @handler:   char enconding transformation data structure
                   1452:  *     
                   1453:  * Generic front-end for hencoding handler close function
                   1454:  *
                   1455:  * Returns 0 if success, or -1 in case of error
                   1456:  */
                   1457: int
                   1458: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
                   1459:     int ret = 0;
1.31      daniel   1460:     if (handler == NULL) return(-1);
                   1461:     if (handler->name == NULL) return(-1);
1.30      daniel   1462: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1463:     /*
                   1464:      * Iconv handlers can be oused only once, free the whole block.
                   1465:      * and the associated icon resources.
                   1466:      */
1.32      daniel   1467:     if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
                   1468:        if (handler->name != NULL)
                   1469:            xmlFree(handler->name);
                   1470:        handler->name = NULL;
                   1471:        if (handler->iconv_out != NULL) {
                   1472:            if (iconv_close(handler->iconv_out))
                   1473:                ret = -1;
                   1474:            handler->iconv_out = NULL;
                   1475:        }
                   1476:        if (handler->iconv_in != NULL) {
                   1477:            if (iconv_close(handler->iconv_in))
                   1478:                ret = -1;
                   1479:            handler->iconv_in = NULL;
                   1480:        }
                   1481:        xmlFree(handler);
1.30      daniel   1482:     }
                   1483: #endif /* LIBXML_ICONV_ENABLED */
                   1484: #ifdef DEBUG_ENCODING
                   1485:     if (ret)
                   1486:         fprintf(stderr, "failed to close the encoding handler\n");
                   1487:     else
                   1488:         fprintf(stderr, "closed the encoding handler\n");
                   1489: 
                   1490: #endif
                   1491:     return(ret);
1.9       daniel   1492: }
                   1493: 

Webmaster