Annotation of XML/encoding.c, revision 1.35

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
                      6:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      7:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      8:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                      9:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     10:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     11:  *                described in Unicode Technical Report #4.
                     12:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     13:  *                Information Interchange, ANSI X3.4-1986.
                     14:  *
1.9       daniel     15:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     16:  *
                     17:  * See Copyright for the status of this software.
                     18:  *
                     19:  * Daniel.Veillard@w3.org
                     20:  */
                     21: 
1.21      daniel     22: #ifdef WIN32
                     23: #include "win32config.h"
                     24: #else
1.14      daniel     25: #include "config.h"
1.17      daniel     26: #endif
                     27: 
                     28: #include <stdio.h>
                     29: #include <string.h>
                     30: 
                     31: #ifdef HAVE_CTYPE_H
1.7       daniel     32: #include <ctype.h>
1.17      daniel     33: #endif
1.20      daniel     34: #ifdef HAVE_STDLIB_H
                     35: #include <stdlib.h>
                     36: #endif
1.30      daniel     37: #include <libxml/xmlversion.h>
                     38: #ifdef LIBXML_ICONV_ENABLED
                     39: #ifdef HAVE_ERRNO_H
                     40: #include <errno.h>
                     41: #endif
                     42: #endif
1.29      daniel     43: #include <libxml/encoding.h>
                     44: #include <libxml/xmlmemory.h>
1.3       daniel     45: 
/* Global pointers to the UTF-16LE and UTF-16BE encoding handlers; both are NULL until assigned (the assignment is not visible in this part of the file). */
1.25      daniel     46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
                     47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
                     48: 
1.30      daniel     49: #ifdef LIBXML_ICONV_ENABLED
1.33      daniel     50: #if 0
1.30      daniel     51: #define DEBUG_ENCODING  /* Define this to get encoding traces */
                     52: #endif
1.33      daniel     53: #endif
1.30      daniel     54: 
/* Host byte-order flag: non-zero means the machine is treated as little-endian. Starts out as 1 (little-endian assumed). */
1.34      daniel     55: static int xmlLittleEndian = 1;
                     56: 
1.3       daniel     57: /*
                     58:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     59:  *
                     60:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     61:  * 0000 0000-0000 007F   0xxxxxxx
                     62:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     63:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     64:  *
                     65:  * I hope we won't use values > 0xFFFF anytime soon !
                     66:  */
1.1       daniel     67: 
                     68: /**
1.22      daniel     69:  * xmlCheckUTF8: Check utf-8 string for legality.
                     70:  * @utf: Pointer to putative utf-8 encoded string.
                     71:  *
                     72:  * Checks @utf for being valid utf-8. @utf is assumed to be
                     73:  * null-terminated. This function is not super-strict, as it will
                     74:  * allow longer utf-8 sequences than necessary. Note that Java is
                     75:  * capable of producing these sequences if provoked. Also note, this
                     76:  * routine checks for the 4-byte maxiumum size, but does not check for
                     77:  * 0x10ffff maximum value.
                     78:  *
                     79:  * Return value: true if @utf is valid.
                     80:  **/
                     81: int
                     82: xmlCheckUTF8(const unsigned char *utf)
                     83: {
                     84:     int ix;
                     85:     unsigned char c;
                     86: 
                     87:     for (ix = 0; (c = utf[ix]);) {
                     88:         if (c & 0x80) {
                     89:            if ((utf[ix + 1] & 0xc0) != 0x80)
                     90:                return(0);
                     91:            if ((c & 0xe0) == 0xe0) {
                     92:                if ((utf[ix + 2] & 0xc0) != 0x80)
                     93:                    return(0);
                     94:                if ((c & 0xf0) == 0xf0) {
                     95:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                     96:                        return(0);
                     97:                    ix += 4;
                     98:                    /* 4-byte code */
                     99:                } else
                    100:                  /* 3-byte code */
                    101:                    ix += 3;
                    102:            } else
                    103:              /* 2-byte code */
                    104:                ix += 2;
                    105:        } else
                    106:            /* 1-byte code */
                    107:            ix++;
                    108:       }
                    109:       return(1);
                    110: }
                    111: 
                    112: /**
1.1       daniel    113:  * isolat1ToUTF8:
1.18      daniel    114:  * @out:  a pointer to an array of bytes to store the result
                    115:  * @outlen:  the length of @out
                    116:  * @in:  a pointer to an array of ISO Latin 1 chars
                    117:  * @inlen:  the length of @in
1.1       daniel    118:  *
                    119:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    120:  * block of chars out.
1.33      daniel    121:  * Returns 0 if success, or -1 otherwise
                    122:  * The value of @inlen after return is the number of octets consumed
                    123:  *     as the return value is positive, else unpredictiable.
                    124:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    125:  */
                    126: int
1.33      daniel    127: isolat1ToUTF8(unsigned char* out, int *outlen,
1.25      daniel    128:               const unsigned char* in, int *inlen) {
1.33      daniel    129:     unsigned char* outstart = out;
                    130:     const unsigned char* processed = in;
                    131:     unsigned char* outend = out + *outlen;
                    132:     const unsigned char* inend = in + *inlen;
1.1       daniel    133:     unsigned char c;
                    134: 
                    135:     while (in < inend) {
                    136:         c= *in++;
                    137:         if (c < 0x80) {
1.33      daniel    138:             if (out >= outend)
                    139:                break;
1.1       daniel    140:             *out++ = c;
                    141:         }
                    142:         else {
1.33      daniel    143:             if (out + 1 >= outend)  break;
1.1       daniel    144:             *out++ = 0xC0 | (c >> 6);
                    145:             *out++ = 0x80 | (0x3F & c);
                    146:         }
1.33      daniel    147:        processed = in;
1.1       daniel    148:     }
1.33      daniel    149:     *outlen = out - outstart;
                    150:     *inlen = processed - in;
                    151: 
                    152:     return(0);
1.1       daniel    153: }
                    154: 
                    155: /**
                    156:  * UTF8Toisolat1:
1.18      daniel    157:  * @out:  a pointer to an array of bytes to store the result
                    158:  * @outlen:  the length of @out
                    159:  * @in:  a pointer to an array of UTF-8 chars
                    160:  * @inlen:  the length of @in
1.1       daniel    161:  *
                    162:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    163:  * block of chars out.
1.15      daniel    164:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    165:  *
1.33      daniel    166:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1.28      daniel    167:  * The value of @inlen after return is the number of octets consumed
                    168:  *     as the return value is positive, else unpredictiable.
1.33      daniel    169:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    170:  */
                    171: int
1.33      daniel    172: UTF8Toisolat1(unsigned char* out, int *outlen,
1.25      daniel    173:               const unsigned char* in, int *inlen) {
1.33      daniel    174:     unsigned char* outstart = out;
                    175:     const unsigned char* processed = in;
                    176:     unsigned char* outend = out + *outlen;
                    177:     const unsigned char* inend = in + *inlen;
1.1       daniel    178:     unsigned char c;
                    179: 
                    180:     while (in < inend) {
                    181:         c= *in++;
                    182:         if (c < 0x80) {
1.28      daniel    183:             if (out >= outend)  return(-1);
1.1       daniel    184:             *out++= c;
                    185:         }
1.23      daniel    186:        else if (in == inend) {
                    187:             break;
                    188:        }
                    189:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    190:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    191:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    192:        }
1.33      daniel    193:        else {
                    194:            *outlen = out - outstart;
                    195:            *inlen = processed - in;
1.28      daniel    196:            return(-2);
1.33      daniel    197:        }
                    198:        processed = in;
1.1       daniel    199:     }
1.33      daniel    200:     *outlen = out - outstart;
                    201:     *inlen = processed - in;
                    202:     return(0);
1.1       daniel    203: }
                    204: 
                    205: /**
1.28      daniel    206:  * UTF16LEToUTF8:
                    207:  * @out:  a pointer to an array of bytes to store the result
                    208:  * @outlen:  the length of @out
                    209:  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
                    210:  * @inlenb:  the length of @in in UTF-16LE chars
                    211:  *
                    212:  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
                    213:  * block of chars out. This function assume the endian properity
                    214:  * is the same between the native type of this machine and the
                    215:  * inputed one.
                    216:  *
                    217:  * Returns the number of byte written, or -1 by lack of space, or -2
                    218:  *     if the transcoding fails (for *in is not valid utf16 string)
                    219:  *     The value of *inlen after return is the number of octets consumed
                    220:  *     as the return value is positive, else unpredictiable.
                    221:  */
                    222: int
1.33      daniel    223: UTF16LEToUTF8(unsigned char* out, int *outlen,
1.28      daniel    224:             const unsigned char* inb, int *inlenb)
                    225: {
1.33      daniel    226:     unsigned char* outstart = out;
                    227:     const unsigned char* processed = inb;
                    228:     unsigned char* outend = out + *outlen;
1.28      daniel    229:     unsigned short* in = (unsigned short*) inb;
                    230:     unsigned short* inend;
                    231:     unsigned int c, d, inlen;
                    232:     unsigned char *tmp;
                    233:     int bits;
                    234: 
                    235:     if ((*inlenb % 2) == 1)
                    236:         (*inlenb)--;
                    237:     inlen = *inlenb / 2;
1.33      daniel    238:     inend = in + inlen;
1.28      daniel    239:     while (in < inend) {
1.34      daniel    240:         if (xmlLittleEndian) {
                    241:            c= *in++;
                    242:        } else {
                    243:            tmp = (unsigned char *) in;
                    244:            c = *tmp++;
                    245:            c = c | (((unsigned int)*tmp) << 8);
                    246:            in++;
                    247:        }
1.28      daniel    248:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                    249:             if (in >= inend) {           /* (in > inend) shouldn't happens */
                    250:                 break;
                    251:             }
1.34      daniel    252:            if (xmlLittleEndian) {
                    253:                d = *in++;
                    254:            } else {
                    255:                tmp = (unsigned char *) in;
                    256:                d = *tmp++;
                    257:                d = d | (((unsigned int)*tmp) << 8);
                    258:                in++;
                    259:            }
1.28      daniel    260:             if ((d & 0xFC00) == 0xDC00) {
                    261:                 c &= 0x03FF;
                    262:                 c <<= 10;
                    263:                 c |= d & 0x03FF;
                    264:                 c += 0x10000;
                    265:             }
1.33      daniel    266:             else {
                    267:                *outlen = out - outstart;
                    268:                *inlenb = processed - inb;
1.28      daniel    269:                return(-2);
1.33      daniel    270:            }
1.28      daniel    271:         }
                    272: 
                    273:        /* assertion: c is a single UTF-4 value */
                    274:         if (out >= outend)
1.33      daniel    275:            break;
1.28      daniel    276:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    277:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    278:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    279:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                    280:  
                    281:         for ( ; bits >= 0; bits-= 6) {
                    282:             if (out >= outend)
1.33      daniel    283:                break;
1.28      daniel    284:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    285:         }
1.33      daniel    286:        processed = (const unsigned char*) in;
1.28      daniel    287:     }
1.33      daniel    288:     *outlen = out - outstart;
                    289:     *inlenb = processed - inb;
                    290:     return(0);
1.28      daniel    291: }
                    292: 
                    293: /**
                    294:  * UTF8ToUTF16LE:
                    295:  * @outb:  a pointer to an array of bytes to store the result
                    296:  * @outlen:  the length of @outb
                    297:  * @in:  a pointer to an array of UTF-8 chars
                    298:  * @inlen:  the length of @in
                    299:  *
                    300:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
                    301:  * block of chars out.
                    302:  * TODO: UTF8ToUTF16LE need a fallback mechanism ...
                    303:  *
                    304:  * Returns the number of byte written, or -1 by lack of space, or -2
                    305:  *     if the transcoding failed. 
                    306:  */
                    307: int
1.33      daniel    308: UTF8ToUTF16LE(unsigned char* outb, int *outlen,
1.28      daniel    309:             const unsigned char* in, int *inlen)
                    310: {
                    311:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    312:     const unsigned char* processed = in;
1.28      daniel    313:     unsigned short* outstart= out;
                    314:     unsigned short* outend;
                    315:     const unsigned char* inend= in+*inlen;
                    316:     unsigned int c, d, trailing;
                    317:     unsigned char *tmp;
                    318:     unsigned short tmp1, tmp2;
                    319: 
1.33      daniel    320:     outend = out + (*outlen / 2);
1.28      daniel    321:     while (in < inend) {
                    322:       d= *in++;
                    323:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    324:       else if (d < 0xC0) {
                    325:           /* trailing byte in leading position */
                    326:          *outlen = out - outstart;
                    327:          *inlen = processed - in;
                    328:          return(-2);
                    329:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.28      daniel    330:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    331:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    332:       else {
                    333:        /* no chance for this in UTF-16 */
                    334:        *outlen = out - outstart;
                    335:        *inlen = processed - in;
                    336:        return(-2);
                    337:       }
1.28      daniel    338: 
                    339:       if (inend - in < trailing) {
                    340:           break;
                    341:       } 
                    342: 
                    343:       for ( ; trailing; trailing--) {
                    344:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1.33      daniel    345:              break;
1.28      daniel    346:           c <<= 6;
                    347:           c |= d & 0x3F;
                    348:       }
                    349: 
                    350:       /* assertion: c is a single UTF-4 value */
                    351:         if (c < 0x10000) {
                    352:             if (out >= outend)
1.33      daniel    353:                break;
1.34      daniel    354:            if (xmlLittleEndian) {
                    355:                *out++ = c;
                    356:            } else {
                    357:                tmp = (unsigned char *) out;
                    358:                *tmp = c ;
                    359:                *(tmp + 1) = c >> 8 ;
                    360:                out++;
                    361:            }
1.28      daniel    362:         }
                    363:         else if (c < 0x110000) {
                    364:             if (out+1 >= outend)
1.33      daniel    365:                break;
1.28      daniel    366:             c -= 0x10000;
1.34      daniel    367:            if (xmlLittleEndian) {
                    368:                *out++ = 0xD800 | (c >> 10);
                    369:                *out++ = 0xDC00 | (c & 0x03FF);
                    370:            } else {
                    371:                tmp1 = 0xD800 | (c >> 10);
                    372:                tmp = (unsigned char *) out;
                    373:                *tmp = tmp1;
                    374:                *(tmp + 1) = tmp1 >> 8;
                    375:                out++;
                    376: 
                    377:                tmp2 = 0xDC00 | (c & 0x03FF);
                    378:                tmp = (unsigned char *) out;
                    379:                *tmp  = tmp2;
                    380:                *(tmp + 1) = tmp2 >> 8;
                    381:                out++;
                    382:            }
1.28      daniel    383:         }
                    384:         else
1.33      daniel    385:            break;
                    386:        processed = in;
1.28      daniel    387:     }
1.33      daniel    388:     *outlen = out - outstart;
                    389:     *inlen = processed - in;
                    390:     return(0);
1.28      daniel    391: }
                    392: 
                    393: /**
                    394:  * UTF16BEToUTF8:
1.18      daniel    395:  * @out:  a pointer to an array of bytes to store the result
                    396:  * @outlen:  the length of @out
1.25      daniel    397:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    398:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    399:  *
                    400:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
1.28      daniel    401:  * block of chars out. This function assume the endian properity
                    402:  * is the same between the native type of this machine and the
                    403:  * inputed one.
1.25      daniel    404:  *
1.28      daniel    405:  * Returns the number of byte written, or -1 by lack of space, or -2
                    406:  *     if the transcoding fails (for *in is not valid utf16 string)
                    407:  * The value of *inlen after return is the number of octets consumed
                    408:  *     as the return value is positive, else unpredictiable.
1.1       daniel    409:  */
                    410: int
1.33      daniel    411: UTF16BEToUTF8(unsigned char* out, int *outlen,
1.25      daniel    412:             const unsigned char* inb, int *inlenb)
1.1       daniel    413: {
1.33      daniel    414:     unsigned char* outstart = out;
                    415:     const unsigned char* processed = inb;
                    416:     unsigned char* outend = out + *outlen;
1.25      daniel    417:     unsigned short* in = (unsigned short*) inb;
                    418:     unsigned short* inend;
                    419:     unsigned int c, d, inlen;
1.28      daniel    420:     unsigned char *tmp;
1.1       daniel    421:     int bits;
                    422: 
1.28      daniel    423:     if ((*inlenb % 2) == 1)
                    424:         (*inlenb)--;
1.25      daniel    425:     inlen = *inlenb / 2;
                    426:     inend= in + inlen;
1.1       daniel    427:     while (in < inend) {
1.34      daniel    428:        if (xmlLittleEndian) {
                    429:            tmp = (unsigned char *) in;
                    430:            c = *tmp++;
                    431:            c = c << 8;
                    432:            c = c | (unsigned int) *tmp;
                    433:            in++;
                    434:        } else {
                    435:            c= *in++;
                    436:        } 
1.1       daniel    437:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.28      daniel    438:            if (in >= inend) {           /* (in > inend) shouldn't happens */
1.33      daniel    439:                *outlen = out - outstart;
                    440:                *inlenb = processed - inb;
                    441:                return(-2);
1.28      daniel    442:            }
1.34      daniel    443:            if (xmlLittleEndian) {
                    444:                tmp = (unsigned char *) in;
                    445:                d = *tmp++;
                    446:                d = d << 8;
                    447:                d = d | (unsigned int) *tmp;
                    448:                in++;
                    449:            } else {
                    450:                d= *in++;
                    451:            }
1.28      daniel    452:             if ((d & 0xFC00) == 0xDC00) {
1.1       daniel    453:                 c &= 0x03FF;
                    454:                 c <<= 10;
                    455:                 c |= d & 0x03FF;
                    456:                 c += 0x10000;
                    457:             }
1.33      daniel    458:             else {
                    459:                *outlen = out - outstart;
                    460:                *inlenb = processed - inb;
1.28      daniel    461:                return(-2);
1.33      daniel    462:            }
1.1       daniel    463:         }
                    464: 
1.25      daniel    465:        /* assertion: c is a single UTF-4 value */
1.27      daniel    466:         if (out >= outend) 
1.33      daniel    467:            break;
1.1       daniel    468:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    469:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    470:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    471:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    472:  
1.26      daniel    473:         for ( ; bits >= 0; bits-= 6) {
1.27      daniel    474:             if (out >= outend) 
1.33      daniel    475:                break;
1.26      daniel    476:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    477:         }
1.33      daniel    478:        processed = (const unsigned char*) in;
1.1       daniel    479:     }
1.33      daniel    480:     *outlen = out - outstart;
                    481:     *inlenb = processed - inb;
                    482:     return(0);
1.1       daniel    483: }
                    484: 
                    485: /**
1.28      daniel    486:  * UTF8ToUTF16BE:
1.25      daniel    487:  * @outb:  a pointer to an array of bytes to store the result
                    488:  * @outlen:  the length of @outb
1.18      daniel    489:  * @in:  a pointer to an array of UTF-8 chars
                    490:  * @inlen:  the length of @in
1.1       daniel    491:  *
1.28      daniel    492:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
1.1       daniel    493:  * block of chars out.
1.28      daniel    494:  * TODO: UTF8ToUTF16BE need a fallback mechanism ...
1.15      daniel    495:  *
1.6       daniel    496:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    497:  *     if the transcoding failed. 
1.1       daniel    498:  */
                    499: int
1.33      daniel    500: UTF8ToUTF16BE(unsigned char* outb, int *outlen,
1.25      daniel    501:             const unsigned char* in, int *inlen)
1.1       daniel    502: {
1.25      daniel    503:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    504:     const unsigned char* processed = in;
1.1       daniel    505:     unsigned short* outstart= out;
1.28      daniel    506:     unsigned short* outend;
1.25      daniel    507:     const unsigned char* inend= in+*inlen;
1.1       daniel    508:     unsigned int c, d, trailing;
1.28      daniel    509:     unsigned char *tmp;
                    510:     unsigned short tmp1, tmp2;
1.1       daniel    511: 
1.33      daniel    512:     outend = out + (*outlen / 2);
1.1       daniel    513:     while (in < inend) {
                    514:       d= *in++;
                    515:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    516:       else if (d < 0xC0)  {
                    517:           /* trailing byte in leading position */
                    518:          *outlen = out - outstart;
                    519:          *inlen = processed - in;
                    520:          return(-2);
                    521:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.1       daniel    522:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    523:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    524:       else {
                    525:           /* no chance for this in UTF-16 */
                    526:          *outlen = out - outstart;
                    527:          *inlen = processed - in;
                    528:          return(-2);
                    529:       }
1.28      daniel    530: 
                    531:       if (inend - in < trailing) {
                    532:           break;
                    533:       } 
1.1       daniel    534: 
                    535:       for ( ; trailing; trailing--) {
1.33      daniel    536:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1.1       daniel    537:           c <<= 6;
                    538:           c |= d & 0x3F;
                    539:       }
                    540: 
                    541:       /* assertion: c is a single UTF-4 value */
                    542:         if (c < 0x10000) {
1.33      daniel    543:             if (out >= outend)  break;
1.34      daniel    544:            if (xmlLittleEndian) {
                    545:                tmp = (unsigned char *) out;
                    546:                *tmp = c >> 8;
                    547:                *(tmp + 1) = c;
                    548:                out++;
                    549:            } else {
                    550:                *out++ = c;
                    551:            }
1.1       daniel    552:         }
                    553:         else if (c < 0x110000) {
1.33      daniel    554:             if (out+1 >= outend)  break;
1.1       daniel    555:             c -= 0x10000;
1.34      daniel    556:            if (xmlLittleEndian) {
                    557:                tmp1 = 0xD800 | (c >> 10);
                    558:                tmp = (unsigned char *) out;
                    559:                *tmp = tmp1 >> 8;
                    560:                *(tmp + 1) = tmp1;
                    561:                out++;
                    562: 
                    563:                tmp2 = 0xDC00 | (c & 0x03FF);
                    564:                tmp = (unsigned char *) out;
                    565:                *tmp = tmp2 >> 8;
                    566:                *(tmp + 1) = tmp2;
                    567:                out++;
                    568:            } else {
                    569:                *out++ = 0xD800 | (c >> 10);
                    570:                *out++ = 0xDC00 | (c & 0x03FF);
                    571:            }
1.1       daniel    572:         }
1.33      daniel    573:         else
                    574:            break;
                    575:        processed = in;
1.1       daniel    576:     }
1.33      daniel    577:     *outlen = out - outstart;
                    578:     *inlen = processed - in;
                    579:     return(0);
1.1       daniel    580: }
                    581: 
1.7       daniel    582: /**
                    583:  * xmlDetectCharEncoding:
                    584:  * @in:  a pointer to the first bytes of the XML entity; 4 bytes are needed
                    585:  *       for the full set of checks, 2 bytes for the UTF-16 BOM checks only.
1.25      daniel    586:  * @len:  the number of bytes available in @in
1.7       daniel    587:  *
                    588:  * Guess the encoding of the entity using the first bytes of the entity content
                    589:  * according to the non-normative appendix F of the XML-1.0 recommendation.
                    590:  * The 4-byte signatures are tried first, then the 2-byte UTF-16 byte order marks.
                    591:  * Returns one of the XML_CHAR_ENCODING_... values, XML_CHAR_ENCODING_NONE if nothing matched.
                    592:  */
                    593: xmlCharEncoding
1.25      daniel    594: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    595: {
1.25      daniel    596:     if (len >= 4) { /* NOTE(review): @in itself is never checked for NULL */
                    597:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    598:            (in[2] == 0x00) && (in[3] == 0x3C)) /* 00 00 00 '<' */
                    599:            return(XML_CHAR_ENCODING_UCS4BE);
                    600:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    601:            (in[2] == 0x00) && (in[3] == 0x00)) /* '<' 00 00 00 */
                    602:            return(XML_CHAR_ENCODING_UCS4LE);
                    603:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    604:            (in[2] == 0x3C) && (in[3] == 0x00)) /* 00 00 '<' 00 */
                    605:            return(XML_CHAR_ENCODING_UCS4_2143);
                    606:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    607:            (in[2] == 0x00) && (in[3] == 0x00)) /* 00 '<' 00 00 */
                    608:            return(XML_CHAR_ENCODING_UCS4_3412);
                    609:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    610:            (in[2] == 0xA7) && (in[3] == 0x94)) /* "<?xm" in EBCDIC, per appendix F */
                    611:            return(XML_CHAR_ENCODING_EBCDIC);
                    612:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    613:            (in[2] == 0x78) && (in[3] == 0x6D)) /* "<?xm" in ASCII */
                    614:            return(XML_CHAR_ENCODING_UTF8);
                    615:     }
                    616:     if (len >= 2) {
                    617:        if ((in[0] == 0xFE) && (in[1] == 0xFF)) /* byte order mark FE FF */
                    618:            return(XML_CHAR_ENCODING_UTF16BE);
                    619:        if ((in[0] == 0xFF) && (in[1] == 0xFE)) /* byte order mark FF FE */
                    620:            return(XML_CHAR_ENCODING_UTF16LE);
                    621:     }
1.7       daniel    622:     return(XML_CHAR_ENCODING_NONE); /* no known signature, or fewer than 2 bytes */
                    623: }
                    624: 
                    625: /**
                    626:  * xmlParseCharEncoding:
1.18      daniel    627:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    628:  *
                    629:  * Compare the string to the known encoding schemes. Note
                    630:  * that the comparison is case insensitive according to the section
                    631:  * [XML] 4.3.3 Character Encoding in Entities.
                    632:  * 
                    633:  * Returns one of the XML_CHAR_ENCODING_... values, XML_CHAR_ENCODING_NONE
                    634:  * for an empty name, or XML_CHAR_ENCODING_ERROR if not recognized.
                    635:  */
                    636: xmlCharEncoding
1.8       daniel    637: xmlParseCharEncoding(const char* name)
1.7       daniel    638: {
                    639:     char upper[500]; /* upper-cased copy of @name, cut after 499 characters */
                    640:     int i;
                    641: 
                    642:     for (i = 0;i < 499;i++) { /* NOTE(review): @name is dereferenced without a NULL check */
                    643:         upper[i] = toupper(name[i]); /* NOTE(review): plain char is passed to toupper() */
                    644:        if (upper[i] == 0) break;
                    645:     }
                    646:     upper[i] = 0; /* terminate, also when the loop stopped at the 499 limit */
                    647: 
                    648:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    649:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    650:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    651: 
                    652:     /*
                    653:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    654:      *       already found and in use; the LE value is returned for both names
                    655:      */
                    656:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    657:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    658:     
                    659:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    660:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    661:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    662: 
                    663:     /*
                    664:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    665:      *       already found and in use; the LE value is returned for all names
                    666:      */
                    667:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    668:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    669:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    670: 
                    671:     
                    672:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    673:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    674:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    675: 
                    676:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    677:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    678:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    679: 
                    680:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    681:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    682:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    683:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    684:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    685:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    686:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    687: 
                    688:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30      daniel    689:     if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7       daniel    690:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30      daniel    691: 
                    692: #ifdef DEBUG_ENCODING
                    693:     fprintf(stderr, "Unknown encoding %s\n", name);
                    694: #endif
1.7       daniel    695:     return(XML_CHAR_ENCODING_ERROR); /* a non-empty name matching none of the above */
                    696: }
1.9       daniel    697: 
                    698: /****************************************************************
                    699:  *                                                             *
                    700:  *             Char encoding handlers                          *
                    701:  *                                                             *
                    702:  ****************************************************************/
                    703: 
                    704: /* capacity of the handlers table; the size should be growable, but it's not a big deal ... */
                    705: #define MAX_ENCODING_HANDLERS 50
                    706: static xmlCharEncodingHandlerPtr *handlers = NULL; /* table allocated by xmlInitCharEncodingHandlers(), NULL until then */
                    707: static int nbCharEncodingHandler = 0; /* number of slots of the table currently in use */
                    708: 
                    709: /*
                    710:  * The default is UTF-8 for XML, that's also the default used for the
                    711:  * parser internals, so the default encoding handler is NULL
                    712:  */
                    713: 
                    714: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                    715: 
                    716: /**
                    717:  * xmlNewCharEncodingHandler:
1.18      daniel    718:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    719:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    720:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    721:  *
                    722:  * Creates and registers an xmlCharEncodingHandler; its name is stored upper-cased.
                    723:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    724:  */
                    725: xmlCharEncodingHandlerPtr
1.25      daniel    726: xmlNewCharEncodingHandler(const char *name, 
                    727:                           xmlCharEncodingInputFunc input,
1.9       daniel    728:                           xmlCharEncodingOutputFunc output) {
                    729:     xmlCharEncodingHandlerPtr handler;
                    730:     char upper[500]; /* upper-cased copy of @name, cut after 499 characters */
                    731:     int i;
                    732:     char *up = 0; /* heap copy of upper[], becomes handler->name */
                    733: 
                    734:     /*
                    735:      * Keep only the uppercase version of the encoding.
                    736:      */
                    737:     if (name == NULL) {
                    738:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    739:        return(NULL);
                    740:     }
                    741:     for (i = 0;i < 499;i++) {
                    742:         upper[i] = toupper(name[i]); /* NOTE(review): plain char is passed to toupper() */
                    743:        if (upper[i] == 0) break;
                    744:     }
                    745:     upper[i] = 0;
1.16      daniel    746:     up = xmlMemStrdup(upper);
1.9       daniel    747:     if (up == NULL) {
                    748:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    749:        return(NULL);
                    750:     }
                    751: 
                    752:     /*
                    753:      * allocate and fill-up an handler block.
                    754:      */
                    755:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    756:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    757:     if (handler == NULL) {
                    758:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    759:        return(NULL); /* NOTE(review): 'up' is not freed on this path */
                    760:     }
                    761:     handler->input = input;
                    762:     handler->output = output;
                    763:     handler->name = up; /* NOTE(review): no other field is set here, e.g. the iconv descriptors used elsewhere in this file */
                    764: 
                    765:     /*
                    766:      * registers and returns the handler.
                    767:      */
                    768:     xmlRegisterCharEncodingHandler(handler); /* returns void: a full table is only reported on stderr */
1.30      daniel    769: #ifdef DEBUG_ENCODING
                    770:     fprintf(stderr, "Registered encoding handler for %s\n", name);
                    771: #endif
1.9       daniel    772:     return(handler);
                    773: }
                    774: 
                    775: /**
                    776:  * xmlInitCharEncodingHandlers:
                    777:  *
                    778:  * Initialize the char encoding support: it allocates the handler table, detects
                    779:  * the host endianness and registers the encodings supported by default.
1.18      daniel    780:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    781:  *       in normal processing. Calling it again once initialized is a no-op.
                    782:  */
                    783: void
                    784: xmlInitCharEncodingHandlers(void) {
1.34      daniel    785:     unsigned short int tst = 0x1234; /* probe value for the endianness test below */
                    786:     unsigned char *ptr = (unsigned char *) &tst; /* points at the first byte of tst in memory */
                    787: 
1.9       daniel    788:     if (handlers != NULL) return; /* table already allocated */
                    789: 
                    790:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    791:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34      daniel    792: 
                    793:     if (*ptr == 0x12) xmlLittleEndian = 0; /* high-order byte stored first */
                    794:     else if (*ptr == 0x34) xmlLittleEndian = 1; /* low-order byte stored first */
                    795:     else fprintf(stderr, "Odd problem at endianness detection\n");
1.9       daniel    796: 
                    797:     if (handlers == NULL) { /* result of the xmlMalloc above */
                    798:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    799:        return;
                    800:     }
1.10      daniel    801:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL); /* registered without conversion functions */
1.25      daniel    802:     xmlUTF16LEHandler = 
1.28      daniel    803:           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
                    804:     xmlUTF16BEHandler = 
                    805:           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10      daniel    806:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    807: }
                    808: 
                    809: /**
1.19      daniel    810:  * xmlCleanupCharEncodingHandlers:
                    811:  *
                    812:  * Cleanup the memory allocated for the char encoding support: it frees
                    813:  * every registered handler with its name, then the table itself.
                    814:  */
                    815: void
                    816: xmlCleanupCharEncodingHandlers(void) {
                    817:     if (handlers == NULL) return; /* never initialized, or already cleaned up */
                    818: 
                    819:     for (;nbCharEncodingHandler > 0;) { /* walk the table from the last registered entry down */
                    820:         nbCharEncodingHandler--;
                    821:        if (handlers[nbCharEncodingHandler] != NULL) {
1.31      daniel    822:            if (handlers[nbCharEncodingHandler]->name != NULL)
                    823:                xmlFree(handlers[nbCharEncodingHandler]->name);
1.19      daniel    824:            xmlFree(handlers[nbCharEncodingHandler]);
                    825:        }
                    826:     }
                    827:     xmlFree(handlers);
                    828:     handlers = NULL; /* lets a later call re-run xmlInitCharEncodingHandlers() */
                    829:     nbCharEncodingHandler = 0;
                    830:     xmlDefaultCharEncodingHandler = NULL;
                    831: }
                    832: 
                    833: /**
1.9       daniel    834:  * xmlRegisterCharEncodingHandler:
                    835:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                    836:  *
                    837:  * Register the char encoding handler by appending it to the handler table.
                    838:  */
                    839: void
                    840: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                    841:     if (handlers == NULL) xmlInitCharEncodingHandlers(); /* lazy initialization of the table */
                    842:     if (handler == NULL) {
                    843:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                    844:        return;
                    845:     }
                    846: 
                    847:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { /* table full: @handler is not stored */
                    848:         fprintf(stderr, 
                    849:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                    850:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                    851:        return;
                    852:     }
                    853:     handlers[nbCharEncodingHandler++] = handler; /* NOTE(review): handlers is still NULL here if the init above ran out of memory */
                    854: }
                    855: 
                    856: /**
                    857:  * xmlGetCharEncodingHandler:
                    858:  * @enc:  an xmlCharEncoding value.
                    859:  *
                    860:  * Search in the registered set the handler able to read/write that encoding.
                    861:  * Encodings without a dedicated handler are looked up by name.
                    862:  * Returns the handler or NULL if not found
                    863:  */
                    864: xmlCharEncodingHandlerPtr
                    865: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30      daniel    866:     xmlCharEncodingHandlerPtr handler;
                    867: 
1.9       daniel    868:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel    869:     switch (enc) {
                    870:         case XML_CHAR_ENCODING_ERROR:
                    871:            return(NULL);
                    872:         case XML_CHAR_ENCODING_NONE:
                    873:            return(NULL);
                    874:         case XML_CHAR_ENCODING_UTF8:
                    875:            return(NULL); /* NULL is returned here although a "UTF-8" handler is registered at init */
                    876:         case XML_CHAR_ENCODING_UTF16LE:
                    877:            return(xmlUTF16LEHandler);
                    878:         case XML_CHAR_ENCODING_UTF16BE:
                    879:            return(xmlUTF16BEHandler);
                    880:         case XML_CHAR_ENCODING_EBCDIC:
1.30      daniel    881:             handler = xmlFindCharEncodingHandler("EBCDIC");
                    882:             if (handler != NULL) return(handler);
                    883:             handler = xmlFindCharEncodingHandler("ebcdic"); /* case only matters for the iconv_open() fallback of the lookup */
                    884:             if (handler != NULL) return(handler);
                    885:            break;
1.25      daniel    886:         case XML_CHAR_ENCODING_UCS4LE:
1.30      daniel    887:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                    888:             if (handler != NULL) return(handler);
                    889:             handler = xmlFindCharEncodingHandler("UCS-4");
                    890:             if (handler != NULL) return(handler);
                    891:             handler = xmlFindCharEncodingHandler("UCS4");
                    892:             if (handler != NULL) return(handler);
                    893:            break;
1.25      daniel    894:         case XML_CHAR_ENCODING_UCS4BE:
1.30      daniel    895:             handler = xmlFindCharEncodingHandler("UCS4BE");
                    896:             if (handler != NULL) return(handler);
                    897:            break;
1.25      daniel    898:         case XML_CHAR_ENCODING_UCS4_2143:
1.30      daniel    899:            break; /* no lookup is attempted for this byte order */
1.25      daniel    900:         case XML_CHAR_ENCODING_UCS4_3412:
1.30      daniel    901:            break; /* no lookup is attempted for this byte order */
1.25      daniel    902:         case XML_CHAR_ENCODING_UCS2:
1.30      daniel    903:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
                    904:             if (handler != NULL) return(handler);
                    905:             handler = xmlFindCharEncodingHandler("UCS-2");
                    906:             if (handler != NULL) return(handler);
                    907:             handler = xmlFindCharEncodingHandler("UCS2");
                    908:             if (handler != NULL) return(handler);
                    909:            break;
1.25      daniel    910:         case XML_CHAR_ENCODING_8859_1:
                    911:         case XML_CHAR_ENCODING_8859_2:
                    912:         case XML_CHAR_ENCODING_8859_3:
                    913:         case XML_CHAR_ENCODING_8859_4:
                    914:         case XML_CHAR_ENCODING_8859_5:
                    915:         case XML_CHAR_ENCODING_8859_6:
                    916:         case XML_CHAR_ENCODING_8859_7:
                    917:         case XML_CHAR_ENCODING_8859_8:
                    918:         case XML_CHAR_ENCODING_8859_9:
                    919:            return(NULL); /* NOTE(review): NULL even for 8859_1, though an "ISO-8859-1" handler is registered at init */
                    920:         case XML_CHAR_ENCODING_2022_JP:
1.30      daniel    921:             handler = xmlFindCharEncodingHandler("ISO-2022-JP");
                    922:             if (handler != NULL) return(handler);
                    923:            break;
1.25      daniel    924:         case XML_CHAR_ENCODING_SHIFT_JIS:
1.30      daniel    925:             handler = xmlFindCharEncodingHandler("SHIFT-JIS");
                    926:             if (handler != NULL) return(handler);
                    927:             handler = xmlFindCharEncodingHandler("SHIFT_JIS");
                    928:             if (handler != NULL) return(handler);
                    929:             handler = xmlFindCharEncodingHandler("Shift_JIS"); /* case only matters for the iconv_open() fallback of the lookup */
                    930:             if (handler != NULL) return(handler);
                    931:            break;
1.25      daniel    932:         case XML_CHAR_ENCODING_EUC_JP:
1.30      daniel    933:             handler = xmlFindCharEncodingHandler("EUC-JP");
                    934:             if (handler != NULL) return(handler);
                    935:            break;
                    936:        default: 
                    937:            break;
1.25      daniel    938:     }
1.30      daniel    939:     
                    940: #ifdef DEBUG_ENCODING
                    941:     fprintf(stderr, "No handler found for encoding %d\n", enc);
                    942: #endif
1.9       daniel    943:     return(NULL); /* reached by every case that ended with break */
                    944: }
                    945: 
                    946: /**
                    947:  * xmlFindCharEncodingHandler:
                    948:  * @name:  a string describing the char encoding.
                    949:  *
                    950:  * Search in the registered set the handler able to read/write that encoding.
                    951:  * With iconv support, a new unregistered iconv-based handler is built as a fallback.
                    952:  * Returns the handler or NULL if not found; the default handler for a NULL or empty @name
                    953:  */
                    954: xmlCharEncodingHandlerPtr
                    955: xmlFindCharEncodingHandler(const char *name) {
1.30      daniel    956: #ifdef LIBXML_ICONV_ENABLED
                    957:     iconv_t icv_in, icv_out;
                    958:     xmlCharEncodingHandlerPtr enc;
                    959: #endif /* LIBXML_ICONV_ENABLED */
                    960:     char upper[100]; /* upper-cased copy of @name, cut after 99 characters */
1.9       daniel    961:     int i;
                    962: 
                    963:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    964:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                    965:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                    966: 
1.30      daniel    967:     for (i = 0;i < 99;i++) {
1.9       daniel    968:         upper[i] = toupper(name[i]); /* NOTE(review): plain char is passed to toupper() */
                    969:        if (upper[i] == 0) break;
                    970:     }
                    971:     upper[i] = 0;
                    972: 
                    973:     for (i = 0;i < nbCharEncodingHandler; i++) /* exact match against the registered names */
1.30      daniel    974:         if (!strcmp(upper, handlers[i]->name)) { /* NOTE(review): assumes every registered handler has a non-NULL name */
                    975: #ifdef DEBUG_ENCODING
                    976:             fprintf(stderr, "Found registered handler for encoding %s\n", name);
                    977: #endif
1.9       daniel    978:            return(handlers[i]);
1.30      daniel    979:        }
1.9       daniel    980: 
1.30      daniel    981: #ifdef LIBXML_ICONV_ENABLED
                    982:     /* check whether iconv can handle this */
1.31      daniel    983:     icv_in = iconv_open("UTF-8", name); /* the original @name is used here, not the upper-cased copy */
                    984:     icv_out = iconv_open(name, "UTF-8");
1.30      daniel    985:     if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31      daniel    986:            enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32      daniel    987:            if (enc == NULL) {
                    988:                iconv_close(icv_in);
                    989:                iconv_close(icv_out);
                    990:                return(NULL);
                    991:            }
                    992:            enc->name = NULL;
1.30      daniel    993:            enc->input = NULL;
                    994:            enc->output = NULL;
                    995:            enc->iconv_in = icv_in;
                    996:            enc->iconv_out = icv_out;
                    997: #ifdef DEBUG_ENCODING
                    998:             fprintf(stderr, "Found iconv handler for encoding %s\n", name);
                    999: #endif
                   1000:            return enc; /* not added to the handlers table, so the cleanup function will not free it */
                   1001:     } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) { /* NOTE(review): the one descriptor that did open is not closed */
                   1002:            fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
                   1003:     }
                   1004: #endif /* LIBXML_ICONV_ENABLED */
                   1005: #ifdef DEBUG_ENCODING
                   1006:     fprintf(stderr, "No handler found for encoding %s\n", name);
                   1007: #endif
1.9       daniel   1008:     return(NULL);
1.30      daniel   1009: }
                   1010: 
                   1011: #ifdef LIBXML_ICONV_ENABLED
                   1012: /**
                   1013:  * xmlIconvWrapper:
                   1014:  * @cd:                iconv converter data structure
                   1015:  * @out:  a pointer to an array of bytes to store the result
                   1016:  * @outlen:  the length of @out
                   1017:  * @in:  a pointer to an array of bytes to convert, may be NULL
                   1018:  * @inlen:  the length of @in
                   1019:  *
                   1020:  * Returns 0 if success, or 
                   1021:  *     -1 by lack of space (E2BIG), or
                   1022:  *     -2 if the transcoding fails (EILSEQ: the input is not valid or
                   1023:  *        the result of transformation can't fit into the encoding we want), or
                   1024:  *     -3 if the last bytes can't form a single output char (EINVAL), or on any other error.
                   1025:  *     
                   1026:  * The value of @inlen after return is the number of octets consumed,
                   1027:  *     or 0 when @in is NULL.
                   1028:  * The value of @outlen after return is the number of octets written, or 0 when @in is NULL.
                   1029:  */
                   1030: static int
                   1031: xmlIconvWrapper(iconv_t cd,
                   1032:        unsigned char *out, int *outlen,
                   1033:        const unsigned char *in, int *inlen) {
                   1034: 
                   1035:        size_t icv_inlen = *inlen, icv_outlen = *outlen; /* iconv() leaves the bytes still unused in these */
                   1036:        const char *icv_in = (const char *) in;
                   1037:        char *icv_out = (char *) out;
                   1038:        int ret; /* NOTE(review): int, although it is compared against (size_t) -1 below */
                   1039: 
                   1040:        ret = iconv(cd,
                   1041:                &icv_in, &icv_inlen,
                   1042:                &icv_out, &icv_outlen);
1.35    ! daniel   1043:        if (in != NULL) {
        !          1044:            *inlen -= icv_inlen; /* initial length minus what is left = bytes consumed */
        !          1045:            *outlen -= icv_outlen; /* initial room minus what is left = bytes written */
        !          1046:        } else {
        !          1047:            *inlen = 0;
        !          1048:            *outlen = 0; /* NOTE(review): bytes iconv may have written to @out on a NULL-input call are not reported */
        !          1049:        }
1.30      daniel   1050:        if (icv_inlen != 0 || ret == (size_t) -1) { /* NOTE(review): errno may be stale when only icv_inlen != 0 holds */
                   1051: #ifdef EILSEQ
                   1052:                if (errno == EILSEQ) {
1.31      daniel   1053:                        return -2;
1.30      daniel   1054:                } else
                   1055: #endif
                   1056: #ifdef E2BIG
                   1057:                if (errno == E2BIG) {
                   1058:                        return -1;
                   1059:                } else
                   1060: #endif
                   1061: #ifdef EINVAL
                   1062:                if (errno == EINVAL) {
1.31      daniel   1063:                        return -3;
1.30      daniel   1064:                }
                   1065: #endif
                   1066:                else { /* NOTE(review): this else has no matching if when EINVAL is not defined */
                   1067:                        return -3;
                   1068:                }
                   1069:        }
                   1070:        return 0;
                   1071: }
                   1072: #endif /* LIBXML_ICONV_ENABLED */
                   1073: 
                   1074: /**
                   1075:  * xmlCharEncInFunc:
                   1076:  * @handler:   char encoding transformation data structure
1.31      daniel   1077:  * @out:  an xmlBuffer for the output.
                   1078:  * @in:  an xmlBuffer for the input
1.30      daniel   1079:  *     
                   1080:  * Generic front-end for the encoding handler input function
                   1081:  *     
1.31      daniel   1082:  * Returns the status of the conversion routine (a -3 status is reported as 0), or 
                   1083:  *     -1 general error (a NULL argument)
1.30      daniel   1084:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1085:  *        the result of transformation can't fit into the encoding we want), or no converter is set
                   1086:  */
                   1087: int
1.31      daniel   1088: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1089:                  xmlBufferPtr in) {
1.30      daniel   1090:     int ret = -2; /* stays -2 when the handler has neither an input function nor an iconv descriptor */
1.31      daniel   1091:     int written; /* in: room available in @out; out: bytes produced by the converter */
                   1092:     int toconv; /* in: bytes available in @in; out: bytes consumed by the converter */
1.30      daniel   1093: 
1.31      daniel   1094:     if (handler == NULL) return(-1);
                   1095:     if (out == NULL) return(-1);
                   1096:     if (in == NULL) return(-1);
                   1097: 
                   1098:     written = out->size - out->use;
                   1099:     toconv = in->use;
                   1100:     if (toconv * 2 >= written) { /* make sure there is room for twice the input size */
                   1101:         xmlBufferGrow(out, toconv * 2); /* NOTE(review): the result of the grow is not checked */
1.33      daniel   1102:        written = out->size - out->use - 1; /* keep one byte for the terminating 0 */
1.31      daniel   1103:     }
1.30      daniel   1104:     if (handler->input != NULL) {
1.32      daniel   1105:        ret = handler->input(&out->content[out->use], &written,
1.31      daniel   1106:                             in->content, &toconv);
                   1107:        xmlBufferShrink(in, toconv); /* drop the consumed bytes from @in */
                   1108:        out->use += written;
1.33      daniel   1109:        out->content[out->use] = 0; /* NOTE(review): when no grow happened, written was not reduced by 1 for this byte */
1.30      daniel   1110:     }
                   1111: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1112:     else if (handler->iconv_in != NULL) {
                   1113:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1114:                              &written, in->content, &toconv);
                   1115:        xmlBufferShrink(in, toconv);
                   1116:        out->use += written;
1.33      daniel   1117:        out->content[out->use] = 0;
                   1118:        if (ret == -1) ret = -3; /* lack of output space is treated like a partial conversion */
1.30      daniel   1119:     }
                   1120: #endif /* LIBXML_ICONV_ENABLED */
                   1121: #ifdef DEBUG_ENCODING
                   1122:     switch (ret) {
                   1123:         case 0:
                   1124:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31      daniel   1125:                    toconv, written);
1.30      daniel   1126:            break;
                   1127:         case -1:
1.31      daniel   1128:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1129:                    toconv, written, in->use);
1.30      daniel   1130:            break;
                   1131:         case -2:
                   1132:            fprintf(stderr, "input conversion failed due to input error\n");
                   1133:            break;
                   1134:         case -3:
1.31      daniel   1135:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1136:                    toconv, written, in->use);
1.30      daniel   1137:            break;
                   1138:        default:
                   1139:            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
                   1140:     }
                   1141: #endif
1.33      daniel   1142:     /*
                   1143:      * Ignore when input buffer is not on a boundary
                   1144:      */
                   1145:     if (ret == -3) ret = 0;
1.30      daniel   1146:     return(ret);
                   1147: }
                   1148: 
                   1149: /**
                   1150:  * xmlCharEncOutFunc:
                   1151:  * @handler:   char enconding transformation data structure
1.31      daniel   1152:  * @out:  an xmlBuffer for the output.
                   1153:  * @in:  an xmlBuffer for the input
                   1154:  *     
                   1155:  * Generic front-end for the encoding handler output function
1.35    ! daniel   1156:  * a first call with @in == NULL has to be made firs to initiate the 
        !          1157:  * output in case of non-stateless encoding needing to initiate their
        !          1158:  * state or the output (like the BOM in UTF16).
1.30      daniel   1159:  *     
1.31      daniel   1160:  * Returns the number of byte written if success, or 
                   1161:  *     -1 general error
1.30      daniel   1162:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1163:  *        the result of transformation can't fit into the encoding we want), or
                   1164:  */
                   1165: int
1.31      daniel   1166: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1167:                   xmlBufferPtr in) {
1.30      daniel   1168:     int ret = -2;
1.31      daniel   1169:     int written;
                   1170:     int toconv;
                   1171: 
                   1172:     if (handler == NULL) return(-1);
                   1173:     if (out == NULL) return(-1);
1.35    ! daniel   1174:     written = out->size - out->use;
        !          1175: 
        !          1176:     if (in == NULL) {
        !          1177:         toconv = 0;
        !          1178:        if (handler->output != NULL) {
        !          1179:            ret = handler->output(&out->content[out->use], &written,
        !          1180:                                  NULL, &toconv);
        !          1181:            out->use += written;
        !          1182:            out->content[out->use] = 0;
        !          1183:        }
        !          1184: #ifdef LIBXML_ICONV_ENABLED
        !          1185:        else if (handler->iconv_out != NULL) {
        !          1186:            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
        !          1187:                                  &written, NULL, &toconv);
        !          1188:            out->use += written;
        !          1189:            out->content[out->use] = 0;
        !          1190:        }
        !          1191: #endif /* LIBXML_ICONV_ENABLED */
        !          1192: #ifdef DEBUG_ENCODING
        !          1193:        fprintf(stderr, "initialized encoder\n");
        !          1194: #endif
        !          1195:         return(0);
        !          1196:     }
1.30      daniel   1197: 
1.33      daniel   1198:     toconv = in->use;
                   1199:     if (toconv * 2 >= written) {
                   1200:         xmlBufferGrow(out, toconv * 2);
                   1201:        written = out->size - out->use - 1;
                   1202:     }
1.30      daniel   1203:     if (handler->output != NULL) {
1.33      daniel   1204:        ret = handler->output(&out->content[out->use], &written,
1.35    ! daniel   1205:                              in->content, &toconv);
1.31      daniel   1206:        xmlBufferShrink(in, toconv);
                   1207:        out->use += written;
1.33      daniel   1208:        out->content[out->use] = 0;
1.30      daniel   1209:     }
                   1210: #ifdef LIBXML_ICONV_ENABLED
                   1211:     else if (handler->iconv_out != NULL) {
1.31      daniel   1212:        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1213:                              &written, in->content, &toconv);
                   1214:        xmlBufferShrink(in, toconv);
                   1215:        out->use += written;
1.33      daniel   1216:        out->content[out->use] = 0;
                   1217:        if (ret == -1) ret = -3;
1.30      daniel   1218:     }
                   1219: #endif /* LIBXML_ICONV_ENABLED */
                   1220: #ifdef DEBUG_ENCODING
                   1221:     switch (ret) {
                   1222:         case 0:
                   1223:            fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31      daniel   1224:                    toconv, written);
1.30      daniel   1225:            break;
                   1226:         case -1:
                   1227:            fprintf(stderr, "output conversion failed by lack of space\n");
                   1228:            break;
                   1229:         case -2:
                   1230:            fprintf(stderr, "output conversion failed due to output error\n");
                   1231:            break;
                   1232:         case -3:
1.31      daniel   1233:            fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
                   1234:                    toconv, written, in->use);
1.30      daniel   1235:            break;
                   1236:        default:
                   1237:            fprintf(stderr,"Unknown output conversion failed %d\n", ret);
                   1238:     }
                   1239: #endif
                   1240:     return(ret);
                   1241: }
                   1242: 
                   1243: /**
                   1244:  * xmlCharEncCloseFunc:
                   1245:  * @handler:   char enconding transformation data structure
                   1246:  *     
                   1247:  * Generic front-end for hencoding handler close function
                   1248:  *
                   1249:  * Returns 0 if success, or -1 in case of error
                   1250:  */
                   1251: int
                   1252: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
                   1253:     int ret = 0;
1.31      daniel   1254:     if (handler == NULL) return(-1);
                   1255:     if (handler->name == NULL) return(-1);
1.30      daniel   1256: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1257:     /*
                   1258:      * Iconv handlers can be oused only once, free the whole block.
                   1259:      * and the associated icon resources.
                   1260:      */
1.32      daniel   1261:     if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
                   1262:        if (handler->name != NULL)
                   1263:            xmlFree(handler->name);
                   1264:        handler->name = NULL;
                   1265:        if (handler->iconv_out != NULL) {
                   1266:            if (iconv_close(handler->iconv_out))
                   1267:                ret = -1;
                   1268:            handler->iconv_out = NULL;
                   1269:        }
                   1270:        if (handler->iconv_in != NULL) {
                   1271:            if (iconv_close(handler->iconv_in))
                   1272:                ret = -1;
                   1273:            handler->iconv_in = NULL;
                   1274:        }
                   1275:        xmlFree(handler);
1.30      daniel   1276:     }
                   1277: #endif /* LIBXML_ICONV_ENABLED */
                   1278: #ifdef DEBUG_ENCODING
                   1279:     if (ret)
                   1280:         fprintf(stderr, "failed to close the encoding handler\n");
                   1281:     else
                   1282:         fprintf(stderr, "closed the encoding handler\n");
                   1283: 
                   1284: #endif
                   1285:     return(ret);
1.9       daniel   1286: }
                   1287: 

Webmaster