Annotation of XML/encoding.c, revision 1.39

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
1.39    ! daniel      6:  * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
1.1       daniel      7:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      8:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      9:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                     10:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     11:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     12:  *                described in Unicode Technical Report #4.
                     13:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     14:  *                Information Interchange, ANSI X3.4-1986.
                     15:  *
1.9       daniel     16:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     17:  *
                     18:  * See Copyright for the status of this software.
                     19:  *
                     20:  * Daniel.Veillard@w3.org
                     21:  */
                     22: 
1.21      daniel     23: #ifdef WIN32
                     24: #include "win32config.h"
                     25: #else
1.14      daniel     26: #include "config.h"
1.17      daniel     27: #endif
                     28: 
                     29: #include <stdio.h>
                     30: #include <string.h>
                     31: 
                     32: #ifdef HAVE_CTYPE_H
1.7       daniel     33: #include <ctype.h>
1.17      daniel     34: #endif
1.20      daniel     35: #ifdef HAVE_STDLIB_H
                     36: #include <stdlib.h>
                     37: #endif
1.30      daniel     38: #include <libxml/xmlversion.h>
                     39: #ifdef LIBXML_ICONV_ENABLED
                     40: #ifdef HAVE_ERRNO_H
                     41: #include <errno.h>
                     42: #endif
                     43: #endif
1.29      daniel     44: #include <libxml/encoding.h>
                     45: #include <libxml/xmlmemory.h>
1.3       daniel     46: 
1.25      daniel     47: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; /* UTF-16LE handler; starts out NULL, assignment is not visible in this part of the file */
                     48: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; /* UTF-16BE handler; starts out NULL, assignment is not visible in this part of the file */
                     49: 
1.30      daniel     50: #ifdef LIBXML_ICONV_ENABLED
1.37      daniel     51: #if 0
1.30      daniel     52: #define DEBUG_ENCODING  /* Define this to get encoding traces */
                     53: #endif
1.33      daniel     54: #endif
1.30      daniel     55: 
1.34      daniel     56: static int xmlLittleEndian = 1; /* nonzero makes the UTF-16 converters below take their host-is-little-endian paths; defaults to 1, NOTE(review): presumably probed at init time elsewhere - confirm */
                     57: 
1.3       daniel     58: /*
                     59:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     60:  *
                     61:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     62:  * 0000 0000-0000 007F   0xxxxxxx
                     63:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     64:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     65:  *
                     66:  * I hope we won't use values > 0xFFFF anytime soon !
                     67:  */
1.1       daniel     68: 
                     69: /**
1.39    ! daniel     70:  * xmlGetUTF8Char:
        !            71:  * @utf:  a sequence of UTF-8 encoded bytes
        !            72:  * @len:  a pointer to @bytes len
        !            73:  *
        !            74:  * Read one UTF8 Char from @utf
        !            75:  *
        !            76:  * Returns the char value or -1 in case of error and update @len with the
        !            77:  *        number of bytes used
        !            78:  */
        !            79: int
        !            80: xmlGetUTF8Char(const unsigned char *utf, int *len) {
        !            81:     unsigned int c;
        !            82: 
        !            83:     if (utf == NULL)
        !            84:        goto error;
        !            85:     if (len == NULL)
        !            86:        goto error;
        !            87:     if (*len < 1)
        !            88:        goto error;
        !            89: 
        !            90:     c = utf[0];
        !            91:     if (c & 0x80) {
        !            92:        if (*len < 2)
        !            93:            goto error;
        !            94:        if ((utf[1] & 0xc0) != 0x80)
        !            95:            goto error;
        !            96:        if ((c & 0xe0) == 0xe0) {
        !            97:            if (*len < 3)
        !            98:                goto error;
        !            99:            if ((utf[2] & 0xc0) != 0x80)
        !           100:                goto error;
        !           101:            if ((c & 0xf0) == 0xf0) {
        !           102:                if (*len < 4)
        !           103:                    goto error;
        !           104:                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
        !           105:                    goto error;
        !           106:                *len = 4;
        !           107:                /* 4-byte code */
        !           108:                c = (utf[0] & 0x7) << 18;
        !           109:                c |= (utf[1] & 0x3f) << 12;
        !           110:                c |= (utf[2] & 0x3f) << 6;
        !           111:                c |= utf[3] & 0x3f;
        !           112:            } else {
        !           113:              /* 3-byte code */
        !           114:                *len = 3;
        !           115:                c = (utf[0] & 0xf) << 12;
        !           116:                c |= (utf[1] & 0x3f) << 6;
        !           117:                c |= utf[2] & 0x3f;
        !           118:            }
        !           119:        } else {
        !           120:          /* 2-byte code */
        !           121:            *len = 2;
        !           122:            c = (utf[0] & 0x1f) << 6;
        !           123:            c |= utf[1] & 0x3f;
        !           124:        }
        !           125:     } else {
        !           126:        /* 1-byte code */
        !           127:        *len = 1;
        !           128:     }
        !           129:     return(c);
        !           130: 
        !           131: error:
        !           132:     *len = 0;
        !           133:     return(-1);
        !           134: }
        !           135: 
        !           136: /**
1.22      daniel    137:  * xmlCheckUTF8: Check utf-8 string for legality.
                    138:  * @utf: Pointer to putative utf-8 encoded string.
                    139:  *
                    140:  * Checks @utf for being valid utf-8. @utf is assumed to be
                    141:  * null-terminated. This function is not super-strict, as it will
                    142:  * allow longer utf-8 sequences than necessary. Note that Java is
                    143:  * capable of producing these sequences if provoked. Also note, this
                    144:  * routine checks for the 4-byte maxiumum size, but does not check for
                    145:  * 0x10ffff maximum value.
                    146:  *
                    147:  * Return value: true if @utf is valid.
                    148:  **/
                    149: int
                    150: xmlCheckUTF8(const unsigned char *utf)
                    151: {
                    152:     int ix;
                    153:     unsigned char c;
                    154: 
                    155:     for (ix = 0; (c = utf[ix]);) {
                    156:         if (c & 0x80) {
                    157:            if ((utf[ix + 1] & 0xc0) != 0x80)
                    158:                return(0);
                    159:            if ((c & 0xe0) == 0xe0) {
                    160:                if ((utf[ix + 2] & 0xc0) != 0x80)
                    161:                    return(0);
                    162:                if ((c & 0xf0) == 0xf0) {
                    163:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                    164:                        return(0);
                    165:                    ix += 4;
                    166:                    /* 4-byte code */
                    167:                } else
                    168:                  /* 3-byte code */
                    169:                    ix += 3;
                    170:            } else
                    171:              /* 2-byte code */
                    172:                ix += 2;
                    173:        } else
                    174:            /* 1-byte code */
                    175:            ix++;
                    176:       }
                    177:       return(1);
                    178: }
                    179: 
                    180: /**
1.1       daniel    181:  * isolat1ToUTF8:
1.18      daniel    182:  * @out:  a pointer to an array of bytes to store the result
                    183:  * @outlen:  the length of @out
                    184:  * @in:  a pointer to an array of ISO Latin 1 chars
                    185:  * @inlen:  the length of @in
1.1       daniel    186:  *
                    187:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    188:  * block of chars out.
1.33      daniel    189:  * Returns 0 if success, or -1 otherwise
                    190:  * The value of @inlen after return is the number of octets consumed
                    191:  *     as the return value is positive, else unpredictiable.
                    192:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    193:  */
                    194: int
1.33      daniel    195: isolat1ToUTF8(unsigned char* out, int *outlen,
1.25      daniel    196:               const unsigned char* in, int *inlen) {
1.33      daniel    197:     unsigned char* outstart = out;
                    198:     const unsigned char* processed = in;
                    199:     unsigned char* outend = out + *outlen;
                    200:     const unsigned char* inend = in + *inlen;
1.1       daniel    201:     unsigned char c;
                    202: 
                    203:     while (in < inend) {
                    204:         c= *in++;
                    205:         if (c < 0x80) {
1.33      daniel    206:             if (out >= outend)
                    207:                break;
1.1       daniel    208:             *out++ = c;
                    209:         }
                    210:         else {
1.33      daniel    211:             if (out + 1 >= outend)  break;
1.1       daniel    212:             *out++ = 0xC0 | (c >> 6);
                    213:             *out++ = 0x80 | (0x3F & c);
                    214:         }
1.33      daniel    215:        processed = in;
1.1       daniel    216:     }
1.33      daniel    217:     *outlen = out - outstart;
                    218:     *inlen = processed - in;
                    219: 
                    220:     return(0);
1.1       daniel    221: }
                    222: 
                    223: /**
                    224:  * UTF8Toisolat1:
1.18      daniel    225:  * @out:  a pointer to an array of bytes to store the result
                    226:  * @outlen:  the length of @out
                    227:  * @in:  a pointer to an array of UTF-8 chars
                    228:  * @inlen:  the length of @in
1.1       daniel    229:  *
                    230:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    231:  * block of chars out.
1.15      daniel    232:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    233:  *
1.33      daniel    234:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1.28      daniel    235:  * The value of @inlen after return is the number of octets consumed
                    236:  *     as the return value is positive, else unpredictiable.
1.33      daniel    237:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    238:  */
                    239: int
1.33      daniel    240: UTF8Toisolat1(unsigned char* out, int *outlen,
1.25      daniel    241:               const unsigned char* in, int *inlen) {
1.33      daniel    242:     unsigned char* outstart = out;
                    243:     const unsigned char* processed = in;
                    244:     unsigned char* outend = out + *outlen;
                    245:     const unsigned char* inend = in + *inlen;
1.1       daniel    246:     unsigned char c;
                    247: 
                    248:     while (in < inend) {
                    249:         c= *in++;
                    250:         if (c < 0x80) {
1.28      daniel    251:             if (out >= outend)  return(-1);
1.1       daniel    252:             *out++= c;
                    253:         }
1.23      daniel    254:        else if (in == inend) {
                    255:             break;
                    256:        }
                    257:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    258:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    259:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    260:        }
1.33      daniel    261:        else {
                    262:            *outlen = out - outstart;
                    263:            *inlen = processed - in;
1.28      daniel    264:            return(-2);
1.33      daniel    265:        }
                    266:        processed = in;
1.1       daniel    267:     }
1.33      daniel    268:     *outlen = out - outstart;
                    269:     *inlen = processed - in;
                    270:     return(0);
1.1       daniel    271: }
                    272: 
                    273: /**
1.28      daniel    274:  * UTF16LEToUTF8:
                    275:  * @out:  a pointer to an array of bytes to store the result
                    276:  * @outlen:  the length of @out
                    277:  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
                    278:  * @inlenb:  the length of @in in UTF-16LE chars
                    279:  *
                    280:  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
                    281:  * block of chars out. This function assume the endian properity
                    282:  * is the same between the native type of this machine and the
                    283:  * inputed one.
                    284:  *
                    285:  * Returns the number of byte written, or -1 by lack of space, or -2
                    286:  *     if the transcoding fails (for *in is not valid utf16 string)
                    287:  *     The value of *inlen after return is the number of octets consumed
                    288:  *     as the return value is positive, else unpredictiable.
                    289:  */
                    290: int
1.33      daniel    291: UTF16LEToUTF8(unsigned char* out, int *outlen,
1.28      daniel    292:             const unsigned char* inb, int *inlenb)
                    293: {
1.33      daniel    294:     unsigned char* outstart = out;
                    295:     const unsigned char* processed = inb;
                    296:     unsigned char* outend = out + *outlen;
1.28      daniel    297:     unsigned short* in = (unsigned short*) inb;
                    298:     unsigned short* inend;
                    299:     unsigned int c, d, inlen;
                    300:     unsigned char *tmp;
                    301:     int bits;
                    302: 
                    303:     if ((*inlenb % 2) == 1)
                    304:         (*inlenb)--;
                    305:     inlen = *inlenb / 2;
1.33      daniel    306:     inend = in + inlen;
1.39    ! daniel    307:     while ((in < inend) && (out - outstart + 5 < *outlen)) {
1.34      daniel    308:         if (xmlLittleEndian) {
                    309:            c= *in++;
                    310:        } else {
                    311:            tmp = (unsigned char *) in;
                    312:            c = *tmp++;
                    313:            c = c | (((unsigned int)*tmp) << 8);
                    314:            in++;
                    315:        }
1.28      daniel    316:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.39    ! daniel    317:            if (in >= inend) {           /* (in > inend) shouldn't happens */
        !           318:                break;
        !           319:            }
1.34      daniel    320:            if (xmlLittleEndian) {
                    321:                d = *in++;
                    322:            } else {
                    323:                tmp = (unsigned char *) in;
                    324:                d = *tmp++;
                    325:                d = d | (((unsigned int)*tmp) << 8);
                    326:                in++;
                    327:            }
1.28      daniel    328:             if ((d & 0xFC00) == 0xDC00) {
                    329:                 c &= 0x03FF;
                    330:                 c <<= 10;
                    331:                 c |= d & 0x03FF;
                    332:                 c += 0x10000;
                    333:             }
1.33      daniel    334:             else {
                    335:                *outlen = out - outstart;
                    336:                *inlenb = processed - inb;
1.28      daniel    337:                return(-2);
1.33      daniel    338:            }
1.28      daniel    339:         }
                    340: 
                    341:        /* assertion: c is a single UTF-4 value */
                    342:         if (out >= outend)
1.33      daniel    343:            break;
1.28      daniel    344:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    345:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    346:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    347:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                    348:  
                    349:         for ( ; bits >= 0; bits-= 6) {
                    350:             if (out >= outend)
1.33      daniel    351:                break;
1.28      daniel    352:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    353:         }
1.33      daniel    354:        processed = (const unsigned char*) in;
1.28      daniel    355:     }
1.33      daniel    356:     *outlen = out - outstart;
                    357:     *inlenb = processed - inb;
                    358:     return(0);
1.28      daniel    359: }
                    360: 
                    361: /**
                    362:  * UTF8ToUTF16LE:
                    363:  * @outb:  a pointer to an array of bytes to store the result
                    364:  * @outlen:  the length of @outb
                    365:  * @in:  a pointer to an array of UTF-8 chars
                    366:  * @inlen:  the length of @in
                    367:  *
                    368:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
                    369:  * block of chars out.
                    370:  * TODO: UTF8ToUTF16LE need a fallback mechanism ...
                    371:  *
                    372:  * Returns the number of byte written, or -1 by lack of space, or -2
                    373:  *     if the transcoding failed. 
                    374:  */
                    375: int
1.33      daniel    376: UTF8ToUTF16LE(unsigned char* outb, int *outlen,
1.28      daniel    377:             const unsigned char* in, int *inlen)
                    378: {
                    379:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    380:     const unsigned char* processed = in;
1.28      daniel    381:     unsigned short* outstart= out;
                    382:     unsigned short* outend;
                    383:     const unsigned char* inend= in+*inlen;
                    384:     unsigned int c, d, trailing;
                    385:     unsigned char *tmp;
                    386:     unsigned short tmp1, tmp2;
                    387: 
1.37      daniel    388:     if (in == NULL) {
                    389:         /*
                    390:         * initialization, add the Byte Order Mark
                    391:         */
                    392:         if (*outlen >= 2) {
                    393:            outb[0] = 0xFF;
                    394:            outb[1] = 0xFE;
                    395:            *outlen = 2;
                    396:            *inlen = 0;
                    397: #ifdef DEBUG_ENCODING
                    398:             fprintf(stderr, "Added FFFE Byte Order Mark\n");
                    399: #endif
                    400:            return(2);
                    401:        }
                    402:        *outlen = 0;
                    403:        *inlen = 0;
                    404:        return(0);
                    405:     }
1.33      daniel    406:     outend = out + (*outlen / 2);
1.28      daniel    407:     while (in < inend) {
                    408:       d= *in++;
                    409:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    410:       else if (d < 0xC0) {
                    411:           /* trailing byte in leading position */
                    412:          *outlen = out - outstart;
                    413:          *inlen = processed - in;
                    414:          return(-2);
                    415:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.28      daniel    416:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    417:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    418:       else {
                    419:        /* no chance for this in UTF-16 */
                    420:        *outlen = out - outstart;
                    421:        *inlen = processed - in;
                    422:        return(-2);
                    423:       }
1.28      daniel    424: 
                    425:       if (inend - in < trailing) {
                    426:           break;
                    427:       } 
                    428: 
                    429:       for ( ; trailing; trailing--) {
                    430:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1.33      daniel    431:              break;
1.28      daniel    432:           c <<= 6;
                    433:           c |= d & 0x3F;
                    434:       }
                    435: 
                    436:       /* assertion: c is a single UTF-4 value */
                    437:         if (c < 0x10000) {
                    438:             if (out >= outend)
1.33      daniel    439:                break;
1.34      daniel    440:            if (xmlLittleEndian) {
                    441:                *out++ = c;
                    442:            } else {
                    443:                tmp = (unsigned char *) out;
                    444:                *tmp = c ;
                    445:                *(tmp + 1) = c >> 8 ;
                    446:                out++;
                    447:            }
1.28      daniel    448:         }
                    449:         else if (c < 0x110000) {
                    450:             if (out+1 >= outend)
1.33      daniel    451:                break;
1.28      daniel    452:             c -= 0x10000;
1.34      daniel    453:            if (xmlLittleEndian) {
                    454:                *out++ = 0xD800 | (c >> 10);
                    455:                *out++ = 0xDC00 | (c & 0x03FF);
                    456:            } else {
                    457:                tmp1 = 0xD800 | (c >> 10);
                    458:                tmp = (unsigned char *) out;
                    459:                *tmp = tmp1;
                    460:                *(tmp + 1) = tmp1 >> 8;
                    461:                out++;
                    462: 
                    463:                tmp2 = 0xDC00 | (c & 0x03FF);
                    464:                tmp = (unsigned char *) out;
                    465:                *tmp  = tmp2;
                    466:                *(tmp + 1) = tmp2 >> 8;
                    467:                out++;
                    468:            }
1.28      daniel    469:         }
                    470:         else
1.33      daniel    471:            break;
                    472:        processed = in;
1.28      daniel    473:     }
1.36      daniel    474:     *outlen = (out - outstart) * 2;
1.33      daniel    475:     *inlen = processed - in;
                    476:     return(0);
1.28      daniel    477: }
                    478: 
                    479: /**
                    480:  * UTF16BEToUTF8:
1.18      daniel    481:  * @out:  a pointer to an array of bytes to store the result
                    482:  * @outlen:  the length of @out
1.25      daniel    483:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    484:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    485:  *
                    486:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
1.28      daniel    487:  * block of chars out. This function assume the endian properity
                    488:  * is the same between the native type of this machine and the
                    489:  * inputed one.
1.25      daniel    490:  *
1.28      daniel    491:  * Returns the number of byte written, or -1 by lack of space, or -2
                    492:  *     if the transcoding fails (for *in is not valid utf16 string)
                    493:  * The value of *inlen after return is the number of octets consumed
                    494:  *     as the return value is positive, else unpredictiable.
1.1       daniel    495:  */
                    496: int
1.33      daniel    497: UTF16BEToUTF8(unsigned char* out, int *outlen,
1.25      daniel    498:             const unsigned char* inb, int *inlenb)
1.1       daniel    499: {
1.33      daniel    500:     unsigned char* outstart = out;
                    501:     const unsigned char* processed = inb;
                    502:     unsigned char* outend = out + *outlen;
1.25      daniel    503:     unsigned short* in = (unsigned short*) inb;
                    504:     unsigned short* inend;
                    505:     unsigned int c, d, inlen;
1.28      daniel    506:     unsigned char *tmp;
1.1       daniel    507:     int bits;
                    508: 
1.28      daniel    509:     if ((*inlenb % 2) == 1)
                    510:         (*inlenb)--;
1.25      daniel    511:     inlen = *inlenb / 2;
                    512:     inend= in + inlen;
1.1       daniel    513:     while (in < inend) {
1.34      daniel    514:        if (xmlLittleEndian) {
                    515:            tmp = (unsigned char *) in;
                    516:            c = *tmp++;
                    517:            c = c << 8;
                    518:            c = c | (unsigned int) *tmp;
                    519:            in++;
                    520:        } else {
                    521:            c= *in++;
                    522:        } 
1.1       daniel    523:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.28      daniel    524:            if (in >= inend) {           /* (in > inend) shouldn't happens */
1.33      daniel    525:                *outlen = out - outstart;
                    526:                *inlenb = processed - inb;
                    527:                return(-2);
1.28      daniel    528:            }
1.34      daniel    529:            if (xmlLittleEndian) {
                    530:                tmp = (unsigned char *) in;
                    531:                d = *tmp++;
                    532:                d = d << 8;
                    533:                d = d | (unsigned int) *tmp;
                    534:                in++;
                    535:            } else {
                    536:                d= *in++;
                    537:            }
1.28      daniel    538:             if ((d & 0xFC00) == 0xDC00) {
1.1       daniel    539:                 c &= 0x03FF;
                    540:                 c <<= 10;
                    541:                 c |= d & 0x03FF;
                    542:                 c += 0x10000;
                    543:             }
1.33      daniel    544:             else {
                    545:                *outlen = out - outstart;
                    546:                *inlenb = processed - inb;
1.28      daniel    547:                return(-2);
1.33      daniel    548:            }
1.1       daniel    549:         }
                    550: 
1.25      daniel    551:        /* assertion: c is a single UTF-4 value */
1.27      daniel    552:         if (out >= outend) 
1.33      daniel    553:            break;
1.1       daniel    554:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    555:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    556:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    557:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    558:  
1.26      daniel    559:         for ( ; bits >= 0; bits-= 6) {
1.27      daniel    560:             if (out >= outend) 
1.33      daniel    561:                break;
1.26      daniel    562:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    563:         }
1.33      daniel    564:        processed = (const unsigned char*) in;
1.1       daniel    565:     }
1.33      daniel    566:     *outlen = out - outstart;
                    567:     *inlenb = processed - inb;
                    568:     return(0);
1.1       daniel    569: }
                    570: 
                    571: /**
1.28      daniel    572:  * UTF8ToUTF16BE:
1.25      daniel    573:  * @outb:  a pointer to an array of bytes to store the result
                    574:  * @outlen:  the length of @outb
1.18      daniel    575:  * @in:  a pointer to an array of UTF-8 chars
                    576:  * @inlen:  the length of @in
1.1       daniel    577:  *
1.28      daniel    578:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
1.1       daniel    579:  * block of chars out.
1.28      daniel    580:  * TODO: UTF8ToUTF16BE need a fallback mechanism ...
1.15      daniel    581:  *
1.6       daniel    582:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    583:  *     if the transcoding failed. 
1.1       daniel    584:  */
                    585: int
1.33      daniel    586: UTF8ToUTF16BE(unsigned char* outb, int *outlen,
1.25      daniel    587:             const unsigned char* in, int *inlen)
1.1       daniel    588: {
1.25      daniel    589:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    590:     const unsigned char* processed = in;
1.1       daniel    591:     unsigned short* outstart= out;
1.28      daniel    592:     unsigned short* outend;
1.25      daniel    593:     const unsigned char* inend= in+*inlen;
1.1       daniel    594:     unsigned int c, d, trailing;
1.28      daniel    595:     unsigned char *tmp;
                    596:     unsigned short tmp1, tmp2;
1.1       daniel    597: 
1.37      daniel    598:     if (in == NULL) {
                    599:         /*
                    600:         * initialization, add the Byte Order Mark
                    601:         */
                    602:         if (*outlen >= 2) {
                    603:            outb[0] = 0xFE;
                    604:            outb[1] = 0xFF;
                    605:            *outlen = 2;
                    606:            *inlen = 0;
                    607: #ifdef DEBUG_ENCODING
                    608:             fprintf(stderr, "Added FEFF Byte Order Mark\n");
                    609: #endif
                    610:            return(2);
                    611:        }
                    612:        *outlen = 0;
                    613:        *inlen = 0;
                    614:        return(0);
                    615:     }
1.33      daniel    616:     outend = out + (*outlen / 2);
1.1       daniel    617:     while (in < inend) {
                    618:       d= *in++;
                    619:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    620:       else if (d < 0xC0)  {
                    621:           /* trailing byte in leading position */
                    622:          *outlen = out - outstart;
                    623:          *inlen = processed - in;
                    624:          return(-2);
                    625:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.1       daniel    626:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    627:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    628:       else {
                    629:           /* no chance for this in UTF-16 */
                    630:          *outlen = out - outstart;
                    631:          *inlen = processed - in;
                    632:          return(-2);
                    633:       }
1.28      daniel    634: 
                    635:       if (inend - in < trailing) {
                    636:           break;
                    637:       } 
1.1       daniel    638: 
                    639:       for ( ; trailing; trailing--) {
1.33      daniel    640:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1.1       daniel    641:           c <<= 6;
                    642:           c |= d & 0x3F;
                    643:       }
                    644: 
                    645:       /* assertion: c is a single UTF-4 value */
                    646:         if (c < 0x10000) {
1.33      daniel    647:             if (out >= outend)  break;
1.34      daniel    648:            if (xmlLittleEndian) {
                    649:                tmp = (unsigned char *) out;
                    650:                *tmp = c >> 8;
                    651:                *(tmp + 1) = c;
                    652:                out++;
                    653:            } else {
                    654:                *out++ = c;
                    655:            }
1.1       daniel    656:         }
                    657:         else if (c < 0x110000) {
1.33      daniel    658:             if (out+1 >= outend)  break;
1.1       daniel    659:             c -= 0x10000;
1.34      daniel    660:            if (xmlLittleEndian) {
                    661:                tmp1 = 0xD800 | (c >> 10);
                    662:                tmp = (unsigned char *) out;
                    663:                *tmp = tmp1 >> 8;
                    664:                *(tmp + 1) = tmp1;
                    665:                out++;
                    666: 
                    667:                tmp2 = 0xDC00 | (c & 0x03FF);
                    668:                tmp = (unsigned char *) out;
                    669:                *tmp = tmp2 >> 8;
                    670:                *(tmp + 1) = tmp2;
                    671:                out++;
                    672:            } else {
                    673:                *out++ = 0xD800 | (c >> 10);
                    674:                *out++ = 0xDC00 | (c & 0x03FF);
                    675:            }
1.1       daniel    676:         }
1.33      daniel    677:         else
                    678:            break;
                    679:        processed = in;
1.1       daniel    680:     }
1.36      daniel    681:     *outlen = (out - outstart) * 2;
1.33      daniel    682:     *inlen = processed - in;
                    683:     return(0);
1.1       daniel    684: }
                    685: 
1.7       daniel    686: /**
                    687:  * xmlDetectCharEncoding:
                    688:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    689:  *       4 bytes long.
1.25      daniel    690:  * @len:  pointer to the length of the buffer
1.7       daniel    691:  *
                    692:  * Guess the encoding of the entity using the first bytes of the entity content
                    693:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    694:  * 
                    695:  * Returns one of the XML_CHAR_ENCODING_... values.
                    696:  */
                    697: xmlCharEncoding
1.25      daniel    698: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    699: {
1.25      daniel    700:     if (len >= 4) {
                    701:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    702:            (in[2] == 0x00) && (in[3] == 0x3C))
                    703:            return(XML_CHAR_ENCODING_UCS4BE);
                    704:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    705:            (in[2] == 0x00) && (in[3] == 0x00))
                    706:            return(XML_CHAR_ENCODING_UCS4LE);
                    707:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    708:            (in[2] == 0x3C) && (in[3] == 0x00))
                    709:            return(XML_CHAR_ENCODING_UCS4_2143);
                    710:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    711:            (in[2] == 0x00) && (in[3] == 0x00))
                    712:            return(XML_CHAR_ENCODING_UCS4_3412);
                    713:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    714:            (in[2] == 0xA7) && (in[3] == 0x94))
                    715:            return(XML_CHAR_ENCODING_EBCDIC);
                    716:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    717:            (in[2] == 0x78) && (in[3] == 0x6D))
                    718:            return(XML_CHAR_ENCODING_UTF8);
                    719:     }
                    720:     if (len >= 2) {
                    721:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    722:            return(XML_CHAR_ENCODING_UTF16BE);
                    723:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    724:            return(XML_CHAR_ENCODING_UTF16LE);
                    725:     }
1.7       daniel    726:     return(XML_CHAR_ENCODING_NONE);
                    727: }
                    728: 
                    729: /**
                    730:  * xmlParseCharEncoding:
1.18      daniel    731:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    732:  *
                    733:  * Conpare the string to the known encoding schemes already known. Note
                    734:  * that the comparison is case insensitive accordingly to the section
                    735:  * [XML] 4.3.3 Character Encoding in Entities.
                    736:  * 
                    737:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    738:  * if not recognized.
                    739:  */
                    740: xmlCharEncoding
1.8       daniel    741: xmlParseCharEncoding(const char* name)
1.7       daniel    742: {
                    743:     char upper[500];
                    744:     int i;
                    745: 
                    746:     for (i = 0;i < 499;i++) {
                    747:         upper[i] = toupper(name[i]);
                    748:        if (upper[i] == 0) break;
                    749:     }
                    750:     upper[i] = 0;
                    751: 
                    752:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    753:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    754:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    755: 
                    756:     /*
                    757:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    758:      *       already found and in use
                    759:      */
                    760:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    761:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    762:     
                    763:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    764:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    765:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    766: 
                    767:     /*
                    768:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    769:      *       already found and in use
                    770:      */
                    771:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    772:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    773:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    774: 
                    775:     
                    776:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    777:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    778:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    779: 
                    780:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    781:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    782:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    783: 
                    784:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    785:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    786:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    787:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    788:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    789:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    790:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    791: 
                    792:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30      daniel    793:     if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7       daniel    794:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30      daniel    795: 
                    796: #ifdef DEBUG_ENCODING
                    797:     fprintf(stderr, "Unknown encoding %s\n", name);
                    798: #endif
1.7       daniel    799:     return(XML_CHAR_ENCODING_ERROR);
                    800: }
1.9       daniel    801: 
1.38      daniel    802: /**
                    803:  * xmlGetCharEncodingName:
                    804:  * @enc:  the encoding
                    805:  *
                    806:  * The "canonical" name for XML encoding.
                    807:  * C.f. http://www.w3.org/TR/REC-xml#charencoding
                    808:  * Section 4.3.3  Character Encoding in Entities
                    809:  *
                    810:  * Returns the canonical name for the given encoding
                    811:  */
                    812: 
                    813: const char*
                    814: xmlGetCharEncodingName(xmlCharEncoding enc) {
                    815:     switch (enc) {
                    816:         case XML_CHAR_ENCODING_ERROR:
                    817:            return(NULL);
                    818:         case XML_CHAR_ENCODING_NONE:
                    819:            return(NULL);
                    820:         case XML_CHAR_ENCODING_UTF8:
                    821:            return("UTF-8");
                    822:         case XML_CHAR_ENCODING_UTF16LE:
                    823:            return("UTF-16");
                    824:         case XML_CHAR_ENCODING_UTF16BE:
                    825:            return("UTF-16");
                    826:         case XML_CHAR_ENCODING_EBCDIC:
                    827:             return("EBCDIC");
                    828:         case XML_CHAR_ENCODING_UCS4LE:
                    829:             return("ISO-10646-UCS-4");
                    830:         case XML_CHAR_ENCODING_UCS4BE:
                    831:             return("ISO-10646-UCS-4");
                    832:         case XML_CHAR_ENCODING_UCS4_2143:
                    833:             return("ISO-10646-UCS-4");
                    834:         case XML_CHAR_ENCODING_UCS4_3412:
                    835:             return("ISO-10646-UCS-4");
                    836:         case XML_CHAR_ENCODING_UCS2:
                    837:             return("ISO-10646-UCS-2");
                    838:         case XML_CHAR_ENCODING_8859_1:
                    839:            return("ISO-8859-1");
                    840:         case XML_CHAR_ENCODING_8859_2:
                    841:            return("ISO-8859-2");
                    842:         case XML_CHAR_ENCODING_8859_3:
                    843:            return("ISO-8859-3");
                    844:         case XML_CHAR_ENCODING_8859_4:
                    845:            return("ISO-8859-4");
                    846:         case XML_CHAR_ENCODING_8859_5:
                    847:            return("ISO-8859-5");
                    848:         case XML_CHAR_ENCODING_8859_6:
                    849:            return("ISO-8859-6");
                    850:         case XML_CHAR_ENCODING_8859_7:
                    851:            return("ISO-8859-7");
                    852:         case XML_CHAR_ENCODING_8859_8:
                    853:            return("ISO-8859-8");
                    854:         case XML_CHAR_ENCODING_8859_9:
                    855:            return("ISO-8859-9");
                    856:         case XML_CHAR_ENCODING_2022_JP:
                    857:             return("ISO-2022-JP");
                    858:         case XML_CHAR_ENCODING_SHIFT_JIS:
                    859:             return("Shift-JIS");
                    860:         case XML_CHAR_ENCODING_EUC_JP:
                    861:             return("EUC-JP");
                    862:     }
                    863:     return(NULL);
                    864: }
                    865: 
1.9       daniel    866: /****************************************************************
                    867:  *                                                             *
                    868:  *             Char encoding handlers                          *
                    869:  *                                                             *
                    870:  ****************************************************************/
                    871: 
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
/*
 * Table of the registered handlers. It is allocated with room for
 * MAX_ENCODING_HANDLERS entries by xmlInitCharEncodingHandlers() and
 * released by xmlCleanupCharEncodingHandlers().
 */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of entries of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                    883: 
                    884: /**
                    885:  * xmlNewCharEncodingHandler:
1.18      daniel    886:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    887:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    888:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    889:  *
                    890:  * Create and registers an xmlCharEncodingHandler.
                    891:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    892:  */
                    893: xmlCharEncodingHandlerPtr
1.25      daniel    894: xmlNewCharEncodingHandler(const char *name, 
                    895:                           xmlCharEncodingInputFunc input,
1.9       daniel    896:                           xmlCharEncodingOutputFunc output) {
                    897:     xmlCharEncodingHandlerPtr handler;
                    898:     char upper[500];
                    899:     int i;
                    900:     char *up = 0;
                    901: 
                    902:     /*
                    903:      * Keep only the uppercase version of the encoding.
                    904:      */
                    905:     if (name == NULL) {
                    906:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    907:        return(NULL);
                    908:     }
                    909:     for (i = 0;i < 499;i++) {
                    910:         upper[i] = toupper(name[i]);
                    911:        if (upper[i] == 0) break;
                    912:     }
                    913:     upper[i] = 0;
1.16      daniel    914:     up = xmlMemStrdup(upper);
1.9       daniel    915:     if (up == NULL) {
                    916:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    917:        return(NULL);
                    918:     }
                    919: 
                    920:     /*
                    921:      * allocate and fill-up an handler block.
                    922:      */
                    923:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    924:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    925:     if (handler == NULL) {
                    926:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    927:        return(NULL);
                    928:     }
                    929:     handler->input = input;
                    930:     handler->output = output;
                    931:     handler->name = up;
                    932: 
                    933:     /*
                    934:      * registers and returns the handler.
                    935:      */
                    936:     xmlRegisterCharEncodingHandler(handler);
1.30      daniel    937: #ifdef DEBUG_ENCODING
                    938:     fprintf(stderr, "Registered encoding handler for %s\n", name);
                    939: #endif
1.9       daniel    940:     return(handler);
                    941: }
                    942: 
                    943: /**
                    944:  * xmlInitCharEncodingHandlers:
                    945:  *
                    946:  * Initialize the char encoding support, it registers the default
                    947:  * encoding supported.
1.18      daniel    948:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    949:  *       in normal processing.
                    950:  */
                    951: void
                    952: xmlInitCharEncodingHandlers(void) {
1.34      daniel    953:     unsigned short int tst = 0x1234;
                    954:     unsigned char *ptr = (unsigned char *) &tst; 
                    955: 
1.9       daniel    956:     if (handlers != NULL) return;
                    957: 
                    958:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    959:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34      daniel    960: 
                    961:     if (*ptr == 0x12) xmlLittleEndian = 0;
                    962:     else if (*ptr == 0x34) xmlLittleEndian = 1;
                    963:     else fprintf(stderr, "Odd problem at endianness detection\n");
1.9       daniel    964: 
                    965:     if (handlers == NULL) {
                    966:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    967:        return;
                    968:     }
1.10      daniel    969:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel    970:     xmlUTF16LEHandler = 
1.28      daniel    971:           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
                    972:     xmlUTF16BEHandler = 
                    973:           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10      daniel    974:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    975: }
                    976: 
                    977: /**
1.19      daniel    978:  * xmlCleanupCharEncodingHandlers:
                    979:  *
                    980:  * Cleanup the memory allocated for the char encoding support, it
                    981:  * unregisters all the encoding handlers.
                    982:  */
                    983: void
                    984: xmlCleanupCharEncodingHandlers(void) {
                    985:     if (handlers == NULL) return;
                    986: 
                    987:     for (;nbCharEncodingHandler > 0;) {
                    988:         nbCharEncodingHandler--;
                    989:        if (handlers[nbCharEncodingHandler] != NULL) {
1.31      daniel    990:            if (handlers[nbCharEncodingHandler]->name != NULL)
                    991:                xmlFree(handlers[nbCharEncodingHandler]->name);
1.19      daniel    992:            xmlFree(handlers[nbCharEncodingHandler]);
                    993:        }
                    994:     }
                    995:     xmlFree(handlers);
                    996:     handlers = NULL;
                    997:     nbCharEncodingHandler = 0;
                    998:     xmlDefaultCharEncodingHandler = NULL;
                    999: }
                   1000: 
                   1001: /**
1.9       daniel   1002:  * xmlRegisterCharEncodingHandler:
                   1003:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                   1004:  *
                   1005:  * Register the char encoding handler, surprizing, isn't it ?
                   1006:  */
                   1007: void
                   1008: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                   1009:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                   1010:     if (handler == NULL) {
                   1011:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                   1012:        return;
                   1013:     }
                   1014: 
                   1015:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                   1016:         fprintf(stderr, 
                   1017:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                   1018:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                   1019:        return;
                   1020:     }
                   1021:     handlers[nbCharEncodingHandler++] = handler;
                   1022: }
                   1023: 
                   1024: /**
                   1025:  * xmlGetCharEncodingHandler:
                   1026:  * @enc:  an xmlCharEncoding value.
                   1027:  *
                   1028:  * Search in the registrered set the handler able to read/write that encoding.
                   1029:  *
                   1030:  * Returns the handler or NULL if not found
                   1031:  */
                   1032: xmlCharEncodingHandlerPtr
                   1033: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30      daniel   1034:     xmlCharEncodingHandlerPtr handler;
                   1035: 
1.9       daniel   1036:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel   1037:     switch (enc) {
                   1038:         case XML_CHAR_ENCODING_ERROR:
                   1039:            return(NULL);
                   1040:         case XML_CHAR_ENCODING_NONE:
                   1041:            return(NULL);
                   1042:         case XML_CHAR_ENCODING_UTF8:
                   1043:            return(NULL);
                   1044:         case XML_CHAR_ENCODING_UTF16LE:
                   1045:            return(xmlUTF16LEHandler);
                   1046:         case XML_CHAR_ENCODING_UTF16BE:
                   1047:            return(xmlUTF16BEHandler);
                   1048:         case XML_CHAR_ENCODING_EBCDIC:
1.30      daniel   1049:             handler = xmlFindCharEncodingHandler("EBCDIC");
                   1050:             if (handler != NULL) return(handler);
                   1051:             handler = xmlFindCharEncodingHandler("ebcdic");
                   1052:             if (handler != NULL) return(handler);
                   1053:            break;
1.38      daniel   1054:         case XML_CHAR_ENCODING_UCS4BE:
1.30      daniel   1055:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                   1056:             if (handler != NULL) return(handler);
                   1057:             handler = xmlFindCharEncodingHandler("UCS-4");
                   1058:             if (handler != NULL) return(handler);
                   1059:             handler = xmlFindCharEncodingHandler("UCS4");
                   1060:             if (handler != NULL) return(handler);
                   1061:            break;
1.38      daniel   1062:         case XML_CHAR_ENCODING_UCS4LE:
                   1063:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                   1064:             if (handler != NULL) return(handler);
                   1065:             handler = xmlFindCharEncodingHandler("UCS-4");
                   1066:             if (handler != NULL) return(handler);
                   1067:             handler = xmlFindCharEncodingHandler("UCS4");
1.30      daniel   1068:             if (handler != NULL) return(handler);
                   1069:            break;
1.25      daniel   1070:         case XML_CHAR_ENCODING_UCS4_2143:
1.30      daniel   1071:            break;
1.25      daniel   1072:         case XML_CHAR_ENCODING_UCS4_3412:
1.30      daniel   1073:            break;
1.25      daniel   1074:         case XML_CHAR_ENCODING_UCS2:
1.30      daniel   1075:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
                   1076:             if (handler != NULL) return(handler);
                   1077:             handler = xmlFindCharEncodingHandler("UCS-2");
                   1078:             if (handler != NULL) return(handler);
                   1079:             handler = xmlFindCharEncodingHandler("UCS2");
                   1080:             if (handler != NULL) return(handler);
                   1081:            break;
1.25      daniel   1082:         case XML_CHAR_ENCODING_8859_1:
                   1083:         case XML_CHAR_ENCODING_8859_2:
                   1084:         case XML_CHAR_ENCODING_8859_3:
                   1085:         case XML_CHAR_ENCODING_8859_4:
                   1086:         case XML_CHAR_ENCODING_8859_5:
                   1087:         case XML_CHAR_ENCODING_8859_6:
                   1088:         case XML_CHAR_ENCODING_8859_7:
                   1089:         case XML_CHAR_ENCODING_8859_8:
                   1090:         case XML_CHAR_ENCODING_8859_9:
                   1091:            return(NULL);
                   1092:         case XML_CHAR_ENCODING_2022_JP:
1.30      daniel   1093:             handler = xmlFindCharEncodingHandler("ISO-2022-JP");
                   1094:             if (handler != NULL) return(handler);
                   1095:            break;
1.25      daniel   1096:         case XML_CHAR_ENCODING_SHIFT_JIS:
1.30      daniel   1097:             handler = xmlFindCharEncodingHandler("SHIFT-JIS");
                   1098:             if (handler != NULL) return(handler);
                   1099:             handler = xmlFindCharEncodingHandler("SHIFT_JIS");
                   1100:             if (handler != NULL) return(handler);
                   1101:             handler = xmlFindCharEncodingHandler("Shift_JIS");
                   1102:             if (handler != NULL) return(handler);
                   1103:            break;
1.25      daniel   1104:         case XML_CHAR_ENCODING_EUC_JP:
1.30      daniel   1105:             handler = xmlFindCharEncodingHandler("EUC-JP");
                   1106:             if (handler != NULL) return(handler);
                   1107:            break;
                   1108:        default: 
                   1109:            break;
1.25      daniel   1110:     }
1.30      daniel   1111:     
                   1112: #ifdef DEBUG_ENCODING
                   1113:     fprintf(stderr, "No handler found for encoding %d\n", enc);
                   1114: #endif
1.9       daniel   1115:     return(NULL);
                   1116: }
                   1117: 
                   1118: /**
                   1119:  * xmlFindCharEncodingHandler:
                   1120:  * @name:  a string describing the char encoding.
                   1121:  *
                   1122:  * Search in the registered set the handler able to read/write that encoding.
                   1123:  *
                   1124:  * Returns the handler or NULL if not found
                   1125:  */
                   1126: xmlCharEncodingHandlerPtr
                   1127: xmlFindCharEncodingHandler(const char *name) {
1.36      daniel   1128:     xmlCharEncodingHandlerPtr enc;
                   1129:     xmlCharEncoding alias;
1.30      daniel   1130: #ifdef LIBXML_ICONV_ENABLED
                   1131:     iconv_t icv_in, icv_out;
                   1132: #endif /* LIBXML_ICONV_ENABLED */
                   1133:     char upper[100];
1.9       daniel   1134:     int i;
                   1135: 
                   1136:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                   1137:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                   1138:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                   1139: 
1.36      daniel   1140:     /*
                   1141:      * Check first for directly registered encoding names
                   1142:      */
1.30      daniel   1143:     for (i = 0;i < 99;i++) {
1.9       daniel   1144:         upper[i] = toupper(name[i]);
                   1145:        if (upper[i] == 0) break;
                   1146:     }
                   1147:     upper[i] = 0;
                   1148: 
                   1149:     for (i = 0;i < nbCharEncodingHandler; i++)
1.30      daniel   1150:         if (!strcmp(upper, handlers[i]->name)) {
                   1151: #ifdef DEBUG_ENCODING
                   1152:             fprintf(stderr, "Found registered handler for encoding %s\n", name);
                   1153: #endif
1.9       daniel   1154:            return(handlers[i]);
1.30      daniel   1155:        }
1.9       daniel   1156: 
1.30      daniel   1157: #ifdef LIBXML_ICONV_ENABLED
                   1158:     /* check whether iconv can handle this */
1.31      daniel   1159:     icv_in = iconv_open("UTF-8", name);
                   1160:     icv_out = iconv_open(name, "UTF-8");
1.30      daniel   1161:     if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31      daniel   1162:            enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32      daniel   1163:            if (enc == NULL) {
                   1164:                iconv_close(icv_in);
                   1165:                iconv_close(icv_out);
                   1166:                return(NULL);
                   1167:            }
                   1168:            enc->name = NULL;
1.30      daniel   1169:            enc->input = NULL;
                   1170:            enc->output = NULL;
                   1171:            enc->iconv_in = icv_in;
                   1172:            enc->iconv_out = icv_out;
                   1173: #ifdef DEBUG_ENCODING
                   1174:             fprintf(stderr, "Found iconv handler for encoding %s\n", name);
                   1175: #endif
                   1176:            return enc;
                   1177:     } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
                   1178:            fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
                   1179:     }
                   1180: #endif /* LIBXML_ICONV_ENABLED */
1.38      daniel   1181: 
1.30      daniel   1182: #ifdef DEBUG_ENCODING
                   1183:     fprintf(stderr, "No handler found for encoding %s\n", name);
                   1184: #endif
1.38      daniel   1185: 
                   1186:     /*
                   1187:      * Fallback using the canonical names
                   1188:      */
                   1189:     alias = xmlParseCharEncoding(name);
                   1190:     if (alias != XML_CHAR_ENCODING_ERROR) {
                   1191:         const char* canon;
                   1192:         canon = xmlGetCharEncodingName(alias);
                   1193:         if ((canon != NULL) && (strcmp(name, canon))) {
                   1194:            return(xmlFindCharEncodingHandler(canon));
                   1195:         }
                   1196:     }
                   1197: 
1.9       daniel   1198:     return(NULL);
1.30      daniel   1199: }
                   1200: 
                   1201: #ifdef LIBXML_ICONV_ENABLED
                   1202: /**
                   1203:  * xmlIconvWrapper:
                   1204:  * @cd:                iconv converter data structure
                   1205:  * @out:  a pointer to an array of bytes to store the result
                   1206:  * @outlen:  the length of @out
                   1207:  * @in:  a pointer to an array of ISO Latin 1 chars
                   1208:  * @inlen:  the length of @in
                   1209:  *
                   1210:  * Returns 0 if success, or 
                   1211:  *     -1 by lack of space, or
                   1212:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1213:  *        the result of transformation can't fit into the encoding we want), or
                   1214:  *     -3 if there the last byte can't form a single output char.
                   1215:  *     
                   1216:  * The value of @inlen after return is the number of octets consumed
                   1217:  *     as the return value is positive, else unpredictiable.
                   1218:  * The value of @outlen after return is the number of ocetes consumed.
                   1219:  */
                   1220: static int
                   1221: xmlIconvWrapper(iconv_t cd,
                   1222:        unsigned char *out, int *outlen,
                   1223:        const unsigned char *in, int *inlen) {
                   1224: 
                   1225:        size_t icv_inlen = *inlen, icv_outlen = *outlen;
                   1226:        const char *icv_in = (const char *) in;
                   1227:        char *icv_out = (char *) out;
                   1228:        int ret;
                   1229: 
                   1230:        ret = iconv(cd,
                   1231:                &icv_in, &icv_inlen,
                   1232:                &icv_out, &icv_outlen);
1.35      daniel   1233:        if (in != NULL) {
                   1234:            *inlen -= icv_inlen;
                   1235:            *outlen -= icv_outlen;
                   1236:        } else {
                   1237:            *inlen = 0;
                   1238:            *outlen = 0;
                   1239:        }
1.30      daniel   1240:        if (icv_inlen != 0 || ret == (size_t) -1) {
                   1241: #ifdef EILSEQ
                   1242:                if (errno == EILSEQ) {
1.31      daniel   1243:                        return -2;
1.30      daniel   1244:                } else
                   1245: #endif
                   1246: #ifdef E2BIG
                   1247:                if (errno == E2BIG) {
                   1248:                        return -1;
                   1249:                } else
                   1250: #endif
                   1251: #ifdef EINVAL
                   1252:                if (errno == EINVAL) {
1.31      daniel   1253:                        return -3;
1.30      daniel   1254:                }
                   1255: #endif
                   1256:                else {
                   1257:                        return -3;
                   1258:                }
                   1259:        }
                   1260:        return 0;
                   1261: }
                   1262: #endif /* LIBXML_ICONV_ENABLED */
1.38      daniel   1263: 
                   1264: /**
                   1265:  * xmlCharEncFirstLine:
                   1266:  * @handler:   char enconding transformation data structure
                   1267:  * @out:  an xmlBuffer for the output.
                   1268:  * @in:  an xmlBuffer for the input
                   1269:  *     
                   1270:  * Front-end for the encoding handler input function, but handle only
                   1271:  * the very first line, i.e. limit itself to 45 chars.
                   1272:  *     
                   1273:  * Returns the number of byte written if success, or 
                   1274:  *     -1 general error
                   1275:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1276:  *        the result of transformation can't fit into the encoding we want), or
                   1277:  */
                   1278: int
                   1279: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1280:                  xmlBufferPtr in) {
                   1281:     int ret = -2;
                   1282:     int written;
                   1283:     int toconv;
                   1284: 
                   1285:     if (handler == NULL) return(-1);
                   1286:     if (out == NULL) return(-1);
                   1287:     if (in == NULL) return(-1);
                   1288: 
                   1289:     written = out->size - out->use;
                   1290:     toconv = in->use;
                   1291:     if (toconv * 2 >= written) {
1.39    ! daniel   1292:         xmlBufferGrow(out, toconv);
1.38      daniel   1293:        written = out->size - out->use - 1;
                   1294:     }
1.39    ! daniel   1295: 
1.38      daniel   1296:     /*
                   1297:      * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
                   1298:      * 45 chars should be sufficient to reach the end of the encoding
                   1299:      * decalration without going too far inside the document content.
                   1300:      */
                   1301:     written = 45;
                   1302: 
                   1303:     if (handler->input != NULL) {
                   1304:        ret = handler->input(&out->content[out->use], &written,
                   1305:                             in->content, &toconv);
                   1306:        xmlBufferShrink(in, toconv);
                   1307:        out->use += written;
                   1308:        out->content[out->use] = 0;
                   1309:     }
                   1310: #ifdef LIBXML_ICONV_ENABLED
                   1311:     else if (handler->iconv_in != NULL) {
                   1312:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1313:                              &written, in->content, &toconv);
                   1314:        xmlBufferShrink(in, toconv);
                   1315:        out->use += written;
                   1316:        out->content[out->use] = 0;
                   1317:        if (ret == -1) ret = -3;
                   1318:     }
                   1319: #endif /* LIBXML_ICONV_ENABLED */
                   1320: #ifdef DEBUG_ENCODING
                   1321:     switch (ret) {
                   1322:         case 0:
                   1323:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
                   1324:                    toconv, written);
                   1325:            break;
                   1326:         case -1:
                   1327:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1328:                    toconv, written, in->use);
                   1329:            break;
                   1330:         case -2:
                   1331:            fprintf(stderr, "input conversion failed due to input error\n");
                   1332:            break;
                   1333:         case -3:
                   1334:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1335:                    toconv, written, in->use);
                   1336:            break;
                   1337:        default:
                   1338:            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
                   1339:     }
                   1340: #endif
                   1341:     /*
                   1342:      * Ignore when input buffer is not on a boundary
                   1343:      */
                   1344:     if (ret == -3) ret = 0;
                   1345:     if (ret == -1) ret = 0;
                   1346:     return(ret);
                   1347: }
1.30      daniel   1348: 
                   1349: /**
                   1350:  * xmlCharEncInFunc:
                   1351:  * @handler:   char enconding transformation data structure
1.31      daniel   1352:  * @out:  an xmlBuffer for the output.
                   1353:  * @in:  an xmlBuffer for the input
1.30      daniel   1354:  *     
                   1355:  * Generic front-end for the encoding handler input function
                   1356:  *     
1.31      daniel   1357:  * Returns the number of byte written if success, or 
                   1358:  *     -1 general error
1.30      daniel   1359:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1360:  *        the result of transformation can't fit into the encoding we want), or
                   1361:  */
                   1362: int
1.31      daniel   1363: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1364:                  xmlBufferPtr in) {
1.30      daniel   1365:     int ret = -2;
1.31      daniel   1366:     int written;
                   1367:     int toconv;
1.30      daniel   1368: 
1.31      daniel   1369:     if (handler == NULL) return(-1);
                   1370:     if (out == NULL) return(-1);
                   1371:     if (in == NULL) return(-1);
                   1372: 
                   1373:     written = out->size - out->use;
                   1374:     toconv = in->use;
                   1375:     if (toconv * 2 >= written) {
                   1376:         xmlBufferGrow(out, toconv * 2);
1.33      daniel   1377:        written = out->size - out->use - 1;
1.31      daniel   1378:     }
1.30      daniel   1379:     if (handler->input != NULL) {
1.32      daniel   1380:        ret = handler->input(&out->content[out->use], &written,
1.31      daniel   1381:                             in->content, &toconv);
                   1382:        xmlBufferShrink(in, toconv);
                   1383:        out->use += written;
1.33      daniel   1384:        out->content[out->use] = 0;
1.30      daniel   1385:     }
                   1386: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1387:     else if (handler->iconv_in != NULL) {
                   1388:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1389:                              &written, in->content, &toconv);
                   1390:        xmlBufferShrink(in, toconv);
                   1391:        out->use += written;
1.33      daniel   1392:        out->content[out->use] = 0;
                   1393:        if (ret == -1) ret = -3;
1.30      daniel   1394:     }
                   1395: #endif /* LIBXML_ICONV_ENABLED */
1.39    ! daniel   1396:     switch (ret) {
1.30      daniel   1397: #ifdef DEBUG_ENCODING
                   1398:         case 0:
                   1399:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31      daniel   1400:                    toconv, written);
1.30      daniel   1401:            break;
                   1402:         case -1:
1.31      daniel   1403:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1404:                    toconv, written, in->use);
1.30      daniel   1405:            break;
                   1406:         case -3:
1.31      daniel   1407:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1408:                    toconv, written, in->use);
1.30      daniel   1409:            break;
1.39    ! daniel   1410: #endif
        !          1411:         case -2:
        !          1412:            fprintf(stderr, "input conversion failed due to input error\n");
        !          1413:            fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
        !          1414:                    in->content[0], in->content[1],
        !          1415:                    in->content[2], in->content[3]);
1.30      daniel   1416:     }
1.33      daniel   1417:     /*
                   1418:      * Ignore when input buffer is not on a boundary
                   1419:      */
                   1420:     if (ret == -3) ret = 0;
1.30      daniel   1421:     return(ret);
                   1422: }
                   1423: 
                   1424: /**
                   1425:  * xmlCharEncOutFunc:
                   1426:  * @handler:   char enconding transformation data structure
1.31      daniel   1427:  * @out:  an xmlBuffer for the output.
                   1428:  * @in:  an xmlBuffer for the input
                   1429:  *     
                   1430:  * Generic front-end for the encoding handler output function
1.35      daniel   1431:  * a first call with @in == NULL has to be made firs to initiate the 
                   1432:  * output in case of non-stateless encoding needing to initiate their
                   1433:  * state or the output (like the BOM in UTF16).
1.39    ! daniel   1434:  * In case of UTF8 sequence conversion errors for the given encoder,
        !          1435:  * the content will be automatically remapped to a CharRef sequence.
1.30      daniel   1436:  *     
1.31      daniel   1437:  * Returns the number of byte written if success, or 
                   1438:  *     -1 general error
1.30      daniel   1439:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1440:  *        the result of transformation can't fit into the encoding we want), or
                   1441:  */
                   1442: int
1.31      daniel   1443: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1444:                   xmlBufferPtr in) {
1.30      daniel   1445:     int ret = -2;
1.31      daniel   1446:     int written;
                   1447:     int toconv;
1.39    ! daniel   1448:     int output = 0;
1.31      daniel   1449: 
                   1450:     if (handler == NULL) return(-1);
                   1451:     if (out == NULL) return(-1);
1.39    ! daniel   1452: 
        !          1453: retry:
        !          1454:     
1.35      daniel   1455:     written = out->size - out->use;
                   1456: 
1.39    ! daniel   1457:     /*
        !          1458:      * First specific handling of in = NULL, i.e. the initialization call
        !          1459:      */
1.35      daniel   1460:     if (in == NULL) {
                   1461:         toconv = 0;
                   1462:        if (handler->output != NULL) {
                   1463:            ret = handler->output(&out->content[out->use], &written,
                   1464:                                  NULL, &toconv);
                   1465:            out->use += written;
                   1466:            out->content[out->use] = 0;
                   1467:        }
                   1468: #ifdef LIBXML_ICONV_ENABLED
                   1469:        else if (handler->iconv_out != NULL) {
                   1470:            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1471:                                  &written, NULL, &toconv);
                   1472:            out->use += written;
                   1473:            out->content[out->use] = 0;
                   1474:        }
                   1475: #endif /* LIBXML_ICONV_ENABLED */
                   1476: #ifdef DEBUG_ENCODING
                   1477:        fprintf(stderr, "initialized encoder\n");
                   1478: #endif
                   1479:         return(0);
                   1480:     }
1.30      daniel   1481: 
1.39    ! daniel   1482:     /*
        !          1483:      * Convertion itself.
        !          1484:      */
1.33      daniel   1485:     toconv = in->use;
                   1486:     if (toconv * 2 >= written) {
                   1487:         xmlBufferGrow(out, toconv * 2);
                   1488:        written = out->size - out->use - 1;
                   1489:     }
1.30      daniel   1490:     if (handler->output != NULL) {
1.33      daniel   1491:        ret = handler->output(&out->content[out->use], &written,
1.35      daniel   1492:                              in->content, &toconv);
1.31      daniel   1493:        xmlBufferShrink(in, toconv);
                   1494:        out->use += written;
1.33      daniel   1495:        out->content[out->use] = 0;
1.30      daniel   1496:     }
                   1497: #ifdef LIBXML_ICONV_ENABLED
                   1498:     else if (handler->iconv_out != NULL) {
1.31      daniel   1499:        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1500:                              &written, in->content, &toconv);
                   1501:        xmlBufferShrink(in, toconv);
                   1502:        out->use += written;
1.33      daniel   1503:        out->content[out->use] = 0;
                   1504:        if (ret == -1) ret = -3;
1.30      daniel   1505:     }
                   1506: #endif /* LIBXML_ICONV_ENABLED */
1.39    ! daniel   1507: 
        !          1508:     if (ret >= 0) output += ret;
        !          1509: 
        !          1510:     /*
        !          1511:      * Attempt to handle error cases
        !          1512:      */
        !          1513:     switch (ret) {
1.30      daniel   1514: #ifdef DEBUG_ENCODING
                   1515:         case 0:
                   1516:            fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31      daniel   1517:                    toconv, written);
1.30      daniel   1518:            break;
                   1519:         case -1:
                   1520:            fprintf(stderr, "output conversion failed by lack of space\n");
                   1521:            break;
                   1522:         case -3:
1.31      daniel   1523:            fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
                   1524:                    toconv, written, in->use);
1.30      daniel   1525:            break;
1.39    ! daniel   1526: #endif
        !          1527:         case -2: {
        !          1528:            int len = in->use;
        !          1529:            const char *utf = (const char *) in->content;
        !          1530:            int cur;
        !          1531: 
        !          1532:            cur = xmlGetUTF8Char(utf, &len);
        !          1533:            if (cur > 0) {
        !          1534:                xmlChar charref[20];
        !          1535: 
        !          1536: #ifdef DEBUG_ENCODING
        !          1537:                fprintf(stderr, "handling output conversion error\n");
        !          1538:                fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
        !          1539:                        in->content[0], in->content[1],
        !          1540:                        in->content[2], in->content[3]);
        !          1541: #endif
        !          1542:                /*
        !          1543:                 * Removes the UTF8 sequence, and replace it by a charref
        !          1544:                 * and continue the transcoding phase, hoping the error
        !          1545:                 * did not mangle the encoder state.
        !          1546:                 */
        !          1547:                sprintf(charref, "&#x%X;", cur);
        !          1548:                xmlBufferShrink(in, len);
        !          1549:                xmlBufferAddHead(in, charref, -1);
        !          1550: 
        !          1551:                goto retry;
        !          1552:            } else {
        !          1553:                fprintf(stderr, "output conversion failed due to conv error\n");
        !          1554:                fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
        !          1555:                        in->content[0], in->content[1],
        !          1556:                        in->content[2], in->content[3]);
        !          1557:            }
        !          1558:            break;
        !          1559:        }
1.30      daniel   1560:     }
                   1561:     return(ret);
                   1562: }
                   1563: 
                   1564: /**
                   1565:  * xmlCharEncCloseFunc:
                   1566:  * @handler:   char enconding transformation data structure
                   1567:  *     
                   1568:  * Generic front-end for hencoding handler close function
                   1569:  *
                   1570:  * Returns 0 if success, or -1 in case of error
                   1571:  */
                   1572: int
                   1573: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
                   1574:     int ret = 0;
1.31      daniel   1575:     if (handler == NULL) return(-1);
                   1576:     if (handler->name == NULL) return(-1);
1.30      daniel   1577: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1578:     /*
                   1579:      * Iconv handlers can be oused only once, free the whole block.
                   1580:      * and the associated icon resources.
                   1581:      */
1.32      daniel   1582:     if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
                   1583:        if (handler->name != NULL)
                   1584:            xmlFree(handler->name);
                   1585:        handler->name = NULL;
                   1586:        if (handler->iconv_out != NULL) {
                   1587:            if (iconv_close(handler->iconv_out))
                   1588:                ret = -1;
                   1589:            handler->iconv_out = NULL;
                   1590:        }
                   1591:        if (handler->iconv_in != NULL) {
                   1592:            if (iconv_close(handler->iconv_in))
                   1593:                ret = -1;
                   1594:            handler->iconv_in = NULL;
                   1595:        }
                   1596:        xmlFree(handler);
1.30      daniel   1597:     }
                   1598: #endif /* LIBXML_ICONV_ENABLED */
                   1599: #ifdef DEBUG_ENCODING
                   1600:     if (ret)
                   1601:         fprintf(stderr, "failed to close the encoding handler\n");
                   1602:     else
                   1603:         fprintf(stderr, "closed the encoding handler\n");
                   1604: 
                   1605: #endif
                   1606:     return(ret);
1.9       daniel   1607: }
                   1608: 

Webmaster