Annotation of XML/encoding.c, revision 1.2
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 character codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
15: * Original code from "Martin J. Duerst" <duerst@w3.org>
16: *
17: * See Copyright for the status of this software.
18: *
1.2 ! daniel 19: * $Id: encoding.c,v 1.3 1998/10/27 06:20:50 veillard Exp $
1.1 daniel 20: *
21: * Daniel.Veillard@w3.org
22: */
23:
24: #include "encoding.h"
25:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; every other
 * byte expands to a two-byte UTF-8 sequence, so the output may need up to
 * twice @inlen bytes.
 *
 * A two-byte sequence is only emitted when both bytes fit, so on failure
 * @out never ends with a truncated sequence.
 *
 * return values: number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            /* plain ASCII: one byte in, one byte out */
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* check room for the whole sequence before writing any of it */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out-outstart);
}
60:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and two-byte sequences whose lead
 * byte is 0xC2 or 0xC3 (i.e. U+0080..U+00FF) can be represented; anything
 * else makes the conversion fail.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (character outside Latin 1, truncated
 *     sequence, or a lead byte not followed by a continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend) &&
                 ((*in & 0xC0) == 0x80)) {
            /*
             * Lead byte 0xC2/0xC3 followed by a real continuation byte
             * (10xxxxxx): the low 2 bits of the lead and the low 6 bits
             * of the continuation form the Latin 1 code.
             */
            if (out >= outend) return -1;
            *out++= (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        }
        else return -2;
    }
    return (int) (out-outstart);
}
96:
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in, in unsigned shorts
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate (0xD800..0xDBFF) must be followed
 * by a low surrogate (0xDC00..0xDFFF); the pair is combined into a single
 * code point and emitted as a four-byte sequence.
 *
 * A sequence is only emitted when all of its bytes fit, so on failure
 * @out never ends with a truncated sequence.
 *
 * return values: number of bytes written, or -1 by lack of space or when
 *     a high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;
    int needed;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /* total length of the UTF-8 sequence for c */
        if      (c <    0x80) needed= 1;
        else if (c <   0x800) needed= 2;
        else if (c < 0x10000) needed= 3;
        else                  needed= 4;
        if (outend - out < needed) return -1;

        /*
         * Lead byte; bits is the shift of the first continuation byte
         * (negative when there is none).
         */
        if      (c <    0x80) { *out++= (unsigned char) c; bits= -6; }
        else if (c <   0x800) { *out++= (unsigned char) ((c >>  6) | 0xC0); bits=  0; }
        else if (c < 0x10000) { *out++= (unsigned char) ((c >> 12) | 0xE0); bits=  6; }
        else                  { *out++= (unsigned char) ((c >> 18) | 0xF0); bits= 12; }

        /* continuation bytes: 6 payload bits each, tagged 10xxxxxx */
        for ( ; bits >= 0; bits-= 6) {
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out-outstart);
}
144:
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Code points below 0x10000 take one short; code
 * points from 0x10000 up to 0x10FFFF are written as a surrogate pair.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed (invalid lead byte, truncated or malformed
 *     sequence, or a value that UTF-16 cannot represent).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /*
             * Input ended mid-sequence, or the byte is not 10xxxxxx:
             * this is bad input, not a shortage of output space.
             */
            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free shorts */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond U+10FFFF: not encodable in UTF-16 */
    }
    return (int) (out-outstart);
}
196:
197:
Webmaster