diff options
Diffstat (limited to 'firmware/common')
-rw-r--r-- | firmware/common/unicode.c | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/firmware/common/unicode.c b/firmware/common/unicode.c new file mode 100644 index 0000000000..a82327e1b1 --- /dev/null +++ b/firmware/common/unicode.c | |||
@@ -0,0 +1,307 @@ | |||
1 | /* Some conversion functions for handling UTF-8 | ||
2 | * | ||
3 | * copyright Marcoen Hirschberg (2004,2005) | ||
4 | * | ||
5 | * I got all the info from: | ||
6 | * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 | ||
7 | * and | ||
8 | * http://en.wikipedia.org/wiki/Unicode | ||
9 | */ | ||
10 | |||
11 | #include <stdio.h> | ||
12 | #include "file.h" | ||
13 | #include "debug.h" | ||
14 | #include "rbunicode.h" | ||
15 | |||
16 | #ifndef O_BINARY | ||
17 | #define O_BINARY 0 | ||
18 | #endif | ||
19 | |||
20 | #define NUM_TABLES 5 | ||
21 | #define NUM_CODEPAGES 13 | ||
22 | |||
23 | static int default_codepage = 0; | ||
24 | static unsigned short codepage_table[MAX_CP_TABLE_SIZE]; | ||
25 | static int loaded_cp_table = 0; | ||
26 | |||
27 | |||
28 | static const unsigned char utf8comp[6] = | ||
29 | { | ||
30 | 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC | ||
31 | }; | ||
32 | |||
33 | static const char *filename[NUM_TABLES] = | ||
34 | { | ||
35 | CODEPAGE_DIR"/iso.cp", | ||
36 | CODEPAGE_DIR"/932.cp", /* SJIS */ | ||
37 | CODEPAGE_DIR"/936.cp", /* GB2312 */ | ||
38 | CODEPAGE_DIR"/949.cp", /* KSX1001 */ | ||
39 | CODEPAGE_DIR"/950.cp" /* BIG5 */ | ||
40 | }; | ||
41 | |||
42 | static const char cp_2_table[NUM_CODEPAGES] = | ||
43 | { | ||
44 | 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5 | ||
45 | }; | ||
46 | |||
47 | /* Load codepage file into memory */ | ||
48 | int load_cp_table(int cp) | ||
49 | { | ||
50 | int i=0; | ||
51 | int table = cp_2_table[cp]; | ||
52 | int file, tablesize; | ||
53 | unsigned char tmp[2]; | ||
54 | |||
55 | if (cp == 0 || table == loaded_cp_table) | ||
56 | return 1; | ||
57 | |||
58 | file = open(filename[table-1], O_RDONLY|O_BINARY); | ||
59 | |||
60 | if (file < 0) { | ||
61 | DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]); | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | tablesize = lseek(file, 0, SEEK_END) / 2; | ||
66 | lseek(file, 0, SEEK_SET); | ||
67 | |||
68 | if (tablesize > MAX_CP_TABLE_SIZE) { | ||
69 | DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]); | ||
70 | close(file); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | while (i < tablesize) { | ||
75 | if (!read(file, tmp, 2)) { | ||
76 | DEBUGF("Can't read from codepage file: %s.cp\n", filename[table-1]); | ||
77 | loaded_cp_table = 0; | ||
78 | return 0; | ||
79 | } | ||
80 | codepage_table[i++] = (tmp[1] << 8) | tmp[0]; | ||
81 | } | ||
82 | |||
83 | loaded_cp_table = table; | ||
84 | close(file); | ||
85 | return 1; | ||
86 | } | ||
87 | |||
88 | /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */ | ||
89 | unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8) | ||
90 | { | ||
91 | int tail = 0; | ||
92 | |||
93 | if (ucs > 0x7F) | ||
94 | while (ucs >> (6*tail + 2)) | ||
95 | tail++; | ||
96 | |||
97 | *utf8++ = (ucs >> (6*tail)) | utf8comp[tail]; | ||
98 | while (tail--) | ||
99 | *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP; | ||
100 | |||
101 | return utf8; | ||
102 | } | ||
103 | |||
104 | /* Recode an iso encoded string to UTF-8 */ | ||
105 | unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8, | ||
106 | int cp, int count) | ||
107 | { | ||
108 | unsigned short ucs, tmp; | ||
109 | |||
110 | if (cp == -1) /* use default codepage */ | ||
111 | cp = default_codepage; | ||
112 | |||
113 | if (!load_cp_table(cp)) cp = 0; | ||
114 | |||
115 | while (count--) { | ||
116 | if (*iso < 128) | ||
117 | *utf8++ = *iso++; | ||
118 | |||
119 | else { | ||
120 | |||
121 | /* cp tells us which codepage to convert from */ | ||
122 | switch (cp) { | ||
123 | case 0x01: /* Greek (ISO-8859-7) */ | ||
124 | case 0x02: /* Hebrew (ISO-8859-8) */ | ||
125 | case 0x03: /* Russian (CP1251) */ | ||
126 | case 0x04: /* Thai (ISO-8859-11) */ | ||
127 | case 0x05: /* Arabic (ISO-8859-6) */ | ||
128 | case 0x06: /* Turkish (ISO-8859-9) */ | ||
129 | case 0x07: /* Latin Extended (ISO-8859-2) */ | ||
130 | tmp = ((cp-1)*128) + (*iso++ - 128); | ||
131 | ucs = codepage_table[tmp]; | ||
132 | break; | ||
133 | |||
134 | case 0x08: /* Japanese (SJIS) */ | ||
135 | if (*iso > 0xA0 && *iso < 0xE0) { | ||
136 | tmp = *iso | 0xA100; | ||
137 | ucs = codepage_table[tmp]; | ||
138 | break; | ||
139 | } | ||
140 | |||
141 | case 0x09: /* Simplified Chinese (GB2312) */ | ||
142 | case 0x0A: /* Korean (KSX1001) */ | ||
143 | case 0x0B: /* Traditional Chinese (BIG5) */ | ||
144 | if (count < 1 || !iso[1]) { | ||
145 | ucs = *iso++; | ||
146 | break; | ||
147 | } | ||
148 | |||
149 | /* we assume all cjk strings are written | ||
150 | in big endian order */ | ||
151 | tmp = *iso++ << 8; | ||
152 | tmp |= *iso++; | ||
153 | tmp -= 0x8000; | ||
154 | ucs = codepage_table[tmp]; | ||
155 | count--; | ||
156 | break; | ||
157 | |||
158 | case 0x0C: /* UTF-8, do nothing */ | ||
159 | default: | ||
160 | ucs = *iso++; | ||
161 | break; | ||
162 | } | ||
163 | |||
164 | if (ucs == 0) /* unknown char, assume invalid encoding */ | ||
165 | ucs = 0xffff; | ||
166 | utf8 = utf8encode(ucs, utf8); | ||
167 | } | ||
168 | } | ||
169 | return utf8; | ||
170 | } | ||
171 | |||
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * utf16: input bytes, two per 16-bit unit, low byte first
 * utf8:  output buffer; it is not bounds-checked or terminated here
 * count: number of 16-bit units to convert
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point. Any other surrogate (a low one first, a high one with the wrong
 * successor, or a high one in the final unit) is emitted as 0xffff, and
 * only that single unit is consumed, so `count` can never wrap around.
 *
 * Returns a pointer just past the last byte written to utf8.
 */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
{
    unsigned long ucs;

    while (count != 0) {
        unsigned long unit = utf16[0] | ((unsigned long)utf16[1] << 8);
        unsigned long trail;

        utf16 += 2;
        count -= 1;

        if (unit < 0xD800 || unit > 0xDFFF) {
            ucs = unit; /* ordinary BMP character */
        } else {
            ucs = 0xffff; /* unpaired surrogate unless shown otherwise */
            if (unit < 0xDC00 && count != 0) {
                trail = utf16[0] | ((unsigned long)utf16[1] << 8);
                if (trail >= 0xDC00 && trail <= 0xDFFF) {
                    ucs = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
                    utf16 += 2;
                    count -= 1;
                }
            }
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
191 | |||
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * utf16: input bytes, two per 16-bit unit, high byte first
 * utf8:  output buffer; it is not bounds-checked or terminated here
 * count: number of 16-bit units to convert
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point. Any other surrogate (a low one first, a high one with the wrong
 * successor, or a high one in the final unit) is emitted as 0xffff, and
 * only that single unit is consumed, so `count` can never wrap around.
 *
 * Returns a pointer just past the last byte written to utf8.
 */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
{
    unsigned long ucs;

    while (count != 0) {
        unsigned long unit = ((unsigned long)utf16[0] << 8) | utf16[1];
        unsigned long trail;

        utf16 += 2;
        count -= 1;

        if (unit < 0xD800 || unit > 0xDFFF) {
            ucs = unit; /* ordinary BMP character */
        } else {
            ucs = 0xffff; /* unpaired surrogate unless shown otherwise */
            if (unit < 0xDC00 && count != 0) {
                trail = ((unsigned long)utf16[0] << 8) | utf16[1];
                if (trail >= 0xDC00 && trail <= 0xDFFF) {
                    ucs = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
                    utf16 += 2;
                    count -= 1;
                }
            }
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
211 | |||
/* Recode any UTF-16 string to UTF-8.
 *
 * utf16: input bytes, optionally starting with a byte order mark
 * utf8:  output buffer
 * count: number of 16-bit units in the input, including the BOM if present
 *
 * The first unit is read big-endian: 0xFEFF selects big-endian decoding and
 * 0xFFFE little-endian. In both cases the BOM itself is skipped. Without a
 * BOM the whole input is decoded as big-endian.
 *
 * Returns a pointer just past the last byte written to utf8.
 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
{
    unsigned long ucs;

    /* With no units there is no BOM to read, and count-1 would wrap */
    if (count == 0)
        return utf8;

    ucs = ((unsigned long)utf16[0] << 8) | utf16[1];

    if (ucs == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16 + 2, utf8, count-1);

    if (ucs == 0xFFFE)
        return utf16LEdecode(utf16 + 2, utf8, count-1);

    /* ADDME: Should default be LE or BE? */
    return utf16BEdecode(utf16, utf8, count);
}
230 | |||
231 | /* Return the number of UTF-8 chars in a string */ | ||
232 | unsigned long utf8length(const unsigned char *utf8) | ||
233 | { | ||
234 | unsigned long l = 0; | ||
235 | |||
236 | while (*utf8 != 0) | ||
237 | if ((*utf8++ & MASK) != COMP) | ||
238 | l++; | ||
239 | |||
240 | return l; | ||
241 | } | ||
242 | |||
/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8: start of the sequence to decode
 * ucs:  receives the decoded value, or 0xffff if the sequence is invalid,
 *       is cut short, or encodes a value above U+FFFF
 *
 * The returned pointer never moves past a byte that is not a valid
 * continuation byte. This includes the terminating NUL of a string that
 * ends in the middle of a sequence, so a caller looping until it reaches
 * NUL cannot be carried beyond the end of its buffer.
 */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    int tail = 0;

    if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
        code = c;
    } else if (c < 0xc2) { /* stray continuation byte, or lead 0xC0/0xC1 */
        code = 0xffff;
    } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
        tail = 1;
        code = c & 0x1f;
    } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
        tail = 2;
        code = c & 0x0f;
    } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
        tail = 3;
        code = c & 0x07;
    } else {               /* Invalid size. */
        code = 0xffff;
    }

    while (tail--) {
        c = *utf8;
        if ((c & 0xc0) != 0x80) {
            /* Invalid continuation char, or NUL ending the string early.
               The byte is left unconsumed for the next call. */
            code = 0xffff;
            break;
        }
        /* Valid continuation character. */
        code = (code << 6) | (c & 0x3f);
        utf8++;
    }

    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? code : 0xffff;
    return utf8;
}
288 | |||
289 | void set_codepage(int cp) | ||
290 | { | ||
291 | default_codepage = cp; | ||
292 | return; | ||
293 | } | ||
294 | |||
295 | /* seek to a given char in a utf8 string and | ||
296 | return its start position in the string */ | ||
297 | int utf8seek(const unsigned char* utf8, int offset) | ||
298 | { | ||
299 | int pos = 0; | ||
300 | |||
301 | while (offset--) { | ||
302 | pos++; | ||
303 | while ((utf8[pos] & MASK) == COMP) | ||
304 | pos++; | ||
305 | } | ||
306 | return pos; | ||
307 | } | ||