summaryrefslogtreecommitdiff
path: root/firmware/common/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'firmware/common/unicode.c')
-rw-r--r--firmware/common/unicode.c307
1 files changed, 307 insertions, 0 deletions
diff --git a/firmware/common/unicode.c b/firmware/common/unicode.c
new file mode 100644
index 0000000000..a82327e1b1
--- /dev/null
+++ b/firmware/common/unicode.c
@@ -0,0 +1,307 @@
1/* Some conversion functions for handling UTF-8
2 *
3 * copyright Marcoen Hirschberg (2004,2005)
4 *
5 * I got all the info from:
6 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
7 * and
8 * http://en.wikipedia.org/wiki/Unicode
9 */
10
11#include <stdio.h>
12#include "file.h"
13#include "debug.h"
14#include "rbunicode.h"
15
16#ifndef O_BINARY
17#define O_BINARY 0
18#endif
19
20#define NUM_TABLES 5
21#define NUM_CODEPAGES 13
22
23static int default_codepage = 0;
24static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
25static int loaded_cp_table = 0;
26
27
28static const unsigned char utf8comp[6] =
29{
30 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
31};
32
33static const char *filename[NUM_TABLES] =
34{
35 CODEPAGE_DIR"/iso.cp",
36 CODEPAGE_DIR"/932.cp", /* SJIS */
37 CODEPAGE_DIR"/936.cp", /* GB2312 */
38 CODEPAGE_DIR"/949.cp", /* KSX1001 */
39 CODEPAGE_DIR"/950.cp" /* BIG5 */
40};
41
42static const char cp_2_table[NUM_CODEPAGES] =
43{
44 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5
45};
46
47/* Load codepage file into memory */
48int load_cp_table(int cp)
49{
50 int i=0;
51 int table = cp_2_table[cp];
52 int file, tablesize;
53 unsigned char tmp[2];
54
55 if (cp == 0 || table == loaded_cp_table)
56 return 1;
57
58 file = open(filename[table-1], O_RDONLY|O_BINARY);
59
60 if (file < 0) {
61 DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
62 return 0;
63 }
64
65 tablesize = lseek(file, 0, SEEK_END) / 2;
66 lseek(file, 0, SEEK_SET);
67
68 if (tablesize > MAX_CP_TABLE_SIZE) {
69 DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
70 close(file);
71 return 0;
72 }
73
74 while (i < tablesize) {
75 if (!read(file, tmp, 2)) {
76 DEBUGF("Can't read from codepage file: %s.cp\n", filename[table-1]);
77 loaded_cp_table = 0;
78 return 0;
79 }
80 codepage_table[i++] = (tmp[1] << 8) | tmp[0];
81 }
82
83 loaded_cp_table = table;
84 close(file);
85 return 1;
86}
87
88/* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
89unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
90{
91 int tail = 0;
92
93 if (ucs > 0x7F)
94 while (ucs >> (6*tail + 2))
95 tail++;
96
97 *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
98 while (tail--)
99 *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
100
101 return utf8;
102}
103
104/* Recode an iso encoded string to UTF-8 */
105unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
106 int cp, int count)
107{
108 unsigned short ucs, tmp;
109
110 if (cp == -1) /* use default codepage */
111 cp = default_codepage;
112
113 if (!load_cp_table(cp)) cp = 0;
114
115 while (count--) {
116 if (*iso < 128)
117 *utf8++ = *iso++;
118
119 else {
120
121 /* cp tells us which codepage to convert from */
122 switch (cp) {
123 case 0x01: /* Greek (ISO-8859-7) */
124 case 0x02: /* Hebrew (ISO-8859-8) */
125 case 0x03: /* Russian (CP1251) */
126 case 0x04: /* Thai (ISO-8859-11) */
127 case 0x05: /* Arabic (ISO-8859-6) */
128 case 0x06: /* Turkish (ISO-8859-9) */
129 case 0x07: /* Latin Extended (ISO-8859-2) */
130 tmp = ((cp-1)*128) + (*iso++ - 128);
131 ucs = codepage_table[tmp];
132 break;
133
134 case 0x08: /* Japanese (SJIS) */
135 if (*iso > 0xA0 && *iso < 0xE0) {
136 tmp = *iso | 0xA100;
137 ucs = codepage_table[tmp];
138 break;
139 }
140
141 case 0x09: /* Simplified Chinese (GB2312) */
142 case 0x0A: /* Korean (KSX1001) */
143 case 0x0B: /* Traditional Chinese (BIG5) */
144 if (count < 1 || !iso[1]) {
145 ucs = *iso++;
146 break;
147 }
148
149 /* we assume all cjk strings are written
150 in big endian order */
151 tmp = *iso++ << 8;
152 tmp |= *iso++;
153 tmp -= 0x8000;
154 ucs = codepage_table[tmp];
155 count--;
156 break;
157
158 case 0x0C: /* UTF-8, do nothing */
159 default:
160 ucs = *iso++;
161 break;
162 }
163
164 if (ucs == 0) /* unknown char, assume invalid encoding */
165 ucs = 0xffff;
166 utf8 = utf8encode(ucs, utf8);
167 }
168 }
169 return utf8;
170}
171
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * utf16: input, two bytes per 16-bit unit, low byte first.
 * utf8:  output buffer; its size is not checked, no terminator is written.
 * count: number of 16-bit units (not bytes) to convert.
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point. A surrogate without its partner (including a high surrogate in
 * the very last unit) is emitted as 0xFFFF, the value this file uses for
 * undecodable input, and consumes a single unit.
 *
 * Returns a pointer just past the last byte written to utf8.
 */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
{
    unsigned long ucs;

    while (count != 0) {
        unsigned long unit = utf16[0] | ((unsigned long)utf16[1] << 8);

        if (unit >= 0xD800 && unit < 0xDC00 && count >= 2
            && utf16[3] >= 0xDC && utf16[3] < 0xE0) {
            /* Valid surrogate pair; the second unit is only inspected
               once count says it exists. */
            unsigned long low = utf16[2] | ((unsigned long)utf16[3] << 8);

            ucs = 0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00));
            utf16 += 4;
            count -= 2;
        } else {
            /* Plain BMP unit, or an unpaired surrogate. */
            ucs = (unit >= 0xD800 && unit < 0xE000) ? 0xFFFF : unit;
            utf16 += 2;
            count -= 1;
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
191
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * utf16: input, two bytes per 16-bit unit, high byte first.
 * utf8:  output buffer; its size is not checked, no terminator is written.
 * count: number of 16-bit units (not bytes) to convert.
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point. A surrogate without its partner (including a high surrogate in
 * the very last unit) is emitted as 0xFFFF, the value this file uses for
 * undecodable input, and consumes a single unit.
 *
 * Returns a pointer just past the last byte written to utf8.
 */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
{
    unsigned long ucs;

    while (count != 0) {
        unsigned long unit = ((unsigned long)utf16[0] << 8) | utf16[1];

        if (unit >= 0xD800 && unit < 0xDC00 && count >= 2
            && utf16[2] >= 0xDC && utf16[2] < 0xE0) {
            /* Valid surrogate pair; the second unit is only inspected
               once count says it exists. */
            unsigned long low = ((unsigned long)utf16[2] << 8) | utf16[3];

            ucs = 0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00));
            utf16 += 4;
            count -= 2;
        } else {
            /* Plain BMP unit, or an unpaired surrogate. */
            ucs = (unit >= 0xD800 && unit < 0xE000) ? 0xFFFF : unit;
            utf16 += 2;
            count -= 1;
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
211
/* Recode any UTF-16 string to UTF-8, picking the byte order from a
 * leading byte order mark.
 *
 * utf16: input bytes, optionally starting with a BOM.
 * utf8:  output buffer; its size is not checked, no terminator is written.
 * count: number of 16-bit units in the input, BOM included.
 *
 * A BOM is consumed and not converted. Without a BOM the input is taken
 * to be big-endian. An empty input (count == 0) is not read at all.
 *
 * Returns a pointer just past the last byte written to utf8.
 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
{
    unsigned long bom;

    /* Nothing to look at; also keeps count - 1 below from wrapping. */
    if (count == 0)
        return utf8;

    /* Read the first unit as big-endian. */
    bom = ((unsigned long)utf16[0] << 8) | utf16[1];

    if (bom == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16 + 2, utf8, count - 1);

    if (bom == 0xFFFE) /* byte-swapped BOM: data is little-endian */
        return utf16LEdecode(utf16 + 2, utf8, count - 1);

    /* ADDME: Should default be LE or BE? */
    return utf16BEdecode(utf16, utf8, count);
}
230
231/* Return the number of UTF-8 chars in a string */
232unsigned long utf8length(const unsigned char *utf8)
233{
234 unsigned long l = 0;
235
236 while (*utf8 != 0)
237 if ((*utf8++ & MASK) != COMP)
238 l++;
239
240 return l;
241}
242
/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8: start of the sequence to decode.
 * ucs:  receives the decoded value, or 0xffff when the sequence is
 *       invalid, cut short, or encodes a value above U+FFFF.
 *
 * On a bad continuation byte, including a terminating NUL inside a
 * multi-byte sequence, decoding stops and the returned pointer refers to
 * that byte, so the terminator is never skipped. The first byte is always
 * consumed.
 */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    int tail = 0;

    if ((c <= 0x7f) || (c >= 0xc2)) {
        /* Start of new character. */
        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
            code = c;
        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
            tail = 1;
            code = c & 0x1f;
        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
            tail = 2;
            code = c & 0x0f;
        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
            tail = 3;
            code = c & 0x07;
        } else {
            /* Invalid size. */
            code = 0xffff;
        }

        while (tail--) {
            c = *utf8++;
            if ((c & 0xc0) == 0x80) {
                /* Valid continuation character. */
                code = (code << 6) | (c & 0x3f);
            } else {
                /* Invalid continuation char. A NUL lands here as well:
                   step back so the caller still sees the terminator. */
                code = 0xffff;
                utf8--;
                break;
            }
        }
    } else {
        /* Invalid UTF-8 char: a stray continuation byte or 0xc0/0xc1. */
        code = 0xffff;
    }
    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? code : 0xffff;
    return utf8;
}
288
289void set_codepage(int cp)
290{
291 default_codepage = cp;
292 return;
293}
294
295/* seek to a given char in a utf8 string and
296 return its start position in the string */
297int utf8seek(const unsigned char* utf8, int offset)
298{
299 int pos = 0;
300
301 while (offset--) {
302 pos++;
303 while ((utf8[pos] & MASK) == COMP)
304 pos++;
305 }
306 return pos;
307}