1 files changed, 307 insertions, 0 deletions
diff --git a/firmware/common/unicode.c b/firmware/common/unicode.c
new file mode 100644
index 0000000000..a82327e1b1
--- /dev/null
+++ b/firmware/common/unicode.c
@@ -0,0 +1,307 @@
+/*   Some conversion functions for handling UTF-8
+ *
+ *   copyright Marcoen Hirschberg (2004,2005)
+ *
+ *   I got all the info from:
+ *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
+ *   and
+ *   http://en.wikipedia.org/wiki/Unicode
+ */
+#include <stdio.h>
+#include "file.h"
+#include "debug.h"
+#include "rbunicode.h"
+#ifndef O_BINARY
+#define O_BINARY 0 /* fallback so open() flags below still compile when O_BINARY is not defined */
+#endif
+#define NUM_TABLES 5 /* number of codepage files listed in filename[] */
+#define NUM_CODEPAGES 13 /* number of entries in cp_2_table[] */
+/* codepage iso_decode() uses when it is called with cp == -1;
+   changed through set_codepage() */
+static int default_codepage = 0;
+/* conversion table currently in memory, filled by load_cp_table()
+   from 16-bit little-endian file entries */
+static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
+/* table number (as stored in cp_2_table[]) whose data is in codepage_table;
+   0 means no table is loaded */
+static int loaded_cp_table = 0;
+/* lead-byte marker bits ORed in by utf8encode(), indexed by the number of
+   tail bytes that follow the lead byte */
+static const unsigned char utf8comp[6] = 
+{
+    0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
+};
+/* codepage files; load_cp_table() opens filename[table-1], so table
+   numbers are 1-based */
+static const char *filename[NUM_TABLES] =
+{
+    CODEPAGE_DIR"/iso.cp",
+    CODEPAGE_DIR"/932.cp",  /* SJIS    */
+    CODEPAGE_DIR"/936.cp",  /* GB2312  */
+    CODEPAGE_DIR"/949.cp",  /* KSX1001 */
+    CODEPAGE_DIR"/950.cp"   /* BIG5    */
+};
+/* Maps a codepage number (the cp argument of load_cp_table()) to a table
+   number: codepages 1-7 share table 1, codepages 8-11 use tables 2-5, and
+   0 means that no table is needed.
+   NOTE: only 12 of the NUM_CODEPAGES entries are initialised explicitly,
+   so the last entry (cp 12, the UTF-8 case in iso_decode()) is implicitly 0. */
+static const char cp_2_table[NUM_CODEPAGES] =
+{
+    0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5
+};
+/* Load the conversion table for codepage cp into codepage_table.
+ *
+ * cp  codepage number, used as an index into cp_2_table[]
+ *
+ * Returns 1 when the table is in memory or the codepage needs no table,
+ * and 0 when cp is out of range or the codepage file cannot be opened,
+ * has an unusable size, or cannot be read completely.
+ * After a failed read codepage_table is partly overwritten, so
+ * loaded_cp_table is reset to 0 to force a reload on the next call.
+ */
+int load_cp_table(int cp)
+{
+    int i = 0;
+    int table, file, filesize, tablesize;
+    unsigned char tmp[2];
+    /* cp indexes cp_2_table[]; refuse anything outside of it */
+    if (cp < 0 || cp >= NUM_CODEPAGES)
+        return 0;
+    table = cp_2_table[cp];
+    /* table 0 means "no table needed" and must never reach
+       filename[table-1]; this covers cp 0 as well as the implicitly
+       zero-initialised last entry of cp_2_table[] */
+    if (table == 0 || table == loaded_cp_table)
+        return 1;
+    file = open(filename[table-1], O_RDONLY|O_BINARY);
+    if (file < 0) {
+        DEBUGF("Can't open codepage file: %s\n", filename[table-1]);
+        return 0;
+    }
+    /* the file is a plain array of 16-bit entries */
+    filesize = lseek(file, 0, SEEK_END);
+    tablesize = filesize / 2;
+    if (filesize < 0 || tablesize > MAX_CP_TABLE_SIZE
+            || lseek(file, 0, SEEK_SET) < 0) {
+        DEBUGF("Invalid codepage file: %s\n", filename[table-1]);
+        close(file);
+        return 0;
+    }
+    while (i < tablesize) {
+        /* anything but a full entry (error or short read) is a failure */
+        if (read(file, tmp, 2) != 2) {
+            DEBUGF("Can't read from codepage file: %s\n", filename[table-1]);
+            close(file);
+            loaded_cp_table = 0;
+            return 0;
+        }
+        /* entries are stored in little-endian byte order */
+        codepage_table[i++] = (tmp[1] << 8) | tmp[0];
+    }
+    loaded_cp_table = table;
+    close(file);
+    return 1;
+}
+/* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char.
+ *
+ * ucs   code point to encode
+ * utf8  output position; one lead byte plus up to 5 tail bytes are
+ *       written without any bounds check, and no terminator is added
+ *
+ * Values up to 0x7F are written as a single byte. For larger values the
+ * shortest sequence that can hold the value is chosen.
+ */
+unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
+{
+    int tail = 0;
+    /* A sequence with `tail` tail bytes holds 6 bits per tail byte plus
+       (6 - tail) bits in the lead byte, i.e. 5*tail + 6 bits in total.
+       Grow the sequence until the value fits; utf8comp[] only has lead
+       markers for up to 5 tail bytes, so stop there. */
+    if (ucs > 0x7F)
+        while (tail < 5 && (ucs >> (5*tail + 6)))
+            tail++;
+    /* lead byte: remaining high bits plus the length marker */
+    *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
+    /* tail bytes, most significant 6-bit group first; presumably MASK is
+       0xC0 and COMP is 0x80 (both come from rbunicode.h) - confirm */
+    while (tail--)
+        *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
+    return utf8;
+}
+/* Recode an iso encoded string to UTF-8
+ *
+ * iso    input bytes in the codepage selected by cp
+ * utf8   output buffer; its size is not checked here, the caller has to
+ *        provide enough room
+ * cp     codepage number (see the switch below), or -1 for the default
+ *        chosen with set_codepage(); out-of-range numbers and codepages
+ *        whose table cannot be loaded are handled like cp 0
+ * count  number of input bytes to convert; values <= 0 convert nothing
+ *
+ * Bytes below 128 are copied unchanged. Table entries of 0 mean "unknown
+ * character" and are emitted as 0xffff. Returns a pointer just past the
+ * last byte written to utf8; no terminator is added.
+ */
+unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
+                          int cp, int count)
+{
+    unsigned short ucs, tmp;
+    if (cp == -1) /* use default codepage */
+        cp = default_codepage;
+    /* never hand an out-of-range codepage number to load_cp_table() */
+    if (cp < 0 || cp >= NUM_CODEPAGES)
+        cp = 0;
+    if (!load_cp_table(cp))
+        cp = 0;
+    /* inside the loop, count is the number of bytes after the current one */
+    while (count-- > 0) {
+        if (*iso < 128) {
+            *utf8++ = *iso++;
+            continue;
+        }
+        /* cp tells us which codepage to convert from */
+        switch (cp) {
+            case 0x01: /* Greek (ISO-8859-7) */
+            case 0x02: /* Hebrew (ISO-8859-8) */
+            case 0x03: /* Russian (CP1251) */
+            case 0x04: /* Thai (ISO-8859-11) */
+            case 0x05: /* Arabic (ISO-8859-6) */
+            case 0x06: /* Turkish (ISO-8859-9) */
+            case 0x07: /* Latin Extended (ISO-8859-2) */
+                /* the shared table holds 128 entries per codepage */
+                tmp = ((cp-1)*128) + (*iso++ - 128);
+                ucs = (tmp < MAX_CP_TABLE_SIZE) ? codepage_table[tmp] : 0;
+                break;
+            case 0x08: /* Japanese (SJIS) */
+                if (*iso > 0xA0 && *iso < 0xE0) {
+                    /* single-byte character: consume it and look it up
+                       as the two-byte code 0xA1xx.
+                       NOTE(review): assumes the table stores these under
+                       0xA1xx with the same 0x8000 bias as the two-byte
+                       codes below - confirm against the 932.cp layout */
+                    tmp = (*iso++ | 0xA100) - 0x8000;
+                    ucs = (tmp < MAX_CP_TABLE_SIZE) ? codepage_table[tmp] : 0;
+                    break;
+                }
+                /* fallthrough: two-byte SJIS character */
+            case 0x09: /* Simplified Chinese (GB2312) */
+            case 0x0A: /* Korean (KSX1001) */
+            case 0x0B: /* Traditional Chinese (BIG5) */
+                if (count < 1 || !iso[1]) {
+                    /* no second byte available: pass the byte through */
+                    ucs = *iso++;
+                    break;
+                }
+                /* we assume all cjk strings are written
+                   in big endian order */
+                tmp = *iso++ << 8;
+                tmp |= *iso++;
+                /* lead bytes are >= 0x80, the table starts at 0x8000 */
+                tmp -= 0x8000;
+                ucs = (tmp < MAX_CP_TABLE_SIZE) ? codepage_table[tmp] : 0;
+                count--;
+                break;
+            case 0x0C: /* UTF-8, do nothing */
+            default:
+                ucs = *iso++;
+                break;
+        }
+        if (ucs == 0) /* unknown char, assume invalid encoding */
+            ucs = 0xffff;
+        utf8 = utf8encode(ucs, utf8);
+    }
+    return utf8;
+}
+/* Recode a UTF-16 string with little-endian byte ordering to UTF-8
+ *
+ * utf16  input, two bytes per code unit, low byte first
+ * utf8   output buffer; its size is not checked here
+ * count  number of 16-bit code units to convert
+ *
+ * A high surrogate followed by a low surrogate is combined into one code
+ * point. A surrogate that is not part of such a pair (including a high
+ * surrogate in the last code unit) is emitted as 0xffff, the marker this
+ * file uses for invalid input. Returns a pointer just past the last byte
+ * written to utf8; no terminator is added.
+ */
+unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
+{
+    unsigned long ucs;
+    while (count != 0) {
+        unsigned long unit = utf16[0] | ((unsigned long)utf16[1] << 8);
+        /* only look at the second unit when it is inside the input */
+        if (unit >= 0xD800 && unit < 0xDC00 && count >= 2
+                && utf16[3] >= 0xDC && utf16[3] < 0xE0) {
+            unsigned long low = utf16[2] | ((unsigned long)utf16[3] << 8);
+            ucs = 0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00));
+            utf16 += 4;
+            count -= 2;
+        } else {
+            /* plain 16-bit character, or an unpaired surrogate */
+            ucs = (unit >= 0xD800 && unit < 0xE000) ? 0xffff : unit;
+            utf16 += 2;
+            count -= 1;
+        }
+        utf8 = utf8encode(ucs, utf8);
+    }
+    return utf8;
+}
+/* Recode a UTF-16 string with big-endian byte ordering to UTF-8
+ *
+ * utf16  input, two bytes per code unit, high byte first
+ * utf8   output buffer; its size is not checked here
+ * count  number of 16-bit code units to convert
+ *
+ * A high surrogate followed by a low surrogate is combined into one code
+ * point. A surrogate that is not part of such a pair (including a high
+ * surrogate in the last code unit) is emitted as 0xffff, the marker this
+ * file uses for invalid input. Returns a pointer just past the last byte
+ * written to utf8; no terminator is added.
+ */
+unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
+{
+    unsigned long ucs;
+    while (count != 0) {
+        unsigned long unit = ((unsigned long)utf16[0] << 8) | utf16[1];
+        /* only look at the second unit when it is inside the input */
+        if (unit >= 0xD800 && unit < 0xDC00 && count >= 2
+                && utf16[2] >= 0xDC && utf16[2] < 0xE0) {
+            unsigned long low = ((unsigned long)utf16[2] << 8) | utf16[3];
+            ucs = 0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00));
+            utf16 += 4;
+            count -= 2;
+        } else {
+            /* plain 16-bit character, or an unpaired surrogate */
+            ucs = (unit >= 0xD800 && unit < 0xE000) ? 0xffff : unit;
+            utf16 += 2;
+            count -= 1;
+        }
+        utf8 = utf8encode(ucs, utf8);
+    }
+    return utf8;
+}
+/* Recode any UTF-16 string to UTF-8
+ *
+ * utf16  input, optionally starting with a byte order mark
+ * utf8   output buffer; its size is not checked here
+ * count  number of 16-bit code units in the input, including the byte
+ *        order mark if there is one
+ *
+ * A leading FE FF selects big-endian and FF FE little-endian decoding;
+ * the mark itself is not converted. Without a mark the input is decoded
+ * as big-endian. Returns a pointer just past the last byte written.
+ */
+unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
+{
+    unsigned long ucs;
+    /* nothing to read, and count-1 below would wrap around */
+    if (count == 0)
+        return utf8;
+    ucs = ((unsigned long)utf16[0] << 8) | utf16[1];
+    if (ucs == 0xFEFF) /* Check for BOM */
+        return utf16BEdecode(utf16 + 2, utf8, count-1);
+    if (ucs == 0xFFFE)
+        return utf16LEdecode(utf16 + 2, utf8, count-1);
+    /* ADDME: Should default be LE or BE? */
+    return utf16BEdecode(utf16, utf8, count);
+}
+/* Count the characters in a NUL-terminated UTF-8 string.
+ * Every byte up to the terminator counts as one character unless
+ * (byte & MASK) == COMP, which is the test this file uses for the tail
+ * bytes of a multi-byte sequence. */
+unsigned long utf8length(const unsigned char *utf8)
+{
+    unsigned long chars = 0;
+    const unsigned char *p;
+    for (p = utf8; *p != 0; p++) {
+        if ((*p & MASK) == COMP)
+            continue; /* tail byte, belongs to the previous character */
+        chars++;
+    }
+    return chars;
+}
+/* Decode 1 UTF-8 char and return a pointer to the next char.
+ *
+ * utf8  start of the sequence to decode
+ * ucs   receives the decoded value, or 0xffff when the sequence is
+ *       invalid, is cut short, or decodes to a value above U-FFFF
+ *
+ * The first byte is always consumed. Tail bytes are consumed only while
+ * they are valid, so the returned pointer never moves past a string
+ * terminator or past a byte that starts the next character.
+ */
+const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
+{
+    unsigned char c = *utf8++;
+    unsigned long code;
+    int tail = 0;
+    if ((c <= 0x7f) || (c >= 0xc2)) {
+        /* Start of new character. */
+        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
+            code = c;
+        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
+            tail = 1;
+            code = c & 0x1f;
+        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
+            tail = 2;
+            code = c & 0x0f;
+        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
+            tail = 3;
+            code = c & 0x07;
+        } else {
+            /* Invalid size. */
+            code = 0xffff;
+        }
+        while (tail--) {
+            /* peek first: a terminator or a new lead byte must stay
+               unconsumed for the caller */
+            c = *utf8;
+            if ((c & 0xc0) != 0x80) {
+                /* Missing or invalid continuation char */
+                code = 0xffff;
+                break;
+            }
+            /* Valid continuation character. */
+            code = (code << 6) | (c & 0x3f);
+            utf8++;
+        }
+    } else {
+        /* Invalid UTF-8 char */
+        code = 0xffff;
+    }
+    /* currently we don't support chars above U-FFFF */
+    *ucs = (code < 0x10000) ? code : 0xffff;
+    return utf8;
+}
+/* Select the codepage that iso_decode() uses when it is called with
+ * cp == -1. The value is stored as given, without any validation. */
+void set_codepage(int cp)
+{
+    default_codepage = cp;
+}
+/* seek to a given char in a utf8 string and
+ * return its start position in the string
+ *
+ * utf8    NUL-terminated UTF-8 string
+ * offset  index of the wanted character, counted in characters
+ *
+ * Returns the byte index at which that character starts. When the string
+ * has fewer than offset characters the index of the terminator is
+ * returned; an offset <= 0 returns 0.
+ */
+int utf8seek(const unsigned char* utf8, int offset)
+{
+    int pos = 0;
+    /* stop at the terminator so the scan never leaves the string */
+    while (offset-- > 0 && utf8[pos] != 0) {
+        pos++;
+        /* skip the tail bytes of the character just passed */
+        while ((utf8[pos] & MASK) == COMP)
+            pos++;
+    }
+    return pos;
+}

diff --git a/firmware/common/unicode.c b/firmware/common/unicode.c new file mode 100644 index 0000000000..a82327e1b1 --- /dev/null +++ b/firmware/common/unicode.c
@@ -0,0 +1,307 @@
	1	/* Some conversion functions for handling UTF-8
	2	*
	3	* copyright Marcoen Hirschberg (2004,2005)
	4	*
	5	* I got all the info from:
	6	* http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
	7	* and
	8	* http://en.wikipedia.org/wiki/Unicode
	9	*/
	10
	11	#include <stdio.h>
	12	#include "file.h"
	13	#include "debug.h"
	14	#include "rbunicode.h"
	15
	16	#ifndef O_BINARY
	17	#define O_BINARY 0
	18	#endif
	19
	20	#define NUM_TABLES 5
	21	#define NUM_CODEPAGES 13
	22
	23	static int default_codepage = 0;
	24	static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
	25	static int loaded_cp_table = 0;
	26
	27
	28	static const unsigned char utf8comp[6] =
	29	{
	30	0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
	31	};
	32
	33	static const char *filename[NUM_TABLES] =
	34	{
	35	CODEPAGE_DIR"/iso.cp",
	36	CODEPAGE_DIR"/932.cp", /* SJIS */
	37	CODEPAGE_DIR"/936.cp", /* GB2312 */
	38	CODEPAGE_DIR"/949.cp", /* KSX1001 */
	39	CODEPAGE_DIR"/950.cp" /* BIG5 */
	40	};
	41
	42	static const char cp_2_table[NUM_CODEPAGES] =
	43	{
	44	0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5
	45	};
	46
	47	/* Load codepage file into memory */
	48	int load_cp_table(int cp)
	49	{
	50	int i=0;
	51	int table = cp_2_table[cp];
	52	int file, tablesize;
	53	unsigned char tmp[2];
	54
	55	if (cp == 0 \|\| table == loaded_cp_table)
	56	return 1;
	57
	58	file = open(filename[table-1], O_RDONLY\|O_BINARY);
	59
	60	if (file < 0) {
	61	DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
	62	return 0;
	63	}
	64
	65	tablesize = lseek(file, 0, SEEK_END) / 2;
	66	lseek(file, 0, SEEK_SET);
	67
	68	if (tablesize > MAX_CP_TABLE_SIZE) {
	69	DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
	70	close(file);
	71	return 0;
	72	}
	73
	74	while (i < tablesize) {
	75	if (!read(file, tmp, 2)) {
	76	DEBUGF("Can't read from codepage file: %s.cp\n", filename[table-1]);
	77	loaded_cp_table = 0;
	78	return 0;
	79	}
	80	codepage_table[i++] = (tmp[1] << 8) \| tmp[0];
	81	}
	82
	83	loaded_cp_table = table;
	84	close(file);
	85	return 1;
	86	}
	87
	88	/* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
	89	unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
	90	{
	91	int tail = 0;
	92
	93	if (ucs > 0x7F)
	94	while (ucs >> (6*tail + 2))
	95	tail++;
	96
	97	utf8++ = (ucs >> (6tail)) \| utf8comp[tail];
	98	while (tail--)
	99	utf8++ = ((ucs >> (6tail)) & (MASK ^ 0xFF)) \| COMP;
	100
	101	return utf8;
	102	}
	103
	104	/* Recode an iso encoded string to UTF-8 */
	105	unsigned char* iso_decode(const unsigned char iso, unsigned char utf8,
	106	int cp, int count)
	107	{
	108	unsigned short ucs, tmp;
	109
	110	if (cp == -1) /* use default codepage */
	111	cp = default_codepage;
	112
	113	if (!load_cp_table(cp)) cp = 0;
	114
	115	while (count--) {
	116	if (*iso < 128)
	117	utf8++ = iso++;
	118
	119	else {
	120
	121	/* cp tells us which codepage to convert from */
	122	switch (cp) {
	123	case 0x01: /* Greek (ISO-8859-7) */
	124	case 0x02: /* Hebrew (ISO-8859-8) */
	125	case 0x03: /* Russian (CP1251) */
	126	case 0x04: /* Thai (ISO-8859-11) */
	127	case 0x05: /* Arabic (ISO-8859-6) */
	128	case 0x06: /* Turkish (ISO-8859-9) */
	129	case 0x07: /* Latin Extended (ISO-8859-2) */
	130	tmp = ((cp-1)128) + (iso++ - 128);
	131	ucs = codepage_table[tmp];
	132	break;
	133
	134	case 0x08: /* Japanese (SJIS) */
	135	if (iso > 0xA0 && iso < 0xE0) {
	136	tmp = *iso \| 0xA100;
	137	ucs = codepage_table[tmp];
	138	break;
	139	}
	140
	141	case 0x09: /* Simplified Chinese (GB2312) */
	142	case 0x0A: /* Korean (KSX1001) */
	143	case 0x0B: /* Traditional Chinese (BIG5) */
	144	if (count < 1 \|\| !iso[1]) {
	145	ucs = *iso++;
	146	break;
	147	}
	148
	149	/* we assume all cjk strings are written
	150	in big endian order */
	151	tmp = *iso++ << 8;
	152	tmp \|= *iso++;
	153	tmp -= 0x8000;
	154	ucs = codepage_table[tmp];
	155	count--;
	156	break;
	157
	158	case 0x0C: /* UTF-8, do nothing */
	159	default:
	160	ucs = *iso++;
	161	break;
	162	}
	163
	164	if (ucs == 0) /* unknown char, assume invalid encoding */
	165	ucs = 0xffff;
	166	utf8 = utf8encode(ucs, utf8);
	167	}
	168	}
	169	return utf8;
	170	}
	171
	172	/* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
	173	unsigned char* utf16LEdecode(const unsigned char utf16, unsigned char utf8, unsigned int count)
	174	{
	175	unsigned long ucs;
	176
	177	while (count != 0) {
	178	if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) { /* Check for a surrogate pair */
	179	ucs = 0x10000 + ((utf16[0] << 10) \| ((utf16[1] - 0xD8) << 18) \| utf16[2] \| ((utf16[3] - 0xDC) << 8));
	180	utf16 += 4;
	181	count -= 2;
	182	} else {
	183	ucs = (utf16[0] \| (utf16[1] << 8));
	184	utf16 += 2;
	185	count -= 1;
	186	}
	187	utf8 = utf8encode(ucs, utf8);
	188	}
	189	return utf8;
	190	}
	191
	192	/* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
	193	unsigned char* utf16BEdecode(const unsigned char utf16, unsigned char utf8, unsigned int count)
	194	{
	195	unsigned long ucs;
	196
	197	while (count != 0) {
	198	if (utf16 >= 0xD8 && utf16 < 0xE0) { /* Check for a surrogate pair */
	199	ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) \| (utf16[1] << 10) \| ((utf16[2] - 0xDC) << 8) \| utf16[3]);
	200	utf16 += 4;
	201	count -= 2;
	202	} else {
	203	ucs = (utf16[0] << 8) \| utf16[1];
	204	utf16 += 2;
	205	count -= 1;
	206	}
	207	utf8 = utf8encode(ucs, utf8);
	208	}
	209	return utf8;
	210	}
	211
	212	/* Recode any UTF-16 string to UTF-8 */
	213	//unsigned char* utf16decode(unsigned const char utf16, unsigned char utf8, unsigned int count)
	214	unsigned char* utf16decode(const unsigned char utf16, unsigned char utf8, unsigned int count)
	215	{
	216	unsigned long ucs;
	217
	218	ucs = *(utf16++) << 8;
	219	ucs \|= *(utf16++);
	220
	221	if (ucs == 0xFEFF) /* Check for BOM */
	222	return utf16BEdecode(utf16, utf8, count-1);
	223	else if (ucs == 0xFFFE)
	224	return utf16LEdecode(utf16, utf8, count-1);
	225	else { /* ADDME: Should default be LE or BE? */
	226	utf16 -= 2;
	227	return utf16BEdecode(utf16, utf8, count);
	228	}
	229	}
	230
	231	/* Return the number of UTF-8 chars in a string */
	232	unsigned long utf8length(const unsigned char *utf8)
	233	{
	234	unsigned long l = 0;
	235
	236	while (*utf8 != 0)
	237	if ((*utf8++ & MASK) != COMP)
	238	l++;
	239
	240	return l;
	241	}
	242
	243	/* Decode 1 UTF-8 char and return a pointer to the next char. */
	244	const unsigned char* utf8decode(const unsigned char utf8, unsigned short ucs)
	245	{
	246	unsigned char c = *utf8++;
	247	unsigned long code;
	248	int tail = 0;
	249
	250	if ((c <= 0x7f) \|\| (c >= 0xc2)) {
	251	/* Start of new character. */
	252	if (c < 0x80) { /* U-00000000 - U-0000007F, 1 byte */
	253	code = c;
	254	} else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
	255	tail = 1;
	256	code = c & 0x1f;
	257	} else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
	258	tail = 2;
	259	code = c & 0x0f;
	260	} else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
	261	tail = 3;
	262	code = c & 0x07;
	263	} else {
	264	/* Invalid size. */
	265	code = 0xffff;
	266	}
	267
	268	while (tail-- && ((c = *utf8++) != 0)) {
	269	if ((c & 0xc0) == 0x80) {
	270	/* Valid continuation character. */
	271	code = (code << 6) \| (c & 0x3f);
	272
	273	} else {
	274	/* Invalid continuation char */
	275	code = 0xffff;
	276	utf8--;
	277	break;
	278	}
	279	}
	280	} else {
	281	/* Invalid UTF-8 char */
	282	code = 0xffff;
	283	}
	284	/* currently we don't support chars above U-FFFF */
	285	*ucs = (code < 0x10000) ? code : 0xffff;
	286	return utf8;
	287	}
	288
	289	void set_codepage(int cp)
	290	{
	291	default_codepage = cp;
	292	return;
	293	}
	294
	295	/* seek to a given char in a utf8 string and
	296	return its start position in the string */
	297	int utf8seek(const unsigned char* utf8, int offset)
	298	{
	299	int pos = 0;
	300
	301	while (offset--) {
	302	pos++;
	303	while ((utf8[pos] & MASK) == COMP)
	304	pos++;
	305	}
	306	return pos;
	307	}