diff options
Diffstat (limited to 'tools')
-rw-r--r-- | tools/FILES | 1 | ||||
-rwxr-xr-x | tools/dict2rdf.pl | 83 |
2 files changed, 84 insertions, 0 deletions
diff --git a/tools/FILES b/tools/FILES index 14cdeddd41..eabf9178dd 100644 --- a/tools/FILES +++ b/tools/FILES | |||
@@ -11,6 +11,7 @@ sample.emacs | |||
11 | buildzip.pl | 11 | buildzip.pl |
12 | romsizetest.pl | 12 | romsizetest.pl |
13 | wn2rdf.pl | 13 | wn2rdf.pl |
14 | dict2rdf.pl | ||
14 | make.inc | 15 | make.inc |
15 | makesrc.inc | 16 | makesrc.inc |
16 | fwpatcher/*.[ch] | 17 | fwpatcher/*.[ch] |
diff --git a/tools/dict2rdf.pl b/tools/dict2rdf.pl new file mode 100755 index 0000000000..c9085d35ed --- /dev/null +++ b/tools/dict2rdf.pl | |||
@@ -0,0 +1,83 @@ | |||
1 | #!/usr/bin/perl | ||
2 | |||
3 | # __________ __ ___. | ||
4 | # Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
5 | # Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
6 | # Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
7 | # Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
8 | # \/ \/ \/ \/ \/ | ||
9 | # $Id$ | ||
10 | # | ||
11 | # Copyright (C) 2005 Tony Motakis | ||
12 | # | ||
13 | # All files in this archive are subject to the GNU General Public License. | ||
14 | # See the file COPYING in the source tree root for full license agreement. | ||
15 | # | ||
16 | # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
17 | # KIND, either express or implied. | ||
18 | |||
# Upper bound on the length of a headword; the index reader below uses it
# as the repeat count of the pattern that captures the headword.
our $word_limit = 32;
21 | |||
22 | use Compress::Zlib; | ||
23 | |||
# Base 64 conversion tables for the numbers stored in the index file.
# @b64_values maps a digit value (0..63) to its character, %b64_get_value is
# the reverse mapping from character to digit value.
@b64_values = ( 'A' .. 'Z', 'a' .. 'z', '0' .. '9', '+', '/' );

%b64_get_value = map { $b64_values[$_] => $_ } 0 .. $#b64_values;

# base64(STRING)
#
# Decode a number written in base 64 (most significant digit first) and
# return it as a plain integer.
#
# Leading and trailing whitespace is ignored; an empty or undefined argument
# decodes to 0.  Dies if STRING contains a character that is not a base 64
# digit, because counting it as 0 would silently produce a wrong number.
sub base64 {
    my ($encoded) = @_;
    $encoded = '' unless defined $encoded;
    $encoded =~ s/^\s+|\s+$//g;

    my $num = 0;

    # Iterate over the characters instead of testing the remaining string for
    # truth: the digit '0' is false in perl but is worth 52 here, so a truth
    # test would drop every leading '0' digit.
    foreach my $digit (split //, $encoded) {
        die "Invalid base 64 digit '$digit' in '$encoded'"
            unless exists $b64_get_value{$digit};
        $num = $num * 64 + $b64_get_value{$digit};
    }

    return $num;
}
49 | |||
# Main program: convert a dict database into the plain rockbox dictionary
# format.
#
#   $ARGV[0]  index file: one entry per line, made of a headword, the
#             position of its definition and the size of that definition,
#             separated by tabs; both numbers are written in base 64
#   $ARGV[1]  dictionary file holding the definitions, read through zlib
#   $ARGV[2]  output file, one "headword<TAB>definition" line per entry
@ARGV >= 3
    or die "Usage: $0 <index file> <dictionary file> <output file>\n";
my ($index_name, $dict_name, $rdf_name) = @ARGV;

# Three-argument open with lexical handles, so that special characters in a
# file name can never be taken for an open mode.
open my $index_fh, '<', $index_name
    or die "Could not open index: $!";
my $dict_gz = gzopen($dict_name, "rb")
    or die "Could not open definitions file: " . ($gzerrno || $!);
open my $rdf_fh, '>', $rdf_name
    or die "Could not open output file: $!";

# Read the index.  Every entry is kept as its own [ word, begin, length ]
# record: the same headword may show up more than once (and more often still
# after lowercasing and truncating), and tables keyed by the headword would
# keep only the last of those definitions.
my @entries;
while (my $line = <$index_fh>) {
    next if $line =~ /^00-?database/;

    my ($word, $begin, $length) = split /\t|\n/, $line;    # split in pieces
    next unless defined $length;        # blank or incomplete line

    # Drop leading whitespace, cut the headword down to $word_limit
    # characters and lowercase it.  The input is not decoded, so the limit
    # counts bytes.
    $word =~ s/^\s*(.{1,$word_limit}).*$/\L$1/;

    push @entries, [ $word, base64($begin), base64($length) ];
}
close $index_fh;

# Sort the entries.  The index is usually sorted already, but this is not
# mandatory in the dict file format, so it cannot be relied upon.  Equal
# headwords are ordered by position to keep the output deterministic.
@entries = sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] } @entries;

# Read the whole dictionary file into memory.  Overkill?  Probably.  But the
# file is compressed, and quick access to random parts of it is needed.
# gzread returns 0 at end of file and -1 on error, so only a positive count
# may continue the loop.
my $def_all = '';
my ($chunk, $got);
while (($got = $dict_gz->gzread($chunk)) > 0) {
    $def_all .= $chunk;
}
die "Could not read definitions file: $gzerrno" if $got < 0;
$dict_gz->gzclose();

my $def_total = length $def_all;
foreach my $entry (@entries) {
    my ($word, $begin, $length) = @$entry;

    if ($begin + $length > $def_total) {
        warn "Skipping '$word': definition lies outside the definitions file\n";
        next;
    }

    my $def = substr $def_all, $begin, $length;
    $def =~ s/\n\s*/ /g;    # remove newlines and whitespace after them
    print {$rdf_fh} $word . "\t" . $def . "\n";
}

# Buffered write errors only show up when the handle is closed.
close $rdf_fh or die "Could not write output file: $!";
83 | |||