From e1b04276217366a54503d066d0cd90a228438bad Mon Sep 17 00:00:00 2001 From: Tomas Salfischberger Date: Wed, 8 Jun 2005 21:01:26 +0000 Subject: New dict2rdf script by Tony Motakis Please test :) it should convert dict format dictionarys to rockboxformat. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6621 a1c6a512-1295-4272-9138-f99709370657 --- tools/FILES | 1 + tools/dict2rdf.pl | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100755 tools/dict2rdf.pl diff --git a/tools/FILES b/tools/FILES index 14cdeddd41..eabf9178dd 100644 --- a/tools/FILES +++ b/tools/FILES @@ -11,6 +11,7 @@ sample.emacs buildzip.pl romsizetest.pl wn2rdf.pl +dict2rdf.pl make.inc makesrc.inc fwpatcher/*.[ch] diff --git a/tools/dict2rdf.pl b/tools/dict2rdf.pl new file mode 100755 index 0000000000..c9085d35ed --- /dev/null +++ b/tools/dict2rdf.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl + +# __________ __ ___. +# Open \______ \ ____ ____ | | _\_ |__ _______ ___ +# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / +# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < +# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ +# \/ \/ \/ \/ \/ +# $Id$ +# +# Copyright (C) 2005 Tony Motakis +# +# All files in this archive are subject to the GNU General Public License. +# See the file COPYING in the source tree root for full license agreement. +# +# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY +# KIND, either express or implied. 
use strict;
use warnings;

use Compress::Zlib;

# Maximum number of characters kept from each headword.
my $word_limit = 32;

# Digit alphabet used for the offset and length columns of the index file,
# most significant digit first: A-Z = 0..25, a-z = 26..51, 0-9 = 52..61,
# '+' = 62, '/' = 63.
my @b64_values = ('A' .. 'Z', 'a' .. 'z', '0' .. '9', '+', '/');

# Reverse lookup: digit character -> numeric value.
my %b64_get_value;
@b64_get_value{@b64_values} = (0 .. $#b64_values);

# base64($digits)
#
# Convert a number written with the digit alphabet above into a plain
# integer.  Whitespace anywhere in the input is ignored, so a stray tab or
# carriage return around the field is harmless.  An undefined or empty
# argument yields 0.  Dies if the input contains a character that is not in
# the alphabet, because a silently wrong offset would produce garbage output.
sub base64 {
    my ($digits) = @_;
    $digits = '' unless defined $digits;

    my $num = 0;
    for my $char (split //, $digits) {
        next if $char =~ /\s/;
        die "Invalid base 64 digit '$char' in '$digits'\n"
            unless exists $b64_get_value{$char};

        # Walk left to right.  Testing each character (instead of the
        # truthiness of the remaining string) is what makes a lone "0"
        # decode to 52 rather than ending the loop early.
        $num = $num * 64 + $b64_get_value{$char};
    }
    return $num;
}

# main($index_file, $dict_file, $rdf_file)
#
# Read the tab separated index ($index_file: headword, offset, length, the
# numbers being written in base 64), pull every definition out of
# $dict_file, which is opened through zlib's gzopen, and write one
# "word<TAB>definition" line per index entry to $rdf_file, sorted by word.
# Dies with a message if a file cannot be opened, read or written.
sub main {
    my ($index_file, $dict_file, $rdf_file) = @_;

    die "Usage: $0 <index file> <definitions file> <output file>\n"
        unless @_ >= 3;

    # $dict is a Compress::Zlib handle rather than a normal filehandle,
    # hence the method calls on it further down.
    open my $index, '<', $index_file
        or die "Could not open index: $!";
    my $dict = gzopen($dict_file, "rb")
        or die "Could not open definitions file: $gzerrno";
    open my $rdfout, '>', $rdf_file
        or die "Could not open output file: $!";

    # Read the index.  Every entry keeps its own offset and length, so
    # headwords that collide (homographs, or words that become equal after
    # lower-casing and truncation) each keep their own definition.
    my @entries;
    while (my $line = <$index>) {
        next if $line =~ /^00-?database/;

        my ($word, $begin, $length) = split /\t|\n/, $line;
        next unless defined $length;    # blank or malformed line

        # Drop leading whitespace, cut to the word limit and lower-case.
        $word =~ s/^\s*(.{1,$word_limit}).*$/\L$1/;

        push @entries, [ $word, base64($begin), base64($length) ];
    }
    close $index;

    # Sort the entries.  The index is usually sorted already, but the dict
    # file format does not require it, so we cannot rely on that.  Equal
    # words are ordered by their position in the definitions file to keep
    # the output deterministic.
    my @sorted = sort { $a->[0] cmp $b->[0] or $a->[1] <=> $b->[1] } @entries;

    # Read the whole definitions file into memory.  Overkill?  Probably,
    # but the file is compressed and we need quick access to random parts
    # of it.  gzread returns the number of bytes read, 0 at end of file and
    # a negative value on error.
    my $def_all = '';
    my $chunk;
    while ((my $bytes = $dict->gzread($chunk)) != 0) {
        die "Could not read definitions file: $gzerrno" if $bytes < 0;
        $def_all .= $chunk;
    }
    $dict->gzclose();

    for my $entry (@sorted) {
        my ($word, $begin, $length) = @$entry;

        if ($begin > length $def_all) {
            warn "Skipping '$word': offset $begin is past the end of the definitions\n";
            next;
        }

        my $def = substr $def_all, $begin, $length;
        $def =~ s/\n\s*/ /g;    # remove newlines and whitespace after them
        print {$rdfout} $word, "\t", $def, "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $rdfout or die "Could not write output file: $!";
    return;
}

# Run the conversion only when executed as a script, so the file can also be
# loaded with require/do (for testing, say) without side effects.
main(@ARGV) unless caller;