diff options
author | Tomas Salfischberger <tomas@rockbox.org> | 2005-05-02 15:05:07 +0000 |
---|---|---|
committer | Tomas Salfischberger <tomas@rockbox.org> | 2005-05-02 15:05:07 +0000 |
commit | 52abc68b11694d2360e119543b876cf3c5768fbe (patch) | |
tree | 937b8bdc68faccf815efdccf1192c2dd92738932 /tools/wn2rdf.pl | |
parent | a810a67db7c923b01c4135761ef21ab866db256d (diff) | |
download | rockbox-52abc68b11694d2360e119543b876cf3c5768fbe.tar.gz rockbox-52abc68b11694d2360e119543b876cf3c5768fbe.zip |
Dictionary conversion tools.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6395 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'tools/wn2rdf.pl')
-rw-r--r-- | tools/wn2rdf.pl | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/tools/wn2rdf.pl b/tools/wn2rdf.pl new file mode 100644 index 0000000000..2fff87d66b --- /dev/null +++ b/tools/wn2rdf.pl | |||
@@ -0,0 +1,122 @@ | |||
1 | #! /usr/bin/perl -w | ||
2 | |||
3 | # WordNet dictionary database converter | ||
4 | # | ||
5 | # Converts the WordNet Prolog data (wn_s.pl, wn_g.pl) to Rockbox dictionary format (dict.preparsed). | ||
6 | # | ||
7 | # Written by Miika Pekkarinen <slasher@ihme.org> | ||
8 | # | ||
9 | # $Id$ | ||
10 | |||
11 | use strict; | ||
12 | |||
# In-memory lookup tables, filled while the input files are read:
#   %words        - lower-cased word => { synset id => category digit }
#   %descriptions - synset id        => description text
my (%words, %descriptions);
16 | |||
# Translate a numeric category id into the one-letter tag written to the
# output file.
#
#   1 (noun)      => 'N'
#   2 (verb)      => 'V'
#   3 (adjective) => 'A'
#   4 (adverb)    => 'A'   (shares the tag with adjectives)
#
# Any other id yields '?'.
sub getcatname {
    my $category = shift;

    # Ordered (id, tag) pairs; the first numeric match wins.
    my @tag_table = (
        [ 1, 'N' ],
        [ 2, 'V' ],
        [ 3, 'A' ],
        [ 4, 'A' ],
    );

    foreach my $entry (@tag_table) {
        my ($number, $tag) = @{$entry};
        return $tag if $category == $number;
    }

    return '?';
}
26 | |||
# ---------------------------------------------------------------------------
# Main conversion pass.
#
# Reads the word list (wn_s.pl) and the descriptions (wn_g.pl) from the
# current directory, joins them on the synset id and writes one line per
# word to dict.preparsed:
#
#   word<TAB>N: 1. first noun sense 2. second ...<TAB>V: 1. ...
#
# Dies if a file cannot be opened or the output cannot be flushed; exits
# with status 1 if a word refers to a synset id that has no description.
# ---------------------------------------------------------------------------

open my $in_word, '<', 'wn_s.pl'        or die "Open fail(#1): $!";
open my $in_desc, '<', 'wn_g.pl'        or die "Open fail(#2): $!";
open my $output,  '>', 'dict.preparsed' or die "Open fail(#3): $!";

print "Reading word file...\n";

# Read everything into memory
while (my $line = <$in_word>) {
    # Strip the line ending, tolerating CRLF input.
    $line =~ s/\r?\n\z//;
    next if $line eq '';

    # s(100001740,1,'entity',n,1,11). => id 100001740, word entity
    #
    # The word is matched as everything between the first quote after the
    # second field and the last quote on the line, so a comma inside the
    # quoted word cannot shift the fields the way a plain split would.
    my ($seqid, $word) = $line =~ m{
        \A s\( (\d+) , [^,]* , '(.*)' (?: , [^,']* )* \) \. \s* \z
    }x;

    unless (defined $word) {
        warn "Skipping unparsable line $. of wn_s.pl: $line\n";
        next;
    }

    # Prolog escapes a quote by doubling it; undo every occurrence.
    $word =~ s/''/'/g;

    # The leading digit of the synset id is the category (1..4).
    my $category = substr $seqid, 0, 1;

    $words{lc $word}{$seqid} = $category;
}

close $in_word;

print "Reading description file...\n";
while (my $line = <$in_desc>) {
    $line =~ s/\r?\n\z//;
    next if $line eq '';

    # g(100002056,'(a separate and self-contained entity)').
    #   => 100002056,'(a separate and self-contained entity)'
    $line =~ s/^g\((.*)\)\.$/$1/;

    my ($seqid, $desc) = split /,/, $line, 2;

    unless (defined $desc) {
        warn "Skipping unparsable line $. of wn_g.pl: $line\n";
        next;
    }

    # Drop the surrounding '( ... )' and undo every doubled quote.
    $desc =~ s/^'\((.*)\)'$/$1/;
    $desc =~ s/''/'/g;

    $descriptions{$seqid} = $desc;
}

close $in_desc;

print "Sorting and writing output...\n";

# Now sort and find correct descriptions
foreach my $word (sort keys %words) {
    my %categories;

    # Find all definitions of the word.  The ids are visited in sorted
    # order so the numbering of the senses is the same on every run.
    foreach my $id (sort keys %{ $words{$word} }) {
        my $catid       = $words{$word}{$id};
        my $description = $descriptions{$id};

        if (!defined($description) or $description eq '') {
            # Report the synset id that could not be resolved.
            print STDERR "Error: Failed to link word: $word / $id\n";
            exit 1;
        }

        push @{ $categories{$catid} }, $description;
    }

    # 1 = noun
    # 2 = verb
    # 3 = adjective
    # 4 = adverb
    my @sections;
    for my $catid (1 .. 4) {
        my $senses = $categories{$catid} or next;

        # Number the senses of this category: "1. foo 2. bar"
        my $n = 0;
        my $catdesc = join ' ', map { $n++; "$n. $_" } @{$senses};

        push @sections, getcatname($catid) . ": $catdesc";
    }

    # Reached only when every id of the word had a category outside 1..4.
    die "Internal error" unless @sections;

    print {$output} "$word\t", join("\t", @sections), "\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close $output or die "Close fail(#3): $!";

print "Done, output was successfully written!\n";
121 | |||
122 | |||