| File: | lib/CheckSpelling/DictionaryCoverage.pm |
| Coverage: | 95.9% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #! -*-perl-*- | |||||
| 2 | ||||||
| 3 | package CheckSpelling::DictionaryCoverage; | |||||
| 4 | ||||||
| 5 | our $VERSION='0.1.0'; | |||||
| 6 | 1 1 1 | 115186 2 48 | use File::Basename; | |||
| 7 | 1 1 1 | 2 1 37 | use Encode qw/decode_utf8 encode FB_DEFAULT/; | |||
| 8 | 1 1 1 | 189 1 24 | use CheckSpelling::Util; | |||
| 9 | ||||||
| 10 | use constant { | |||||
| 11 | 1 | 585 | NO_MATCHES => -1, | |||
| 12 | NOT_UNIQUE => -2, | |||||
| 13 | NO_WORD_SEEN_YET => '0' | |||||
| 14 | 1 1 | 2 0 | }; | |||
| 15 | ||||||
| 16 | my $hunspell; | |||||
| 17 | ||||||
| 18 | sub entry { | |||||
| 19 | 6 | 215 | my ($name) = @_; | |||
| 20 | 6 | 4 | my $handle; | |||
| 21 | 6 | 55 | unless (open ($handle, '<:utf8', $name)) { | |||
| 22 | 1 | 11 | print STDERR "Couldn't open dictionary `$name` (dictionary-not-found)\n"; | |||
| 23 | 1 | 10 | return 0; | |||
| 24 | } | |||||
| 25 | return { | |||||
| 26 | 5 | 13 | name => $name, | |||
| 27 | handle => $handle, | |||||
| 28 | word => NO_WORD_SEEN_YET, | |||||
| 29 | uniq => 0, | |||||
| 30 | covered => 0 | |||||
| 31 | } | |||||
| 32 | } | |||||
| 33 | ||||||
| 34 | sub hunspell_entry { | |||||
| 35 | 3 | 3 | my ($name) = @_; | |||
| 36 | 3 | 22 | unless (open ($handle, '<:utf8', $name)) { | |||
| 37 | 1 | 16 | print STDERR "Couldn't open dictionary `$name` (dictionary-not-found)\n"; | |||
| 38 | 1 | 2 | return 0; | |||
| 39 | } | |||||
| 40 | 2 | 16 | my $lines = <$handle>; | |||
| 41 | 2 | 3 | chomp $lines; | |||
| 42 | 2 | 6 | close $handle; | |||
| 43 | 2 | 2 | my $aff = $name; | |||
| 44 | 2 | 0 | my $encoding; | |||
| 45 | 2 | 4 | $aff =~ s/dic$/aff/; | |||
| 46 | 2 | 15 | if (open AFF, '<', $aff) { | |||
| 47 | 1 | 6 | while (<AFF>) { | |||
| 48 | 1 | 2 | next unless /^SET\s+(\S+)/; | |||
| 49 | 1 | 3 | $encoding = $1 if ($1 !~ /utf-8/i); | |||
| 50 | 1 | 1 | last; | |||
| 51 | } | |||||
| 52 | 1 | 3 | close AFF; | |||
| 53 | } | |||||
| 54 | 2 | 2 | my %map; | |||
| 55 | return { | |||||
| 56 | 2 | 507 | name => $name, | |||
| 57 | handle => undef, | |||||
| 58 | encoding => $encoding, | |||||
| 59 | engine => Text::Hunspell->new($aff, $name), | |||||
| 60 | word => NO_WORD_SEEN_YET, | |||||
| 61 | uniq => 0, | |||||
| 62 | coverage => \%map, | |||||
| 63 | lines => $lines | |||||
| 64 | } | |||||
| 65 | } | |||||
| 66 | ||||||
| 67 | sub update_unique { | |||||
| 68 | 10 | 5 | my ($uniq, $file_id) = @_; | |||
| 69 | 10 | 8 | if ($uniq == NO_MATCHES) { | |||
| 70 | 9 | 5 | $uniq = $file_id; | |||
| 71 | } elsif ($uniq > NO_MATCHES) { | |||||
| 72 | 1 | 1 | $uniq = NOT_UNIQUE; | |||
| 73 | } | |||||
| 74 | 10 | 7 | return $uniq; | |||
| 75 | } | |||||
| 76 | ||||||
| 77 | sub main { | |||||
| 78 | 7 | 9277 | my ($check, @dictionaries) = @_; | |||
| 79 | 7 | 5 | my @files; | |||
| 80 | my $unknown_words; | |||||
| 81 | 7 | 58 | unless (open($unknown_words, '<:utf8', $check)) { | |||
| 82 | 1 | 11 | print STDERR "Could not read $check\n"; | |||
| 83 | 1 | 2 | return 0; | |||
| 84 | } | |||||
| 85 | ||||||
| 86 | 6 | 8 | our $hunspell; | |||
| 87 | 6 | 4 | for my $name (@dictionaries) { | |||
| 88 | 8 | 11 | if ($name =~ /\.dic$/) { | |||
| 89 | 3 | 3 | unless ($hunspell) { | |||
| 90 | 1 1 1 1 | 166 1021 10 27 | unless (eval 'use Text::Hunspell; 1') { | |||
| 91 | 0 | 0 | print STDERR "Could not load Text::Hunspell for \`$name\` (hunspell-unavailable)\n"; | |||
| 92 | 0 | 0 | next; | |||
| 93 | } | |||||
| 94 | 1 | 1 | $hunspell = 1; | |||
| 95 | } | |||||
| 96 | 3 | 3 | push @files, hunspell_entry($name); | |||
| 97 | } else { | |||||
| 98 | 5 | 6 | push @files, entry($name); | |||
| 99 | } | |||||
| 100 | } | |||||
| 101 | ||||||
| 102 | 6 | 7 | my @results=@files; | |||
| 103 | 6 | 5 | while (@files) { | |||
| 104 | 19 | 44 | last if eof($unknown_words); | |||
| 105 | 14 | 14 | my $unknown = <$unknown_words>; | |||
| 106 | 14 | 12 | chomp $unknown; | |||
| 107 | 14 | 14 | last if ($unknown eq ''); | |||
| 108 | 14 | 6 | my @drop; | |||
| 109 | 14 | 7 | my $uniq = NO_MATCHES; | |||
| 110 | 14 | 10 | for (my $file_id = 0; $file_id < scalar @files; $file_id++) { | |||
| 111 | 17 | 11 | my $current = $files[$file_id]; | |||
| 112 | my ($word, $handle, $engine) = ( | |||||
| 113 | $current->{'word'}, | |||||
| 114 | $current->{'handle'}, | |||||
| 115 | 17 | 14 | $current->{'engine'}, | |||
| 116 | ); | |||||
| 117 | 17 | 43 | while ($word ne '' && $word lt $unknown) { | |||
| 118 | 18 | 10 | if ($engine) { | |||
| 119 | my $token_encoded = defined $hunspell_dictionary->{'encoding'} ? | |||||
| 120 | 10 | 11 | encode($hunspell_dictionary->{'encoding'}, $unknown) : $unknown; | |||
| 121 | 10 | 47 | if ($engine->check($token_encoded)) { | |||
| 122 | 5 | 37 | my $stem = $engine->stem($token_encoded); | |||
| 123 | 5 | 7 | $current->{'coverage'}->{$stem} = 1; | |||
| 124 | 5 | 5 | $uniq = update_unique($uniq, $file_id); | |||
| 125 | } | |||||
| 126 | 10 | 8 | last; | |||
| 127 | } | |||||
| 128 | 8 | 36 | if (eof $handle) { | |||
| 129 | 1 | 2 | $word = ''; | |||
| 130 | } else { | |||||
| 131 | 7 | 6 | $word = <$handle>; | |||
| 132 | 7 | 21 | chomp $word; | |||
| 133 | } | |||||
| 134 | } | |||||
| 135 | 17 | 18 | if ($word eq $unknown) { | |||
| 136 | 5 | 3 | ++$current->{'covered'}; | |||
| 137 | 5 | 6 | $uniq = update_unique($uniq, $file_id); | |||
| 138 | 5 | 5 | if (eof $handle) { | |||
| 139 | 2 | 2 | $word = ''; | |||
| 140 | } else { | |||||
| 141 | 3 | 1 | $word = <$handle>; | |||
| 142 | 3 | 2 | chomp $word; | |||
| 143 | } | |||||
| 144 | } | |||||
| 145 | 17 | 14 | $current->{'word'} = $word; | |||
| 146 | 17 | 24 | if ($word eq '') { | |||
| 147 | 3 | 5 | push @drop, $file_id; | |||
| 148 | } | |||||
| 149 | } | |||||
| 150 | 14 | 10 | if ($uniq > NO_MATCHES) { | |||
| 151 | 8 | 5 | my $current = $files[$uniq]; | |||
| 152 | 8 | 4 | ++$current->{'uniq'}; | |||
| 153 | } | |||||
| 154 | 14 | 15 | if (@drop) { | |||
| 155 | 3 | 3 | for $file_id (reverse @drop) { | |||
| 156 | 3 | 5 | splice @files, $file_id, 1; | |||
| 157 | } | |||||
| 158 | } | |||||
| 159 | } | |||||
| 160 | 6 | 5 | my $re=CheckSpelling::Util::get_file_from_env('aliases', ''); | |||
| 161 | 6 | 6 | my $extra_dictionaries = CheckSpelling::Util::get_file_from_env('extra_dictionaries', ''); | |||
| 162 | 6 | 9 | @dictionaries=split /\n/, $extra_dictionaries; | |||
| 163 | 6 | 11 | for (my $file_id = 0; $file_id < scalar @results; $file_id++) { | |||
| 164 | 8 | 3 | my $current = $results[$file_id]; | |||
| 165 | my $covered = $current->{'coverage'} | |||||
| 166 | 2 | 2 | ? scalar(keys %{$current->{'coverage'}}) | |||
| 167 | 8 | 11 | : $current->{'covered'}; | |||
| 168 | 8 | 19 | next unless $covered; | |||
| 169 | ||||||
| 170 | 5 | 6 | my $name = $current->{'name'}; | |||
| 171 | 5 | 2 | my $source_link; | |||
| 172 | 5 | 158 | my ($source_link_dir, $source_link_name) = (dirname($name), basename($name)); | |||
| 173 | 5 | 58 | if (open($source_link, '<', "$source_link_dir/.$source_link_name")) { | |||
| 174 | 2 | 15 | $name = <$source_link>; | |||
| 175 | 2 | 2 | chomp $name; | |||
| 176 | 2 | 5 | close $source_link; | |||
| 177 | } | |||||
| 178 | ||||||
| 179 | 5 | 5 | my $uniq = $current->{'uniq'}; | |||
| 180 | 5 | 3 | my $handle = $current->{'handle'}; | |||
| 181 | 5 | 2 | my $lines; | |||
| 182 | 5 | 4 | if ($handle) { | |||
| 183 | 3 | 2 | my $word = $current->{'word'}; | |||
| 184 | 3 | 5 | $word = <$handle> while !eof($handle); | |||
| 185 | 3 | 8 | $lines = $handle->input_line_number(); | |||
| 186 | } else { | |||||
| 187 | 2 | 2 | $lines = $current->{'lines'}; | |||
| 188 | } | |||||
| 189 | ||||||
| 190 | 5 | 33 | local $_ = $name; | |||
| 191 | 5 | 200 | eval $re; | |||
| 192 | 5 | 5 | my $url = $_; | |||
| 193 | ||||||
| 194 | 5 | 6 | my $name_without_spaces = $name; | |||
| 195 | 5 | 4 | $name_without_spaces =~ s/\s+/_/g; | |||
| 196 | ||||||
| 197 | 5 | 3 | my $unique = ''; | |||
| 198 | 5 | 5 | if ($uniq) { | |||
| 199 | 4 | 3 | $unique = " ($uniq uniquely)"; | |||
| 200 | } else { | |||||
| 201 | 1 | 1 | $uniq = 0; | |||
| 202 | } | |||||
| 203 | 5 | 160 | print "$covered-$lines-$uniq-$name_without_spaces [$name]($url) ($lines) covers $covered of them$unique\n"; | |||
| 204 | } | |||||
| 205 | } | |||||
| 206 | ||||||
| 207 | 1; | |||||