| File: | lib/CheckSpelling/SpellingCollator.pm |
| Coverage: | 89.8% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #! -*-perl-*- | |||||
| 2 | ||||||
| 3 | package CheckSpelling::SpellingCollator; | |||||
| 4 | ||||||
| 5 | our $VERSION='0.1.0'; | |||||
| 6 | 1 1 | 109062 1 | use 5.022; | |||
| 7 | 1 1 1 | 1 1 2 | use utf8; | |||
| 8 | 1 1 1 | 16 1 49 | use feature 'unicode_strings'; | |||
| 9 | 1 1 1 | 1 2 17 | use warnings; | |||
| 10 | 1 1 1 | 2 0 22 | use File::Path qw(remove_tree); | |||
| 11 | 1 1 1 | 232 0 3054 | use CheckSpelling::Util; | |||
| 12 | ||||||
| 13 | my %letter_map; | |||||
| 14 | my %ignored_event_map; | |||||
| 15 | my $disable_word_collating; | |||||
| 16 | my $shortest_word; | |||||
| 17 | ||||||
| 18 | my %last_seen; | |||||
| 19 | ||||||
| 20 | sub get_field { | |||||
| 21 | 40 | 32 | my ($record, $field) = @_; | |||
| 22 | 40 | 355 | return 0 unless $record =~ (/\b$field:\s*(\d+)/); | |||
| 23 | 30 | 37 | return $1; | |||
| 24 | } | |||||
| 25 | ||||||
| 26 | sub get_array { | |||||
| 27 | 4 | 4 | my ($record, $field) = @_; | |||
| 28 | 4 | 29 | return () unless $record =~ (/\b$field: \[([^\]]+)\]/); | |||
| 29 | 4 | 3 | my $values = $1; | |||
| 30 | 4 | 6 | return split /\s*,\s*/, $values; | |||
| 31 | } | |||||
| 32 | ||||||
| 33 | sub maybe { | |||||
| 34 | 7 | 5 | my ($next, $value) = @_; | |||
| 35 | 7 | 14 | $next = $value unless $next && $next < $value; | |||
| 36 | 7 | 6 | return $next; | |||
| 37 | } | |||||
| 38 | ||||||
| 39 | my %expected = (); | |||||
| 40 | sub expect_item { | |||||
| 41 | 94 | 55 | my ($item, $value) = @_; | |||
| 42 | 94 | 71 | $item =~ s/â/'/g; | |||
| 43 | 94 | 35 | our %expected; | |||
| 44 | 94 | 33 | my $next; | |||
| 45 | 94 | 97 | if (defined $expected{$item}) { | |||
| 46 | 22 | 12 | $next = $expected{$item}; | |||
| 47 | 22 | 17 | $next = $value if $value < $next; | |||
| 48 | } elsif ($item =~ /^([A-Z])(.*)/) { | |||||
| 49 | 12 | 10 | $item = $1 . lc $2; | |||
| 50 | 12 | 11 | if (defined $expected{$item}) { | |||
| 51 | 2 | 2 | $next = $expected{$item}; | |||
| 52 | 2 | 1 | $next = maybe($next, $value + .1); | |||
| 53 | } else { | |||||
| 54 | 10 | 5 | $item = lc $item; | |||
| 55 | 10 | 11 | if (defined $expected{$item}) { | |||
| 56 | 5 | 2 | $next = $expected{$item}; | |||
| 57 | 5 | 4 | $next = maybe($next, $value + .2); | |||
| 58 | } | |||||
| 59 | } | |||||
| 60 | } | |||||
| 61 | 94 | 104 | return 0 unless defined $next; | |||
| 62 | 29 | 22 | $expected{$item} = $next; | |||
| 63 | 29 | 57 | return $value; | |||
| 64 | } | |||||
| 65 | ||||||
| 66 | sub skip_item { | |||||
| 67 | 50 | 28 | my ($word) = @_; | |||
| 68 | 50 | 27 | return 1 if expect_item($word, 1); | |||
| 69 | 32 | 23 | my $key = lc $word; | |||
| 70 | 32 | 39 | return 2 if expect_item($key, 2); | |||
| 71 | 32 | 35 | if ($key =~ /.s$/) { | |||
| 72 | 2 | 2 | if ($key =~ /ies$/) { | |||
| 73 | 1 | 2 | $key =~ s/ies$/y/; | |||
| 74 | } else { | |||||
| 75 | 1 | 2 | $key =~ s/s$//; | |||
| 76 | } | |||||
| 77 | } elsif ($key =~ /^(.+[^aeiou])ed$/) { | |||||
| 78 | 1 | 1 | $key = $1; | |||
| 79 | } elsif ($key =~ /^(.+)'[ds]$/) { | |||||
| 80 | 4 | 3 | $key = $1; | |||
| 81 | } else { | |||||
| 82 | 25 | 23 | return 0; | |||
| 83 | } | |||||
| 84 | 7 | 5 | return 3 if expect_item($key, 3); | |||
| 85 | 0 | 0 | return 0; | |||
| 86 | } | |||||
| 87 | ||||||
| 88 | sub should_skip_warning { | |||||
| 89 | 72 | 44 | my ($warning) = @_; | |||
| 90 | 72 | 96 | if ($warning =~ /\(([-\w]+)\)$/) { | |||
| 91 | 68 | 42 | my ($code) = ($1); | |||
| 92 | 68 | 29 | our %ignored_event_map; | |||
| 93 | 68 | 52 | return 1 if $ignored_event_map{$code}; | |||
| 94 | } | |||||
| 95 | 71 | 56 | return 0; | |||
| 96 | } | |||||
| 97 | ||||||
| 98 | sub log_skip_item { | |||||
| 99 | 46 | 57 | my ($item, $file, $warning, $unknown_word_limit) = @_; | |||
| 100 | 46 | 31 | return 1 if should_skip_warning $warning; | |||
| 101 | 46 | 26 | return 1 if skip_item($item); | |||
| 102 | 21 | 8 | our %seen; | |||
| 103 | 21 | 14 | my $seen_count = $seen{$item}; | |||
| 104 | 21 | 16 | if (defined $seen_count) { | |||
| 105 | 8 | 11 | if (!defined $unknown_word_limit || ($seen_count++ < $unknown_word_limit)) { | |||
| 106 | 7 | 31 | print MORE_WARNINGS "$file$warning\n"; | |||
| 107 | } else { | |||||
| 108 | 1 | 1 | our %last_seen; | |||
| 109 | 1 | 2 | $last_seen{$item} = "$file$warning"; | |||
| 110 | } | |||||
| 111 | 8 | 9 | $seen{$item} = $seen_count; | |||
| 112 | 8 | 13 | return 1; | |||
| 113 | } | |||||
| 114 | 13 | 9 | $seen{$item} = 1; | |||
| 115 | 13 | 13 | return 0; | |||
| 116 | } | |||||
| 117 | ||||||
| 118 | sub stem_word { | |||||
| 119 | 28 | 858 | my ($key) = @_; | |||
| 120 | 28 | 9 | our $disable_word_collating; | |||
| 121 | 28 | 20 | return $key if $disable_word_collating; | |||
| 122 | 28 | 10 | our $shortest_word; | |||
| 123 | 28 | 18 | $shortest_word = 2 unless defined $shortest_word; | |||
| 124 | 28 | 16 | my $key_length = length $key; | |||
| 125 | ||||||
| 126 | 28 | 40 | if ($key =~ /.s$/) { | |||
| 127 | 5 | 26 | if ($key_length > ($shortest_word + 1) && $key =~ /ies$/) { | |||
| 128 | 1 | 2 | $key =~ s/ies$/y/; | |||
| 129 | } elsif ($key_length > $shortest_word && $key !~ /ies$/) { | |||||
| 130 | 3 | 3 | $key =~ s/s$//; | |||
| 131 | } | |||||
| 132 | } elsif ($key_length > ($shortest_word + 1) && $key =~ /.[^aeiou]ed$/) { | |||||
| 133 | 2 | 4 | $key =~ s/ed$//; | |||
| 134 | } | |||||
| 135 | 28 | 31 | return $key; | |||
| 136 | } | |||||
| 137 | ||||||
| 138 | sub collate_key { | |||||
| 139 | 81 | 55 | my ($key) = @_; | |||
| 140 | 81 | 26 | our $disable_word_collating; | |||
| 141 | 81 | 31 | my $char; | |||
| 142 | 81 | 50 | if ($disable_word_collating) { | |||
| 143 | 8 | 9 | $char = lc substr $key, 0, 1; | |||
| 144 | } else { | |||||
| 145 | 73 | 48 | $key = lc $key; | |||
| 146 | 73 | 46 | $key =~ s/''+/'/g; | |||
| 147 | 73 | 42 | $key =~ s/'[sd]$//; | |||
| 148 | 73 | 34 | $key =~ s/^[^Ii]?'+(.*)/$1/; | |||
| 149 | 73 | 37 | $key =~ s/(.*?)'$/$1/; | |||
| 150 | 73 | 64 | $char = substr $key, 0, 1; | |||
| 151 | } | |||||
| 152 | 81 | 101 | return ($key, $char); | |||
| 153 | } | |||||
| 154 | ||||||
| 155 | sub load_expect { | |||||
| 156 | 12 | 504 | my ($expect) = @_; | |||
| 157 | 12 | 24 | our %expected; | |||
| 158 | 12 | 13 | %expected = (); | |||
| 159 | 12 | 94 | if (open(EXPECT, '<:utf8', $expect)) { | |||
| 160 | 12 | 59 | while (my $word = <EXPECT>) { | |||
| 161 | 43 | 57 | $word =~ s/\R//; | |||
| 162 | 43 | 75 | $expected{$word} = 0; | |||
| 163 | } | |||||
| 164 | 12 | 34 | close EXPECT; | |||
| 165 | } | |||||
| 166 | } | |||||
| 167 | ||||||
| 168 | sub harmonize_expect { | |||||
| 169 | 11 | 4 | our $disable_word_collating; | |||
| 170 | 11 | 5 | our %letter_map; | |||
| 171 | 11 | 6 | our %expected; | |||
| 172 | ||||||
| 173 | 11 | 14 | for my $word (keys %expected) { | |||
| 174 | 40 | 15 | my ($key, $char) = collate_key $word; | |||
| 175 | 40 | 21 | my %word_map = (); | |||
| 176 | 40 | 46 | next unless defined $letter_map{$char}{$key}; | |||
| 177 | 13 13 | 7 20 | %word_map = %{$letter_map{$char}{$key}}; | |||
| 178 | 13 | 16 | next if defined $word_map{$word}; | |||
| 179 | 3 | 3 | my $words = scalar keys %word_map; | |||
| 180 | 3 | 1 | next if $words > 2; | |||
| 181 | 3 | 3 | if ($word eq $key) { | |||
| 182 | 1 | 2 | next if ($words > 1); | |||
| 183 | } | |||||
| 184 | 2 | 2 | delete $expected{$word}; | |||
| 185 | } | |||||
| 186 | } | |||||
| 187 | ||||||
| 188 | sub group_related_words { | |||||
| 189 | 12 | 6 | our %letter_map; | |||
| 190 | 12 | 6 | our $disable_word_collating; | |||
| 191 | 12 | 9 | return if $disable_word_collating; | |||
| 192 | ||||||
| 193 | # group related words | |||||
| 194 | 11 | 22 | for my $char (sort CheckSpelling::Util::number_biased keys %letter_map) { | |||
| 195 | 19 19 | 9 19 | for my $plural_key (sort keys(%{$letter_map{$char}})) { | |||
| 196 | 22 | 11 | my $key = stem_word $plural_key; | |||
| 197 | 22 | 22 | next if $key eq $plural_key; | |||
| 198 | 4 | 5 | next unless defined $letter_map{$char}{$key}; | |||
| 199 | 3 3 | 1 5 | my %word_map = %{$letter_map{$char}{$key}}; | |||
| 200 | 3 3 | 2 3 | for my $word (keys(%{$letter_map{$char}{$plural_key}})) { | |||
| 201 | 3 | 2 | $word_map{$word} = 1; | |||
| 202 | } | |||||
| 203 | 3 | 2 | $letter_map{$char}{$key} = \%word_map; | |||
| 204 | 3 | 5 | delete $letter_map{$char}{$plural_key}; | |||
| 205 | } | |||||
| 206 | } | |||||
| 207 | } | |||||
| 208 | ||||||
| 209 | sub count_warning { | |||||
| 210 | 16 | 16 | my ($warning) = @_; | |||
| 211 | 16 | 6 | our %counters; | |||
| 212 | 16 | 7 | our %ignored_event_map; | |||
| 213 | 16 | 30 | if ($warning =~ /\(([-\w]+)\)$/) { | |||
| 214 | 10 | 7 | my ($code) = ($1); | |||
| 215 | 10 | 9 | next if defined $ignored_event_map{$code}; | |||
| 216 | 10 | 11 | ++$counters{$code}; | |||
| 217 | } | |||||
| 218 | } | |||||
| 219 | ||||||
| 220 | sub report_timing { | |||||
| 221 | 0 | 0 | my ($name, $start_time, $directory, $marker) = @_; | |||
| 222 | 0 | 0 | my $end_time = (stat "$directory/$marker")[9]; | |||
| 223 | 0 | 0 | $name =~ s/"/\\"/g; | |||
| 224 | 0 | 0 | print TIMING_REPORT "\"$name\", $start_time, $end_time\n"; | |||
| 225 | } | |||||
| 226 | ||||||
| 227 | sub get_pattern_with_context { | |||||
| 228 | 24 | 21 | my ($path) = @_; | |||
| 229 | 24 | 22 | return unless defined $ENV{$path}; | |||
| 230 | 24 | 22 | $ENV{$path} =~ /(.*)/; | |||
| 231 | 24 | 135 | return unless open ITEMS, '<:utf8', $1; | |||
| 232 | ||||||
| 233 | 24 | 14 | my @items; | |||
| 234 | 24 | 16 | my $context = ''; | |||
| 235 | 24 | 84 | while (<ITEMS>) { | |||
| 236 | 5 | 5 | my $pattern = $_; | |||
| 237 | 5 | 6 | if ($pattern =~ /^#/) { | |||
| 238 | 2 | 3 | if ($pattern =~ /^# /) { | |||
| 239 | 2 | 3 | $context .= $pattern; | |||
| 240 | } else { | |||||
| 241 | 0 | 0 | $context = ''; | |||
| 242 | } | |||||
| 243 | 2 | 3 | next; | |||
| 244 | } | |||||
| 245 | 3 | 2 | chomp $pattern; | |||
| 246 | 3 | 3 | unless ($pattern =~ /./) { | |||
| 247 | 1 | 1 | $context = ''; | |||
| 248 | 1 | 2 | next; | |||
| 249 | } | |||||
| 250 | 2 | 4 | push @items, $context.$pattern; | |||
| 251 | 2 | 5 | $context = ''; | |||
| 252 | } | |||||
| 253 | 24 | 49 | close ITEMS; | |||
| 254 | 24 | 26 | return @items; | |||
| 255 | } | |||||
| 256 | ||||||
| 257 | sub summarize_totals { | |||||
| 258 | 24 | 20 | my ($formatter, $path, $items, $totals, $file_counts) = @_; | |||
| 259 | 24 24 | 11 24 | return unless @{$totals}; | |||
| 260 | 2 | 58 | return unless open my $fh, '>:utf8', $path; | |||
| 261 | 2 2 | 2 2 | my $totals_count = scalar(@{$totals}) - 1; | |||
| 262 | 2 | 1 | my @indices; | |||
| 263 | 2 | 2 | if ($file_counts) { | |||
| 264 | @indices = sort { | |||||
| 265 | 1 0 | 2 0 | $totals->[$b] <=> $totals->[$a] || | |||
| 266 | $file_counts->[$b] <=> $file_counts->[$a] | |||||
| 267 | } 0 .. $totals_count; | |||||
| 268 | } else { | |||||
| 269 | @indices = sort { | |||||
| 270 | 1 0 | 1 0 | $totals->[$b] <=> $totals->[$a] | |||
| 271 | } 0 .. $totals_count; | |||||
| 272 | } | |||||
| 273 | 2 | 2 | for my $i (@indices) { | |||
| 274 | 2 | 2 | last unless $totals->[$i] > 0; | |||
| 275 | 2 | 2 | my $rule_with_context = $items->[$i]; | |||
| 276 | 2 | 2 | my ($description, $rule); | |||
| 277 | 2 | 6 | if ($rule_with_context =~ /^(.*\n)([^\n]+)$/s) { | |||
| 278 | 2 | 3 | ($description, $rule) = ($1, $2); | |||
| 279 | } else { | |||||
| 280 | 0 | 0 | ($description, $rule) = ('', $rule_with_context); | |||
| 281 | } | |||||
| 282 | 2 | 4 | print $fh $formatter->( | |||
| 283 | $totals->[$i], | |||||
| 284 | ($file_counts ? " file-count: $file_counts->[$i]" : ""), | |||||
| 285 | $description, | |||||
| 286 | $rule | |||||
| 287 | ); | |||||
| 288 | } | |||||
| 289 | 2 | 62 | close $fh; | |||
| 290 | } | |||||
| 291 | ||||||
| 292 | sub get_special { | |||||
| 293 | 19 | 13 | my ($file, $special) = @_; | |||
| 294 | 19 | 25 | return 'file-list' if $file eq $special->{'file_list'}; | |||
| 295 | 17 | 18 | return 'pr-title' if $file eq $special->{'pr_title_file'}; | |||
| 296 | 15 | 13 | return 'pr-description' if $file eq $special->{'pr_description_file'}; | |||
| 297 | 13 | 26 | return 'commit-message' if !rindex($file, $special->{'commit_messages'}); | |||
| 298 | 11 | 14 | return 'file'; | |||
| 299 | } | |||||
| 300 | ||||||
| 301 | sub main { | |||||
| 302 | 12 | 22508 | my @directories; | |||
| 303 | my @cleanup_directories; | |||||
| 304 | 12 | 0 | my @check_file_paths; | |||
| 305 | ||||||
| 306 | 12 | 12 | my $early_warnings = CheckSpelling::Util::get_file_from_env('early_warnings', '/dev/null'); | |||
| 307 | 12 | 9 | my $warning_output = CheckSpelling::Util::get_file_from_env('warning_output', '/dev/stderr'); | |||
| 308 | 12 | 9 | my $more_warnings = CheckSpelling::Util::get_file_from_env('more_warnings', '/dev/stderr'); | |||
| 309 | 12 | 8 | my $counter_summary = CheckSpelling::Util::get_file_from_env('counter_summary', '/dev/stderr'); | |||
| 310 | 12 | 8 | my $ignored_events = CheckSpelling::Util::get_file_from_env('ignored_events', ''); | |||
| 311 | 12 | 12 | if ($ignored_events) { | |||
| 312 | 6 | 2 | our %ignored_event_map; | |||
| 313 | 6 | 8 | for my $event (split /,/, $ignored_events) { | |||
| 314 | 6 | 7 | $ignored_event_map{$event} = 1; | |||
| 315 | } | |||||
| 316 | } | |||||
| 317 | 12 | 9 | my $should_exclude_file = CheckSpelling::Util::get_file_from_env('should_exclude_file', '/dev/null'); | |||
| 318 | 12 | 10 | my $unknown_word_limit = CheckSpelling::Util::get_val_from_env('unknown_word_limit', undef); | |||
| 319 | 12 | 7 | my $unknown_file_word_limit = CheckSpelling::Util::get_val_from_env('unknown_file_word_limit', undef); | |||
| 320 | 12 | 7 | my $candidate_example_limit = CheckSpelling::Util::get_file_from_env('INPUT_CANDIDATE_EXAMPLE_LIMIT', '3'); | |||
| 321 | 12 | 6 | my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', ''); | |||
| 322 | 12 | 8 | my $only_check_changed_files = CheckSpelling::Util::get_file_from_env('INPUT_ONLY_CHECK_CHANGED_FILES', ''); | |||
| 323 | 12 | 9 | my $disable_noisy_file = $disable_flags =~ /(?:^|,|\s)noisy-file(?:,|\s|$)/; | |||
| 324 | 12 | 28 | our $disable_word_collating = $only_check_changed_files || $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/; | |||
| 325 | 12 | 11 | our $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef); | |||
| 326 | 12 | 11 | my $file_list = CheckSpelling::Util::get_file_from_env('check_file_names', ''); | |||
| 327 | 12 | 8 | my $pr_title_file = CheckSpelling::Util::get_file_from_env('pr_title_file', ''); | |||
| 328 | 12 | 9 | my $pr_description_file = CheckSpelling::Util::get_file_from_env('pr_description_file', ''); | |||
| 329 | 12 | 6 | my $commit_messages = CheckSpelling::Util::get_file_from_env('commit_messages', ''); | |||
| 330 | 12 | 9 | my $timing_report = CheckSpelling::Util::get_file_from_env('timing_report', ''); | |||
| 331 | 12 | 21 | my $special = { | |||
| 332 | 'file_list' => $file_list, | |||||
| 333 | 'pr_title_file' => $pr_title_file, | |||||
| 334 | 'pr_description_file' => $pr_description_file, | |||||
| 335 | 'commit_messages' => $commit_messages, | |||||
| 336 | }; | |||||
| 337 | 12 | 5 | my ($start_time, $end_time); | |||
| 338 | ||||||
| 339 | 12 | 236 | open WARNING_OUTPUT, '>:utf8', $warning_output; | |||
| 340 | 12 | 165 | open MORE_WARNINGS, '>:utf8', $more_warnings; | |||
| 341 | 12 | 145 | open COUNTER_SUMMARY, '>:utf8', $counter_summary; | |||
| 342 | 12 | 103 | open SHOULD_EXCLUDE, '>:utf8', $should_exclude_file; | |||
| 343 | 12 | 10 | if ($timing_report) { | |||
| 344 | 0 | 0 | open TIMING_REPORT, '>:utf8', $timing_report; | |||
| 345 | 0 | 0 | print TIMING_REPORT "file, start, finish\n"; | |||
| 346 | } | |||||
| 347 | ||||||
| 348 | 12 | 10 | my @candidates = get_pattern_with_context('candidates_path'); | |||
| 349 | 12 | 12 | my @candidate_totals = (0) x scalar @candidates; | |||
| 350 | 12 | 7 | my @candidate_file_counts = (0) x scalar @candidates; | |||
| 351 | ||||||
| 352 | 12 | 6 | my @forbidden = get_pattern_with_context('forbidden_path'); | |||
| 353 | 12 | 10 | my @forbidden_totals = (0) x scalar @forbidden; | |||
| 354 | ||||||
| 355 | 12 | 9 | my @delayed_warnings; | |||
| 356 | 12 | 23 | our %letter_map = (); | |||
| 357 | ||||||
| 358 | 12 | 5 | my %file_map = (); | |||
| 359 | ||||||
| 360 | 12 | 32 | for my $directory (<>) { | |||
| 361 | 15 | 11 | chomp $directory; | |||
| 362 | 15 | 24 | next unless $directory =~ /^(.*)$/; | |||
| 363 | 15 | 14 | $directory = $1; | |||
| 364 | 15 | 44 | unless (-e $directory) { | |||
| 365 | 1 | 3 | print STDERR "Could not find: $directory\n"; | |||
| 366 | 1 | 1 | next; | |||
| 367 | } | |||||
| 368 | 14 | 30 | unless (-d $directory) { | |||
| 369 | 1 | 11 | print STDERR "Not a directory: $directory\n"; | |||
| 370 | 1 | 1 | next; | |||
| 371 | } | |||||
| 372 | ||||||
| 373 | # if there's no filename, we can't report | |||||
| 374 | 13 | 98 | next unless open(NAME, '<:utf8', "$directory/name"); | |||
| 375 | 12 | 60 | my $file=<NAME>; | |||
| 376 | 12 | 23 | close NAME; | |||
| 377 | ||||||
| 378 | 12 | 23 | $file_map{$file} = $directory; | |||
| 379 | } | |||||
| 380 | ||||||
| 381 | 12 | 21 | for my $file (sort keys %file_map) { | |||
| 382 | 12 | 9 | my $directory = $file_map{$file}; | |||
| 383 | 12 | 10 | if ($timing_report) { | |||
| 384 | 0 | 0 | $start_time = (stat "$directory/name")[9]; | |||
| 385 | } | |||||
| 386 | ||||||
| 387 | 12 | 59 | if (-e "$directory/skipped") { | |||
| 388 | 1 | 6 | open SKIPPED, '<:utf8', "$directory/skipped"; | |||
| 389 | 1 | 9 | my $reason=<SKIPPED>; | |||
| 390 | 1 | 2 | close SKIPPED; | |||
| 391 | 1 | 1 | chomp $reason; | |||
| 392 | 1 | 3 | push @delayed_warnings, "$file:1:1 ... 1, Warning - Skipping `$file` because $reason\n"; | |||
| 393 | 1 | 2 | print SHOULD_EXCLUDE "$file\n"; | |||
| 394 | 1 | 1 | push @cleanup_directories, $directory; | |||
| 395 | 1 | 1 | report_timing($file, $start_time, $directory, 'skipped') if ($timing_report); | |||
| 396 | 1 | 1 | next; | |||
| 397 | } | |||||
| 398 | ||||||
| 399 | # stats isn't written if there was nothing interesting in the file | |||||
| 400 | 11 | 34 | unless (-s "$directory/stats") { | |||
| 401 | 1 | 1 | report_timing($file, $start_time, $directory, 'warnings') if ($timing_report); | |||
| 402 | 1 | 1 | push @directories, $directory; | |||
| 403 | 1 | 1 | next; | |||
| 404 | } | |||||
| 405 | ||||||
| 406 | 10 | 8 | if ($file eq $file_list) { | |||
| 407 | 1 | 6 | open FILE_LIST, '<:utf8', $file_list; | |||
| 408 | 1 | 0 | push @check_file_paths, '0 placeholder'; | |||
| 409 | 1 | 6 | for my $check_file_path (<FILE_LIST>) { | |||
| 410 | 4 | 4 | chomp $check_file_path; | |||
| 411 | 4 | 3 | push @check_file_paths, $check_file_path; | |||
| 412 | } | |||||
| 413 | 1 | 2 | close FILE_LIST; | |||
| 414 | } | |||||
| 415 | ||||||
| 416 | 10 | 8 | my ($words, $unrecognized, $unknown, $unique); | |||
| 417 | ||||||
| 418 | { | |||||
| 419 | 10 10 | 6 50 | open STATS, '<:utf8', "$directory/stats"; | |||
| 420 | 10 | 41 | my $stats=<STATS>; | |||
| 421 | 10 | 16 | close STATS; | |||
| 422 | 10 | 9 | $words=get_field($stats, 'words'); | |||
| 423 | 10 | 7 | $unrecognized=get_field($stats, 'unrecognized'); | |||
| 424 | 10 | 6 | $unknown=get_field($stats, 'unknown'); | |||
| 425 | 10 | 7 | $unique=get_field($stats, 'unique'); | |||
| 426 | 10 | 3 | my @candidate_list; | |||
| 427 | 10 | 9 | if (@candidate_totals) { | |||
| 428 | 1 | 1 | @candidate_list=get_array($stats, 'candidates'); | |||
| 429 | 1 | 1 | my @lines=get_array($stats, 'candidate_lines'); | |||
| 430 | 1 | 1 | if (@candidate_list) { | |||
| 431 | 1 | 1 | for (my $i=0; $i < scalar @candidate_list; $i++) { | |||
| 432 | 1 | 1 | my $hits = $candidate_list[$i]; | |||
| 433 | 1 | 1 | if ($hits) { | |||
| 434 | 1 | 1 | $candidate_totals[$i] += $hits; | |||
| 435 | 1 | 1 | if ($candidate_file_counts[$i]++ < $candidate_example_limit) { | |||
| 436 | 1 | 2 | my $pattern = (split /\n/,$candidates[$i])[-1]; | |||
| 437 | 1 | 1 | my $position = $lines[$i]; | |||
| 438 | 1 | 5 | $position =~ s/:(\d+)$/ ... $1/; | |||
| 439 | 1 | 1 | my $wrapped = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($pattern), 99); | |||
| 440 | 1 | 1 | my $candidate_label = ''; | |||
| 441 | 1 | 2 | if ($candidates[$i] =~ /^#\s+(\S.+)/) { | |||
| 442 | 1 | 1 | $candidate_label = " ($1)"; | |||
| 443 | } | |||||
| 444 | 1 | 5 | push @delayed_warnings, "$file:$position, Notice - Line matches candidate pattern$candidate_label $wrapped (candidate-pattern)\n"; | |||
| 445 | } | |||||
| 446 | } | |||||
| 447 | } | |||||
| 448 | } | |||||
| 449 | } | |||||
| 450 | 10 | 28 | if (@forbidden_totals) { | |||
| 451 | 1 | 1 | my @forbidden_list=get_array($stats, 'forbidden'); | |||
| 452 | 1 | 1 | my @lines=get_array($stats, 'forbidden_lines'); | |||
| 453 | 1 | 1 | if (@forbidden_list) { | |||
| 454 | 1 | 1 | for (my $i=0; $i < scalar @forbidden_list; $i++) { | |||
| 455 | 1 | 1 | my $hits = $forbidden_list[$i]; | |||
| 456 | 1 | 0 | if ($hits) { | |||
| 457 | 1 | 2 | $forbidden_totals[$i] += $hits; | |||
| 458 | } | |||||
| 459 | } | |||||
| 460 | } | |||||
| 461 | } | |||||
| 462 | #print STDERR "$file (unrecognized: $unrecognized; unique: $unique; unknown: $unknown, words: $words, candidates: [".join(", ", @candidate_list)."])\n"; | |||||
| 463 | } | |||||
| 464 | ||||||
| 465 | 10 | 10 | report_timing($file, $start_time, $directory, 'unknown') if ($timing_report); | |||
| 466 | 10 | 9 | my $kind = get_special($file, $special); | |||
| 467 | # These heuristics are very new and need tuning/feedback | |||||
| 468 | 10 | 11 | if ( | |||
| 469 | ($unknown > $unique) | |||||
| 470 | # || ($unrecognized > $words / 2) | |||||
| 471 | ) { | |||||
| 472 | 1 | 1 | unless ($disable_noisy_file) { | |||
| 473 | 1 | 1 | if ($kind eq 'file') { | |||
| 474 | 1 | 5 | print SHOULD_EXCLUDE "$file\n"; | |||
| 475 | } | |||||
| 476 | 1 | 1 | my $warning = "noisy-$kind"; | |||
| 477 | 1 | 1 | count_warning $warning; | |||
| 478 | 1 | 2 | push @delayed_warnings, "$file:1:1 ... 1, Warning - Skipping `$file` because it seems to have more noise ($unknown) than unique words ($unique) (total: $unrecognized / $words). ($warning)\n"; | |||
| 479 | 1 | 0 | push @cleanup_directories, $directory; | |||
| 480 | 1 | 2 | next; | |||
| 481 | } | |||||
| 482 | } | |||||
| 483 | 9 | 6 | push @directories, $directory; | |||
| 484 | 9 | 38 | unless ($kind =~ /^file/ && -s "$directory/unknown") { | |||
| 485 | 4 | 6 | next; | |||
| 486 | } | |||||
| 487 | 5 | 32 | open UNKNOWN, '<:utf8', "$directory/unknown"; | |||
| 488 | 5 | 33 | for my $token (<UNKNOWN>) { | |||
| 489 | 43 | 48 | $token =~ s/\R//; | |||
| 490 | 43 | 35 | next unless $token =~ /./; | |||
| 491 | 41 | 40 | my ($key, $char) = collate_key $token; | |||
| 492 | 41 | 48 | $letter_map{$char} = () unless defined $letter_map{$char}; | |||
| 493 | 41 | 24 | my %word_map = (); | |||
| 494 | 41 14 | 31 20 | %word_map = %{$letter_map{$char}{$key}} if defined $letter_map{$char}{$key}; | |||
| 495 | 41 | 34 | $word_map{$token} = 1; | |||
| 496 | 41 | 49 | $letter_map{$char}{$key} = \%word_map; | |||
| 497 | } | |||||
| 498 | 5 | 19 | close UNKNOWN; | |||
| 499 | } | |||||
| 500 | 12 | 36 | close SHOULD_EXCLUDE; | |||
| 501 | 12 | 9 | close TIMING_REPORT if $timing_report; | |||
| 502 | ||||||
| 503 | summarize_totals( | |||||
| 504 | sub { | |||||
| 505 | 1 | 1 | my ($hits, $files, $context, $pattern) = @_; | |||
| 506 | 1 | 5 | return "# hit-count: $hits$files\n$context$pattern\n\n", | |||
| 507 | }, | |||||
| 508 | 12 | 36 | CheckSpelling::Util::get_file_from_env('candidate_summary', '/dev/stderr'), | |||
| 509 | \@candidates, | |||||
| 510 | \@candidate_totals, | |||||
| 511 | \@candidate_file_counts, | |||||
| 512 | ); | |||||
| 513 | ||||||
| 514 | summarize_totals( | |||||
| 515 | sub { | |||||
| 516 | 1 | 1 | my (undef, undef, $context, $pattern) = @_; | |||
| 517 | 1 | 2 | $context =~ s/^# //gm; | |||
| 518 | 1 | 1 | chomp $context; | |||
| 519 | 1 | 1 | my $details; | |||
| 520 | 1 | 3 | if ($context =~ /^(.*?)$(.*)/ms) { | |||
| 521 | 1 | 1 | ($context, $details) = ($1, $2); | |||
| 522 | 1 | 1 | $details = "\n$details" if $details; | |||
| 523 | } | |||||
| 524 | 1 | 6 | $context = 'Pattern' unless $context; | |||
| 525 | 1 | 5 | return "##### $context$details\n```\n$pattern\n```\n\n"; | |||
| 526 | }, | |||||
| 527 | 12 | 34 | CheckSpelling::Util::get_file_from_env('forbidden_summary', '/dev/stderr'), | |||
| 528 | \@forbidden, | |||||
| 529 | \@forbidden_totals, | |||||
| 530 | ); | |||||
| 531 | ||||||
| 532 | 12 | 35 | group_related_words; | |||
| 533 | ||||||
| 534 | 12 | 17 | if (defined $ENV{'expect'}) { | |||
| 535 | 11 | 10 | $ENV{'expect'} =~ /(.*)/; | |||
| 536 | 11 | 9 | load_expect $1; | |||
| 537 | 11 | 7 | harmonize_expect; | |||
| 538 | } | |||||
| 539 | ||||||
| 540 | 12 | 9 | my %seen = (); | |||
| 541 | 12 | 4 | our %counters; | |||
| 542 | 12 | 9 | %counters = (); | |||
| 543 | ||||||
| 544 | 12 | 45 | if (-s $early_warnings) { | |||
| 545 | 1 | 5 | open WARNINGS, '<:utf8', $early_warnings; | |||
| 546 | 1 | 8 | for my $warning (<WARNINGS>) { | |||
| 547 | 1 | 1 | chomp $warning; | |||
| 548 | 1 | 1 | count_warning $warning; | |||
| 549 | 1 | 3 | next if should_skip_warning $warning; | |||
| 550 | 1 | 4 | print WARNING_OUTPUT "$warning\n"; | |||
| 551 | } | |||||
| 552 | 1 | 3 | close WARNINGS; | |||
| 553 | } | |||||
| 554 | ||||||
| 555 | 12 | 6 | our %last_seen; | |||
| 556 | 12 | 4 | my %unknown_file_word_count; | |||
| 557 | 12 | 9 | for my $directory (@directories) { | |||
| 558 | 10 | 29 | next unless (-s "$directory/warnings"); | |||
| 559 | 9 | 53 | next unless open(NAME, '<:utf8', "$directory/name"); | |||
| 560 | 9 | 35 | my $file=<NAME>; | |||
| 561 | 9 | 16 | close NAME; | |||
| 562 | 9 | 8 | my $kind = get_special($file, $special); | |||
| 563 | 9 | 46 | open WARNINGS, '<:utf8', "$directory/warnings"; | |||
| 564 | 9 | 9 | if ($kind ne 'file-list') { | |||
| 565 | 8 | 46 | for my $warning (<WARNINGS>) { | |||
| 566 | 50 | 38 | chomp $warning; | |||
| 567 | 50 | 90 | if ($warning =~ m/:(\d+):(\d+ \.\.\. \d+): `(.*)`/) { | |||
| 568 | 46 | 40 | my ($line, $range, $item) = ($1, $2, $3); | |||
| 569 | 46 | 36 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($item); | |||
| 570 | 46 | 25 | my $reason = 'unrecognized-spelling'; | |||
| 571 | 46 | 30 | $reason .= "-$kind" unless $kind eq 'file'; | |||
| 572 | 46 | 100 | $warning =~ s/:\d+:\d+ \.\.\. \d+: `.*`/:$line:$range, Warning - $wrapped is not a recognized word ($reason)/; | |||
| 573 | 46 | 33 | next if log_skip_item($item, $file, $warning, $unknown_word_limit); | |||
| 574 | 13 | 11 | count_warning $warning if $kind ne 'file'; | |||
| 575 | } else { | |||||
| 576 | 4 | 4 | if ($warning =~ /\`(.*?)\` in line \(token-is-substring\)/) { | |||
| 577 | 0 | 0 | next if skip_item($1); | |||
| 578 | } | |||||
| 579 | 4 | 4 | count_warning $warning; | |||
| 580 | } | |||||
| 581 | 17 | 6 | next if should_skip_warning $warning; | |||
| 582 | 17 | 57 | print WARNING_OUTPUT "$file$warning\n"; | |||
| 583 | } | |||||
| 584 | } else { | |||||
| 585 | 1 | 7 | for my $warning (<WARNINGS>) { | |||
| 586 | 6 | 6 | chomp $warning; | |||
| 587 | 6 | 10 | next unless $warning =~ s/^:(\d+)/:1/; | |||
| 588 | 6 | 6 | $file = $check_file_paths[$1]; | |||
| 589 | 6 | 12 | if ($warning =~ m/:(\d+ \.\.\. \d+): `(.*)`/) { | |||
| 590 | 4 | 4 | my ($range, $item) = ($1, $2); | |||
| 591 | 4 | 4 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($item); | |||
| 592 | 4 | 11 | $warning =~ s/:\d+ \.\.\. \d+: `.*`/:$range, Warning - $wrapped is not a recognized word (check-file-path)/; | |||
| 593 | 4 | 2 | next if skip_item($item); | |||
| 594 | 4 | 3 | if (defined $unknown_file_word_limit) { | |||
| 595 | 4 | 10 | next if ++$unknown_file_word_count{$item} > $unknown_file_word_limit; | |||
| 596 | } | |||||
| 597 | } | |||||
| 598 | 5 | 4 | next if should_skip_warning $warning; | |||
| 599 | 4 | 11 | print WARNING_OUTPUT "$file$warning\n"; | |||
| 600 | 4 | 9 | count_warning $warning; | |||
| 601 | } | |||||
| 602 | } | |||||
| 603 | 9 | 34 | close WARNINGS; | |||
| 604 | } | |||||
| 605 | 12 | 214 | close MORE_WARNINGS; | |||
| 606 | ||||||
| 607 | 12 | 11 | for my $warning (@delayed_warnings) { | |||
| 608 | 3 | 3 | next if should_skip_warning $warning; | |||
| 609 | 3 | 2 | count_warning $warning; | |||
| 610 | 3 | 3 | print WARNING_OUTPUT $warning; | |||
| 611 | } | |||||
| 612 | 12 | 11 | if (defined $unknown_word_limit) { | |||
| 613 | 1 | 2 | for my $warned_word (sort keys %last_seen) { | |||
| 614 | 1 | 3 | my $warning_count = $seen{$warned_word} || 0; | |||
| 615 | 1 | 2 | next unless $warning_count >= $unknown_word_limit; | |||
| 616 | 0 | 0 | my $warning = $last_seen{$warned_word}; | |||
| 617 | 0 | 0 | $warning =~ s/\Q (unrecognized-spelling)\E/ -- found $warning_count times (limited-references)\n/; | |||
| 618 | 0 | 0 | next if should_skip_warning $warning; | |||
| 619 | 0 | 0 | print WARNING_OUTPUT $warning; | |||
| 620 | 0 | 0 | count_warning $warning; | |||
| 621 | } | |||||
| 622 | } | |||||
| 623 | 12 | 253 | close WARNING_OUTPUT; | |||
| 624 | ||||||
| 625 | 12 | 12 | if (%counters) { | |||
| 626 | 3 | 3 | my $continue=''; | |||
| 627 | 3 | 5 | print COUNTER_SUMMARY "{\n"; | |||
| 628 | 3 | 5 | for my $code (sort keys %counters) { | |||
| 629 | 6 | 9 | print COUNTER_SUMMARY qq<$continue"$code": $counters{$code}\n>; | |||
| 630 | 6 | 5 | $continue=','; | |||
| 631 | } | |||||
| 632 | 3 | 3 | print COUNTER_SUMMARY "}\n"; | |||
| 633 | } | |||||
| 634 | 12 | 93 | close COUNTER_SUMMARY; | |||
| 635 | ||||||
| 636 | # display the current unknown | |||||
| 637 | 12 | 25 | for my $char (sort keys %letter_map) { | |||
| 638 | 43 43 | 23 95 | for my $key (sort CheckSpelling::Util::case_biased keys(%{$letter_map{$char}})) { | |||
| 639 | 24 24 | 15 32 | my %word_map = %{$letter_map{$char}{$key}}; | |||
| 640 | 24 | 19 | my @words = keys(%word_map); | |||
| 641 | 24 | 14 | if (scalar(@words) > 1) { | |||
| 642 | 13 20 | 39 84 | print $key." (".(join ", ", sort { length($a) <=> length($b) || $a cmp $b } @words).")"; | |||
| 643 | } else { | |||||
| 644 | 11 | 36 | print $words[0]; | |||
| 645 | } | |||||
| 646 | 24 | 99 | print "\n"; | |||
| 647 | } | |||||
| 648 | } | |||||
| 649 | } | |||||
| 650 | ||||||
| 651 | 1; | |||||