| File: | lib/CheckSpelling/UnknownWordSplitter.pm |
| Coverage: | 86.0% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #! -*-perl-*- | |||||
| 2 | ||||||
| 3 | # ~/bin/w | |||||
| 4 | # Search for potentially misspelled words | |||||
| 5 | # Output is: | |||||
| 6 | # misspellled | |||||
| 7 | # woord (WOORD, Woord, woord, woord's) | |||||
| 8 | package CheckSpelling::UnknownWordSplitter; | |||||
| 9 | ||||||
| 10 | 1 1 | 102463 2 | use 5.022; | |||
| 11 | 1 1 1 | 2 1 41 | use feature 'unicode_strings'; | |||
| 12 | 1 1 1 | 2 1 6 | use strict; | |||
| 13 | 1 1 1 | 1 1 20 | use warnings; | |||
| 14 | 1 1 1 | 1 1 14 | no warnings qw(experimental::vlb); | |||
| 15 | 1 1 1 | 1 1 1 | use utf8; | |||
| 16 | 1 1 1 | 13 0 25 | use Encode qw/decode_utf8 encode FB_DEFAULT/; | |||
| 17 | 1 1 1 | 1 0 26 | use File::Basename; | |||
| 18 | 1 1 1 | 1 1 15 | use Cwd 'abs_path'; | |||
| 19 | 1 1 1 | 1 1 19 | use File::Spec; | |||
| 20 | 1 1 1 | 1 1 13 | use File::Temp qw/ tempfile tempdir /; | |||
| 21 | 1 1 1 | 296 1 3776 | use CheckSpelling::Util; | |||
| 22 | our $VERSION='0.1.0'; | |||||
| 23 | ||||||
| 24 | my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names); | |||||
| 25 | my $begin_block_re = ''; | |||||
| 26 | my @begin_block_list = (); | |||||
| 27 | my @end_block_list = (); | |||||
| 28 | my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern); | |||||
| 29 | my ($shortest, $longest) = (255, 0); | |||||
| 30 | my @forbidden_re_list; | |||||
| 31 | my %forbidden_re_descriptions; | |||||
| 32 | my @candidates_re_list; | |||||
| 33 | my $hunspell_dictionary_path; | |||||
| 34 | my @hunspell_dictionaries; | |||||
| 35 | my %dictionary = (); | |||||
| 36 | my $base_dict; | |||||
| 37 | my %unique; | |||||
| 38 | my %unique_unrecognized; | |||||
| 39 | my ($last_file, $words, $unrecognized) = ('', 0, 0); | |||||
| 40 | my ($ignore_next_line_pattern); | |||||
| 41 | my ($check_images); | |||||
| 42 | ||||||
| 43 | my $disable_flags; | |||||
| 44 | ||||||
| 45 | sub test_re { | |||||
| 46 | 32 | 22 | my ($expression) = @_; | |||
| 47 | 32 32 | 14 202 | return eval { qr /$expression/ }; | |||
| 48 | } | |||||
| 49 | ||||||
| 50 | sub quote_re { | |||||
| 51 | 34 | 26 | my ($expression) = @_; | |||
| 52 | 34 | 28 | return $expression if $expression =~ /\?\{/; | |||
| 53 | 34 | 66 | $expression =~ s/ | |||
| 54 | \G | |||||
| 55 | ( | |||||
| 56 | (?:[^\\]|\\[^Q])* | |||||
| 57 | ) | |||||
| 58 | (?: | |||||
| 59 | \\Q | |||||
| 60 | (?:[^\\]|\\[^E])* | |||||
| 61 | (?:\\E)? | |||||
| 62 | )? | |||||
| 63 | / | |||||
| 64 | 68 | 110 | $1 . (defined($2) ? quotemeta($2) : '') | |||
| 65 | /xge; | |||||
| 66 | 34 | 42 | return $expression; | |||
| 67 | } | |||||
| 68 | ||||||
| 69 | sub file_to_lists { | |||||
| 70 | 6 | 4 | my ($re) = @_; | |||
| 71 | 6 | 6 | my @patterns; | |||
| 72 | my %hints; | |||||
| 73 | 6 | 0 | my $fh; | |||
| 74 | 6 | 59 | if (open($fh, '<:utf8', $re)) { | |||
| 75 | 6 | 10 | local $/=undef; | |||
| 76 | 6 | 41 | my $file=<$fh>; | |||
| 77 | 6 | 16 | close $fh; | |||
| 78 | 6 | 5 | my $line_number = 0; | |||
| 79 | 6 | 4 | my $hint = ''; | |||
| 80 | 6 | 22 | for (split /\R/, $file) { | |||
| 81 | 32 | 15 | ++$line_number; | |||
| 82 | 32 | 16 | chomp; | |||
| 83 | 32 | 35 | if (/^#(?:\s(.+)|)/) { | |||
| 84 | 12 | 29 | $hint = $1 if ($hint eq '' && defined $1); | |||
| 85 | 12 | 9 | next; | |||
| 86 | } | |||||
| 87 | 20 | 19 | $hint = '' unless $_ ne ''; | |||
| 88 | 20 | 19 | next if $_ eq '$^'; | |||
| 89 | 20 | 15 | my $pattern = $_; | |||
| 90 | 20 | 48 | next unless s/^(.+)/(?:$1)/; | |||
| 91 | 13 | 11 | my $quoted = quote_re($1); | |||
| 92 | 13 | 12 | unless (test_re $quoted) { | |||
| 93 | 1 | 1 | my $error = $@; | |||
| 94 | 1 | 48 | my $home = dirname(__FILE__); | |||
| 95 | 1 | 18 | $error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/; | |||
| 96 | 1 | 11 | print STDERR $error; | |||
| 97 | 1 | 2 | $_ = '(?:\$^ - skipped because bad-regex)'; | |||
| 98 | 1 | 1 | $hint = ''; | |||
| 99 | } | |||||
| 100 | 13 | 19 | if (defined $hints{$_}) { | |||
| 101 | 1 | 2 | my $pattern_length = length $pattern; | |||
| 102 | 1 | 1 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern); | |||
| 103 | 1 | 16 | print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n"; | |||
| 104 | 1 | 2 | $_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)'; | |||
| 105 | } else { | |||||
| 106 | 12 | 12 | push @patterns, $_; | |||
| 107 | 12 | 13 | $hints{$_} = $hint; | |||
| 108 | } | |||||
| 109 | 13 | 21 | $hint = ''; | |||
| 110 | } | |||||
| 111 | } | |||||
| 112 | ||||||
| 113 | return { | |||||
| 114 | 6 | 21 | patterns => \@patterns, | |||
| 115 | hints => \%hints, | |||||
| 116 | }; | |||||
| 117 | } | |||||
| 118 | ||||||
| 119 | sub file_to_list { | |||||
| 120 | 5 | 1256 | my ($re) = @_; | |||
| 121 | 5 | 7 | my $lists = file_to_lists($re); | |||
| 122 | ||||||
| 123 | 5 5 | 4 16 | return @{$lists->{'patterns'}}; | |||
| 124 | } | |||||
| 125 | ||||||
| 126 | sub list_to_re { | |||||
| 127 | 5 | 6 | my (@list) = @_; | |||
| 128 | 5 11 11 | 3 6 8 | @list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list; | |||
| 129 | 5 11 | 5 11 | @list = grep { $_ ne '' } @list; | |||
| 130 | 5 | 6 | return '$^' unless scalar @list; | |||
| 131 | 5 | 12 | return join "|", (@list); | |||
| 132 | } | |||||
| 133 | ||||||
| 134 | sub not_empty { | |||||
| 135 | 106 | 85 | my ($thing) = @_; | |||
| 136 | 106 | 459 | return defined $thing && $thing ne '' && $thing =~ /^\d+$/; | |||
| 137 | } | |||||
| 138 | ||||||
| 139 | sub parse_block_list { | |||||
| 140 | 3 | 2 | my ($re) = @_; | |||
| 141 | 3 | 1 | my @file; | |||
| 142 | 3 | 27 | return @file unless (open(FILE, '<:utf8', $re)); | |||
| 143 | ||||||
| 144 | 3 | 6 | local $/=undef; | |||
| 145 | 3 | 23 | my $file=<FILE>; | |||
| 146 | 3 | 4 | my $last_line = $.; | |||
| 147 | 3 | 8 | close FILE; | |||
| 148 | 3 | 9 | for (split /\R/, $file) { | |||
| 149 | 8 | 10 | next if /^#/; | |||
| 150 | 5 | 3 | chomp; | |||
| 151 | 5 | 4 | s/^\\#/#/; | |||
| 152 | 5 | 6 | next unless /^./; | |||
| 153 | 5 | 6 | push @file, $_; | |||
| 154 | } | |||||
| 155 | ||||||
| 156 | 3 | 3 | my $pairs = (0+@file) / 2; | |||
| 157 | 3 | 2 | my $true_pairs = $pairs | 0; | |||
| 158 | 3 | 3 | unless ($pairs == $true_pairs) { | |||
| 159 | 1 | 1 | my $early_warnings = CheckSpelling::Util::get_file_from_env('early_warnings', '/dev/null'); | |||
| 160 | 1 1 1 1 | 2 1 1 26 | open EARLY_WARNINGS, ">>:encoding(UTF-8)", $early_warnings; | |||
| 161 | 1 | 413 | print EARLY_WARNINGS "$re:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n"; | |||
| 162 | 1 | 23 | close EARLY_WARNINGS; | |||
| 163 | 1 | 1 | my $i = 0; | |||
| 164 | 1 | 2 | while ($i < $true_pairs) { | |||
| 165 | 0 | 0 | print STDERR "block-delimiter $i S: $file[$i*2]\n"; | |||
| 166 | 0 | 0 | print STDERR "block-delimiter $i E: $file[$i*2+1]\n"; | |||
| 167 | 0 | 0 | $i++; | |||
| 168 | } | |||||
| 169 | 1 | 10 | print STDERR "block-delimiter unmatched S: `$file[$i*2]`\n"; | |||
| 170 | 1 | 2 | @file = (); | |||
| 171 | } | |||||
| 172 | ||||||
| 173 | 3 | 9 | return @file; | |||
| 174 | } | |||||
| 175 | ||||||
| 176 | sub valid_word { | |||||
| 177 | # shortest_word is an absolute | |||||
| 178 | 28 | 16 | our ($shortest, $longest, $shortest_word, $longest_word); | |||
| 179 | 28 | 26 | $shortest = $shortest_word if $shortest_word; | |||
| 180 | 28 | 22 | if ($longest_word) { | |||
| 181 | # longest_word is an absolute | |||||
| 182 | 26 | 24 | $longest = $longest_word; | |||
| 183 | } elsif (not_empty($longest)) { | |||||
| 184 | # we allow for some sloppiness (a couple of stuck keys per word) | |||||
| 185 | # it's possible that this should scale with word length | |||||
| 186 | 1 | 0 | $longest += 2; | |||
| 187 | } | |||||
| 188 | 28 | 18 | our ($upper_pattern, $lower_pattern, $punctuation_pattern); | |||
| 189 | 28 84 | 21 162 | my $word_pattern = join '|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern)); | |||
| 190 | 28 | 27 | $word_pattern = q<\\w|'> unless $word_pattern; | |||
| 191 | 28 | 53 | if ((defined $shortest && not_empty($longest)) && | |||
| 192 | ($shortest > $longest)) { | |||||
| 193 | 0 | 0 | $word_pattern = "(?:$word_pattern){3}"; | |||
| 194 | 0 | 0 | return qr/$word_pattern/; | |||
| 195 | } | |||||
| 196 | 28 | 28 | $shortest = 3 unless defined $shortest; | |||
| 197 | 28 | 18 | $longest = '' unless not_empty($longest); | |||
| 198 | 28 | 125 | $word_match = "(?:$word_pattern){$shortest,$longest}"; | |||
| 199 | 28 | 220 | return qr/\b$word_match\b/; | |||
| 200 | } | |||||
| 201 | ||||||
| 202 | sub load_dictionary { | |||||
| 203 | 15 | 2412 | my ($dict) = @_; | |||
| 204 | 15 | 10 | our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary); | |||
| 205 | 15 | 10 | $longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef); | |||
| 206 | 15 | 15 | $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef); | |||
| 207 | 15 | 12 | our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern); | |||
| 208 | 15 | 14 | $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>); | |||
| 209 | 15 | 52 | $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]'); | |||
| 210 | 15 | 37 | $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]'); | |||
| 211 | 15 | 30 | $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]'); | |||
| 212 | 15 | 28 | $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]'); | |||
| 213 | 15 | 33 | $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>); | |||
| 214 | 15 | 35 | %dictionary = (); | |||
| 215 | ||||||
| 216 | 15 | 487 | open(DICT, '<:utf8', $dict); | |||
| 217 | 15 | 67 | while (!eof(DICT)) { | |||
| 218 | 52 | 55 | my $word = <DICT>; | |||
| 219 | 52 | 35 | chomp $word; | |||
| 220 | 52 | 124 | next unless $word =~ $word_match; | |||
| 221 | 49 | 46 | my $l = length $word; | |||
| 222 | 49 | 29 | $longest = -1 unless not_empty($longest); | |||
| 223 | 49 | 50 | $longest = $l if $l > $longest; | |||
| 224 | 49 | 40 | $shortest = $l if $l < $shortest; | |||
| 225 | 49 | 86 | $dictionary{$word}=1; | |||
| 226 | } | |||||
| 227 | 15 | 43 | close DICT; | |||
| 228 | ||||||
| 229 | 15 | 9 | $word_match = valid_word(); | |||
| 230 | } | |||||
| 231 | ||||||
| 232 | sub hunspell_dictionary { | |||||
| 233 | 3 | 3 | my ($dict) = @_; | |||
| 234 | 3 | 4 | my $name = $dict; | |||
| 235 | 3 | 3 | $name =~ s{/src/index/hunspell/index\.dic$}{}; | |||
| 236 | 3 | 14 | $name =~ s{.*/}{}; | |||
| 237 | 3 | 3 | my $aff = $dict; | |||
| 238 | 3 | 3 | my $encoding; | |||
| 239 | 3 | 7 | $aff =~ s/\.dic$/.aff/; | |||
| 240 | 3 | 29 | if (open AFF, '<', $aff) { | |||
| 241 | 3 | 21 | while (<AFF>) { | |||
| 242 | 0 | 0 | next unless /^SET\s+(\S+)/; | |||
| 243 | 0 | 0 | $encoding = $1 if ($1 !~ /utf-8/i); | |||
| 244 | 0 | 0 | last; | |||
| 245 | } | |||||
| 246 | 3 | 9 | close AFF; | |||
| 247 | } | |||||
| 248 | return { | |||||
| 249 | 3 | 329 | name => $name, | |||
| 250 | dict => $dict, | |||||
| 251 | aff => $aff, | |||||
| 252 | encoding => $encoding, | |||||
| 253 | engine => Text::Hunspell->new($aff, $dict), | |||||
| 254 | } | |||||
| 255 | } | |||||
| 256 | ||||||
| 257 | sub init { | |||||
| 258 | 12 | 17003 | my ($configuration) = @_; | |||
| 259 | 12 | 14 | our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re); | |||
| 260 | 12 | 4 | our ($begin_block_re, @begin_block_list, @end_block_list); | |||
| 261 | 12 | 26 | our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', ''); | |||
| 262 | 12 | 11 | our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', ''); | |||
| 263 | 12 | 17 | our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30); | |||
| 264 | 12 | 8 | our %forbidden_re_descriptions; | |||
| 265 | 12 | 13 | if ($hunspell_dictionary_path) { | |||
| 266 | 3 | 34 | our @hunspell_dictionaries = (); | |||
| 267 | 1 1 1 1 1 1 1 1 1 3 | 261 1039 19 3 1 13 7 1 19 151 | if (eval 'use Text::Hunspell; 1') { | |||
| 268 | 3 | 139 | my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic"); | |||
| 269 | 3 | 5 | for my $hunspell_dictionary_file (@hunspell_dictionaries_list) { | |||
| 270 | 3 | 7 | push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file); | |||
| 271 | } | |||||
| 272 | } else { | |||||
| 273 | 0 | 0 | print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n"; | |||
| 274 | } | |||||
| 275 | } | |||||
| 276 | ||||||
| 277 | 12 | 80 | if (-e "$configuration/block-delimiters.list") { | |||
| 278 | 3 | 5 | my @block_delimiters = parse_block_list "$configuration/block-delimiters.list"; | |||
| 279 | 3 | 3 | if (@block_delimiters) { | |||
| 280 | 2 | 2 | @begin_block_list = (); | |||
| 281 | 2 | 1 | @end_block_list = (); | |||
| 282 | ||||||
| 283 | 2 | 2 | while (@block_delimiters) { | |||
| 284 | 2 | 4 | my ($begin, $end) = splice @block_delimiters, 0, 2; | |||
| 285 | 2 | 2 | push @begin_block_list, $begin; | |||
| 286 | 2 | 2 | push @end_block_list, $end; | |||
| 287 | } | |||||
| 288 | ||||||
| 289 | 2 2 | 2 2 | $begin_block_re = join '|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list); | |||
| 290 | } | |||||
| 291 | } | |||||
| 292 | ||||||
| 293 | 12 | 10 | my (@patterns_re_list, %in_patterns_re_list); | |||
| 294 | 12 | 50 | if (-e "$configuration/patterns.txt") { | |||
| 295 | 0 | 0 | @patterns_re_list = file_to_list "$configuration/patterns.txt"; | |||
| 296 | 0 | 0 | $patterns_re = list_to_re @patterns_re_list; | |||
| 297 | 0 0 | 0 0 | %in_patterns_re_list = map {$_ => 1} @patterns_re_list; | |||
| 298 | } else { | |||||
| 299 | 12 | 12 | $patterns_re = undef; | |||
| 300 | } | |||||
| 301 | ||||||
| 302 | 12 | 41 | if (-e "$configuration/forbidden.txt") { | |||
| 303 | 1 | 1 | my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt"; | |||
| 304 | 1 1 | 1 1 | @forbidden_re_list = @{$forbidden_re_info->{'patterns'}}; | |||
| 305 | 1 1 | 1 2 | %forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}}; | |||
| 306 | 1 | 1 | $forbidden_re = list_to_re @forbidden_re_list; | |||
| 307 | } else { | |||||
| 308 | 11 | 10 | $forbidden_re = undef; | |||
| 309 | } | |||||
| 310 | ||||||
| 311 | 12 | 51 | if (-e "$configuration/candidates.txt") { | |||
| 312 | 4 | 5 | @candidates_re_list = file_to_list "$configuration/candidates.txt"; | |||
| 313 | 4 8 8 | 4 4 13 | @candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} || !test_re($quoted) ? '' : $quoted } @candidates_re_list; | |||
| 314 | 4 | 4 | $candidates_re = list_to_re @candidates_re_list; | |||
| 315 | } else { | |||||
| 316 | 8 | 8 | $candidates_re = undef; | |||
| 317 | } | |||||
| 318 | ||||||
| 319 | 12 | 18 | our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024); | |||
| 320 | ||||||
| 321 | 12 | 10 | my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', ''); | |||
| 322 | 12 | 14 | our $disable_word_collating = $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/; | |||
| 323 | 12 | 4 | our $disable_minified_file = $disable_flags =~ /(?:^|,|\s)minified-file(?:,|\s|$)/; | |||
| 324 | 12 | 9 | our $disable_single_line_file = $disable_flags =~ /(?:^|,|\s)single-line-file(?:,|\s|$)/; | |||
| 325 | ||||||
| 326 | 12 | 5 | our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', ''); | |||
| 327 | 12 | 9 | $ignore_next_line_pattern =~ s/\s+/|/g; | |||
| 328 | ||||||
| 329 | 12 | 10 | our $check_images = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_IMAGES', ''); | |||
| 330 | 12 | 9 | $check_images = $check_images =~ /^(?:1|true)$/i; | |||
| 331 | ||||||
| 332 | 12 | 10 | our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', ''); | |||
| 333 | ||||||
| 334 | 12 | 6 | our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', ''); | |||
| 335 | ||||||
| 336 | 12 | 13 | $word_match = valid_word(); | |||
| 337 | ||||||
| 338 | 12 | 21 | our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words"); | |||
| 339 | 12 | 77 | $base_dict = '/usr/share/dict/words' unless -e $base_dict; | |||
| 340 | 12 | 13 | load_dictionary($base_dict); | |||
| 341 | } | |||||
| 342 | ||||||
| 343 | sub split_line { | |||||
| 344 | 1160 | 433 | our (%dictionary, $word_match, $disable_word_collating); | |||
| 345 | 1160 | 429 | our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern); | |||
| 346 | 1160 | 497 | our @hunspell_dictionaries; | |||
| 347 | 1160 | 362 | our $shortest; | |||
| 348 | 1160 | 780 | my $shortest_threshold = $shortest + 2; | |||
| 349 | 1160 | 554 | my $pattern = '.'; | |||
| 350 | # $pattern = "(?:$upper_pattern){$shortest,}|$upper_pattern(?:$lower_pattern){2,}\n"; | |||||
| 351 | ||||||
| 352 | # https://www.fileformat.info/info/unicode/char/2019/ | |||||
| 353 | 1160 | 664 | my $rsqm = "\xE2\x80\x99"; | |||
| 354 | ||||||
| 355 | 1160 | 635 | my ($words, $unrecognized) = (0, 0); | |||
| 356 | 1160 | 742 | my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_; | |||
| 357 | 1160 | 5370 | $line =~ s/(?:$rsqm|'|'|\%27|’|’|’|\\u2019|\x{2019}|')+/'/g; | |||
| 358 | 1160 | 2148 | $line =~ s/(?:$ignore_pattern)+/ /g; | |||
| 359 | 1160 | 1624 | while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {} | |||
| 360 | 1160 | 3274 | while ($line =~ s/((?:$lower_pattern|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {} | |||
| 361 | 1160 | 1351 | for my $token (split /\s+/, $line) { | |||
| 362 | 3642 | 3129 | next unless $token =~ /$pattern/; | |||
| 363 | 2483 | 1960 | $token =~ s/^(?:'|$rsqm)+//g; | |||
| 364 | 2483 | 2455 | $token =~ s/(?:'|$rsqm)+s?$//g; | |||
| 365 | 2483 | 1411 | my $raw_token = $token; | |||
| 366 | 2483 | 1407 | $token =~ s/^[^Ii]?'+(.*)/$1/; | |||
| 367 | 2483 | 1257 | $token =~ s/(.*?)'+$/$1/; | |||
| 368 | 2483 | 3617 | next unless $token =~ $word_match; | |||
| 369 | 2318 | 2056 | if (defined $dictionary{$token}) { | |||
| 370 | 1038 | 424 | ++$words; | |||
| 371 | 1038 | 584 | $unique_ref->{$token}=1; | |||
| 372 | 1038 | 833 | next; | |||
| 373 | } | |||||
| 374 | 1280 | 909 | if (@hunspell_dictionaries) { | |||
| 375 | 1254 | 607 | my $found = 0; | |||
| 376 | 1254 | 708 | for my $hunspell_dictionary (@hunspell_dictionaries) { | |||
| 377 | my $token_encoded = defined $hunspell_dictionary->{'encoding'} ? | |||||
| 378 | 1254 | 1051 | encode($hunspell_dictionary->{'encoding'}, $token) : $token; | |||
| 379 | 1254 | 2803 | next unless ($hunspell_dictionary->{'engine'}->check($token_encoded)); | |||
| 380 | 0 | 0 | ++$words; | |||
| 381 | 0 | 0 | $dictionary{$token} = 1; | |||
| 382 | 0 | 0 | $unique_ref->{$token}=1; | |||
| 383 | 0 | 0 | $found = 1; | |||
| 384 | 0 | 0 | last; | |||
| 385 | } | |||||
| 386 | 1254 | 926 | next if $found; | |||
| 387 | } | |||||
| 388 | 1280 | 881 | my $key = lc $token; | |||
| 389 | 1280 | 1054 | if (defined $dictionary{$key}) { | |||
| 390 | 6 | 1 | ++$words; | |||
| 391 | 6 | 6 | $unique_ref->{$key}=1; | |||
| 392 | 6 | 6 | next; | |||
| 393 | } | |||||
| 394 | 1274 | 862 | unless ($disable_word_collating) { | |||
| 395 | 1274 | 716 | $key =~ s/''+/'/g; | |||
| 396 | 1274 | 1260 | $key =~ s/'[sd]$// unless length $key >= $shortest_threshold; | |||
| 397 | } | |||||
| 398 | 1274 | 994 | if (defined $dictionary{$key}) { | |||
| 399 | 0 | 0 | ++$words; | |||
| 400 | 0 | 0 | $unique_ref->{$key}=1; | |||
| 401 | 0 | 0 | next; | |||
| 402 | } | |||||
| 403 | 1274 | 598 | ++$unrecognized; | |||
| 404 | 1274 | 724 | $unique_unrecognized_ref->{$raw_token}=1; | |||
| 405 | 1274 | 1561 | $unrecognized_line_items_ref->{$raw_token}=1; | |||
| 406 | } | |||||
| 407 | 1160 | 1590 | return ($words, $unrecognized); | |||
| 408 | } | |||||
| 409 | ||||||
| 410 | sub skip_file { | |||||
| 411 | 7 | 29 | my ($temp_dir, $reason) = @_; | |||
| 412 | 7 | 181 | open(SKIPPED, '>:utf8', "$temp_dir/skipped"); | |||
| 413 | 7 | 40 | print SKIPPED $reason; | |||
| 414 | 7 | 93 | close SKIPPED; | |||
| 415 | } | |||||
| 416 | ||||||
| 417 | sub maybe_ocr_file { | |||||
| 418 | 0 | 0 | my ($file) = @_; | |||
| 419 | 0 | 0 | my $tesseract = dirname(dirname(dirname(__FILE__)))."/wrappers/run-tesseract"; | |||
| 420 | 0 | 0 | $ENV{'input'} = $file; | |||
| 421 | 0 | 0 | my $text_file = `"$tesseract"`; | |||
| 422 | 0 | 0 | delete $ENV{'input'}; | |||
| 423 | 0 | 0 | return ($file, 0) unless defined $text_file; | |||
| 424 | 0 | 0 | my $file_converted = 0; | |||
| 425 | 0 | 0 | chomp $text_file; | |||
| 426 | 0 | 0 | if ($text_file =~ /./) { | |||
| 427 | 0 | 0 | $file_converted = 1; | |||
| 428 | 0 | 0 | $file = $text_file; | |||
| 429 | } | |||||
| 430 | 0 | 0 | return ($file, $file_converted); | |||
| 431 | } | |||||
| 432 | ||||||
| 433 | sub split_file { | |||||
| 434 | 18 | 12637 | my ($file) = @_; | |||
| 435 | our ( | |||||
| 436 | 18 | 13 | $unrecognized, $shortest, $largest_file, $words, | |||
| 437 | $word_match, %unique, %unique_unrecognized, $forbidden_re, | |||||
| 438 | @forbidden_re_list, $patterns_re, %dictionary, | |||||
| 439 | $begin_block_re, @begin_block_list, @end_block_list, | |||||
| 440 | $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file, | |||||
| 441 | $disable_single_line_file, | |||||
| 442 | $ignore_next_line_pattern, | |||||
| 443 | $sandbox, | |||||
| 444 | $check_images, | |||||
| 445 | ); | |||||
| 446 | 18 | 36 | $ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./; | |||
| 447 | ||||||
| 448 | 18 | 13 | our %forbidden_re_descriptions; | |||
| 449 | 18 | 7 | our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern); | |||
| 450 | ||||||
| 451 | # https://www.fileformat.info/info/unicode/char/2019/ | |||||
| 452 | 18 | 12 | my $rsqm = "\xE2\x80\x99"; | |||
| 453 | ||||||
| 454 | 18 | 21 | my @candidates_re_hits = (0) x scalar @candidates_re_list; | |||
| 455 | 18 | 8 | my @candidates_re_lines = (0) x scalar @candidates_re_list; | |||
| 456 | 18 | 13 | my @forbidden_re_hits = (0) x scalar @forbidden_re_list; | |||
| 457 | 18 | 17 | my @forbidden_re_lines = (0) x scalar @forbidden_re_list; | |||
| 458 | 18 | 33 | my $temp_dir = tempdir(DIR=>$sandbox); | |||
| 459 | 18 | 2658 | print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'}; | |||
| 460 | 18 | 397 | open(NAME, '>', "$temp_dir/name"); | |||
| 461 | 18 | 58 | print NAME $file; | |||
| 462 | 18 | 237 | close NAME; | |||
| 463 | 18 | 191 | if (defined readlink($file) && | |||
| 464 | rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) { | |||||
| 465 | 1 | 4 | skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n"); | |||
| 466 | 1 | 3 | return $temp_dir; | |||
| 467 | } | |||||
| 468 | 17 | 30 | if ($use_magic_file) { | |||
| 469 | 8 | 11108 | if (open(my $file_fh, '-|', | |||
| 470 | '/usr/bin/file', | |||||
| 471 | '-b', | |||||
| 472 | '--mime', | |||||
| 473 | '-e', 'cdf', | |||||
| 474 | '-e', 'compress', | |||||
| 475 | '-e', 'csv', | |||||
| 476 | '-e', 'elf', | |||||
| 477 | '-e', 'json', | |||||
| 478 | '-e', 'tar', | |||||
| 479 | $file)) { | |||||
| 480 | 8 | 28867 | my $file_kind = <$file_fh>; | |||
| 481 | 8 | 4613 | close $file_fh; | |||
| 482 | 8 | 24 | my $file_converted = 0; | |||
| 483 | 8 | 22 | if ($check_images && $file_kind =~ m<^image/>) { | |||
| 484 | 0 | 0 | ($file, $file_converted) = maybe_ocr_file($file); | |||
| 485 | } | |||||
| 486 | 8 | 195 | if ($file_converted == 0 && $file_kind =~ /^(.*?); charset=binary/) { | |||
| 487 | 2 | 35 | skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n"); | |||
| 488 | 2 | 52 | return $temp_dir; | |||
| 489 | } | |||||
| 490 | } | |||||
| 491 | } elsif ($file =~ /\.(?:png|jpe?g|gif)$/) { | |||||
| 492 | 0 | 0 | my $file_converted = 0; | |||
| 493 | 0 | 0 | ($file, $file_converted) = maybe_ocr_file($file); | |||
| 494 | } | |||||
| 495 | 15 | 96 | my $file_size = -s $file; | |||
| 496 | 15 | 12 | if (defined $largest_file) { | |||
| 497 | 15 | 11 | unless ($check_file_names eq $file) { | |||
| 498 | 15 | 12 | if ($file_size > $largest_file) { | |||
| 499 | 1 | 3 | skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n"); | |||
| 500 | 1 | 3 | return $temp_dir; | |||
| 501 | } | |||||
| 502 | } | |||||
| 503 | } | |||||
| 504 | 14 | 119 | open FILE, '<', $file; | |||
| 505 | 14 | 13 | binmode FILE; | |||
| 506 | 14 | 8 | my $head; | |||
| 507 | 14 | 133 | read(FILE, $head, 4096); | |||
| 508 | 14 | 791 | $head =~ s/(?:\r|\n)+$//; | |||
| 509 | 14 | 50 | my $dos_new_lines = () = $head =~ /\r\n/gi; | |||
| 510 | 14 | 31 | my $unix_new_lines = () = $head =~ /\n/gi; | |||
| 511 | 14 | 106 | my $mac_new_lines = () = $head =~ /\r/gi; | |||
| 512 | 14 | 58 | local $/; | |||
| 513 | 14 | 65 | if ($unix_new_lines == 0 && $mac_new_lines == 0) { | |||
| 514 | 3 | 5 | $/ = "\n"; | |||
| 515 | } elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) { | |||||
| 516 | 1 | 5 | $/ = "\r\n"; | |||
| 517 | } elsif ($mac_new_lines > $unix_new_lines) { | |||||
| 518 | 2 | 7 | $/ = "\r"; | |||
| 519 | } else { | |||||
| 520 | 8 | 8 | $/ = "\n"; | |||
| 521 | } | |||||
| 522 | 14 | 36 | seek(FILE, 0, 0); | |||
| 523 | 14 | 19 | ($words, $unrecognized) = (0, 0); | |||
| 524 | 14 | 48 | %unique = (); | |||
| 525 | 14 | 26 | %unique_unrecognized = (); | |||
| 526 | ||||||
| 527 | local $SIG{__WARN__} = sub { | |||||
| 528 | 0 | 0 | my $message = shift; | |||
| 529 | 0 | 0 | $message =~ s/> line/> in $file - line/; | |||
| 530 | 0 | 0 | chomp $message; | |||
| 531 | 0 | 0 | print STDERR "$message\n"; | |||
| 532 | 14 | 125 | }; | |||
| 533 | ||||||
| 534 | 14 | 342 | open(WARNINGS, '>:utf8', "$temp_dir/warnings"); | |||
| 535 | 14 | 7 | our $timeout; | |||
| 536 | 14 | 11 | eval { | |||
| 537 | 14 0 | 98 0 | local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required | |||
| 538 | 14 | 37 | alarm $timeout; | |||
| 539 | ||||||
| 540 | 14 | 14 | my $ignore_next_line = 0; | |||
| 541 | 14 | 26 | my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', ''); | |||
| 542 | 14 | 13 | my $offset = 0; | |||
| 543 | 14 | 98 | LINE: while (<FILE>) { | |||
| 544 | 1169 | 1005 | if ($. == 1) { | |||
| 545 | 14 | 20 | unless ($disable_minified_file) { | |||
| 546 | 14 | 75 | if ($file_size >= 512 && length($_) == $file_size) { | |||
| 547 | 1 | 9 | skip_file($temp_dir, "file only has a single line (single-line-file)\n"); | |||
| 548 | 1 | 4 | last; | |||
| 549 | } | |||||
| 550 | } | |||||
| 551 | } | |||||
| 552 | 1168 | 2337 | $_ = decode_utf8($_, FB_DEFAULT); | |||
| 553 | 1168 | 2679 | if (/[\x{D800}-\x{DFFF}]/) { | |||
| 554 | 0 | 0 | skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n"); | |||
| 555 | 0 | 0 | last; | |||
| 556 | } | |||||
| 557 | 1168 | 1294 | s/\R$//; | |||
| 558 | 1168 | 998 | s/^\x{FEFF}// if $. == 1; | |||
| 559 | 1168 | 1054 | next unless /./; | |||
| 560 | 1167 | 740 | my $raw_line = $_; | |||
| 561 | 1167 | 507 | my $parsed_block_markers; | |||
| 562 | ||||||
| 563 | # hook for custom multiline based text exclusions: | |||||
| 564 | 1167 | 781 | if ($begin_block_re) { | |||
| 565 | 1148 | 500 | FIND_END_MARKER: while (1) { | |||
| 566 | 1150 | 954 | while ($next_end_marker ne '') { | |||
| 567 | 6 | 29 | next LINE unless /\Q$next_end_marker\E/; | |||
| 568 | 1 | 5 | s/.*?\Q$next_end_marker\E//; | |||
| 569 | 1 | 2 | ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', ''); | |||
| 570 | 1 | 1 | $parsed_block_markers = 1; | |||
| 571 | } | |||||
| 572 | 1145 | 1068 | my @captured = (/^.*?$begin_block_re/); | |||
| 573 | 1145 | 998 | last unless (@captured); | |||
| 574 | 2 | 2 | for my $capture (0 .. $#captured) { | |||
| 575 | 2 | 2 | if ($captured[$capture]) { | |||
| 576 | 2 | 5 | ($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1"); | |||
| 577 | 2 | 12 | s/^.*?\Q$begin_block_list[$capture]\E//; | |||
| 578 | 2 | 2 | $parsed_block_markers = 1; | |||
| 579 | 2 | 3 | next FIND_END_MARKER; | |||
| 580 | } | |||||
| 581 | } | |||||
| 582 | } | |||||
| 583 | 1143 | 806 | next if $parsed_block_markers; | |||
| 584 | } | |||||
| 585 | ||||||
| 586 | 1161 | 625 | my $ignore_this_line = $ignore_next_line; | |||
| 587 | 1161 | 860 | $ignore_next_line = ($_ =~ /$ignore_next_line_pattern/); | |||
| 588 | 1161 | 781 | next if $ignore_this_line; | |||
| 589 | ||||||
| 590 | # hook for custom line based text exclusions: | |||||
| 591 | 1160 | 824 | if (defined $patterns_re) { | |||
| 592 | 2 6 | 12 8 | s/($patterns_re)/"="x length($1)/ge; | |||
| 593 | } | |||||
| 594 | 1160 | 587 | my $initial_line_state = $_; | |||
| 595 | 1160 | 708 | my $previous_line_state = $_; | |||
| 596 | 1160 | 612 | my $line_flagged; | |||
| 597 | 1160 | 844 | if ($forbidden_re) { | |||
| 598 | 9 5 | 59 11 | while (s/($forbidden_re)/"="x length($1)/e) { | |||
| 599 | 5 | 3 | $line_flagged = 1; | |||
| 600 | 5 | 9 | my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1); | |||
| 601 | 5 | 4 | my $found_trigger_re; | |||
| 602 | 5 | 6 | for my $i (0 .. $#forbidden_re_list) { | |||
| 603 | 7 | 5 | my $forbidden_re_singleton = $forbidden_re_list[$i]; | |||
| 604 | 7 | 6 | my $test_line = $previous_line_state; | |||
| 605 | 7 4 | 60 6 | if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) { | |||
| 606 | 4 | 5 | next unless $test_line eq $_; | |||
| 607 | 4 | 6 | my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1); | |||
| 608 | 4 | 4 | next unless $begin == $begin_test; | |||
| 609 | 4 | 4 | next unless $end == $end_test; | |||
| 610 | 4 | 3 | next unless $match eq $match_test; | |||
| 611 | 4 | 2 | $found_trigger_re = $forbidden_re_singleton; | |||
| 612 | 4 | 8 | my $hit = "$.:$begin:$end"; | |||
| 613 | 4 | 3 | $forbidden_re_hits[$i]++; | |||
| 614 | 4 | 5 | $forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i]; | |||
| 615 | 4 | 5 | last; | |||
| 616 | } | |||||
| 617 | } | |||||
| 618 | 5 | 5 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($match); | |||
| 619 | 5 | 6 | if ($found_trigger_re) { | |||
| 620 | 4 | 7 | my $description = $forbidden_re_descriptions{$found_trigger_re} || ''; | |||
| 621 | 4 | 10 | $found_trigger_re =~ s/^\(\?:(.*)\)$/$1/; | |||
| 622 | 4 | 4 | my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99); | |||
| 623 | 4 | 5 | if ($description ne '') { | |||
| 624 | 3 | 14 | print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n"; | |||
| 625 | } else { | |||||
| 626 | 1 | 6 | print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n"; | |||
| 627 | } | |||||
| 628 | } else { | |||||
| 629 | 1 | 3 | print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n"; | |||
| 630 | } | |||||
| 631 | 5 | 26 | $previous_line_state = $_; | |||
| 632 | } | |||||
| 633 | 9 | 11 | $_ = $initial_line_state; | |||
| 634 | } | |||||
| 635 | # This is to make it easier to deal w/ rules: | |||||
| 636 | 1160 | 1066 | s/^/ /; | |||
| 637 | 1160 | 623 | my %unrecognized_line_items = (); | |||
| 638 | 1160 | 877 | my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items); | |||
| 639 | 1160 | 597 | $words += $new_words; | |||
| 640 | 1160 | 524 | $unrecognized += $new_unrecognized; | |||
| 641 | 1160 | 675 | my $line_length = length($raw_line); | |||
| 642 | 1160 | 1468 | for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) { | |||
| 643 | 1021 | 474 | my $found_token = 0; | |||
| 644 | 1021 | 453 | my $raw_token = $token; | |||
| 645 | 1021 | 580 | $token =~ s/'/(?:'|\x{2019}|\'|\')+/g; | |||
| 646 | 1021 | 444 | my $before; | |||
| 647 | 1021 | 1516 | if ($token =~ /^$upper_pattern$lower_pattern/) { | |||
| 648 | 5 | 3 | $before = '(?<=.)'; | |||
| 649 | } elsif ($token =~ /^$upper_pattern/) { | |||||
| 650 | 0 | 0 | $before = "(?<!$upper_pattern)"; | |||
| 651 | } else { | |||||
| 652 | 1016 | 570 | $before = "(?<=$not_lower_pattern)"; | |||
| 653 | } | |||||
| 654 | 1021 | 1112 | my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)"; | |||
| 655 | 1021 | 2188 | while ($raw_line =~ /(?:\b|$before)($token)(?:\b|$after)/g) { | |||
| 656 | 1271 | 550 | $line_flagged = 1; | |||
| 657 | 1271 | 644 | $found_token = 1; | |||
| 658 | 1271 | 1665 | my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1); | |||
| 659 | 1271 | 1173 | next unless $match =~ /./; | |||
| 660 | 1271 | 891 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($match); | |||
| 661 | 1271 | 4657 | print WARNINGS ":$.:$begin ... $end: $wrapped\n"; | |||
| 662 | } | |||||
| 663 | 1021 | 1126 | unless ($found_token) { | |||
| 664 | 3 | 29 | if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) { | |||
| 665 | 3 | 5 | my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1); | |||
| 666 | 3 | 2 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token); | |||
| 667 | 3 | 11 | print WARNINGS ":$.:$begin ... $end: $wrapped\n"; | |||
| 668 | } else { | |||||
| 669 | 0 | 0 | my $offset = $line_length + 1; | |||
| 670 | 0 | 0 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token); | |||
| 671 | 0 | 0 | print WARNINGS ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n"; | |||
| 672 | } | |||||
| 673 | } | |||||
| 674 | } | |||||
| 675 | 1160 | 1805 | if ($line_flagged && $candidates_re) { | |||
| 676 | 2 | 3 | $_ = $previous_line_state = $initial_line_state; | |||
| 677 | 2 2 | 21 4 | s/($candidates_re)/"="x length($1)/ge; | |||
| 678 | 2 | 3 | if ($_ ne $initial_line_state) { | |||
| 679 | 2 | 1 | $_ = $previous_line_state; | |||
| 680 | 2 | 2 | for my $i (0 .. $#candidates_re_list) { | |||
| 681 | 4 | 4 | my $candidate_re = $candidates_re_list[$i]; | |||
| 682 | 4 | 30 | next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/; | |||
| 683 | 2 2 | 8 4 | if (($_ =~ s/($candidate_re)/"="x length($1)/e)) { | |||
| 684 | 2 | 3 | my ($begin, $end) = ($-[0] + 1, $+[0] + 1); | |||
| 685 | 2 | 4 | my $hit = "$.:$begin:$end"; | |||
| 686 | 2 | 2 | $_ = $previous_line_state; | |||
| 687 | 2 2 | 6 3 | my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge); | |||
| 688 | 2 | 4 | $candidates_re_hits[$i] += $replacements; | |||
| 689 | 2 | 3 | $candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i]; | |||
| 690 | 2 | 4 | $_ = $previous_line_state; | |||
| 691 | } | |||||
| 692 | } | |||||
| 693 | } | |||||
| 694 | } | |||||
| 695 | 1160 | 806 | unless ($disable_minified_file) { | |||
| 696 | 1160 | 835 | s/={3,}//g; | |||
| 697 | 1160 | 823 | $offset += length; | |||
| 698 | 1160 | 1087 | my $ratio = int($offset / $.); | |||
| 699 | 1160 | 576 | my $ratio_threshold = 1000; | |||
| 700 | 1160 | 3158 | if ($ratio > $ratio_threshold) { | |||
| 701 | 2 | 6 | skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n"); | |||
| 702 | 2 | 7 | last; | |||
| 703 | } | |||||
| 704 | } | |||||
| 705 | } | |||||
| 706 | 14 | 15 | if ($next_end_marker) { | |||
| 707 | 1 | 2 | if ($start_marker_line) { | |||
| 708 | 1 | 1 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($current_begin_marker); | |||
| 709 | 1 | 8 | print WARNINGS ":$start_marker_line, Warning - Failed to find matching end marker for $wrapped (unclosed-block-ignore-begin)\n"; | |||
| 710 | } | |||||
| 711 | 1 | 2 | my $wrapped = CheckSpelling::Util::wrap_in_backticks($next_end_marker); | |||
| 712 | 1 | 3 | print WARNINGS ":$.:1 ... 1, Warning - Expected to find end block marker $wrapped (unclosed-block-ignore-end)\n"; | |||
| 713 | } | |||||
| 714 | ||||||
| 715 | 14 | 100 | alarm 0; | |||
| 716 | }; | |||||
| 717 | 14 | 20 | if ($@) { | |||
| 718 | 0 | 0 | die unless $@ eq "alarm\n"; | |||
| 719 | 0 | 0 | print WARNINGS ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n"; | |||
| 720 | 0 | 0 | skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n"); | |||
| 721 | 0 | 0 | last; | |||
| 722 | } | |||||
| 723 | ||||||
| 724 | 14 | 43 | close FILE; | |||
| 725 | 14 | 162 | close WARNINGS; | |||
| 726 | ||||||
| 727 | 14 | 29 | if ($unrecognized || @candidates_re_hits || @forbidden_re_hits) { | |||
| 728 | 13 | 307 | open(STATS, '>:utf8', "$temp_dir/stats"); | |||
| 729 | 13 | 186 | print STATS "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized). | |||
| 730 | ", unique: ".(keys %unique). | |||||
| 731 | (@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : ""). | |||||
| 732 | (@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : ""). | |||||
| 733 | (@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : ""). | |||||
| 734 | (@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : ""). | |||||
| 735 | "}"; | |||||
| 736 | 13 | 184 | close STATS; | |||
| 737 | 13 | 246 | open(UNKNOWN, '>:utf8', "$temp_dir/unknown"); | |||
| 738 | 13 20 | 45 33 | print UNKNOWN map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized; | |||
| 739 | 13 | 121 | close UNKNOWN; | |||
| 740 | } | |||||
| 741 | ||||||
| 742 | 14 | 121 | return $temp_dir; | |||
| 743 | } | |||||
| 744 | ||||||
| 745 | sub main { | |||||
| 746 | 4 | 393 | my ($configuration, @ARGV) = @_; | |||
| 747 | 4 | 1 | our %dictionary; | |||
| 748 | 4 | 4 | unless (%dictionary) { | |||
| 749 | 1 | 9 | init($configuration); | |||
| 750 | } | |||||
| 751 | ||||||
| 752 | # read all input | |||||
| 753 | 4 | 4 | my @reports; | |||
| 754 | ||||||
| 755 | 4 | 4 | for my $file (@ARGV) { | |||
| 756 | 4 | 2 | my $temp_dir = split_file($file); | |||
| 757 | 4 | 4 | push @reports, "$temp_dir\n"; | |||
| 758 | } | |||||
| 759 | 4 | 10 | print join '', @reports; | |||
| 760 | } | |||||
| 761 | ||||||
| 762 | 1; | |||||