File Coverage

File:lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:88.1%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3# ~/bin/w
4# Search for potentially misspelled words
5# Output is:
6# misspellled
7# woord (WOORD, Woord, woord, woord's)
8package CheckSpelling::UnknownWordSplitter;
9
10
1
1
107436
1
use 5.022;
11
1
1
1
1
3
40
use feature 'unicode_strings';
12
1
1
1
3
0
9
use strict;
13
1
1
1
2
0
17
use warnings;
14
1
1
1
1
1
14
no warnings qw(experimental::vlb);
15
1
1
1
1
1
1
use utf8;
16
1
1
1
12
0
30
use Encode qw/decode_utf8 encode FB_DEFAULT/;
17
1
1
1
2
1
25
use File::Basename;
18
1
1
1
1
1
12
use Cwd 'abs_path';
19
1
1
1
1
1
19
use File::Spec;
20
1
1
1
1
1
23
use File::Temp qw/ tempfile tempdir /;
21
1
1
1
283
1
3256
use CheckSpelling::Util;
22our $VERSION='0.1.0';
23
24my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
25my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
26my ($shortest, $longest) = (255, 0);
27my @forbidden_re_list;
28my %forbidden_re_descriptions;
29my @candidates_re_list;
30my $hunspell_dictionary_path;
31my @hunspell_dictionaries;
32my %dictionary = ();
33my $base_dict;
34my %unique;
35my %unique_unrecognized;
36my ($last_file, $words, $unrecognized) = ('', 0, 0);
37my ($ignore_next_line_pattern);
38
39my $disable_flags;
40
41sub test_re {
42
14
9
  my ($expression) = @_;
43
14
14
5
100
  return eval { qr /$expression/ };
44}
45
46sub quote_re {
47
14
8
  my ($expression) = @_;
48
14
14
  return $expression if $expression =~ /\?\{/;
49
14
31
  $expression =~ s/
50   \G
51   (
52      (?:[^\\]|\\[^Q])*
53   )
54   (?:
55      \\Q
56      (?:[^\\]|\\[^E])*
57      (?:\\E)?
58   )?
59/
60
28
46
   $1 . (defined($2) ? quotemeta($2) : '')
61/xge;
62
14
16
  return $expression;
63}
64
65sub file_to_lists {
66
3
2
  my ($re) = @_;
67
3
4
  my @patterns;
68  my %hints;
69
3
0
  my $fh;
70
3
27
  if (open($fh, '<:utf8', $re)) {
71
3
6
    local $/=undef;
72
3
15
    my $file=<$fh>;
73
3
10
    close $fh;
74
3
2
    my $line_number = 0;
75
3
2
    my $hint = '';
76
3
13
    for (split /\R/, $file) {
77
17
8
      ++$line_number;
78
17
7
      chomp;
79
17
21
      if (/^#(?:\s(.+)|)/) {
80
6
14
        $hint = $1 if ($hint eq '' && defined $1);
81
6
4
        next;
82      }
83
11
10
      $hint = '' unless $_ ne '';
84
11
8
      my $pattern = $_;
85
11
29
      next unless s/^(.+)/(?:$1)/;
86
7
7
      my $quoted = quote_re($1);
87
7
9
      unless (test_re $quoted) {
88
1
2
        my $error = $@;
89
1
44
        my $home = dirname(__FILE__);
90
1
19
        $error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
91
1
10
        print STDERR $error;
92
1
1
        $_ = '(?:\$^ - skipped because bad-regex)';
93
1
1
        $hint = '';
94      }
95
7
12
      if (defined $hints{$_}) {
96
1
1
        my $pattern_length = length $pattern;
97
1
2
        my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
98
1
13
        print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
99
1
2
        $_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
100      } else {
101
6
7
        push @patterns, $_;
102
6
8
        $hints{$_} = $hint;
103      }
104
7
10
      $hint = '';
105    }
106  }
107
108  return {
109
3
11
    patterns => \@patterns,
110    hints => \%hints,
111  };
112}
113
114sub file_to_list {
115
2
1151
  my ($re) = @_;
116
2
3
  my $lists = file_to_lists($re);
117
118
2
2
1
5
  return @{$lists->{'patterns'}};
119}
120
121sub list_to_re {
122
2
2
  my (@list) = @_;
123
2
5
5
2
4
5
  @list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
124
2
5
2
5
  @list = grep { $_ ne '' } @list;
125
2
2
  return '$^' unless scalar @list;
126
2
6
  return join "|", (@list);
127}
128
129sub not_empty {
130
51
40
  my ($thing) = @_;
131
51
154
  return defined $thing && $thing ne ''
132}
133
134sub valid_word {
135  # shortest_word is an absolute
136
22
9
  our ($shortest, $longest, $shortest_word, $longest_word);
137
22
21
  $shortest = $shortest_word if $shortest_word;
138
22
20
  if ($longest_word) {
139    # longest_word is an absolute
140
20
15
    $longest = $longest_word;
141  } elsif (not_empty($longest)) {
142    # we allow for some sloppiness (a couple of stuck keys per word)
143    # it's possible that this should scale with word length
144
1
1
    $longest += 2;
145  }
146
22
18
  our ($upper_pattern, $lower_pattern, $punctuation_pattern);
147
22
66
15
147
  my $word_pattern = join '|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
148
22
18
  $word_pattern = q<\\w|'> unless $word_pattern;
149
22
34
  if ((defined $shortest && not_empty($longest)) &&
150      ($shortest > $longest)) {
151
0
0
    $word_pattern = "(?:$word_pattern){3}";
152
0
0
    return qr/$word_pattern/;
153  }
154
22
25
  $shortest = 3 unless defined $shortest;
155
22
21
  $longest = '' unless defined $longest;
156
22
73
  $word_match = "(?:$word_pattern){$shortest,$longest}";
157
22
197
  return qr/\b$word_match\b/;
158}
159
160sub load_dictionary {
161
12
1945
  my ($dict) = @_;
162
12
6
  our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
163
12
9
  $longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
164
12
8
  $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef);
165
12
5
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
166
12
10
  $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
167
12
53
  $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
168
12
26
  $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
169
12
19
  $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
170
12
19
  $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
171
12
22
  $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
172
12
25
  %dictionary = ();
173
174
12
351
  open(DICT, '<:utf8', $dict);
175
12
45
  while (!eof(DICT)) {
176
31
60
    my $word = <DICT>;
177
31
21
    chomp $word;
178
31
83
    next unless $word =~ $word_match;
179
28
29
    my $l = length $word;
180
28
19
    $longest = -1 unless not_empty($longest);
181
28
29
    $longest = $l if $l > $longest;
182
28
30
    $shortest = $l if $l < $shortest;
183
28
76
    $dictionary{$word}=1;
184  }
185
12
43
  close DICT;
186
187
12
10
  $word_match = valid_word();
188}
189
190sub hunspell_dictionary {
191
3
3
  my ($dict) = @_;
192
3
4
  my $name = $dict;
193
3
3
  $name =~ s{/src/index/hunspell/index\.dic$}{};
194
3
12
  $name =~ s{.*/}{};
195
3
3
  my $aff = $dict;
196
3
4
  my $encoding;
197
3
7
  $aff =~ s/\.dic$/.aff/;
198
3
28
  if (open AFF, '<', $aff) {
199
3
19
    while (<AFF>) {
200
0
0
      next unless /^SET\s+(\S+)/;
201
0
0
      $encoding = $1 if ($1 !~ /utf-8/i);
202
0
0
      last;
203    }
204
3
7
    close AFF;
205  }
206  return {
207
3
279
    name => $name,
208    dict => $dict,
209    aff => $aff,
210    encoding => $encoding,
211    engine => Text::Hunspell->new($aff, $dict),
212  }
213}
214
215sub init {
216
9
10623
  my ($configuration) = @_;
217
9
11
  our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
218
9
18
  our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
219
9
8
  our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
220
9
14
  our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
221
9
4
  our %forbidden_re_descriptions;
222
9
10
  if ($hunspell_dictionary_path) {
223
3
24
    our @hunspell_dictionaries = ();
224
1
1
1
1
1
1
1
1
1
3
171
1050
19
4
1
12
6
1
15
144
    if (eval 'use Text::Hunspell; 1') {
225
3
104
      my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
226
3
6
      for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
227
3
5
        push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
228      }
229    } else {
230
0
0
      print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
231    }
232  }
233
9
9
  my (@patterns_re_list, %in_patterns_re_list);
234
9
56
  if (-e "$configuration/patterns.txt") {
235
0
0
    @patterns_re_list = file_to_list "$configuration/patterns.txt";
236
0
0
    $patterns_re = list_to_re @patterns_re_list;
237
0
0
0
0
    %in_patterns_re_list = map {$_ => 1} @patterns_re_list;
238  } else {
239
9
6
    $patterns_re = undef;
240  }
241
242
9
35
  if (-e "$configuration/forbidden.txt") {
243
1
2
    my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
244
1
1
1
1
    @forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
245
1
1
1
3
    %forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
246
1
1
    $forbidden_re = list_to_re @forbidden_re_list;
247  } else {
248
8
11
    $forbidden_re = undef;
249  }
250
251
9
41
  if (-e "$configuration/candidates.txt") {
252
1
1
    @candidates_re_list = file_to_list "$configuration/candidates.txt";
253
1
2
2
2
2
4
    @candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} || !test_re($quoted) ? '' : $quoted } @candidates_re_list;
254
1
1
    $candidates_re = list_to_re @candidates_re_list;
255  } else {
256
8
5
    $candidates_re = undef;
257  }
258
259
9
10
  our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
260
261
9
6
  my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
262
9
9
  our $disable_word_collating = $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/;
263
9
5
  our $disable_minified_file = $disable_flags =~ /(?:^|,|\s)minified-file(?:,|\s|$)/;
264
9
8
  our $disable_single_line_file = $disable_flags =~ /(?:^|,|\s)single-line-file(?:,|\s|$)/;
265
266
9
7
  our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
267
9
7
  $ignore_next_line_pattern =~ s/\s+/|/g;
268
269
9
8
  our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
270
271
9
6
  our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
272
273
9
6
  $word_match = valid_word();
274
275
9
18
  our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
276
9
43
  $base_dict = '/usr/share/dict/words' unless -e $base_dict;
277
9
8
  load_dictionary($base_dict);
278}
279
280sub split_line {
281
1156
504
  our (%dictionary, $word_match, $disable_word_collating);
282
1156
447
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
283
1156
450
  our @hunspell_dictionaries;
284
1156
461
  our $shortest;
285
1156
730
  my $shortest_threshold = $shortest + 2;
286
1156
604
  my $pattern = '.';
287  # $pattern = "(?:$upper_pattern){$shortest,}|$upper_pattern(?:$lower_pattern){2,}\n";
288
289  # https://www.fileformat.info/info/unicode/char/2019/
290
1156
554
  my $rsqm = "\xE2\x80\x99";
291
292
1156
649
  my ($words, $unrecognized) = (0, 0);
293
1156
804
  my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
294
1156
5490
    $line =~ s/(?:$rsqm|&apos;|&#39;|\%27|&#8217;|&#x2019;|&rsquo;|\\u2019|\x{2019}|')+/'/g;
295
1156
2159
    $line =~ s/(?:$ignore_pattern)+/ /g;
296
1156
1731
    while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
297
1156
3276
    while ($line =~ s/((?:$lower_pattern|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
298
1156
1322
    for my $token (split /\s+/, $line) {
299
3619
3341
      next unless $token =~ /$pattern/;
300
2464
1952
      $token =~ s/^(?:'|$rsqm)+//g;
301
2464
2411
      $token =~ s/(?:'|$rsqm)+s?$//g;
302
2464
1447
      my $raw_token = $token;
303
2464
1415
      $token =~ s/^[^Ii]?'+(.*)/$1/;
304
2464
1223
      $token =~ s/(.*?)'+$/$1/;
305
2464
3638
      next unless $token =~ $word_match;
306
2311
2029
      if (defined $dictionary{$token}) {
307
1032
462
        ++$words;
308
1032
591
        $unique_ref->{$token}=1;
309
1032
814
        next;
310      }
311
1279
963
      if (@hunspell_dictionaries) {
312
1254
627
        my $found = 0;
313
1254
690
        for my $hunspell_dictionary (@hunspell_dictionaries) {
314          my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
315
1254
1013
            encode($hunspell_dictionary->{'encoding'}, $token) : $token;
316
1254
2666
          next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
317
0
0
          ++$words;
318
0
0
          $dictionary{$token} = 1;
319
0
0
          $unique_ref->{$token}=1;
320
0
0
          $found = 1;
321
0
0
          last;
322        }
323
1254
992
        next if $found;
324      }
325
1279
904
      my $key = lc $token;
326
1279
1067
      if (defined $dictionary{$key}) {
327
6
4
        ++$words;
328
6
4
        $unique_ref->{$key}=1;
329
6
6
        next;
330      }
331
1273
861
      unless ($disable_word_collating) {
332
1273
746
        $key =~ s/''+/'/g;
333
1273
1284
        $key =~ s/'[sd]$// unless length $key >= $shortest_threshold;
334      }
335
1273
1031
      if (defined $dictionary{$key}) {
336
0
0
        ++$words;
337
0
0
        $unique_ref->{$key}=1;
338
0
0
        next;
339      }
340
1273
626
      ++$unrecognized;
341
1273
776
      $unique_unrecognized_ref->{$raw_token}=1;
342
1273
1641
      $unrecognized_line_items_ref->{$raw_token}=1;
343    }
344
1156
1546
    return ($words, $unrecognized);
345}
346
347sub skip_file {
348
7
22
  my ($temp_dir, $reason) = @_;
349
7
184
  open(SKIPPED, '>:utf8', "$temp_dir/skipped");
350
7
31
  print SKIPPED $reason;
351
7
97
  close SKIPPED;
352}
353
354sub split_file {
355
16
11193
  my ($file) = @_;
356  our (
357
16
9
    $unrecognized, $shortest, $largest_file, $words,
358    $word_match, %unique, %unique_unrecognized, $forbidden_re,
359    @forbidden_re_list, $patterns_re, %dictionary,
360    $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
361    $disable_single_line_file,
362    $ignore_next_line_pattern,
363    $sandbox,
364  );
365
16
39
  $ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
366
367
16
7
  our %forbidden_re_descriptions;
368
16
10
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
369
370  # https://www.fileformat.info/info/unicode/char/2019/
371
16
9
  my $rsqm = "\xE2\x80\x99";
372
373
16
18
  my @candidates_re_hits = (0) x scalar @candidates_re_list;
374
16
11
  my @candidates_re_lines = (0) x scalar @candidates_re_list;
375
16
10
  my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
376
16
9
  my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
377
16
26
  my $temp_dir = tempdir(DIR=>$sandbox);
378
16
2338
  print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
379
16
368
  open(NAME, '>', "$temp_dir/name");
380
16
40
    print NAME $file;
381
16
183
  close NAME;
382
16
73
  my $file_size = -s $file;
383
16
17
  if (defined $largest_file) {
384
16
18
    unless ($check_file_names eq $file) {
385
16
15
      if ($file_size > $largest_file) {
386
1
2
        skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
387
1
2
        return $temp_dir;
388      }
389    }
390  }
391
15
170
  if (defined readlink($file) &&
392      rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
393
1
3
    skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n");
394
1
3
    return $temp_dir;
395  }
396
14
16
  if ($use_magic_file) {
397
8
11766
    if (open(my $file_fh, '-|',
398              '/usr/bin/file',
399              '-b',
400              '--mime',
401              '-e', 'cdf',
402              '-e', 'compress',
403              '-e', 'csv',
404              '-e', 'elf',
405              '-e', 'json',
406              '-e', 'tar',
407              $file)) {
408
8
28291
      my $file_kind = <$file_fh>;
409
8
4819
      close $file_fh;
410
8
111
      if ($file_kind =~ /^(.*?); charset=binary/) {
411
2
26
        skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
412
2
39
        return $temp_dir;
413      }
414    }
415  }
416
12
103
  open FILE, '<', $file;
417
12
15
  binmode FILE;
418
12
6
  my $head;
419
12
85
  read(FILE, $head, 4096);
420
12
795
  $head =~ s/(?:\r|\n)+$//;
421
12
38
  my $dos_new_lines = () = $head =~ /\r\n/gi;
422
12
34
  my $unix_new_lines = () = $head =~ /\n/gi;
423
12
99
  my $mac_new_lines = () = $head =~ /\r/gi;
424
12
50
  local $/;
425
12
66
  if ($unix_new_lines == 0 && $mac_new_lines == 0) {
426
3
7
    $/ = "\n";
427  } elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
428
1
4
    $/ = "\r\n";
429  } elsif ($mac_new_lines > $unix_new_lines) {
430
2
5
    $/ = "\r";
431  } else {
432
6
5
    $/ = "\n";
433  }
434
12
22
  seek(FILE, 0, 0);
435
12
18
  ($words, $unrecognized) = (0, 0);
436
12
31
  %unique = ();
437
12
27
  %unique_unrecognized = ();
438
439  local $SIG{__WARN__} = sub {
440
0
0
    my $message = shift;
441
0
0
    $message =~ s/> line/> in $file - line/;
442
0
0
    chomp $message;
443
0
0
    print STDERR "$message\n";
444
12
93
  };
445
446
12
310
  open(WARNINGS, '>:utf8', "$temp_dir/warnings");
447
12
10
  our $timeout;
448
12
7
  eval {
449
12
0
84
0
    local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
450
12
46
    alarm $timeout;
451
452
12
14
    my $ignore_next_line = 0;
453
12
9
    my $offset = 0;
454
12
84
    while (<FILE>) {
455
1159
963
      if ($. == 1) {
456
12
13
        unless ($disable_minified_file) {
457
12
46
          if ($file_size >= 512 && length($_) == $file_size) {
458
1
11
            skip_file($temp_dir, "file only has a single line (single-line-file)\n");
459
1
5
            last;
460          }
461        }
462      }
463
1158
2233
      $_ = decode_utf8($_, FB_DEFAULT);
464
1158
2692
      if (/[\x{D800}-\x{DFFF}]/) {
465
0
0
        skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
466
0
0
        last;
467      }
468
1158
1363
      s/\R$//;
469
1158
1074
      s/^\x{FEFF}// if $. == 1;
470
1158
1006
      next unless /./;
471
1157
698
      my $raw_line = $_;
472
473
1157
585
      my $ignore_this_line = $ignore_next_line;
474
1157
938
      $ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
475
1157
820
      next if $ignore_this_line;
476
477      # hook for custom line based text exclusions:
478
1156
829
      if (defined $patterns_re) {
479
2
6
10
8
        s/($patterns_re)/"="x length($1)/ge;
480      }
481
1156
648
      my $initial_line_state = $_;
482
1156
709
      my $previous_line_state = $_;
483
1156
568
      my $line_flagged;
484
1156
744
      if ($forbidden_re) {
485
9
5
59
11
        while (s/($forbidden_re)/"="x length($1)/e) {
486
5
4
          $line_flagged = 1;
487
5
7
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
488
5
5
          my $found_trigger_re;
489
5
4
          for my $i (0 .. $#forbidden_re_list) {
490
7
6
            my $forbidden_re_singleton = $forbidden_re_list[$i];
491
7
4
            my $test_line = $previous_line_state;
492
7
4
58
6
            if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
493
4
4
              next unless $test_line eq $_;
494
4
6
              my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
495
4
4
              next unless $begin == $begin_test;
496
4
3
              next unless $end == $end_test;
497
4
3
              next unless $match eq $match_test;
498
4
3
              $found_trigger_re = $forbidden_re_singleton;
499
4
7
              my $hit = "$.:$begin:$end";
500
4
4
              $forbidden_re_hits[$i]++;
501
4
7
              $forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
502
4
5
              last;
503            }
504          }
505
5
6
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
506
5
5
          if ($found_trigger_re) {
507
4
9
            my $description = $forbidden_re_descriptions{$found_trigger_re} || '';
508
4
10
            $found_trigger_re =~ s/^\(\?:(.*)\)$/$1/;
509
4
2
            my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
510
4
4
            if ($description ne '') {
511
3
14
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
512            } else {
513
1
5
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
514            }
515          } else {
516
1
12
            print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
517          }
518
5
23
          $previous_line_state = $_;
519        }
520
9
8
        $_ = $initial_line_state;
521      }
522      # This is to make it easier to deal w/ rules:
523
1156
1102
      s/^/ /;
524
1156
657
      my %unrecognized_line_items = ();
525
1156
861
      my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
526
1156
670
      $words += $new_words;
527
1156
461
      $unrecognized += $new_unrecognized;
528
1156
768
      my $line_length = length($raw_line);
529
1156
1406
      for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
530
1020
536
        my $found_token = 0;
531
1020
479
        my $raw_token = $token;
532
1020
543
        $token =~ s/'/(?:'|\x{2019}|\&apos;|\&#39;)+/g;
533
1020
404
        my $before;
534
1020
1573
        if ($token =~ /^$upper_pattern$lower_pattern/) {
535
5
3
          $before = '(?<=.)';
536        } elsif ($token =~ /^$upper_pattern/) {
537
0
0
          $before = "(?<!$upper_pattern)";
538        } else {
539
1015
625
          $before = "(?<=$not_lower_pattern)";
540        }
541
1020
1138
        my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
542
1020
2125
        while ($raw_line =~ /(?:\b|$before)($token)(?:\b|$after)/g) {
543
1270
633
          $line_flagged = 1;
544
1270
505
          $found_token = 1;
545
1270
1693
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
546
1270
1242
          next unless $match =~ /./;
547
1270
989
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
548
1270
4778
          print WARNINGS ":$.:$begin ... $end: $wrapped\n";
549        }
550
1020
1057
        unless ($found_token) {
551
3
26
          if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
552
3
6
            my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
553
3
3
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
554
3
11
            print WARNINGS ":$.:$begin ... $end: $wrapped\n";
555          } else {
556
0
0
            my $offset = $line_length + 1;
557
0
0
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
558
0
0
            print WARNINGS ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
559          }
560        }
561      }
562
1156
1775
      if ($line_flagged && $candidates_re) {
563
1
1
        $_ = $previous_line_state = $initial_line_state;
564
1
1
17
3
        s/($candidates_re)/"="x length($1)/ge;
565
1
2
        if ($_ ne $initial_line_state) {
566
1
1
          $_ = $previous_line_state;
567
1
1
          for my $i (0 .. $#candidates_re_list) {
568
2
1
            my $candidate_re = $candidates_re_list[$i];
569
2
14
            next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
570
1
1
6
2
            if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
571
1
2
              my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
572
1
2
              my $hit = "$.:$begin:$end";
573
1
1
              $_ = $previous_line_state;
574
1
1
5
1
              my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
575
1
2
              $candidates_re_hits[$i] += $replacements;
576
1
1
              $candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
577
1
2
              $_ = $previous_line_state;
578            }
579          }
580        }
581      }
582
1156
851
      unless ($disable_minified_file) {
583
1156
915
        s/={3,}//g;
584
1156
818
        $offset += length;
585
1156
1036
        my $ratio = int($offset / $.);
586
1156
566
        my $ratio_threshold = 1000;
587
1156
3010
        if ($ratio > $ratio_threshold) {
588
2
7
          skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
589
2
7
          last;
590        }
591      }
592    }
593
594
12
70
    alarm 0;
595  };
596
12
12
  if ($@) {
597
0
0
    die unless $@ eq "alarm\n";
598
0
0
    print WARNINGS ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
599
0
0
    skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
600
0
0
    last;
601  }
602
603
12
34
  close FILE;
604
12
134
  close WARNINGS;
605
606
12
24
  if ($unrecognized || @candidates_re_hits || @forbidden_re_hits) {
607
11
297
    open(STATS, '>:utf8', "$temp_dir/stats");
608
11
142
      print STATS "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
609      ", unique: ".(keys %unique).
610      (@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
611      (@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
612      (@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
613      (@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
614      "}";
615
11
118
    close STATS;
616
11
227
    open(UNKNOWN, '>:utf8', "$temp_dir/unknown");
617
11
19
37
29
      print UNKNOWN map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
618
11
82
    close UNKNOWN;
619  }
620
621
12
120
  return $temp_dir;
622}
623
624sub main {
625
2
354
  my ($configuration, @ARGV) = @_;
626
2
1
  our %dictionary;
627
2
2
  unless (%dictionary) {
628
1
2
    init($configuration);
629  }
630
631  # read all input
632
2
3
  my @reports;
633
634
2
2
  for my $file (@ARGV) {
635
2
3
    my $temp_dir = split_file($file);
636
2
3
    push @reports, "$temp_dir\n";
637  }
638
2
6
  print join '', @reports;
639}
640
6411;