File Coverage

File:lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:88.1%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3# ~/bin/w
4# Search for potentially misspelled words
5# Output is:
6# misspellled
7# woord (WOORD, Woord, woord, woord's)
8package CheckSpelling::UnknownWordSplitter;
9
10
1
1
110548
3
use 5.022;
11
1
1
1
4
0
51
use feature 'unicode_strings';
12
1
1
1
2
0
10
use strict;
13
1
1
1
1
0
20
use warnings;
14
1
1
1
2
0
14
no warnings qw(experimental::vlb);
15
1
1
1
1
1
2
use utf8;
16
1
1
1
12
1
25
use Encode qw/decode_utf8 encode FB_DEFAULT/;
17
1
1
1
2
1
26
use File::Basename;
18
1
1
1
1
1
15
use Cwd 'abs_path';
19
1
1
1
2
0
23
use File::Spec;
20
1
1
1
2
0
20
use File::Temp qw/ tempfile tempdir /;
21
1
1
1
268
1
3235
use CheckSpelling::Util;
22our $VERSION='0.1.0';
23
24my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
25my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
26my ($shortest, $longest) = (255, 0);
27my @forbidden_re_list;
28my %forbidden_re_descriptions;
29my @candidates_re_list;
30my $hunspell_dictionary_path;
31my @hunspell_dictionaries;
32my %dictionary = ();
33my $base_dict;
34my %unique;
35my %unique_unrecognized;
36my ($last_file, $words, $unrecognized) = ('', 0, 0);
37my ($ignore_next_line_pattern);
38
39my $disable_flags;
40
41sub test_re {
42
14
8
  my ($expression) = @_;
43
14
14
8
105
  return eval { qr /$expression/ };
44}
45
46sub quote_re {
47
14
9
  my ($expression) = @_;
48
14
13
  return $expression if $expression =~ /\?\{/;
49
14
35
  $expression =~ s/
50   \G
51   (
52      (?:[^\\]|\\[^Q])*
53   )
54   (?:
55      \\Q
56      (?:[^\\]|\\[^E])*
57      (?:\\E)?
58   )?
59/
60
28
47
   $1 . (defined($2) ? quotemeta($2) : '')
61/xge;
62
14
14
  return $expression;
63}
64
65sub file_to_lists {
66
3
3
  my ($re) = @_;
67
3
3
  my @patterns;
68  my %hints;
69
3
0
  my $fh;
70
3
27
  if (open($fh, '<:utf8', $re)) {
71
3
5
    local $/=undef;
72
3
17
    my $file=<$fh>;
73
3
9
    close $fh;
74
3
1
    my $line_number = 0;
75
3
3
    my $hint = '';
76
3
13
    for (split /\R/, $file) {
77
17
8
      ++$line_number;
78
17
12
      chomp;
79
17
18
      if (/^#(?:\s(.+)|)/) {
80
6
15
        $hint = $1 if ($hint eq '' && defined $1);
81
6
4
        next;
82      }
83
11
14
      $hint = '' unless $_ ne '';
84
11
8
      my $pattern = $_;
85
11
31
      next unless s/^(.+)/(?:$1)/;
86
7
7
      my $quoted = quote_re($1);
87
7
9
      unless (test_re $quoted) {
88
1
1
        my $error = $@;
89
1
49
        my $home = dirname(__FILE__);
90
1
19
        $error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
91
1
11
        print STDERR $error;
92
1
1
        $_ = '(?:\$^ - skipped because bad-regex)';
93
1
2
        $hint = '';
94      }
95
7
9
      if (defined $hints{$_}) {
96
1
2
        my $pattern_length = length $pattern;
97
1
1
        my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
98
1
13
        print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
99
1
27
        $_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
100      } else {
101
6
7
        push @patterns, $_;
102
6
9
        $hints{$_} = $hint;
103      }
104
7
13
      $hint = '';
105    }
106  }
107
108  return {
109
3
12
    patterns => \@patterns,
110    hints => \%hints,
111  };
112}
113
114sub file_to_list {
115
2
1136
  my ($re) = @_;
116
2
4
  my $lists = file_to_lists($re);
117
118
2
2
2
6
  return @{$lists->{'patterns'}};
119}
120
121sub list_to_re {
122
2
2
  my (@list) = @_;
123
2
5
5
1
4
3
  @list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
124
2
5
2
5
  @list = grep { $_ ne '' } @list;
125
2
2
  return '$^' unless scalar @list;
126
2
7
  return join "|", (@list);
127}
128
129sub not_empty {
130
51
44
  my ($thing) = @_;
131
51
154
  return defined $thing && $thing ne ''
132}
133
134sub valid_word {
135  # shortest_word is an absolute
136
22
13
  our ($shortest, $longest, $shortest_word, $longest_word);
137
22
22
  $shortest = $shortest_word if $shortest_word;
138
22
21
  if ($longest_word) {
139    # longest_word is an absolute
140
20
19
    $longest = $longest_word;
141  } elsif (not_empty($longest)) {
142    # we allow for some sloppiness (a couple of stuck keys per word)
143    # it's possible that this should scale with word length
144
1
1
    $longest += 2;
145  }
146
22
12
  our ($upper_pattern, $lower_pattern, $punctuation_pattern);
147
22
66
15
152
  my $word_pattern = join '|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
148
22
20
  $word_pattern = q<\\w|'> unless $word_pattern;
149
22
39
  if ((defined $shortest && not_empty($longest)) &&
150      ($shortest > $longest)) {
151
0
0
    $word_pattern = "(?:$word_pattern){3}";
152
0
0
    return qr/$word_pattern/;
153  }
154
22
26
  $shortest = 3 unless defined $shortest;
155
22
16
  $longest = '' unless defined $longest;
156
22
72
  $word_match = "(?:$word_pattern){$shortest,$longest}";
157
22
178
  return qr/\b$word_match\b/;
158}
159
160sub load_dictionary {
161
12
1985
  my ($dict) = @_;
162
12
6
  our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
163
12
10
  $longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
164
12
12
  $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef);
165
12
6
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
166
12
15
  $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
167
12
43
  $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
168
12
26
  $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
169
12
20
  $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
170
12
22
  $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
171
12
39
  $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
172
12
26
  %dictionary = ();
173
174
12
539
  open(DICT, '<:utf8', $dict);
175
12
55
  while (!eof(DICT)) {
176
31
37
    my $word = <DICT>;
177
31
23
    chomp $word;
178
31
74
    next unless $word =~ $word_match;
179
28
24
    my $l = length $word;
180
28
15
    $longest = -1 unless not_empty($longest);
181
28
27
    $longest = $l if $l > $longest;
182
28
30
    $shortest = $l if $l < $shortest;
183
28
54
    $dictionary{$word}=1;
184  }
185
12
27
  close DICT;
186
187
12
7
  $word_match = valid_word();
188}
189
190sub hunspell_dictionary {
191
3
4
  my ($dict) = @_;
192
3
2
  my $name = $dict;
193
3
5
  $name =~ s{/src/index/hunspell/index\.dic$}{};
194
3
10
  $name =~ s{.*/}{};
195
3
4
  my $aff = $dict;
196
3
2
  my $encoding;
197
3
5
  $aff =~ s/\.dic$/.aff/;
198
3
25
  if (open AFF, '<', $aff) {
199
3
19
    while (<AFF>) {
200
0
0
      next unless /^SET\s+(\S+)/;
201
0
0
      $encoding = $1 if ($1 !~ /utf-8/i);
202
0
0
      last;
203    }
204
3
7
    close AFF;
205  }
206  return {
207
3
301
    name => $name,
208    dict => $dict,
209    aff => $aff,
210    encoding => $encoding,
211    engine => Text::Hunspell->new($aff, $dict),
212  }
213}
214
215sub init {
216
9
10954
  my ($configuration) = @_;
217
9
9
  our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
218
9
23
  our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
219
9
8
  our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
220
9
12
  our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
221
9
5
  our %forbidden_re_descriptions;
222
9
9
  if ($hunspell_dictionary_path) {
223
3
30
    our @hunspell_dictionaries = ();
224
1
1
1
1
1
1
1
1
1
3
170
1030
16
4
1
14
7
1
15
142
    if (eval 'use Text::Hunspell; 1') {
225
3
110
      my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
226
3
6
      for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
227
3
6
        push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
228      }
229    } else {
230
0
0
      print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
231    }
232  }
233
9
11
  my (@patterns_re_list, %in_patterns_re_list);
234
9
51
  if (-e "$configuration/patterns.txt") {
235
0
0
    @patterns_re_list = file_to_list "$configuration/patterns.txt";
236
0
0
    $patterns_re = list_to_re @patterns_re_list;
237
0
0
0
0
    %in_patterns_re_list = map {$_ => 1} @patterns_re_list;
238  } else {
239
9
8
    $patterns_re = undef;
240  }
241
242
9
68
  if (-e "$configuration/forbidden.txt") {
243
1
1
    my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
244
1
1
1
2
    @forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
245
1
1
0
3
    %forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
246
1
1
    $forbidden_re = list_to_re @forbidden_re_list;
247  } else {
248
8
8
    $forbidden_re = undef;
249  }
250
251
9
41
  if (-e "$configuration/candidates.txt") {
252
1
1
    @candidates_re_list = file_to_list "$configuration/candidates.txt";
253
1
2
2
1
2
4
    @candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} || !test_re($quoted) ? '' : $quoted } @candidates_re_list;
254
1
2
    $candidates_re = list_to_re @candidates_re_list;
255  } else {
256
8
4
    $candidates_re = undef;
257  }
258
259
9
12
  our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
260
261
9
9
  my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
262
9
11
  our $disable_word_collating = $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/;
263
9
6
  our $disable_minified_file = $disable_flags =~ /(?:^|,|\s)minified-file(?:,|\s|$)/;
264
9
3
  our $disable_single_line_file = $disable_flags =~ /(?:^|,|\s)single-line-file(?:,|\s|$)/;
265
266
9
9
  our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
267
9
7
  $ignore_next_line_pattern =~ s/\s+/|/g;
268
269
9
6
  our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
270
271
9
8
  our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
272
273
9
10
  $word_match = valid_word();
274
275
9
18
  our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
276
9
44
  $base_dict = '/usr/share/dict/words' unless -e $base_dict;
277
9
12
  load_dictionary($base_dict);
278}
279
280sub split_line {
281
1156
484
  our (%dictionary, $word_match, $disable_word_collating);
282
1156
424
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
283
1156
416
  our @hunspell_dictionaries;
284
1156
519
  our $shortest;
285
1156
733
  my $shortest_threshold = $shortest + 2;
286
1156
660
  my $pattern = '.';
287  # $pattern = "(?:$upper_pattern){$shortest,}|$upper_pattern(?:$lower_pattern){2,}\n";
288
289  # https://www.fileformat.info/info/unicode/char/2019/
290
1156
512
  my $rsqm = "\xE2\x80\x99";
291
292
1156
677
  my ($words, $unrecognized) = (0, 0);
293
1156
728
  my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
294
1156
5395
    $line =~ s/(?:$rsqm|&apos;|&#39;|\%27|&#8217;|&#x2019;|&rsquo;|\\u2019|\x{2019}|')+/'/g;
295
1156
2167
    $line =~ s/(?:$ignore_pattern)+/ /g;
296
1156
1646
    while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
297
1156
3215
    while ($line =~ s/((?:$lower_pattern|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
298
1156
1309
    for my $token (split /\s+/, $line) {
299
3619
3094
      next unless $token =~ /$pattern/;
300
2464
1979
      $token =~ s/^(?:'|$rsqm)+//g;
301
2464
2359
      $token =~ s/(?:'|$rsqm)+s?$//g;
302
2464
1388
      my $raw_token = $token;
303
2464
1411
      $token =~ s/^[^Ii]?'+(.*)/$1/;
304
2464
1189
      $token =~ s/(.*?)'+$/$1/;
305
2464
3706
      next unless $token =~ $word_match;
306
2311
1948
      if (defined $dictionary{$token}) {
307
1032
463
        ++$words;
308
1032
529
        $unique_ref->{$token}=1;
309
1032
850
        next;
310      }
311
1279
1033
      if (@hunspell_dictionaries) {
312
1254
574
        my $found = 0;
313
1254
697
        for my $hunspell_dictionary (@hunspell_dictionaries) {
314          my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
315
1254
1086
            encode($hunspell_dictionary->{'encoding'}, $token) : $token;
316
1254
2576
          next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
317
0
0
          ++$words;
318
0
0
          $dictionary{$token} = 1;
319
0
0
          $unique_ref->{$token}=1;
320
0
0
          $found = 1;
321
0
0
          last;
322        }
323
1254
926
        next if $found;
324      }
325
1279
921
      my $key = lc $token;
326
1279
1050
      if (defined $dictionary{$key}) {
327
6
3
        ++$words;
328
6
6
        $unique_ref->{$key}=1;
329
6
7
        next;
330      }
331
1273
906
      unless ($disable_word_collating) {
332
1273
699
        $key =~ s/''+/'/g;
333
1273
1304
        $key =~ s/'[sd]$// unless length $key >= $shortest_threshold;
334      }
335
1273
1054
      if (defined $dictionary{$key}) {
336
0
0
        ++$words;
337
0
0
        $unique_ref->{$key}=1;
338
0
0
        next;
339      }
340
1273
616
      ++$unrecognized;
341
1273
729
      $unique_unrecognized_ref->{$raw_token}=1;
342
1273
1647
      $unrecognized_line_items_ref->{$raw_token}=1;
343    }
344
1156
1662
    return ($words, $unrecognized);
345}
346
347sub skip_file {
348
7
22
  my ($temp_dir, $reason) = @_;
349
7
195
  open(SKIPPED, '>:utf8', "$temp_dir/skipped");
350
7
35
  print SKIPPED $reason;
351
7
84
  close SKIPPED;
352}
353
354sub split_file {
355
16
11410
  my ($file) = @_;
356  our (
357
16
9
    $unrecognized, $shortest, $largest_file, $words,
358    $word_match, %unique, %unique_unrecognized, $forbidden_re,
359    @forbidden_re_list, $patterns_re, %dictionary,
360    $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
361    $disable_single_line_file,
362    $ignore_next_line_pattern,
363    $sandbox,
364  );
365
16
31
  $ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
366
367
16
7
  our %forbidden_re_descriptions;
368
16
8
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
369
370  # https://www.fileformat.info/info/unicode/char/2019/
371
16
10
  my $rsqm = "\xE2\x80\x99";
372
373
16
16
  my @candidates_re_hits = (0) x scalar @candidates_re_list;
374
16
9
  my @candidates_re_lines = (0) x scalar @candidates_re_list;
375
16
11
  my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
376
16
10
  my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
377
16
26
  my $temp_dir = tempdir(DIR=>$sandbox);
378
16
2377
  print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
379
16
376
  open(NAME, '>', "$temp_dir/name");
380
16
36
    print NAME $file;
381
16
187
  close NAME;
382
16
55
  my $file_size = -s $file;
383
16
18
  if (defined $largest_file) {
384
16
15
    unless ($check_file_names eq $file) {
385
16
20
      if ($file_size > $largest_file) {
386
1
2
        skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
387
1
2
        return $temp_dir;
388      }
389    }
390  }
391
15
202
  if (defined readlink($file) &&
392      rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
393
1
3
    skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n");
394
1
3
    return $temp_dir;
395  }
396
14
18
  if ($use_magic_file) {
397
8
11975
    if (open(my $file_fh, '-|',
398              '/usr/bin/file',
399              '-b',
400              '--mime',
401              '-e', 'cdf',
402              '-e', 'compress',
403              '-e', 'csv',
404              '-e', 'elf',
405              '-e', 'json',
406              '-e', 'tar',
407              $file)) {
408
8
29348
      my $file_kind = <$file_fh>;
409
8
4939
      close $file_fh;
410
8
101
      if ($file_kind =~ /^(.*?); charset=binary/) {
411
2
30
        skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
412
2
45
        return $temp_dir;
413      }
414    }
415  }
416
12
113
  open FILE, '<', $file;
417
12
10
  binmode FILE;
418
12
7
  my $head;
419
12
94
  read(FILE, $head, 4096);
420
12
839
  $head =~ s/(?:\r|\n)+$//;
421
12
46
  my $dos_new_lines = () = $head =~ /\r\n/gi;
422
12
35
  my $unix_new_lines = () = $head =~ /\n/gi;
423
12
102
  my $mac_new_lines = () = $head =~ /\r/gi;
424
12
50
  local $/;
425
12
65
  if ($unix_new_lines == 0 && $mac_new_lines == 0) {
426
3
10
    $/ = "\n";
427  } elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
428
1
4
    $/ = "\r\n";
429  } elsif ($mac_new_lines > $unix_new_lines) {
430
2
5
    $/ = "\r";
431  } else {
432
6
6
    $/ = "\n";
433  }
434
12
25
  seek(FILE, 0, 0);
435
12
16
  ($words, $unrecognized) = (0, 0);
436
12
58
  %unique = ();
437
12
24
  %unique_unrecognized = ();
438
439  local $SIG{__WARN__} = sub {
440
0
0
    my $message = shift;
441
0
0
    $message =~ s/> line/> in $file - line/;
442
0
0
    chomp $message;
443
0
0
    print STDERR "$message\n";
444
12
102
  };
445
446
12
303
  open(WARNINGS, '>:utf8', "$temp_dir/warnings");
447
12
8
  our $timeout;
448
12
9
  eval {
449
12
0
113
0
    local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
450
12
27
    alarm $timeout;
451
452
12
13
    my $ignore_next_line = 0;
453
12
11
    my $offset = 0;
454
12
82
    while (<FILE>) {
455
1159
1022
      if ($. == 1) {
456
12
17
        unless ($disable_minified_file) {
457
12
45
          if ($file_size >= 512 && length($_) == $file_size) {
458
1
8
            skip_file($temp_dir, "file only has a single line (single-line-file)\n");
459
1
5
            last;
460          }
461        }
462      }
463
1158
2282
      $_ = decode_utf8($_, FB_DEFAULT);
464
1158
2814
      if (/[\x{D800}-\x{DFFF}]/) {
465
0
0
        skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
466
0
0
        last;
467      }
468
1158
1402
      s/\R$//;
469
1158
1034
      s/^\x{FEFF}// if $. == 1;
470
1158
1009
      next unless /./;
471
1157
743
      my $raw_line = $_;
472
473
1157
656
      my $ignore_this_line = $ignore_next_line;
474
1157
947
      $ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
475
1157
765
      next if $ignore_this_line;
476
477      # hook for custom line based text exclusions:
478
1156
759
      if (defined $patterns_re) {
479
2
6
10
16
        s/($patterns_re)/"="x length($1)/ge;
480      }
481
1156
658
      my $initial_line_state = $_;
482
1156
697
      my $previous_line_state = $_;
483
1156
513
      my $line_flagged;
484
1156
915
      if ($forbidden_re) {
485
9
5
60
11
        while (s/($forbidden_re)/"="x length($1)/e) {
486
5
6
          $line_flagged = 1;
487
5
8
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
488
5
3
          my $found_trigger_re;
489
5
6
          for my $i (0 .. $#forbidden_re_list) {
490
7
4
            my $forbidden_re_singleton = $forbidden_re_list[$i];
491
7
7
            my $test_line = $previous_line_state;
492
7
4
58
7
            if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
493
4
4
              next unless $test_line eq $_;
494
4
5
              my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
495
4
4
              next unless $begin == $begin_test;
496
4
3
              next unless $end == $end_test;
497
4
4
              next unless $match eq $match_test;
498
4
3
              $found_trigger_re = $forbidden_re_singleton;
499
4
8
              my $hit = "$.:$begin:$end";
500
4
2
              $forbidden_re_hits[$i]++;
501
4
8
              $forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
502
4
6
              last;
503            }
504          }
505
5
7
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
506
5
5
          if ($found_trigger_re) {
507
4
8
            my $description = $forbidden_re_descriptions{$found_trigger_re} || '';
508
4
10
            $found_trigger_re =~ s/^\(\?:(.*)\)$/$1/;
509
4
3
            my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
510
4
7
            if ($description ne '') {
511
3
15
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
512            } else {
513
1
7
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
514            }
515          } else {
516
1
4
            print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
517          }
518
5
25
          $previous_line_state = $_;
519        }
520
9
9
        $_ = $initial_line_state;
521      }
522      # This is to make it easier to deal w/ rules:
523
1156
1083
      s/^/ /;
524
1156
706
      my %unrecognized_line_items = ();
525
1156
885
      my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
526
1156
579
      $words += $new_words;
527
1156
503
      $unrecognized += $new_unrecognized;
528
1156
769
      my $line_length = length($raw_line);
529
1156
1503
      for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
530
1020
516
        my $found_token = 0;
531
1020
464
        my $raw_token = $token;
532
1020
556
        $token =~ s/'/(?:'|\x{2019}|\&apos;|\&#39;)+/g;
533
1020
427
        my $before;
534
1020
1545
        if ($token =~ /^$upper_pattern$lower_pattern/) {
535
5
4
          $before = '(?<=.)';
536        } elsif ($token =~ /^$upper_pattern/) {
537
0
0
          $before = "(?<!$upper_pattern)";
538        } else {
539
1015
561
          $before = "(?<=$not_lower_pattern)";
540        }
541
1020
1152
        my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
542
1020
2060
        while ($raw_line =~ /(?:\b|$before)($token)(?:\b|$after)/g) {
543
1270
605
          $line_flagged = 1;
544
1270
513
          $found_token = 1;
545
1270
1643
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
546
1270
1181
          next unless $match =~ /./;
547
1270
887
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
548
1270
4866
          print WARNINGS ":$.:$begin ... $end: $wrapped\n";
549        }
550
1020
1157
        unless ($found_token) {
551
3
27
          if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
552
3
6
            my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
553
3
2
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
554
3
12
            print WARNINGS ":$.:$begin ... $end: $wrapped\n";
555          } else {
556
0
0
            my $offset = $line_length + 1;
557
0
0
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
558
0
0
            print WARNINGS ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
559          }
560        }
561      }
562
1156
1822
      if ($line_flagged && $candidates_re) {
563
1
1
        $_ = $previous_line_state = $initial_line_state;
564
1
1
18
2
        s/($candidates_re)/"="x length($1)/ge;
565
1
2
        if ($_ ne $initial_line_state) {
566
1
1
          $_ = $previous_line_state;
567
1
1
          for my $i (0 .. $#candidates_re_list) {
568
2
2
            my $candidate_re = $candidates_re_list[$i];
569
2
14
            next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
570
1
1
6
2
            if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
571
1
2
              my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
572
1
2
              my $hit = "$.:$begin:$end";
573
1
1
              $_ = $previous_line_state;
574
1
1
8
2
              my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
575
1
1
              $candidates_re_hits[$i] += $replacements;
576
1
2
              $candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
577
1
2
              $_ = $previous_line_state;
578            }
579          }
580        }
581      }
582
1156
851
      unless ($disable_minified_file) {
583
1156
899
        s/={3,}//g;
584
1156
783
        $offset += length;
585
1156
1037
        my $ratio = int($offset / $.);
586
1156
558
        my $ratio_threshold = 1000;
587
1156
3056
        if ($ratio > $ratio_threshold) {
588
2
8
          skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
589
2
6
          last;
590        }
591      }
592    }
593
594
12
71
    alarm 0;
595  };
596
12
12
  if ($@) {
597
0
0
    die unless $@ eq "alarm\n";
598
0
0
    print WARNINGS ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
599
0
0
    skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
600
0
0
    last;
601  }
602
603
12
38
  close FILE;
604
12
120
  close WARNINGS;
605
606
12
23
  if ($unrecognized || @candidates_re_hits || @forbidden_re_hits) {
607
11
314
    open(STATS, '>:utf8', "$temp_dir/stats");
608
11
187
      print STATS "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
609      ", unique: ".(keys %unique).
610      (@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
611      (@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
612      (@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
613      (@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
614      "}";
615
11
129
    close STATS;
616
11
225
    open(UNKNOWN, '>:utf8', "$temp_dir/unknown");
617
11
19
37
27
      print UNKNOWN map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
618
11
91
    close UNKNOWN;
619  }
620
621
12
101
  return $temp_dir;
622}
623
624sub main {
625
2
384
  my ($configuration, @ARGV) = @_;
626
2
2
  our %dictionary;
627
2
2
  unless (%dictionary) {
628
1
2
    init($configuration);
629  }
630
631  # read all input
632
2
1
  my @reports;
633
634
2
2
  for my $file (@ARGV) {
635
2
2
    my $temp_dir = split_file($file);
636
2
3
    push @reports, "$temp_dir\n";
637  }
638
2
7
  print join '', @reports;
639}
640
6411;