File Coverage

File:lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:88.0%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3# ~/bin/w
4# Search for potentially misspelled words
5# Output is:
6# misspellled
7# woord (WOORD, Woord, woord, woord's)
8package CheckSpelling::UnknownWordSplitter;
9
10
1
1
108217
1
use 5.022;
11
1
1
1
2
1
49
use feature 'unicode_strings';
12
1
1
1
1
1
6
use strict;
13
1
1
1
1
1
21
use warnings;
14
1
1
1
2
0
18
no warnings qw(experimental::vlb);
15
1
1
1
1
1
2
use utf8;
16
1
1
1
11
6
26
use Encode qw/decode_utf8 encode FB_DEFAULT/;
17
1
1
1
2
1
26
use File::Basename;
18
1
1
1
1
1
14
use Cwd 'abs_path';
19
1
1
1
2
0
19
use File::Spec;
20
1
1
1
1
1
14
use File::Temp qw/ tempfile tempdir /;
21
1
1
1
256
0
3210
use CheckSpelling::Util;
22our $VERSION='0.1.0';
23
24my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
25my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
26my ($shortest, $longest) = (255, 0);
27my @forbidden_re_list;
28my %forbidden_re_descriptions;
29my @candidates_re_list;
30my $hunspell_dictionary_path;
31my @hunspell_dictionaries;
32my %dictionary = ();
33my $base_dict;
34my %unique;
35my %unique_unrecognized;
36my ($last_file, $words, $unrecognized) = ('', 0, 0);
37my ($ignore_next_line_pattern);
38
39my $disable_flags;
40
41sub test_re {
42
14
9
  my ($expression) = @_;
43
14
14
7
100
  return eval { qr /$expression/ };
44}
45
46sub quote_re {
47
14
11
  my ($expression) = @_;
48
14
12
  return $expression if $expression =~ /\?\{/;
49
14
32
  $expression =~ s/
50   \G
51   (
52      (?:[^\\]|\\[^Q])*
53   )
54   (?:
55      \\Q
56      (?:[^\\]|\\[^E])*
57      (?:\\E)?
58   )?
59/
60
28
46
   $1 . (defined($2) ? quotemeta($2) : '')
61/xge;
62
14
16
  return $expression;
63}
64
65sub file_to_lists {
66
3
4
  my ($re) = @_;
67
3
4
  my @patterns;
68  my %hints;
69
3
0
  my $fh;
70
3
27
  if (open($fh, '<:utf8', $re)) {
71
3
4
    local $/=undef;
72
3
19
    my $file=<$fh>;
73
3
8
    close $fh;
74
3
2
    my $line_number = 0;
75
3
2
    my $hint = '';
76
3
13
    for (split /\R/, $file) {
77
17
10
      ++$line_number;
78
17
8
      chomp;
79
17
20
      if (/^#(?:\s(.+)|)/) {
80
6
15
        $hint = $1 if ($hint eq '' && defined $1);
81
6
4
        next;
82      }
83
11
12
      $hint = '' unless $_ ne '';
84
11
7
      next if $_ eq '$^';
85
11
11
      my $pattern = $_;
86
11
28
      next unless s/^(.+)/(?:$1)/;
87
7
8
      my $quoted = quote_re($1);
88
7
4
      unless (test_re $quoted) {
89
1
1
        my $error = $@;
90
1
43
        my $home = dirname(__FILE__);
91
1
20
        $error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
92
1
11
        print STDERR $error;
93
1
1
        $_ = '(?:\$^ - skipped because bad-regex)';
94
1
2
        $hint = '';
95      }
96
7
12
      if (defined $hints{$_}) {
97
1
2
        my $pattern_length = length $pattern;
98
1
1
        my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
99
1
14
        print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
100
1
2
        $_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
101      } else {
102
6
5
        push @patterns, $_;
103
6
7
        $hints{$_} = $hint;
104      }
105
7
12
      $hint = '';
106    }
107  }
108
109  return {
110
3
12
    patterns => \@patterns,
111    hints => \%hints,
112  };
113}
114
115sub file_to_list {
116
2
1153
  my ($re) = @_;
117
2
3
  my $lists = file_to_lists($re);
118
119
2
2
1
6
  return @{$lists->{'patterns'}};
120}
121
122sub list_to_re {
123
2
2
  my (@list) = @_;
124
2
5
5
0
4
5
  @list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
125
2
5
1
5
  @list = grep { $_ ne '' } @list;
126
2
2
  return '$^' unless scalar @list;
127
2
6
  return join "|", (@list);
128}
129
130sub not_empty {
131
51
41
  my ($thing) = @_;
132
51
172
  return defined $thing && $thing ne ''
133}
134
135sub valid_word {
136  # shortest_word is an absolute
137
22
16
  our ($shortest, $longest, $shortest_word, $longest_word);
138
22
17
  $shortest = $shortest_word if $shortest_word;
139
22
15
  if ($longest_word) {
140    # longest_word is an absolute
141
20
18
    $longest = $longest_word;
142  } elsif (not_empty($longest)) {
143    # we allow for some sloppiness (a couple of stuck keys per word)
144    # it's possible that this should scale with word length
145
1
1
    $longest += 2;
146  }
147
22
17
  our ($upper_pattern, $lower_pattern, $punctuation_pattern);
148
22
66
16
161
  my $word_pattern = join '|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
149
22
16
  $word_pattern = q<\\w|'> unless $word_pattern;
150
22
38
  if ((defined $shortest && not_empty($longest)) &&
151      ($shortest > $longest)) {
152
0
0
    $word_pattern = "(?:$word_pattern){3}";
153
0
0
    return qr/$word_pattern/;
154  }
155
22
22
  $shortest = 3 unless defined $shortest;
156
22
24
  $longest = '' unless defined $longest;
157
22
88
  $word_match = "(?:$word_pattern){$shortest,$longest}";
158
22
173
  return qr/\b$word_match\b/;
159}
160
161sub load_dictionary {
162
12
1907
  my ($dict) = @_;
163
12
7
  our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
164
12
11
  $longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
165
12
11
  $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef);
166
12
7
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
167
12
7
  $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
168
12
39
  $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
169
12
25
  $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
170
12
24
  $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
171
12
22
  $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
172
12
23
  $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
173
12
25
  %dictionary = ();
174
175
12
1290
  open(DICT, '<:utf8', $dict);
176
12
52
  while (!eof(DICT)) {
177
31
30
    my $word = <DICT>;
178
31
25
    chomp $word;
179
31
76
    next unless $word =~ $word_match;
180
28
24
    my $l = length $word;
181
28
20
    $longest = -1 unless not_empty($longest);
182
28
24
    $longest = $l if $l > $longest;
183
28
25
    $shortest = $l if $l < $shortest;
184
28
54
    $dictionary{$word}=1;
185  }
186
12
29
  close DICT;
187
188
12
6
  $word_match = valid_word();
189}
190
191sub hunspell_dictionary {
192
3
5
  my ($dict) = @_;
193
3
3
  my $name = $dict;
194
3
4
  $name =~ s{/src/index/hunspell/index\.dic$}{};
195
3
8
  $name =~ s{.*/}{};
196
3
3
  my $aff = $dict;
197
3
4
  my $encoding;
198
3
10
  $aff =~ s/\.dic$/.aff/;
199
3
25
  if (open AFF, '<', $aff) {
200
3
20
    while (<AFF>) {
201
0
0
      next unless /^SET\s+(\S+)/;
202
0
0
      $encoding = $1 if ($1 !~ /utf-8/i);
203
0
0
      last;
204    }
205
3
8
    close AFF;
206  }
207  return {
208
3
315
    name => $name,
209    dict => $dict,
210    aff => $aff,
211    encoding => $encoding,
212    engine => Text::Hunspell->new($aff, $dict),
213  }
214}
215
216sub init {
217
9
10687
  my ($configuration) = @_;
218
9
9
  our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
219
9
19
  our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
220
9
8
  our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
221
9
10
  our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
222
9
3
  our %forbidden_re_descriptions;
223
9
11
  if ($hunspell_dictionary_path) {
224
3
37
    our @hunspell_dictionaries = ();
225
1
1
1
1
1
1
1
1
1
3
213
1000
17
4
1
13
6
2
15
130
    if (eval 'use Text::Hunspell; 1') {
226
3
121
      my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
227
3
7
      for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
228
3
6
        push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
229      }
230    } else {
231
0
0
      print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
232    }
233  }
234
9
14
  my (@patterns_re_list, %in_patterns_re_list);
235
9
56
  if (-e "$configuration/patterns.txt") {
236
0
0
    @patterns_re_list = file_to_list "$configuration/patterns.txt";
237
0
0
    $patterns_re = list_to_re @patterns_re_list;
238
0
0
0
0
    %in_patterns_re_list = map {$_ => 1} @patterns_re_list;
239  } else {
240
9
7
    $patterns_re = undef;
241  }
242
243
9
37
  if (-e "$configuration/forbidden.txt") {
244
1
1
    my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
245
1
1
0
2
    @forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
246
1
1
1
2
    %forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
247
1
2
    $forbidden_re = list_to_re @forbidden_re_list;
248  } else {
249
8
8
    $forbidden_re = undef;
250  }
251
252
9
36
  if (-e "$configuration/candidates.txt") {
253
1
1
    @candidates_re_list = file_to_list "$configuration/candidates.txt";
254
1
2
2
1
2
5
    @candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} || !test_re($quoted) ? '' : $quoted } @candidates_re_list;
255
1
2
    $candidates_re = list_to_re @candidates_re_list;
256  } else {
257
8
5
    $candidates_re = undef;
258  }
259
260
9
10
  our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
261
262
9
8
  my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
263
9
9
  our $disable_word_collating = $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/;
264
9
9
  our $disable_minified_file = $disable_flags =~ /(?:^|,|\s)minified-file(?:,|\s|$)/;
265
9
9
  our $disable_single_line_file = $disable_flags =~ /(?:^|,|\s)single-line-file(?:,|\s|$)/;
266
267
9
5
  our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
268
9
9
  $ignore_next_line_pattern =~ s/\s+/|/g;
269
270
9
6
  our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
271
272
9
8
  our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
273
274
9
14
  $word_match = valid_word();
275
276
9
13
  our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
277
9
44
  $base_dict = '/usr/share/dict/words' unless -e $base_dict;
278
9
10
  load_dictionary($base_dict);
279}
280
281sub split_line {
282
1156
509
  our (%dictionary, $word_match, $disable_word_collating);
283
1156
453
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
284
1156
454
  our @hunspell_dictionaries;
285
1156
471
  our $shortest;
286
1156
764
  my $shortest_threshold = $shortest + 2;
287
1156
641
  my $pattern = '.';
288  # $pattern = "(?:$upper_pattern){$shortest,}|$upper_pattern(?:$lower_pattern){2,}\n";
289
290  # https://www.fileformat.info/info/unicode/char/2019/
291
1156
517
  my $rsqm = "\xE2\x80\x99";
292
293
1156
660
  my ($words, $unrecognized) = (0, 0);
294
1156
784
  my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
295
1156
5520
    $line =~ s/(?:$rsqm|&apos;|&#39;|\%27|&#8217;|&#x2019;|&rsquo;|\\u2019|\x{2019}|')+/'/g;
296
1156
2293
    $line =~ s/(?:$ignore_pattern)+/ /g;
297
1156
1699
    while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
298
1156
3277
    while ($line =~ s/((?:$lower_pattern|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
299
1156
1364
    for my $token (split /\s+/, $line) {
300
3619
3244
      next unless $token =~ /$pattern/;
301
2464
2008
      $token =~ s/^(?:'|$rsqm)+//g;
302
2464
2506
      $token =~ s/(?:'|$rsqm)+s?$//g;
303
2464
1429
      my $raw_token = $token;
304
2464
1538
      $token =~ s/^[^Ii]?'+(.*)/$1/;
305
2464
1260
      $token =~ s/(.*?)'+$/$1/;
306
2464
3787
      next unless $token =~ $word_match;
307
2311
2146
      if (defined $dictionary{$token}) {
308
1032
467
        ++$words;
309
1032
535
        $unique_ref->{$token}=1;
310
1032
851
        next;
311      }
312
1279
928
      if (@hunspell_dictionaries) {
313
1254
596
        my $found = 0;
314
1254
781
        for my $hunspell_dictionary (@hunspell_dictionaries) {
315          my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
316
1254
1021
            encode($hunspell_dictionary->{'encoding'}, $token) : $token;
317
1254
2841
          next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
318
0
0
          ++$words;
319
0
0
          $dictionary{$token} = 1;
320
0
0
          $unique_ref->{$token}=1;
321
0
0
          $found = 1;
322
0
0
          last;
323        }
324
1254
958
        next if $found;
325      }
326
1279
930
      my $key = lc $token;
327
1279
1046
      if (defined $dictionary{$key}) {
328
6
2
        ++$words;
329
6
5
        $unique_ref->{$key}=1;
330
6
8
        next;
331      }
332
1273
905
      unless ($disable_word_collating) {
333
1273
738
        $key =~ s/''+/'/g;
334
1273
1350
        $key =~ s/'[sd]$// unless length $key >= $shortest_threshold;
335      }
336
1273
1003
      if (defined $dictionary{$key}) {
337
0
0
        ++$words;
338
0
0
        $unique_ref->{$key}=1;
339
0
0
        next;
340      }
341
1273
590
      ++$unrecognized;
342
1273
736
      $unique_unrecognized_ref->{$raw_token}=1;
343
1273
1641
      $unrecognized_line_items_ref->{$raw_token}=1;
344    }
345
1156
1538
    return ($words, $unrecognized);
346}
347
348sub skip_file {
349
7
24
  my ($temp_dir, $reason) = @_;
350
7
180
  open(SKIPPED, '>:utf8', "$temp_dir/skipped");
351
7
48
  print SKIPPED $reason;
352
7
88
  close SKIPPED;
353}
354
355sub split_file {
356
16
11088
  my ($file) = @_;
357  our (
358
16
12
    $unrecognized, $shortest, $largest_file, $words,
359    $word_match, %unique, %unique_unrecognized, $forbidden_re,
360    @forbidden_re_list, $patterns_re, %dictionary,
361    $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
362    $disable_single_line_file,
363    $ignore_next_line_pattern,
364    $sandbox,
365  );
366
16
34
  $ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
367
368
16
7
  our %forbidden_re_descriptions;
369
16
8
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
370
371  # https://www.fileformat.info/info/unicode/char/2019/
372
16
7
  my $rsqm = "\xE2\x80\x99";
373
374
16
19
  my @candidates_re_hits = (0) x scalar @candidates_re_list;
375
16
8
  my @candidates_re_lines = (0) x scalar @candidates_re_list;
376
16
13
  my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
377
16
9
  my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
378
16
26
  my $temp_dir = tempdir(DIR=>$sandbox);
379
16
2301
  print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
380
16
366
  open(NAME, '>', "$temp_dir/name");
381
16
34
    print NAME $file;
382
16
191
  close NAME;
383
16
51
  my $file_size = -s $file;
384
16
15
  if (defined $largest_file) {
385
16
17
    unless ($check_file_names eq $file) {
386
16
17
      if ($file_size > $largest_file) {
387
1
2
        skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
388
1
3
        return $temp_dir;
389      }
390    }
391  }
392
15
168
  if (defined readlink($file) &&
393      rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
394
1
5
    skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n");
395
1
3
    return $temp_dir;
396  }
397
14
18
  if ($use_magic_file) {
398
8
11396
    if (open(my $file_fh, '-|',
399              '/usr/bin/file',
400              '-b',
401              '--mime',
402              '-e', 'cdf',
403              '-e', 'compress',
404              '-e', 'csv',
405              '-e', 'elf',
406              '-e', 'json',
407              '-e', 'tar',
408              $file)) {
409
8
28098
      my $file_kind = <$file_fh>;
410
8
4880
      close $file_fh;
411
8
108
      if ($file_kind =~ /^(.*?); charset=binary/) {
412
2
29
        skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
413
2
40
        return $temp_dir;
414      }
415    }
416  }
417
12
105
  open FILE, '<', $file;
418
12
10
  binmode FILE;
419
12
7
  my $head;
420
12
81
  read(FILE, $head, 4096);
421
12
775
  $head =~ s/(?:\r|\n)+$//;
422
12
44
  my $dos_new_lines = () = $head =~ /\r\n/gi;
423
12
36
  my $unix_new_lines = () = $head =~ /\n/gi;
424
12
96
  my $mac_new_lines = () = $head =~ /\r/gi;
425
12
46
  local $/;
426
12
61
  if ($unix_new_lines == 0 && $mac_new_lines == 0) {
427
3
12
    $/ = "\n";
428  } elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
429
1
6
    $/ = "\r\n";
430  } elsif ($mac_new_lines > $unix_new_lines) {
431
2
5
    $/ = "\r";
432  } else {
433
6
7
    $/ = "\n";
434  }
435
12
17
  seek(FILE, 0, 0);
436
12
24
  ($words, $unrecognized) = (0, 0);
437
12
38
  %unique = ();
438
12
27
  %unique_unrecognized = ();
439
440  local $SIG{__WARN__} = sub {
441
0
0
    my $message = shift;
442
0
0
    $message =~ s/> line/> in $file - line/;
443
0
0
    chomp $message;
444
0
0
    print STDERR "$message\n";
445
12
92
  };
446
447
12
305
  open(WARNINGS, '>:utf8', "$temp_dir/warnings");
448
12
7
  our $timeout;
449
12
9
  eval {
450
12
0
70
0
    local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
451
12
25
    alarm $timeout;
452
453
12
18
    my $ignore_next_line = 0;
454
12
10
    my $offset = 0;
455
12
85
    while (<FILE>) {
456
1159
1036
      if ($. == 1) {
457
12
22
        unless ($disable_minified_file) {
458
12
46
          if ($file_size >= 512 && length($_) == $file_size) {
459
1
7
            skip_file($temp_dir, "file only has a single line (single-line-file)\n");
460
1
4
            last;
461          }
462        }
463      }
464
1158
2394
      $_ = decode_utf8($_, FB_DEFAULT);
465
1158
2684
      if (/[\x{D800}-\x{DFFF}]/) {
466
0
0
        skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
467
0
0
        last;
468      }
469
1158
1495
      s/\R$//;
470
1158
1028
      s/^\x{FEFF}// if $. == 1;
471
1158
1053
      next unless /./;
472
1157
780
      my $raw_line = $_;
473
474
1157
582
      my $ignore_this_line = $ignore_next_line;
475
1157
955
      $ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
476
1157
774
      next if $ignore_this_line;
477
478      # hook for custom line based text exclusions:
479
1156
731
      if (defined $patterns_re) {
480
2
6
10
26
        s/($patterns_re)/"="x length($1)/ge;
481      }
482
1156
637
      my $initial_line_state = $_;
483
1156
737
      my $previous_line_state = $_;
484
1156
507
      my $line_flagged;
485
1156
826
      if ($forbidden_re) {
486
9
5
61
11
        while (s/($forbidden_re)/"="x length($1)/e) {
487
5
4
          $line_flagged = 1;
488
5
8
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
489
5
4
          my $found_trigger_re;
490
5
4
          for my $i (0 .. $#forbidden_re_list) {
491
7
6
            my $forbidden_re_singleton = $forbidden_re_list[$i];
492
7
5
            my $test_line = $previous_line_state;
493
7
4
57
6
            if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
494
4
5
              next unless $test_line eq $_;
495
4
8
              my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
496
4
3
              next unless $begin == $begin_test;
497
4
3
              next unless $end == $end_test;
498
4
6
              next unless $match eq $match_test;
499
4
4
              $found_trigger_re = $forbidden_re_singleton;
500
4
7
              my $hit = "$.:$begin:$end";
501
4
3
              $forbidden_re_hits[$i]++;
502
4
6
              $forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
503
4
5
              last;
504            }
505          }
506
5
5
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
507
5
6
          if ($found_trigger_re) {
508
4
8
            my $description = $forbidden_re_descriptions{$found_trigger_re} || '';
509
4
8
            $found_trigger_re =~ s/^\(\?:(.*)\)$/$1/;
510
4
4
            my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
511
4
5
            if ($description ne '') {
512
3
13
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
513            } else {
514
1
5
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
515            }
516          } else {
517
1
4
            print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
518          }
519
5
32
          $previous_line_state = $_;
520        }
521
9
8
        $_ = $initial_line_state;
522      }
523      # This is to make it easier to deal w/ rules:
524
1156
1068
      s/^/ /;
525
1156
666
      my %unrecognized_line_items = ();
526
1156
929
      my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
527
1156
604
      $words += $new_words;
528
1156
538
      $unrecognized += $new_unrecognized;
529
1156
835
      my $line_length = length($raw_line);
530
1156
1486
      for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
531
1020
457
        my $found_token = 0;
532
1020
476
        my $raw_token = $token;
533
1020
517
        $token =~ s/'/(?:'|\x{2019}|\&apos;|\&#39;)+/g;
534
1020
430
        my $before;
535
1020
1525
        if ($token =~ /^$upper_pattern$lower_pattern/) {
536
5
3
          $before = '(?<=.)';
537        } elsif ($token =~ /^$upper_pattern/) {
538
0
0
          $before = "(?<!$upper_pattern)";
539        } else {
540
1015
679
          $before = "(?<=$not_lower_pattern)";
541        }
542
1020
1151
        my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
543
1020
2098
        while ($raw_line =~ /(?:\b|$before)($token)(?:\b|$after)/g) {
544
1270
615
          $line_flagged = 1;
545
1270
553
          $found_token = 1;
546
1270
1657
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
547
1270
1156
          next unless $match =~ /./;
548
1270
941
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
549
1270
4897
          print WARNINGS ":$.:$begin ... $end: $wrapped\n";
550        }
551
1020
1132
        unless ($found_token) {
552
3
27
          if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
553
3
7
            my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
554
3
3
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
555
3
11
            print WARNINGS ":$.:$begin ... $end: $wrapped\n";
556          } else {
557
0
0
            my $offset = $line_length + 1;
558
0
0
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
559
0
0
            print WARNINGS ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
560          }
561        }
562      }
563
1156
1884
      if ($line_flagged && $candidates_re) {
564
1
1
        $_ = $previous_line_state = $initial_line_state;
565
1
1
17
2
        s/($candidates_re)/"="x length($1)/ge;
566
1
2
        if ($_ ne $initial_line_state) {
567
1
1
          $_ = $previous_line_state;
568
1
2
          for my $i (0 .. $#candidates_re_list) {
569
2
2
            my $candidate_re = $candidates_re_list[$i];
570
2
13
            next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
571
1
1
6
2
            if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
572
1
2
              my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
573
1
2
              my $hit = "$.:$begin:$end";
574
1
1
              $_ = $previous_line_state;
575
1
1
4
2
              my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
576
1
4
              $candidates_re_hits[$i] += $replacements;
577
1
2
              $candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
578
1
2
              $_ = $previous_line_state;
579            }
580          }
581        }
582      }
583
1156
871
      unless ($disable_minified_file) {
584
1156
883
        s/={3,}//g;
585
1156
767
        $offset += length;
586
1156
1048
        my $ratio = int($offset / $.);
587
1156
564
        my $ratio_threshold = 1000;
588
1156
3009
        if ($ratio > $ratio_threshold) {
589
2
6
          skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
590
2
18
          last;
591        }
592      }
593    }
594
595
12
68
    alarm 0;
596  };
597
12
9
  if ($@) {
598
0
0
    die unless $@ eq "alarm\n";
599
0
0
    print WARNINGS ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
600
0
0
    skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
601
0
0
    last;
602  }
603
604
12
36
  close FILE;
605
12
123
  close WARNINGS;
606
607
12
23
  if ($unrecognized || @candidates_re_hits || @forbidden_re_hits) {
608
11
277
    open(STATS, '>:utf8', "$temp_dir/stats");
609
11
139
      print STATS "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
610      ", unique: ".(keys %unique).
611      (@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
612      (@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
613      (@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
614      (@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
615      "}";
616
11
108
    close STATS;
617
11
211
    open(UNKNOWN, '>:utf8', "$temp_dir/unknown");
618
11
19
37
28
      print UNKNOWN map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
619
11
89
    close UNKNOWN;
620  }
621
622
12
95
  return $temp_dir;
623}
624
625sub main {
626
2
387
  my ($configuration, @ARGV) = @_;
627
2
1
  our %dictionary;
628
2
2
  unless (%dictionary) {
629
1
2
    init($configuration);
630  }
631
632  # read all input
633
2
1
  my @reports;
634
635
2
2
  for my $file (@ARGV) {
636
2
3
    my $temp_dir = split_file($file);
637
2
4
    push @reports, "$temp_dir\n";
638  }
639
2
6
  print join '', @reports;
640}
641
6421;