File Coverage

File:lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:81.1%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3# ~/bin/w
4# Search for potentially misspelled words
5# Output is:
6# misspellled
7# woord (WOORD, Woord, woord, woord's)
8package CheckSpelling::UnknownWordSplitter;
9
10
1
1
113612
1
use 5.022;
11
1
1
1
2
1
53
use feature 'unicode_strings';
12
1
1
1
2
2
9
use strict;
13
1
1
1
1
1
18
use warnings;
14
1
1
1
1
1
16
no warnings qw(experimental::vlb);
15
1
1
1
1
0
2
use utf8;
16
1
1
1
14
1
34
use Encode qw/decode_utf8 encode FB_DEFAULT/;
17
1
1
1
2
0
30
use File::Basename;
18
1
1
1
2
0
15
use Cwd 'abs_path';
19
1
1
1
2
0
22
use File::Spec;
20
1
1
1
2
0
21
use File::Temp qw/ tempfile tempdir /;
21
1
1
1
1
1
14
use File::Path qw/ make_path /;
22
1
1
1
293
1
20
use CheckSpelling::Util;
23
1
1
1
205
1338
3663
use Digest::SHA;
24our $VERSION='0.1.0';
25
26my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
27my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
28my ($shortest, $longest) = (255, 0);
29my @forbidden_re_list;
30my %forbidden_re_descriptions;
31my @candidates_re_list;
32my $hunspell_dictionary_path;
33my @hunspell_dictionaries;
34my %dictionary = ();
35my $base_dict;
36my %unique;
37my %unique_unrecognized;
38my ($last_file, $words, $unrecognized) = ('', 0, 0);
39my ($ignore_next_line_pattern);
40my ($check_images, $ocr_directory);
41
42my $disable_flags;
43
44sub test_re {
45
14
10
  my ($expression) = @_;
46
14
14
10
100
  return eval { qr /$expression/ };
47}
48
49sub quote_re {
50
14
11
  my ($expression) = @_;
51
14
11
  return $expression if $expression =~ /\?\{/;
52
14
31
  $expression =~ s/
53   \G
54   (
55      (?:[^\\]|\\[^Q])*
56   )
57   (?:
58      \\Q
59      (?:[^\\]|\\[^E])*
60      (?:\\E)?
61   )?
62/
63
28
49
   $1 . (defined($2) ? quotemeta($2) : '')
64/xge;
65
14
15
  return $expression;
66}
67
68sub file_to_lists {
69
3
4
  my ($re) = @_;
70
3
5
  my @patterns;
71  my %hints;
72
3
0
  my $fh;
73
3
32
  if (open($fh, '<:utf8', $re)) {
74
3
6
    local $/=undef;
75
3
24
    my $file=<$fh>;
76
3
10
    close $fh;
77
3
1
    my $line_number = 0;
78
3
4
    my $hint = '';
79
3
13
    for (split /\R/, $file) {
80
17
8
      ++$line_number;
81
17
14
      chomp;
82
17
15
      if (/^#(?:\s(.+)|)/) {
83
6
13
        $hint = $1 if ($hint eq '' && defined $1);
84
6
6
        next;
85      }
86
11
16
      $hint = '' unless $_ ne '';
87
11
11
      next if $_ eq '$^';
88
11
5
      my $pattern = $_;
89
11
34
      next unless s/^(.+)/(?:$1)/;
90
7
8
      my $quoted = quote_re($1);
91
7
7
      unless (test_re $quoted) {
92
1
1
        my $error = $@;
93
1
54
        my $home = dirname(__FILE__);
94
1
25
        $error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
95
1
12
        print STDERR $error;
96
1
2
        $_ = '(?:\$^ - skipped because bad-regex)';
97
1
2
        $hint = '';
98      }
99
7
10
      if (defined $hints{$_}) {
100
1
2
        my $pattern_length = length $pattern;
101
1
1
        my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
102
1
16
        print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
103
1
3
        $_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
104      } else {
105
6
6
        push @patterns, $_;
106
6
8
        $hints{$_} = $hint;
107      }
108
7
11
      $hint = '';
109    }
110  }
111
112  return {
113
3
16
    patterns => \@patterns,
114    hints => \%hints,
115  };
116}
117
118sub file_to_list {
119
2
1354
  my ($re) = @_;
120
2
5
  my $lists = file_to_lists($re);
121
122
2
2
2
7
  return @{$lists->{'patterns'}};
123}
124
125sub list_to_re {
126
2
2
  my (@list) = @_;
127
2
5
5
2
5
6
  @list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
128
2
5
3
5
  @list = grep { $_ ne '' } @list;
129
2
2
  return '$^' unless scalar @list;
130
2
6
  return join "|", (@list);
131}
132
133sub not_empty {
134
73
56
  my ($thing) = @_;
135
73
371
  return defined $thing && $thing ne '' && $thing =~ /^\d+$/;
136}
137
138sub valid_word {
139  # shortest_word is an absolute
140
22
11
  our ($shortest, $longest, $shortest_word, $longest_word);
141
22
19
  $shortest = $shortest_word if $shortest_word;
142
22
18
  if ($longest_word) {
143    # longest_word is an absolute
144
20
23
    $longest = $longest_word;
145  } elsif (not_empty($longest)) {
146    # we allow for some sloppiness (a couple of stuck keys per word)
147    # it's possible that this should scale with word length
148
1
1
    $longest += 2;
149  }
150
22
16
  our ($upper_pattern, $lower_pattern, $punctuation_pattern);
151
22
66
15
134
  my $word_pattern = join '|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
152
22
17
  $word_pattern = q<\\w|'> unless $word_pattern;
153
22
43
  if ((defined $shortest && not_empty($longest)) &&
154      ($shortest > $longest)) {
155
0
0
    $word_pattern = "(?:$word_pattern){3}";
156
0
0
    return qr/$word_pattern/;
157  }
158
22
21
  $shortest = 3 unless defined $shortest;
159
22
19
  $longest = '' unless not_empty($longest);
160
22
82
  $word_match = "(?:$word_pattern){$shortest,$longest}";
161
22
189
  return qr/\b$word_match\b/;
162}
163
164sub load_dictionary {
165
12
2048
  my ($dict) = @_;
166
12
9
  our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
167
12
12
  $longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
168
12
10
  $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef);
169
12
9
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
170
12
10
  $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
171
12
47
  $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
172
12
32
  $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
173
12
25
  $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
174
12
25
  $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
175
12
25
  $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
176
12
34
  %dictionary = ();
177
178
12
537
  open(my $dict_fh, '<:utf8', $dict);
179
12
55
  while (!eof($dict_fh)) {
180
31
29
    my $word = <$dict_fh>;
181
31
28
    chomp $word;
182
31
83
    next unless $word =~ $word_match;
183
28
26
    my $l = length $word;
184
28
19
    $longest = -1 unless not_empty($longest);
185
28
27
    $longest = $l if $l > $longest;
186
28
24
    $shortest = $l if $l < $shortest;
187
28
63
    $dictionary{$word}=1;
188  }
189
12
32
  close $dict_fh;
190
191
12
8
  $word_match = valid_word();
192}
193
194sub hunspell_dictionary {
195
3
4
  my ($dict) = @_;
196
3
3
  my $name = $dict;
197
3
3
  $name =~ s{/src/index/hunspell/index\.dic$}{};
198
3
13
  $name =~ s{.*/}{};
199
3
4
  my $aff = $dict;
200
3
0
  my $encoding;
201
3
6
  $aff =~ s/\.dic$/.aff/;
202
3
36
  if (open my $aff_fh, '<', $aff) {
203
3
22
    while (<$aff_fh>) {
204
0
0
      next unless /^SET\s+(\S+)/;
205
0
0
      $encoding = $1 if ($1 !~ /utf-8/i);
206
0
0
      last;
207    }
208
3
10
    close $aff_fh;
209  }
210  return {
211
3
317
    name => $name,
212    dict => $dict,
213    aff => $aff,
214    encoding => $encoding,
215    engine => Text::Hunspell->new($aff, $dict),
216  }
217}
218
219sub init {
220
9
11718
  my ($configuration) = @_;
221
9
13
  our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
222
9
25
  our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
223
9
10
  our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
224
9
17
  our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
225
9
5
  our %forbidden_re_descriptions;
226
9
10
  if ($hunspell_dictionary_path) {
227
3
37
    our @hunspell_dictionaries = ();
228
1
1
1
1
1
1
1
1
1
3
184
1078
25
5
1
13
6
3
15
151
    if (eval 'use Text::Hunspell; 1') {
229
3
132
      my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
230
3
10
      for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
231
3
9
        push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
232      }
233    } else {
234
0
0
      print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
235    }
236  }
237
9
13
  my (@patterns_re_list, %in_patterns_re_list);
238
9
55
  if (-e "$configuration/patterns.txt") {
239
0
0
    @patterns_re_list = file_to_list "$configuration/patterns.txt";
240
0
0
    $patterns_re = list_to_re @patterns_re_list;
241
0
0
0
0
    %in_patterns_re_list = map {$_ => 1} @patterns_re_list;
242  } else {
243
9
7
    $patterns_re = undef;
244  }
245
246
9
39
  if (-e "$configuration/forbidden.txt") {
247
1
2
    my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
248
1
1
1
1
    @forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
249
1
1
1
4
    %forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
250
1
2
    $forbidden_re = list_to_re @forbidden_re_list;
251  } else {
252
8
10
    $forbidden_re = undef;
253  }
254
255
9
38
  if (-e "$configuration/candidates.txt") {
256
1
2
    @candidates_re_list = file_to_list "$configuration/candidates.txt";
257
1
2
2
2
2
5
    @candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} || !test_re($quoted) ? '' : $quoted } @candidates_re_list;
258
1
1
    $candidates_re = list_to_re @candidates_re_list;
259  } else {
260
8
6
    $candidates_re = undef;
261  }
262
263
9
11
  our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
264
265
9
9
  my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
266
9
9
  our $disable_word_collating = $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/;
267
9
10
  our $disable_minified_file = $disable_flags =~ /(?:^|,|\s)minified-file(?:,|\s|$)/;
268
9
6
  our $disable_single_line_file = $disable_flags =~ /(?:^|,|\s)single-line-file(?:,|\s|$)/;
269
270
9
10
  our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
271
9
7
  $ignore_next_line_pattern =~ s/\s+/|/g;
272
273
9
10
  our $check_images = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_IMAGES', '');
274
9
7
  $check_images = $check_images =~ /^(?:1|true)$/i;
275
9
8
  if ($check_images) {
276
0
0
    our $ocr_directory = CheckSpelling::Util::get_file_from_env('ocr_directory', '/tmp/ocr');
277
0
0
    $ocr_directory = $1 if ($ocr_directory =~ /^(.*)$/);
278  }
279
280
9
9
  our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
281
282
9
9
  our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
283
284
9
12
  $word_match = valid_word();
285
286
9
22
  our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
287
9
46
  $base_dict = '/usr/share/dict/words' unless -e $base_dict;
288
9
19
  load_dictionary($base_dict);
289}
290
291sub split_line {
292
1156
525
  our (%dictionary, $word_match, $disable_word_collating);
293
1156
425
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
294
1156
422
  our @hunspell_dictionaries;
295
1156
533
  our $shortest;
296
1156
731
  my $shortest_threshold = $shortest + 2;
297
1156
584
  my $pattern = '.';
298  # $pattern = "(?:$upper_pattern){$shortest,}|$upper_pattern(?:$lower_pattern){2,}\n";
299
300  # https://www.fileformat.info/info/unicode/char/2019/
301
1156
562
  my $rsqm = "\xE2\x80\x99";
302
303
1156
664
  my ($words, $unrecognized) = (0, 0);
304
1156
806
  my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
305
1156
5358
    $line =~ s/(?:$rsqm|&apos;|&#39;|\%27|&#8217;|&#x2019;|&rsquo;|\\u2019|\x{2019}|')+/'/g;
306
1156
2211
    $line =~ s/(?:$ignore_pattern)+/ /g;
307
1156
1684
    while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
308
1156
3234
    while ($line =~ s/((?:$lower_pattern|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
309
1156
1360
    for my $token (split /\s+/, $line) {
310
3619
3259
      next unless $token =~ /$pattern/;
311
2464
2190
      $token =~ s/^(?:'|$rsqm)+//g;
312
2464
2613
      $token =~ s/(?:'|$rsqm)+s?$//g;
313
2464
1405
      my $raw_token = $token;
314
2464
1436
      $token =~ s/^[^Ii]?'+(.*)/$1/;
315
2464
1205
      $token =~ s/(.*?)'+$/$1/;
316
2464
3713
      next unless $token =~ $word_match;
317
2311
2016
      if (defined $dictionary{$token}) {
318
1032
536
        ++$words;
319
1032
496
        $unique_ref->{$token}=1;
320
1032
844
        next;
321      }
322
1279
976
      if (@hunspell_dictionaries) {
323
1254
626
        my $found = 0;
324
1254
711
        for my $hunspell_dictionary (@hunspell_dictionaries) {
325          my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
326
1254
1028
            encode($hunspell_dictionary->{'encoding'}, $token) : $token;
327
1254
2731
          next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
328
0
0
          ++$words;
329
0
0
          $dictionary{$token} = 1;
330
0
0
          $unique_ref->{$token}=1;
331
0
0
          $found = 1;
332
0
0
          last;
333        }
334
1254
953
        next if $found;
335      }
336
1279
954
      my $key = lc $token;
337
1279
1131
      if (defined $dictionary{$key}) {
338
6
3
        ++$words;
339
6
4
        $unique_ref->{$key}=1;
340
6
7
        next;
341      }
342
1273
894
      unless ($disable_word_collating) {
343
1273
680
        $key =~ s/''+/'/g;
344
1273
1334
        $key =~ s/'[sd]$// unless length $key >= $shortest_threshold;
345      }
346
1273
1099
      if (defined $dictionary{$key}) {
347
0
0
        ++$words;
348
0
0
        $unique_ref->{$key}=1;
349
0
0
        next;
350      }
351
1273
631
      ++$unrecognized;
352
1273
873
      $unique_unrecognized_ref->{$raw_token}=1;
353
1273
1759
      $unrecognized_line_items_ref->{$raw_token}=1;
354    }
355
1156
1645
    return ($words, $unrecognized);
356}
357
358sub skip_file {
359
7
24
  my ($temp_dir, $reason) = @_;
360
7
234
  open(my $skipped_fh, '>:utf8', "$temp_dir/skipped");
361
7
36
  print $skipped_fh $reason;
362
7
117
  close $skipped_fh;
363}
364
365sub maybe_ocr_file {
366
0
0
  my ($file) = @_;
367
0
0
  our $ocr_directory;
368
0
0
  my $ocr_file = "$ocr_directory/$file";
369
0
0
  $ocr_file =~ /^(.*)$/;
370
0
0
  $ocr_file = $1;
371
0
0
  my $ocr_source_sha = "$ocr_file.sha1";
372
0
0
  $ocr_file = "$ocr_file.txt";
373
0
0
  my $sha = Digest::SHA->new(1)->addfile($file, 'b')->hexdigest;
374
0
0
  if (-e $ocr_file &&
375      -e $ocr_source_sha &&
376      open my $source_sha, '<', $ocr_source_sha) {
377
0
0
    my $last_sha = <$source_sha>;
378
0
0
    close $source_sha;
379
0
0
    if ($last_sha =~ /(.*)/) {
380
0
0
      return ($ocr_file, 1) if ($1 eq $sha);
381    }
382  }
383
0
0
  my $tesseract = dirname(dirname(dirname(__FILE__)))."/wrappers/run-tesseract";
384
0
0
  $ENV{'input'} = $file;
385
0
0
  my $text_file = `"$tesseract"`;
386
0
0
  delete $ENV{'input'};
387
0
0
  return ($file, 0) unless defined $text_file;
388
0
0
  my $file_converted = 0;
389
0
0
  chomp $text_file;
390
0
0
  if ($text_file =~ /^(.*)$/) {
391
0
0
    $text_file = $1;
392
0
0
    my $file_size = -s $text_file;
393
0
0
    if ($file_size > 20) {
394
0
0
      $file_converted = 1;
395
0
0
      make_path(dirname($ocr_source_sha));
396
0
0
      open my $source_sha, '>', $ocr_source_sha;
397
0
0
      print $source_sha $sha;
398
0
0
      close $source_sha;
399
0
0
      rename($text_file, $ocr_file);
400
0
0
      $file = $ocr_file;
401    } else {
402
0
0
      unlink($text_file);
403    }
404  }
405
0
0
  return ($file, $file_converted);
406}
407
408sub split_file {
409
16
12344
  my ($file) = @_;
410  our (
411
16
9
    $unrecognized, $shortest, $largest_file, $words,
412    $word_match, %unique, %unique_unrecognized, $forbidden_re,
413    @forbidden_re_list, $patterns_re, %dictionary,
414    $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
415    $disable_single_line_file,
416    $ignore_next_line_pattern,
417    $sandbox,
418    $check_images,
419  );
420
16
30
  $ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
421
422
16
8
  our %forbidden_re_descriptions;
423
16
9
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
424
425  # https://www.fileformat.info/info/unicode/char/2019/
426
16
10
  my $rsqm = "\xE2\x80\x99";
427
428
16
16
  my @candidates_re_hits = (0) x scalar @candidates_re_list;
429
16
12
  my @candidates_re_lines = (0) x scalar @candidates_re_list;
430
16
9
  my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
431
16
14
  my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
432
16
37
  my $temp_dir = tempdir(DIR=>$sandbox);
433
16
2480
  print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
434
16
398
  open(my $name_fh, '>', "$temp_dir/name");
435
16
37
    print $name_fh $file;
436
16
213
  close $name_fh;
437
16
205
  if (defined readlink($file) &&
438      rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
439
1
3
    skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n");
440
1
5
    return $temp_dir;
441  }
442
15
26
  if ($use_magic_file) {
443
8
12484
    if (open(my $file_fh, '-|',
444              '/usr/bin/file',
445              '-b',
446              '--mime',
447              '-e', 'cdf',
448              '-e', 'compress',
449              '-e', 'csv',
450              '-e', 'elf',
451              '-e', 'json',
452              '-e', 'tar',
453              $file)) {
454
8
30387
      my $file_kind = <$file_fh>;
455
8
4888
      close $file_fh;
456
8
24
      my $file_converted = 0;
457
8
23
      if ($check_images && $file_kind =~ m<^image/>) {
458
0
0
        ($file, $file_converted) = maybe_ocr_file($file);
459      }
460
8
198
      if ($file_converted == 0 && $file_kind =~ /^(.*?); charset=binary/) {
461
2
36
        skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
462
2
58
        return $temp_dir;
463      }
464    }
465  } elsif ($file =~ /\.(?:png|jpe?g|gif)$/) {
466
0
0
    my $file_converted = 0;
467
0
0
    ($file, $file_converted) = maybe_ocr_file($file);
468  }
469
13
76
  my $file_size = -s $file;
470
13
14
  if (defined $largest_file) {
471
13
14
    unless ($check_file_names eq $file) {
472
13
14
      if ($file_size > $largest_file) {
473
1
2
        skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
474
1
3
        return $temp_dir;
475      }
476    }
477  }
478
12
142
  open my $file_fh, '<', $file;
479
12
11
  binmode $file_fh;
480
12
6
  my $head;
481
12
129
  read($file_fh, $head, 4096);
482
12
801
  $head =~ s/(?:\r|\n)+$//;
483
12
43
  my $dos_new_lines = () = $head =~ /\r\n/gi;
484
12
41
  my $unix_new_lines = () = $head =~ /\n/gi;
485
12
121
  my $mac_new_lines = () = $head =~ /\r/gi;
486
12
52
  local $/;
487
12
67
  if ($unix_new_lines == 0 && $mac_new_lines == 0) {
488
3
5
    $/ = "\n";
489  } elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
490
1
7
    $/ = "\r\n";
491  } elsif ($mac_new_lines > $unix_new_lines) {
492
2
7
    $/ = "\r";
493  } else {
494
6
8
    $/ = "\n";
495  }
496
12
30
  seek($file_fh, 0, 0);
497
12
18
  ($words, $unrecognized) = (0, 0);
498
12
38
  %unique = ();
499
12
28
  %unique_unrecognized = ();
500
501  local $SIG{__WARN__} = sub {
502
0
0
    my $message = shift;
503
0
0
    $message =~ s/> line/> in $file - line/;
504
0
0
    chomp $message;
505
0
0
    print STDERR "$message\n";
506
12
119
  };
507
508
12
322
  open(my $warnings_fh, '>:utf8', "$temp_dir/warnings");
509
12
7
  our $timeout;
510
12
12
  eval {
511
12
0
96
0
    local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
512
12
58
    alarm $timeout;
513
514
12
19
    my $ignore_next_line = 0;
515
12
13
    my $offset = 0;
516
12
86
    while (<$file_fh>) {
517
1159
1012
      if ($. == 1) {
518
12
11
        unless ($disable_minified_file) {
519
12
68
          if ($file_size >= 512 && length($_) == $file_size) {
520
1
11
            skip_file($temp_dir, "file only has a single line (single-line-file)\n");
521
1
2
            last;
522          }
523        }
524      }
525
1158
2409
      $_ = decode_utf8($_, FB_DEFAULT);
526
1158
2808
      if (/[\x{D800}-\x{DFFF}]/) {
527
0
0
        skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
528
0
0
        last;
529      }
530
1158
1371
      s/\R$//;
531
1158
1010
      s/^\x{FEFF}// if $. == 1;
532
1158
1156
      next unless /./;
533
1157
760
      my $raw_line = $_;
534
535
1157
610
      my $ignore_this_line = $ignore_next_line;
536
1157
920
      $ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
537
1157
790
      next if $ignore_this_line;
538
539      # hook for custom line based text exclusions:
540
1156
829
      if (defined $patterns_re) {
541
2
6
10
8
        s/($patterns_re)/"="x length($1)/ge;
542      }
543
1156
639
      my $initial_line_state = $_;
544
1156
705
      my $previous_line_state = $_;
545
1156
600
      my $line_flagged;
546
1156
845
      if ($forbidden_re) {
547
9
5
63
13
        while (s/($forbidden_re)/"="x length($1)/e) {
548
5
5
          $line_flagged = 1;
549
5
12
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
550
5
4
          my $found_trigger_re;
551
5
5
          for my $i (0 .. $#forbidden_re_list) {
552
7
8
            my $forbidden_re_singleton = $forbidden_re_list[$i];
553
7
4
            my $test_line = $previous_line_state;
554
7
4
83
8
            if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
555
4
7
              next unless $test_line eq $_;
556
4
10
              my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
557
4
7
              next unless $begin == $begin_test;
558
4
4
              next unless $end == $end_test;
559
4
5
              next unless $match eq $match_test;
560
4
4
              $found_trigger_re = $forbidden_re_singleton;
561
4
10
              my $hit = "$.:$begin:$end";
562
4
4
              $forbidden_re_hits[$i]++;
563
4
7
              $forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
564
4
6
              last;
565            }
566          }
567
5
8
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
568
5
7
          if ($found_trigger_re) {
569
4
9
            my $description = $forbidden_re_descriptions{$found_trigger_re} || '';
570
4
12
            $found_trigger_re =~ s/^\(\?:(.*)\)$/$1/;
571
4
5
            my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
572
4
8
            if ($description ne '') {
573
3
18
              print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
574            } else {
575
1
5
              print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
576            }
577          } else {
578
1
4
            print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
579          }
580
5
30
          $previous_line_state = $_;
581        }
582
9
9
        $_ = $initial_line_state;
583      }
584      # This is to make it easier to deal w/ rules:
585
1156
1091
      s/^/ /;
586
1156
714
      my %unrecognized_line_items = ();
587
1156
919
      my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items, $warnings_fh);
588
1156
735
      $words += $new_words;
589
1156
611
      $unrecognized += $new_unrecognized;
590
1156
799
      my $line_length = length($raw_line);
591
1156
1600
      for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
592
1020
580
        my $found_token = 0;
593
1020
515
        my $raw_token = $token;
594
1020
505
        $token =~ s/'/(?:'|\x{2019}|\&apos;|\&#39;)+/g;
595
1020
473
        my $before;
596
1020
1550
        if ($token =~ /^$upper_pattern$lower_pattern/) {
597
5
5
          $before = '(?<=.)';
598        } elsif ($token =~ /^$upper_pattern/) {
599
0
0
          $before = "(?<!$upper_pattern)";
600        } else {
601
1015
604
          $before = "(?<=$not_lower_pattern)";
602        }
603
1020
1177
        my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
604
1020
2082
        while ($raw_line =~ /(?:\b|$before)($token)(?:\b|$after)/g) {
605
1270
585
          $line_flagged = 1;
606
1270
550
          $found_token = 1;
607
1270
1654
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
608
1270
1243
          next unless $match =~ /./;
609
1270
925
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
610
1270
4888
          print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
611        }
612
1020
1136
        unless ($found_token) {
613
3
29
          if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
614
3
4
            my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
615
3
4
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
616
3
12
            print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
617          } else {
618
0
0
            my $offset = $line_length + 1;
619
0
0
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
620
0
0
            print $warnings_fh ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
621          }
622        }
623      }
624
1156
1912
      if ($line_flagged && $candidates_re) {
625
1
1
        $_ = $previous_line_state = $initial_line_state;
626
1
1
17
3
        s/($candidates_re)/"="x length($1)/ge;
627
1
2
        if ($_ ne $initial_line_state) {
628
1
1
          $_ = $previous_line_state;
629
1
1
          for my $i (0 .. $#candidates_re_list) {
630
2
2
            my $candidate_re = $candidates_re_list[$i];
631
2
14
            next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
632
1
1
6
2
            if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
633
1
2
              my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
634
1
3
              my $hit = "$.:$begin:$end";
635
1
1
              $_ = $previous_line_state;
636
1
1
6
2
              my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
637
1
1
              $candidates_re_hits[$i] += $replacements;
638
1
5
              $candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
639
1
5
              $_ = $previous_line_state;
640            }
641          }
642        }
643      }
644
1156
849
      unless ($disable_minified_file) {
645
1156
885
        s/={3,}//g;
646
1156
778
        $offset += length;
647
1156
1018
        my $ratio = int($offset / $.);
648
1156
646
        my $ratio_threshold = 1000;
649
1156
3037
        if ($ratio > $ratio_threshold) {
650
2
7
          skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
651
2
6
          last;
652        }
653      }
654    }
655
656
12
85
    alarm 0;
657  };
658
12
11
  if ($@) {
659
0
0
    die unless $@ eq "alarm\n";
660
0
0
    print $warnings_fh ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
661
0
0
    skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
662
0
0
    return $temp_dir;
663  }
664
665
12
49
  close $file_fh;
666
12
141
  close $warnings_fh;
667
668
12
28
  if ($unrecognized || @candidates_re_hits || @forbidden_re_hits) {
669
11
348
    open(my $stats_fh, '>:utf8', "$temp_dir/stats");
670
11
177
      print $stats_fh "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
671      ", unique: ".(keys %unique).
672      (@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
673      (@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
674      (@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
675      (@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
676      "}";
677
11
146
    close $stats_fh;
678
11
277
    open(my $unknown_fh, '>:utf8', "$temp_dir/unknown");
679
11
19
46
28
      print $unknown_fh map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
680
11
130
    close $unknown_fh;
681  }
682
683
12
134
  return $temp_dir;
684}
685
686sub main {
687
2
407
  my ($configuration, @ARGV) = @_;
688
2
2
  our %dictionary;
689
2
3
  unless (%dictionary) {
690
1
1
    init($configuration);
691  }
692
693  # read all input
694
2
2
  my @reports;
695
696
2
1
  for my $file (@ARGV) {
697
2
2
    my $temp_dir = split_file($file);
698
2
3
    push @reports, "$temp_dir\n";
699  }
700
2
7
  print join '', @reports;
701}
702
7031;