File Coverage

File:lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:81.1%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3# ~/bin/w
4# Search for potentially misspelled words
5# Output is:
6# misspellled
7# woord (WOORD, Woord, woord, woord's)
8package CheckSpelling::UnknownWordSplitter;
9
10
1
1
110830
1
use 5.022;
11
1
1
1
2
0
41
use feature 'unicode_strings';
12
1
1
1
2
0
9
use strict;
13
1
1
1
2
0
17
use warnings;
14
1
1
1
2
0
14
no warnings qw(experimental::vlb);
15
1
1
1
2
0
2
use utf8;
16
1
1
1
12
1
26
use Encode qw/decode_utf8 encode FB_DEFAULT/;
17
1
1
1
1
1
24
use File::Basename;
18
1
1
1
2
0
13
use Cwd 'abs_path';
19
1
1
1
1
1
20
use File::Spec;
20
1
1
1
1
1
21
use File::Temp qw/ tempfile tempdir /;
21
1
1
1
1
1
13
use File::Path qw/ make_path /;
22
1
1
1
276
1
19
use CheckSpelling::Util;
23
1
1
1
220
1251
3706
use Digest::SHA;
24our $VERSION='0.1.0';
25
26my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
27my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
28my ($shortest, $longest) = (255, 0);
29my @forbidden_re_list;
30my %forbidden_re_descriptions;
31my @candidates_re_list;
32my $hunspell_dictionary_path;
33my @hunspell_dictionaries;
34my %dictionary = ();
35my $base_dict;
36my %unique;
37my %unique_unrecognized;
38my ($last_file, $words, $unrecognized) = ('', 0, 0);
39my ($ignore_next_line_pattern);
40my ($check_images, $ocr_directory);
41
42my $disable_flags;
43
44sub test_re {
45
14
9
  my ($expression) = @_;
46
14
14
7
106
  return eval { qr /$expression/ };
47}
48
49sub quote_re {
50
14
7
  my ($expression) = @_;
51
14
14
  return $expression if $expression =~ /\?\{/;
52
14
32
  $expression =~ s/
53   \G
54   (
55      (?:[^\\]|\\[^Q])*
56   )
57   (?:
58      \\Q
59      (?:[^\\]|\\[^E])*
60      (?:\\E)?
61   )?
62/
63
28
44
   $1 . (defined($2) ? quotemeta($2) : '')
64/xge;
65
14
17
  return $expression;
66}
67
68sub file_to_lists {
69
3
4
  my ($re) = @_;
70
3
3
  my @patterns;
71  my %hints;
72
3
0
  my $fh;
73
3
29
  if (open($fh, '<:utf8', $re)) {
74
3
5
    local $/=undef;
75
3
24
    my $file=<$fh>;
76
3
8
    close $fh;
77
3
2
    my $line_number = 0;
78
3
4
    my $hint = '';
79
3
14
    for (split /\R/, $file) {
80
17
10
      ++$line_number;
81
17
7
      chomp;
82
17
22
      if (/^#(?:\s(.+)|)/) {
83
6
14
        $hint = $1 if ($hint eq '' && defined $1);
84
6
4
        next;
85      }
86
11
12
      $hint = '' unless $_ ne '';
87
11
11
      next if $_ eq '$^';
88
11
6
      my $pattern = $_;
89
11
28
      next unless s/^(.+)/(?:$1)/;
90
7
9
      my $quoted = quote_re($1);
91
7
7
      unless (test_re $quoted) {
92
1
3
        my $error = $@;
93
1
51
        my $home = dirname(__FILE__);
94
1
23
        $error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
95
1
13
        print STDERR $error;
96
1
2
        $_ = '(?:\$^ - skipped because bad-regex)';
97
1
1
        $hint = '';
98      }
99
7
10
      if (defined $hints{$_}) {
100
1
2
        my $pattern_length = length $pattern;
101
1
2
        my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
102
1
16
        print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
103
1
2
        $_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
104      } else {
105
6
8
        push @patterns, $_;
106
6
10
        $hints{$_} = $hint;
107      }
108
7
12
      $hint = '';
109    }
110  }
111
112  return {
113
3
13
    patterns => \@patterns,
114    hints => \%hints,
115  };
116}
117
118sub file_to_list {
119
2
1313
  my ($re) = @_;
120
2
4
  my $lists = file_to_lists($re);
121
122
2
2
2
6
  return @{$lists->{'patterns'}};
123}
124
125sub list_to_re {
126
2
2
  my (@list) = @_;
127
2
5
5
1
5
4
  @list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
128
2
5
2
5
  @list = grep { $_ ne '' } @list;
129
2
3
  return '$^' unless scalar @list;
130
2
6
  return join "|", (@list);
131}
132
133sub not_empty {
134
73
61
  my ($thing) = @_;
135
73
343
  return defined $thing && $thing ne '' && $thing =~ /^\d+$/;
136}
137
138sub valid_word {
139  # shortest_word is an absolute
140
22
13
  our ($shortest, $longest, $shortest_word, $longest_word);
141
22
15
  $shortest = $shortest_word if $shortest_word;
142
22
19
  if ($longest_word) {
143    # longest_word is an absolute
144
20
26
    $longest = $longest_word;
145  } elsif (not_empty($longest)) {
146    # we allow for some sloppiness (a couple of stuck keys per word)
147    # it's possible that this should scale with word length
148
1
1
    $longest += 2;
149  }
150
22
13
  our ($upper_pattern, $lower_pattern, $punctuation_pattern);
151
22
66
18
142
  my $word_pattern = join '|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
152
22
23
  $word_pattern = q<\\w|'> unless $word_pattern;
153
22
42
  if ((defined $shortest && not_empty($longest)) &&
154      ($shortest > $longest)) {
155
0
0
    $word_pattern = "(?:$word_pattern){3}";
156
0
0
    return qr/$word_pattern/;
157  }
158
22
23
  $shortest = 3 unless defined $shortest;
159
22
15
  $longest = '' unless not_empty($longest);
160
22
69
  $word_match = "(?:$word_pattern){$shortest,$longest}";
161
22
177
  return qr/\b$word_match\b/;
162}
163
164sub load_dictionary {
165
12
2003
  my ($dict) = @_;
166
12
5
  our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
167
12
11
  $longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
168
12
10
  $shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', undef);
169
12
6
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
170
12
13
  $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
171
12
45
  $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
172
12
26
  $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
173
12
21
  $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
174
12
18
  $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
175
12
21
  $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
176
12
42
  %dictionary = ();
177
178
12
558
  open(DICT, '<:utf8', $dict);
179
12
53
  while (!eof(DICT)) {
180
31
52
    my $word = <DICT>;
181
31
20
    chomp $word;
182
31
78
    next unless $word =~ $word_match;
183
28
26
    my $l = length $word;
184
28
23
    $longest = -1 unless not_empty($longest);
185
28
31
    $longest = $l if $l > $longest;
186
28
24
    $shortest = $l if $l < $shortest;
187
28
54
    $dictionary{$word}=1;
188  }
189
12
27
  close DICT;
190
191
12
9
  $word_match = valid_word();
192}
193
194sub hunspell_dictionary {
195
3
6
  my ($dict) = @_;
196
3
5
  my $name = $dict;
197
3
3
  $name =~ s{/src/index/hunspell/index\.dic$}{};
198
3
11
  $name =~ s{.*/}{};
199
3
3
  my $aff = $dict;
200
3
2
  my $encoding;
201
3
8
  $aff =~ s/\.dic$/.aff/;
202
3
28
  if (open AFF, '<', $aff) {
203
3
20
    while (<AFF>) {
204
0
0
      next unless /^SET\s+(\S+)/;
205
0
0
      $encoding = $1 if ($1 !~ /utf-8/i);
206
0
0
      last;
207    }
208
3
9
    close AFF;
209  }
210  return {
211
3
302
    name => $name,
212    dict => $dict,
213    aff => $aff,
214    encoding => $encoding,
215    engine => Text::Hunspell->new($aff, $dict),
216  }
217}
218
219sub init {
220
9
11373
  my ($configuration) = @_;
221
9
14
  our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
222
9
26
  our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
223
9
9
  our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
224
9
12
  our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
225
9
6
  our %forbidden_re_descriptions;
226
9
9
  if ($hunspell_dictionary_path) {
227
3
30
    our @hunspell_dictionaries = ();
228
1
1
1
1
1
1
1
1
1
3
182
1148
21
4
1
16
7
0
17
161
    if (eval 'use Text::Hunspell; 1') {
229
3
125
      my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
230
3
7
      for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
231
3
9
        push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
232      }
233    } else {
234
0
0
      print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
235    }
236  }
237
9
10
  my (@patterns_re_list, %in_patterns_re_list);
238
9
59
  if (-e "$configuration/patterns.txt") {
239
0
0
    @patterns_re_list = file_to_list "$configuration/patterns.txt";
240
0
0
    $patterns_re = list_to_re @patterns_re_list;
241
0
0
0
0
    %in_patterns_re_list = map {$_ => 1} @patterns_re_list;
242  } else {
243
9
6
    $patterns_re = undef;
244  }
245
246
9
40
  if (-e "$configuration/forbidden.txt") {
247
1
1
    my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
248
1
1
1
2
    @forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
249
1
1
0
3
    %forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
250
1
1
    $forbidden_re = list_to_re @forbidden_re_list;
251  } else {
252
8
4
    $forbidden_re = undef;
253  }
254
255
9
35
  if (-e "$configuration/candidates.txt") {
256
1
2
    @candidates_re_list = file_to_list "$configuration/candidates.txt";
257
1
2
2
2
1
5
    @candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} || !test_re($quoted) ? '' : $quoted } @candidates_re_list;
258
1
1
    $candidates_re = list_to_re @candidates_re_list;
259  } else {
260
8
8
    $candidates_re = undef;
261  }
262
263
9
10
  our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
264
265
9
10
  my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
266
9
11
  our $disable_word_collating = $disable_flags =~ /(?:^|,|\s)word-collating(?:,|\s|$)/;
267
9
4
  our $disable_minified_file = $disable_flags =~ /(?:^|,|\s)minified-file(?:,|\s|$)/;
268
9
5
  our $disable_single_line_file = $disable_flags =~ /(?:^|,|\s)single-line-file(?:,|\s|$)/;
269
270
9
7
  our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
271
9
6
  $ignore_next_line_pattern =~ s/\s+/|/g;
272
273
9
6
  our $check_images = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_IMAGES', '');
274
9
4
  $check_images = $check_images =~ /^(?:1|true)$/i;
275
9
6
  if ($check_images) {
276
0
0
    our $ocr_directory = CheckSpelling::Util::get_file_from_env('ocr_directory', '/tmp/ocr');
277
0
0
    $ocr_directory = $1 if ($ocr_directory =~ /^(.*)$/);
278  }
279
280
9
8
  our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
281
282
9
8
  our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
283
284
9
30
  $word_match = valid_word();
285
286
9
17
  our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
287
9
42
  $base_dict = '/usr/share/dict/words' unless -e $base_dict;
288
9
7
  load_dictionary($base_dict);
289}
290
291sub split_line {
292
1156
482
  our (%dictionary, $word_match, $disable_word_collating);
293
1156
498
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
294
1156
489
  our @hunspell_dictionaries;
295
1156
441
  our $shortest;
296
1156
793
  my $shortest_threshold = $shortest + 2;
297
1156
591
  my $pattern = '.';
298  # $pattern = "(?:$upper_pattern){$shortest,}|$upper_pattern(?:$lower_pattern){2,}\n";
299
300  # https://www.fileformat.info/info/unicode/char/2019/
301
1156
518
  my $rsqm = "\xE2\x80\x99";
302
303
1156
633
  my ($words, $unrecognized) = (0, 0);
304
1156
847
  my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
305
1156
5647
    $line =~ s/(?:$rsqm|&apos;|&#39;|\%27|&#8217;|&#x2019;|&rsquo;|\\u2019|\x{2019}|')+/'/g;
306
1156
2337
    $line =~ s/(?:$ignore_pattern)+/ /g;
307
1156
1715
    while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
308
1156
3707
    while ($line =~ s/((?:$lower_pattern|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
309
1156
1377
    for my $token (split /\s+/, $line) {
310
3619
3274
      next unless $token =~ /$pattern/;
311
2464
2063
      $token =~ s/^(?:'|$rsqm)+//g;
312
2464
2384
      $token =~ s/(?:'|$rsqm)+s?$//g;
313
2464
1465
      my $raw_token = $token;
314
2464
1405
      $token =~ s/^[^Ii]?'+(.*)/$1/;
315
2464
1263
      $token =~ s/(.*?)'+$/$1/;
316
2464
3823
      next unless $token =~ $word_match;
317
2311
2140
      if (defined $dictionary{$token}) {
318
1032
483
        ++$words;
319
1032
544
        $unique_ref->{$token}=1;
320
1032
834
        next;
321      }
322
1279
914
      if (@hunspell_dictionaries) {
323
1254
615
        my $found = 0;
324
1254
752
        for my $hunspell_dictionary (@hunspell_dictionaries) {
325          my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
326
1254
1101
            encode($hunspell_dictionary->{'encoding'}, $token) : $token;
327
1254
2840
          next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
328
0
0
          ++$words;
329
0
0
          $dictionary{$token} = 1;
330
0
0
          $unique_ref->{$token}=1;
331
0
0
          $found = 1;
332
0
0
          last;
333        }
334
1254
906
        next if $found;
335      }
336
1279
901
      my $key = lc $token;
337
1279
1160
      if (defined $dictionary{$key}) {
338
6
5
        ++$words;
339
6
4
        $unique_ref->{$key}=1;
340
6
7
        next;
341      }
342
1273
931
      unless ($disable_word_collating) {
343
1273
754
        $key =~ s/''+/'/g;
344
1273
1338
        $key =~ s/'[sd]$// unless length $key >= $shortest_threshold;
345      }
346
1273
1127
      if (defined $dictionary{$key}) {
347
0
0
        ++$words;
348
0
0
        $unique_ref->{$key}=1;
349
0
0
        next;
350      }
351
1273
648
      ++$unrecognized;
352
1273
745
      $unique_unrecognized_ref->{$raw_token}=1;
353
1273
1664
      $unrecognized_line_items_ref->{$raw_token}=1;
354    }
355
1156
1573
    return ($words, $unrecognized);
356}
357
358sub skip_file {
359
7
26
  my ($temp_dir, $reason) = @_;
360
7
183
  open(SKIPPED, '>:utf8', "$temp_dir/skipped");
361
7
36
  print SKIPPED $reason;
362
7
104
  close SKIPPED;
363}
364
365sub maybe_ocr_file {
366
0
0
  my ($file) = @_;
367
0
0
  our $ocr_directory;
368
0
0
  my $ocr_file = "$ocr_directory/$file";
369
0
0
  $ocr_file =~ /^(.*)$/;
370
0
0
  $ocr_file = $1;
371
0
0
  my $ocr_source_sha = "$ocr_file.sha1";
372
0
0
  $ocr_file = "$ocr_file.txt";
373
0
0
  my $sha = Digest::SHA->new(1)->addfile($file, 'b')->hexdigest;
374
0
0
  if (-e $ocr_file &&
375      -e $ocr_source_sha &&
376      open my $source_sha, '<', $ocr_source_sha) {
377
0
0
    my $last_sha = <$source_sha>;
378
0
0
    close $source_sha;
379
0
0
    if ($last_sha =~ /(.*)/) {
380
0
0
      return ($ocr_file, 1) if ($1 eq $sha);
381    }
382  }
383
0
0
  my $tesseract = dirname(dirname(dirname(__FILE__)))."/wrappers/run-tesseract";
384
0
0
  $ENV{'input'} = $file;
385
0
0
  my $text_file = `"$tesseract"`;
386
0
0
  delete $ENV{'input'};
387
0
0
  return ($file, 0) unless defined $text_file;
388
0
0
  my $file_converted = 0;
389
0
0
  chomp $text_file;
390
0
0
  if ($text_file =~ /^(.*)$/) {
391
0
0
    $text_file = $1;
392
0
0
    my $file_size = -s $text_file;
393
0
0
    if ($file_size > 20) {
394
0
0
      $file_converted = 1;
395
0
0
      make_path(dirname($ocr_source_sha));
396
0
0
      open my $source_sha, '>', $ocr_source_sha;
397
0
0
      print $source_sha $sha;
398
0
0
      close $source_sha;
399
0
0
      rename($text_file, $ocr_file);
400
0
0
      $file = $ocr_file;
401    } else {
402
0
0
      unlink($text_file);
403    }
404  }
405
0
0
  return ($file, $file_converted);
406}
407
408sub split_file {
409
16
11450
  my ($file) = @_;
410  our (
411
16
12
    $unrecognized, $shortest, $largest_file, $words,
412    $word_match, %unique, %unique_unrecognized, $forbidden_re,
413    @forbidden_re_list, $patterns_re, %dictionary,
414    $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
415    $disable_single_line_file,
416    $ignore_next_line_pattern,
417    $sandbox,
418    $check_images,
419  );
420
16
30
  $ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
421
422
16
12
  our %forbidden_re_descriptions;
423
16
4
  our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
424
425  # https://www.fileformat.info/info/unicode/char/2019/
426
16
13
  my $rsqm = "\xE2\x80\x99";
427
428
16
16
  my @candidates_re_hits = (0) x scalar @candidates_re_list;
429
16
8
  my @candidates_re_lines = (0) x scalar @candidates_re_list;
430
16
11
  my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
431
16
22
  my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
432
16
27
  my $temp_dir = tempdir(DIR=>$sandbox);
433
16
2395
  print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
434
16
348
  open(NAME, '>', "$temp_dir/name");
435
16
38
    print NAME $file;
436
16
222
  close NAME;
437
16
199
  if (defined readlink($file) &&
438      rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
439
1
4
    skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n");
440
1
3
    return $temp_dir;
441  }
442
15
27
  if ($use_magic_file) {
443
8
12718
    if (open(my $file_fh, '-|',
444              '/usr/bin/file',
445              '-b',
446              '--mime',
447              '-e', 'cdf',
448              '-e', 'compress',
449              '-e', 'csv',
450              '-e', 'elf',
451              '-e', 'json',
452              '-e', 'tar',
453              $file)) {
454
8
31529
      my $file_kind = <$file_fh>;
455
8
4902
      close $file_fh;
456
8
20
      my $file_converted = 0;
457
8
16
      if ($check_images && $file_kind =~ m<^image/>) {
458
0
0
        ($file, $file_converted) = maybe_ocr_file($file);
459      }
460
8
182
      if ($file_converted == 0 && $file_kind =~ /^(.*?); charset=binary/) {
461
2
33
        skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
462
2
54
        return $temp_dir;
463      }
464    }
465  } elsif ($file =~ /\.(?:png|jpe?g|gif)$/) {
466
0
0
    my $file_converted = 0;
467
0
0
    ($file, $file_converted) = maybe_ocr_file($file);
468  }
469
13
74
  my $file_size = -s $file;
470
13
11
  if (defined $largest_file) {
471
13
13
    unless ($check_file_names eq $file) {
472
13
11
      if ($file_size > $largest_file) {
473
1
2
        skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
474
1
2
        return $temp_dir;
475      }
476    }
477  }
478
12
95
  open FILE, '<', $file;
479
12
11
  binmode FILE;
480
12
7
  my $head;
481
12
117
  read(FILE, $head, 4096);
482
12
1082
  $head =~ s/(?:\r|\n)+$//;
483
12
62
  my $dos_new_lines = () = $head =~ /\r\n/gi;
484
12
41
  my $unix_new_lines = () = $head =~ /\n/gi;
485
12
124
  my $mac_new_lines = () = $head =~ /\r/gi;
486
12
77
  local $/;
487
12
62
  if ($unix_new_lines == 0 && $mac_new_lines == 0) {
488
3
6
    $/ = "\n";
489  } elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
490
1
7
    $/ = "\r\n";
491  } elsif ($mac_new_lines > $unix_new_lines) {
492
2
8
    $/ = "\r";
493  } else {
494
6
7
    $/ = "\n";
495  }
496
12
27
  seek(FILE, 0, 0);
497
12
14
  ($words, $unrecognized) = (0, 0);
498
12
42
  %unique = ();
499
12
30
  %unique_unrecognized = ();
500
501  local $SIG{__WARN__} = sub {
502
0
0
    my $message = shift;
503
0
0
    $message =~ s/> line/> in $file - line/;
504
0
0
    chomp $message;
505
0
0
    print STDERR "$message\n";
506
12
122
  };
507
508
12
301
  open(WARNINGS, '>:utf8', "$temp_dir/warnings");
509
12
7
  our $timeout;
510
12
10
  eval {
511
12
0
106
0
    local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
512
12
57
    alarm $timeout;
513
514
12
17
    my $ignore_next_line = 0;
515
12
13
    my $offset = 0;
516
12
90
    while (<FILE>) {
517
1159
1074
      if ($. == 1) {
518
12
11
        unless ($disable_minified_file) {
519
12
48
          if ($file_size >= 512 && length($_) == $file_size) {
520
1
11
            skip_file($temp_dir, "file only has a single line (single-line-file)\n");
521
1
4
            last;
522          }
523        }
524      }
525
1158
2363
      $_ = decode_utf8($_, FB_DEFAULT);
526
1158
2803
      if (/[\x{D800}-\x{DFFF}]/) {
527
0
0
        skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
528
0
0
        last;
529      }
530
1158
1466
      s/\R$//;
531
1158
1082
      s/^\x{FEFF}// if $. == 1;
532
1158
1080
      next unless /./;
533
1157
800
      my $raw_line = $_;
534
535
1157
627
      my $ignore_this_line = $ignore_next_line;
536
1157
1150
      $ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
537
1157
757
      next if $ignore_this_line;
538
539      # hook for custom line based text exclusions:
540
1156
754
      if (defined $patterns_re) {
541
2
6
10
8
        s/($patterns_re)/"="x length($1)/ge;
542      }
543
1156
684
      my $initial_line_state = $_;
544
1156
694
      my $previous_line_state = $_;
545
1156
606
      my $line_flagged;
546
1156
779
      if ($forbidden_re) {
547
9
5
60
11
        while (s/($forbidden_re)/"="x length($1)/e) {
548
5
4
          $line_flagged = 1;
549
5
8
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
550
5
4
          my $found_trigger_re;
551
5
5
          for my $i (0 .. $#forbidden_re_list) {
552
7
5
            my $forbidden_re_singleton = $forbidden_re_list[$i];
553
7
4
            my $test_line = $previous_line_state;
554
7
4
66
6
            if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
555
4
5
              next unless $test_line eq $_;
556
4
6
              my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
557
4
7
              next unless $begin == $begin_test;
558
4
3
              next unless $end == $end_test;
559
4
3
              next unless $match eq $match_test;
560
4
3
              $found_trigger_re = $forbidden_re_singleton;
561
4
8
              my $hit = "$.:$begin:$end";
562
4
4
              $forbidden_re_hits[$i]++;
563
4
4
              $forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
564
4
6
              last;
565            }
566          }
567
5
6
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
568
5
6
          if ($found_trigger_re) {
569
4
6
            my $description = $forbidden_re_descriptions{$found_trigger_re} || '';
570
4
10
            $found_trigger_re =~ s/^\(\?:(.*)\)$/$1/;
571
4
4
            my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
572
4
5
            if ($description ne '') {
573
3
13
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
574            } else {
575
1
5
              print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
576            }
577          } else {
578
1
4
            print WARNINGS ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
579          }
580
5
27
          $previous_line_state = $_;
581        }
582
9
8
        $_ = $initial_line_state;
583      }
584      # This is to make it easier to deal w/ rules:
585
1156
1074
      s/^/ /;
586
1156
711
      my %unrecognized_line_items = ();
587
1156
972
      my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
588
1156
643
      $words += $new_words;
589
1156
501
      $unrecognized += $new_unrecognized;
590
1156
820
      my $line_length = length($raw_line);
591
1156
1568
      for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
592
1020
490
        my $found_token = 0;
593
1020
506
        my $raw_token = $token;
594
1020
529
        $token =~ s/'/(?:'|\x{2019}|\&apos;|\&#39;)+/g;
595
1020
474
        my $before;
596
1020
1676
        if ($token =~ /^$upper_pattern$lower_pattern/) {
597
5
3
          $before = '(?<=.)';
598        } elsif ($token =~ /^$upper_pattern/) {
599
0
0
          $before = "(?<!$upper_pattern)";
600        } else {
601
1015
640
          $before = "(?<=$not_lower_pattern)";
602        }
603
1020
1233
        my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
604
1020
2179
        while ($raw_line =~ /(?:\b|$before)($token)(?:\b|$after)/g) {
605
1270
582
          $line_flagged = 1;
606
1270
545
          $found_token = 1;
607
1270
1645
          my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
608
1270
1241
          next unless $match =~ /./;
609
1270
864
          my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
610
1270
4937
          print WARNINGS ":$.:$begin ... $end: $wrapped\n";
611        }
612
1020
1153
        unless ($found_token) {
613
3
28
          if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
614
3
5
            my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
615
3
3
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
616
3
12
            print WARNINGS ":$.:$begin ... $end: $wrapped\n";
617          } else {
618
0
0
            my $offset = $line_length + 1;
619
0
0
            my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
620
0
0
            print WARNINGS ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
621          }
622        }
623      }
624
1156
1858
      if ($line_flagged && $candidates_re) {
625
1
1
        $_ = $previous_line_state = $initial_line_state;
626
1
1
17
3
        s/($candidates_re)/"="x length($1)/ge;
627
1
2
        if ($_ ne $initial_line_state) {
628
1
1
          $_ = $previous_line_state;
629
1
1
          for my $i (0 .. $#candidates_re_list) {
630
2
2
            my $candidate_re = $candidates_re_list[$i];
631
2
13
            next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
632
1
1
6
2
            if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
633
1
1
              my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
634
1
3
              my $hit = "$.:$begin:$end";
635
1
1
              $_ = $previous_line_state;
636
1
1
4
2
              my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
637
1
1
              $candidates_re_hits[$i] += $replacements;
638
1
6
              $candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
639
1
2
              $_ = $previous_line_state;
640            }
641          }
642        }
643      }
644
1156
849
      unless ($disable_minified_file) {
645
1156
950
        s/={3,}//g;
646
1156
803
        $offset += length;
647
1156
1010
        my $ratio = int($offset / $.);
648
1156
628
        my $ratio_threshold = 1000;
649
1156
3035
        if ($ratio > $ratio_threshold) {
650
2
5
          skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
651
2
7
          last;
652        }
653      }
654    }
655
656
12
75
    alarm 0;
657  };
658
12
12
  if ($@) {
659
0
0
    die unless $@ eq "alarm\n";
660
0
0
    print WARNINGS ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
661
0
0
    skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
662
0
0
    return $temp_dir;
663  }
664
665
12
39
  close FILE;
666
12
136
  close WARNINGS;
667
668
12
22
  if ($unrecognized || @candidates_re_hits || @forbidden_re_hits) {
669
11
296
    open(STATS, '>:utf8', "$temp_dir/stats");
670
11
149
      print STATS "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
671      ", unique: ".(keys %unique).
672      (@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
673      (@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
674      (@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
675      (@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
676      "}";
677
11
169
    close STATS;
678
11
257
    open(UNKNOWN, '>:utf8', "$temp_dir/unknown");
679
11
19
45
28
      print UNKNOWN map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
680
11
111
    close UNKNOWN;
681  }
682
683
12
121
  return $temp_dir;
684}
685
686sub main {
687
2
419
  my ($configuration, @ARGV) = @_;
688
2
1
  our %dictionary;
689
2
10
  unless (%dictionary) {
690
1
1
    init($configuration);
691  }
692
693  # read all input
694
2
2
  my @reports;
695
696
2
1
  for my $file (@ARGV) {
697
2
2
    my $temp_dir = split_file($file);
698
2
3
    push @reports, "$temp_dir\n";
699  }
700
2
6
  print join '', @reports;
701}
702
7031;