File Coverage

File:	lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:	80.3%

line	stmt	bran	cond	sub	time	code
1						#! --perl--
2
3						# ~/bin/w
4						# Search for potentially misspelled words
5						# Output is:
6						# misspellled
7						# woord (WOORD, Woord, woord, woord's)
8						package CheckSpelling::UnknownWordSplitter;
9
10	1 1			1	116047 3	use 5.022;
11	1 1 1			1	3 0 53	use feature 'unicode_strings';
12	1 1 1			1	3 0 12	use strict;
13	1 1 1			1	1 0 24	use warnings;
14	1 1 1			1	1 1 15	no warnings qw(experimental::vlb);
15	1 1 1			1	2 2 2	use utf8;
16	1 1 1			1	11 1 33	use Encode qw/decode_utf8 encode FB_DEFAULT/;
17	1 1 1			1	2 1 30	use File::Basename;
18	1 1 1			1	1 1 15	use Cwd 'abs_path';
19	1 1 1			1	2 0 22	use File::Spec;
20	1 1 1			1	1 1 22	use File::Temp qw/ tempfile tempdir /;
21	1 1 1			1	2 1 16	use File::Path qw/ make_path /;
22	1 1 1			1	340 1 20	use CheckSpelling::Util;
23	1 1 1			1	195 1259 981	use Digest::SHA;
24						our $VERSION='0.1.0';
25
26						my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
27						my ($check_homoglyphs);
28						my $begin_block_re = '';
29						my @begin_block_list = ();
30						my @end_block_list = ();
31						my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
32						my ($shortest, $longest) = (255, 0);
33						my @forbidden_re_list;
34						my %forbidden_re_descriptions;
35						my @candidates_re_list;
36						my $hunspell_dictionary_path;
37						my @hunspell_dictionaries;
38						my %dictionary = ();
39						our @reject_re_list = ();
40						our $reject_re = '$^';
41						my $base_dict;
42						my %unique;
43						my %unique_unrecognized;
44						my ($last_file, $words, $unrecognized) = ('', 0, 0);
45						my ($ignore_next_line_pattern);
46						my ($check_images, $ocr_directory);
47
48						my $disable_flags;
49
50						sub test_re {
51	32			32	27	my ($expression) = @_;
52	32 32				16 249	return eval { qr /$expression/ };
53						}
54
55						sub quote_re {
56	34			34	37	my ($expression) = @_;
57	34	50			35	return $expression if $expression =~ /\?\{/;
58	34				74	$expression =~ s/
59						\G
60						(
61						(?:[^\\]\|\\[^Q])*
62						)
63						(?:
64						\\Q
65						(?:[^\\]\|\\[^E])*
66						(?:\\E)?
67						)?
68						/
69	68	50			132	$1 . (defined($2) ? quotemeta($2) : '')
70						/xge;
71	34				43	return $expression;
72						}
73
74						sub file_to_lists {
75	6			6	8	my ($re) = @_;
76	6				9	my @patterns;
77						my %hints;
78	6				0	my $fh;
79	6	50			61	if (open($fh, '<:utf8', $re)) {
80	6				13	local $/=undef;
81	6				40	my $file=<$fh>;
82	6				16	close $fh;
83	6				6	my $line_number = 0;
84	6				7	my $hint = '';
85	6				30	for (split /\R/, $file) {
86	32				25	++$line_number;
87	32				14	chomp;
88	32	100			50	if (/^#(?:\s(.+)\|)/) {
89	12	100	67		31	$hint = $1 if ($hint eq '' && defined $1);
90	12				11	next;
91						}
92	20	100			25	$hint = '' unless $_ ne '';
93	20	50			19	next if $_ eq '$^';
94	20				20	my $pattern = $_;
95	20	100			56	next unless s/^(.+)/(?:$1)/;
96	13				17	my $quoted = quote_re($1);
97	13	100			20	unless (test_re $quoted) {
98	1				1	my $error = $@;
99	1				46	my $home = dirname(__FILE__);
100	1				53	$error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
101	1				13	print STDERR $error;
102	1				2	$_ = '(?:\$^ - skipped because bad-regex)';
103	1				3	$hint = '';
104						}
105	13	100			22	if (defined $hints{$_}) {
106	1				2	my $pattern_length = length $pattern;
107	1				2	my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
108	1				18	print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
109	1				2	$_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
110						} else {
111	12				16	push @patterns, $_;
112	12				23	$hints{$_} = $hint;
113						}
114	13				24	$hint = '';
115						}
116						}
117
118						return {
119	6				29	patterns => \@patterns,
120						hints => \%hints,
121						};
122						}
123
124						sub file_to_list {
125	5			5	1271	my ($re) = @_;
126	5				13	my $lists = file_to_lists($re);
127
128	5 5				5 21	return @{$lists->{'patterns'}};
129						}
130
131						sub list_to_re {
132	5			5	6	my (@list) = @_;
133	5 11 11	50			6 7 11	@list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
134	5 11				5 15	@list = grep { $_ ne '' } @list;
135	5	50			6	return '$^' unless scalar @list;
136	5				16	return join "\|", (@list);
137						}
138
139						sub not_empty {
140	109			109	96	my ($thing) = @_;
141	109		67		538	return defined $thing && $thing ne '' && $thing =~ /^\d+$/;
142						}
143
144						sub parse_block_list {
145	3			3	3	my ($re) = @_;
146	3				3	my @file;
147	3	50			33	return @file unless (open(my $file_fh, '<:utf8', $re));
148
149	3				6	local $/=undef;
150	3				32	my $file=<$file_fh>;
151	3				6	my $last_line = $.;
152	3				10	close $file_fh;
153	3				14	for (split /\R/, $file) {
154	8	100			11	next if /^#/;
155	5				4	chomp;
156	5				6	s/^\\#/#/;
157	5	50			8	next unless /^./;
158	5				6	push @file, $_;
159						}
160
161	3				6	my $pairs = (0+@file) / 2;
162	3				3	my $true_pairs = $pairs \| 0;
163	3	100			5	unless ($pairs == $true_pairs) {
164	1				1	my $early_warnings = CheckSpelling::Util::get_file_from_env('early_warnings', '/dev/null');
165	1				11	open my $early_warnings_fh, ">>:encoding(UTF-8)", $early_warnings;
166	1				42	print $early_warnings_fh "$re:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n";
167	1				21	close $early_warnings_fh;
168	1				2	my $i = 0;
169	1				2	while ($i < $true_pairs) {
170	0				0	print STDERR "block-delimiter $i S: $file[$i*2]\n";
171	0				0	print STDERR "block-delimiter $i E: $file[$i*2+1]\n";
172	0				0	$i++;
173						}
174	1				11	print STDERR "block-delimiter unmatched S: `$file[$i*2]`\n";
175	1				3	@file = ();
176						}
177
178	3				10	return @file;
179						}
180
181						sub valid_word {
182						# shortest_word is an absolute
183	28			28	14	our ($shortest, $longest, $shortest_word, $longest_word);
184	28	50			36	$shortest = $shortest_word if $shortest_word;
185	28	100 100			35	if ($longest_word) {
186						# longest_word is an absolute
187	26				26	$longest = $longest_word;
188						} elsif (not_empty($longest)) {
189						# we allow for some sloppiness (a couple of stuck keys per word)
190						# it's possible that this should scale with word length
191	1				1	$longest += 2;
192						}
193	28				33	our ($upper_pattern, $lower_pattern, $punctuation_pattern);
194	28 84	100			31 201	my $word_pattern = join '\|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
195	28	100			28	$word_pattern = q<\\w\|'> unless $word_pattern;
196	28	50	50		75	if ((defined $shortest && not_empty($longest)) &&
197						($shortest > $longest)) {
198	0				0	$word_pattern = "(?:$word_pattern){3}";
199	0				0	return qr/$word_pattern/;
200						}
201	28	100			46	$shortest = 3 unless defined $shortest;
202	28	100			20	$longest = '' unless not_empty($longest);
203	28				107	$word_match = "(?:$word_pattern){$shortest,$longest}";
204	28				286	return qr/\b$word_match\b/;
205						}
206
207						sub load_dictionary {
208	15			15	2036	my ($dict) = @_;
209	15				9	our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
210	15				18	$longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
211	15				15	$shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', 0);
212	15				17	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
213	15				22	$ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
214	15				56	$upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
215	15				39	$lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
216	15				34	$not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
217	15				31	$not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
218	15				33	$punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
219	15				37	our $check_homoglyphs = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_HOMOGLYPHS', 0);
220	15	100	67		49	if ($check_homoglyphs && $check_homoglyphs !~ /false/i) {
221	14				15	my $homoglyph_list_path = CheckSpelling::Util::get_file_from_env_utf8('homoglyph_list_path', '/dev/null');
222	14	50			125	if (-s $homoglyph_list_path) {
223	1 1 1			1	377 1 3757	use CheckSpelling::Homoglyph;
224	14				26	CheckSpelling::Homoglyph::init($homoglyph_list_path);
225						} else {
226	0				0	$check_homoglyphs = 0;
227						}
228						} else {
229	1				0	$check_homoglyphs = 0;
230						}
231	15				30	%dictionary = ();
232
233	15				629	open(my $dict_fh, '<:utf8', $dict);
234	15				20	my $word_match_relaxed = $word_match;
235	15	50			83	if ($word_match =~ /\{(\d+),/) {
236	15				23	my $three = $1;
237	15	50			21	if ($three > 1) {
238	15				15	my $two = $three - 1;
239	15				107	$word_match_relaxed =~ s/\b$three\b/$two/g;
240						}
241						}
242	15				70	while (!eof($dict_fh)) {
243	53				49	my $word = <$dict_fh>;
244	53				65	chomp $word;
245	53	100			175	next unless $word =~ $word_match_relaxed;
246	52				53	my $l = length $word;
247	52	100			42	$longest = -1 unless not_empty($longest);
248	52	100			57	$longest = $l if $l > $longest;
249	52	100			107	if ($word =~ $word_match) {
250	50	100			52	$shortest = $l if $l < $shortest;
251						}
252	52				105	$dictionary{$word}=1;
253						}
254	15				38	close $dict_fh;
255
256	15				15	$word_match = valid_word();
257						}
258
259						sub hunspell_dictionary {
260	3			3	6	my ($dict) = @_;
261	3				61	my $name = $dict;
262	3				4	$name =~ s{/src/index/hunspell/index\.dic$}{};
263	3				13	$name =~ s{.*/}{};
264	3				3	my $aff = $dict;
265	3				2	my $encoding;
266	3				7	$aff =~ s/\.dic$/.aff/;
267	3	50			36	if (open my $aff_fh, '<', $aff) {
268	3				19	while (<$aff_fh>) {
269	0	0			0	next unless /^SET\s+(\S+)/;
270	0	0			0	$encoding = $1 if ($1 !~ /utf-8/i);
271	0				0	last;
272						}
273	3				7	close $aff_fh;
274						}
275						return {
276	3				285	name => $name,
277						dict => $dict,
278						aff => $aff,
279						encoding => $encoding,
280						engine => Text::Hunspell->new($aff, $dict),
281						}
282						}
283
284						sub init {
285	12			12	22900	my ($configuration) = @_;
286	12				15	our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
287	12				9	our ($begin_block_re, @begin_block_list, @end_block_list);
288	12				67	our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
289	12				22	our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
290	12				27	our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
291	12				13	our %forbidden_re_descriptions;
292	12				5	our @reject_re_list;
293	12				8	our $reject_re;
294	12	100			14	if ($hunspell_dictionary_path) {
295	3				39	our @hunspell_dictionaries = ();
296	1 1 1 1 1 1 1 1 1 3	50		1 1 1	283 1082 21 7 1 13 7 2 18 179	if (eval 'use Text::Hunspell; 1') {
297	3				125	my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
298	3				8	for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
299	3				9	push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
300						}
301						} else {
302	0				0	print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
303						}
304						}
305
306	12	100			90	if (-e "$configuration/block-delimiters.list") {
307	3				5	my @block_delimiters = parse_block_list "$configuration/block-delimiters.list";
308	3	100			4	if (@block_delimiters) {
309	2				3	@begin_block_list = ();
310	2				1	@end_block_list = ();
311
312	2				2	while (@block_delimiters) {
313	2				4	my ($begin, $end) = splice @block_delimiters, 0, 2;
314	2				2	push @begin_block_list, $begin;
315	2				3	push @end_block_list, $end;
316						}
317
318	2 2				2 2	$begin_block_re = join '\|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list);
319						}
320						}
321
322	12				16	my (@patterns_re_list, %in_patterns_re_list);
323	12	50			52	if (-e "$configuration/patterns.txt") {
324	0				0	@patterns_re_list = file_to_list "$configuration/patterns.txt";
325	0				0	$patterns_re = list_to_re @patterns_re_list;
326	0 0				0 0	%in_patterns_re_list = map {$_ => 1} @patterns_re_list;
327						} else {
328	12				16	$patterns_re = undef;
329						}
330
331	12	100			45	if (-e "$configuration/forbidden.txt") {
332	1				3	my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
333	1 1				0 2	@forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
334	1 1				1 3	%forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
335	1				2	$forbidden_re = list_to_re @forbidden_re_list;
336						} else {
337	11				17	$forbidden_re = undef;
338						}
339
340	12	100			62	if (-e "$configuration/candidates.txt") {
341	4				9	@candidates_re_list = file_to_list "$configuration/candidates.txt";
342	4 8 8	50	33		6 8 21	@candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} \|\| !test_re($quoted) ? '' : $quoted } @candidates_re_list;
343	4				11	$candidates_re = list_to_re @candidates_re_list;
344						} else {
345	8				9	$candidates_re = undef;
346						}
347
348	12	50			57	if (-e "$configuration/reject.txt") {
349	0				0	@reject_re_list = file_to_list "$configuration/reject.txt";
350	0 0 0	0			0 0 0	@reject_re_list = map { my $quoted = quote_re($_); !test_re($quoted) ? '' : '^'.$quoted.'$' } @reject_re_list;
351	0				0	$reject_re = list_to_re @reject_re_list;
352						} else {
353	12				21	$reject_re = '$^';
354						}
355
356	12				15	our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
357
358	12				17	my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
359	12				16	our $disable_word_collating = $disable_flags =~ /(?:^\|,\|\s)word-collating(?:,\|\s\|$)/;
360	12				11	our $disable_minified_file = $disable_flags =~ /(?:^\|,\|\s)minified-file(?:,\|\s\|$)/;
361	12				13	our $disable_single_line_file = $disable_flags =~ /(?:^\|,\|\s)single-line-file(?:,\|\s\|$)/;
362
363	12				11	our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
364	12				12	$ignore_next_line_pattern =~ s/\s+/\|/g;
365
366	12				12	our $check_images = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_IMAGES', '');
367	12				11	$check_images = $check_images =~ /^(?:1\|true)$/i;
368	12	50			12	if ($check_images) {
369	0				0	our $ocr_directory = CheckSpelling::Util::get_file_from_env('ocr_directory', '/tmp/ocr');
370	0	0			0	$ocr_directory = $1 if ($ocr_directory =~ /^(.*)$/);
371						}
372
373	12				16	our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
374
375	12				15	our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
376
377	12				16	$word_match = valid_word();
378
379	12				54	our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
380	12	100			55	$base_dict = '/usr/share/dict/words' unless -e $base_dict;
381	12				18	load_dictionary($base_dict);
382						}
383
384						sub split_line {
385	1163			1163	485	our (%dictionary, $word_match, $disable_word_collating);
386	1163				542	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
387	1163				459	our @hunspell_dictionaries;
388	1163				465	our $shortest;
389	1163				839	my $shortest_threshold = $shortest + 2;
390	1163				674	my $pattern = '.';
391						# $pattern = "(?:$upper_pattern){$shortest,}\|$upper_pattern(?:$lower_pattern){2,}\n";
392
393						# https://www.fileformat.info/info/unicode/char/2019/
394	1163				548	my $rsqm = "\xE2\x80\x99";
395
396	1163				739	my ($words, $unrecognized) = (0, 0);
397	1163				870	my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
398	1163				5911	$line =~ s/(?:$rsqm\|'\|'\|\%27\|’\|’\|’\|\\u2019\|\x{2019}\|')+/'/g;
399	1163				2432	$line =~ s/(?:$ignore_pattern)+/ /g;
400	1163				1873	while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
401	1163				3608	while ($line =~ s/((?:$lower_pattern\|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
402	1163				1479	for my $token (split /\s+/, $line) {
403	3654	100			3569	next unless $token =~ /$pattern/;
404	2492				3793	$token =~ s/^(?:'\|$rsqm)(.?)(?:'\|$rsqm)+$/$1/g;
405	2492				1836	$token =~ s/(?:'\|$rsqm)+s$//g; # need to reconsider for not English
406	2492				1467	my $raw_token = $token;
407	2492				1585	$token =~ s/^[^Ii]?'+//; # need to reconsider for French
408	2492				1368	$token =~ s/'+$//;
409	2492	100			4073	next unless $token =~ $word_match;
410	2323	100			2200	if (defined $dictionary{$token}) {
411	1042				509	++$words;
412	1042				566	$unique_ref->{$token}=1;
413	1042				968	next;
414						}
415	1281	100			1014	if (@hunspell_dictionaries) {
416	1254				697	my $found = 0;
417	1254				745	for my $hunspell_dictionary (@hunspell_dictionaries) {
418						my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
419	1254	50			1205	encode($hunspell_dictionary->{'encoding'}, $token) : $token;
420	1254	50			3272	next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
421	0				0	++$words;
422	0				0	$dictionary{$token} = 1;
423	0				0	$unique_ref->{$token}=1;
424	0				0	$found = 1;
425	0				0	last;
426						}
427	1254	50			1003	next if $found;
428						}
429	1281				941	my $key = lc $token;
430	1281	100			1190	if (defined $dictionary{$key}) {
431	6				3	++$words;
432	6				6	$unique_ref->{$key}=1;
433	6				8	next;
434						}
435	1275	50			1054	unless ($disable_word_collating) {
436	1275				776	$key =~ s/''+/'/g;
437	1275	100			1295	$key =~ s/'[sd]$// if length $key >= $shortest_threshold;
438						}
439	1275	50			1102	if (defined $dictionary{$key}) {
440	0				0	++$words;
441	0				0	$unique_ref->{$key}=1;
442	0				0	next;
443						}
444	1275				670	++$unrecognized;
445	1275				827	$unique_unrecognized_ref->{$raw_token}=1;
446	1275				1988	$unrecognized_line_items_ref->{$raw_token}=1;
447						}
448	1163				1916	return ($words, $unrecognized);
449						}
450
451						sub skip_file {
452	7			7	28	my ($temp_dir, $reason) = @_;
453	7				291	open(my $skipped_fh, '>:utf8', "$temp_dir/skipped");
454	7				37	print $skipped_fh $reason;
455	7				128	close $skipped_fh;
456						}
457
458						sub maybe_ocr_file {
459	0			0	0	my ($file) = @_;
460	0				0	our $ocr_directory;
461	0				0	my $ocr_file = "$ocr_directory/$file";
462	0				0	$ocr_file =~ /^(.*)$/;
463	0				0	$ocr_file = $1;
464	0				0	my $ocr_source_sha = "$ocr_file.sha1";
465	0				0	$ocr_file = "$ocr_file.txt";
466	0				0	my $sha = Digest::SHA->new(1)->addfile($file, 'b')->hexdigest;
467	0	0	0		0	if (-e $ocr_file &&
468						-e $ocr_source_sha &&
469						open my $source_sha, '<', $ocr_source_sha) {
470	0				0	my $last_sha = <$source_sha>;
471	0				0	close $source_sha;
472	0	0			0	if ($last_sha =~ /(.*)/) {
473	0	0			0	return ($ocr_file, 1) if ($1 eq $sha);
474						}
475						}
476	0				0	my $tesseract = dirname(dirname(dirname(__FILE__)))."/wrappers/run-tesseract";
477	0				0	$ENV{'input'} = $file;
478	0				0	my $text_file = `"$tesseract"`;
479	0				0	delete $ENV{'input'};
480	0	0			0	return ($file, 0) unless defined $text_file;
481	0				0	my $file_converted = 0;
482	0				0	chomp $text_file;
483	0	0			0	if ($text_file =~ /^(.*)$/) {
484	0				0	$text_file = $1;
485	0		0		0	my $file_size = -s $text_file \|\| 0;
486	0	0			0	if ($file_size > 20) {
487	0				0	$file_converted = 1;
488	0				0	make_path(dirname($ocr_source_sha));
489	0				0	open my $source_sha, '>', $ocr_source_sha;
490	0				0	print $source_sha $sha;
491	0				0	close $source_sha;
492	0				0	rename($text_file, $ocr_file);
493	0				0	$file = $ocr_file;
494						} else {
495	0				0	unlink($text_file);
496						}
497						}
498	0				0	return ($file, $file_converted);
499						}
500
501						sub print_word_not_in_dictionary {
502	1275			1275	833	my ($warnings_fh, $begin, $end, $match) = @_;
503	1275				570	our $reject_re;
504	1275				1040	my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
505	1275	50			1463	if ($match =~ /^($reject_re)$/) {
506	0				0	our @reject_re_list;
507	0				0	my $found = 0;
508	0				0	for my $reject (@reject_re_list) {
509	0	0			0	if ($match =~ /^$reject$/) {
510	0				0	my $rejection = CheckSpelling::Util::wrap_in_backticks($reject);
511	0				0	print $warnings_fh ":$.:$begin ... $end, Error - Rejected word $wrapped matched $rejection (rejected-word)\n";
512	0				0	$found = 1;
513						}
514						}
515	0	0			0	unless ($found) {
516	0				0	my $rejection = CheckSpelling::Util::wrap_in_backticks($reject_re);
517	0				0	print $warnings_fh ":$.:$begin ... $end, Error - Rejected word $wrapped matched $rejection (rejected-word)\n";
518						}
519						} else {
520	1275				5498	print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
521						}
522						}
523
524						sub split_file {
525	19			19	14653	my ($file) = @_;
526						our (
527	19				14	$unrecognized, $shortest, $largest_file, $words,
528						$word_match, %unique, %unique_unrecognized, $forbidden_re,
529						@forbidden_re_list, $patterns_re, %dictionary,
530						$begin_block_re, @begin_block_list, @end_block_list,
531						$candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
532						$disable_single_line_file,
533						$ignore_next_line_pattern,
534						$sandbox,
535						$check_images,
536						);
537	19	100			50	$ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
538
539	19				11	our %forbidden_re_descriptions;
540	19				10	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
541
542						# https://www.fileformat.info/info/unicode/char/2019/
543	19				18	my $rsqm = "\xE2\x80\x99";
544
545	19				32	my @candidates_re_hits = (0) x scalar @candidates_re_list;
546	19				20	my @candidates_re_lines = (0) x scalar @candidates_re_list;
547	19				35	my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
548	19				24	my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
549	19				50	my $temp_dir = tempdir(DIR=>$sandbox);
550	19	50			3535	print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
551	19				479	open(my $name_fh, '>', "$temp_dir/name");
552	19				48	print $name_fh $file;
553	19				263	close $name_fh;
554	19	100	67		229	if (defined readlink($file) &&
555						rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
556	1				5	skip_file($temp_dir, "symbolic link points outside repository (out-of-bounds-symbolic-link)\n");
557	1				6	return $temp_dir;
558						}
559	18	100 50			44	if ($use_magic_file) {
560	8	50			13028	if (open(my $file_fh, '-\|',
561						'/usr/bin/file',
562						'-b',
563						'--mime',
564						'-e', 'cdf',
565						'-e', 'compress',
566						'-e', 'csv',
567						'-e', 'elf',
568						'-e', 'json',
569						'-e', 'tar',
570						$file)) {
571	8				29427	my $file_kind = <$file_fh>;
572	8				3904	close $file_fh;
573	8				14	my $file_converted = 0;
574	8	50	33		22	if ($check_images && $file_kind =~ m<^image/(?!svg)>) {
575	0				0	($file, $file_converted) = maybe_ocr_file($file);
576						}
577	8	100	67		162	if ($file_converted == 0 && $file_kind =~ /^(.*?); charset=binary/) {
578	2				46	skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
579	2				42	return $temp_dir;
580						}
581						}
582						} elsif ($file =~ /\.(?:png\|jpe?g\|gif)$/) {
583	0				0	my $file_converted = 0;
584	0				0	($file, $file_converted) = maybe_ocr_file($file);
585						}
586	16				84	my $file_size = -s $file;
587	16	50			18	if (defined $largest_file) {
588	16	50			17	unless ($check_file_names eq $file) {
589	16	100			27	if ($file_size > $largest_file) {
590	1				3	skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
591	1				4	return $temp_dir;
592						}
593						}
594						}
595	15				155	open my $file_fh, '<', $file;
596	15				20	binmode $file_fh;
597	15				10	my $head;
598	15				119	read($file_fh, $head, 4096);
599	15				923	$head =~ s/(?:\r\|\n)+$//;
600	15				81	my $dos_new_lines = () = $head =~ /\r\n/gi;
601	15				44	my $unix_new_lines = () = $head =~ /\n/gi;
602	15				117	my $mac_new_lines = () = $head =~ /\r/gi;
603	15				58	local $/;
604	15	100 100 100	100 100		89	if ($unix_new_lines == 0 && $mac_new_lines == 0) {
605	3				7	$/ = "\n";
606						} elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
607	1				6	$/ = "\r\n";
608						} elsif ($mac_new_lines > $unix_new_lines) {
609	2				7	$/ = "\r";
610						} else {
611	9				10	$/ = "\n";
612						}
613	15				29	seek($file_fh, 0, 0);
614	15				30	($words, $unrecognized) = (0, 0);
615	15				36	%unique = ();
616	15				33	%unique_unrecognized = ();
617
618						local $SIG{__WARN__} = sub {
619	0			0	0	my $message = shift;
620	0				0	$message =~ s/> line/> in $file - line/;
621	0				0	chomp $message;
622	0				0	print STDERR "$message\n";
623	15				132	};
624
625	15				447	open(my $warnings_fh, '>:utf8', "$temp_dir/warnings");
626	15				18	our $timeout;
627	15				17	eval {
628	15 0			0	127 0	local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
629	15				45	alarm $timeout;
630
631	15				16	my $ignore_next_line = 0;
632	15				39	my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
633	15				34	my $offset = 0;
634	15				101	LINE: while (<$file_fh>) {
635	1172	100			1170	if ($. == 1) {
636	15	50			34	unless ($disable_minified_file) {
637	15	100	100		66	if ($file_size >= 512 && length($_) == $file_size) {
638	1				9	skip_file($temp_dir, "file only has a single line (single-line-file)\n");
639	1				5	last;
640						}
641						}
642	14				29	s/^\x{FEFF}//;
643						}
644	1171				2775	$_ = decode_utf8($_, FB_DEFAULT);
645	1171	50			3228	if (/[\x{D800}-\x{DFFF}]/) {
646	0				0	skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
647	0				0	last;
648						}
649	1171				1615	s/\R$//;
650	1171	100			1260	next unless /./;
651	1170				805	my $raw_line = $_;
652	1170				585	my $parsed_block_markers;
653
654						# hook for custom multiline based text exclusions:
655	1170	100			837	if ($begin_block_re) {
656	1148				725	FIND_END_MARKER: while (1) {
657	1150				938	while ($next_end_marker ne '') {
658	6	100			27	next LINE unless /\Q$next_end_marker\E/;
659	1				6	s/.*?\Q$next_end_marker\E//;
660	1				2	($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
661	1				1	$parsed_block_markers = 1;
662						}
663	1145				1292	my @captured = (/^.*?$begin_block_re/);
664	1145	100			1102	last unless (@captured);
665	2				3	for my $capture (0 .. $#captured) {
666	2	50			3	if ($captured[$capture]) {
667	2				6	($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1");
668	2				17	s/^.*?\Q$begin_block_list[$capture]\E//;
669	2				2	$parsed_block_markers = 1;
670	2				3	next FIND_END_MARKER;
671						}
672						}
673						}
674	1143	100			867	next if $parsed_block_markers;
675						}
676
677	1164				759	my $ignore_this_line = $ignore_next_line;
678	1164				1142	$ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
679	1164	100			708	next if $ignore_this_line;
680
681						# hook for custom line based text exclusions:
682	1163	100			954	if (defined $patterns_re) {
683	2 6				12 9	s/($patterns_re)/"="x length($1)/ge;
684						}
685	1163				741	my $initial_line_state = $_;
686	1163				688	my $previous_line_state = $_;
687	1163				589	my $line_flagged;
688	1163	100			806	if ($forbidden_re) {
689	9 5				66 13	while (s/($forbidden_re)/"="x length($1)/e) {
690	5				6	$line_flagged = 1;
691	5				35	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
692	5				5	my $found_trigger_re;
693	5				6	for my $i (0 .. $#forbidden_re_list) {
694	7				4	my $forbidden_re_singleton = $forbidden_re_list[$i];
695	7				6	my $test_line = $previous_line_state;
696	7 4	100			67 6	if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
697	4	50			5	next unless $test_line eq $_;
698	4				9	my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
699	4	50			4	next unless $begin == $begin_test;
700	4	50			4	next unless $end == $end_test;
701	4	50			3	next unless $match eq $match_test;
702	4				2	$found_trigger_re = $forbidden_re_singleton;
703	4				9	my $hit = "$.:$begin:$end";
704	4				4	$forbidden_re_hits[$i]++;
705	4	100			4	$forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
706	4				8	last;
707						}
708						}
709	5				7	my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
710	5	100			7	if ($found_trigger_re) {
711	4		100		9	my $description = $forbidden_re_descriptions{$found_trigger_re} \|\| '';
712	4				12	$found_trigger_re =~ s/^$\?:(.*)$$/$1/;
713	4				5	my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
714	4	100			6	if ($description ne '') {
715	3				16	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
716						} else {
717	1				6	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
718						}
719						} else {
720	1				4	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
721						}
722	5				33	$previous_line_state = $_;
723						}
724	9				9	$_ = $initial_line_state;
725						}
726						# This is to make it easier to deal w/ rules:
727	1163				1195	s/^/ /;
728	1163				740	my %unrecognized_line_items = ();
729	1163				513	our $check_homoglyphs;
730	1163	50			768	if ($check_homoglyphs) {
731	1163				652	my $check_line_for_homoglyphs = $_;
732	1163				769	my $homoglyphs = $CheckSpelling::Homoglyph::homoglyphs;
733						# problematic characters: `\\`, `-`, `]`
734	1163				11062	$homoglyphs =~ s/([-\\\]])/\\$1/g;
735	1163				1477	$homoglyphs = "[$homoglyphs]";
736	1163				559	our ($longest_word, $shortest_word);
737	1163	50	33		2042	my $longest_word_string = defined $longest_word && ($longest_word =~ /^\d+$/) ? $longest_word : '';
738	1163				607	my $dollar = '$';
739	1163				1879	my $homoglyph_re = "(?=(?:${homoglyphs}\|(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})){${shortest_word},${longest_word_string}}(?:${not_upper_or_lower_pattern}\|${dollar}))((?:${upper_pattern}\|${lower_pattern})+${homoglyphs}(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs})\|${homoglyphs}+(?:${upper_pattern}\|${lower_pattern})(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs}))";
740	1163				11987	while ($check_line_for_homoglyphs =~ /((?=(?:${homoglyphs}\|(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})){${shortest_word},${longest_word_string}}(?:${not_upper_or_lower_pattern}\|${dollar}))((?:${upper_pattern}\|${lower_pattern})+${homoglyphs}(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs})\|${homoglyphs}+(?:${upper_pattern}\|${lower_pattern})(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs})))/g) {
741	1				2	my ($token, $token_raw, $begin, $end) = ($1, $1, $-[0], $+[0]);
742	1				50	$token =~ s/($homoglyphs)/$CheckSpelling::Homoglyph::homoglyph_to_glyph{$1}/g;
743	1	50			2	if (defined $dictionary{$token}) {
744	1				1	my $token_raw = CheckSpelling::Util::wrap_in_backticks($token_raw);
745	1				1	my $token = CheckSpelling::Util::wrap_in_backticks($token);
746	1				1	my $wrapped = "$token_raw should probably be $token (homoglyph-word)";
747	1				10	print $warnings_fh ":$.:$begin ... $end, Error - $wrapped\n";
748						}
749						}
750						}
751	1163				1157	my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
752	1163				743	$words += $new_words;
753	1163				509	$unrecognized += $new_unrecognized;
754	1163				798	my $line_length = length($raw_line);
755	1163				1839	for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
756	1022				521	my $found_token = 0;
757	1022				512	my $raw_token = $token;
758	1022				539	$token =~ s/'/(?:'\|\x{2019}\|\'\|\')+/g;
759	1022				475	my $before;
760	1022	100 50			1796	if ($token =~ /^$upper_pattern$lower_pattern/) {
761	5				4	$before = '(?<=.)';
762						} elsif ($token =~ /^$upper_pattern/) {
763	0				0	$before = "(?<!$upper_pattern)";
764						} else {
765	1017				662	$before = "(?<=$not_upper_or_lower_pattern)";
766						}
767	1022	50			1308	my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)\|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
768	1022				2294	while ($raw_line =~ /(?:\b\|$before)($token)(?:\b\|$after)/g) {
769	1272				684	$line_flagged = 1;
770	1272				489	$found_token = 1;
771	1272				1895	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
772	1272	50			1216	next unless $match =~ /./;
773	1272				827	print_word_not_in_dictionary($warnings_fh, $begin, $end, $match);
774						}
775	1022	100			1376	unless ($found_token) {
776	3	50	33		30	if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
777	3				5	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
778	3				4	print_word_not_in_dictionary($warnings_fh, $begin, $end, $match);
779						} else {
780	0				0	my $offset = $line_length + 1;
781	0				0	my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
782	0				0	print $warnings_fh ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
783						}
784						}
785						}
786	1163	100	100		1901	if ($line_flagged && $candidates_re) {
787	2				3	$_ = $previous_line_state = $initial_line_state;
788	2 2				25 5	s/($candidates_re)/"="x length($1)/ge;
789	2	50			3	if ($_ ne $initial_line_state) {
790	2				2	$_ = $previous_line_state;
791	2				3	for my $i (0 .. $#candidates_re_list) {
792	4				4	my $candidate_re = $candidates_re_list[$i];
793	4	100	67		28	next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
794	2 2	50			10 5	if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
795	2				4	my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
796	2				5	my $hit = "$.:$begin:$end";
797	2				2	$_ = $previous_line_state;
798	2 2				9 3	my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
799	2				4	$candidates_re_hits[$i] += $replacements;
800	2	50			3	$candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
801	2				5	$_ = $previous_line_state;
802						}
803						}
804						}
805						}
806	1163	50			1009	unless ($disable_minified_file) {
807	1163				986	s/={3,}//g;
808	1163				945	$offset += length;
809	1163				1225	my $ratio = int($offset / $.);
810	1163				714	my $ratio_threshold = 1000;
811	1163	100			3863	if ($ratio > $ratio_threshold) {
812	2				9	skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
813	2				10	last;
814						}
815						}
816						}
817	15	100			25	if ($next_end_marker) {
818	1	50			2	if ($start_marker_line) {
819	1				2	my $wrapped = CheckSpelling::Util::wrap_in_backticks($current_begin_marker);
820	1				5	print $warnings_fh ":$start_marker_line, Warning - Failed to find matching end marker for $wrapped (unclosed-block-ignore-begin)\n";
821						}
822	1				2	my $wrapped = CheckSpelling::Util::wrap_in_backticks($next_end_marker);
823	1				3	print $warnings_fh ":$.:1 ... 1, Warning - Expected to find end block marker $wrapped (unclosed-block-ignore-end)\n";
824						}
825
826	15				114	alarm 0;
827						};
828	15	50			17	if ($@) {
829	0	0			0	die unless $@ eq "alarm\n";
830	0				0	print $warnings_fh ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
831	0				0	skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
832	0				0	return $temp_dir;
833						}
834
835	15				56	close $file_fh;
836	15				238	close $warnings_fh;
837
838	15	100	75		45	if ($unrecognized \|\| @candidates_re_hits \|\| @forbidden_re_hits) {
839	14				458	open(my $stats_fh, '>:utf8', "$temp_dir/stats");
840	14	100 100 100 100			229	print $stats_fh "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
841						", unique: ".(keys %unique).
842						(@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
843						(@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
844						(@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
845						(@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
846						"}";
847	14				231	close $stats_fh;
848	14				339	open(my $unknown_fh, '>:utf8', "$temp_dir/unknown");
849	14 21				56 41	print $unknown_fh map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
850	14				168	close $unknown_fh;
851						}
852
853	15				172	return $temp_dir;
854						}
855
# Entry point: ensure the dictionary is initialized, run split_file() on each
# requested file, and print the value split_file() returned for each of them.
#
# Arguments:
#   $configuration - handed to init() when %dictionary has no entries
#   remaining args - files to process, each passed to split_file()
#
# Output: one line per input file holding split_file()'s return value; all
# lines are printed together once every file has been processed.
sub main {
    my ($configuration, @files) = @_;

    # init() is skipped when the package variable %dictionary already has
    # entries.
    our %dictionary;
    init($configuration) unless %dictionary;

    # Collect every report line first so nothing is printed until all files
    # are done. A named loop variable is used deliberately: split_file()
    # assigns to $_, so the input list must not be aliased to it.
    my @report_lines;
    for my $path (@files) {
        my $report_dir = split_file($path);
        push @report_lines, "$report_dir\n";
    }

    print join '', @report_lines;
}
872
873						1;