File Coverage

File:	lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:	82.0%

line	stmt	bran	cond	sub	time	code
1						#! -*-perl-*-
2
3						# ~/bin/w
4						# Search for potentially misspelled words
5						# Output is:
6						# misspellled
7						# woord (WOORD, Woord, woord, woord's)
8						package CheckSpelling::UnknownWordSplitter;
9
10	1 1			1	114050 3	use 5.022;
11	1 1 1			1	2 1 54	use feature 'unicode_strings';
12	1 1 1			1	2 0 9	use strict;
13	1 1 1			1	3 1 20	use warnings;
14	1 1 1			1	2 0 17	no warnings qw(experimental::vlb);
15	1 1 1			1	2 1 2	use utf8;
16	1 1 1			1	14 2 29	use Encode qw/decode_utf8 encode FB_DEFAULT/;
17	1 1 1			1	2 0 29	use File::Basename;
18	1 1 1			1	2 1 17	use Cwd 'abs_path';
19	1 1 1			1	2 0 20	use File::Spec;
20	1 1 1			1	3 1 21	use File::Temp qw/ tempfile tempdir /;
21	1 1 1			1	2 1 18	use File::Path qw/ make_path /;
22	1 1 1			1	326 1 21	use CheckSpelling::Util;
23	1 1 1			1	219 1231 986	use Digest::SHA;
24						our $VERSION='0.1.0';
25
26						my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
27						my ($check_homoglyphs);
28						my $begin_block_re = '';
29						my @begin_block_list = ();
30						my @end_block_list = ();
31						my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
32						my ($shortest, $longest) = (255, 0);
33						my @forbidden_re_list;
34						my %forbidden_re_descriptions;
35						my @candidates_re_list;
36						my $hunspell_dictionary_path;
37						my @hunspell_dictionaries;
38						my %dictionary = ();
39						my $base_dict;
40						my %unique;
41						my %unique_unrecognized;
42						my ($last_file, $words, $unrecognized) = ('', 0, 0);
43						my ($ignore_next_line_pattern);
44						my ($check_images, $ocr_directory);
45
46						my $disable_flags;
47
48						sub test_re {
49	32			32	25	my ($expression) = @_;
50	32 32				16 208	return eval { qr /$expression/ };
51						}
52
53						sub quote_re {
54	34			34	26	my ($expression) = @_;
55	34	50			33	return $expression if $expression =~ /\?\{/;
56	34				69	$expression =~ s/
57						\G
58						(
59						(?:[^\\]\|\\[^Q])*
60						)
61						(?:
62						\\Q
63						(?:[^\\]\|\\[^E])*
64						(?:\\E)?
65						)?
66						/
67	68	50			112	$1 . (defined($2) ? quotemeta($2) : '')
68						/xge;
69	34				41	return $expression;
70						}
71
72						sub file_to_lists {
73	6			6	7	my ($re) = @_;
74	6				9	my @patterns;
75						my %hints;
76	6				0	my $fh;
77	6	50			56	if (open($fh, '<:utf8', $re)) {
78	6				8	local $/=undef;
79	6				42	my $file=<$fh>;
80	6				16	close $fh;
81	6				5	my $line_number = 0;
82	6				5	my $hint = '';
83	6				27	for (split /\R/, $file) {
84	32				19	++$line_number;
85	32				18	chomp;
86	32	100			38	if (/^#(?:\s(.+)\|)/) {
87	12	100	67		27	$hint = $1 if ($hint eq '' && defined $1);
88	12				10	next;
89						}
90	20	100			22	$hint = '' unless $_ ne '';
91	20	50			21	next if $_ eq '$^';
92	20				17	my $pattern = $_;
93	20	100			48	next unless s/^(.+)/(?:$1)/;
94	13				17	my $quoted = quote_re($1);
95	13	100			17	unless (test_re $quoted) {
96	1				3	my $error = $@;
97	1				46	my $home = dirname(__FILE__);
98	1				27	$error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
99	1				13	print STDERR $error;
100	1				2	$_ = '(?:\$^ - skipped because bad-regex)';
101	1				2	$hint = '';
102						}
103	13	100			19	if (defined $hints{$_}) {
104	1				2	my $pattern_length = length $pattern;
105	1				1	my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
106	1				16	print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
107	1				2	$_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
108						} else {
109	12				12	push @patterns, $_;
110	12				17	$hints{$_} = $hint;
111						}
112	13				20	$hint = '';
113						}
114						}
115
116						return {
117	6				26	patterns => \@patterns,
118						hints => \%hints,
119						};
120						}
121
122						sub file_to_list {
123	5			5	1388	my ($re) = @_;
124	5				10	my $lists = file_to_lists($re);
125
126	5 5				4 18	return @{$lists->{'patterns'}};
127						}
128
129						sub list_to_re {
130	5			5	6	my (@list) = @_;
131	5 11 11	50			6 7 11	@list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
132	5 11				5 13	@list = grep { $_ ne '' } @list;
133	5	50			4	return '$^' unless scalar @list;
134	5				13	return join "\|", (@list);
135						}
136
137						sub not_empty {
138	107			107	85	my ($thing) = @_;
139	107		67		495	return defined $thing && $thing ne '' && $thing =~ /^\d+$/;
140						}
141
142						sub parse_block_list {
143	3			3	3	my ($re) = @_;
144	3				2	my @file;
145	3	50			30	return @file unless (open(my $file_fh, '<:utf8', $re));
146
147	3				6	local $/=undef;
148	3				21	my $file=<$file_fh>;
149	3				6	my $last_line = $.;
150	3				7	close $file_fh;
151	3				11	for (split /\R/, $file) {
152	8	100			11	next if /^#/;
153	5				3	chomp;
154	5				6	s/^\\#/#/;
155	5	50			6	next unless /^./;
156	5				5	push @file, $_;
157						}
158
159	3				4	my $pairs = (0+@file) / 2;
160	3				4	my $true_pairs = $pairs \| 0;
161	3	100			3	unless ($pairs == $true_pairs) {
162	1				1	my $early_warnings = CheckSpelling::Util::get_file_from_env('early_warnings', '/dev/null');
163	1				9	open my $early_warnings_fh, ">>:encoding(UTF-8)", $early_warnings;
164	1				33	print $early_warnings_fh "$re:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n";
165	1				21	close $early_warnings_fh;
166	1				1	my $i = 0;
167	1				2	while ($i < $true_pairs) {
168	0				0	print STDERR "block-delimiter $i S: $file[$i*2]\n";
169	0				0	print STDERR "block-delimiter $i E: $file[$i*2+1]\n";
170	0				0	$i++;
171						}
172	1				10	print STDERR "block-delimiter unmatched S: `$file[$i*2]`\n";
173	1				3	@file = ();
174						}
175
176	3				11	return @file;
177						}
178
179						sub valid_word {
180						# shortest_word is an absolute
181	28			28	18	our ($shortest, $longest, $shortest_word, $longest_word);
182	28	50			31	$shortest = $shortest_word if $shortest_word;
183	28	100 100			28	if ($longest_word) {
184						# longest_word is an absolute
185	26				30	$longest = $longest_word;
186						} elsif (not_empty($longest)) {
187						# we allow for some sloppiness (a couple of stuck keys per word)
188						# it's possible that this should scale with word length
189	1				2	$longest += 2;
190						}
191	28				25	our ($upper_pattern, $lower_pattern, $punctuation_pattern);
192	28 84	100			22 205	my $word_pattern = join '\|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
193	28	100			24	$word_pattern = q<\\w\|'> unless $word_pattern;
194	28	50	50		59	if ((defined $shortest && not_empty($longest)) &&
195						($shortest > $longest)) {
196	0				0	$word_pattern = "(?:$word_pattern){3}";
197	0				0	return qr/$word_pattern/;
198						}
199	28	100			64	$shortest = 3 unless defined $shortest;
200	28	100			23	$longest = '' unless not_empty($longest);
201	28				111	$word_match = "(?:$word_pattern){$shortest,$longest}";
202	28				262	return qr/\b$word_match\b/;
203						}
204
205						sub load_dictionary {
206	15			15	2020	my ($dict) = @_;
207	15				9	our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
208	15				14	$longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
209	15				15	$shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', 0);
210	15				13	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
211	15				15	$ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
212	15				56	$upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
213	15				36	$lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
214	15				29	$not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
215	15				33	$not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
216	15				31	$punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
217	15				36	our $check_homoglyphs = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_HOMOGLYPHS', 0);
218	15	100	67		48	if ($check_homoglyphs && $check_homoglyphs !~ /false/i) {
219	14				15	my $homoglyph_list_path = CheckSpelling::Util::get_file_from_env_utf8('homoglyph_list_path', '/dev/null');
220	14	50			95	if (-s $homoglyph_list_path) {
221	1 1 1			1	348 1 3588	use CheckSpelling::Homoglyph;
222	14				20	CheckSpelling::Homoglyph::init($homoglyph_list_path);
223						} else {
224	0				0	$check_homoglyphs = 0;
225						}
226						} else {
227	1				1	$check_homoglyphs = 0;
228						}
229	15				22	%dictionary = ();
230
231	15				474	open(my $dict_fh, '<:utf8', $dict);
232	15				66	while (!eof($dict_fh)) {
233	53				61	my $word = <$dict_fh>;
234	53				44	chomp $word;
235	53	100			139	next unless $word =~ $word_match;
236	50				42	my $l = length $word;
237	50	100			38	$longest = -1 unless not_empty($longest);
238	50	100			49	$longest = $l if $l > $longest;
239	50	100			51	$shortest = $l if $l < $shortest;
240	50				99	$dictionary{$word}=1;
241						}
242	15				53	close $dict_fh;
243
244	15				17	$word_match = valid_word();
245						}
246
247						sub hunspell_dictionary {
248	3			3	6	my ($dict) = @_;
249	3				6	my $name = $dict;
250	3				4	$name =~ s{/src/index/hunspell/index\.dic$}{};
251	3				11	$name =~ s{.*/}{};
252	3				3	my $aff = $dict;
253	3				2	my $encoding;
254	3				11	$aff =~ s/\.dic$/.aff/;
255	3	50			30	if (open my $aff_fh, '<', $aff) {
256	3				20	while (<$aff_fh>) {
257	0	0			0	next unless /^SET\s+(\S+)/;
258	0	0			0	$encoding = $1 if ($1 !~ /utf-8/i);
259	0				0	last;
260						}
261	3				9	close $aff_fh;
262						}
263						return {
264	3				297	name => $name,
265						dict => $dict,
266						aff => $aff,
267						encoding => $encoding,
268						engine => Text::Hunspell->new($aff, $dict),
269						}
270						}
271
272						sub init {
273	12			12	16007	my ($configuration) = @_;
274	12				13	our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
275	12				11	our ($begin_block_re, @begin_block_list, @end_block_list);
276	12				57	our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
277	12				17	our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
278	12				28	our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
279	12				8	our %forbidden_re_descriptions;
280	12	100			17	if ($hunspell_dictionary_path) {
281	3				36	our @hunspell_dictionaries = ();
282	1 1 1 1 1 1 1 1 1 3	50		1 1 1	278 1056 17 6 1 58 7 3 15 168	if (eval 'use Text::Hunspell; 1') {
283	3				116	my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
284	3				7	for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
285	3				11	push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
286						}
287						} else {
288	0				0	print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
289						}
290						}
291
292	12	100			76	if (-e "$configuration/block-delimiters.list") {
293	3				7	my @block_delimiters = parse_block_list "$configuration/block-delimiters.list";
294	3	100			4	if (@block_delimiters) {
295	2				2	@begin_block_list = ();
296	2				2	@end_block_list = ();
297
298	2				2	while (@block_delimiters) {
299	2				3	my ($begin, $end) = splice @block_delimiters, 0, 2;
300	2				2	push @begin_block_list, $begin;
301	2				2	push @end_block_list, $end;
302						}
303
304	2 2				2 4	$begin_block_re = join '\|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list);
305						}
306						}
307
308	12				13	my (@patterns_re_list, %in_patterns_re_list);
309	12	50			72	if (-e "$configuration/patterns.txt") {
310	0				0	@patterns_re_list = file_to_list "$configuration/patterns.txt";
311	0				0	$patterns_re = list_to_re @patterns_re_list;
312	0 0				0 0	%in_patterns_re_list = map {$_ => 1} @patterns_re_list;
313						} else {
314	12				14	$patterns_re = undef;
315						}
316
317	12	100			78	if (-e "$configuration/forbidden.txt") {
318	1				2	my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
319	1 1				1 2	@forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
320	1 1				1 3	%forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
321	1				2	$forbidden_re = list_to_re @forbidden_re_list;
322						} else {
323	11				52	$forbidden_re = undef;
324						}
325
326	12	100			57	if (-e "$configuration/candidates.txt") {
327	4				8	@candidates_re_list = file_to_list "$configuration/candidates.txt";
328	4 8 8	50	33		4 8 15	@candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} \|\| !test_re($quoted) ? '' : $quoted } @candidates_re_list;
329	4				8	$candidates_re = list_to_re @candidates_re_list;
330						} else {
331	8				15	$candidates_re = undef;
332						}
333
334	12				27	our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
335
336	12				17	my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
337	12				16	our $disable_word_collating = $disable_flags =~ /(?:^\|,\|\s)word-collating(?:,\|\s\|$)/;
338	12				9	our $disable_minified_file = $disable_flags =~ /(?:^\|,\|\s)minified-file(?:,\|\s\|$)/;
339	12				12	our $disable_single_line_file = $disable_flags =~ /(?:^\|,\|\s)single-line-file(?:,\|\s\|$)/;
340
341	12				13	our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
342	12				12	$ignore_next_line_pattern =~ s/\s+/\|/g;
343
344	12				11	our $check_images = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_IMAGES', '');
345	12				9	$check_images = $check_images =~ /^(?:1\|true)$/i;
346	12	50			14	if ($check_images) {
347	0				0	our $ocr_directory = CheckSpelling::Util::get_file_from_env('ocr_directory', '/tmp/ocr');
348	0	0			0	$ocr_directory = $1 if ($ocr_directory =~ /^(.*)$/);
349						}
350
351	12				8	our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
352
353	12				13	our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
354
355	12				14	$word_match = valid_word();
356
357	12				30	our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
358	12	100			60	$base_dict = '/usr/share/dict/words' unless -e $base_dict;
359	12				14	load_dictionary($base_dict);
360						}
361
362						sub split_line {
363	1162			1162	594	our (%dictionary, $word_match, $disable_word_collating);
364	1162				511	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
365	1162				462	our @hunspell_dictionaries;
366	1162				437	our $shortest;
367	1162				864	my $shortest_threshold = $shortest + 2;
368	1162				619	my $pattern = '.';
369						# $pattern = "(?:$upper_pattern){$shortest,}\|$upper_pattern(?:$lower_pattern){2,}\n";
370
371						# https://www.fileformat.info/info/unicode/char/2019/
372	1162				595	my $rsqm = "\xE2\x80\x99";
373
374	1162				661	my ($words, $unrecognized) = (0, 0);
375	1162				848	my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
376	1162				5599	$line =~ s/(?:$rsqm\|'\|'\|\%27\|’\|’\|’\|\\u2019\|\x{2019}\|')+/'/g;
377	1162				2247	$line =~ s/(?:$ignore_pattern)+/ /g;
378	1162				1912	while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
379	1162				3409	while ($line =~ s/((?:$lower_pattern\|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
380	1162				1431	for my $token (split /\s+/, $line) {
381	3652	100			3436	next unless $token =~ /$pattern/;
382	2491				2111	$token =~ s/^(?:'\|$rsqm)+//g;
383	2491				2567	$token =~ s/(?:'\|$rsqm)+s?$//g;
384	2491				1462	my $raw_token = $token;
385	2491				1451	$token =~ s/^[^Ii]?'+//; # need to reconsider for French
386	2491				1339	$token =~ s/'+$//;
387	2491	100			3910	next unless $token =~ $word_match;
388	2323	100			2227	if (defined $dictionary{$token}) {
389	1042				449	++$words;
390	1042				638	$unique_ref->{$token}=1;
391	1042				883	next;
392						}
393	1281	100			1005	if (@hunspell_dictionaries) {
394	1254				679	my $found = 0;
395	1254				805	for my $hunspell_dictionary (@hunspell_dictionaries) {
396						my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
397	1254	50			1102	encode($hunspell_dictionary->{'encoding'}, $token) : $token;
398	1254	50			3016	next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
399	0				0	++$words;
400	0				0	$dictionary{$token} = 1;
401	0				0	$unique_ref->{$token}=1;
402	0				0	$found = 1;
403	0				0	last;
404						}
405	1254	50			959	next if $found;
406						}
407	1281				972	my $key = lc $token;
408	1281	100			1186	if (defined $dictionary{$key}) {
409	6				3	++$words;
410	6				4	$unique_ref->{$key}=1;
411	6				9	next;
412						}
413	1275	50			928	unless ($disable_word_collating) {
414	1275				752	$key =~ s/''+/'/g;
415	1275	100			1266	$key =~ s/'[sd]$// if length $key >= $shortest_threshold;
416						}
417	1275	50			1124	if (defined $dictionary{$key}) {
418	0				0	++$words;
419	0				0	$unique_ref->{$key}=1;
420	0				0	next;
421						}
422	1275				638	++$unrecognized;
423	1275				878	$unique_unrecognized_ref->{$raw_token}=1;
424	1275				1726	$unrecognized_line_items_ref->{$raw_token}=1;
425						}
426	1162				1690	return ($words, $unrecognized);
427						}
428
429						sub skip_file {
430	7			7	23	my ($temp_dir, $reason) = @_;
431	7				233	open(my $skipped_fh, '>:utf8', "$temp_dir/skipped");
432	7				41	print $skipped_fh $reason;
433	7				124	close $skipped_fh;
434						}
435
436						sub maybe_ocr_file {
437	0			0	0	my ($file) = @_;
438	0				0	our $ocr_directory;
439	0				0	my $ocr_file = "$ocr_directory/$file";
440	0				0	$ocr_file =~ /^(.*)$/;
441	0				0	$ocr_file = $1;
442	0				0	my $ocr_source_sha = "$ocr_file.sha1";
443	0				0	$ocr_file = "$ocr_file.txt";
444	0				0	my $sha = Digest::SHA->new(1)->addfile($file, 'b')->hexdigest;
445	0	0	0		0	if (-e $ocr_file &&
446						-e $ocr_source_sha &&
447						open my $source_sha, '<', $ocr_source_sha) {
448	0				0	my $last_sha = <$source_sha>;
449	0				0	close $source_sha;
450	0	0			0	if ($last_sha =~ /(.*)/) {
451	0	0			0	return ($ocr_file, 1) if ($1 eq $sha);
452						}
453						}
454	0				0	my $tesseract = dirname(dirname(dirname(__FILE__)))."/wrappers/run-tesseract";
455	0				0	$ENV{'input'} = $file;
456	0				0	my $text_file = `"$tesseract"`;
457	0				0	delete $ENV{'input'};
458	0	0			0	return ($file, 0) unless defined $text_file;
459	0				0	my $file_converted = 0;
460	0				0	chomp $text_file;
461	0	0			0	if ($text_file =~ /^(.*)$/) {
462	0				0	$text_file = $1;
463	0		0		0	my $file_size = -s $text_file \|\| 0;
464	0	0			0	if ($file_size > 20) {
465	0				0	$file_converted = 1;
466	0				0	make_path(dirname($ocr_source_sha));
467	0				0	open my $source_sha, '>', $ocr_source_sha;
468	0				0	print $source_sha $sha;
469	0				0	close $source_sha;
470	0				0	rename($text_file, $ocr_file);
471	0				0	$file = $ocr_file;
472						} else {
473	0				0	unlink($text_file);
474						}
475						}
476	0				0	return ($file, $file_converted);
477						}
478
479						sub split_file {
480	19			19	13517	my ($file) = @_;
481						our (
482	19				12	$unrecognized, $shortest, $largest_file, $words,
483						$word_match, %unique, %unique_unrecognized, $forbidden_re,
484						@forbidden_re_list, $patterns_re, %dictionary,
485						$begin_block_re, @begin_block_list, @end_block_list,
486						$candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
487						$disable_single_line_file,
488						$ignore_next_line_pattern,
489						$sandbox,
490						$check_images,
491						);
492	19	100			49	$ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
493
494	19				12	our %forbidden_re_descriptions;
495	19				7	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
496
497						# https://www.fileformat.info/info/unicode/char/2019/
498	19				17	my $rsqm = "\xE2\x80\x99";
499
500	19				24	my @candidates_re_hits = (0) x scalar @candidates_re_list;
501	19				17	my @candidates_re_lines = (0) x scalar @candidates_re_list;
502	19				21	my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
503	19				22	my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
504	19				41	my $temp_dir = tempdir(DIR=>$sandbox);
505	19	50			3240	print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
506	19				472	open(my $name_fh, '>', "$temp_dir/name");
507	19				57	print $name_fh $file;
508	19				247	close $name_fh;
509	19	100	67		194	if (defined readlink($file) &&
510						rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
511	1				2	skip_file($temp_dir, "symbolic link points outside repository (out-of-bounds-symbolic-link)\n");
512	1				6	return $temp_dir;
513						}
514	18	100 50			35	if ($use_magic_file) {
515	8	50			13343	if (open(my $file_fh, '-\|',
516						'/usr/bin/file',
517						'-b',
518						'--mime',
519						'-e', 'cdf',
520						'-e', 'compress',
521						'-e', 'csv',
522						'-e', 'elf',
523						'-e', 'json',
524						'-e', 'tar',
525						$file)) {
526	8				31432	my $file_kind = <$file_fh>;
527	8				5554	close $file_fh;
528	8				11	my $file_converted = 0;
529	8	50	33		22	if ($check_images && $file_kind =~ m<^image/(?!svg)>) {
530	0				0	($file, $file_converted) = maybe_ocr_file($file);
531						}
532	8	100	67		166	if ($file_converted == 0 && $file_kind =~ /^(.*?); charset=binary/) {
533	2				31	skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
534	2				41	return $temp_dir;
535						}
536						}
537						} elsif ($file =~ /\.(?:png\|jpe?g\|gif)$/) {
538	0				0	my $file_converted = 0;
539	0				0	($file, $file_converted) = maybe_ocr_file($file);
540						}
541	16				81	my $file_size = -s $file;
542	16	50			21	if (defined $largest_file) {
543	16	50			17	unless ($check_file_names eq $file) {
544	16	100			19	if ($file_size > $largest_file) {
545	1				2	skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
546	1				3	return $temp_dir;
547						}
548						}
549						}
550	15				154	open my $file_fh, '<', $file;
551	15				16	binmode $file_fh;
552	15				10	my $head;
553	15				117	read($file_fh, $head, 4096);
554	15				799	$head =~ s/(?:\r\|\n)+$//;
555	15				58	my $dos_new_lines = () = $head =~ /\r\n/gi;
556	15				36	my $unix_new_lines = () = $head =~ /\n/gi;
557	15				119	my $mac_new_lines = () = $head =~ /\r/gi;
558	15				59	local $/;
559	15	100 100 100	100 100		82	if ($unix_new_lines == 0 && $mac_new_lines == 0) {
560	3				5	$/ = "\n";
561						} elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
562	1				5	$/ = "\r\n";
563						} elsif ($mac_new_lines > $unix_new_lines) {
564	2				8	$/ = "\r";
565						} else {
566	9				12	$/ = "\n";
567						}
568	15				29	seek($file_fh, 0, 0);
569	15				21	($words, $unrecognized) = (0, 0);
570	15				38	%unique = ();
571	15				29	%unique_unrecognized = ();
572
573						local $SIG{__WARN__} = sub {
574	0			0	0	my $message = shift;
575	0				0	$message =~ s/> line/> in $file - line/;
576	0				0	chomp $message;
577	0				0	print STDERR "$message\n";
578	15				164	};
579
580	15				415	open(my $warnings_fh, '>:utf8', "$temp_dir/warnings");
581	15				11	our $timeout;
582	15				13	eval {
583	15 0			0	110 0	local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
584	15				44	alarm $timeout;
585
586	15				15	my $ignore_next_line = 0;
587	15				25	my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
588	15				17	my $offset = 0;
589	15				110	LINE: while (<$file_fh>) {
590	1172	100			1098	if ($. == 1) {
591	15	50			18	unless ($disable_minified_file) {
592	15	100	100		61	if ($file_size >= 512 && length($_) == $file_size) {
593	1				8	skip_file($temp_dir, "file only has a single line (single-line-file)\n");
594	1				4	last;
595						}
596						}
597	14				30	s/^\x{FEFF}//;
598						}
599	1171				2572	$_ = decode_utf8($_, FB_DEFAULT);
600	1171	50			2866	if (/[\x{D800}-\x{DFFF}]/) {
601	0				0	skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
602	0				0	last;
603						}
604	1171				1499	s/\R$//;
605	1171	100			1219	next unless /./;
606	1169				793	my $raw_line = $_;
607	1169				641	my $parsed_block_markers;
608
609						# hook for custom multiline based text exclusions:
610	1169	100			870	if ($begin_block_re) {
611	1148				550	FIND_END_MARKER: while (1) {
612	1150				1023	while ($next_end_marker ne '') {
613	6	100			24	next LINE unless /\Q$next_end_marker\E/;
614	1				5	s/.*?\Q$next_end_marker\E//;
615	1				2	($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
616	1				1	$parsed_block_markers = 1;
617						}
618	1145				1161	my @captured = (/^.*?$begin_block_re/);
619	1145	100			1080	last unless (@captured);
620	2				2	for my $capture (0 .. $#captured) {
621	2	50			3	if ($captured[$capture]) {
622	2				6	($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1");
623	2				12	s/^.*?\Q$begin_block_list[$capture]\E//;
624	2				2	$parsed_block_markers = 1;
625	2				3	next FIND_END_MARKER;
626						}
627						}
628						}
629	1143	100			823	next if $parsed_block_markers;
630						}
631
632	1163				672	my $ignore_this_line = $ignore_next_line;
633	1163				1103	$ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
634	1163	100			801	next if $ignore_this_line;
635
636						# hook for custom line based text exclusions:
637	1162	100			801	if (defined $patterns_re) {
638	2 6				12 8	s/($patterns_re)/"="x length($1)/ge;
639						}
640	1162				680	my $initial_line_state = $_;
641	1162				725	my $previous_line_state = $_;
642	1162				546	my $line_flagged;
643	1162	100			872	if ($forbidden_re) {
644	9 5				64 11	while (s/($forbidden_re)/"="x length($1)/e) {
645	5				5	$line_flagged = 1;
646	5				9	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
647	5				6	my $found_trigger_re;
648	5				6	for my $i (0 .. $#forbidden_re_list) {
649	7				6	my $forbidden_re_singleton = $forbidden_re_list[$i];
650	7				3	my $test_line = $previous_line_state;
651	7 4	100			83 8	if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
652	4	50			4	next unless $test_line eq $_;
653	4				9	my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
654	4	50			3	next unless $begin == $begin_test;
655	4	50			4	next unless $end == $end_test;
656	4	50			5	next unless $match eq $match_test;
657	4				3	$found_trigger_re = $forbidden_re_singleton;
658	4				9	my $hit = "$.:$begin:$end";
659	4				3	$forbidden_re_hits[$i]++;
660	4	100			6	$forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
661	4				6	last;
662						}
663						}
664	5				7	my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
665	5	100			5	if ($found_trigger_re) {
666	4		100		9	my $description = $forbidden_re_descriptions{$found_trigger_re} \|\| '';
667	4				10	$found_trigger_re =~ s/^$\?:(.*)$$/$1/;
668	4				2	my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
669	4	100			5	if ($description ne '') {
670	3				14	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
671						} else {
672	1				6	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
673						}
674						} else {
675	1				4	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
676						}
677	5				26	$previous_line_state = $_;
678						}
679	9				7	$_ = $initial_line_state;
680						}
681						# This is to make it easier to deal w/ rules:
682	1162				1155	s/^/ /;
683	1162				657	my %unrecognized_line_items = ();
684	1162				519	our $check_homoglyphs;
685	1162	50			728	if ($check_homoglyphs) {
686	1162				672	my $check_line_for_homoglyphs = $_;
687	1162				781	my $homoglyphs = $CheckSpelling::Homoglyph::homoglyphs;
688						# problematic characters: `\\`, `-`, `]`
689	1162				11031	$homoglyphs =~ s/([-\\\]])/\\$1/g;
690	1162				1490	$homoglyphs = "[$homoglyphs]";
691	1162				560	our ($longest_word, $shortest_word);
692	1162	50	33		2029	my $longest_word_string = defined $longest_word && ($longest_word =~ /^\d+$/) ? $longest_word : '';
693	1162				641	my $dollar = '$';
694	1162				1705	my $homoglyph_re = "(?=(?:${homoglyphs}\|(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})){${shortest_word},${longest_word_string}}(?:${not_upper_or_lower_pattern}\|${dollar}))((?:${upper_pattern}\|${lower_pattern})+${homoglyphs}(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs})\|${homoglyphs}+(?:${upper_pattern}\|${lower_pattern})(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs}))";
695	1162				11345	while ($check_line_for_homoglyphs =~ /((?=(?:${homoglyphs}\|(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})){${shortest_word},${longest_word_string}}(?:${not_upper_or_lower_pattern}\|${dollar}))((?:${upper_pattern}\|${lower_pattern})+${homoglyphs}(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs})\|${homoglyphs}+(?:${upper_pattern}\|${lower_pattern})(?:(?:${upper_pattern}\|${lower_pattern}\|${punctuation_pattern})\|${homoglyphs})))/g) {
696	1				2	my ($token, $token_raw, $begin, $end) = ($1, $1, $-[0], $+[0]);
697	1				49	$token =~ s/($homoglyphs)/$CheckSpelling::Homoglyph::homoglyph_to_glyph{$1}/g;
698	1	50			2	if (defined $dictionary{$token}) {
699	1				1	my $token_raw = CheckSpelling::Util::wrap_in_backticks($token_raw);
700	1				2	my $token = CheckSpelling::Util::wrap_in_backticks($token);
701	1				1	my $wrapped = "$token_raw should probably be $token (homoglyph-word)";
702	1				9	print $warnings_fh ":$.:$begin ... $end, Error - $wrapped\n";
703						}
704						}
705						}
706	1162				1126	my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
707	1162				726	$words += $new_words;
708	1162				559	$unrecognized += $new_unrecognized;
709	1162				833	my $line_length = length($raw_line);
710	1162				1762	for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
711	1022				523	my $found_token = 0;
712	1022				487	my $raw_token = $token;
713	1022				555	$token =~ s/'/(?:'\|\x{2019}\|\'\|\')+/g;
714	1022				456	my $before;
715	1022	100 50			1770	if ($token =~ /^$upper_pattern$lower_pattern/) {
716	5				5	$before = '(?<=.)';
717						} elsif ($token =~ /^$upper_pattern/) {
718	0				0	$before = "(?<!$upper_pattern)";
719						} else {
720	1017				675	$before = "(?<=$not_upper_or_lower_pattern)";
721						}
722	1022	50			1303	my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)\|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
723	1022				2205	while ($raw_line =~ /(?:\b\|$before)($token)(?:\b\|$after)/g) {
724	1272				632	$line_flagged = 1;
725	1272				521	$found_token = 1;
726	1272				1852	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
727	1272	50			1220	next unless $match =~ /./;
728	1272				939	my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
729	1272				5068	print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
730						}
731	1022	100			1157	unless ($found_token) {
732	3	50	33		36	if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
733	3				8	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
734	3				2	my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
735	3				44	print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
736						} else {
737	0				0	my $offset = $line_length + 1;
738	0				0	my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
739	0				0	print $warnings_fh ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
740						}
741						}
742						}
743	1162	100	100		2045	if ($line_flagged && $candidates_re) {
744	2				2	$_ = $previous_line_state = $initial_line_state;
745	2 2				25 5	s/($candidates_re)/"="x length($1)/ge;
746	2	50			4	if ($_ ne $initial_line_state) {
747	2				2	$_ = $previous_line_state;
748	2				3	for my $i (0 .. $#candidates_re_list) {
749	4				4	my $candidate_re = $candidates_re_list[$i];
750	4	100	67		27	next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
751	2 2	50			10 4	if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
752	2				3	my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
753	2				6	my $hit = "$.:$begin:$end";
754	2				2	$_ = $previous_line_state;
755	2 2				7 3	my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
756	2				2	$candidates_re_hits[$i] += $replacements;
757	2	50			3	$candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
758	2				5	$_ = $previous_line_state;
759						}
760						}
761						}
762						}
763	1162	50			882	unless ($disable_minified_file) {
764	1162				976	s/={3,}//g;
765	1162				863	$offset += length;
766	1162				1128	my $ratio = int($offset / $.);
767	1162				635	my $ratio_threshold = 1000;
768	1162	100			3381	if ($ratio > $ratio_threshold) {
769	2				9	skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
770	2				7	last;
771						}
772						}
773						}
774	15	100			24	if ($next_end_marker) {
775	1	50			2	if ($start_marker_line) {
776	1				2	my $wrapped = CheckSpelling::Util::wrap_in_backticks($current_begin_marker);
777	1				4	print $warnings_fh ":$start_marker_line, Warning - Failed to find matching end marker for $wrapped (unclosed-block-ignore-begin)\n";
778						}
779	1				1	my $wrapped = CheckSpelling::Util::wrap_in_backticks($next_end_marker);
780	1				3	print $warnings_fh ":$.:1 ... 1, Warning - Expected to find end block marker $wrapped (unclosed-block-ignore-end)\n";
781						}
782
783	15				139	alarm 0;
784						};
785	15	50			18	if ($@) {
786	0	0			0	die unless $@ eq "alarm\n";
787	0				0	print $warnings_fh ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
788	0				0	skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
789	0				0	return $temp_dir;
790						}
791
792	15				56	close $file_fh;
793	15				244	close $warnings_fh;
794
795	15	100	75		40	if ($unrecognized \|\| @candidates_re_hits \|\| @forbidden_re_hits) {
796	14				432	open(my $stats_fh, '>:utf8', "$temp_dir/stats");
797	14	100 100 100 100			212	print $stats_fh "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
798						", unique: ".(keys %unique).
799						(@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
800						(@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
801						(@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
802						(@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
803						"}";
804	14				183	close $stats_fh;
805	14				394	open(my $unknown_fh, '>:utf8', "$temp_dir/unknown");
806	14 21				55 35	print $unknown_fh map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
807	14				161	close $unknown_fh;
808						}
809
810	15				161	return $temp_dir;
811						}
812
# Entry point: process each listed file and print where its results live.
#
# Arguments:
#   $configuration - passed through to init() when initialization is needed
#   remaining args - the files to hand to split_file(), in order
#
# init() is only invoked when the package-level %dictionary is still empty,
# so repeated calls to main() do not re-run initialization once it has been
# populated.
#
# Output: for every file, the value returned by split_file() followed by a
# newline. Nothing is printed until every file has been processed.
sub main {
  my $configuration = shift;
  my @files = @_;

  # Package variable (not a lexical) used as the "already initialized" flag.
  our %dictionary;
  init($configuration) unless %dictionary;

  # Collect the per-file report lines first; emit them in one go at the end.
  my $report = '';
  for my $path (@files) {
    my $result_dir = split_file($path);
    $report .= "$result_dir\n";
  }
  print $report;
}
829
830						1;