File Coverage

File:	lib/CheckSpelling/UnknownWordSplitter.pm
Coverage:	82.7%

line	stmt	bran	cond	sub	time	code
1						#! --perl--
2
3						# ~/bin/w
4						# Search for potentially misspelled words
5						# Output is:
6						# misspellled
7						# woord (WOORD, Woord, woord, woord's)
8						package CheckSpelling::UnknownWordSplitter;
9
10	1 1			1	111537 1	use 5.022;
11	1 1 1			1	1 1 47	use feature 'unicode_strings';
12	1 1 1			1	1 5 7	use strict;
13	1 1 1			1	1 1 20	use warnings;
14	1 1 1			1	1 1 14	no warnings qw(experimental::vlb);
15	1 1 1			1	1 1 1	use utf8;
16	1 1 1			1	10 1 26	use Encode qw/decode_utf8 encode FB_DEFAULT/;
17	1 1 1			1	2 1 25	use File::Basename;
18	1 1 1			1	2 0 15	use Cwd 'abs_path';
19	1 1 1			1	1 1 17	use File::Spec;
20	1 1 1			1	1 1 16	use File::Temp qw/ tempfile tempdir /;
21	1 1 1			1	2 0 26	use File::Path qw/ make_path /;
22	1 1 1			1	306 1 18	use CheckSpelling::Util;
23	1 1 1			1	202 1308 940	use Digest::SHA;
24						our $VERSION='0.1.0';
25
26						my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
27						my $homoglyph_re;
28						my $begin_block_re = '';
29						my @begin_block_list = ();
30						my @end_block_list = ();
31						my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
32						my ($shortest, $longest) = (255, 0);
33						my @forbidden_re_list;
34						my %forbidden_re_descriptions;
35						my @candidates_re_list;
36						my $hunspell_dictionary_path;
37						my @hunspell_dictionaries;
38						my %dictionary = ();
39						my $base_dict;
40						my %unique;
41						my %unique_unrecognized;
42						my ($last_file, $words, $unrecognized) = ('', 0, 0);
43						my ($ignore_next_line_pattern);
44						my ($check_images, $ocr_directory);
45
46						my $disable_flags;
47
48						sub test_re {
49	32			32	21	my ($expression) = @_;
50	32 32				17 209	return eval { qr /$expression/ };
51						}
52
53						sub quote_re {
54	34			34	29	my ($expression) = @_;
55	34	50			26	return $expression if $expression =~ /\?\{/;
56	34				72	$expression =~ s/
57						\G
58						(
59						(?:[^\\]\|\\[^Q])*
60						)
61						(?:
62						\\Q
63						(?:[^\\]\|\\[^E])*
64						(?:\\E)?
65						)?
66						/
67	68	50			111	$1 . (defined($2) ? quotemeta($2) : '')
68						/xge;
69	34				42	return $expression;
70						}
71
72						sub file_to_lists {
73	6			6	3	my ($re) = @_;
74	6				7	my @patterns;
75						my %hints;
76	6				0	my $fh;
77	6	50			58	if (open($fh, '<:utf8', $re)) {
78	6				8	local $/=undef;
79	6				43	my $file=<$fh>;
80	6				20	close $fh;
81	6				2	my $line_number = 0;
82	6				5	my $hint = '';
83	6				22	for (split /\R/, $file) {
84	32				21	++$line_number;
85	32				15	chomp;
86	32	100			41	if (/^#(?:\s(.+)\|)/) {
87	12	100	67		25	$hint = $1 if ($hint eq '' && defined $1);
88	12				9	next;
89						}
90	20	100			21	$hint = '' unless $_ ne '';
91	20	50			17	next if $_ eq '$^';
92	20				14	my $pattern = $_;
93	20	100			49	next unless s/^(.+)/(?:$1)/;
94	13				15	my $quoted = quote_re($1);
95	13	100			14	unless (test_re $quoted) {
96	1				1	my $error = $@;
97	1				64	my $home = dirname(__FILE__);
98	1				27	$error =~ s/$home.*?\.pm line \d+\./$re line $line_number (bad-regex)/;
99	1				14	print STDERR $error;
100	1				4	$_ = '(?:\$^ - skipped because bad-regex)';
101	1				1	$hint = '';
102						}
103	13	100			20	if (defined $hints{$_}) {
104	1				2	my $pattern_length = length $pattern;
105	1				2	my $wrapped = CheckSpelling::Util::wrap_in_backticks($pattern);
106	1				20	print STDERR "$re:$line_number:1 ... $pattern_length, Warning - duplicate pattern: $wrapped (duplicate-pattern)\n";
107	1				2	$_ = '(?:\$^ - skipped because duplicate-pattern on $line_number)';
108						} else {
109	12				15	push @patterns, $_;
110	12				19	$hints{$_} = $hint;
111						}
112	13				19	$hint = '';
113						}
114						}
115
116						return {
117	6				26	patterns => \@patterns,
118						hints => \%hints,
119						};
120						}
121
122						sub file_to_list {
123	5			5	1318	my ($re) = @_;
124	5				8	my $lists = file_to_lists($re);
125
126	5 5				3 18	return @{$lists->{'patterns'}};
127						}
128
129						sub list_to_re {
130	5			5	6	my (@list) = @_;
131	5 11 11	50			1 8 8	@list = map { my $quoted = quote_re($_); test_re($quoted) ? $quoted : '' } @list;
132	5 11				4 12	@list = grep { $_ ne '' } @list;
133	5	50			5	return '$^' unless scalar @list;
134	5				13	return join "\|", (@list);
135						}
136
137						sub not_empty {
138	107			107	85	my ($thing) = @_;
139	107		67		506	return defined $thing && $thing ne '' && $thing =~ /^\d+$/;
140						}
141
142						sub parse_block_list {
143	3			3	2	my ($re) = @_;
144	3				3	my @file;
145	3	50			31	return @file unless (open(my $file_fh, '<:utf8', $re));
146
147	3				4	local $/=undef;
148	3				24	my $file=<$file_fh>;
149	3				4	my $last_line = $.;
150	3				9	close $file_fh;
151	3				10	for (split /\R/, $file) {
152	8	100			11	next if /^#/;
153	5				3	chomp;
154	5				4	s/^\\#/#/;
155	5	50			6	next unless /^./;
156	5				5	push @file, $_;
157						}
158
159	3				3	my $pairs = (0+@file) / 2;
160	3				4	my $true_pairs = $pairs \| 0;
161	3	100			3	unless ($pairs == $true_pairs) {
162	1				1	my $early_warnings = CheckSpelling::Util::get_file_from_env('early_warnings', '/dev/null');
163	1				10	open my $early_warnings_fh, ">>:encoding(UTF-8)", $early_warnings;
164	1				28	print $early_warnings_fh "$re:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n";
165	1				19	close $early_warnings_fh;
166	1				1	my $i = 0;
167	1				2	while ($i < $true_pairs) {
168	0				0	print STDERR "block-delimiter $i S: $file[$i*2]\n";
169	0				0	print STDERR "block-delimiter $i E: $file[$i*2+1]\n";
170	0				0	$i++;
171						}
172	1				11	print STDERR "block-delimiter unmatched S: `$file[$i*2]`\n";
173	1				2	@file = ();
174						}
175
176	3				9	return @file;
177						}
178
179						sub valid_word {
180						# shortest_word is an absolute
181	28			28	17	our ($shortest, $longest, $shortest_word, $longest_word);
182	28	50			35	$shortest = $shortest_word if $shortest_word;
183	28	100 100			27	if ($longest_word) {
184						# longest_word is an absolute
185	26				21	$longest = $longest_word;
186						} elsif (not_empty($longest)) {
187						# we allow for some sloppiness (a couple of stuck keys per word)
188						# it's possible that this should scale with word length
189	1				1	$longest += 2;
190						}
191	28				18	our ($upper_pattern, $lower_pattern, $punctuation_pattern);
192	28 84	100			31 210	my $word_pattern = join '\|', (grep { defined $_ && /./ } ($upper_pattern, $lower_pattern, $punctuation_pattern));
193	28	100			26	$word_pattern = q<\\w\|'> unless $word_pattern;
194	28	50	50		58	if ((defined $shortest && not_empty($longest)) &&
195						($shortest > $longest)) {
196	0				0	$word_pattern = "(?:$word_pattern){3}";
197	0				0	return qr/$word_pattern/;
198						}
199	28	100			35	$shortest = 3 unless defined $shortest;
200	28	100			20	$longest = '' unless not_empty($longest);
201	28				112	$word_match = "(?:$word_pattern){$shortest,$longest}";
202	28				226	return qr/\b$word_match\b/;
203						}
204
205						sub load_dictionary {
206	15			15	2011	my ($dict) = @_;
207	15				7	our ($word_match, $longest, $shortest, $longest_word, $shortest_word, %dictionary);
208	15				13	$longest_word = CheckSpelling::Util::get_val_from_env('INPUT_LONGEST_WORD', undef);
209	15				14	$shortest_word = CheckSpelling::Util::get_val_from_env('INPUT_SHORTEST_WORD', 0);
210	15				10	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
211	15				13	$ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>);
212	15				55	$upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]');
213	15				35	$lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]');
214	15				26	$not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]');
215	15				28	$not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]');
216	15				27	$punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>);
217	15				30	my $homoglyph_list_path = CheckSpelling::Util::get_file_from_env_utf8('homoglyph_list_path', '/dev/null');
218	15	100			99	if (-s $homoglyph_list_path) {
219	1 1 1			1	344 1 3384	use CheckSpelling::Homoglyph;
220	14				21	CheckSpelling::Homoglyph::init($homoglyph_list_path);
221	14				20	my $homoglyphs = $CheckSpelling::Homoglyph::homoglyphs;
222						# problematic characters: `\\`, `-`, `]`
223	14				138	$homoglyphs =~ s/([-\\\]])/\\$1/g;
224	14				36	$homoglyphs = "[$homoglyphs]";
225	14				80	qr/$homoglyphs/;
226	14				18	my $any_char = "(?:$upper_pattern\|$lower_pattern)";
227	14				28	qr/$any_char/;
228	14				15	my $any_char_or_punctuation = "(?:$upper_pattern\|$lower_pattern\|$punctuation_pattern)";
229	14				25	my $homoglyphs_or_any_char_or_punctuation = "$homoglyphs\|$any_char_or_punctuation";
230	14	50			18	my $longest_word_string = defined $longest_word ? $longest_word : '';
231	14				17	my $homoglyphs_or_any_char_or_punctuation_short_to_long = "(?:$homoglyphs_or_any_char_or_punctuation){$shortest_word,$longest_word_string}";
232	14				77	my $not_upper_or_lower_pattern_or_end = "(?:$not_upper_or_lower_pattern\|$))";
233	14				74	our $homoglyph_re = "(?=$homoglyphs_or_any_char_or_punctuation_short_to_long$not_upper_or_lower_pattern_or_end)($any_char+$homoglyphs(?:$any_char_or_punctuation\|$homoglyphs)\|$homoglyphs+$any_char(?:$any_char_or_punctuation\|$homoglyphs))";
234						}
235	15				19	%dictionary = ();
236
237	15				547	open(my $dict_fh, '<:utf8', $dict);
238	15				67	while (!eof($dict_fh)) {
239	53				59	my $word = <$dict_fh>;
240	53				45	chomp $word;
241	53	100			128	next unless $word =~ $word_match;
242	50				44	my $l = length $word;
243	50	100			36	$longest = -1 unless not_empty($longest);
244	50	100			49	$longest = $l if $l > $longest;
245	50	100			41	$shortest = $l if $l < $shortest;
246	50				100	$dictionary{$word}=1;
247						}
248	15				37	close $dict_fh;
249
250	15				14	$word_match = valid_word();
251						}
252
253						sub hunspell_dictionary {
254	3			3	4	my ($dict) = @_;
255	3				6	my $name = $dict;
256	3				7	$name =~ s{/src/index/hunspell/index\.dic$}{};
257	3				18	$name =~ s{.*/}{};
258	3				3	my $aff = $dict;
259	3				3	my $encoding;
260	3				9	$aff =~ s/\.dic$/.aff/;
261	3	50			40	if (open my $aff_fh, '<', $aff) {
262	3				21	while (<$aff_fh>) {
263	0	0			0	next unless /^SET\s+(\S+)/;
264	0	0			0	$encoding = $1 if ($1 !~ /utf-8/i);
265	0				0	last;
266						}
267	3				57	close $aff_fh;
268						}
269						return {
270	3				334	name => $name,
271						dict => $dict,
272						aff => $aff,
273						encoding => $encoding,
274						engine => Text::Hunspell->new($aff, $dict),
275						}
276						}
277
278						sub init {
279	12			12	15715	my ($configuration) = @_;
280	12				9	our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
281	12				9	our ($begin_block_re, @begin_block_list, @end_block_list);
282	12				26	our $sandbox = CheckSpelling::Util::get_file_from_env('sandbox', '');
283	12				13	our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
284	12				15	our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
285	12				10	our %forbidden_re_descriptions;
286	12	100			13	if ($hunspell_dictionary_path) {
287	3				42	our @hunspell_dictionaries = ();
288	1 1 1 1 1 1 1 1 1 3	50		1 1 1	290 1106 21 6 2 14 9 2 17 187	if (eval 'use Text::Hunspell; 1') {
289	3				131	my @hunspell_dictionaries_list = glob("$hunspell_dictionary_path/*.dic");
290	3				8	for my $hunspell_dictionary_file (@hunspell_dictionaries_list) {
291	3				5	push @hunspell_dictionaries, hunspell_dictionary($hunspell_dictionary_file);
292						}
293						} else {
294	0				0	print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
295						}
296						}
297
298	12	100			70	if (-e "$configuration/block-delimiters.list") {
299	3				4	my @block_delimiters = parse_block_list "$configuration/block-delimiters.list";
300	3	100			4	if (@block_delimiters) {
301	2				2	@begin_block_list = ();
302	2				1	@end_block_list = ();
303
304	2				2	while (@block_delimiters) {
305	2				3	my ($begin, $end) = splice @block_delimiters, 0, 2;
306	2				2	push @begin_block_list, $begin;
307	2				3	push @end_block_list, $end;
308						}
309
310	2 2				1 3	$begin_block_re = join '\|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list);
311						}
312						}
313
314	12				20	my (@patterns_re_list, %in_patterns_re_list);
315	12	50			48	if (-e "$configuration/patterns.txt") {
316	0				0	@patterns_re_list = file_to_list "$configuration/patterns.txt";
317	0				0	$patterns_re = list_to_re @patterns_re_list;
318	0 0				0 0	%in_patterns_re_list = map {$_ => 1} @patterns_re_list;
319						} else {
320	12				10	$patterns_re = undef;
321						}
322
323	12	100			38	if (-e "$configuration/forbidden.txt") {
324	1				2	my $forbidden_re_info = file_to_lists "$configuration/forbidden.txt";
325	1 1				1 2	@forbidden_re_list = @{$forbidden_re_info->{'patterns'}};
326	1 1				1 3	%forbidden_re_descriptions = %{$forbidden_re_info->{'hints'}};
327	1				1	$forbidden_re = list_to_re @forbidden_re_list;
328						} else {
329	11				12	$forbidden_re = undef;
330						}
331
332	12	100			46	if (-e "$configuration/candidates.txt") {
333	4				6	@candidates_re_list = file_to_list "$configuration/candidates.txt";
334	4 8 8	50	33		5 8 17	@candidates_re_list = map { my $quoted = quote_re($_); $in_patterns_re_list{$_} \|\| !test_re($quoted) ? '' : $quoted } @candidates_re_list;
335	4				5	$candidates_re = list_to_re @candidates_re_list;
336						} else {
337	8				6	$candidates_re = undef;
338						}
339
340	12				23	our $largest_file = CheckSpelling::Util::get_val_from_env('INPUT_LARGEST_FILE', 1024*1024);
341
342	12				12	my $disable_flags = CheckSpelling::Util::get_file_from_env('INPUT_DISABLE_CHECKS', '');
343	12				10	our $disable_word_collating = $disable_flags =~ /(?:^\|,\|\s)word-collating(?:,\|\s\|$)/;
344	12				6	our $disable_minified_file = $disable_flags =~ /(?:^\|,\|\s)minified-file(?:,\|\s\|$)/;
345	12				9	our $disable_single_line_file = $disable_flags =~ /(?:^\|,\|\s)single-line-file(?:,\|\s\|$)/;
346
347	12				10	our $ignore_next_line_pattern = CheckSpelling::Util::get_file_from_env('INPUT_IGNORE_NEXT_LINE', '');
348	12				10	$ignore_next_line_pattern =~ s/\s+/\|/g;
349
350	12				9	our $check_images = CheckSpelling::Util::get_val_from_env('INPUT_CHECK_IMAGES', '');
351	12				6	$check_images = $check_images =~ /^(?:1\|true)$/i;
352	12	50			9	if ($check_images) {
353	0				0	our $ocr_directory = CheckSpelling::Util::get_file_from_env('ocr_directory', '/tmp/ocr');
354	0	0			0	$ocr_directory = $1 if ($ocr_directory =~ /^(.*)$/);
355						}
356
357	12				9	our $check_file_names = CheckSpelling::Util::get_file_from_env('check_file_names', '');
358
359	12				9	our $use_magic_file = CheckSpelling::Util::get_val_from_env('INPUT_USE_MAGIC_FILE', '');
360
361	12				17	$word_match = valid_word();
362
363	12				24	our $base_dict = CheckSpelling::Util::get_file_from_env('dict', "$configuration/words");
364	12	100			58	$base_dict = '/usr/share/dict/words' unless -e $base_dict;
365	12				16	load_dictionary($base_dict);
366						}
367
368						sub split_line {
369	1161			1161	514	our (%dictionary, $word_match, $disable_word_collating);
370	1161				522	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
371	1161				481	our @hunspell_dictionaries;
372	1161				445	our $shortest;
373	1161				882	my $shortest_threshold = $shortest + 2;
374	1161				624	my $pattern = '.';
375						# $pattern = "(?:$upper_pattern){$shortest,}\|$upper_pattern(?:$lower_pattern){2,}\n";
376
377						# https://www.fileformat.info/info/unicode/char/2019/
378	1161				646	my $rsqm = "\xE2\x80\x99";
379
380	1161				640	my ($words, $unrecognized) = (0, 0);
381	1161				850	my ($line, $unique_ref, $unique_unrecognized_ref, $unrecognized_line_items_ref) = @_;
382	1161				5425	$line =~ s/(?:$rsqm\|'\|'\|\%27\|’\|’\|’\|\\u2019\|\x{2019}\|')+/'/g;
383	1161				2253	$line =~ s/(?:$ignore_pattern)+/ /g;
384	1161				1736	while ($line =~ s/($upper_pattern{2,})($upper_pattern$lower_pattern{2,})/ $1 $2 /g) {}
385	1161				3324	while ($line =~ s/((?:$lower_pattern\|$punctuation_pattern)+)($upper_pattern)/$1 $2/g) {}
386	1161				1411	for my $token (split /\s+/, $line) {
387	3645	100			3421	next unless $token =~ /$pattern/;
388	2485				1901	$token =~ s/^(?:'\|$rsqm)+//g;
389	2485				2477	$token =~ s/(?:'\|$rsqm)+s?$//g;
390	2485				1692	my $raw_token = $token;
391	2485				1383	$token =~ s/^[^Ii]?'+(.*)/$1/;
392	2485				1353	$token =~ s/(.*?)'+$/$1/;
393	2485	100			3858	next unless $token =~ $word_match;
394	2318	100			2239	if (defined $dictionary{$token}) {
395	1038				440	++$words;
396	1038				632	$unique_ref->{$token}=1;
397	1038				854	next;
398						}
399	1280	100			1027	if (@hunspell_dictionaries) {
400	1254				658	my $found = 0;
401	1254				769	for my $hunspell_dictionary (@hunspell_dictionaries) {
402						my $token_encoded = defined $hunspell_dictionary->{'encoding'} ?
403	1254	50			1135	encode($hunspell_dictionary->{'encoding'}, $token) : $token;
404	1254	50			2986	next unless ($hunspell_dictionary->{'engine'}->check($token_encoded));
405	0				0	++$words;
406	0				0	$dictionary{$token} = 1;
407	0				0	$unique_ref->{$token}=1;
408	0				0	$found = 1;
409	0				0	last;
410						}
411	1254	50			1019	next if $found;
412						}
413	1280				1034	my $key = lc $token;
414	1280	100			1121	if (defined $dictionary{$key}) {
415	6				4	++$words;
416	6				6	$unique_ref->{$key}=1;
417	6				8	next;
418						}
419	1274	50			929	unless ($disable_word_collating) {
420	1274				689	$key =~ s/''+/'/g;
421	1274	100			1331	$key =~ s/'[sd]$// unless length $key >= $shortest_threshold;
422						}
423	1274	50			1127	if (defined $dictionary{$key}) {
424	0				0	++$words;
425	0				0	$unique_ref->{$key}=1;
426	0				0	next;
427						}
428	1274				727	++$unrecognized;
429	1274				796	$unique_unrecognized_ref->{$raw_token}=1;
430	1274				1670	$unrecognized_line_items_ref->{$raw_token}=1;
431						}
432	1161				1694	return ($words, $unrecognized);
433						}
434
435						sub skip_file {
436	7			7	27	my ($temp_dir, $reason) = @_;
437	7				227	open(my $skipped_fh, '>:utf8', "$temp_dir/skipped");
438	7				35	print $skipped_fh $reason;
439	7				120	close $skipped_fh;
440						}
441
442						sub maybe_ocr_file {
443	0			0	0	my ($file) = @_;
444	0				0	our $ocr_directory;
445	0				0	my $ocr_file = "$ocr_directory/$file";
446	0				0	$ocr_file =~ /^(.*)$/;
447	0				0	$ocr_file = $1;
448	0				0	my $ocr_source_sha = "$ocr_file.sha1";
449	0				0	$ocr_file = "$ocr_file.txt";
450	0				0	my $sha = Digest::SHA->new(1)->addfile($file, 'b')->hexdigest;
451	0	0	0		0	if (-e $ocr_file &&
452						-e $ocr_source_sha &&
453						open my $source_sha, '<', $ocr_source_sha) {
454	0				0	my $last_sha = <$source_sha>;
455	0				0	close $source_sha;
456	0	0			0	if ($last_sha =~ /(.*)/) {
457	0	0			0	return ($ocr_file, 1) if ($1 eq $sha);
458						}
459						}
460	0				0	my $tesseract = dirname(dirname(dirname(__FILE__)))."/wrappers/run-tesseract";
461	0				0	$ENV{'input'} = $file;
462	0				0	my $text_file = `"$tesseract"`;
463	0				0	delete $ENV{'input'};
464	0	0			0	return ($file, 0) unless defined $text_file;
465	0				0	my $file_converted = 0;
466	0				0	chomp $text_file;
467	0	0			0	if ($text_file =~ /^(.*)$/) {
468	0				0	$text_file = $1;
469	0				0	my $file_size = -s $text_file;
470	0	0			0	if ($file_size > 20) {
471	0				0	$file_converted = 1;
472	0				0	make_path(dirname($ocr_source_sha));
473	0				0	open my $source_sha, '>', $ocr_source_sha;
474	0				0	print $source_sha $sha;
475	0				0	close $source_sha;
476	0				0	rename($text_file, $ocr_file);
477	0				0	$file = $ocr_file;
478						} else {
479	0				0	unlink($text_file);
480						}
481						}
482	0				0	return ($file, $file_converted);
483						}
484
485						sub split_file {
486	18			18	12308	my ($file) = @_;
487						our (
488	18				17	$unrecognized, $shortest, $largest_file, $words,
489						$word_match, %unique, %unique_unrecognized, $forbidden_re,
490						@forbidden_re_list, $patterns_re, %dictionary,
491						$begin_block_re, @begin_block_list, @end_block_list,
492						$candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file,
493						$disable_single_line_file,
494						$ignore_next_line_pattern,
495						$sandbox,
496						$check_images,
497						);
498	18	100			35	$ignore_next_line_pattern = '$^' unless $ignore_next_line_pattern =~ /./;
499
500	18				12	our %forbidden_re_descriptions;
501	18				7	our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
502
503						# https://www.fileformat.info/info/unicode/char/2019/
504	18				13	my $rsqm = "\xE2\x80\x99";
505
506	18				22	my @candidates_re_hits = (0) x scalar @candidates_re_list;
507	18				15	my @candidates_re_lines = (0) x scalar @candidates_re_list;
508	18				14	my @forbidden_re_hits = (0) x scalar @forbidden_re_list;
509	18				30	my @forbidden_re_lines = (0) x scalar @forbidden_re_list;
510	18				42	my $temp_dir = tempdir(DIR=>$sandbox);
511	18	50			3048	print STDERR "checking file: $file\n" if defined $ENV{'DEBUG'};
512	18				460	open(my $name_fh, '>', "$temp_dir/name");
513	18				40	print $name_fh $file;
514	18				250	close $name_fh;
515	18	100	67		211	if (defined readlink($file) &&
516						rindex(File::Spec->abs2rel(abs_path($file)), '../', 0) == 0) {
517	1				2	skip_file($temp_dir, "file only has a single line (out-of-bounds-symbolic-link)\n");
518	1				4	return $temp_dir;
519						}
520	17	100 50			38	if ($use_magic_file) {
521	8	50			13196	if (open(my $file_fh, '-\|',
522						'/usr/bin/file',
523						'-b',
524						'--mime',
525						'-e', 'cdf',
526						'-e', 'compress',
527						'-e', 'csv',
528						'-e', 'elf',
529						'-e', 'json',
530						'-e', 'tar',
531						$file)) {
532	8				31073	my $file_kind = <$file_fh>;
533	8				5014	close $file_fh;
534	8				18	my $file_converted = 0;
535	8	50	33		26	if ($check_images && $file_kind =~ m<^image/>) {
536	0				0	($file, $file_converted) = maybe_ocr_file($file);
537						}
538	8	100	67		199	if ($file_converted == 0 && $file_kind =~ /^(.*?); charset=binary/) {
539	2				35	skip_file($temp_dir, "it appears to be a binary file (`$1`) (binary-file)\n");
540	2				47	return $temp_dir;
541						}
542						}
543						} elsif ($file =~ /\.(?:png\|jpe?g\|gif)$/) {
544	0				0	my $file_converted = 0;
545	0				0	($file, $file_converted) = maybe_ocr_file($file);
546						}
547	15				102	my $file_size = -s $file;
548	15	50			23	if (defined $largest_file) {
549	15	50			17	unless ($check_file_names eq $file) {
550	15	100			19	if ($file_size > $largest_file) {
551	1				3	skip_file($temp_dir, "size `$file_size` exceeds limit `$largest_file` (large-file)\n");
552	1				3	return $temp_dir;
553						}
554						}
555						}
556	14				165	open my $file_fh, '<', $file;
557	14				13	binmode $file_fh;
558	14				10	my $head;
559	14				130	read($file_fh, $head, 4096);
560	14				822	$head =~ s/(?:\r\|\n)+$//;
561	14				59	my $dos_new_lines = () = $head =~ /\r\n/gi;
562	14				35	my $unix_new_lines = () = $head =~ /\n/gi;
563	14				117	my $mac_new_lines = () = $head =~ /\r/gi;
564	14				58	local $/;
565	14	100 100 100	100 100		85	if ($unix_new_lines == 0 && $mac_new_lines == 0) {
566	3				6	$/ = "\n";
567						} elsif ($dos_new_lines >= $unix_new_lines && $dos_new_lines >= $mac_new_lines) {
568	1				7	$/ = "\r\n";
569						} elsif ($mac_new_lines > $unix_new_lines) {
570	2				8	$/ = "\r";
571						} else {
572	8				9	$/ = "\n";
573						}
574	14				33	seek($file_fh, 0, 0);
575	14				21	($words, $unrecognized) = (0, 0);
576	14				39	%unique = ();
577	14				33	%unique_unrecognized = ();
578
579						local $SIG{__WARN__} = sub {
580	0			0	0	my $message = shift;
581	0				0	$message =~ s/> line/> in $file - line/;
582	0				0	chomp $message;
583	0				0	print STDERR "$message\n";
584	14				128	};
585
586	14				394	open(my $warnings_fh, '>:utf8', "$temp_dir/warnings");
587	14				25	our $timeout;
588	14				10	eval {
589	14 0			0	111 0	local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
590	14				42	alarm $timeout;
591
592	14				30	my $ignore_next_line = 0;
593	14				34	my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
594	14				19	my $offset = 0;
595	14				108	LINE: while (<$file_fh>) {
596	1170	100			1154	if ($. == 1) {
597	14	50			16	unless ($disable_minified_file) {
598	14	100	100		56	if ($file_size >= 512 && length($_) == $file_size) {
599	1				13	skip_file($temp_dir, "file only has a single line (single-line-file)\n");
600	1				5	last;
601						}
602						}
603						}
604	1169				2498	$_ = decode_utf8($_, FB_DEFAULT);
605	1169	50			2949	if (/[\x{D800}-\x{DFFF}]/) {
606	0				0	skip_file($temp_dir, "file contains a UTF-16 surrogate -- UTF-16 surrogates are not supported (utf16-surrogate-file)\n");
607	0				0	last;
608						}
609	1169				1527	s/\R$//;
610	1169	100			1029	s/^\x{FEFF}// if $. == 1;
611	1169	100			1170	next unless /./;
612	1168				810	my $raw_line = $_;
613	1168				567	my $parsed_block_markers;
614
615						# hook for custom multiline based text exclusions:
616	1168	100			804	if ($begin_block_re) {
617	1148				553	FIND_END_MARKER: while (1) {
618	1150				981	while ($next_end_marker ne '') {
619	6	100			25	next LINE unless /\Q$next_end_marker\E/;
620	1				5	s/.*?\Q$next_end_marker\E//;
621	1				1	($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
622	1				2	$parsed_block_markers = 1;
623						}
624	1145				1236	my @captured = (/^.*?$begin_block_re/);
625	1145	100			1104	last unless (@captured);
626	2				2	for my $capture (0 .. $#captured) {
627	2	50			3	if ($captured[$capture]) {
628	2				5	($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1");
629	2				12	s/^.*?\Q$begin_block_list[$capture]\E//;
630	2				2	$parsed_block_markers = 1;
631	2				3	next FIND_END_MARKER;
632						}
633						}
634						}
635	1143	100			848	next if $parsed_block_markers;
636						}
637
638	1162				626	my $ignore_this_line = $ignore_next_line;
639	1162				993	$ignore_next_line = ($_ =~ /$ignore_next_line_pattern/);
640	1162	100			859	next if $ignore_this_line;
641
642						# hook for custom line based text exclusions:
643	1161	100			833	if (defined $patterns_re) {
644	2 6				11 8	s/($patterns_re)/"="x length($1)/ge;
645						}
646	1161				710	my $initial_line_state = $_;
647	1161				695	my $previous_line_state = $_;
648	1161				616	my $line_flagged;
649	1161	100			792	if ($forbidden_re) {
650	9 5				58 12	while (s/($forbidden_re)/"="x length($1)/e) {
651	5				4	$line_flagged = 1;
652	5				9	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
653	5				3	my $found_trigger_re;
654	5				7	for my $i (0 .. $#forbidden_re_list) {
655	7				6	my $forbidden_re_singleton = $forbidden_re_list[$i];
656	7				5	my $test_line = $previous_line_state;
657	7 4	100			55 7	if ($test_line =~ s/($forbidden_re_singleton)/"="x length($1)/e) {
658	4	50			4	next unless $test_line eq $_;
659	4				7	my ($begin_test, $end_test, $match_test) = ($-[0] + 1, $+[0] + 1, $1);
660	4	50			4	next unless $begin == $begin_test;
661	4	50			2	next unless $end == $end_test;
662	4	50			5	next unless $match eq $match_test;
663	4				2	$found_trigger_re = $forbidden_re_singleton;
664	4				8	my $hit = "$.:$begin:$end";
665	4				2	$forbidden_re_hits[$i]++;
666	4	100			4	$forbidden_re_lines[$i] = $hit unless $forbidden_re_lines[$i];
667	4				8	last;
668						}
669						}
670	5				4	my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
671	5	100			5	if ($found_trigger_re) {
672	4		100		9	my $description = $forbidden_re_descriptions{$found_trigger_re} \|\| '';
673	4				8	$found_trigger_re =~ s/^$\?:(.*)$$/$1/;
674	4				4	my $quoted_trigger_re = CheckSpelling::Util::truncate_with_ellipsis(CheckSpelling::Util::wrap_in_backticks($found_trigger_re), 99);
675	4	100			4	if ($description ne '') {
676	3				13	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns rule: $description - $quoted_trigger_re (forbidden-pattern)\n";
677						} else {
678	1				5	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry: $quoted_trigger_re (forbidden-pattern)\n";
679						}
680						} else {
681	1				3	print $warnings_fh ":$.:$begin ... $end, Warning - $wrapped matches a line_forbidden.patterns entry (forbidden-pattern)\n";
682						}
683	5				24	$previous_line_state = $_;
684						}
685	9				9	$_ = $initial_line_state;
686						}
687						# This is to make it easier to deal w/ rules:
688	1161				1126	s/^/ /;
689	1161				636	my %unrecognized_line_items = ();
690	1161				467	our $homoglyph_re;
691	1161	50			806	if (defined $homoglyph_re) {
692	1161				668	my $check_line_for_homoglyphs = $_;
693	1161				653	my $homoglyphs = $CheckSpelling::Homoglyph::homoglyphs;
694	1161				10094	while ($check_line_for_homoglyphs =~ /($homoglyph_re)/g) {
695	1				3	my ($token, $token_raw, $begin, $end) = ($1, $1, $-[0] + 1, $+[0] + 1);
696	1				54	$token =~ s/([$homoglyphs])/$CheckSpelling::Homoglyph::homoglyph_to_glyph{$1}/g;
697	1	50			2	if (defined $dictionary{$token}) {
698	1				1	my $token_raw = CheckSpelling::Util::wrap_in_backticks($token_raw);
699	1				1	my $token = CheckSpelling::Util::wrap_in_backticks($token);
700	1				1	my $wrapped = "check $token_raw should probably be $token (homoglyph-word)";
701	1				7	print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
702						}
703						}
704						}
705	1161				1216	my ($new_words, $new_unrecognized) = split_line($_, \%unique, \%unique_unrecognized, \%unrecognized_line_items);
706	1161				682	$words += $new_words;
707	1161				555	$unrecognized += $new_unrecognized;
708	1161				813	my $line_length = length($raw_line);
709	1161				1675	for my $token (sort CheckSpelling::Util::case_biased keys %unrecognized_line_items) {
710	1021				483	my $found_token = 0;
711	1021				536	my $raw_token = $token;
712	1021				510	$token =~ s/'/(?:'\|\x{2019}\|\'\|\')+/g;
713	1021				445	my $before;
714	1021	100 50			1546	if ($token =~ /^$upper_pattern$lower_pattern/) {
715	5				3	$before = '(?<=.)';
716						} elsif ($token =~ /^$upper_pattern/) {
717	0				0	$before = "(?<!$upper_pattern)";
718						} else {
719	1016				661	$before = "(?<=$not_lower_pattern)";
720						}
721	1021	50			1219	my $after = ($token =~ /$upper_pattern$/) ? "(?=$not_upper_or_lower_pattern)\|(?=$upper_pattern$lower_pattern)" : "(?=$not_lower_pattern)";
722	1021				2212	while ($raw_line =~ /(?:\b\|$before)($token)(?:\b\|$after)/g) {
723	1271				571	$line_flagged = 1;
724	1271				526	$found_token = 1;
725	1271				1793	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
726	1271	50			1194	next unless $match =~ /./;
727	1271				901	my $wrapped = CheckSpelling::Util::wrap_in_backticks($match);
728	1271				4935	print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
729						}
730	1021	100			1245	unless ($found_token) {
731	3	50	33		29	if ($raw_line !~ /$token.*$token/ && $raw_line =~ /($token)/) {
732	3				6	my ($begin, $end, $match) = ($-[0] + 1, $+[0] + 1, $1);
733	3				3	my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
734	3				11	print $warnings_fh ":$.:$begin ... $end: $wrapped\n";
735						} else {
736	0				0	my $offset = $line_length + 1;
737	0				0	my $wrapped = CheckSpelling::Util::wrap_in_backticks($raw_token);
738	0				0	print $warnings_fh ":$.:1 ... $offset, Warning - Could not identify whole word $wrapped in line (token-is-substring)\n";
739						}
740						}
741						}
742	1161	100	100		1927	if ($line_flagged && $candidates_re) {
743	2				2	$_ = $previous_line_state = $initial_line_state;
744	2 2				21 5	s/($candidates_re)/"="x length($1)/ge;
745	2	50			2	if ($_ ne $initial_line_state) {
746	2				1	$_ = $previous_line_state;
747	2				3	for my $i (0 .. $#candidates_re_list) {
748	4				4	my $candidate_re = $candidates_re_list[$i];
749	4	100	67		25	next unless $candidate_re =~ /./ && $raw_line =~ /$candidate_re/;
750	2 2	50			7 4	if (($_ =~ s/($candidate_re)/"="x length($1)/e)) {
751	2				3	my ($begin, $end) = ($-[0] + 1, $+[0] + 1);
752	2				4	my $hit = "$.:$begin:$end";
753	2				2	$_ = $previous_line_state;
754	2 2				7 2	my $replacements = ($_ =~ s/($candidate_re)/"="x length($1)/ge);
755	2				2	$candidates_re_hits[$i] += $replacements;
756	2	50			7	$candidates_re_lines[$i] = $hit unless $candidates_re_lines[$i];
757	2				5	$_ = $previous_line_state;
758						}
759						}
760						}
761						}
762	1161	50			916	unless ($disable_minified_file) {
763	1161				903	s/={3,}//g;
764	1161				806	$offset += length;
765	1161				1043	my $ratio = int($offset / $.);
766	1161				617	my $ratio_threshold = 1000;
767	1161	100			3364	if ($ratio > $ratio_threshold) {
768	2				8	skip_file($temp_dir, "average line width ($ratio) exceeds the threshold ($ratio_threshold) (minified-file)\n");
769	2				7	last;
770						}
771						}
772						}
773	14	100			29	if ($next_end_marker) {
774	1	50			2	if ($start_marker_line) {
775	1				1	my $wrapped = CheckSpelling::Util::wrap_in_backticks($current_begin_marker);
776	1				4	print $warnings_fh ":$start_marker_line, Warning - Failed to find matching end marker for $wrapped (unclosed-block-ignore-begin)\n";
777						}
778	1				1	my $wrapped = CheckSpelling::Util::wrap_in_backticks($next_end_marker);
779	1				3	print $warnings_fh ":$.:1 ... 1, Warning - Expected to find end block marker $wrapped (unclosed-block-ignore-end)\n";
780						}
781
782	14				110	alarm 0;
783						};
784	14	50			12	if ($@) {
785	0	0			0	die unless $@ eq "alarm\n";
786	0				0	print $warnings_fh ":$.:1 ... 1, Warning - Could not parse file within time limit (slow-file)\n";
787	0				0	skip_file($temp_dir, "it could not be parsed file within time limit (slow-file)\n");
788	0				0	return $temp_dir;
789						}
790
791	14				58	close $file_fh;
792	14				178	close $warnings_fh;
793
794	14	100	75		32	if ($unrecognized \|\| @candidates_re_hits \|\| @forbidden_re_hits) {
795	13				384	open(my $stats_fh, '>:utf8', "$temp_dir/stats");
796	13	100 100 100 100			197	print $stats_fh "{words: $words, unrecognized: $unrecognized, unknown: ".(keys %unique_unrecognized).
797						", unique: ".(keys %unique).
798						(@candidates_re_hits ? ", candidates: [".(join ',', @candidates_re_hits)."]" : "").
799						(@candidates_re_lines ? ", candidate_lines: [".(join ',', @candidates_re_lines)."]" : "").
800						(@forbidden_re_hits ? ", forbidden: [".(join ',', @forbidden_re_hits)."]" : "").
801						(@forbidden_re_lines ? ", forbidden_lines: [".(join ',', @forbidden_re_lines)."]" : "").
802						"}";
803	13				176	close $stats_fh;
804	13				349	open(my $unknown_fh, '>:utf8', "$temp_dir/unknown");
805	13 20				56 36	print $unknown_fh map { "$_\n" } sort CheckSpelling::Util::case_biased keys %unique_unrecognized;
806	13				145	close $unknown_fh;
807						}
808
809	14				157	return $temp_dir;
810						}
811
812						sub main {
813	4			4	402	my ($configuration, @ARGV) = @_;
814	4				0	our %dictionary;
815	4	100			4	unless (%dictionary) {
816	1				1	init($configuration);
817						}
818
819						# read all input
820	4				5	my @reports;
821
822	4				1	for my $file (@ARGV) {
823	4				4	my $temp_dir = split_file($file);
824	4				8	push @reports, "$temp_dir\n";
825						}
826	4				10	print join '', @reports;
827						}
828
829						1;