| File: | lib/CheckSpelling/Homoglyph.pm |
| Coverage: | 79.2% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #! -*-perl-*- | |||||
| 2 | ||||||
| 3 | package CheckSpelling::Homoglyph; | |||||
| 4 | ||||||
| 5 | our $VERSION='0.1.0'; | |||||
| 6 | our $flatten=0; | |||||
| 7 | ||||||
| 8 | 1 1 1 | 2 1 2 | use utf8; | |||
| 9 | 1 1 1 | 15 1 260 | use CheckSpelling::Util; | |||
| 10 | ||||||
| 11 | my %homoglyph_map; | |||||
| 12 | my $homoglyphs; | |||||
| 13 | ||||||
| 14 | my %homoglyph_to_glyph; | |||||
| 15 | ||||||
| 16 | sub init { | |||||
| 17 | 14 | 11 | my ($file) = @_; | |||
| 18 | 1 1 1 14 | 2 1 11 190 | return unless open (my $fh, '<:encoding(UTF-8)', $file); | |||
| 19 | 14 | 780 | my $ignore_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_IGNORE_PATTERN', q<[^a-zA-Z']>); | |||
| 20 | 14 | 36 | my $upper_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_UPPER_PATTERN', '[A-Z]'); | |||
| 21 | 14 | 25 | my $lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_LOWER_PATTERN', '[a-z]'); | |||
| 22 | 14 | 22 | my $not_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_LOWER_PATTERN', '[^a-z]'); | |||
| 23 | 14 | 23 | my $not_upper_or_lower_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_NOT_UPPER_OR_LOWER_PATTERN', '[^A-Za-z]'); | |||
| 24 | 14 | 23 | my $punctuation_pattern = CheckSpelling::Util::get_file_from_env_utf8('INPUT_PUNCTUATION_PATTERN', q<'>); | |||
| 25 | 14 | 35 | local $/ = "\n"; | |||
| 26 | 14 | 105 | while (<$fh>) { | |||
| 27 | 686 | 825 | next if /^#/; | |||
| 28 | 686 | 324 | s/^\\//; | |||
| 29 | 686 | 642 | next unless /(.)(.+)/; | |||
| 30 | 686 | 588 | my ($expected, $aliases) = ($1, $2); | |||
| 31 | 686 | 748 | my @chars = split('', $aliases); | |||
| 32 | 686 | 780 | next unless $expected =~ /$upper_pattern|$lower_pattern|$punctuation_pattern/; | |||
| 33 | 686 | 682 | next unless $expected =~ /$upper_pattern|$lower_pattern/; | |||
| 34 | 686 | 333 | my @unexpected_chars; | |||
| 35 | 686 | 349 | for my $ch (@chars) { | |||
| 36 | 16198 | 14473 | if ($ch =~ /$ignore_pattern/) { | |||
| 37 | 16198 | 6454 | my %ref; | |||
| 38 | 16198 | 6143 | my $known_aliases = \%ref; | |||
| 39 | 16198 | 10348 | if (defined $homoglyph_map{$ch}) { | |||
| 40 | 15041 | 7088 | $known_aliases = $homoglyph_map{$ch}; | |||
| 41 | } | |||||
| 42 | 16198 | 7933 | $known_aliases->{$expected} = 1; | |||
| 43 | 16198 | 12311 | $homoglyph_map{$ch} = $known_aliases; | |||
| 44 | } | |||||
| 45 | } | |||||
| 46 | } | |||||
| 47 | 14 | 69 | close $fh; | |||
| 48 | 14 | 4314 | my @glyphs = sort keys %homoglyph_map; | |||
| 49 | 14 | 445 | our $homoglyphs = join '', @glyphs; | |||
| 50 | 14 | 8 | our %homoglyph_to_glyph; | |||
| 51 | 14 | 9 | for my $ch (@glyphs) { | |||
| 52 | 16198 16198 | 5986 9531 | my @known_aliases = keys %{$homoglyph_map{$ch}}; | |||
| 53 | 16198 | 8272 | if (scalar @known_aliases == 1) { | |||
| 54 | 16198 | 10608 | $homoglyph_to_glyph{$ch} = $known_aliases[0]; | |||
| 55 | } else { | |||||
| 56 | 0 | $homoglyph_to_glyph{$ch} = $ch; | ||||
| 57 | } | |||||
| 58 | } | |||||
| 59 | } | |||||
| 60 | ||||||
| 61 | sub dump_aliases { | |||||
| 62 | 0 | my $a; | ||||
| 63 | 0 | for my $ch (keys %homoglyph_map) { | ||||
| 64 | 0 | $a = $ch; | ||||
| 65 | 0 | my $b = $homoglyph_map{$a}; | ||||
| 66 | 0 | print "$a: ".(join " ", (sort keys %$b))."\n"; | ||||
| 67 | } | |||||
| 68 | 0 | print "\n"."homoglyphs: $homoglyphs\n"; | ||||
| 69 | } | |||||
| 70 | 1; | |||||