| File: | lib/CheckSpelling/SuggestExcludes.pm |
| Coverage: | 88.1% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #! -*-perl-*- | |||||
| 2 | ||||||
| 3 | package CheckSpelling::SuggestExcludes; | |||||
| 4 | ||||||
| 5 | 1 1 | 53702 2 | use 5.022; | |||
| 6 | 1 1 1 | 2 1 42 | use feature 'unicode_strings'; | |||
| 7 | 1 1 1 | 2 1 1143 | use CheckSpelling::Util; | |||
| 8 | ||||||
| 9 | my (%extensions, %files, %directories, %rooted_paths, %tailed_paths); | |||||
| 10 | my @patterns; | |||||
| 11 | my @not_hit_patterns; | |||||
| 12 | ||||||
| 13 | sub get_extension { | |||||
| 14 | 46 | 21 | my ($path) = @_; | |||
| 15 | 46 | 43 | $path =~ s!.*/!!; | |||
| 16 | 46 | 44 | return '' unless $path =~ s/.+\././; | |||
| 17 | 8 | 8 | return $path; | |||
| 18 | } | |||||
| 19 | ||||||
| 20 | sub path_to_pattern { | |||||
| 21 | 1 | 2 | my ($path) = @_; | |||
| 22 | 1 | 2 | $path =~ s/^/^\\Q/; | |||
| 23 | 1 | 1 | $path =~ s/$/\\E\$/; | |||
| 24 | 1 | 1 | return $path; | |||
| 25 | } | |||||
| 26 | ||||||
| 27 | sub process_path { | |||||
| 28 | 46 | 30 | my ($path) = @_; | |||
| 29 | 46 | 16 | our $baseline; | |||
| 30 | 46 | 21 | my $extension = get_extension($_); | |||
| 31 | 46 | 38 | ++$baseline->{'extensions'}{$extension} if $extension ne ''; | |||
| 32 | 46 | 37 | my @directories = split m{/}, $_; | |||
| 33 | 46 | 30 | my $file = pop @directories; | |||
| 34 | 46 | 29 | ++$baseline->{'files'}{$file}; | |||
| 35 | 46 | 63 | return unless @directories; | |||
| 36 | 41 | 13 | my @path_elements; | |||
| 37 | 41 | 26 | for my $directory (@directories) { | |||
| 38 | 44 | 18 | ++$baseline->{'directories'}{$directory}; | |||
| 39 | 44 | 35 | push @path_elements, $directory; | |||
| 40 | 44 | 40 | ++$baseline->{'rooted_paths'}{join '/', @path_elements}; | |||
| 41 | } | |||||
| 42 | 41 | 15 | shift @path_elements; | |||
| 43 | 41 | 68 | while (@path_elements) { | |||
| 44 | 3 | 2 | ++$baseline->{'tailed_paths'}{join '/', @path_elements}; | |||
| 45 | 3 | 6 | shift @path_elements; | |||
| 46 | } | |||||
| 47 | } | |||||
| 48 | ||||||
| 49 | sub quote_regex { | |||||
| 50 | 10 | 2 | my ($pattern) = @_; | |||
| 51 | 10 | 11 | return quotemeta($pattern); | |||
| 52 | } | |||||
| 53 | ||||||
| 54 | sub maybe_quote_regex { | |||||
| 55 | 6 | 2 | my ($i) = @_; | |||
| 56 | 6 | 6 | my $quoted = quote_regex($i); | |||
| 57 | 6 | 6 | $quoted =~ s<\\([-/])><$1>g; | |||
| 58 | 6 | 8 | return $i if $i eq $quoted; | |||
| 59 | 3 | 1 | my $slashed = $i; | |||
| 60 | 3 | 4 | if (($slashed =~ s<\.><\\.>) == 1){ | |||
| 61 | 3 | 12 | return $slashed if $slashed eq $quoted; | |||
| 62 | } | |||||
| 63 | 1 | 3 | return "\\Q$i\\E"; | |||
| 64 | } | |||||
| 65 | ||||||
| 66 | sub build_patterns { | |||||
| 67 | 4 | 3 | my ($template, $key, $use_threshold, $suggest_threshold) = @_; | |||
| 68 | 4 | 2 | our ($baseline, $totals); | |||
| 69 | 4 | 13 | return unless defined $baseline->{$key} && defined $totals->{$key}; | |||
| 70 | 3 3 | 2 5 | my %hash_baseline = %{$baseline->{$key}}; | |||
| 71 | 3 3 | 3 4 | my %hash_totals = %{$totals->{$key}}; | |||
| 72 | 3 | 2 | our @patterns; | |||
| 73 | 3 | 2 | my @results; | |||
| 74 | 3 | 1 | my $joined_patterns = '\$^'; | |||
| 75 | 3 | 3 | for my $i (keys %hash_baseline) { | |||
| 76 | 19 | 56 | next if $i =~ /$joined_patterns/; | |||
| 77 | 18 | 13 | my ($hits, $total) = ($hash_baseline{$i}, $hash_totals{$i}); | |||
| 78 | 18 | 17 | next if $hits == 1 || $total == 0; | |||
| 79 | 5 | 3 | my $ratio = $hits / $total; | |||
| 80 | 5 | 5 | next if $ratio < $suggest_threshold; | |||
| 81 | 3 | 1 | my $entry = $template; | |||
| 82 | 3 | 3 | my $value = maybe_quote_regex($i); | |||
| 83 | 3 | 5 | $entry =~ s/\n/$value/; | |||
| 84 | 3 | 4 | if ($ratio == 1) { | |||
| 85 | 2 | 2 | push @results, $value; | |||
| 86 | 2 | 1 | $joined_patterns .= '|'.quote_regex($i); | |||
| 87 | } elsif ($ratio < 1) { | |||||
| 88 | 1 | 1 | my $percentage = int($ratio * 100); | |||
| 89 | 1 | 1 | $entry = "(?:|\$^ $percentage\% - excluded $hits/$total)$entry"; | |||
| 90 | 1 | 1 | if ($ratio >= $use_threshold) { | |||
| 91 | 1 | 1 | push @results, $value; | |||
| 92 | 1 | 1 | $joined_patterns .= '|'.quote_regex($i); | |||
| 93 | } else { | |||||
| 94 | 0 | 0 | $entry = "#$entry"; | |||
| 95 | } | |||||
| 96 | } | |||||
| 97 | 3 | 4 | push @patterns, $entry; | |||
| 98 | } | |||||
| 99 | 3 | 5 | return @results; | |||
| 100 | } | |||||
| 101 | ||||||
| 102 | sub set_up_counters { | |||||
| 103 | 2 | 1 | my ($ref) = @_; | |||
| 104 | 2 | 2 | for my $key (qw(extensions files directories rooted_paths tailed_paths)) { | |||
| 105 | 10 | 7 | $ref->{$key} = (); | |||
| 106 | } | |||||
| 107 | } | |||||
| 108 | ||||||
| 109 | sub score_patterns { | |||||
| 110 | # Each pattern is recorded as ($data): | |||||
| 111 | # hit count (number) | |||||
| 112 | # pattern (string for regex) | |||||
| 113 | # files covered (array of string) | |||||
| 114 | # | |||||
| 115 | # %scores is a map from a hit count to an array of $data | |||||
| 116 | # | |||||
| 117 | 1 | 2 | my @excluded = @_; | |||
| 118 | 1 | 1 | our @patterns; | |||
| 119 | 1 | 0 | my %scores; | |||
| 120 | 1 | 1 | for my $pattern (@patterns) { | |||
| 121 | 5 | 4 | my @hits; | |||
| 122 | 5 1 | 4 1 | $pattern =~ s/\\Q(.*?)\\E/quote_regex($1)/eg; | |||
| 123 | 5 | 4 | for my $path (@excluded) { | |||
| 124 | 75 | 86 | if ($path =~ /$pattern/) { | |||
| 125 | 12 | 11 | push @hits, $path; | |||
| 126 | } | |||||
| 127 | } | |||||
| 128 | 5 | 4 | my $hit_count = scalar @hits; | |||
| 129 | # naive data structure | |||||
| 130 | 5 | 4 | my @data = ($hit_count, $pattern, \@hits); | |||
| 131 | 5 2 | 5 2 | my @entries = defined $scores{$hit_count} ? @{$scores{$hit_count}} : (); | |||
| 132 | 5 | 3 | push @entries, \@data; | |||
| 133 | 5 | 4 | $scores{$hit_count} = \@entries; | |||
| 134 | } | |||||
| 135 | 1 | 4 | if (defined $scores{0}) { | |||
| 136 | 1 2 1 | 1 2 1 | our @not_hit_patterns = map { $_[0] } (@{$scores{0}}) | |||
| 137 | } | |||||
| 138 | 1 | 1 | my %generally_covered_paths; | |||
| 139 | my @selected_patterns; | |||||
| 140 | 1 | 1 | while (%scores) { | |||
| 141 | 5 4 | 5 4 | my @ordered_scores = (sort { $b <=> $a } keys %scores); | |||
| 142 | 5 | 4 | my $top_score = shift @ordered_scores; | |||
| 143 | 5 2 5 | 1 2 4 | my @top_scoring = (sort { length($a->[1]) <=> length($b->[1])} @{$scores{$top_score}}); | |||
| 144 | ||||||
| 145 | 5 | 4 | my $selected_pattern = pop @top_scoring; | |||
| 146 | 5 | 5 | if (@top_scoring) { | |||
| 147 | 2 | 2 | $scores{$top_score} = \@top_scoring; | |||
| 148 | } else { | |||||
| 149 | 3 | 1 | delete $scores{$top_score}; | |||
| 150 | } | |||||
| 151 | 5 | 9 | my $current_hit_count = $top_score; | |||
| 152 | 5 | 2 | my @remaining_paths; | |||
| 153 | 5 | 3 | if (%generally_covered_paths) { | |||
| 154 | 4 4 | 1 3 | for my $path (@{$selected_pattern->[2]}) { | |||
| 155 | 4 | 4 | if (defined $generally_covered_paths{$path}) { | |||
| 156 | 0 | 0 | --$current_hit_count; | |||
| 157 | } else { | |||||
| 158 | 4 | 3 | push @remaining_paths, $path; | |||
| 159 | } | |||||
| 160 | } | |||||
| 161 | 4 | 4 | $selected_pattern->[0] = $current_hit_count; | |||
| 162 | 4 | 2 | $selected_pattern->[2] = \@remaining_paths; | |||
| 163 | } else { | |||||
| 164 | 1 1 | 0 2 | @remaining_paths = @{$selected_pattern->[2]}; | |||
| 165 | } | |||||
| 166 | 5 | 6 | next unless $current_hit_count; | |||
| 167 | 3 | 3 | if ($current_hit_count == $top_score || | |||
| 168 | (!@top_scoring && | |||||
| 169 | (!@ordered_scores || | |||||
| 170 | ($current_hit_count > @{$scores{$ordered_scores[0]}[0]})[0]))) { | |||||
| 171 | 3 | 1 | push @selected_patterns, $selected_pattern->[1]; | |||
| 172 | 3 | 3 | for my $path (@remaining_paths) { | |||
| 173 | 12 | 11 | $generally_covered_paths{$path} = 1; | |||
| 174 | } | |||||
| 175 | } else { | |||||
| 176 | # we're not the best, so we'll move our object to where it should be now and revisit it later | |||||
| 177 | 0 | 0 | unless (defined $scores{$current_hit_count}) { | |||
| 178 | 0 | 0 | $scores{$current_hit_count} = []; | |||
| 179 | } | |||||
| 180 | 0 0 | 0 0 | push @{$scores{$current_hit_count}}, $selected_pattern; | |||
| 181 | } | |||||
| 182 | } | |||||
| 183 | ||||||
| 184 | 1 | 3 | return @selected_patterns; | |||
| 185 | } | |||||
| 186 | ||||||
| 187 | sub main { | |||||
| 188 | 1 | 2 | my ($file_list, $should_exclude_file, $current_exclude_patterns) = @_; | |||
| 189 | 1 | 7 | open FILES, '<', $file_list; | |||
| 190 | 1 | 5 | our @patterns = (); | |||
| 191 | 1 | 1 | our $baseline = {}; | |||
| 192 | 1 | 1 | set_up_counters($baseline); | |||
| 193 | 1 | 1 | my (%paths, @excluded); | |||
| 194 | { | |||||
| 195 | 1 1 | 0 2 | local $/ = "\0"; | |||
| 196 | 1 | 7 | while (<FILES>) { | |||
| 197 | 31 | 18 | chomp; | |||
| 198 | 31 | 27 | $paths{$_} = 1; | |||
| 199 | 31 | 14 | process_path $_; | |||
| 200 | } | |||||
| 201 | 1 | 3 | close FILES; | |||
| 202 | } | |||||
| 203 | 1 | 1 | our $totals = $baseline; | |||
| 204 | 1 | 1 | $baseline = {}; | |||
| 205 | 1 | 1 | set_up_counters($baseline); | |||
| 206 | ||||||
| 207 | 1 | 7 | open EXCLUDES, '<', $should_exclude_file; | |||
| 208 | 1 | 6 | while (<EXCLUDES>) { | |||
| 209 | 15 | 7 | chomp; | |||
| 210 | 15 | 14 | push @excluded, $_; | |||
| 211 | 15 | 8 | process_path $_; | |||
| 212 | } | |||||
| 213 | 1 | 3 | close EXCLUDES; | |||
| 214 | ||||||
| 215 | 1 | 1 | my @current_patterns; | |||
| 216 | 1 | 8 | open CURRENT_EXCLUDES, '<', $current_exclude_patterns; | |||
| 217 | 1 | 7 | while (<CURRENT_EXCLUDES>) { | |||
| 218 | 2 | 2 | chomp; | |||
| 219 | 2 | 3 | next unless /./; | |||
| 220 | 2 | 2 | next if /^#/; | |||
| 221 | 2 | 4 | push @current_patterns, $_; | |||
| 222 | } | |||||
| 223 | 1 | 2 | close CURRENT_EXCLUDES; | |||
| 224 | ||||||
| 225 | 1 | 2 | build_patterns("[^/]\n\$", 'extensions', .87, .81); | |||
| 226 | 1 | 1 | build_patterns("(?:^|/)\n\$", 'files', .87, .81); | |||
| 227 | 1 | 1 | build_patterns("^\n/", 'rooted_paths', .87, .81); | |||
| 228 | 1 | 1 | build_patterns("/\n/[^/]+\$", 'tailed_paths', .87, .81); | |||
| 229 | ||||||
| 230 | 1 | 1 | push (@patterns, @current_patterns); | |||
| 231 | 1 | 1 | @patterns = score_patterns(@excluded); | |||
| 232 | 1 | 1 | my @drop_patterns; | |||
| 233 | 1 | 0 | if (@current_patterns) { | |||
| 234 | 1 3 | 1 3 | my %positive_patterns = map { $_ => 1 } @patterns; | |||
| 235 | 1 | 1 | our @not_hit_patterns; | |||
| 236 | 1 2 | 1 2 | my %zero_patterns = map { $_ => 1 } @not_hit_patterns; | |||
| 237 | 1 | 1 | for my $pattern (@current_patterns) { | |||
| 238 | 2 | 6 | push @drop_patterns, $pattern unless defined $positive_patterns{$pattern} || defined $zero_patterns{$pattern}; | |||
| 239 | } | |||||
| 240 | } | |||||
| 241 | ||||||
| 242 | 1 | 2 | my $test = '(?:'.join('|', @patterns).')' if @patterns; | |||
| 243 | 1 | 1 | for my $file (@excluded) { | |||
| 244 | 15 | 42 | next if $test && $file =~ /$test/; | |||
| 245 | 3 | 4 | push @patterns, '^'.maybe_quote_regex($file).'$'; | |||
| 246 | } | |||||
| 247 | ||||||
| 248 | 1 | 2 | @patterns = sort CheckSpelling::Util::case_biased @patterns; | |||
| 249 | 1 | 6 | return (\@patterns, \@drop_patterns); | |||
| 250 | } | |||||
| 251 | ||||||
| 252 | 1; | |||||