File Coverage

File:lib/CheckSpelling/SuggestExcludes.pm
Coverage:87.8%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::SuggestExcludes;
4
5
1
1
51945
3
use 5.022;
6
1
1
1
1
1
39
use feature 'unicode_strings';
7
1
1
1
1
1
1092
use CheckSpelling::Util;
8
9my (%extensions, %files, %directories, %rooted_paths, %tailed_paths);
10my @patterns;
11my @not_hit_patterns;
12
13sub get_extension {
14
46
58
  my ($path) = @_;
15
46
37
  $path =~ s!.*/!!;
16
46
48
  return '' unless $path =~ s/.+\././;
17
8
9
  return $path;
18}
19
20sub path_to_pattern {
21
1
1
  my ($path) = @_;
22
1
2
  $path =~ s/^/^\\Q/;
23
1
2
  $path =~ s/$/\\E\$/;
24
1
1
  return $path;
25}
26
27sub process_path {
28
46
27
  my ($path) = @_;
29
46
18
  our $baseline;
30
46
19
  my $extension = get_extension($_);
31
46
35
  ++$baseline->{'extensions'}{$extension} if $extension ne '';
32
46
40
  my @directories = split m{/}, $_;
33
46
27
  my $file = pop @directories;
34
46
33
  ++$baseline->{'files'}{$file};
35
46
38
  return unless @directories;
36
41
14
  my @path_elements;
37
41
24
  for my $directory (@directories) {
38
44
23
    ++$baseline->{'directories'}{$directory};
39
44
29
    push @path_elements, $directory;
40
44
38
    ++$baseline->{'rooted_paths'}{join '/', @path_elements};
41  }
42
41
21
  shift @path_elements;
43
41
72
  while (@path_elements) {
44
3
2
    ++$baseline->{'tailed_paths'}{join '/', @path_elements};
45
3
6
    shift @path_elements;
46  }
47}
48
49sub quote_regex {
50
10
9
  my ($pattern) = @_;
51
10
11
  return quotemeta($pattern);
52}
53
54sub maybe_quote_regex {
55
6
6
  my ($i) = @_;
56
6
3
  my $quoted = quote_regex($i);
57
6
7
  $quoted =~ s<\\([-/])><$1>g;
58
6
7
  return $i if $i eq $quoted;
59
3
2
  my $slashed = $i;
60
3
4
  if (($slashed =~ s<\.><\\.>) == 1){
61
3
6
    return $slashed if $slashed eq $quoted;
62  }
63
1
3
  return "\\Q$i\\E";
64}
65
66sub build_patterns {
67
4
3
  my ($template, $key, $use_threshold, $suggest_threshold) = @_;
68
4
3
  our ($baseline, $totals);
69
4
11
  return unless defined $baseline->{$key} && defined $totals->{$key};
70
3
3
1
5
  my %hash_baseline = %{$baseline->{$key}};
71
3
3
3
5
  my %hash_totals = %{$totals->{$key}};
72
3
2
  our @patterns;
73
3
4
  my @results;
74
3
1
  my $joined_patterns = '\$^';
75
3
3
  for my $i (keys %hash_baseline) {
76
19
40
    next if $i =~ /$joined_patterns/;
77
19
14
    my ($hits, $total) = ($hash_baseline{$i}, $hash_totals{$i});
78
19
19
    next if $hits == 1 || $total == 0;
79
5
3
    my $ratio = $hits / $total;
80
5
4
    next if $ratio < $suggest_threshold;
81
3
2
    my $entry = $template;
82
3
1
    my $value = maybe_quote_regex($i);
83
3
3
    $entry =~ s/\n/$value/;
84
3
4
    if ($ratio == 1) {
85
2
1
      push @results, $value;
86
2
1
      $joined_patterns .= '|'.quote_regex($i);
87    } elsif ($ratio < 1) {
88
1
8
      my $percentage = int($ratio * 100);
89
1
1
      $entry = "(?:|\$^ $percentage\% - excluded $hits/$total)$entry";
90
1
1
      if ($ratio >= $use_threshold) {
91
1
1
        push @results, $value;
92
1
0
        $joined_patterns .= '|'.quote_regex($i);
93      } else {
94
0
0
        $entry = "#$entry";
95      }
96    }
97
3
3
    push @patterns, $entry;
98  }
99
3
5
  return @results;
100}
101
102sub set_up_counters {
103
2
1
  my ($ref) = @_;
104
2
2
  for my $key (qw(extensions files directories rooted_paths tailed_paths)) {
105
10
8
    $ref->{$key} = ();
106  }
107}
108
109sub score_patterns {
110  # Each pattern is recorded as ($data):
111  #   hit count (number)
112  #   pattern (string for regex)
113  #   files covered (array of string)
114  #
115  # %scores is a map from a hit count to an array of $data
116  #
117
1
2
  my @excluded = @_;
118
1
1
  our @patterns;
119
1
0
  my %scores;
120
1
1
  for my $pattern (@patterns) {
121
5
2
    my @hits;
122
5
1
5
5
    $pattern =~ s/\\Q(.*?)\\E/quote_regex($1)/eg;
123
5
3
    for my $path (@excluded) {
124
75
80
      if ($path =~ /$pattern/) {
125
12
12
        push @hits, $path;
126      }
127    }
128
5
4
    my $hit_count = scalar @hits;
129    # naive data structure
130
5
4
    my @data = ($hit_count, $pattern, \@hits);
131
5
2
6
1
    my @entries = defined $scores{$hit_count} ? @{$scores{$hit_count}} : ();
132
5
2
    push @entries, \@data;
133
5
6
    $scores{$hit_count} = \@entries;
134  }
135
1
1
  if (defined $scores{0}) {
136
1
2
1
0
3
1
    our @not_hit_patterns = map { $_[0] } (@{$scores{0}})
137  }
138
1
1
  my %generally_covered_paths;
139  my @selected_patterns;
140
1
1
  while (%scores) {
141
5
5
3
5
    my @ordered_scores = (sort { $b <=> $a } keys %scores);
142
5
4
    my $top_score = shift @ordered_scores;
143
5
2
5
2
3
4
    my @top_scoring = (sort { length($a->[1]) <=> length($b->[1])} @{$scores{$top_score}});
144
145
5
2
    my $selected_pattern = pop @top_scoring;
146
5
4
    if (@top_scoring) {
147
2
1
      $scores{$top_score} = \@top_scoring;
148    } else {
149
3
2
      delete $scores{$top_score};
150    }
151
5
3
    my $current_hit_count = $top_score;
152
5
2
    my @remaining_paths;
153
5
3
    if (%generally_covered_paths) {
154
4
4
0
5
      for my $path (@{$selected_pattern->[2]}) {
155
4
4
        if (defined $generally_covered_paths{$path}) {
156
0
0
          --$current_hit_count;
157        } else {
158
4
3
          push @remaining_paths, $path;
159        }
160      }
161
4
3
      $selected_pattern->[0] = $current_hit_count;
162
4
2
      $selected_pattern->[2] = \@remaining_paths;
163    } else {
164
1
1
1
9
      @remaining_paths = @{$selected_pattern->[2]};
165    }
166
5
5
    next unless $current_hit_count;
167
3
4
    if ($current_hit_count == $top_score ||
168        (!@top_scoring &&
169         (!@ordered_scores ||
170          ($current_hit_count > @{$scores{$ordered_scores[0]}[0]})[0]))) {
171
3
2
      push @selected_patterns, $selected_pattern->[1];
172
3
2
      for my $path (@remaining_paths) {
173
12
12
        $generally_covered_paths{$path} = 1;
174      }
175    } else {
176      # we're not the best, so we'll move our object to where it should be now and revisit it later
177
0
0
      unless (defined $scores{$current_hit_count}) {
178
0
0
        $scores{$current_hit_count} = [];
179      }
180
0
0
0
0
      push @{$scores{$current_hit_count}}, $selected_pattern;
181    }
182  }
183
184
1
3
  return @selected_patterns;
185}
186
187sub main {
188
1
3
  my ($file_list, $should_exclude_file, $current_exclude_patterns) = @_;
189
1
7
  open FILES, '<', $file_list;
190
1
1
  our @patterns = ();
191
1
2
  our $baseline = {};
192
1
1
  set_up_counters($baseline);
193
1
1
  my (%paths, @excluded);
194  {
195
1
1
3
1
    local $/ = "\0";
196
1
5
    while (<FILES>) {
197
31
21
      chomp;
198
31
21
      $paths{$_} = 1;
199
31
18
      process_path $_;
200    }
201
1
4
    close FILES;
202  }
203
1
1
  our $totals = $baseline;
204
1
0
  $baseline = {};
205
1
2
  set_up_counters($baseline);
206
207
1
8
  open EXCLUDES, '<', $should_exclude_file;
208
1
15
  while (<EXCLUDES>) {
209
15
9
    chomp;
210
15
10
    push @excluded, $_;
211
15
10
    process_path $_;
212  }
213
1
5
  close EXCLUDES;
214
215
1
0
  my @current_patterns;
216
1
9
  open CURRENT_EXCLUDES, '<', $current_exclude_patterns;
217
1
7
  while (<CURRENT_EXCLUDES>) {
218
2
2
    chomp;
219
2
3
    next unless /./;
220
2
2
    next if /^#/;
221
2
3
    push @current_patterns, $_;
222  }
223
1
2
  close CURRENT_EXCLUDES;
224
225
1
1
  build_patterns("[^/]\n\$", 'extensions', .87, .81);
226
1
1
  build_patterns("(?:^|/)\n\$", 'files', .87, .81);
227
1
1
  build_patterns("^\n/", 'rooted_paths', .87, .81);
228
1
1
  build_patterns("/\n/[^/]+\$", 'tailed_paths', .87, .81);
229
230
1
1
  push (@patterns, @current_patterns);
231
1
2
  @patterns = score_patterns(@excluded);
232
1
1
  my @drop_patterns;
233
1
1
  if (@current_patterns) {
234
1
3
1
3
    my %positive_patterns = map { $_ => 1 } @patterns;
235
1
1
    our @not_hit_patterns;
236
1
2
1
2
    my %zero_patterns = map { $_ => 1 } @not_hit_patterns;
237
1
1
    for my $pattern (@current_patterns) {
238
2
7
      push @drop_patterns, $pattern unless defined $positive_patterns{$pattern} || defined $zero_patterns{$pattern};
239    }
240  }
241
242
1
4
  my $test = '(?:'.join('|', @patterns).')' if @patterns;
243
1
0
  for my $file (@excluded) {
244
15
47
    next if $test && $file =~ /$test/;
245
3
2
    push @patterns, '^'.maybe_quote_regex($file).'$';
246  }
247
248
1
2
  @patterns = sort CheckSpelling::Util::case_biased @patterns;
249
1
6
  return (\@patterns, \@drop_patterns);
250}
251
2521;