File Coverage

File:lib/CheckSpelling/SuggestExcludes.pm
Coverage:89.8%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::SuggestExcludes;
4
5
1
1
54280
2
use 5.022;
6
1
1
1
2
0
61
use feature 'unicode_strings';
7
1
1
1
1
1
1128
use CheckSpelling::Util;
8
9my (%extensions, %files, %directories, %rooted_paths, %tailed_paths);
10my @patterns;
11my @not_hit_patterns;
12
13sub get_extension {
14
56
32
  my ($path) = @_;
15
56
47
  $path =~ s!.*/!!;
16
56
52
  return '' unless $path =~ s/.+\././;
17
8
12
  return $path;
18}
19
20sub path_to_pattern {
21
1
1
  my ($path) = @_;
22
1
1
  $path =~ s/^/^\\Q/;
23
1
1
  $path =~ s/$/\\E\$/;
24
1
2
  return $path;
25}
26
27sub process_path {
28
56
33
  my ($path) = @_;
29
56
19
  our $baseline;
30
56
28
  my $extension = get_extension($_);
31
56
44
  ++$baseline->{'extensions'}{$extension} if $extension ne '';
32
56
42
  my @directories = split m{/}, $_;
33
56
35
  my $file = pop @directories;
34
56
52
  ++$baseline->{'files'}{$file};
35
56
41
  return unless @directories;
36
51
23
  my @path_elements;
37
51
27
  for my $directory (@directories) {
38
62
28
    ++$baseline->{'directories'}{$directory};
39
62
41
    push @path_elements, $directory;
40
62
61
    ++$baseline->{'rooted_paths'}{join '/', @path_elements};
41  }
42
51
19
  shift @path_elements;
43
51
85
  while (@path_elements) {
44
11
10
    ++$baseline->{'tailed_paths'}{join '/', @path_elements};
45
11
23
    shift @path_elements;
46  }
47}
48
49sub quote_regex {
50
14
8
  my ($pattern) = @_;
51
14
15
  return quotemeta($pattern);
52}
53
54sub maybe_quote_regex {
55
8
2
  my ($i) = @_;
56
8
6
  my $quoted = quote_regex($i);
57
8
11
  $quoted =~ s<\\([-/@])><$1>g;
58
8
10
  return $i if $i eq $quoted;
59
3
2
  my $slashed = $i;
60
3
4
  if (($slashed =~ s<\.><\\.>) == 1){
61
3
5
    return $slashed if $slashed eq $quoted;
62  }
63
1
4
  return "\\Q$i\\E";
64}
65
66sub build_patterns {
67
8
5
  my ($template, $key, $use_threshold, $suggest_threshold) = @_;
68
8
4
  our ($baseline, $totals);
69
8
21
  return unless defined $baseline->{$key} && defined $totals->{$key};
70
6
6
3
8
  my %hash_baseline = %{$baseline->{$key}};
71
6
6
5
6
  my %hash_totals = %{$totals->{$key}};
72
6
6
  our @patterns;
73
6
1
  my @results;
74
6
4
  my $joined_patterns = '\$^';
75
6
2
  for my $i (keys %hash_baseline) {
76
28
89
    next if $i =~ /$joined_patterns/;
77
26
17
    my ($hits, $total) = ($hash_baseline{$i}, $hash_totals{$i});
78
26
27
    next if $hits == 1 || $total == 0;
79
8
3
    my $ratio = $hits / $total;
80
8
11
    next if $ratio < $suggest_threshold;
81
5
3
    my $entry = $template;
82
5
5
    my $value = maybe_quote_regex($i);
83
5
8
    $entry =~ s/\n/$value/;
84
5
6
    if ($ratio == 1) {
85
4
3
      push @results, $value;
86
4
2
      $joined_patterns .= '|'.quote_regex($i);
87    } elsif ($ratio < 1) {
88
1
7
      my $percentage = int($ratio * 100);
89
1
2
      $entry = "(?:|\$^ $percentage\% - excluded $hits/$total)$entry";
90
1
1
      if ($ratio >= $use_threshold) {
91
1
1
        push @results, $value;
92
1
0
        $joined_patterns .= '|'.quote_regex($i);
93      } else {
94
0
0
        $entry = "#$entry";
95      }
96    }
97
5
6
    push @patterns, $entry;
98  }
99
6
9
  return @results;
100}
101
102sub set_up_counters {
103
4
4
  my ($ref) = @_;
104
4
3
  for my $key (qw(extensions files directories rooted_paths tailed_paths)) {
105
20
13
    $ref->{$key} = ();
106  }
107}
108
109sub replace_qe {
110
11
5
  my ($pattern) = @_;
111
11
1
9
1
  $pattern =~ s/\\Q(.*?)\\E/quote_regex($1)/eg;
112
11
11
  return $pattern;
113}
114
115sub score_patterns {
116  # Each pattern is recorded as ($data):
117  #   hit count (number)
118  #   pattern (string for regex)
119  #   files covered (array of string)
120  #
121  # %scores is a map from a hit count to an array of $data
122  #
123
2
3
  my @excluded = @_;
124
2
1
  our @patterns;
125
2
1
  my %scores;
126
2
1
  for my $raw_pattern (@patterns) {
127
7
4
    my @hits;
128
7
4
    my $pattern = replace_qe($raw_pattern);
129
7
5
    for my $path (@excluded) {
130
81
93
      if ($path =~ /$pattern/) {
131
17
12
        push @hits, $path;
132      }
133    }
134
7
6
    my $hit_count = scalar @hits;
135    # naive data structure
136
7
4
    my @data = ($hit_count, $raw_pattern, \@hits);
137
7
2
7
1
    my @entries = defined $scores{$hit_count} ? @{$scores{$hit_count}} : ();
138
7
7
    push @entries, \@data;
139
7
6
    $scores{$hit_count} = \@entries;
140  }
141
2
2
  if (defined $scores{0}) {
142
1
2
1
0
5
1
    our @not_hit_patterns = map { $_[0] } (@{$scores{0}})
143  }
144
2
4
  my %generally_covered_paths;
145  my @selected_patterns;
146
2
2
  while (%scores) {
147
7
6
8
7
    my @ordered_scores = (sort { $b <=> $a } keys %scores);
148
7
2
    my $top_score = shift @ordered_scores;
149
7
2
7
5
3
4
    my @top_scoring = (sort { length($a->[1]) <=> length($b->[1])} @{$scores{$top_score}});
150
151
7
9
    my $selected_pattern = pop @top_scoring;
152
7
6
    if (@top_scoring) {
153
2
2
      $scores{$top_score} = \@top_scoring;
154    } else {
155
5
3
      delete $scores{$top_score};
156    }
157
7
3
    my $current_hit_count = $top_score;
158
7
3
    my @remaining_paths;
159
7
2
    if (%generally_covered_paths) {
160
5
5
3
3
      for my $path (@{$selected_pattern->[2]}) {
161
6
5
        if (defined $generally_covered_paths{$path}) {
162
2
2
          --$current_hit_count;
163        } else {
164
4
3
          push @remaining_paths, $path;
165        }
166      }
167
5
3
      $selected_pattern->[0] = $current_hit_count;
168
5
3
      $selected_pattern->[2] = \@remaining_paths;
169    } else {
170
2
2
1
3
      @remaining_paths = @{$selected_pattern->[2]};
171    }
172
7
7
    next unless $current_hit_count;
173
4
5
    if ($current_hit_count == $top_score ||
174        (!@top_scoring &&
175         (!@ordered_scores ||
176          ($current_hit_count > @{$scores{$ordered_scores[0]}[0]})[0]))) {
177
4
3
      push @selected_patterns, $selected_pattern->[1];
178
4
1
      for my $path (@remaining_paths) {
179
15
15
        $generally_covered_paths{$path} = 1;
180      }
181    } else {
182      # we're not the best, so we'll move our object to where it should be now and revisit it later
183
0
0
      unless (defined $scores{$current_hit_count}) {
184
0
0
        $scores{$current_hit_count} = [];
185      }
186
0
0
0
0
      push @{$scores{$current_hit_count}}, $selected_pattern;
187    }
188  }
189
190
2
5
  return @selected_patterns;
191}
192
193sub main {
194
2
8
  my ($file_list, $should_exclude_file, $current_exclude_patterns) = @_;
195
2
13
  open FILES, '<', $file_list;
196
2
6
  our @patterns = ();
197
2
4
  our $baseline = {};
198
2
2
  set_up_counters($baseline);
199
2
2
  my (%paths, @excluded);
200  {
201
2
2
0
2
    local $/ = "\0";
202
2
14
    while (<FILES>) {
203
38
35
      chomp;
204
38
29
      $paths{$_} = 1;
205
38
22
      process_path $_;
206    }
207
2
6
    close FILES;
208  }
209
2
5
  our $totals = $baseline;
210
2
0
  $baseline = {};
211
2
2
  set_up_counters($baseline);
212
213
2
13
  open EXCLUDES, '<', $should_exclude_file;
214
2
12
  while (<EXCLUDES>) {
215
18
12
    chomp;
216
18
9
    push @excluded, $_;
217
18
10
    process_path $_;
218  }
219
2
7
  close EXCLUDES;
220
221
2
1
  my @current_patterns;
222
2
11
  open CURRENT_EXCLUDES, '<', $current_exclude_patterns;
223
2
8
  while (<CURRENT_EXCLUDES>) {
224
2
1
    chomp;
225
2
4
    next unless /./;
226
2
1
    next if /^#/;
227
2
5
    push @current_patterns, $_;
228  }
229
2
4
  close CURRENT_EXCLUDES;
230
231
2
2
  build_patterns("[^/]\n\$", 'extensions', .87, .81);
232
2
1
  build_patterns("(?:^|/)\n\$", 'files', .87, .81);
233
2
2
  build_patterns("^\n/", 'rooted_paths', .87, .81);
234
2
1
  build_patterns("/\n/[^/]+\$", 'tailed_paths', .87, .81);
235
236
2
3
  push (@patterns, @current_patterns);
237
2
3
  @patterns = score_patterns(@excluded);
238
2
0
  my @drop_patterns;
239
2
4
  if (@current_patterns) {
240
1
3
1
3
    my %positive_patterns = map { $_ => 1 } @patterns;
241
1
1
    our @not_hit_patterns;
242
1
2
0
3
    my %zero_patterns = map { $_ => 1 } @not_hit_patterns;
243
1
1
    for my $pattern (@current_patterns) {
244
2
10
      push @drop_patterns, $pattern unless defined $positive_patterns{$pattern} || defined $zero_patterns{$pattern};
245    }
246  }
247
248
2
4
2
3
  my $test = '(?:'.join('|', map { replace_qe($_) } @patterns).')' if @patterns;
249
2
1
  for my $file (@excluded) {
250
18
64
    next if $test && $file =~ /$test/;
251
3
3
    push @patterns, '^'.maybe_quote_regex($file).'$';
252  }
253
254
2
5
  @patterns = sort CheckSpelling::Util::case_biased @patterns;
255
2
9
  return (\@patterns, \@drop_patterns);
256}
257
2581;