File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:83.3%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
104948
1417
61
use Digest::SHA qw($errmsg);
9
1
1
1
2
0
23
use JSON::PP;
10
1
1
1
169
3625
42
use Hash::Merge qw( merge );
11
1
1
1
132
0
1142
use CheckSpelling::Util;
12
# Escape characters that must not appear raw in the generated SARIF JSON.
#
# Every ASCII control character other than newline (\x0a), as well as `#`
# and `%`, is replaced by a JSON-style `\uXXXX` escape (four lowercase hex
# digits).
#
# Parameters:
#   $text - the string to encode
# Returns the encoded copy; the argument and the caller's `$_` are left
# untouched.
sub encode_low_ascii {
    my ($text) = @_;
    # \x{0b}-\x{1f} is a range: JSON forbids every raw control character in
    # a string, not only vertical tab (0x0b) and unit separator (0x1f).
    $text =~ s/([\x{0}-\x{9}\x{0b}-\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
    return $text;
}
18
# Percent-encode a path for use as a SARIF artifact URI.
#
# Characters outside the set `-!$&'()*+,/:;=?@A-Za-z0-9_.~` are replaced by
# `%xx`, where xx is the character's ordinal in lowercase hex.
#
# Parameters:
#   $text - the string to encode
# Returns the encoded copy; the argument and the caller's `$_` are left
# untouched.
sub url_encode {
    my ($text) = @_;
    $text =~ s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
    return $text;
}
24
# Prefix each `"`, `(`, `)`, `]` and `\` with two backslashes.
#
# Parameters:
#   $text - the string to escape
# Returns the escaped copy; the argument and the caller's `$_` are left
# untouched.
sub double_slash_escape {
    my ($text) = @_;
    $text =~ s/(["()\]\\])/\\\\$1/g;
    return $text;
}
30
# Convert a file of warnings into a list of SARIF result objects.
#
# Parameters:
#   $warnings - path of the warnings file. Only lines of the form
#       FILE:LINE:COLUMN ... ENDCOLUMN, Severity - message (code)
#     are used, where Severity is Error, Warning or Notice; lines starting
#     with `https://` and lines that do not match are skipped.
#
# Returns a reference to an array of result hashes (decoded from JSON). If
# the warnings file cannot be opened, a message is printed to STDERR and a
# reference to an empty array is returned.
#
# When the package variable $flatten is true, one result is produced per
# (code, message) pair carrying all of its locations; otherwise one result
# is produced per location.
#
# Each result gets a `cs0` partial fingerprint when the reported file exists
# and the reported line could be hashed. The fingerprint is derived from the
# (whitespace-collapsed) last 100 characters of text preceding the line, the
# message, and the position of the column among the columns reported for
# that line and message.
sub parse_warnings {
    my ($warnings) = @_;
    our $flatten;
    my @results;
    my $warnings_fh;
    unless (open $warnings_fh, '<', $warnings) {
        print STDERR "Could not open $warnings\n";
        return [];
    }
    # $rules->{code}{message} = [ location, ... ]
    my $rules = {};
    # url-encoded file name => file name as reported
    my %encoded_files = ();
    # file => line => hashed message => column => 1
    my %hashes_needed_for_files = ();
    while (my $warning = <$warnings_fh>) {
        next if $warning =~ m{^https://};
        next unless $warning =~ m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
        # single-slash-escape `"` and `\`
        $message =~ s/(["\\])/\\$1/g;
        $message = encode_low_ascii($message);
        # double-slash-escape `"`, `(`, `)`, `]`
        $message = double_slash_escape($message);
        # encode `message` and `file` to protect against low ascii
        my $encoded_file = url_encode($file);
        $encoded_files{$encoded_file} = $file;
        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's sarif view
        if ($message =~ /(`{2,})/) {
            my $backticks = $1;
            # look for the longest backtick run that is used as a delimiter
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
                $backticks = $1 if length($1) > length($backticks);
            }
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
        } else {
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
        }
        # replace '`' with `'` because GitHub's SARIF parser doesn't like them
        $message =~ s/\`/'/g;

        my $hashed_message = Digest::SHA::sha1_base64($message);
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
        push @{ $rules->{$code}{$message} }, {
            'uri' => $encoded_file,
            'startLine' => $line,
            'startColumn' => $column,
            'endColumn' => $endColumn,
        };
    }
    close $warnings_fh;

    # file => line => hash of the text preceding that line
    my %line_hashes = ();
    my %used_hashes = ();
    for my $file (sort keys %hashes_needed_for_files) {
        unless (-e $file) {
            delete $hashes_needed_for_files{$file};
            next;
        }
        # Numeric order is required: the file is scanned front to back, so a
        # lexical sort ("10" before "2") would make the scan skip lines.
        my @lines = sort { $a <=> $b } keys %{$hashes_needed_for_files{$file}};
        my $file_fh;
        # An unreadable file simply yields results without fingerprints.
        next unless open $file_fh, '<', $file;
        my $line = shift @lines;
        # NOTE(review): a request for line 1 is recorded under line 2, so
        # results on line 1 never receive a fingerprint - confirm intent.
        $line = 2 if $line == 1;
        my $buffer = '';
        while (my $text = <$file_fh>) {
            if ($line == $.) {
                my $sample = substr $buffer, -100, 100;
                my $hash = Digest::SHA::sha1_base64($sample);
                for (; $line == $.; $line = shift @lines) {
                    # disambiguate identical contexts seen earlier
                    my $hit = $used_hashes{$hash}++;
                    $hash = "$hash:$hit" if $hit;
                    $line_hashes{$file}{$line} = $hash;
                    last unless @lines;
                }
            }
            # keep only the last 100 characters, with whitespace collapsed
            $buffer .= $text;
            $buffer =~ s/\s+/ /g;
            $buffer = substr $buffer, -100, 100;
        }
        close $file_fh;
    }

    for my $code (sort keys %{$rules}) {
        my $rule = $rules->{$code};
        for my $message (sort keys %{$rule}) {
            my $hashed_message = Digest::SHA::sha1_base64($message);
            my $locations = $rule->{$message};
            my @locations_json = ();
            my @fingerprints = ();
            for my $location (@$locations) {
                my $encoded_file = $location->{uri};
                my $line = $location->{startLine};
                my $column = $location->{startColumn};
                my $endColumn = $location->{endColumn};
                my $partialFingerprint = '';
                my $file = $encoded_files{$encoded_file};
                if (defined $line_hashes{$file}) {
                    my $line_hash = $line_hashes{$file}{$line};
                    if (defined $line_hash) {
                        # index of this column among the columns reported
                        # for the same file, line and message
                        my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
                        my $hit = scalar @instances;
                        while (--$hit > 0) {
                            last if $instances[$hit] == $column;
                        }
                        $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
                    }
                }
                push @fingerprints, $partialFingerprint;
                my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
                push @locations_json, $json_fragment;
            }

            # Build one decoded result from a fingerprint (possibly '') and
            # the JSON text of its location(s).
            my $build_result = sub {
                my ($partialFingerprint, $locations_json_flat) = @_;
                my $partialFingerprints = '';
                if ($partialFingerprint ne '') {
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
                }
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
                return decode_json($result_json);
            };

            if ($flatten) {
                my $partialFingerprint = (sort @fingerprints)[0];
                push @results, $build_result->($partialFingerprint, join(',', @locations_json));
            } else {
                for my $i (0 .. $#locations_json) {
                    push @results, $build_result->($fingerprints[$i], $locations_json[$i]);
                }
            }
        }
    }
    return \@results;
}
176
# Index the rules of a decoded SARIF document by tool driver name.
#
# Parameters:
#   $sarif_json - hash reference of a decoded SARIF document
#
# Returns a hash (as a list) mapping each driver name to a hash reference of
# rule id => rule hash reference. Runs lacking `tool`, `tool.driver`, a
# driver `name` or driver `rules` are skipped, as are rules without an `id`.
# The result is empty when the document has no `runs`.
sub get_runs_from_sarif {
    my ($sarif_json) = @_;
    my %rules_by_driver;
    return %rules_by_driver unless $sarif_json->{'runs'};

    # iterate over copies so that the input arrays themselves are not touched
    my @runs = @{$sarif_json->{'runs'}};
    for my $run (@runs) {
        my $tool = $run->{'tool'};
        next unless defined $tool;

        my $driver = $tool->{'driver'};
        next unless defined $driver;

        my ($name, $rules) = @{$driver}{'name', 'rules'};
        next unless defined $name && defined $rules;

        my @rule_list = @{$rules};
        my %rule_by_id = map { ($_->{'id'} => $_) }
                         grep { defined $_->{'id'} } @rule_list;
        $rules_by_driver{$name} = \%rule_by_id;
    }
    return %rules_by_driver;
}
204
# Build the SARIF report as a JSON string.
#
# Parameters:
#   $sarif_template_file         - path of the base SARIF template (JSON)
#   $sarif_template_overlay_file - optional path of an overlay SARIF file
#                                  that is merged into the template
#   $category                    - value stored as `automationDetails.id`
#                                  on every run
#
# Environment:
#   CHECK_SPELLING_VERSION - stored as the first run's driver version
#   warning_output         - path of the warnings file given to
#                            parse_warnings
#
# Returns the encoded SARIF document, or '' (after a warning) when the
# template file does not exist. Dies when the template is empty.
sub main {
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
    unless (-f $sarif_template_file) {
        warn "Could not find sarif template";
        return '';
    }

    my $sarif_template = CheckSpelling::Util::read_file($sarif_template_file);
    die "sarif template is empty" unless $sarif_template;

    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
    my $sarif_json = $json->decode($sarif_template);

    if (defined $sarif_template_overlay_file) {
        my $merger = Hash::Merge->new();
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};

        # Arrays whose first elements are both plain scalars are replaced by
        # a copy of the second array; otherwise the stock merge is used.
        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
            return [@{$_[1]}];
        };

        if (-s $sarif_template_overlay_file) {
            my $sarif_template_overlay = CheckSpelling::Util::read_file($sarif_template_overlay_file);
            my %runs_base = get_runs_from_sarif($sarif_json);

            my $sarif_template_hash = $json->decode($sarif_template_overlay);
            my %runs_overlay = get_runs_from_sarif($sarif_template_hash);

            # Fold the overlay's rules into the per-driver rule index.
            for my $run_id (keys %runs_overlay) {
                if (defined $runs_base{$run_id}) {
                    my $run_base_hash = $runs_base{$run_id};
                    my $run_overlay_hash = $runs_overlay{$run_id};
                    for my $overlay_id (keys %$run_overlay_hash) {
                        $run_base_hash->{$overlay_id} = $merger->merge(
                            $run_overlay_hash->{$overlay_id},
                            $run_base_hash->{$overlay_id}
                        );
                    }
                } else {
                    $runs_base{$run_id} = $runs_overlay{$run_id};
                }
            }

            # Write the merged rules back into the template's rule arrays.
            my @sarif_json_runs = @{$sarif_json->{'runs'}};
            foreach my $sarif_json_run (@sarif_json_runs) {
                my %sarif_json_run_hash = %{$sarif_json_run};
                next unless defined $sarif_json_run_hash{'tool'};

                my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
                next unless defined $sarif_json_run_tool_hash{'driver'};

                my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
                my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
                next unless defined $driver_name &&
                    defined $sarif_json_run_tool_driver_hash{'rules'};

                my $driver_view_hash = $runs_base{$driver_name};
                next unless defined $driver_view_hash;

                # The driver hash above is a shallow copy, so this is still
                # the array reference held by the template itself.
                my $driver_rules = $sarif_json_run_tool_driver_hash{'rules'};
                for my $driver_rule_number (0 .. $#{$driver_rules}) {
                    my $driver_rule = $driver_rules->[$driver_rule_number];
                    my $driver_rule_id = $driver_rule->{'id'};
                    next unless defined $driver_rule_id &&
                        defined $driver_view_hash->{$driver_rule_id};
                    $driver_rules->[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
                }
            }
            # Runs were merged rule by rule above; merge everything else.
            delete $sarif_template_hash->{'runs'};
            $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
        }
    }

    {
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
        foreach my $sarif_json_run (@sarif_json_runs) {
            $sarif_json_run->{'automationDetails'} = { 'id' => $category };
        }
    }

    my %sarif = %{$sarif_json};

    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};

    my $results = parse_warnings($ENV{warning_output});
    if ($results) {
        $sarif{'runs'}[0]{'results'} = $results;

        # rule ids that actually occur in the results
        my %codes;
        for my $result_ref (@$results) {
            $codes{$result_ref->{'ruleId'}} = 1;
        }

        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
        my @rules = @{$rules_ref};
        my $missing_rule_definition_id = 'missing-rule-definition';
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;

        # Only keep the definitions of rules that were reported.
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
        my $code_index = 0;
        my %defined_codes = map { ($_->{'id'} => $code_index++) } @rules;
        my @missing_codes = grep { !defined $defined_codes{$_} } keys %codes;
        my $missing_rule_definition_index;
        if (@missing_codes) {
            push @rules, $missing_rule_definition_ref;
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
            # Report each undefined code once. The result is built as a hash
            # rather than via JSON text so the code needs no escaping.
            for my $missing_code (@missing_codes) {
                push @{$results}, {
                    'ruleId' => $missing_rule_definition_id,
                    'message' => { 'text' => "No rule definition for '$missing_code'" },
                    'locations' => [],
                };
            }
        }
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;

        # Results whose rule has no definition point at the
        # missing-rule-definition entry instead.
        for my $result (@{$results}) {
            my $ruleId = $result->{'ruleId'};
            next if defined $ruleId && defined $defined_codes{$ruleId};
            $result->{'ruleIndex'} = $missing_rule_definition_index;
        }
    }

    return encode_json(\%sarif);
}
328
3291;