File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:85.1%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
96497
1175
77
use Digest::SHA qw($errmsg);
9
1
1
1
3
0
24
use JSON::PP;
10
1
1
1
150
3236
27
use Hash::Merge qw( merge );
11
1
1
1
127
1
1176
use CheckSpelling::Util;
12
13sub encode_low_ascii {
14
8
236
    $_ = shift;
15
8
1
8
3
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
16
8
6
    return $_;
17}
18
19sub url_encode {
20
7
4
    $_ = shift;
21
7
0
4
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
22
7
3
    return $_;
23}
24
25sub double_slash_escape {
26
7
4
    $_ = shift;
27
7
16
    s/(["()\]\\])/\\\\$1/g;
28
7
4
    return $_;
29}
30
31sub fingerprintLocations {
32
4
4
    my ($locations, $encoded_files_ref, $line_hashes_ref, $hashes_needed_for_files_ref, $message, $hashed_message) = @_;
33
4
3
    my %encoded_files = %$encoded_files_ref;
34
4
3
    my %line_hashes = %$line_hashes_ref;
35
4
2
    my %hashes_needed_for_files = %$hashes_needed_for_files_ref;
36
4
3
    my @locations_json = ();
37
4
1
    my @fingerprints = ();
38
4
4
    for my $location (@$locations) {
39
7
3
        my $encoded_file = $location->{uri};
40
7
2
        my $line = $location->{startLine};
41
7
5
        my $column = $location->{startColumn};
42
7
2
        my $endColumn = $location->{endColumn};
43
7
4
        my $partialFingerprint = '';
44
7
3
        my $file = $encoded_files{$encoded_file};
45
7
5
        if (defined $line_hashes{$file}) {
46
7
3
            my $line_hash = $line_hashes{$file}{$line};
47
7
6
            if (defined $line_hash) {
48
4
4
1
4
                my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
49
4
3
                my $hit = scalar @instances;
50
4
3
                while (--$hit > 0) {
51
0
0
                    last if $instances[$hit] == $column;
52                }
53
4
8
                $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
54            }
55        }
56
7
2
        push @fingerprints, $partialFingerprint;
57
7
6
        my $startColumn = $column ? qq<, "startColumn": $column> : '';
58
7
5
        $endColumn = $endColumn ? qq<, "endColumn": $endColumn> : '';
59
7
4
        $line = 1 unless $line;
60
7
4
        my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line$startColumn$endColumn } } }>;
61
7
5
        push @locations_json, $json_fragment;
62    }
63
4
5
    return { locations_json => \@locations_json, fingerprints => \@fingerprints };
64}
65
66sub parse_warnings {
67
1
1
    my ($warnings) = @_;
68
1
1
    our $flatten;
69
1
1
    my @results;
70
1
18
    unless (open WARNINGS, '<', $warnings) {
71
0
0
        print STDERR "Could not open $warnings\n";
72
0
0
        return [];
73    }
74
1
1
    my $rules = ();
75
1
1
    my %encoded_files = ();
76
1
1
    my %hashes_needed_for_files = ();
77
1
14
    while (<WARNINGS>) {
78
9
7
        next if m{^https://};
79
8
24
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
80
7
12
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
81        # single-slash-escape `"` and `\`
82
7
6
        $message =~ s/(["\\])/\\$1/g;
83
7
3
        $message = encode_low_ascii $message;
84        # double-slash-escape `"`, `(`, `)`, `]`
85
7
3
        $message = double_slash_escape $message;
86        # encode `message` and `file` to protect against low ascii`
87
7
4
        my $encoded_file = url_encode $file;
88
7
3
        $encoded_files{$encoded_file} = $file;
89        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's SARIF view
90
7
6
        if ($message =~ /(`{2,})/) {
91
1
1
            my $backticks = $1;
92
1
25
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
93
0
0
                $backticks = $1 if length($1) > length($backticks);
94            }
95
1
15
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
96        } else {
97
6
11
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
98        }
99        # replace '`' with `\`+`'` because GitHub's SARIF parser doesn't like them
100
7
6
        $message =~ s/\`/'/g;
101
7
6
        unless (defined $rules->{$code}) {
102
2
2
            $rules->{$code} = {};
103        }
104
7
3
        my $rule = $rules->{$code};
105
7
5
        unless (defined $rule->{$message}) {
106
4
3
            $rule->{$message} = [];
107        }
108
7
17
        my $hashed_message = Digest::SHA::sha1_base64($message);
109
7
7
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
110
7
7
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
111
7
6
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
112
7
5
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
113
7
4
        my $locations = $rule->{$message};
114
7
9
        my $physicalLocation = {
115            'uri' => $encoded_file,
116            'startLine' => $line,
117            'startColumn' => $column,
118            'endColumn' => $endColumn,
119        };
120
7
11
        push @$locations, $physicalLocation;
121
7
10
        $rule->{$message} = $locations;
122    }
123
1
0
    my %line_hashes = ();
124
1
2
    my %used_hashes = ();
125
1
2
    for my $file (sort keys %hashes_needed_for_files) {
126
1
1
        $line_hashes{$file} = ();
127
1
5
        unless (-e $file) {
128
0
0
            delete $hashes_needed_for_files{$file};
129
0
0
            next;
130        }
131
1
1
1
3
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
132
1
9
        open $file_fh, '<', $file;
133
1
0
        my $line = shift @lines;
134
1
3
        $line = 2 if $line == 1;
135
1
0
        my $buffer = '';
136
1
5
        while (<$file_fh>) {
137
8
6
            if ($line == $.) {
138
4
2
                my $sample = substr $buffer, -100, 100;
139
4
4
                my $hash = Digest::SHA::sha1_base64($sample);
140
4
5
                for (; $line == $.; $line = shift @lines) {
141
5
3
                    my $hit = $used_hashes{$hash}++;
142
5
4
                    $hash = "$hash:$hit" if $hit;
143
5
3
                    $line_hashes{$file}{$line} = $hash;
144
5
5
                    last unless @lines;
145                }
146            }
147
8
7
            $buffer .= $_;
148
8
16
            $buffer =~ s/\s+/ /g;
149
8
9
            $buffer = substr $buffer, -100, 100;
150        }
151
1
3
        close $file_fh;
152    }
153
1
1
1
2
    for my $code (sort keys %{$rules}) {
154
2
1
        my $rule = $rules->{$code};
155
2
2
1
2
        for my $message (sort keys %{$rule}) {
156
4
8
            my $hashed_message = Digest::SHA::sha1_base64($message);
157
4
0
            my $locations = $rule->{$message};
158
4
5
            my $fingerprintResults = fingerprintLocations($locations, \%encoded_files, \%line_hashes, \%hashes_needed_for_files, $message, $hashed_message);
159
4
4
3
2
            my @locations_json = @{$fingerprintResults->{locations_json}};
160
4
4
3
2
            my @fingerprints = @{$fingerprintResults->{fingerprints}};
161
4
2
            if ($flatten) {
162
0
0
                my $locations_json_flat = join ',', @locations_json;
163
0
0
                my $partialFingerprints;
164
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
165
0
0
                if ($partialFingerprint ne '') {
166
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
167                }
168
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
169
0
0
                my $result = decode_json $result_json;
170
0
0
                push @results, $result;
171            } else {
172
4
3
                my $limit = scalar @locations_json;
173
4
2
                for (my $i = 0; $i < $limit; ++$i) {
174
7
4
                    my $locations_json_flat = $locations_json[$i];
175
7
3
                    my $partialFingerprints = '';
176
7
2
                    my $partialFingerprint = $fingerprints[$i];
177
7
3
                    if ($partialFingerprint ne '') {
178
4
2
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
179                    }
180
7
7
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
181
7
2
                    my $result = decode_json $result_json;
182
7
7135
                    push @results, $result;
183                }
184            }
185        }
186    }
187
1
4
    close WARNINGS;
188
1
7
    return \@results;
189}
190
191sub get_runs_from_sarif {
192
2
2
    my ($sarif_json) = @_;
193
2
1
    my %runs_view;
194
2
2
    return %runs_view unless $sarif_json->{'runs'};
195
2
2
1
3
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
196
2
3
    foreach my $sarif_json_run (@sarif_json_runs) {
197
2
2
2
2
        my %sarif_json_run_hash=%{$sarif_json_run};
198
2
3
        next unless defined $sarif_json_run_hash{'tool'};
199
200
2
2
0
2
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
201
2
3
        next unless defined $sarif_json_run_tool_hash{'driver'};
202
203
2
2
0
4
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
204        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
205
2
6
            defined $sarif_json_run_tool_driver_hash{'rules'};
206
207
2
1
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
208
2
2
1
3
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
209
2
1
        my %driver_view;
210
2
2
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
211
38
22
            next unless defined $driver_rule->{'id'};
212
38
37
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
213        }
214
2
3
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
215    }
216
2
3
    return %runs_view;
217}
218
219sub main {
220
1
383
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
221
1
3
    unless (-f $sarif_template_file) {
222
0
0
        warn "Could not find sarif template";
223
0
0
        return '';
224    }
225
226
1
2
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
227
1
2
    die "sarif template is empty" unless $sarif_template;
228
229
1
0
3
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
230
1
51
    my $sarif_json = $json->decode($sarif_template);
231
232
1
96961
    if (defined $sarif_template_overlay_file && -s $sarif_template_overlay_file) {
233
1
6
        my $merger = Hash::Merge->new();
234
1
77
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
235
1
6
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
236
237        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
238
36
3689
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
239
36
36
15
37
            return [@{$_[1]}];
240
1
3
        };
241
242
1
2
        my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
243
1
3
        my %runs_base = get_runs_from_sarif($sarif_json);
244
245
1
3
        my $sarif_template_hash = $json->decode($sarif_template_overlay);
246
1
1659
        my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
247
1
2
        for my $run_id (keys %runs_overlay) {
248
1
1
            if (defined $runs_base{$run_id}) {
249
1
0
                my $run_base_hash = $runs_base{$run_id};
250
1
3
                my $run_overlay_hash = $runs_overlay{$run_id};
251
1
1
                for my $overlay_id (keys %$run_overlay_hash) {
252                    $run_base_hash->{$overlay_id} = $merger->merge(
253                        $run_overlay_hash->{$overlay_id},
254
1
2
                        $run_base_hash->{$overlay_id}
255                    );
256                }
257            } else {
258
0
0
                $runs_base{$run_id} = $runs_overlay{$run_id};
259            }
260        }
261        #$sarif_json->
262
1
1
26
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
263
1
0
        foreach my $sarif_json_run (@sarif_json_runs) {
264
1
1
1
1
            my %sarif_json_run_hash=%{$sarif_json_run};
265
1
2
            next unless defined $sarif_json_run_hash{'tool'};
266
267
1
1
1
1
            my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
268
1
1
            next unless defined $sarif_json_run_tool_hash{'driver'};
269
270
1
1
1
4
            my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
271
1
1
            my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
272            next unless defined $driver_name &&
273
1
3
                defined $sarif_json_run_tool_driver_hash{'rules'};
274
275
1
1
            my $driver_view_hash = $runs_base{$driver_name};
276
1
0
            next unless defined $driver_view_hash;
277
278
1
1
1
3
            my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
279
1
2
            for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
280
38
2443
                my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
281
38
16
                my $driver_rule_id = $driver_rule->{'id'};
282                next unless defined $driver_rule_id &&
283
38
55
                    defined $driver_view_hash->{$driver_rule_id};
284
37
22
                $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
285            }
286        }
287
1
2
        delete $sarif_template_hash->{'runs'};
288
1
1
        $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
289    }
290    {
291
1
1
1
489
1
2
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
292
1
1
        foreach my $sarif_json_run (@sarif_json_runs) {
293
1
0
            my %sarif_json_run_automationDetails;
294
1
37
            $sarif_json_run_automationDetails{id} = $category;
295
1
1
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
296        }
297    }
298
299
1
1
0
2
    my %sarif = %{$sarif_json};
300
301
1
2
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
302
303
1
2
    my $results = parse_warnings $ENV{warning_output};
304
1
1
    if ($results) {
305
1
2
        $sarif{'runs'}[0]{'results'} = $results;
306
1
1
        my %codes;
307
1
1
        for my $result_ref (@$results) {
308
7
7
2
6
            my %result = %{$result_ref};
309
7
6
            $codes{$result{'ruleId'}} = 1;
310        }
311
1
2
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
312
1
1
1
2
        my @rules = @{$rules_ref};
313
1
0
        my $missing_rule_definition_id = 'missing-rule-definition';
314
1
37
1
28
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
315
1
37
2
19
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
316
1
1
        my $code_index = 0;
317
1
1
2
2
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
318
1
2
2
1
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
319
1
1
        my $missing_rule_definition_index;
320
1
4
        if (@missing_codes) {
321
1
1
            push @rules, $missing_rule_definition_ref;
322
1
1
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
323
1
1
            for my $missing_code (@missing_codes) {
324
1
1
                my $message = "No rule definition for `$missing_code`";
325
1
0
                my $locations_json_flat = '';
326
1
1
                my $partialFingerprints = '';
327
1
2
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
328
1
1
                my $result = decode_json $result_json;
329
1
1
373
1
                push @{$results}, $result;
330            }
331        }
332
1
2
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
333
1
1
0
2
        for my $result_index (0 .. scalar @{$results}) {
334
9
5
            my $result = $results->[$result_index];
335
9
2
            my $ruleId = $result->{'ruleId'};
336
9
14
            next if defined $ruleId && defined $defined_codes{$ruleId};
337
2
55
            $result->{'ruleId'} = $missing_rule_definition_id;
338        }
339    }
340
341
1
2
    return encode_json \%sarif;
342}
343
3441;