File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:84.4%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
106851
1
36
use File::Basename;
9
1
1
1
182
1214
83
use Digest::SHA qw($errmsg);
10
1
1
1
2
2
26
use JSON::PP;
11
1
1
1
172
3735
48
use Hash::Merge qw( merge );
12
1
1
1
136
1
16
use CheckSpelling::Util;
13
1
1
1
168
1
1372
use CheckSpelling::GitSources;
14
15sub encode_low_ascii {
16
6
204
    $_ = shift;
17
6
1
11
4
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
18
6
4
    return $_;
19}
20
21sub url_encode {
22
5
3
    $_ = shift;
23
5
0
2
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
24
5
4
    return $_;
25}
26
27sub double_slash_escape {
28
5
2
    $_ = shift;
29
5
19
    s/(["()\]\\])/\\\\$1/g;
30
5
6
    return $_;
31}
32
33sub parse_warnings {
34
1
0
    my ($warnings) = @_;
35
1
30
    our $flatten;
36
1
0
    our %directoryToRepo;
37
1
1
    our $provenanceInsertion;
38
1
0
    our %provenanceStringToIndex;
39
1
1
    our %directoryToProvenanceInsertion;
40
1
0
    my @results;
41
1
10
    open WARNINGS, '<', $warnings || print STDERR "Could not open $warnings\n";
42
1
2
    my $rules = ();
43
1
0
    my %encoded_files = ();
44
1
1
    my %hashes_needed_for_files = ();
45
1
10
    while (<WARNINGS>) {
46
7
8
        next if m{^https://};
47
6
19
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
48
5
18
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
49
5
76
        my $directory = dirname($file);
50
5
5
        unless (defined $directoryToProvenanceInsertion{$directory}) {
51
2
3
            my $provenanceString = collectVersionControlProvenance($file);
52
2
183
            if (defined $provenanceStringToIndex{$provenanceString}) {
53
0
0
                $directoryToProvenanceInsertion{$directory} = $provenanceStringToIndex{$provenanceString};
54            } else {
55
2
4
                $provenanceStringToIndex{$provenanceString} = $provenanceInsertion;
56
2
2
                $directoryToProvenanceInsertion{$directory} = $provenanceInsertion;
57
2
3
                ++$provenanceInsertion;
58            }
59        }
60        # single-slash-escape `"` and `\`
61
5
4
        $message =~ s/(["\\])/\\$1/g;
62
5
5
        $message = encode_low_ascii $message;
63        # double-slash-escape `"`, `(`, `)`, `]`
64
5
5
        $message = double_slash_escape $message;
65        # encode `message` and `file` to protect against low ascii`
66
5
3
        my $encoded_file = url_encode $file;
67
5
5
        $encoded_files{$encoded_file} = $file;
68        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's SARIF view
69
5
16
        $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
70        # replace '`' with `\`+`"` because GitHub's SARIF parser doesn't like them
71
5
4
        $message =~ s/\`/\\"/g;
72
5
7
        unless (defined $rules->{$code}) {
73
1
2
            $rules->{$code} = {};
74        }
75
5
4
        my $rule = $rules->{$code};
76
5
4
        unless (defined $rule->{$message}) {
77
2
5
            $rule->{$message} = [];
78        }
79
5
13
        my $hashed_message = Digest::SHA::sha1_base64($message);
80
5
5
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
81
5
10
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
82
5
8
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
83
5
8
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
84
5
3
        my $locations = $rule->{$message};
85
5
9
        my $physicalLocation = {
86            'uri' => $encoded_file,
87            'startLine' => $line,
88            'startColumn' => $column,
89            'endColumn' => $endColumn,
90        };
91
5
3
        push @$locations, $physicalLocation;
92
5
16
        $rule->{$message} = $locations;
93    }
94
1
0
    my %line_hashes = ();
95
1
1
    my %used_hashes = ();
96
1
4
    for my $file (sort keys %hashes_needed_for_files) {
97
1
1
        $line_hashes{$file} = ();
98
1
4
        unless (-e $file) {
99
0
0
            delete $hashes_needed_for_files{$file};
100
0
0
            next;
101        }
102
1
1
1
3
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
103
1
25
        unless (defined $directoryToRepo{dirname($file)}) {
104
1
2
            my ($parsed_file, $git_base_dir, $prefix, $remote_url, $rev, $branch) = CheckSpelling::GitSources::git_source_and_rev($file);
105        }
106
1
18
        open $file_fh, '<', $file;
107
1
1
        my $line = shift @lines;
108
1
2
        $line = 2 if $line == 1;
109
1
0
        my $buffer = '';
110
1
8
        while (<$file_fh>) {
111
7
8
            if ($line == $.) {
112
3
3
                my $sample = substr $buffer, -100, 100;
113
3
4
                my $hash = Digest::SHA::sha1_base64($sample);
114
3
4
                for (; $line == $.; $line = shift @lines) {
115
4
3
                    my $hit = $used_hashes{$hash}++;
116
4
5
                    $hash = "$hash:$hit" if $hit;
117
4
3
                    $line_hashes{$file}{$line} = $hash;
118
4
5
                    last unless @lines;
119                }
120            }
121
7
4
            $buffer .= $_;
122
7
18
            $buffer =~ s/\s+/ /g;
123
7
8
            $buffer = substr $buffer, -100, 100;
124        }
125
1
4
        close $file_fh;
126    }
127
1
1
0
2
    for my $code (sort keys %{$rules}) {
128
1
1
        my $rule = $rules->{$code};
129
1
1
0
1
        for my $message (sort keys %{$rule}) {
130
2
5
            my $hashed_message = Digest::SHA::sha1_base64($message);
131
2
2
            my $locations = $rule->{$message};
132
2
1
            my @locations_json = ();
133
2
0
            my @fingerprints = ();
134
2
2
            for my $location (@$locations) {
135
5
3
                my $encoded_file = $location->{uri};
136
5
5
                my $line = $location->{startLine};
137
5
0
                my $column = $location->{startColumn};
138
5
3
                my $endColumn = $location->{endColumn};
139
5
2
                my $partialFingerprint = '';
140
5
2
                my $file = $encoded_files{$encoded_file};
141
5
4
                if (defined $line_hashes{$file}) {
142
5
4
                    my $line_hash = $line_hashes{$file}{$line};
143
5
3
                    if (defined $line_hash) {
144
3
3
0
5
                        my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
145
3
1
                        my $hit = scalar @instances;
146
3
4
                        while (--$hit > 0) {
147
0
0
                            last if $instances[$hit] == $column;
148                        }
149
3
7
                        $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
150                    }
151                }
152
5
3
                push @fingerprints, $partialFingerprint;
153
5
6
                my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
154
5
15
                push @locations_json, $json_fragment;
155            }
156
2
3
            if ($flatten) {
157
0
0
                my $locations_json_flat = join ',', @locations_json;
158
0
0
                my $partialFingerprints;
159
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
160
0
0
                if ($partialFingerprint ne '') {
161
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
162                }
163
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
164
0
0
                my $result = decode_json $result_json;
165
0
0
                push @results, $result;
166            } else {
167
2
1
                my $limit = scalar @locations_json;
168
2
2
                for (my $i = 0; $i < $limit; ++$i) {
169
5
1
                    my $locations_json_flat = $locations_json[$i];
170
5
5
                    my $partialFingerprints = '';
171
5
2
                    my $partialFingerprint = $fingerprints[$i];
172
5
4
                    if ($partialFingerprint ne '') {
173
3
2
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
174                    }
175
5
5
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
176
5
6
                    my $result = decode_json $result_json;
177
5
5814
                    push @results, $result;
178                }
179            }
180        }
181    }
182
1
6
    close WARNINGS;
183
1
9
    return \@results;
184}
185
186sub get_runs_from_sarif {
187
2
2
    my ($sarif_json) = @_;
188
2
1
    my %runs_view;
189
2
3
    return %runs_view unless $sarif_json->{'runs'};
190
2
2
0
2
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
191
2
4
    foreach my $sarif_json_run (@sarif_json_runs) {
192
2
2
1
3
        my %sarif_json_run_hash=%{$sarif_json_run};
193
2
2
        next unless defined $sarif_json_run_hash{'tool'};
194
195
2
2
1
3
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
196
2
1
        next unless defined $sarif_json_run_tool_hash{'driver'};
197
198
2
2
2
5
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
199        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
200
2
8
            defined $sarif_json_run_tool_driver_hash{'rules'};
201
202
2
3
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
203
2
2
1
3
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
204
2
1
        my %driver_view;
205
2
2
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
206
31
7
            next unless defined $driver_rule->{'id'};
207
31
38
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
208        }
209
2
6
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
210    }
211
2
3
    return %runs_view;
212}
213
214sub collectVersionControlProvenance {
215
2
1
    my ($file) = @_;
216
2
4
    my ($parsed_file, $git_base_dir, $prefix, $remote_url, $rev, $branch) = CheckSpelling::GitSources::git_source_and_rev($file);
217
2
3
    my $base = substr $parsed_file, 0, length($file);
218
2
2
    my $provenance = [$remote_url, $rev, $branch, $git_base_dir];
219
2
11
    return JSON::PP::encode_json($provenance);
220}
221
222sub generateVersionControlProvenance {
223
1
1
    my ($versionControlProvenanceList, $run) = @_;
224
1
1
    my %provenance;
225    sub buildVersionControlProvenance {
226
1
1
        my $d = $_;
227
1
1
0
1
        my ($remote_url, $rev, $branch, $git_base_dir) = @{JSON::PP::decode_json($d)};
228
1
304
        my $dir = $git_base_dir eq '.' ? '%SRCROOT%' : "DIR_$provenanceStringToIndex{$d}";
229
1
1
        my $mappedTo = {
230            "uriBaseId" => $dir
231        };
232
1
2
        my $versionControlProvenance = {
233            "mappedTo" => $mappedTo
234        };
235
1
1
        $versionControlProvenance->{"revisionId"} = $rev if defined $rev;
236
1
3
        $versionControlProvenance->{"branch"} = $branch if defined $branch;
237
1
1
        $versionControlProvenance->{"repositoryUri"} = $remote_url if defined $remote_url;
238
1
2
        return $versionControlProvenance;
239    }
240
1
1
    @provenanceList = map(buildVersionControlProvenance,@$versionControlProvenanceList);
241
1
1
    $run->{"versionControlProvenance"} = \@provenanceList;
242}
243
244my $provenanceInsertion = 0;
245my %provenanceStringToIndex = ();
246my %directoryToProvenanceInsertion = ();
247
248sub main {
249
1
16793
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
250
1
7
    unless (-f $sarif_template_file) {
251
0
0
        warn "Could not find SARIF template";
252
0
0
        return '';
253    }
254
255
1
3
    $ENV{GITHUB_SERVER_URL} = '' unless defined $ENV{GITHUB_SERVER_URL};
256
1
1
    $ENV{GITHUB_REPOSITORY} = '' unless defined $ENV{GITHUB_REPOSITORY};
257
1
4
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
258
1
1
    die "sarif template is empty" unless $sarif_template;
259
260
1
0
9
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
261
1
74
    my $sarif_json = $json->decode($sarif_template);
262
263
1
86586
    if (defined $sarif_template_overlay_file) {
264
1
9
        my $merger = Hash::Merge->new();
265
1
86
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
266
1
5
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
267
268        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
269
28
2690
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
270
28
28
17
33
            return [@{$_[1]}];
271
1
2
        };
272
273
1
6
        if (-s $sarif_template_overlay_file) {
274
1
1
            my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
275
1
1
            my %runs_base = get_runs_from_sarif($sarif_json);
276
277
1
1
            my $sarif_template_hash = $json->decode($sarif_template_overlay);
278
1
1806
            my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
279
1
1
            for my $run_id (keys %runs_overlay) {
280
1
7
                if (defined $runs_base{$run_id}) {
281
1
10
                    my $run_base_hash = $runs_base{$run_id};
282
1
0
                    my $run_overlay_hash = $runs_overlay{$run_id};
283
1
2
                    for my $overlay_id (keys %$run_overlay_hash) {
284                        $run_base_hash->{$overlay_id} = $merger->merge(
285                            $run_overlay_hash->{$overlay_id},
286
1
2
                            $run_base_hash->{$overlay_id}
287                        );
288                    }
289                } else {
290
0
0
                    $runs_base{$run_id} = $runs_overlay{$run_id};
291                }
292            }
293            #$sarif_json->
294
1
1
53
1
            my @sarif_json_runs = @{$sarif_json->{'runs'}};
295
1
0
            foreach my $sarif_json_run (@sarif_json_runs) {
296
1
1
1
1
                my %sarif_json_run_hash=%{$sarif_json_run};
297
1
1
                next unless defined $sarif_json_run_hash{'tool'};
298
299
1
1
1
1
                my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
300
1
1
                next unless defined $sarif_json_run_tool_hash{'driver'};
301
302
1
1
1
2
                my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
303
1
1
                my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
304                next unless defined $driver_name &&
305
1
5
                    defined $sarif_json_run_tool_driver_hash{'rules'};
306
307
1
1
                my $driver_view_hash = $runs_base{$driver_name};
308
1
1
                next unless defined $driver_view_hash;
309
310
1
1
0
2
                my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
311
1
2
                for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
312
31
2695
                    my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
313
31
15
                    my $driver_rule_id = $driver_rule->{'id'};
314                    next unless defined $driver_rule_id &&
315
31
44
                        defined $driver_view_hash->{$driver_rule_id};
316
30
21
                    $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
317                }
318            }
319
1
2
            delete $sarif_template_hash->{'runs'};
320
1
1
            $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
321        }
322    }
323    {
324
1
1
1
462
0
2
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
325
1
1
        foreach my $sarif_json_run (@sarif_json_runs) {
326
1
1
            my %sarif_json_run_automationDetails;
327
1
1
            $sarif_json_run_automationDetails{id} = $category;
328
1
1
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
329        }
330    }
331
332
1
1
0
2
    my %sarif = %{$sarif_json};
333
334
1
1
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
335
336
1
2
    my $results = parse_warnings $ENV{warning_output};
337
1
1
    if ($results) {
338
1
2
        $sarif{'runs'}[0]{'results'} = $results;
339
1
0
        our %provenanceStringToIndex;
340
1
1
        my @provenanceList = keys %provenanceStringToIndex;
341
1
2
        generateVersionControlProvenance(\@provenanceList, $sarif{'runs'}[0]);
342
1
1
        my %codes;
343
1
2
        for my $result_ref (@$results) {
344
5
5
1
5
            my %result = %{$result_ref};
345
5
6
            $codes{$result{'ruleId'}} = 1;
346        }
347
1
1
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
348
1
1
0
5
        my @rules = @{$rules_ref};
349
1
2
        my $missing_rule_definition_id = 'missing-rule-definition';
350
1
30
1
17
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
351
1
30
1
16
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
352
1
1
        my $code_index = 0;
353
1
1
1
1
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
354
1
1
1
2
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
355
1
0
        my $missing_rule_definition_index;
356
1
2
        if (@missing_codes) {
357
0
0
            push @rules, $missing_rule_definition_ref;
358
0
0
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
359
0
0
            for my $missing_code (@missing_codes) {
360
0
0
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
361
0
0
                my $result = decode_json $result_json;
362
0
0
0
0
                push @{$results}, $result;
363            }
364        }
365
1
1
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
366
1
1
1
3
        for my $result_index (0 .. scalar @{$results}) {
367
6
1
            my $result = $results->[$result_index];
368
6
2
            my $ruleId = $result->{'ruleId'};
369
6
13
            next if defined $ruleId && defined $defined_codes{$ruleId};
370
1
43
            $result->{'ruleIndex'} = $missing_rule_definition_index;
371        }
372    }
373
374
1
1
    return encode_json \%sarif;
375}
376
3771;