File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:84.0%

line stmt bran cond sub time code
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
107204
0
34
use File::Basename;
9
1
1
1
181
1249
69
use Digest::SHA qw($errmsg);
10
1
1
1
2
1
23
use JSON::PP;
11
1
1
1
174
3701
25
use Hash::Merge qw( merge );
12
1
1
1
132
1
17
use CheckSpelling::Util;
13
1
1
1
143
1
1401
use CheckSpelling::GitSources;
14
# Replace control characters and the characters `#` and `%` with a literal
# `\uXXXX` escape sequence (a backslash, `u`, and four lowercase hex digits).
#
# Every character below U+0020 except newline (U+000A) is escaped; JSON does
# not allow raw control characters inside a string, and callers embed the
# result in JSON text.
#
# Parameters:
#   $text - the string to encode.
# Returns:
#   the encoded copy of $text; the global `$_` is left untouched.
sub encode_low_ascii {
    my ($text) = @_;
    # U+0000-U+0009 and U+000B-U+001F: all C0 controls except newline.
    $text =~ s/([\x{0}-\x{9}\x{0b}-\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
    return $text;
}
20
# Percent-encode a string for use as a URI.
#
# Characters outside the set `-!$&'()*+,/:;=?@A-Za-z0-9_.~` are replaced by
# `%` followed by the character's code point as (at least) two lowercase hex
# digits.
#
# Parameters:
#   $text - the string (for example a file path) to encode.
# Returns:
#   the encoded copy of $text; the global `$_` is left untouched.
sub url_encode {
    my ($text) = @_;
    $text =~ s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
    return $text;
}
26
# Prefix each `"`, `(`, `)`, `]` and `\` in a string with two backslashes.
#
# Parameters:
#   $text - the string to escape.
# Returns:
#   the escaped copy of $text; the global `$_` is left untouched.
sub double_slash_escape {
    my ($text) = @_;
    $text =~ s/(["()\]\\])/\\\\$1/g;
    return $text;
}
32
# Convert a warnings file into a list of SARIF result structures.
#
# Each line of interest in the warnings file has the shape
#   FILE:LINE:COLUMN ... END_COLUMN, (Error|Warning|Notice) - MESSAGE (CODE)
# Lines starting with `https://` and lines that do not match are skipped.
#
# Parameters:
#   $warnings - path of the warnings file.
# Returns:
#   an array reference of decoded result hashes (`ruleId`, `message`,
#   `locations` and, when a line hash could be computed,
#   `partialFingerprints`). When the warnings file cannot be opened a
#   message is printed to STDERR and an empty array reference is returned.
#
# Side effects: updates the package variables $provenanceInsertion,
# %provenanceStringToIndex and %directoryToProvenanceInsertion.
sub parse_warnings {
    my ($warnings) = @_;
    our $flatten;
    our %directoryToRepo;
    our $provenanceInsertion;
    our %provenanceStringToIndex;
    our %directoryToProvenanceInsertion;
    my @results;

    # The first provenance entry must get a defined index (0); otherwise the
    # `defined` lookups below can never find it again.
    $provenanceInsertion = 0 unless defined $provenanceInsertion;

    my $warnings_fh;
    unless (defined $warnings && open($warnings_fh, '<', $warnings)) {
        my $name = defined $warnings ? $warnings : '';
        print STDERR "Could not open $name\n";
        return \@results;
    }

    # $rules->{CODE}{MESSAGE} is the list of locations reporting that message.
    my $rules = {};
    my %encoded_files = ();
    # {file}{line}{sha1 of message}{column} = '1'
    my %hashes_needed_for_files = ();
    while (my $warning = <$warnings_fh>) {
        next if $warning =~ m{^https://};
        next unless $warning =~ m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
        # The severity ($5) is matched but not used.
        my ($file, $line, $column, $endColumn, $message, $code) = ($1, $2, $3, $4, $6, $7);
        my $directory = dirname($file);
        unless (defined $directoryToProvenanceInsertion{$directory}) {
            my $provenanceString = collectVersionControlProvenance($file);
            if (defined $provenanceStringToIndex{$provenanceString}) {
                $directoryToProvenanceInsertion{$directory} = $provenanceStringToIndex{$provenanceString};
            } else {
                $provenanceStringToIndex{$provenanceString} = $provenanceInsertion;
                $directoryToProvenanceInsertion{$directory} = $provenanceInsertion;
                ++$provenanceInsertion;
            }
        }
        # single-slash-escape `"` and `\`
        $message =~ s/(["\\])/\\$1/g;
        $message = encode_low_ascii($message);
        # double-slash-escape `"`, `(`, `)`, `]`
        $message = double_slash_escape($message);
        # encode `file` so it is safe to embed as a uri
        my $encoded_file = url_encode($file);
        $encoded_files{$encoded_file} = $file;
        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's SARIF view
        if ($message =~ /(`{2,})/) {
            my $backticks = $1;
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
                $backticks = $1 if length($1) > length($backticks);
            }
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
        } else {
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
        }
        # replace any remaining '`' with `'` because GitHub's SARIF parser doesn't like them
        $message =~ s/\`/'/g;

        my $hashed_message = Digest::SHA::sha1_base64($message);
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
        push @{ $rules->{$code}{$message} }, {
            'uri' => $encoded_file,
            'startLine' => $line,
            'startColumn' => $column,
            'endColumn' => $endColumn,
        };
    }
    close $warnings_fh;

    # For every reported line of every existing file, hash the (whitespace
    # collapsed) text that precedes the line, limited to 100 characters.
    my %line_hashes = ();
    my %used_hashes = ();
    for my $file (sort keys %hashes_needed_for_files) {
        $line_hashes{$file} = undef;
        unless (-e $file) {
            delete $hashes_needed_for_files{$file};
            next;
        }
        # Numeric order is required: the file is read front to back and a
        # string sort would put e.g. line 10 ahead of line 2.
        my @lines = sort { $a <=> $b } keys %{$hashes_needed_for_files{$file}};
        unless (defined $directoryToRepo{dirname($file)}) {
            # The return value is unused; the call is kept (in list context)
            # in case it has side effects -- TODO confirm and drop if not.
            () = CheckSpelling::GitSources::git_source_and_rev($file);
        }
        open(my $file_fh, '<', $file) or next;
        my $line = shift @lines;
        # NOTE(review): line 1 is handled as line 2, presumably because no
        # text precedes line 1 -- confirm.
        $line = 2 if $line == 1;
        my $buffer = '';
        while (my $text = <$file_fh>) {
            if ($line == $.) {
                my $sample = substr $buffer, -100, 100;
                my $hash = Digest::SHA::sha1_base64($sample);
                for (; $line == $.; $line = shift @lines) {
                    # Repeated hashes get a `:N` suffix to keep them distinct.
                    my $hit = $used_hashes{$hash}++;
                    $hash = "$hash:$hit" if $hit;
                    $line_hashes{$file}{$line} = $hash;
                    last unless @lines;
                }
            }
            $buffer .= $text;
            $buffer =~ s/\s+/ /g;
            $buffer = substr $buffer, -100, 100;
        }
        close $file_fh;
    }

    # Assemble one result as JSON text and decode it. The message has already
    # been escaped above so that it survives this decode step.
    my $build_result = sub {
        my ($code, $partialFingerprint, $message, $locations_json_flat) = @_;
        my $partialFingerprints = '';
        if (defined $partialFingerprint && $partialFingerprint ne '') {
            $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
        }
        my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
        return decode_json($result_json);
    };

    for my $code (sort keys %{$rules}) {
        my $rule = $rules->{$code};
        for my $message (sort keys %{$rule}) {
            my $hashed_message = Digest::SHA::sha1_base64($message);
            my @locations_json = ();
            my @fingerprints = ();
            for my $location (@{ $rule->{$message} }) {
                my $encoded_file = $location->{uri};
                my $line = $location->{startLine};
                my $column = $location->{startColumn};
                my $endColumn = $location->{endColumn};
                my $partialFingerprint = '';
                my $file = $encoded_files{$encoded_file};
                if (defined $line_hashes{$file}) {
                    my $line_hash = $line_hashes{$file}{$line};
                    if (defined $line_hash) {
                        # Position of this column among the columns reporting
                        # the same message on the same line. The string sort
                        # is left as-is so the computed fingerprints do not
                        # change.
                        my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
                        my $hit = scalar @instances;
                        while (--$hit > 0) {
                            last if $instances[$hit] == $column;
                        }
                        $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
                    }
                }
                push @fingerprints, $partialFingerprint;
                push @locations_json, qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
            }
            if ($flatten) {
                # One result carrying every location; it uses the fingerprint
                # that sorts first (an empty one means no fingerprint).
                my $partialFingerprint = (sort @fingerprints)[0];
                push @results, $build_result->($code, $partialFingerprint, $message, join(',', @locations_json));
            } else {
                # One result per location.
                for my $i (0 .. $#locations_json) {
                    push @results, $build_result->($code, $fingerprints[$i], $message, $locations_json[$i]);
                }
            }
        }
    }
    return \@results;
}
193
# Build a lookup of rule definitions from a decoded SARIF document.
#
# Parameters:
#   $sarif_json - hash reference of a decoded SARIF document.
# Returns:
#   a hash (list) mapping each run's `tool.driver.name` to a hash reference
#   that maps rule `id` to the rule's own hash reference (not a copy).
#   Runs lacking `tool`, `driver`, `name` or `rules`, and rules lacking `id`,
#   are skipped, as are values that do not have the expected hash/array
#   shape. An empty hash is returned when there are no `runs`.
sub get_runs_from_sarif {
    my ($sarif_json) = @_;
    my %runs_view;
    return %runs_view unless ref($sarif_json) eq 'HASH'
        && ref($sarif_json->{'runs'}) eq 'ARRAY';

    for my $run (@{ $sarif_json->{'runs'} }) {
        next unless ref($run) eq 'HASH' && ref($run->{'tool'}) eq 'HASH';

        my $driver = $run->{'tool'}{'driver'};
        next unless ref($driver) eq 'HASH';

        my ($driver_name, $driver_rules) = @{$driver}{qw(name rules)};
        next unless defined $driver_name && ref($driver_rules) eq 'ARRAY';

        my %driver_view;
        for my $driver_rule (@{$driver_rules}) {
            next unless ref($driver_rule) eq 'HASH' && defined $driver_rule->{'id'};
            $driver_view{ $driver_rule->{'id'} } = $driver_rule;
        }
        $runs_view{$driver_name} = \%driver_view;
    }
    return %runs_view;
}
221
# Describe where a file comes from as a JSON string.
#
# Parameters:
#   $file - path handed to CheckSpelling::GitSources::git_source_and_rev.
# Returns:
#   a JSON array `[remote_url, rev, branch, git_base_dir]` built from the
#   4th, 5th, 6th and 2nd values returned by git_source_and_rev; the other
#   returned values are not used.
sub collectVersionControlProvenance {
    my ($file) = @_;
    my (undef, $git_base_dir, undef, $remote_url, $rev, $branch) =
        CheckSpelling::GitSources::git_source_and_rev($file);
    return JSON::PP::encode_json([$remote_url, $rev, $branch, $git_base_dir]);
}
229
# Attach a `versionControlProvenance` list to a SARIF run.
#
# Parameters:
#   $versionControlProvenanceList - array reference of provenance strings
#       (JSON arrays `[remote_url, rev, branch, git_base_dir]`).
#   $run - hash reference of the run to update.
# Returns:
#   the array reference stored in $run->{versionControlProvenance}. Each call
#   stores a fresh array, so earlier runs are not affected by later calls.
sub generateVersionControlProvenance {
    my ($versionControlProvenanceList, $run) = @_;
    my @provenance_list = map { buildVersionControlProvenance($_) } @$versionControlProvenanceList;
    $run->{"versionControlProvenance"} = \@provenance_list;
    return $run->{"versionControlProvenance"};
}

# Turn one provenance string into a versionControlProvenance entry.
#
# Parameters:
#   $d - the provenance string; when called without arguments the current
#        value of `$_` is used instead.
# Returns:
#   a hash reference with `mappedTo.uriBaseId` (`%SRCROOT%` when the git base
#   directory is `.`, otherwise `DIR_` followed by the string's entry in
#   %provenanceStringToIndex) and, when defined, `revisionId`, `branch` and
#   `repositoryUri`.
sub buildVersionControlProvenance {
    my $d = @_ ? $_[0] : $_;
    our %provenanceStringToIndex;
    my ($remote_url, $rev, $branch, $git_base_dir) = @{JSON::PP::decode_json($d)};
    my $dir = (defined $git_base_dir && $git_base_dir eq '.')
        ? '%SRCROOT%'
        : "DIR_$provenanceStringToIndex{$d}";
    my $versionControlProvenance = {
        "mappedTo" => {
            "uriBaseId" => $dir
        }
    };
    $versionControlProvenance->{"revisionId"} = $rev if defined $rev;
    $versionControlProvenance->{"branch"} = $branch if defined $branch;
    $versionControlProvenance->{"repositoryUri"} = $remote_url if defined $remote_url;
    return $versionControlProvenance;
}
251
# Shared provenance bookkeeping. These are package variables (`our`) because
# the subs in this file reach them through `our` declarations or as plain
# package globals; lexical (`my`) variables declared here would be separate
# variables and their initial values would never be seen by those subs.
our $provenanceInsertion = 0;
our %provenanceStringToIndex = ();
our %directoryToProvenanceInsertion = ();
255
# Produce the SARIF report as a JSON string.
#
# Parameters:
#   $sarif_template_file         - path of the SARIF template (required).
#   $sarif_template_overlay_file - optional path of an overlay; when the file
#                                  is non-empty its rules are merged into the
#                                  template's rules and its other top-level
#                                  content is merged into the template.
#   $category                    - stored as `automationDetails.id` of each run.
# Environment:
#   CHECK_SPELLING_VERSION - stored as the first run's driver version.
#   warning_output         - path passed to parse_warnings.
#   GITHUB_SERVER_URL, GITHUB_REPOSITORY - set to '' when undefined.
# Returns:
#   the encoded SARIF JSON, or '' (after a warning) when the template file
#   does not exist. Dies when the template is empty.
sub main {
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
    unless (-f $sarif_template_file) {
        warn "Could not find SARIF template";
        return '';
    }

    $ENV{GITHUB_SERVER_URL} = '' unless defined $ENV{GITHUB_SERVER_URL};
    $ENV{GITHUB_REPOSITORY} = '' unless defined $ENV{GITHUB_REPOSITORY};
    my $sarif_template = CheckSpelling::Util::read_file($sarif_template_file);
    die "sarif template is empty" unless $sarif_template;

    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
    my $sarif_json = $json->decode($sarif_template);

    if (defined $sarif_template_overlay_file) {
        my $merger = Hash::Merge->new();
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};

        # Arrays whose first element is a reference on either side use the
        # stock array merge; otherwise the result is a copy of the right-hand
        # array.
        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
            return [@{$_[1]}];
        };

        if (-s $sarif_template_overlay_file) {
            my $sarif_template_overlay = CheckSpelling::Util::read_file($sarif_template_overlay_file);
            my %runs_base = get_runs_from_sarif($sarif_json);

            my $sarif_template_hash = $json->decode($sarif_template_overlay);
            my %runs_overlay = get_runs_from_sarif($sarif_template_hash);

            # Fold the overlay's rule views into the base views, per driver.
            for my $run_id (keys %runs_overlay) {
                if (defined $runs_base{$run_id}) {
                    my $run_base_hash = $runs_base{$run_id};
                    my $run_overlay_hash = $runs_overlay{$run_id};
                    for my $overlay_id (keys %$run_overlay_hash) {
                        $run_base_hash->{$overlay_id} = $merger->merge(
                            $run_overlay_hash->{$overlay_id},
                            $run_base_hash->{$overlay_id}
                        );
                    }
                } else {
                    $runs_base{$run_id} = $runs_overlay{$run_id};
                }
            }

            # Write the merged rule definitions back into the template's runs.
            for my $sarif_json_run (@{ $sarif_json->{'runs'} || [] }) {
                my $tool = $sarif_json_run->{'tool'};
                next unless defined $tool;

                my $driver = $tool->{'driver'};
                next unless defined $driver;

                my $driver_name = $driver->{'name'};
                my $driver_rules = $driver->{'rules'};
                next unless defined $driver_name && defined $driver_rules;

                my $driver_view_hash = $runs_base{$driver_name};
                next unless defined $driver_view_hash;

                # `$#{...}` is the last valid index; iterating up to the
                # element count would visit one slot past the end.
                for my $driver_rule_number (0 .. $#{$driver_rules}) {
                    my $driver_rule = $driver_rules->[$driver_rule_number];
                    my $driver_rule_id = $driver_rule->{'id'};
                    next unless defined $driver_rule_id &&
                        defined $driver_view_hash->{$driver_rule_id};
                    $driver_rules->[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
                }
            }
            # The runs were merged rule by rule above; merge everything else.
            delete $sarif_template_hash->{'runs'};
            $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
        }
    }

    for my $sarif_json_run (@{ $sarif_json->{'runs'} || [] }) {
        $sarif_json_run->{'automationDetails'} = { 'id' => $category };
    }

    # Results, provenance and the rule list all go on the first run.
    my $first_run = ($sarif_json->{'runs'}[0] ||= {});
    my $driver = ($first_run->{'tool'}{'driver'} ||= {});
    $driver->{'version'} = $ENV{CHECK_SPELLING_VERSION};

    my $results = parse_warnings($ENV{warning_output});
    if ($results) {
        $first_run->{'results'} = $results;
        our %provenanceStringToIndex;
        my @provenanceList = keys %provenanceStringToIndex;
        generateVersionControlProvenance(\@provenanceList, $first_run);

        my %codes = map { $_->{'ruleId'} => 1 } @{$results};

        # Keep only the rule definitions that are actually reported.
        my @rules = @{ $driver->{'rules'} || [] };
        my $missing_rule_definition_id = 'missing-rule-definition';
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
        my $code_index = 0;
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
        my @missing_codes = sort grep { !defined $defined_codes{$_} } keys %codes;
        my $missing_rule_definition_index;
        if (@missing_codes) {
            if (defined $defined_codes{$missing_rule_definition_id}) {
                # The fallback rule is already listed; reuse its position.
                $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id};
            } else {
                # Never push an undefined entry into the rule list: fall back
                # to a minimal definition when the template has none.
                $missing_rule_definition_ref = { 'id' => $missing_rule_definition_id }
                    unless defined $missing_rule_definition_ref;
                push @rules, $missing_rule_definition_ref;
                $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
            }
            # Report each code that has no rule definition.
            for my $missing_code (@missing_codes) {
                push @{$results}, {
                    'ruleId' => $missing_rule_definition_id,
                    'message' => { 'text' => "No rule definition for '$missing_code'" },
                    'locations' => [],
                };
            }
        }
        $driver->{'rules'} = \@rules;
        # Point results whose rule has no definition at the fallback rule.
        for my $result (@{$results}) {
            my $ruleId = $result->{'ruleId'};
            next if defined $ruleId && defined $defined_codes{$ruleId};
            $result->{'ruleIndex'} = $missing_rule_definition_index;
        }
    }

    return encode_json($sarif_json);
}
384
3851;