File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:83.8%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
107294
1
29
use File::Basename;
9
1
1
1
193
1236
79
use Digest::SHA qw($errmsg);
10
1
1
1
2
1
22
use JSON::PP;
11
1
1
1
229
3593
24
use Hash::Merge qw( merge );
12
1
1
1
142
1
15
use CheckSpelling::Util;
13
1
1
1
160
1
1408
use CheckSpelling::GitSources;
14
15sub encode_low_ascii {
16
7
215
    $_ = shift;
17
7
1
11
3
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
18
7
5
    return $_;
19}
20
21sub url_encode {
22
6
3
    $_ = shift;
23
6
0
5
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
24
6
5
    return $_;
25}
26
27sub double_slash_escape {
28
6
2
    $_ = shift;
29
6
20
    s/(["()\]\\])/\\\\$1/g;
30
6
4
    return $_;
31}
32
33sub parse_warnings {
34
1
1
    my ($warnings) = @_;
35
1
0
    our $flatten;
36
1
1
    our %directoryToRepo;
37
1
32
    our $provenanceInsertion;
38
1
1
    our %provenanceStringToIndex;
39
1
0
    our %directoryToProvenanceInsertion;
40
1
1
    my @results;
41
1
14
    unless (open WARNINGS, '<', $warnings) {
42
0
0
        print STDERR "Could not open $warnings\n";
43
0
0
        return [];
44    }
45
1
1
    my $rules = ();
46
1
12
    my %encoded_files = ();
47
1
1
    my %hashes_needed_for_files = ();
48
1
16
    while (<WARNINGS>) {
49
8
9
        next if m{^https://};
50
7
33
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
51
6
28
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
52
6
96
        my $directory = dirname($file);
53
6
9
        unless (defined $directoryToProvenanceInsertion{$directory}) {
54
2
4
            my $provenanceString = collectVersionControlProvenance($file);
55
2
173
            if (defined $provenanceStringToIndex{$provenanceString}) {
56
0
0
                $directoryToProvenanceInsertion{$directory} = $provenanceStringToIndex{$provenanceString};
57            } else {
58
2
5
                $provenanceStringToIndex{$provenanceString} = $provenanceInsertion;
59
2
4
                $directoryToProvenanceInsertion{$directory} = $provenanceInsertion;
60
2
1
                ++$provenanceInsertion;
61            }
62        }
63        # single-slash-escape `"` and `\`
64
6
4
        $message =~ s/(["\\])/\\$1/g;
65
6
17
        $message = encode_low_ascii $message;
66        # double-slash-escape `"`, `(`, `)`, `]`
67
6
8
        $message = double_slash_escape $message;
68        # encode `message` and `file` to protect against low ascii`
69
6
5
        my $encoded_file = url_encode $file;
70
6
5
        $encoded_files{$encoded_file} = $file;
71        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's SARIF view
72
6
6
        if ($message =~ /(`{2,})/) {
73
1
1
            my $backticks = $1;
74
1
35
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
75
0
0
                $backticks = $1 if length($1) > length($backticks);
76            }
77
1
19
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
78        } else {
79
5
15
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
80        }
81        # replace '`' with `\`+`'` because GitHub's SARIF parser doesn't like them
82
6
6
        $message =~ s/\`/'/g;
83
6
7
        unless (defined $rules->{$code}) {
84
1
3
            $rules->{$code} = {};
85        }
86
6
4
        my $rule = $rules->{$code};
87
6
4
        unless (defined $rule->{$message}) {
88
3
8
            $rule->{$message} = [];
89        }
90
6
15
        my $hashed_message = Digest::SHA::sha1_base64($message);
91
6
5
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
92
6
13
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
93
6
12
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
94
6
8
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
95
6
6
        my $locations = $rule->{$message};
96
6
11
        my $physicalLocation = {
97            'uri' => $encoded_file,
98            'startLine' => $line,
99            'startColumn' => $column,
100            'endColumn' => $endColumn,
101        };
102
6
3
        push @$locations, $physicalLocation;
103
6
21
        $rule->{$message} = $locations;
104    }
105
1
1
    my %line_hashes = ();
106
1
23
    my %used_hashes = ();
107
1
3
    for my $file (sort keys %hashes_needed_for_files) {
108
1
1
        $line_hashes{$file} = ();
109
1
6
        unless (-e $file) {
110
0
0
            delete $hashes_needed_for_files{$file};
111
0
0
            next;
112        }
113
1
1
1
4
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
114
1
11
        unless (defined $directoryToRepo{dirname($file)}) {
115
1
0
            my ($parsed_file, $git_base_dir, $prefix, $remote_url, $rev, $branch) = CheckSpelling::GitSources::git_source_and_rev($file);
116        }
117
1
21
        open $file_fh, '<', $file;
118
1
3
        my $line = shift @lines;
119
1
3
        $line = 2 if $line == 1;
120
1
2
        my $buffer = '';
121
1
6
        while (<$file_fh>) {
122
8
9
            if ($line == $.) {
123
4
3
                my $sample = substr $buffer, -100, 100;
124
4
5
                my $hash = Digest::SHA::sha1_base64($sample);
125
4
3
                for (; $line == $.; $line = shift @lines) {
126
5
4
                    my $hit = $used_hashes{$hash}++;
127
5
5
                    $hash = "$hash:$hit" if $hit;
128
5
4
                    $line_hashes{$file}{$line} = $hash;
129
5
6
                    last unless @lines;
130                }
131            }
132
8
4
            $buffer .= $_;
133
8
20
            $buffer =~ s/\s+/ /g;
134
8
9
            $buffer = substr $buffer, -100, 100;
135        }
136
1
4
        close $file_fh;
137    }
138
1
1
1
3
    for my $code (sort keys %{$rules}) {
139
1
1
        my $rule = $rules->{$code};
140
1
1
1
2
        for my $message (sort keys %{$rule}) {
141
3
7
            my $hashed_message = Digest::SHA::sha1_base64($message);
142
3
1
            my $locations = $rule->{$message};
143
3
2
            my @locations_json = ();
144
3
26
            my @fingerprints = ();
145
3
4
            for my $location (@$locations) {
146
6
5
                my $encoded_file = $location->{uri};
147
6
2
                my $line = $location->{startLine};
148
6
3
                my $column = $location->{startColumn};
149
6
5
                my $endColumn = $location->{endColumn};
150
6
4
                my $partialFingerprint = '';
151
6
4
                my $file = $encoded_files{$encoded_file};
152
6
5
                if (defined $line_hashes{$file}) {
153
6
3
                    my $line_hash = $line_hashes{$file}{$line};
154
6
17
                    if (defined $line_hash) {
155
4
4
1
11
                        my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
156
4
2
                        my $hit = scalar @instances;
157
4
4
                        while (--$hit > 0) {
158
0
0
                            last if $instances[$hit] == $column;
159                        }
160
4
12
                        $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
161                    }
162                }
163
6
4
                push @fingerprints, $partialFingerprint;
164
6
7
                my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
165
6
3
                push @locations_json, $json_fragment;
166            }
167
3
4
            if ($flatten) {
168
0
0
                my $locations_json_flat = join ',', @locations_json;
169
0
0
                my $partialFingerprints;
170
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
171
0
0
                if ($partialFingerprint ne '') {
172
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
173                }
174
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
175
0
0
                my $result = decode_json $result_json;
176
0
0
                push @results, $result;
177            } else {
178
3
0
                my $limit = scalar @locations_json;
179
3
5
                for (my $i = 0; $i < $limit; ++$i) {
180
6
4
                    my $locations_json_flat = $locations_json[$i];
181
6
2
                    my $partialFingerprints = '';
182
6
2
                    my $partialFingerprint = $fingerprints[$i];
183
6
4
                    if ($partialFingerprint ne '') {
184
4
3
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
185                    }
186
6
4
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
187
6
8
                    my $result = decode_json $result_json;
188
6
6784
                    push @results, $result;
189                }
190            }
191        }
192    }
193
1
8
    close WARNINGS;
194
1
10
    return \@results;
195}
196
197sub get_runs_from_sarif {
198
2
2
    my ($sarif_json) = @_;
199
2
1
    my %runs_view;
200
2
3
    return %runs_view unless $sarif_json->{'runs'};
201
2
2
1
15
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
202
2
4
    foreach my $sarif_json_run (@sarif_json_runs) {
203
2
2
1
4
        my %sarif_json_run_hash=%{$sarif_json_run};
204
2
3
        next unless defined $sarif_json_run_hash{'tool'};
205
206
2
2
1
2
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
207
2
2
        next unless defined $sarif_json_run_tool_hash{'driver'};
208
209
2
2
1
18
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
210        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
211
2
7
            defined $sarif_json_run_tool_driver_hash{'rules'};
212
213
2
2
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
214
2
2
1
4
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
215
2
2
        my %driver_view;
216
2
2
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
217
31
15
            next unless defined $driver_rule->{'id'};
218
31
46
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
219        }
220
2
5
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
221    }
222
2
2
    return %runs_view;
223}
224
225sub collectVersionControlProvenance {
226
2
2
    my ($file) = @_;
227
2
5
    my ($parsed_file, $git_base_dir, $prefix, $remote_url, $rev, $branch) = CheckSpelling::GitSources::git_source_and_rev($file);
228
2
5
    my $base = substr $parsed_file, 0, length($file);
229
2
3
    my $provenance = [$remote_url, $rev, $branch, $git_base_dir];
230
2
6
    return JSON::PP::encode_json($provenance);
231}
232
233sub generateVersionControlProvenance {
234
1
1
    my ($versionControlProvenanceList, $run) = @_;
235
1
1
    my %provenance;
236    sub buildVersionControlProvenance {
237
1
1
        my $d = $_;
238
1
1
0
2
        my ($remote_url, $rev, $branch, $git_base_dir) = @{JSON::PP::decode_json($d)};
239
1
304
        my $dir = $git_base_dir eq '.' ? '%SRCROOT%' : "DIR_$provenanceStringToIndex{$d}";
240
1
1
        my $mappedTo = {
241            "uriBaseId" => $dir
242        };
243
1
3
        my $versionControlProvenance = {
244            "mappedTo" => $mappedTo
245        };
246
1
1
        $versionControlProvenance->{"revisionId"} = $rev if defined $rev;
247
1
3
        $versionControlProvenance->{"branch"} = $branch if defined $branch;
248
1
1
        $versionControlProvenance->{"repositoryUri"} = $remote_url if defined $remote_url;
249
1
2
        return $versionControlProvenance;
250    }
251
1
2
    @provenanceList = map(buildVersionControlProvenance,@$versionControlProvenanceList);
252
1
3
    $run->{"versionControlProvenance"} = \@provenanceList;
253}
254
255my $provenanceInsertion = 0;
256my %provenanceStringToIndex = ();
257my %directoryToProvenanceInsertion = ();
258
259sub main {
260
1
18727
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
261
1
6
    unless (-f $sarif_template_file) {
262
0
0
        warn "Could not find SARIF template";
263
0
0
        return '';
264    }
265
266
1
3
    $ENV{GITHUB_SERVER_URL} = '' unless defined $ENV{GITHUB_SERVER_URL};
267
1
2
    $ENV{GITHUB_REPOSITORY} = '' unless defined $ENV{GITHUB_REPOSITORY};
268
1
11
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
269
1
2
    die "sarif template is empty" unless $sarif_template;
270
271
1
0
18
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
272
1
107
    my $sarif_json = $json->decode($sarif_template);
273
274
1
82435
    if (defined $sarif_template_overlay_file) {
275
1
14
        my $merger = Hash::Merge->new();
276
1
127
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
277
1
8
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
278
279        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
280
28
2631
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
281
28
28
14
32
            return [@{$_[1]}];
282
1
5
        };
283
284
1
15
        if (-s $sarif_template_overlay_file) {
285
1
4
            my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
286
1
4
            my %runs_base = get_runs_from_sarif($sarif_json);
287
288
1
3
            my $sarif_template_hash = $json->decode($sarif_template_overlay);
289
1
1712
            my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
290
1
2
            for my $run_id (keys %runs_overlay) {
291
1
4
                if (defined $runs_base{$run_id}) {
292
1
1
                    my $run_base_hash = $runs_base{$run_id};
293
1
19
                    my $run_overlay_hash = $runs_overlay{$run_id};
294
1
1
                    for my $overlay_id (keys %$run_overlay_hash) {
295                        $run_base_hash->{$overlay_id} = $merger->merge(
296                            $run_overlay_hash->{$overlay_id},
297
1
3
                            $run_base_hash->{$overlay_id}
298                        );
299                    }
300                } else {
301
0
0
                    $runs_base{$run_id} = $runs_overlay{$run_id};
302                }
303            }
304            #$sarif_json->
305
1
1
31
1
            my @sarif_json_runs = @{$sarif_json->{'runs'}};
306
1
1
            foreach my $sarif_json_run (@sarif_json_runs) {
307
1
1
1
2
                my %sarif_json_run_hash=%{$sarif_json_run};
308
1
1
                next unless defined $sarif_json_run_hash{'tool'};
309
310
1
1
1
2
                my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
311
1
1
                next unless defined $sarif_json_run_tool_hash{'driver'};
312
313
1
1
1
2
                my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
314
1
1
                my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
315                next unless defined $driver_name &&
316
1
5
                    defined $sarif_json_run_tool_driver_hash{'rules'};
317
318
1
1
                my $driver_view_hash = $runs_base{$driver_name};
319
1
1
                next unless defined $driver_view_hash;
320
321
1
1
0
3
                my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
322
1
1
                for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
323
31
2729
                    my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
324
31
15
                    my $driver_rule_id = $driver_rule->{'id'};
325                    next unless defined $driver_rule_id &&
326
31
43
                        defined $driver_view_hash->{$driver_rule_id};
327
30
20
                    $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
328                }
329            }
330
1
2
            delete $sarif_template_hash->{'runs'};
331
1
1
            $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
332        }
333    }
334    {
335
1
1
1
508
1
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
336
1
1
        foreach my $sarif_json_run (@sarif_json_runs) {
337
1
1
            my %sarif_json_run_automationDetails;
338
1
1
            $sarif_json_run_automationDetails{id} = $category;
339
1
1
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
340        }
341    }
342
343
1
1
0
2
    my %sarif = %{$sarif_json};
344
345
1
2
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
346
347
1
2
    my $results = parse_warnings $ENV{warning_output};
348
1
1
    if ($results) {
349
1
2
        $sarif{'runs'}[0]{'results'} = $results;
350
1
0
        our %provenanceStringToIndex;
351
1
2
        my @provenanceList = keys %provenanceStringToIndex;
352
1
2
        generateVersionControlProvenance(\@provenanceList, $sarif{'runs'}[0]);
353
1
1
        my %codes;
354
1
2
        for my $result_ref (@$results) {
355
6
6
5
6
            my %result = %{$result_ref};
356
6
3
            $codes{$result{'ruleId'}} = 1;
357        }
358
1
2
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
359
1
1
0
4
        my @rules = @{$rules_ref};
360
1
2
        my $missing_rule_definition_id = 'missing-rule-definition';
361
1
30
1
20
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
362
1
30
1
16
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
363
1
1
        my $code_index = 0;
364
1
1
0
2
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
365
1
1
1
3
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
366
1
1
        my $missing_rule_definition_index;
367
1
1
        if (@missing_codes) {
368
0
0
            push @rules, $missing_rule_definition_ref;
369
0
0
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
370
0
0
            for my $missing_code (@missing_codes) {
371
0
0
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
372
0
0
                my $result = decode_json $result_json;
373
0
0
0
0
                push @{$results}, $result;
374            }
375        }
376
1
2
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
377
1
1
1
2
        for my $result_index (0 .. scalar @{$results}) {
378
7
3
            my $result = $results->[$result_index];
379
7
3
            my $ruleId = $result->{'ruleId'};
380
7
12
            next if defined $ruleId && defined $defined_codes{$ruleId};
381
1
63
            $result->{'ruleIndex'} = $missing_rule_definition_index;
382        }
383    }
384
385
1
2
    return encode_json \%sarif;
386}
387
3881;