File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:85.3%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
107427
2
29
use File::Basename;
9
1
1
1
218
1222
78
use Digest::SHA qw($errmsg);
10
1
1
1
2
1
23
use JSON::PP;
11
1
1
1
186
3650
25
use Hash::Merge qw( merge );
12
1
1
1
135
1
15
use CheckSpelling::Util;
13
1
1
1
144
1
1478
use CheckSpelling::GitSources;
14
15sub encode_low_ascii {
16
8
193
    $_ = shift;
17
8
1
8
3
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
18
8
10
    return $_;
19}
20
21sub url_encode {
22
7
4
    $_ = shift;
23
7
0
6
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
24
7
5
    return $_;
25}
26
27sub double_slash_escape {
28
7
6
    $_ = shift;
29
7
19
    s/(["()\]\\])/\\\\$1/g;
30
7
7
    return $_;
31}
32
33sub fingerprintLocations {
34
4
6
    my ($locations, $encoded_files_ref, $line_hashes_ref, $hashes_needed_for_files_ref, $message, $hashed_message) = @_;
35
4
3
    my %encoded_files = %$encoded_files_ref;
36
4
4
    my %line_hashes = %$line_hashes_ref;
37
4
3
    my %hashes_needed_for_files = %$hashes_needed_for_files_ref;
38
4
4
    my @locations_json = ();
39
4
2
    my @fingerprints = ();
40
4
3
    for my $location (@$locations) {
41
7
4
        my $encoded_file = $location->{uri};
42
7
4
        my $line = $location->{startLine};
43
7
4
        my $column = $location->{startColumn};
44
7
3
        my $endColumn = $location->{endColumn};
45
7
17
        my $partialFingerprint = '';
46
7
3
        my $file = $encoded_files{$encoded_file};
47
7
6
        if (defined $line_hashes{$file}) {
48
7
3
            my $line_hash = $line_hashes{$file}{$line};
49
7
6
            if (defined $line_hash) {
50
4
4
2
5
                my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
51
4
2
                my $hit = scalar @instances;
52
4
3
                while (--$hit > 0) {
53
0
0
                    last if $instances[$hit] == $column;
54                }
55
4
12
                $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
56            }
57        }
58
7
4
        push @fingerprints, $partialFingerprint;
59
7
5
        my $startColumn = $column ? qq<, "startColumn": $column> : '';
60
7
7
        $endColumn = $endColumn ? qq<, "endColumn": $endColumn> : '';
61
7
4
        $line = 1 unless $line;
62
7
8
        my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line$startColumn$endColumn } } }>;
63
7
5
        push @locations_json, $json_fragment;
64    }
65
4
9
    return { locations_json => \@locations_json, fingerprints => \@fingerprints };
66}
67
68sub parse_warnings {
69
1
1
    my ($warnings) = @_;
70
1
0
    our $flatten;
71
1
51
    our %directoryToRepo;
72
1
1
    our $provenanceInsertion;
73
1
1
    our %provenanceStringToIndex;
74
1
0
    our %directoryToProvenanceInsertion;
75
1
1
    my @results;
76
1
13
    unless (open WARNINGS, '<', $warnings) {
77
0
0
        print STDERR "Could not open $warnings\n";
78
0
0
        return [];
79    }
80
1
2
    my $rules = ();
81
1
2
    my %encoded_files = ();
82
1
2
    my %hashes_needed_for_files = ();
83
1
18
    while (<WARNINGS>) {
84
9
12
        next if m{^https://};
85
8
30
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
86
7
26
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
87
7
102
        my $directory = dirname($file);
88
7
4
        unless (defined $directoryToProvenanceInsertion{$directory}) {
89
2
4
            my $provenanceString = collectVersionControlProvenance($file);
90
2
175
            if (defined $provenanceStringToIndex{$provenanceString}) {
91
0
0
                $directoryToProvenanceInsertion{$directory} = $provenanceStringToIndex{$provenanceString};
92            } else {
93
2
7
                $provenanceStringToIndex{$provenanceString} = $provenanceInsertion;
94
2
3
                $directoryToProvenanceInsertion{$directory} = $provenanceInsertion;
95
2
3
                ++$provenanceInsertion;
96            }
97        }
98        # single-slash-escape `"` and `\`
99
7
8
        $message =~ s/(["\\])/\\$1/g;
100
7
10
        $message = encode_low_ascii $message;
101        # double-slash-escape `"`, `(`, `)`, `]`
102
7
7
        $message = double_slash_escape $message;
103        # encode `message` and `file` to protect against low ascii`
104
7
5
        my $encoded_file = url_encode $file;
105
7
6
        $encoded_files{$encoded_file} = $file;
106        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's SARIF view
107
7
7
        if ($message =~ /(`{2,})/) {
108
1
1
            my $backticks = $1;
109
1
35
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
110
0
0
                $backticks = $1 if length($1) > length($backticks);
111            }
112
1
21
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
113        } else {
114
6
17
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
115        }
116        # replace '`' with `\`+`'` because GitHub's SARIF parser doesn't like them
117
7
8
        $message =~ s/\`/'/g;
118
7
6
        unless (defined $rules->{$code}) {
119
2
6
            $rules->{$code} = {};
120        }
121
7
4
        my $rule = $rules->{$code};
122
7
6
        unless (defined $rule->{$message}) {
123
4
12
            $rule->{$message} = [];
124        }
125
7
27
        my $hashed_message = Digest::SHA::sha1_base64($message);
126
7
5
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
127
7
17
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
128
7
11
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
129
7
10
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
130
7
2
        my $locations = $rule->{$message};
131
7
17
        my $physicalLocation = {
132            'uri' => $encoded_file,
133            'startLine' => $line,
134            'startColumn' => $column,
135            'endColumn' => $endColumn,
136        };
137
7
3
        push @$locations, $physicalLocation;
138
7
20
        $rule->{$message} = $locations;
139    }
140
1
1
    my %line_hashes = ();
141
1
2
    my %used_hashes = ();
142
1
4
    for my $file (sort keys %hashes_needed_for_files) {
143
1
21
        $line_hashes{$file} = ();
144
1
5
        unless (-e $file) {
145
0
0
            delete $hashes_needed_for_files{$file};
146
0
0
            next;
147        }
148
1
1
1
5
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
149
1
8
        unless (defined $directoryToRepo{dirname($file)}) {
150
1
2
            my ($parsed_file, $git_base_dir, $prefix, $remote_url, $rev, $branch) = CheckSpelling::GitSources::git_source_and_rev($file);
151        }
152
1
22
        open $file_fh, '<', $file;
153
1
1
        my $line = shift @lines;
154
1
3
        $line = 2 if $line == 1;
155
1
1
        my $buffer = '';
156
1
12
        while (<$file_fh>) {
157
8
10
            if ($line == $.) {
158
4
4
                my $sample = substr $buffer, -100, 100;
159
4
5
                my $hash = Digest::SHA::sha1_base64($sample);
160
4
4
                for (; $line == $.; $line = shift @lines) {
161
5
9
                    my $hit = $used_hashes{$hash}++;
162
5
3
                    $hash = "$hash:$hit" if $hit;
163
5
3
                    $line_hashes{$file}{$line} = $hash;
164
5
7
                    last unless @lines;
165                }
166            }
167
8
4
            $buffer .= $_;
168
8
19
            $buffer =~ s/\s+/ /g;
169
8
9
            $buffer = substr $buffer, -100, 100;
170        }
171
1
14
        close $file_fh;
172    }
173
1
1
0
2
    for my $code (sort keys %{$rules}) {
174
2
2
        my $rule = $rules->{$code};
175
2
2
2
2
        for my $message (sort keys %{$rule}) {
176
4
6
            my $hashed_message = Digest::SHA::sha1_base64($message);
177
4
5
            my $locations = $rule->{$message};
178
4
3
            my $fingerprintResults = fingerprintLocations($locations, \%encoded_files, \%line_hashes, \%hashes_needed_for_files, $message, $hashed_message);
179
4
4
2
3
            my @locations_json = @{$fingerprintResults->{locations_json}};
180
4
4
2
3
            my @fingerprints = @{$fingerprintResults->{fingerprints}};
181
4
5
            if ($flatten) {
182
0
0
                my $locations_json_flat = join ',', @locations_json;
183
0
0
                my $partialFingerprints;
184
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
185
0
0
                if ($partialFingerprint ne '') {
186
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
187                }
188
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
189
0
0
                my $result = decode_json $result_json;
190
0
0
                push @results, $result;
191            } else {
192
4
1
                my $limit = scalar @locations_json;
193
4
4
                for (my $i = 0; $i < $limit; ++$i) {
194
7
2
                    my $locations_json_flat = $locations_json[$i];
195
7
5
                    my $partialFingerprints = '';
196
7
1
                    my $partialFingerprint = $fingerprints[$i];
197
7
7
                    if ($partialFingerprint ne '') {
198
4
2
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
199                    }
200
7
6
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
201
7
7
                    my $result = decode_json $result_json;
202
7
7904
                    push @results, $result;
203                }
204            }
205        }
206    }
207
1
6
    close WARNINGS;
208
1
12
    return \@results;
209}
210
211sub get_runs_from_sarif {
212
2
2
    my ($sarif_json) = @_;
213
2
2
    my %runs_view;
214
2
3
    return %runs_view unless $sarif_json->{'runs'};
215
2
2
1
4
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
216
2
1
    foreach my $sarif_json_run (@sarif_json_runs) {
217
2
2
2
4
        my %sarif_json_run_hash=%{$sarif_json_run};
218
2
7
        next unless defined $sarif_json_run_hash{'tool'};
219
220
2
2
2
1
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
221
2
2
        next unless defined $sarif_json_run_tool_hash{'driver'};
222
223
2
2
1
5
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
224        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
225
2
8
            defined $sarif_json_run_tool_driver_hash{'rules'};
226
227
2
2
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
228
2
2
1
3
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
229
2
1
        my %driver_view;
230
2
1
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
231
38
20
            next unless defined $driver_rule->{'id'};
232
38
52
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
233        }
234
2
4
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
235    }
236
2
4
    return %runs_view;
237}
238
239sub collectVersionControlProvenance {
240
2
2
    my ($file) = @_;
241
2
6
    my ($parsed_file, $git_base_dir, $prefix, $remote_url, $rev, $branch) = CheckSpelling::GitSources::git_source_and_rev($file);
242
2
4
    my $base = substr $parsed_file, 0, length($file);
243
2
4
    my $provenance = [$remote_url, $rev, $branch, $git_base_dir];
244
2
5
    return JSON::PP::encode_json($provenance);
245}
246
247sub generateVersionControlProvenance {
248
1
2
    my ($versionControlProvenanceList, $run) = @_;
249
1
0
    my %provenance;
250    sub buildVersionControlProvenance {
251
1
0
        my $d = $_;
252
1
1
1
1
        my ($remote_url, $rev, $branch, $git_base_dir) = @{JSON::PP::decode_json($d)};
253
1
303
        my $dir = $git_base_dir eq '.' ? '%SRCROOT%' : "DIR_$provenanceStringToIndex{$d}";
254
1
1
        my $mappedTo = {
255            "uriBaseId" => $dir
256        };
257
1
3
        my $versionControlProvenance = {
258            "mappedTo" => $mappedTo
259        };
260
1
2
        $versionControlProvenance->{"revisionId"} = $rev if defined $rev;
261
1
3
        $versionControlProvenance->{"branch"} = $branch if defined $branch;
262
1
3
        $versionControlProvenance->{"repositoryUri"} = $remote_url if defined $remote_url;
263
1
2
        return $versionControlProvenance;
264    }
265
1
3
    @provenanceList = map(buildVersionControlProvenance,@$versionControlProvenanceList);
266
1
1
    $run->{"versionControlProvenance"} = \@provenanceList;
267}
268
269my $provenanceInsertion = 0;
270my %provenanceStringToIndex = ();
271my %directoryToProvenanceInsertion = ();
272
273sub main {
274
1
19299
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
275
1
6
    unless (-f $sarif_template_file) {
276
0
0
        warn "Could not find sarif template";
277
0
0
        return '';
278    }
279
280
1
3
    $ENV{GITHUB_SERVER_URL} = '' unless defined $ENV{GITHUB_SERVER_URL};
281
1
3
    $ENV{GITHUB_REPOSITORY} = '' unless defined $ENV{GITHUB_REPOSITORY};
282
1
6
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
283
1
3
    die "sarif template is empty" unless $sarif_template;
284
285
1
0
12
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
286
1
91
    my $sarif_json = $json->decode($sarif_template);
287
288
1
104885
    if (defined $sarif_template_overlay_file && -s $sarif_template_overlay_file) {
289
1
13
        my $merger = Hash::Merge->new();
290
1
126
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
291
1
7
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
292
293        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
294
36
3696
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
295
36
36
22
42
            return [@{$_[1]}];
296
1
20
        };
297
298
1
3
        my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
299
1
3
        my %runs_base = get_runs_from_sarif($sarif_json);
300
301
1
3
        my $sarif_template_hash = $json->decode($sarif_template_overlay);
302
1
1783
        my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
303
1
2
        for my $run_id (keys %runs_overlay) {
304
1
2
            if (defined $runs_base{$run_id}) {
305
1
1
                my $run_base_hash = $runs_base{$run_id};
306
1
13
                my $run_overlay_hash = $runs_overlay{$run_id};
307
1
1
                for my $overlay_id (keys %$run_overlay_hash) {
308                    $run_base_hash->{$overlay_id} = $merger->merge(
309                        $run_overlay_hash->{$overlay_id},
310
1
3
                        $run_base_hash->{$overlay_id}
311                    );
312                }
313            } else {
314
0
0
                $runs_base{$run_id} = $runs_overlay{$run_id};
315            }
316        }
317        #$sarif_json->
318
1
1
37
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
319
1
1
        foreach my $sarif_json_run (@sarif_json_runs) {
320
1
1
1
2
            my %sarif_json_run_hash=%{$sarif_json_run};
321
1
2
            next unless defined $sarif_json_run_hash{'tool'};
322
323
1
1
0
2
            my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
324
1
1
            next unless defined $sarif_json_run_tool_hash{'driver'};
325
326
1
1
1
2
            my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
327
1
1
            my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
328            next unless defined $driver_name &&
329
1
3
                defined $sarif_json_run_tool_driver_hash{'rules'};
330
331
1
1
            my $driver_view_hash = $runs_base{$driver_name};
332
1
1
            next unless defined $driver_view_hash;
333
334
1
1
0
7
            my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
335
1
2
            for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
336
38
3030
                my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
337
38
11
                my $driver_rule_id = $driver_rule->{'id'};
338                next unless defined $driver_rule_id &&
339
38
56
                    defined $driver_view_hash->{$driver_rule_id};
340
37
29
                $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
341            }
342        }
343
1
3
        delete $sarif_template_hash->{'runs'};
344
1
1
        $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
345    }
346    {
347
1
1
1
567
1
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
348
1
2
        foreach my $sarif_json_run (@sarif_json_runs) {
349
1
0
            my %sarif_json_run_automationDetails;
350
1
2
            $sarif_json_run_automationDetails{id} = $category;
351
1
1
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
352        }
353    }
354
355
1
1
1
1
    my %sarif = %{$sarif_json};
356
357
1
2
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
358
359
1
3
    my $results = parse_warnings $ENV{warning_output};
360
1
1
    if ($results) {
361
1
2
        $sarif{'runs'}[0]{'results'} = $results;
362
1
0
        our %provenanceStringToIndex;
363
1
1
        my @provenanceList = keys %provenanceStringToIndex;
364
1
4
        generateVersionControlProvenance(\@provenanceList, $sarif{'runs'}[0]);
365
1
1
        my %codes;
366
1
1
        for my $result_ref (@$results) {
367
7
7
4
6
            my %result = %{$result_ref};
368
7
6
            $codes{$result{'ruleId'}} = 1;
369        }
370
1
2
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
371
1
1
1
3
        my @rules = @{$rules_ref};
372
1
1
        my $missing_rule_definition_id = 'missing-rule-definition';
373
1
37
1
30
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
374
1
37
1
33
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
375
1
1
        my $code_index = 0;
376
1
1
1
2
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
377
1
2
2
2
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
378
1
1
        my $missing_rule_definition_index;
379
1
1
        if (@missing_codes) {
380
1
1
            push @rules, $missing_rule_definition_ref;
381
1
2
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
382
1
1
            for my $missing_code (@missing_codes) {
383
1
1
                my $message = "No rule definition for `$missing_code`";
384
1
1
                my $locations_json_flat = '';
385
1
1
                my $partialFingerprints = '';
386
1
2
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
387
1
1
                my $result = decode_json $result_json;
388
1
1
390
1
                push @{$results}, $result;
389            }
390        }
391
1
3
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
392
1
1
1
2
        for my $result_index (0 .. scalar @{$results}) {
393
9
5
            my $result = $results->[$result_index];
394
9
2
            my $ruleId = $result->{'ruleId'};
395
9
17
            next if defined $ruleId && defined $defined_codes{$ruleId};
396
2
64
            $result->{'ruleId'} = $missing_rule_definition_id;
397        }
398    }
399
400
1
2
    return encode_json \%sarif;
401}
402
4031;