File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:84.0%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
108222
1210
64
use Digest::SHA qw($errmsg);
9
1
1
1
2
1
27
use JSON::PP;
10
1
1
1
187
3681
25
use Hash::Merge qw( merge );
11
1
1
1
127
0
1157
use CheckSpelling::Util;
12
13sub encode_low_ascii {
14
6
158
    $_ = shift;
15
6
1
6
3
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
16
6
3
    return $_;
17}
18
19sub url_encode {
20
5
4
    $_ = shift;
21
5
0
2
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
22
5
3
    return $_;
23}
24
25sub double_slash_escape {
26
5
2
    $_ = shift;
27
5
12
    s/(["()\]\\])/\\\\$1/g;
28
5
4
    return $_;
29}
30
31sub parse_warnings {
32
1
1
    my ($warnings) = @_;
33
1
0
    our $flatten;
34
1
1
    my @results;
35
1
10
    open WARNINGS, '<', $warnings || print STDERR "Could not open $warnings\n";
36
1
1
    my $rules = ();
37
1
0
    my %encoded_files = ();
38
1
1
    my %hashes_needed_for_files = ();
39
1
9
    while (<WARNINGS>) {
40
7
6
        next if m{^https://};
41
6
15
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
42
5
9
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
43        # single-slash-escape `"` and `\`
44
5
3
        $message =~ s/(["\\])/\\$1/g;
45
5
4
        $message = encode_low_ascii $message;
46        # double-slash-escape `"`, `(`, `)`, `]`
47
5
4
        $message = double_slash_escape $message;
48        # encode `message` and `file` to protect against low ascii`
49
5
1
        my $encoded_file = url_encode $file;
50
5
3
        $encoded_files{$encoded_file} = $file;
51        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's sarif view
52
5
10
        $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
53        # replace '`' with `\`+`"` because GitHub's SARIF parser doesn't like them
54
5
3
        $message =~ s/\`/\\"/g;
55
5
4
        unless (defined $rules->{$code}) {
56
1
1
            $rules->{$code} = {};
57        }
58
5
2
        my $rule = $rules->{$code};
59
5
5
        unless (defined $rule->{$message}) {
60
2
1
            $rule->{$message} = [];
61        }
62
5
14
        my $hashed_message = Digest::SHA::sha1_base64($message);
63
5
4
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
64
5
4
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
65
5
5
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
66
5
3
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
67
5
3
        my $locations = $rule->{$message};
68
5
6
        my $physicalLocation = {
69            'uri' => $encoded_file,
70            'startLine' => $line,
71            'startColumn' => $column,
72            'endColumn' => $endColumn,
73        };
74
5
1
        push @$locations, $physicalLocation;
75
5
10
        $rule->{$message} = $locations;
76    }
77
1
1
    my %line_hashes = ();
78
1
1
    my %used_hashes = ();
79
1
1
    for my $file (sort keys %hashes_needed_for_files) {
80
1
1
        $line_hashes{$file} = ();
81
1
5
        unless (-e $file) {
82
0
0
            delete $hashes_needed_for_files{$file};
83
0
0
            next;
84        }
85
1
1
0
3
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
86
1
7
        open $file_fh, '<', $file;
87
1
1
        my $line = shift @lines;
88
1
2
        $line = 2 if $line == 1;
89
1
0
        my $buffer = '';
90
1
7
        while (<$file_fh>) {
91
7
6
            if ($line == $.) {
92
3
2
                my $sample = substr $buffer, -100, 100;
93
3
4
                my $hash = Digest::SHA::sha1_base64($sample);
94
3
3
                for (; $line == $.; $line = shift @lines) {
95
4
2
                    my $hit = $used_hashes{$hash}++;
96
4
4
                    $hash = "$hash:$hit" if $hit;
97
4
4
                    $line_hashes{$file}{$line} = $hash;
98
4
3
                    last unless @lines;
99                }
100            }
101
7
3
            $buffer .= $_;
102
7
16
            $buffer =~ s/\s+/ /g;
103
7
8
            $buffer = substr $buffer, -100, 100;
104        }
105
1
3
        close $file_fh;
106    }
107
1
1
1
2
    for my $code (sort keys %{$rules}) {
108
1
0
        my $rule = $rules->{$code};
109
1
1
1
1
        for my $message (sort keys %{$rule}) {
110
2
4
            my $hashed_message = Digest::SHA::sha1_base64($message);
111
2
1
            my $locations = $rule->{$message};
112
2
1
            my @locations_json = ();
113
2
2
            my @fingerprints = ();
114
2
1
            for my $location (@$locations) {
115
5
2
                my $encoded_file = $location->{uri};
116
5
2
                my $line = $location->{startLine};
117
5
3
                my $column = $location->{startColumn};
118
5
2
                my $endColumn = $location->{endColumn};
119
5
3
                my $partialFingerprint = '';
120
5
1
                my $file = $encoded_files{$encoded_file};
121
5
3
                if (defined $line_hashes{$file}) {
122
5
3
                    my $line_hash = $line_hashes{$file}{$line};
123
5
3
                    if (defined $line_hash) {
124
3
3
1
4
                        my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
125
3
2
                        my $hit = scalar @instances;
126
3
2
                        while (--$hit > 0) {
127
0
0
                            last if $instances[$hit] == $column;
128                        }
129
3
6
                        $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
130                    }
131                }
132
5
3
                push @fingerprints, $partialFingerprint;
133
5
2
                my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
134
5
4
                push @locations_json, $json_fragment;
135            }
136
2
2
            if ($flatten) {
137
0
0
                my $locations_json_flat = join ',', @locations_json;
138
0
0
                my $partialFingerprints;
139
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
140
0
0
                if ($partialFingerprint ne '') {
141
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
142                }
143
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
144
0
0
                my $result = decode_json $result_json;
145
0
0
                push @results, $result;
146            } else {
147
2
1
                my $limit = scalar @locations_json;
148
2
2
                for (my $i = 0; $i < $limit; ++$i) {
149
5
4
                    my $locations_json_flat = $locations_json[$i];
150
5
2
                    my $partialFingerprints = '';
151
5
3
                    my $partialFingerprint = $fingerprints[$i];
152
5
5
                    if ($partialFingerprint ne '') {
153
3
0
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
154                    }
155
5
4
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
156
5
6
                    my $result = decode_json $result_json;
157
5
5645
                    push @results, $result;
158                }
159            }
160        }
161    }
162
1
3
    close WARNINGS;
163
1
7
    return \@results;
164}
165
166sub get_runs_from_sarif {
167
2
0
    my ($sarif_json) = @_;
168
2
2
    my %runs_view;
169
2
2
    return %runs_view unless $sarif_json->{'runs'};
170
2
2
1
3
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
171
2
2
    foreach my $sarif_json_run (@sarif_json_runs) {
172
2
2
1
3
        my %sarif_json_run_hash=%{$sarif_json_run};
173
2
1
        next unless defined $sarif_json_run_hash{'tool'};
174
175
2
2
2
1
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
176
2
2
        next unless defined $sarif_json_run_tool_hash{'driver'};
177
178
2
2
0
3
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
179        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
180
2
10
            defined $sarif_json_run_tool_driver_hash{'rules'};
181
182
2
1
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
183
2
2
1
2
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
184
2
2
        my %driver_view;
185
2
1
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
186
28
17
            next unless defined $driver_rule->{'id'};
187
28
22
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
188        }
189
2
3
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
190    }
191
2
3
    return %runs_view;
192}
193
194sub main {
195
1
337
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
196
1
5
    unless (-f $sarif_template_file) {
197
0
0
        warn "Could not find sarif template";
198
0
0
        return '';
199    }
200
201
1
1
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
202
1
1
    die "sarif template is empty" unless $sarif_template;
203
204
1
0
2
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
205
1
45
    my $sarif_json = $json->decode($sarif_template);
206
207
1
74548
    if (defined $sarif_template_overlay_file) {
208
1
3
        my $merger = Hash::Merge->new();
209
1
56
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
210
1
4
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
211
212        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
213
27
3188
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
214
27
27
13
29
            return [@{$_[1]}];
215
1
2
        };
216
217
1
6
        if (-s $sarif_template_overlay_file) {
218
1
1
            my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
219
1
2
            my %runs_base = get_runs_from_sarif($sarif_json);
220
221
1
6
            my $sarif_template_hash = $json->decode($sarif_template_overlay);
222
1
1777
            my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
223
1
1
            for my $run_id (keys %runs_overlay) {
224
1
1
                if (defined $runs_base{$run_id}) {
225
1
1
                    my $run_base_hash = $runs_base{$run_id};
226
1
0
                    my $run_overlay_hash = $runs_overlay{$run_id};
227
1
1
                    for my $overlay_id (keys %$run_overlay_hash) {
228                        $run_base_hash->{$overlay_id} = $merger->merge(
229                            $run_overlay_hash->{$overlay_id},
230
1
2
                            $run_base_hash->{$overlay_id}
231                        );
232                    }
233                } else {
234
0
0
                    $runs_base{$run_id} = $runs_overlay{$run_id};
235                }
236            }
237            #$sarif_json->
238
1
1
15
1
            my @sarif_json_runs = @{$sarif_json->{'runs'}};
239
1
1
            foreach my $sarif_json_run (@sarif_json_runs) {
240
1
1
0
2
                my %sarif_json_run_hash=%{$sarif_json_run};
241
1
1
                next unless defined $sarif_json_run_hash{'tool'};
242
243
1
1
1
1
                my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
244
1
1
                next unless defined $sarif_json_run_tool_hash{'driver'};
245
246
1
1
1
3
                my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
247
1
1
                my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
248                next unless defined $driver_name &&
249
1
3
                    defined $sarif_json_run_tool_driver_hash{'rules'};
250
251
1
0
                my $driver_view_hash = $runs_base{$driver_name};
252
1
1
                next unless defined $driver_view_hash;
253
254
1
1
0
2
                my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
255
1
1
                for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
256
28
1747
                    my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
257
28
14
                    my $driver_rule_id = $driver_rule->{'id'};
258                    next unless defined $driver_rule_id &&
259
28
37
                        defined $driver_view_hash->{$driver_rule_id};
260
27
22
                    $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
261                }
262            }
263
1
2
            delete $sarif_template_hash->{'runs'};
264
1
1
            $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
265        }
266    }
267    {
268
1
1
1
400
1
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
269
1
2
        foreach my $sarif_json_run (@sarif_json_runs) {
270
1
17
            my %sarif_json_run_automationDetails;
271
1
1
            $sarif_json_run_automationDetails{id} = $category;
272
1
1
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
273        }
274    }
275
276
1
1
0
2
    my %sarif = %{$sarif_json};
277
278
1
1
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
279
280
1
2
    my $results = parse_warnings $ENV{warning_output};
281
1
1
    if ($results) {
282
1
1
        $sarif{'runs'}[0]{'results'} = $results;
283
1
1
        my %codes;
284
1
1
        for my $result_ref (@$results) {
285
5
5
1
6
            my %result = %{$result_ref};
286
5
5
            $codes{$result{'ruleId'}} = 1;
287        }
288
1
1
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
289
1
1
0
1
        my @rules = @{$rules_ref};
290
1
1
        my $missing_rule_definition_id = 'missing-rule-definition';
291
1
27
1
15
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
292
1
27
1
14
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
293
1
1
        my $code_index = 0;
294
1
1
1
2
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
295
1
1
1
1
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
296
1
0
        my $missing_rule_definition_index;
297
1
1
        if (@missing_codes) {
298
0
0
            push @rules, $missing_rule_definition_ref;
299
0
0
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
300
0
0
            for my $missing_code (@missing_codes) {
301
0
0
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
302
0
0
                my $result = decode_json $result_json;
303
0
0
0
0
                push @{$results}, $result;
304            }
305        }
306
1
2
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
307
1
1
1
1
        for my $result_index (0 .. scalar @{$results}) {
308
6
2
            my $result = $results->[$result_index];
309
6
2
            my $ruleId = $result->{'ruleId'};
310
6
12
            next if defined $ruleId && defined $defined_codes{$ruleId};
311
1
22
            $result->{'ruleIndex'} = $missing_rule_definition_index;
312        }
313    }
314
315
1
2
    return encode_json \%sarif;
316}
317
3181;