File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:83.6%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
107191
1207
71
use Digest::SHA qw($errmsg);
9
1
1
1
2
1
26
use JSON::PP;
10
1
1
1
172
3647
25
use Hash::Merge qw( merge );
11
1
1
1
140
1
1150
use CheckSpelling::Util;
12
13sub encode_low_ascii {
14
7
187
    $_ = shift;
15
7
1
7
2
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
16
7
8
    return $_;
17}
18
19sub url_encode {
20
6
5
    $_ = shift;
21
6
0
2
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
22
6
5
    return $_;
23}
24
25sub double_slash_escape {
26
6
4
    $_ = shift;
27
6
13
    s/(["()\]\\])/\\\\$1/g;
28
6
5
    return $_;
29}
30
31sub parse_warnings {
32
1
1
    my ($warnings) = @_;
33
1
1
    our $flatten;
34
1
1
    my @results;
35
1
11
    open WARNINGS, '<', $warnings || print STDERR "Could not open $warnings\n";
36
1
1
    my $rules = ();
37
1
1
    my %encoded_files = ();
38
1
1
    my %hashes_needed_for_files = ();
39
1
12
    while (<WARNINGS>) {
40
8
7
        next if m{^https://};
41
7
23
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
42
6
12
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
43        # single-slash-escape `"` and `\`
44
6
4
        $message =~ s/(["\\])/\\$1/g;
45
6
4
        $message = encode_low_ascii $message;
46        # double-slash-escape `"`, `(`, `)`, `]`
47
6
3
        $message = double_slash_escape $message;
48        # encode `message` and `file` to protect against low ascii`
49
6
3
        my $encoded_file = url_encode $file;
50
6
2
        $encoded_files{$encoded_file} = $file;
51        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's sarif view
52
6
7
        if ($message =~ /(`{2,})/) {
53
1
0
            my $backticks = $1;
54
1
23
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
55
0
0
                $backticks = $1 if length($1) > length($backticks);
56            }
57
1
14
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
58        } else {
59
5
10
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
60        }
61        # replace '`' with `\`+`'` because GitHub's SARIF parser doesn't like them
62
6
5
        $message =~ s/\`/'/g;
63
6
5
        unless (defined $rules->{$code}) {
64
1
1
            $rules->{$code} = {};
65        }
66
6
3
        my $rule = $rules->{$code};
67
6
4
        unless (defined $rule->{$message}) {
68
3
3
            $rule->{$message} = [];
69        }
70
6
15
        my $hashed_message = Digest::SHA::sha1_base64($message);
71
6
6
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
72
6
6
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
73
6
6
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
74
6
19
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
75
6
10
        my $locations = $rule->{$message};
76
6
7
        my $physicalLocation = {
77            'uri' => $encoded_file,
78            'startLine' => $line,
79            'startColumn' => $column,
80            'endColumn' => $endColumn,
81        };
82
6
4
        push @$locations, $physicalLocation;
83
6
8
        $rule->{$message} = $locations;
84    }
85
1
1
    my %line_hashes = ();
86
1
0
    my %used_hashes = ();
87
1
2
    for my $file (sort keys %hashes_needed_for_files) {
88
1
1
        $line_hashes{$file} = ();
89
1
5
        unless (-e $file) {
90
0
0
            delete $hashes_needed_for_files{$file};
91
0
0
            next;
92        }
93
1
1
1
3
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
94
1
10
        open $file_fh, '<', $file;
95
1
1
        my $line = shift @lines;
96
1
1
        $line = 2 if $line == 1;
97
1
1
        my $buffer = '';
98
1
7
        while (<$file_fh>) {
99
8
6
            if ($line == $.) {
100
4
3
                my $sample = substr $buffer, -100, 100;
101
4
4
                my $hash = Digest::SHA::sha1_base64($sample);
102
4
3
                for (; $line == $.; $line = shift @lines) {
103
5
3
                    my $hit = $used_hashes{$hash}++;
104
5
5
                    $hash = "$hash:$hit" if $hit;
105
5
3
                    $line_hashes{$file}{$line} = $hash;
106
5
4
                    last unless @lines;
107                }
108            }
109
8
6
            $buffer .= $_;
110
8
18
            $buffer =~ s/\s+/ /g;
111
8
8
            $buffer = substr $buffer, -100, 100;
112        }
113
1
3
        close $file_fh;
114    }
115
1
1
1
1
    for my $code (sort keys %{$rules}) {
116
1
1
        my $rule = $rules->{$code};
117
1
1
1
1
        for my $message (sort keys %{$rule}) {
118
3
5
            my $hashed_message = Digest::SHA::sha1_base64($message);
119
3
2
            my $locations = $rule->{$message};
120
3
2
            my @locations_json = ();
121
3
2
            my @fingerprints = ();
122
3
2
            for my $location (@$locations) {
123
6
2
                my $encoded_file = $location->{uri};
124
6
4
                my $line = $location->{startLine};
125
6
2
                my $column = $location->{startColumn};
126
6
2
                my $endColumn = $location->{endColumn};
127
6
3
                my $partialFingerprint = '';
128
6
3
                my $file = $encoded_files{$encoded_file};
129
6
4
                if (defined $line_hashes{$file}) {
130
6
3
                    my $line_hash = $line_hashes{$file}{$line};
131
6
5
                    if (defined $line_hash) {
132
4
4
1
5
                        my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
133
4
2
                        my $hit = scalar @instances;
134
4
4
                        while (--$hit > 0) {
135
0
0
                            last if $instances[$hit] == $column;
136                        }
137
4
7
                        $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
138                    }
139                }
140
6
3
                push @fingerprints, $partialFingerprint;
141
6
5
                my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
142
6
4
                push @locations_json, $json_fragment;
143            }
144
3
3
            if ($flatten) {
145
0
0
                my $locations_json_flat = join ',', @locations_json;
146
0
0
                my $partialFingerprints;
147
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
148
0
0
                if ($partialFingerprint ne '') {
149
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
150                }
151
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
152
0
0
                my $result = decode_json $result_json;
153
0
0
                push @results, $result;
154            } else {
155
3
1
                my $limit = scalar @locations_json;
156
3
4
                for (my $i = 0; $i < $limit; ++$i) {
157
6
2
                    my $locations_json_flat = $locations_json[$i];
158
6
4
                    my $partialFingerprints = '';
159
6
2
                    my $partialFingerprint = $fingerprints[$i];
160
6
5
                    if ($partialFingerprint ne '') {
161
4
1
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
162                    }
163
6
6
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
164
6
3
                    my $result = decode_json $result_json;
165
6
6829
                    push @results, $result;
166                }
167            }
168        }
169    }
170
1
4
    close WARNINGS;
171
1
7
    return \@results;
172}
173
174sub get_runs_from_sarif {
175
2
2
    my ($sarif_json) = @_;
176
2
2
    my %runs_view;
177
2
3
    return %runs_view unless $sarif_json->{'runs'};
178
2
2
1
3
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
179
2
1
    foreach my $sarif_json_run (@sarif_json_runs) {
180
2
2
1
3
        my %sarif_json_run_hash=%{$sarif_json_run};
181
2
3
        next unless defined $sarif_json_run_hash{'tool'};
182
183
2
2
1
3
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
184
2
3
        next unless defined $sarif_json_run_tool_hash{'driver'};
185
186
2
2
1
7
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
187        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
188
2
10
            defined $sarif_json_run_tool_driver_hash{'rules'};
189
190
2
1
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
191
2
2
2
2
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
192
2
1
        my %driver_view;
193
2
4
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
194
29
19
            next unless defined $driver_rule->{'id'};
195
29
26
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
196        }
197
2
3
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
198    }
199
2
3
    return %runs_view;
200}
201
202sub main {
203
1
424
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
204
1
4
    unless (-f $sarif_template_file) {
205
0
0
        warn "Could not find sarif template";
206
0
0
        return '';
207    }
208
209
1
2
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
210
1
1
    die "sarif template is empty" unless $sarif_template;
211
212
1
0
2
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
213
1
51
    my $sarif_json = $json->decode($sarif_template);
214
215
1
76914
    if (defined $sarif_template_overlay_file) {
216
1
6
        my $merger = Hash::Merge->new();
217
1
78
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
218
1
7
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
219
220        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
221
27
3085
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
222
27
27
14
31
            return [@{$_[1]}];
223
1
3
        };
224
225
1
14
        if (-s $sarif_template_overlay_file) {
226
1
3
            my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
227
1
2
            my %runs_base = get_runs_from_sarif($sarif_json);
228
229
1
5
            my $sarif_template_hash = $json->decode($sarif_template_overlay);
230
1
1764
            my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
231
1
1
            for my $run_id (keys %runs_overlay) {
232
1
2
                if (defined $runs_base{$run_id}) {
233
1
0
                    my $run_base_hash = $runs_base{$run_id};
234
1
1
                    my $run_overlay_hash = $runs_overlay{$run_id};
235
1
5
                    for my $overlay_id (keys %$run_overlay_hash) {
236                        $run_base_hash->{$overlay_id} = $merger->merge(
237                            $run_overlay_hash->{$overlay_id},
238
1
2
                            $run_base_hash->{$overlay_id}
239                        );
240                    }
241                } else {
242
0
0
                    $runs_base{$run_id} = $runs_overlay{$run_id};
243                }
244            }
245            #$sarif_json->
246
1
1
44
1
            my @sarif_json_runs = @{$sarif_json->{'runs'}};
247
1
1
            foreach my $sarif_json_run (@sarif_json_runs) {
248
1
1
1
1
                my %sarif_json_run_hash=%{$sarif_json_run};
249
1
2
                next unless defined $sarif_json_run_hash{'tool'};
250
251
1
1
1
1
                my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
252
1
1
                next unless defined $sarif_json_run_tool_hash{'driver'};
253
254
1
1
1
2
                my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
255
1
1
                my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
256                next unless defined $driver_name &&
257
1
3
                    defined $sarif_json_run_tool_driver_hash{'rules'};
258
259
1
0
                my $driver_view_hash = $runs_base{$driver_name};
260
1
1
                next unless defined $driver_view_hash;
261
262
1
1
1
1
                my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
263
1
3
                for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
264
29
1920
                    my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
265
29
14
                    my $driver_rule_id = $driver_rule->{'id'};
266                    next unless defined $driver_rule_id &&
267
29
37
                        defined $driver_view_hash->{$driver_rule_id};
268
28
16
                    $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
269                }
270            }
271
1
1
            delete $sarif_template_hash->{'runs'};
272
1
1
            $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
273        }
274    }
275    {
276
1
1
1
419
0
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
277
1
2
        foreach my $sarif_json_run (@sarif_json_runs) {
278
1
0
            my %sarif_json_run_automationDetails;
279
1
1
            $sarif_json_run_automationDetails{id} = $category;
280
1
1
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
281        }
282    }
283
284
1
1
30
2
    my %sarif = %{$sarif_json};
285
286
1
2
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
287
288
1
2
    my $results = parse_warnings $ENV{warning_output};
289
1
1
    if ($results) {
290
1
2
        $sarif{'runs'}[0]{'results'} = $results;
291
1
0
        my %codes;
292
1
1
        for my $result_ref (@$results) {
293
6
6
1
7
            my %result = %{$result_ref};
294
6
7
            $codes{$result{'ruleId'}} = 1;
295        }
296
1
1
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
297
1
1
1
2
        my @rules = @{$rules_ref};
298
1
0
        my $missing_rule_definition_id = 'missing-rule-definition';
299
1
28
1
16
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
300
1
28
1
15
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
301
1
1
        my $code_index = 0;
302
1
1
1
2
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
303
1
1
1
1
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
304
1
0
        my $missing_rule_definition_index;
305
1
1
        if (@missing_codes) {
306
0
0
            push @rules, $missing_rule_definition_ref;
307
0
0
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
308
0
0
            for my $missing_code (@missing_codes) {
309
0
0
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
310
0
0
                my $result = decode_json $result_json;
311
0
0
0
0
                push @{$results}, $result;
312            }
313        }
314
1
1
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
315
1
1
1
2
        for my $result_index (0 .. scalar @{$results}) {
316
7
2
            my $result = $results->[$result_index];
317
7
5
            my $ruleId = $result->{'ruleId'};
318
7
11
            next if defined $ruleId && defined $defined_codes{$ruleId};
319
1
33
            $result->{'ruleIndex'} = $missing_rule_definition_index;
320        }
321    }
322
323
1
2
    return encode_json \%sarif;
324}
325
3261;