File Coverage

File:lib/CheckSpelling/Sarif.pm
Coverage:83.6%

linestmtbrancondsubtimecode
1#! -*-perl-*-
2
3package CheckSpelling::Sarif;
4
5our $VERSION='0.1.0';
6our $flatten=0;
7
8
1
1
1
107050
1200
70
use Digest::SHA qw($errmsg);
9
1
1
1
1
1
22
use JSON::PP;
10
1
1
1
173
3632
26
use Hash::Merge qw( merge );
11
1
1
1
127
1
1219
use CheckSpelling::Util;
12
13sub encode_low_ascii {
14
7
157
    $_ = shift;
15
7
1
6
3
    s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg;
16
7
5
    return $_;
17}
18
19sub url_encode {
20
6
3
    $_ = shift;
21
6
0
2
0
    s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg;
22
6
3
    return $_;
23}
24
25sub double_slash_escape {
26
6
3
    $_ = shift;
27
6
13
    s/(["()\]\\])/\\\\$1/g;
28
6
5
    return $_;
29}
30
31sub fingerprintLocations {
32
3
3
    my ($locations, $encoded_files_ref, $line_hashes_ref, $hashes_needed_for_files_ref, $message, $hashed_message) = @_;
33
3
4
    my %encoded_files = %$encoded_files_ref;
34
3
2
    my %line_hashes = %$line_hashes_ref;
35
3
2
    my %hashes_needed_for_files = %$hashes_needed_for_files_ref;
36
3
1
    my @locations_json = ();
37
3
2
    my @fingerprints = ();
38
3
2
    for my $location (@$locations) {
39
6
4
        my $encoded_file = $location->{uri};
40
6
1
        my $line = $location->{startLine};
41
6
5
        my $column = $location->{startColumn};
42
6
1
        my $endColumn = $location->{endColumn};
43
6
4
        my $partialFingerprint = '';
44
6
2
        my $file = $encoded_files{$encoded_file};
45
6
5
        if (defined $line_hashes{$file}) {
46
6
2
            my $line_hash = $line_hashes{$file}{$line};
47
6
5
            if (defined $line_hash) {
48
4
4
0
7
                my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}};
49
4
1
                my $hit = scalar @instances;
50
4
4
                while (--$hit > 0) {
51
0
0
                    last if $instances[$hit] == $column;
52                }
53
4
8
                $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit");
54            }
55        }
56
6
3
        push @fingerprints, $partialFingerprint;
57
6
4
        my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>;
58
6
5
        push @locations_json, $json_fragment;
59    }
60
3
4
    return { locations_json => \@locations_json, fingerprints => \@fingerprints };
61}
62
63sub parse_warnings {
64
1
1
    my ($warnings) = @_;
65
1
0
    our $flatten;
66
1
1
    my @results;
67
1
10
    unless (open WARNINGS, '<', $warnings) {
68
0
0
        print STDERR "Could not open $warnings\n";
69
0
0
        return [];
70    }
71
1
1
    my $rules = ();
72
1
0
    my %encoded_files = ();
73
1
1
    my %hashes_needed_for_files = ();
74
1
10
    while (<WARNINGS>) {
75
8
8
        next if m{^https://};
76
7
22
        next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$};
77
6
12
        my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7);
78        # single-slash-escape `"` and `\`
79
6
3
        $message =~ s/(["\\])/\\$1/g;
80
6
6
        $message = encode_low_ascii $message;
81        # double-slash-escape `"`, `(`, `)`, `]`
82
6
4
        $message = double_slash_escape $message;
83        # encode `message` and `file` to protect against low ascii`
84
6
4
        my $encoded_file = url_encode $file;
85
6
4
        $encoded_files{$encoded_file} = $file;
86        # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's SARIF view
87
6
7
        if ($message =~ /(`{2,})/) {
88
1
1
            my $backticks = $1;
89
1
17
            while ($message =~ /($backticks`+)(?=[`].*?\g{-1})/gs) {
90
0
0
                $backticks = $1 if length($1) > length($backticks);
91            }
92
1
11
            $message =~ s/(^|[^\\])$backticks(.+?)$backticks/${1}[${2}](#security-tab)/;
93        } else {
94
5
8
            $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/;
95        }
96        # replace '`' with `\`+`'` because GitHub's SARIF parser doesn't like them
97
6
7
        $message =~ s/\`/'/g;
98
6
3
        unless (defined $rules->{$code}) {
99
1
2
            $rules->{$code} = {};
100        }
101
6
3
        my $rule = $rules->{$code};
102
6
2
        unless (defined $rule->{$message}) {
103
3
3
            $rule->{$message} = [];
104        }
105
6
17
        my $hashed_message = Digest::SHA::sha1_base64($message);
106
6
5
        $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file};
107
6
7
        $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line};
108
6
5
        $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message};
109
6
4
        $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1';
110
6
3
        my $locations = $rule->{$message};
111
6
7
        my $physicalLocation = {
112            'uri' => $encoded_file,
113            'startLine' => $line,
114            'startColumn' => $column,
115            'endColumn' => $endColumn,
116        };
117
6
2
        push @$locations, $physicalLocation;
118
6
14
        $rule->{$message} = $locations;
119    }
120
1
1
    my %line_hashes = ();
121
1
1
    my %used_hashes = ();
122
1
1
    for my $file (sort keys %hashes_needed_for_files) {
123
1
1
        $line_hashes{$file} = ();
124
1
5
        unless (-e $file) {
125
0
0
            delete $hashes_needed_for_files{$file};
126
0
0
            next;
127        }
128
1
1
1
3
        my @lines = sort (keys %{$hashes_needed_for_files{$file}});
129
1
10
        open $file_fh, '<', $file;
130
1
0
        my $line = shift @lines;
131
1
2
        $line = 2 if $line == 1;
132
1
1
        my $buffer = '';
133
1
6
        while (<$file_fh>) {
134
8
6
            if ($line == $.) {
135
4
2
                my $sample = substr $buffer, -100, 100;
136
4
6
                my $hash = Digest::SHA::sha1_base64($sample);
137
4
2
                for (; $line == $.; $line = shift @lines) {
138
5
3
                    my $hit = $used_hashes{$hash}++;
139
5
3
                    $hash = "$hash:$hit" if $hit;
140
5
4
                    $line_hashes{$file}{$line} = $hash;
141
5
6
                    last unless @lines;
142                }
143            }
144
8
4
            $buffer .= $_;
145
8
17
            $buffer =~ s/\s+/ /g;
146
8
9
            $buffer = substr $buffer, -100, 100;
147        }
148
1
3
        close $file_fh;
149    }
150
1
1
1
2
    for my $code (sort keys %{$rules}) {
151
1
1
        my $rule = $rules->{$code};
152
1
1
1
1
        for my $message (sort keys %{$rule}) {
153
3
5
            my $hashed_message = Digest::SHA::sha1_base64($message);
154
3
1
            my $locations = $rule->{$message};
155
3
4
            my $fingerprintResults = fingerprintLocations($locations, \%encoded_files, \%line_hashes, \%hashes_needed_for_files, $message, $hashed_message);
156
3
3
3
1
            my @locations_json = @{$fingerprintResults->{locations_json}};
157
3
3
3
0
            my @fingerprints = @{$fingerprintResults->{fingerprints}};
158
3
4
            if ($flatten) {
159
0
0
                my $locations_json_flat = join ',', @locations_json;
160
0
0
                my $partialFingerprints;
161
0
0
                my $partialFingerprint = (sort @fingerprints)[0];
162
0
0
                if ($partialFingerprint ne '') {
163
0
0
                    $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
164                }
165
0
0
                my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
166
0
0
                my $result = decode_json $result_json;
167
0
0
                push @results, $result;
168            } else {
169
3
1
                my $limit = scalar @locations_json;
170
3
2
                for (my $i = 0; $i < $limit; ++$i) {
171
6
5
                    my $locations_json_flat = $locations_json[$i];
172
6
2
                    my $partialFingerprints = '';
173
6
3
                    my $partialFingerprint = $fingerprints[$i];
174
6
4
                    if ($partialFingerprint ne '') {
175
4
2
                        $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>;
176                    }
177
6
4
                    my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
178
6
5
                    my $result = decode_json $result_json;
179
6
6788
                    push @results, $result;
180                }
181            }
182        }
183    }
184
1
3
    close WARNINGS;
185
1
6
    return \@results;
186}
187
188sub get_runs_from_sarif {
189
2
2
    my ($sarif_json) = @_;
190
2
0
    my %runs_view;
191
2
3
    return %runs_view unless $sarif_json->{'runs'};
192
2
2
2
1
    my @sarif_json_runs=@{$sarif_json->{'runs'}};
193
2
2
    foreach my $sarif_json_run (@sarif_json_runs) {
194
2
2
1
3
        my %sarif_json_run_hash=%{$sarif_json_run};
195
2
2
        next unless defined $sarif_json_run_hash{'tool'};
196
197
2
2
3
2
        my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
198
2
3
        next unless defined $sarif_json_run_tool_hash{'driver'};
199
200
2
2
1
2
        my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
201        next unless defined $sarif_json_run_tool_driver_hash{'name'} &&
202
2
5
            defined $sarif_json_run_tool_driver_hash{'rules'};
203
204
2
1
        my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
205
2
2
2
2
        my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
206
2
2
        my %driver_view;
207
2
0
        for my $driver_rule (@sarif_json_run_tool_driver_rules) {
208
37
30
            next unless defined $driver_rule->{'id'};
209
37
23
            $driver_view{$driver_rule->{'id'}} = $driver_rule;
210        }
211
2
5
        $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view;
212    }
213
2
2
    return %runs_view;
214}
215
216sub main {
217
1
386
    my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_;
218
1
4
    unless (-f $sarif_template_file) {
219
0
0
        warn "Could not find sarif template";
220
0
0
        return '';
221    }
222
223
1
1
    my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file;
224
1
1
    die "sarif template is empty" unless $sarif_template;
225
226
1
0
2
0
    my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b });
227
1
45
    my $sarif_json = $json->decode($sarif_template);
228
229
1
98939
    if (defined $sarif_template_overlay_file && -s $sarif_template_overlay_file) {
230
1
3
        my $merger = Hash::Merge->new();
231
1
53
        my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()};
232
1
6
        my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'};
233
234        $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub {
235
35
3863
            return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]);
236
35
35
9
47
            return [@{$_[1]}];
237
1
2
        };
238
239
1
1
        my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file;
240
1
2
        my %runs_base = get_runs_from_sarif($sarif_json);
241
242
1
1
        my $sarif_template_hash = $json->decode($sarif_template_overlay);
243
1
1760
        my %runs_overlay = get_runs_from_sarif($sarif_template_hash);
244
1
1
        for my $run_id (keys %runs_overlay) {
245
1
1
            if (defined $runs_base{$run_id}) {
246
1
0
                my $run_base_hash = $runs_base{$run_id};
247
1
1
                my $run_overlay_hash = $runs_overlay{$run_id};
248
1
1
                for my $overlay_id (keys %$run_overlay_hash) {
249                    $run_base_hash->{$overlay_id} = $merger->merge(
250                        $run_overlay_hash->{$overlay_id},
251
1
1
                        $run_base_hash->{$overlay_id}
252                    );
253                }
254            } else {
255
0
0
                $runs_base{$run_id} = $runs_overlay{$run_id};
256            }
257        }
258        #$sarif_json->
259
1
1
46
0
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
260
1
1
        foreach my $sarif_json_run (@sarif_json_runs) {
261
1
1
1
1
            my %sarif_json_run_hash=%{$sarif_json_run};
262
1
1
            next unless defined $sarif_json_run_hash{'tool'};
263
264
1
1
1
0
            my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}};
265
1
3
            next unless defined $sarif_json_run_tool_hash{'driver'};
266
267
1
1
1
1
            my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}};
268
1
1
            my $driver_name = $sarif_json_run_tool_driver_hash{'name'};
269            next unless defined $driver_name &&
270
1
2
                defined $sarif_json_run_tool_driver_hash{'rules'};
271
272
1
1
            my $driver_view_hash = $runs_base{$driver_name};
273
1
0
            next unless defined $driver_view_hash;
274
275
1
1
1
91
            my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}};
276
1
2
            for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) {
277
37
2464
                my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number];
278
37
11
                my $driver_rule_id = $driver_rule->{'id'};
279                next unless defined $driver_rule_id &&
280
37
57
                    defined $driver_view_hash->{$driver_rule_id};
281
36
20
                $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule);
282            }
283        }
284
1
2
        delete $sarif_template_hash->{'runs'};
285
1
1
        $sarif_json = $merger->merge($sarif_json, $sarif_template_hash);
286    }
287    {
288
1
1
1
508
1
1
        my @sarif_json_runs = @{$sarif_json->{'runs'}};
289
1
1
        foreach my $sarif_json_run (@sarif_json_runs) {
290
1
1
            my %sarif_json_run_automationDetails;
291
1
36
            $sarif_json_run_automationDetails{id} = $category;
292
1
2
            $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails;
293        }
294    }
295
296
1
1
0
2
    my %sarif = %{$sarif_json};
297
298
1
1
    $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION};
299
300
1
2
    my $results = parse_warnings $ENV{warning_output};
301
1
2
    if ($results) {
302
1
1
        $sarif{'runs'}[0]{'results'} = $results;
303
1
1
        my %codes;
304
1
1
        for my $result_ref (@$results) {
305
6
6
3
5
            my %result = %{$result_ref};
306
6
5
            $codes{$result{'ruleId'}} = 1;
307        }
308
1
1
        my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'};
309
1
1
0
2
        my @rules = @{$rules_ref};
310
1
1
        my $missing_rule_definition_id = 'missing-rule-definition';
311
1
36
1
17
        my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules;
312
1
36
1
18
        @rules = grep { defined $codes{$_->{'id'}} } @rules;
313
1
1
        my $code_index = 0;
314
1
1
1
2
        my %defined_codes = map { $_->{'id'} => $code_index++ } @rules;
315
1
1
1
1
        my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes;
316
1
0
        my $missing_rule_definition_index;
317
1
2
        if (@missing_codes) {
318
0
0
            push @rules, $missing_rule_definition_ref;
319
0
0
            $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++;
320
0
0
            for my $missing_code (@missing_codes) {
321
0
0
                my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>;
322
0
0
                my $result = decode_json $result_json;
323
0
0
0
0
                push @{$results}, $result;
324            }
325        }
326
1
0
        $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules;
327
1
1
1
1
        for my $result_index (0 .. scalar @{$results}) {
328
7
3
            my $result = $results->[$result_index];
329
7
2
            my $ruleId = $result->{'ruleId'};
330
7
13
            next if defined $ruleId && defined $defined_codes{$ruleId};
331
1
29
            $result->{'ruleIndex'} = $missing_rule_definition_index;
332        }
333    }
334
335
1
1
    return encode_json \%sarif;
336}
337
3381;