| File: | lib/CheckSpelling/Sarif.pm |
| Coverage: | 84.0% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #! -*-perl-*- | |||||
| 2 | ||||||
| 3 | package CheckSpelling::Sarif; | |||||
| 4 | ||||||
| 5 | our $VERSION='0.1.0'; | |||||
| 6 | our $flatten=0; | |||||
| 7 | ||||||
| 8 | 1 1 1 | 110752 1266 81 | use Digest::SHA qw($errmsg); | |||
| 9 | 1 1 1 | 3 1 35 | use JSON::PP; | |||
| 10 | 1 1 1 | 177 3778 27 | use Hash::Merge qw( merge ); | |||
| 11 | 1 1 1 | 141 1 1444 | use CheckSpelling::Util; | |||
| 12 | ||||||
| 13 | sub encode_low_ascii { | |||||
| 14 | 6 | 257 | $_ = shift; | |||
| 15 | 6 1 | 9 3 | s/([\x{0}-\x{9}\x{0b}\x{1f}#%])/"\\u".sprintf("%04x",ord($1))/eg; | |||
| 16 | 6 | 5 | return $_; | |||
| 17 | } | |||||
| 18 | ||||||
| 19 | sub url_encode { | |||||
| 20 | 5 | 3 | $_ = shift; | |||
| 21 | 5 0 | 3 0 | s<([^-!\$&'()*+,/:;=?\@A-Za-z0-9_.~])><"%".sprintf("%02x",ord($1))>eg; | |||
| 22 | 5 | 3 | return $_; | |||
| 23 | } | |||||
| 24 | ||||||
| 25 | sub double_slash_escape { | |||||
| 26 | 5 | 2 | $_ = shift; | |||
| 27 | 5 | 13 | s/(["()\]\\])/\\\\$1/g; | |||
| 28 | 5 | 3 | return $_; | |||
| 29 | } | |||||
| 30 | ||||||
| 31 | sub parse_warnings { | |||||
| 32 | 1 | 1 | my ($warnings) = @_; | |||
| 33 | 1 | 1 | our $flatten; | |||
| 34 | 1 | 0 | my @results; | |||
| 35 | 1 | 14 | open WARNINGS, '<', $warnings || print STDERR "Could not open $warnings\n"; | |||
| 36 | 1 | 1 | my $rules = (); | |||
| 37 | 1 | 1 | my %encoded_files = (); | |||
| 38 | 1 | 1 | my %hashes_needed_for_files = (); | |||
| 39 | 1 | 13 | while (<WARNINGS>) { | |||
| 40 | 7 | 5 | next if m{^https://}; | |||
| 41 | 6 | 22 | next unless m{^(.+):(\d+):(\d+) \.\.\. (\d+),\s(Error|Warning|Notice)\s-\s(.+\s\((.+)\))$}; | |||
| 42 | 5 | 13 | my ($file, $line, $column, $endColumn, $severity, $message, $code) = ($1, $2, $3, $4, $5, $6, $7); | |||
| 43 | # single-slash-escape `"` and `\` | |||||
| 44 | 5 | 3 | $message =~ s/(["\\])/\\$1/g; | |||
| 45 | 5 | 5 | $message = encode_low_ascii $message; | |||
| 46 | # double-slash-escape `"`, `(`, `)`, `]` | |||||
| 47 | 5 | 5 | $message = double_slash_escape $message; | |||
| 48 | # encode `message` and `file` to protect against low ascii` | |||||
| 49 | 5 | 3 | my $encoded_file = url_encode $file; | |||
| 50 | 5 | 3 | $encoded_files{$encoded_file} = $file; | |||
| 51 | # hack to make the first `...` identifier a link (that goes nowhere, but is probably blue and underlined) in GitHub's sarif view | |||||
| 52 | 5 | 10 | $message =~ s/(^|[^\\])\`([^`]+[^`\\])\`/${1}[${2}](#security-tab)/; | |||
| 53 | # replace '`' with `\`+`"` because GitHub's SARIF parser doesn't like them | |||||
| 54 | 5 | 5 | $message =~ s/\`/\\"/g; | |||
| 55 | 5 | 4 | unless (defined $rules->{$code}) { | |||
| 56 | 1 | 1 | $rules->{$code} = {}; | |||
| 57 | } | |||||
| 58 | 5 | 2 | my $rule = $rules->{$code}; | |||
| 59 | 5 | 4 | unless (defined $rule->{$message}) { | |||
| 60 | 2 | 1 | $rule->{$message} = []; | |||
| 61 | } | |||||
| 62 | 5 | 18 | my $hashed_message = Digest::SHA::sha1_base64($message); | |||
| 63 | 5 | 5 | $hashes_needed_for_files{$file} = () unless defined $hashes_needed_for_files{$file}; | |||
| 64 | 5 | 4 | $hashes_needed_for_files{$file}{$line} = () unless defined $hashes_needed_for_files{$file}{$line}; | |||
| 65 | 5 | 5 | $hashes_needed_for_files{$file}{$line}{$hashed_message} = () unless defined $hashes_needed_for_files{$file}{$line}{$hashed_message}; | |||
| 66 | 5 | 5 | $hashes_needed_for_files{$file}{$line}{$hashed_message}{$column} = '1'; | |||
| 67 | 5 | 2 | my $locations = $rule->{$message}; | |||
| 68 | 5 | 15 | my $physicalLocation = { | |||
| 69 | 'uri' => $encoded_file, | |||||
| 70 | 'startLine' => $line, | |||||
| 71 | 'startColumn' => $column, | |||||
| 72 | 'endColumn' => $endColumn, | |||||
| 73 | }; | |||||
| 74 | 5 | 3 | push @$locations, $physicalLocation; | |||
| 75 | 5 | 8 | $rule->{$message} = $locations; | |||
| 76 | } | |||||
| 77 | 1 | 1 | my %line_hashes = (); | |||
| 78 | 1 | 0 | my %used_hashes = (); | |||
| 79 | 1 | 2 | for my $file (sort keys %hashes_needed_for_files) { | |||
| 80 | 1 | 2 | $line_hashes{$file} = (); | |||
| 81 | 1 | 4 | unless (-e $file) { | |||
| 82 | 0 | 0 | delete $hashes_needed_for_files{$file}; | |||
| 83 | 0 | 0 | next; | |||
| 84 | } | |||||
| 85 | 1 1 | 1 3 | my @lines = sort (keys %{$hashes_needed_for_files{$file}}); | |||
| 86 | 1 | 31 | open $file_fh, '<', $file; | |||
| 87 | 1 | 2 | my $line = shift @lines; | |||
| 88 | 1 | 2 | $line = 2 if $line == 1; | |||
| 89 | 1 | 1 | my $buffer = ''; | |||
| 90 | 1 | 7 | while (<$file_fh>) { | |||
| 91 | 7 | 4 | if ($line == $.) { | |||
| 92 | 3 | 2 | my $sample = substr $buffer, -100, 100; | |||
| 93 | 3 | 4 | my $hash = Digest::SHA::sha1_base64($sample); | |||
| 94 | 3 | 3 | for (; $line == $.; $line = shift @lines) { | |||
| 95 | 4 | 3 | my $hit = $used_hashes{$hash}++; | |||
| 96 | 4 | 4 | $hash = "$hash:$hit" if $hit; | |||
| 97 | 4 | 3 | $line_hashes{$file}{$line} = $hash; | |||
| 98 | 4 | 5 | last unless @lines; | |||
| 99 | } | |||||
| 100 | } | |||||
| 101 | 7 | 4 | $buffer .= $_; | |||
| 102 | 7 | 18 | $buffer =~ s/\s+/ /g; | |||
| 103 | 7 | 8 | $buffer = substr $buffer, -100, 100; | |||
| 104 | } | |||||
| 105 | 1 | 9 | close $file_fh; | |||
| 106 | } | |||||
| 107 | 1 1 | 2 3 | for my $code (sort keys %{$rules}) { | |||
| 108 | 1 | 1 | my $rule = $rules->{$code}; | |||
| 109 | 1 1 | 0 1 | for my $message (sort keys %{$rule}) { | |||
| 110 | 2 | 5 | my $hashed_message = Digest::SHA::sha1_base64($message); | |||
| 111 | 2 | 1 | my $locations = $rule->{$message}; | |||
| 112 | 2 | 2 | my @locations_json = (); | |||
| 113 | 2 | 1 | my @fingerprints = (); | |||
| 114 | 2 | 2 | for my $location (@$locations) { | |||
| 115 | 5 | 4 | my $encoded_file = $location->{uri}; | |||
| 116 | 5 | 2 | my $line = $location->{startLine}; | |||
| 117 | 5 | 3 | my $column = $location->{startColumn}; | |||
| 118 | 5 | 3 | my $endColumn = $location->{endColumn}; | |||
| 119 | 5 | 1 | my $partialFingerprint = ''; | |||
| 120 | 5 | 4 | my $file = $encoded_files{$encoded_file}; | |||
| 121 | 5 | 4 | if (defined $line_hashes{$file}) { | |||
| 122 | 5 | 2 | my $line_hash = $line_hashes{$file}{$line}; | |||
| 123 | 5 | 4 | if (defined $line_hash) { | |||
| 124 | 3 3 | 2 4 | my @instances = sort keys %{$hashes_needed_for_files{$file}{$line}{$hashed_message}}; | |||
| 125 | 3 | 4 | my $hit = scalar @instances; | |||
| 126 | 3 | 3 | while (--$hit > 0) { | |||
| 127 | 0 | 0 | last if $instances[$hit] == $column; | |||
| 128 | } | |||||
| 129 | 3 | 6 | $partialFingerprint = Digest::SHA::sha1_base64("$line_hash:$message:$hit"); | |||
| 130 | } | |||||
| 131 | } | |||||
| 132 | 5 | 2 | push @fingerprints, $partialFingerprint; | |||
| 133 | 5 | 5 | my $json_fragment = qq<{ "physicalLocation": { "artifactLocation": { "uri": "$encoded_file", "uriBaseId": "%SRCROOT%" }, "region": { "startLine": $line, "startColumn": $column, "endColumn": $endColumn } } }>; | |||
| 134 | 5 | 2 | push @locations_json, $json_fragment; | |||
| 135 | } | |||||
| 136 | 2 | 3 | if ($flatten) { | |||
| 137 | 0 | 0 | my $locations_json_flat = join ',', @locations_json; | |||
| 138 | 0 | 0 | my $partialFingerprints; | |||
| 139 | 0 | 0 | my $partialFingerprint = (sort @fingerprints)[0]; | |||
| 140 | 0 | 0 | if ($partialFingerprint ne '') { | |||
| 141 | 0 | 0 | $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>; | |||
| 142 | } | |||||
| 143 | 0 | 0 | my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>; | |||
| 144 | 0 | 0 | my $result = decode_json $result_json; | |||
| 145 | 0 | 0 | push @results, $result; | |||
| 146 | } else { | |||||
| 147 | 2 | 1 | my $limit = scalar @locations_json; | |||
| 148 | 2 | 1 | for (my $i = 0; $i < $limit; ++$i) { | |||
| 149 | 5 | 4 | my $locations_json_flat = $locations_json[$i]; | |||
| 150 | 5 | 3 | my $partialFingerprints = ''; | |||
| 151 | 5 | 2 | my $partialFingerprint = $fingerprints[$i]; | |||
| 152 | 5 | 3 | if ($partialFingerprint ne '') { | |||
| 153 | 3 | 3 | $partialFingerprints = qq<"partialFingerprints": { "cs0" : "$partialFingerprint" },>; | |||
| 154 | } | |||||
| 155 | 5 | 4 | my $result_json = qq<{"ruleId": "$code", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>; | |||
| 156 | 5 | 5 | my $result = decode_json $result_json; | |||
| 157 | 5 | 5614 | push @results, $result; | |||
| 158 | } | |||||
| 159 | } | |||||
| 160 | } | |||||
| 161 | } | |||||
| 162 | 1 | 6 | close WARNINGS; | |||
| 163 | 1 | 10 | return \@results; | |||
| 164 | } | |||||
| 165 | ||||||
| 166 | sub get_runs_from_sarif { | |||||
| 167 | 2 | 1 | my ($sarif_json) = @_; | |||
| 168 | 2 | 2 | my %runs_view; | |||
| 169 | 2 | 3 | return %runs_view unless $sarif_json->{'runs'}; | |||
| 170 | 2 2 | 1 2 | my @sarif_json_runs=@{$sarif_json->{'runs'}}; | |||
| 171 | 2 | 3 | foreach my $sarif_json_run (@sarif_json_runs) { | |||
| 172 | 2 2 | 1 3 | my %sarif_json_run_hash=%{$sarif_json_run}; | |||
| 173 | 2 | 3 | next unless defined $sarif_json_run_hash{'tool'}; | |||
| 174 | ||||||
| 175 | 2 2 | 3 3 | my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}}; | |||
| 176 | 2 | 1 | next unless defined $sarif_json_run_tool_hash{'driver'}; | |||
| 177 | ||||||
| 178 | 2 2 | 2 6 | my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}}; | |||
| 179 | next unless defined $sarif_json_run_tool_driver_hash{'name'} && | |||||
| 180 | 2 | 9 | defined $sarif_json_run_tool_driver_hash{'rules'}; | |||
| 181 | ||||||
| 182 | 2 | 2 | my $driver_name = $sarif_json_run_tool_driver_hash{'name'}; | |||
| 183 | 2 2 | 1 3 | my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}}; | |||
| 184 | 2 | 1 | my %driver_view; | |||
| 185 | 2 | 3 | for my $driver_rule (@sarif_json_run_tool_driver_rules) { | |||
| 186 | 29 | 20 | next unless defined $driver_rule->{'id'}; | |||
| 187 | 29 | 31 | $driver_view{$driver_rule->{'id'}} = $driver_rule; | |||
| 188 | } | |||||
| 189 | 2 | 3 | $runs_view{$sarif_json_run_tool_driver_hash{'name'}} = \%driver_view; | |||
| 190 | } | |||||
| 191 | 2 | 4 | return %runs_view; | |||
| 192 | } | |||||
| 193 | ||||||
| 194 | sub main { | |||||
| 195 | 1 | 429 | my ($sarif_template_file, $sarif_template_overlay_file, $category) = @_; | |||
| 196 | 1 | 5 | unless (-f $sarif_template_file) { | |||
| 197 | 0 | 0 | warn "Could not find sarif template"; | |||
| 198 | 0 | 0 | return ''; | |||
| 199 | } | |||||
| 200 | ||||||
| 201 | 1 | 1 | my $sarif_template = CheckSpelling::Util::read_file $sarif_template_file; | |||
| 202 | 1 | 1 | die "sarif template is empty" unless $sarif_template; | |||
| 203 | ||||||
| 204 | 1 0 | 4 0 | my $json = JSON::PP->new->utf8->pretty->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b }); | |||
| 205 | 1 | 57 | my $sarif_json = $json->decode($sarif_template); | |||
| 206 | ||||||
| 207 | 1 | 78686 | if (defined $sarif_template_overlay_file) { | |||
| 208 | 1 | 8 | my $merger = Hash::Merge->new(); | |||
| 209 | 1 | 100 | my $merge_behaviors = $merger->{'behaviors'}->{$merger->get_behavior()}; | |||
| 210 | 1 | 6 | my $merge_arrays = $merge_behaviors->{'ARRAY'}->{'ARRAY'}; | |||
| 211 | ||||||
| 212 | $merge_behaviors->{'ARRAY'}->{'ARRAY'} = sub { | |||||
| 213 | 27 | 3113 | return $merge_arrays->(@_) if ref($_[0][0]).ref($_[1][0]); | |||
| 214 | 27 27 | 13 29 | return [@{$_[1]}]; | |||
| 215 | 1 | 6 | }; | |||
| 216 | ||||||
| 217 | 1 | 12 | if (-s $sarif_template_overlay_file) { | |||
| 218 | 1 | 4 | my $sarif_template_overlay = CheckSpelling::Util::read_file $sarif_template_overlay_file; | |||
| 219 | 1 | 2 | my %runs_base = get_runs_from_sarif($sarif_json); | |||
| 220 | ||||||
| 221 | 1 | 3 | my $sarif_template_hash = $json->decode($sarif_template_overlay); | |||
| 222 | 1 | 1812 | my %runs_overlay = get_runs_from_sarif($sarif_template_hash); | |||
| 223 | 1 | 1 | for my $run_id (keys %runs_overlay) { | |||
| 224 | 1 | 1 | if (defined $runs_base{$run_id}) { | |||
| 225 | 1 | 1 | my $run_base_hash = $runs_base{$run_id}; | |||
| 226 | 1 | 0 | my $run_overlay_hash = $runs_overlay{$run_id}; | |||
| 227 | 1 | 2 | for my $overlay_id (keys %$run_overlay_hash) { | |||
| 228 | $run_base_hash->{$overlay_id} = $merger->merge( | |||||
| 229 | $run_overlay_hash->{$overlay_id}, | |||||
| 230 | 1 | 3 | $run_base_hash->{$overlay_id} | |||
| 231 | ); | |||||
| 232 | } | |||||
| 233 | } else { | |||||
| 234 | 0 | 0 | $runs_base{$run_id} = $runs_overlay{$run_id}; | |||
| 235 | } | |||||
| 236 | } | |||||
| 237 | #$sarif_json-> | |||||
| 238 | 1 1 | 22 1 | my @sarif_json_runs = @{$sarif_json->{'runs'}}; | |||
| 239 | 1 | 2 | foreach my $sarif_json_run (@sarif_json_runs) { | |||
| 240 | 1 1 | 1 1 | my %sarif_json_run_hash=%{$sarif_json_run}; | |||
| 241 | 1 | 2 | next unless defined $sarif_json_run_hash{'tool'}; | |||
| 242 | ||||||
| 243 | 1 1 | 0 1 | my %sarif_json_run_tool_hash = %{$sarif_json_run_hash{'tool'}}; | |||
| 244 | 1 | 3 | next unless defined $sarif_json_run_tool_hash{'driver'}; | |||
| 245 | ||||||
| 246 | 1 1 | 1 2 | my %sarif_json_run_tool_driver_hash = %{$sarif_json_run_tool_hash{'driver'}}; | |||
| 247 | 1 | 1 | my $driver_name = $sarif_json_run_tool_driver_hash{'name'}; | |||
| 248 | next unless defined $driver_name && | |||||
| 249 | 1 | 2 | defined $sarif_json_run_tool_driver_hash{'rules'}; | |||
| 250 | ||||||
| 251 | 1 | 1 | my $driver_view_hash = $runs_base{$driver_name}; | |||
| 252 | 1 | 1 | next unless defined $driver_view_hash; | |||
| 253 | ||||||
| 254 | 1 1 | 0 1 | my @sarif_json_run_tool_driver_rules = @{$sarif_json_run_tool_driver_hash{'rules'}}; | |||
| 255 | 1 | 2 | for my $driver_rule_number (0 .. scalar @sarif_json_run_tool_driver_rules) { | |||
| 256 | 29 | 1971 | my $driver_rule = $sarif_json_run_tool_driver_rules[$driver_rule_number]; | |||
| 257 | 29 | 11 | my $driver_rule_id = $driver_rule->{'id'}; | |||
| 258 | next unless defined $driver_rule_id && | |||||
| 259 | 29 | 39 | defined $driver_view_hash->{$driver_rule_id}; | |||
| 260 | 28 | 23 | $sarif_json_run_tool_driver_hash{'rules'}[$driver_rule_number] = $merger->merge($driver_view_hash->{$driver_rule_id}, $driver_rule); | |||
| 261 | } | |||||
| 262 | } | |||||
| 263 | 1 | 2 | delete $sarif_template_hash->{'runs'}; | |||
| 264 | 1 | 1 | $sarif_json = $merger->merge($sarif_json, $sarif_template_hash); | |||
| 265 | } | |||||
| 266 | } | |||||
| 267 | { | |||||
| 268 | 1 1 1 | 423 1 1 | my @sarif_json_runs = @{$sarif_json->{'runs'}}; | |||
| 269 | 1 | 2 | foreach my $sarif_json_run (@sarif_json_runs) { | |||
| 270 | 1 | 0 | my %sarif_json_run_automationDetails; | |||
| 271 | 1 | 1 | $sarif_json_run_automationDetails{id} = $category; | |||
| 272 | 1 | 30 | $sarif_json_run->{'automationDetails'} = \%sarif_json_run_automationDetails; | |||
| 273 | } | |||||
| 274 | } | |||||
| 275 | ||||||
| 276 | 1 1 | 1 2 | my %sarif = %{$sarif_json}; | |||
| 277 | ||||||
| 278 | 1 | 2 | $sarif{'runs'}[0]{'tool'}{'driver'}{'version'} = $ENV{CHECK_SPELLING_VERSION}; | |||
| 279 | ||||||
| 280 | 1 | 2 | my $results = parse_warnings $ENV{warning_output}; | |||
| 281 | 1 | 2 | if ($results) { | |||
| 282 | 1 | 2 | $sarif{'runs'}[0]{'results'} = $results; | |||
| 283 | 1 | 1 | my %codes; | |||
| 284 | 1 | 1 | for my $result_ref (@$results) { | |||
| 285 | 5 5 | 1 7 | my %result = %{$result_ref}; | |||
| 286 | 5 | 6 | $codes{$result{'ruleId'}} = 1; | |||
| 287 | } | |||||
| 288 | 1 | 1 | my $rules_ref = $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'}; | |||
| 289 | 1 1 | 1 3 | my @rules = @{$rules_ref}; | |||
| 290 | 1 | 1 | my $missing_rule_definition_id = 'missing-rule-definition'; | |||
| 291 | 1 28 | 1 23 | my ($missing_rule_definition_ref) = grep { $_->{'id'} eq $missing_rule_definition_id } @rules; | |||
| 292 | 1 28 | 0 17 | @rules = grep { defined $codes{$_->{'id'}} } @rules; | |||
| 293 | 1 | 1 | my $code_index = 0; | |||
| 294 | 1 1 | 1 2 | my %defined_codes = map { $_->{'id'} => $code_index++ } @rules; | |||
| 295 | 1 1 | 1 2 | my @missing_codes = grep { !defined $defined_codes{$_}} keys %codes; | |||
| 296 | 1 | 1 | my $missing_rule_definition_index; | |||
| 297 | 1 | 1 | if (@missing_codes) { | |||
| 298 | 0 | 0 | push @rules, $missing_rule_definition_ref; | |||
| 299 | 0 | 0 | $missing_rule_definition_index = $defined_codes{$missing_rule_definition_id} = $code_index++; | |||
| 300 | 0 | 0 | for my $missing_code (@missing_codes) { | |||
| 301 | 0 | 0 | my $result_json = qq<{"ruleId": "$missing_rule_definition_id", $partialFingerprints "message": { "text": "$message" }, "locations": [ $locations_json_flat ] }>; | |||
| 302 | 0 | 0 | my $result = decode_json $result_json; | |||
| 303 | 0 0 | 0 0 | push @{$results}, $result; | |||
| 304 | } | |||||
| 305 | } | |||||
| 306 | 1 | 1 | $sarif{'runs'}[0]{'tool'}{'driver'}{'rules'} = \@rules; | |||
| 307 | 1 1 | 1 2 | for my $result_index (0 .. scalar @{$results}) { | |||
| 308 | 6 | 2 | my $result = $results->[$result_index]; | |||
| 309 | 6 | 1 | my $ruleId = $result->{'ruleId'}; | |||
| 310 | 6 | 12 | next if defined $ruleId && defined $defined_codes{$ruleId}; | |||
| 311 | 1 | 53 | $result->{'ruleIndex'} = $missing_rule_definition_index; | |||
| 312 | } | |||||
| 313 | } | |||||
| 314 | ||||||
| 315 | 1 | 3 | return encode_json \%sarif; | |||
| 316 | } | |||||
| 317 | ||||||
| 318 | 1; | |||||