~fgaz/nixpkgs-find-untrusted-git-hashes

844eb230385c678d897d787a8fb746dd8da27e5a — Francesco Gazzetta, 3 months ago
Switch to scraping

Should have done it from the start.
1 file changed, 7 insertions(+), 15 deletions(-)
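For context, the scraping approach this commit switches to boils down to fetching GitHub's branch_commits page for a hash and checking whether the response contains the spoofed-commit warning marker; the presence of that marker is treated as "this hash is not in this repo". A minimal sketch of that check, assuming curl is on PATH (the proc name hash_in_repo is illustrative and not part of the script; the real script wraps the call in try/on error, as seen further down):

proc hash_in_repo {repo hash} {
  # Fetch GitHub's branch_commits page for this commit. --fail makes exec
  # raise on HTTP errors, which the caller is expected to catch.
  set html [exec -ignorestderr curl --fail -s \
    https://github.com/$repo/branch_commits/$hash]
  # The page contains a js-spoofed-commit-warning-trigger element when the
  # commit does not belong to the repo, so its absence means "hash ∈ repo".
  return [expr {![string match *js-spoofed-commit-warning-trigger* $html]}]
}

# Usage (values are placeholders):
#   puts [hash_in_repo some-owner/some-repo <full-commit-hash>]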

M main.tcl +7 -15
@@ -12,13 +12,6 @@

package require json

if {[info exists env(GITHUB_TOKEN)]} then {
  set curl_args [list -H "Authorization: Bearer $env(GITHUB_TOKEN)"]
} else {
  puts stderr "WARNING: \$GITHUB_TOKEN not found, will perform unauthenticated requests."
  set curl_args ""
}

proc validate {repo hash} {
  global curl_args
  puts stderr "Validating $hash ∈ $repo"


@@ -26,15 +19,14 @@ proc validate {repo hash} {
    # Failed attempts:
    # * https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#get-a-commit
    #   Does not distinguish between commits in the upstream repo and forks
    set api_res [json::json2dict [exec -ignorestderr curl {*}$curl_args --fail -s --retry 5 --retry-max-time 30 --retry-all-errors https://api.github.com/search/commits?q=repo:$repo+hash:$hash]]
    #puts stderr $api_res
    if {[lindex [dict get $api_res items] 0] == ""} then {
    # * https://stackoverflow.com/a/44819657
    #   Does not return anything if the repo is a fork or if the commit belongs to a non-default branch
    # * https://stackoverflow.com/a/23970412
    #   May work... very... very... slowly...
    # IT'S SCRAPING TIME 😎
    set branch_commits_html [exec -ignorestderr curl --fail -s --retry 5 --retry-max-time 30 --retry-all-errors https://github.com/$repo/branch_commits/$hash]
    if {[string match *js-spoofed-commit-warning-trigger* $branch_commits_html]} then {
      puts "$hash ∉ $repo"
    } else {
      set actual_repo [dict get [lindex [dict get $api_res items] 0] repository full_name]
      if {$repo != $actual_repo} then {
        puts "$hash ∈ $actual_repo ≠ $repo"
      }
    }
  } on error err {
    puts "SKIPPING $repo $hash. Error: $err"