~fgaz/nixpkgs-find-untrusted-git-hashes

4ccaa8a34ec2c59058181b5b2416da313c89d065 — Francesco Gazzetta 3 months ago
🌅
2 files changed, 83 insertions(+), 0 deletions(-)

A find-gh-srcs.nix
A main.tcl
A  => find-gh-srcs.nix +19 -0
@@ 1,19 @@
# Evaluates to an attribute set that maps the name of each top-level attribute
# of `pkgs` whose `src` is hosted on github.com and pinned to a full commit
# hash to `{ repo = "<owner>/<repo>"; hash = "<rev>"; }`.
# Attributes for which that information cannot be extracted are left out.
{ pkgs ? import <nixpkgs> {} }:

let
  # Returns `{ repo; hash; }` for `pkg`. Throws (or fails an assertion) when
  # `pkg.src` lacks one of gitRepoUrl/owner/repo/rev, when the source is not
  # on github.com, or when `rev` is not a 40-digit lowercase hex string.
  getRepoAndHash = pkg:
    let
      repoUrl = pkg.src.gitRepoUrl or (throw "gitRepoUrl not found");
      repo = (pkg.src.owner or (throw "owner not found")) + "/" + (pkg.src.repo or (throw "repo not found"));
      rev = pkg.src.rev or (throw "rev not found");
    in
    # The trailing slash is required: without it, hosts that merely start
    # with "github.com" (github.company.com, github.com.example.org, ...)
    # would pass the check and later be queried against the github.com API.
    assert pkgs.lib.strings.hasPrefix "https://github.com/" repoUrl;
    # builtins.match only succeeds when the whole string matches, so anything
    # that is not exactly 40 lowercase hex digits fails this assertion.
    assert builtins.match "[0-9a-f]{40}" rev != null;
    # Force both strings now so that a `throw` hidden in them is raised while
    # still inside the caller's tryEval instead of when the result is used.
    builtins.seq repo (builtins.seq rev { inherit repo; hash = rev; });
  # TODO check all packages, not just top-level.
  # Is it possible to find all fetchFromGitHub calls? I vaguely remember someone
  # mentioning a script/expression that collects all sources for archiving purposes.
  # One tryEval result ({ success; value; }) per top-level attribute.
  allTries = builtins.mapAttrs (name: pkg: builtins.tryEval (getRepoAndHash pkg)) pkgs;
  # Keep only the attributes for which extraction succeeded...
  allSuccesses = pkgs.lib.filterAttrs (name: result: result.success) allTries;
  # ...and unwrap them to the plain `{ repo; hash; }` values.
  allGHSrcs = builtins.mapAttrs (name: result: result.value) allSuccesses;
in allGHSrcs

A  => main.tcl +64 -0
@@ 1,64 @@
#!/usr/bin/env tclsh

# SPDX-FileCopyrightText: Francesco Gazzetta <fgaz@fgaz.me>
# SPDX-License-Identifier: MIT

# Requires tcllib
# Usage: in the nixpkgs directory, run the script.
# Outputs all errors and mismatching sources to stdout
# and logs what is happening to stderr.
# You likely want to redirect stdout to a file and `tail --follow` it from
# another console.

package require json

# Extra command-line options handed to every curl invocation.
# Stays empty unless a token is available in the environment, in which case
# it carries the Authorization header.
set curl_args [list]
if {[info exists env(GITHUB_TOKEN)]} {
  lappend curl_args -H "Authorization: Bearer $env(GITHUB_TOKEN)"
} else {
  puts stderr "WARNING: \$GITHUB_TOKEN not found, will perform unauthenticated requests."
}

# Checks, through the GitHub commit search API, that commit $hash is found in
# repository $repo (given as "owner/name").
#
# A line is printed to stdout when the search returns no commit, when the
# first returned commit belongs to a different repository, or when the
# request or the parsing of its response fails. Progress and errors are also
# logged to stderr. Extra curl options are read from the global curl_args.
proc validate {repo hash} {
  global curl_args
  puts stderr "Validating $hash ∈ $repo"
  try {
    # Failed attempts:
    # * https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#get-a-commit
    #   Does not distinguish between commits in the upstream repo and forks
    set api_res [json::json2dict [exec -ignorestderr curl {*}$curl_args --fail -s --retry 5 --retry-max-time 30 --retry-all-errors https://api.github.com/search/commits?q=repo:$repo+hash:$hash]]
    #puts stderr $api_res
    # Only the first search result is inspected.
    set first_item [lindex [dict get $api_res items] 0]
    if {$first_item eq ""} {
      puts "$hash ∉ $repo"
    } else {
      set actual_repo [dict get $first_item repository full_name]
      # GitHub owner and repository names are case-insensitive, so a
      # difference in letter case alone is not a mismatch. Compare as
      # strings (not with the numeric-capable != operator) ignoring case.
      if {![string equal -nocase $repo $actual_repo]} {
        puts "$hash ∈ $actual_repo ≠ $repo"
      }
    }
  } on error err {
    # Best effort: report the failure on both streams and move on, so one
    # failing request does not abort the whole run.
    puts "SKIPPING $repo $hash. Error: $err"
    puts stderr "SKIPPING $repo $hash. Error: $err"
  }
}

puts stderr "VALIDATING RAW URLS"

# Collect every github.com/<owner>/<repo>/commit/<40-hex> occurrence in the
# files tracked by git in the current directory (-o: only the matching part,
# -h: without file names), one match per output line.
set raw_urls [exec git grep -Eoh {github.com/[^/]+/[^/]+/commit/[a-f0-9]{40}}]

# The output is split on newlines rather than iterated as a Tcl list: a match
# may contain whitespace or backslashes, which list parsing would split or
# rewrite. Each distinct URL is validated only once, to avoid spending API
# requests (and a one second pause) on repeated occurrences.
set seen_urls [dict create]
foreach url [split $raw_urls \n] {
  if {[dict exists $seen_urls $url]} {
    continue
  }
  dict set seen_urls $url 1
  # Skip lines that cannot be parsed instead of silently reusing the repo
  # and hash left over from the previous iteration.
  if {![regexp {github.com/([^/]+/[^/]+)/commit/([a-f0-9]{40})} $url _ repo hash]} {
    puts stderr "Ignoring unparsable match: $url"
    continue
  }
  validate $repo $hash
  # Pause between requests.
  after 1000
}

puts stderr "VALIDATING SRCS"

# find-gh-srcs.nix is looked up next to this script, while the package set it
# is applied to is imported from the current working directory.
set scriptdir [file normalize [file dirname [info script]]]
set nix_file $scriptdir/find-gh-srcs.nix
set apply_expr "f: f { pkgs = import [pwd] {}; }"
# Somehow nix-instantiate --eval doesn't work
set srcs_json [exec -ignorestderr nix eval --file $nix_file --apply $apply_expr --json]
set srcs [json::json2dict $srcs_json]

# Each entry is expected to hold a `repo` and a `hash` key; the attribute
# name itself is not needed for validation.
foreach name [dict keys $srcs] {
  set src [dict get $srcs $name]
  set src_repo [dict get $src repo]
  set src_hash [dict get $src hash]
  validate $src_repo $src_hash
  after 1000
}