~etalab/codegouvfr-consolidate-data

ref: e6b6a8e7ba5b5ad811bdbee8a05372be0e3f9547 codegouvfr-consolidate-data/src/utils.clj -rw-r--r-- 12.5 KiB
e6b6a8e7Bastien Guerry deps.edn: Bump datalevin version 4 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
;; Copyright (c) 2020, 2022 DINUM, Bastien Guerry <bastien.guerry@data.gouv.fr>
;; SPDX-License-Identifier: EPL-2.0
;; License-Filename: LICENSE

(ns utils
  (:require [jsonista.core :as json]
            [clojure.data.csv :as csv]
            [clojure.string :as string]
            [clojure.walk :as walk]
            [clojure.edn :as edn]
            [babashka.curl :as curl]
            [clj-yaml.core :as yaml]
            [java-time :as t]
            [hickory.core :as h]
            [hickory.select :as hs]
            [taoensso.timbre :as timbre]))

;; Runtime configuration, read once from the process environment.
;; The two interval settings go through the `Integer` constructor, so
;; loading this namespace fails when either variable is unset or is
;; not a valid integer.
(defonce env-vars
  (let [env     (fn [^String k] (System/getenv k))
        env-int (fn [^String k] (Integer. ^String (env k)))]
    {:gh-user             (env "CODEGOUVFR_GITHUB_USER")
     :gh-token            (env "CODEGOUVFR_GITHUB_ACCESS_TOKEN")
     :thread-interval     (env-int "CODEGOUVFR_GET_INTERVAL")
     :updating-after-days (env-int "CODEGOUVFR_DAYS_INTERVAL")}))

;; Remote resources known to this namespace, keyed by a short name.
;; In the visible part of this file only `:sources` (read by `sources`)
;; and `:emoji-json` (read by `emojis`) are dereferenced; the other
;; entries are presumably consumed by other namespaces.
(defonce urls
  {:sources    "https://git.sr.ht/~etalab/codegouvfr-sources/blob/master/comptes-organismes-publics.yml"
   :sill       "https://code.gouv.fr/data/sill-data.json"
   :libs       "https://code.gouv.fr/data/libraries/json/all.json"
   :repos      "https://code.gouv.fr/data/repositories/json/all.json"
   :orgas      "https://code.gouv.fr/data/organizations/json/all.json"
   :annuaire   "https://static.data.gouv.fr/resources/organisations-de-codegouvfr/20191011-110549/lannuaire.csv"
   :emoji-json "https://raw.githubusercontent.com/amio/emoji.json/master/emoji.json"})

;; The YAML document behind `(:sources urls)`, parsed with string keys
;; (`:keywords false`).  Fetched once, when the namespace is loaded.
;; When fetching or parsing throws, the error is logged and the var
;; holds whatever `timbre/error` returns instead of the parsed data.
(defonce sources
  (try
    (yaml/parse-string (slurp (:sources urls)) :keywords false)
    (catch Exception err
      (timbre/error "Error while fetching the list of organizations"
                    (.getMessage err)))))

;; Lookup tables, grouped by data set.  The keyword-to-keyword tables
;; give the short key used locally for each long source key; the two
;; string-to-string tables translate license names.
(defonce mappings
  {;; Mapping from libraries keywords to local short versions
   ;; NOTE(review): `:repo_url` and `:license` both map to `:l`; if this
   ;; table is used to rename keys, one value overwrites the other --
   ;; confirm which one is intended.
   :libs     {:description                        :d
              :latest_stable_release_published_at :u
              :repo_url                           :l
              :name                               :n
              :platform                           :t
              :is_repo                            :r?
              :license                            :l}
   ;; Mapping from sill keywords to local short versions
   ;; NOTE(review): `:comptoirDuLibreSoftwareProviders?` is the only
   ;; source key ending with `?` -- verify it matches the upstream field.
   :sill     {:sill_id                           :id
              :name                              :n
              :license                           :l
              :function                          :f
              :isFromFrenchPublicService         :fr
              :referencedSinceTime               :u
              :isPresentInSupportContract        :s
              :comptoirDuLibreSoftwareId         :cl
              :comptoirDuLibreSoftwareProviders? :clp
              :wikidataDataLogoUrl               :i
              :useCaseUrls                       :c
              :workshopUrls                      :w
              :agentWorkstation                  :a}
   ;; Mapping from papillon keywords to local short versions
   :papillon {:agencyName        :a
              :publicSector      :p
              :serviceName       :n
              :description       :d
              :serviceUrl        :l
              :softwareSillId    :i
              :comptoirDuLibreId :c}
   ;; Mapping from repositories keywords to local short versions
   :repos    {:last_update       :u
              :description       :d
              :is_archived       :a?
              :is_fork           :f?
              :is_esr            :e?
              :is_lib            :l?
              :is_contrib        :c?
              :language          :l
              :license           :li
              :name              :n
              :forks_count       :f
              :stars_count       :s
              :platform          :p
              :organization_name :o
              :reuses            :re
              :repository_url    :r}
   ;; Mapping from dependencies keywords to local short versions
   :deps     {:type         :t
              :name         :n
              :description  :d
              :repositories :r
              :updated      :u
              ;; FIXME: Unused yet?
              :repo_url     :ru
              :link         :l}
   ;; Mapping from groups/organizations keywords to local short versions
   :orgas    {:description        :d
              :location           :a
              :email              :e
              :name               :n
              :platform           :p
              :website            :h
              :is_verified        :v?
              :ministry           :m
              :annuaire           :an
              :floss_policy       :f
              :login              :l
              :creation_date      :c
              :repositories_count :r
              :organization_url   :o
              :avatar_url         :au}
   ;; Mapping from a license full name to the same name followed by its
   ;; short identifier in parentheses
   :licenses
   {"MIT License"                                                "MIT License (MIT)"
    "GNU Affero General Public License v3.0"                     "GNU Affero General Public License v3.0 (AGPL-3.0)"
    "GNU General Public License v3.0"                            "GNU General Public License v3.0 (GPL-3.0)"
    "GNU Lesser General Public License v2.1"                     "GNU Lesser General Public License v2.1 (LGPL-2.1)"
    "Apache License 2.0"                                         "Apache License 2.0 (Apache-2.0)"
    "GNU General Public License v2.0"                            "GNU General Public License v2.0 (GPL-2.0)"
    "GNU Lesser General Public License v3.0"                     "GNU Lesser General Public License v3.0 (LGPL-3.0)"
    "Mozilla Public License 2.0"                                 "Mozilla Public License 2.0 (MPL-2.0)"
    "Eclipse Public License 2.0"                                 "Eclipse Public License 2.0 (EPL-2.0)"
    "Eclipse Public License 1.0"                                 "Eclipse Public License 1.0 (EPL-1.0)"
    "BSD 3-Clause \"New\" or \"Revised\" License"                "BSD 3-Clause \"New\" or \"Revised\" License (BSD-3-Clause)"
    "European Union Public License 1.2"                          "European Union Public License 1.2 (EUPL-1.2)"
    "Creative Commons Attribution Share Alike 4.0 International" "Creative Commons Attribution Share Alike 4.0 International (CC-BY-SA-4.0)"
    "BSD 2-Clause \"Simplified\" License"                        "BSD 2-Clause \"Simplified\" License (BSD-2-Clause)"
    "The Unlicense"                                              "The Unlicense (Unlicense)"
    "Do What The Fuck You Want To Public License"                "Do What The Fuck You Want To Public License (WTFPL)"
    "Creative Commons Attribution 4.0 International"             "Creative Commons Attribution 4.0 International (CC-BY-4.0)"}
   ;; Mapping from a license full name to its short identifier alone.
   ;; Unlike `:licenses`, this table also has an "Other" entry.
   :licenses-spdx
   {"Other"                                                      "Other"
    "MIT License"                                                "MIT"
    "GNU Affero General Public License v3.0"                     "AGPL-3.0"
    "GNU General Public License v3.0"                            "GPL-3.0"
    "GNU Lesser General Public License v2.1"                     "LGPL-2.1"
    "Apache License 2.0"                                         "Apache-2.0"
    "GNU General Public License v2.0"                            "GPL-2.0"
    "GNU Lesser General Public License v3.0"                     "LGPL-3.0"
    "Mozilla Public License 2.0"                                 "MPL-2.0"
    "Eclipse Public License 2.0"                                 "EPL-2.0"
    "Eclipse Public License 1.0"                                 "EPL-1.0"
    "BSD 3-Clause \"New\" or \"Revised\" License"                "BSD-3-Clause"
    "European Union Public License 1.2"                          "EUPL-1.2"
    "Creative Commons Attribution Share Alike 4.0 International" "CC-BY-SA-4.0"
    "BSD 2-Clause \"Simplified\" License"                        "BSD-2-Clause"
    "The Unlicense"                                              "Unlicense"
    "Do What The Fuck You Want To Public License"                "WTFPL"
    "Creative Commons Attribution 4.0 International"             "CC-BY-4.0"}})

(defn- mean
  ;; FIXME: Get this from a standard library?
  "Return the arithmetic mean of the numbers in `xs` as a float.
  Return nil when `xs` is empty or nil, instead of dividing by zero."
  [xs]
  (when (seq xs)
    (float (/ (reduce + xs) (count xs)))))

(defn median
  ;; FIXME: Get this from a standard library?
  "Return the median of the numbers in `xs`, or nil when `xs` is empty
  or nil.

  With an odd number of elements the middle element of the sorted
  collection is returned as is; with an even number the mean of the
  two middle elements is returned (a float, see `mean`)."
  [xs]
  (when (seq xs)
    (let [sorted (sort xs)
          n      (count sorted)
          ;; `quot` keeps the index an integer: `(/ n 2)` yields a Ratio
          ;; whenever n is odd.
          mid    (quot n 2)]
      (if (odd? n)
        (nth sorted mid)
        (mean [(nth sorted (dec mid)) (nth sorted mid)])))))

(defn replace-vals
  "Walk the nested structure `m` bottom-up and return a copy in which
  every form equal to `v` has been replaced by `r`."
  [m v r]
  (letfn [(swap [form]
            (if (= form v) r form))]
    (walk/postwalk swap m)))

;; Base request options shared by every HTTP call made from this
;; namespace: extra raw arguments (`--connect-timeout 10`, presumably
;; forwarded to the curl binary by babashka.curl) and a User-Agent
;; header identifying the code.gouv.fr bot.
(def user-agent
  {:raw-args ["--connect-timeout" "10"]
   :headers  {"User-Agent" "https://code.gouv.fr bot (logiciels-libres@data.gouv.fr)"}})

;; Options for unauthenticated requests: the same map as `user-agent`.
(def default-parameters user-agent)

(def gh-parameters
  "Request options for authenticated GitHub calls: the shared
  `user-agent` options plus basic-auth credentials taken from
  `env-vars`."
  (let [{:keys [gh-user gh-token]} env-vars]
    (assoc user-agent :basic-auth [gh-user gh-token])))

(defn needs-updating?
  "Tell whether data last refreshed at `date-str` should be fetched again.

  Anything that is not a string (e.g. nil) always needs updating.
  Otherwise `date-str` is read as an instant, moved back by a random
  number of days below `(:updating-after-days env-vars)`, and the
  result is true when that shifted instant is older than the
  configured number of days."
  [date-str]
  (let [max-age (:updating-after-days env-vars)]
    (or (not (string? date-str))
        (let [shifted  (t/minus (t/instant date-str)
                                (t/days (rand-int max-age)))
              deadline (t/minus (t/instant) (t/days max-age))]
          (t/before? shifted deadline)))))

(defn get-contents
  "Return the contents found at `s` as a string, or nil on failure.

  `s` is a string.  When it contains \"https://\" it is fetched with
  curl, using `gh-parameters` (credentials included) only when it
  points at the api.github.com host and `default-parameters`
  otherwise.  Any other string is handed to `slurp`.

  Every call first sleeps for `(:thread-interval env-vars)`
  milliseconds.  Exceptions are logged and turned into nil.  For URLs
  only the body of a 200 response is returned: any other status
  yields nil rather than the raw response map, so callers always get
  a string or nil."
  [s]
  (Thread/sleep (:thread-interval env-vars))
  (let [url?    (re-find #"https://" s)
        ;; Anchored on the host so that credentials are never sent to a
        ;; URL that merely mentions api.github.com somewhere.
        gh-api? (and url? (re-find #"^https://api\.github\.com(/|$)" s))
        fetch   (fn [params]
                  (let [{:keys [status body]} (curl/get s params)]
                    (when (= status 200) body)))]
    (try
      (cond
        gh-api? (fetch gh-parameters)
        url?    (fetch default-parameters)
        :else   (slurp s))
      (catch Exception e
        (timbre/error
         (str "Error while getting contents for " s ":")
         (.getMessage e))
        nil))))

(defn- rows->maps
  "Turn parsed CSV data -- a sequence of rows whose first row is the
  header -- into a lazy sequence of maps, one per remaining row, keyed
  by the keywordized header cells."
  [csv]
  (let [[header & records] csv
        ks                 (map keyword header)]
    (for [record records]
      (zipmap ks record))))

(defn csv-url-to-map
  "Fetch the CSV document at `url` and return a sequence of maps, one
  per data row, keyed by the keywordized header cells.
  Return nil, after logging the error, when fetching or parsing fails."
  [url]
  (try
    ;; `doall` forces parsing to happen inside the `try`: the CSV reader
    ;; and `rows->maps` are lazy, so a parse error would otherwise be
    ;; thrown later, outside this handler.
    (doall (rows->maps (csv/read-csv (get-contents url))))
    (catch Exception e
      (timbre/error
       (str "Error while reading CSV from " url ":")
       (.getMessage e))
      nil)))

(defn json-parse-with-keywords
  "Parse the JSON document `s` and return the corresponding Clojure
  data, with object keys turned into keywords."
  [s]
  ;; Reuse jsonista's shared keyword-keys mapper instead of building a
  ;; new ObjectMapper on every call.
  (json/read-value s json/keyword-keys-object-mapper))

(defn get-contents-json-to-kwds
  "Fetch the contents of `s` with `get-contents` and parse them as JSON
  with keyword keys."
  [s]
  (-> s
      get-contents
      json-parse-with-keywords))

;; Lazy sequence of `{:char ... :name ...}` maps built from the JSON
;; document at `(:emoji-json urls)`.  Each name is rewritten as
;; ":name_with_underscores:" (spaces replaced by underscores, wrapped
;; in colons).
(defonce emojis
  (let [shortcode (fn [label]
                    (str ":" (string/replace label " " "_") ":"))]
    (for [entry (json-parse-with-keywords
                 (get-contents (:emoji-json urls)))]
      (-> entry
          (select-keys [:char :name])
          (update :name shortcode)))))

;;; Main functions to update repos

(defn get-contributing
  "Return a map telling whether a repository has a CONTRIBUTING.md file.

  The argument is a repository map.  When its `:contributing` entry is
  recent enough (see `needs-updating?`) that entry is returned as is.
  Otherwise CONTRIBUTING.md is fetched from the default branch
  (\"master\" when none is given) and the result is

    {:is_contrib? true|false|nil, :updated \"<current instant>\"}

  `:is_contrib?` is nil when nothing usable could be fetched: fetch
  failure, an HTML page instead of the raw file, or a platform other
  than GitHub, SourceHut and GitLab."
  [{:keys [platform organization_name name repository_url default_branch contributing]}]
  (timbre/info "Checking CONTRIBUTING.md for" repository_url)
  (if-not (needs-updating? (:updated contributing))
    contributing
    (let [path        (str (or default_branch "master") "/CONTRIBUTING.md")
          url         (condp = platform
                        "GitHub"    (format "https://raw.githubusercontent.com/%s/%s/%s"
                                            organization_name name path)
                        "SourceHut" (str repository_url "/blob/" path)
                        "GitLab"    (str repository_url "/-/raw/" path)
                        ;; Unknown platform: nothing to fetch, rather
                        ;; than letting `condp` throw.
                        nil)
          contents    (when url (get-contents url))
          ;; FIXME: Hack to circumvent cases when GitLab returns the Sign in page.
          ;; `re-find` on the start of the document is needed here:
          ;; `re-matches` only succeeds when the *whole* string is the
          ;; doctype, which never happens for a real page.
          contents-ok (and (string? contents)
                           (not (re-find #"(?i)^\s*<!DOCTYPE html" contents)))]
      {:is_contrib? (when contents-ok (boolean (seq contents)))
       :updated     (str (t/instant))})))

(defn get-reuses
  "Return a hash-map with reuse information.

  The argument is a repository map.  When its `:reuses` entry is
  recent enough (see `needs-updating?`) that entry is returned as is.
  Otherwise the result is `{:number n :updated \"<current instant>\"}`
  where, for GitHub repositories, n is the sum of the two counts read
  from the repository's \"/network/dependents\" page, and 0 for other
  platforms or when the page cannot be fetched."
  [{:keys [platform repository_url reuses]}]
  (if-not (needs-updating? (:updated reuses))
    reuses
    (let [updated        (str (t/instant))
          default_reuses {:number 0 :updated updated}
          ;; Read the number displayed by the element at position `idx`
          ;; among the \"btn-link\" elements, 0 when it cannot be found.
          ;; The pattern accepts comma-grouped numbers (\"1,234\"), which
          ;; a bare `\d+` would truncate to their first group.
          count-at       (fn [links idx]
                           (or (try
                                 (some-> (re-find #"\d[\d,]*"
                                                  (last (:content (nth links idx))))
                                         (string/replace "," "")
                                         edn/read-string)
                                 (catch Exception _ nil))
                               0))]
      (if-not (= platform "GitHub")
        ;; Don't try to fetch reuses for GitLab and SourceHut
        default_reuses
        (do
          (timbre/info "Getting dependents for" repository_url)
          (let [html (get-contents (str repository_url "/network/dependents"))]
            ;; Only a string can be parsed as HTML; anything else means
            ;; the page could not be fetched.
            (if-not (string? html)
              default_reuses
              (let [btn-links (->> html
                                   h/parse
                                   h/as-hickory
                                   (hs/select (hs/class "btn-link")))]
                {:number  (+ (count-at btn-links 1)
                             (count-at btn-links 2))
                 :updated updated}))))))))