~etalab/codegouvfr-consolidate-data

ref: e6b6a8e7ba5b5ad811bdbee8a05372be0e3f9547 codegouvfr-consolidate-data/src/stats.clj -rw-r--r-- 4.3 KiB
e6b6a8e7Bastien Guerry deps.edn: Bump datalevin version 4 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
;; Copyright (c) 2022 DINUM, Bastien Guerry <bastien.guerry@data.gouv.fr>
;; SPDX-License-Identifier: EPL-2.0
;; License-Filename: LICENSE

(ns stats
  (:require [clojure.string :as string]
            [utils :as utils]
            [jsonista.core :as json]))

(defn- top-orgas-by-repos
  "Return at most 10 [label repositories-count] pairs for the
  organizations in ORGAS with the highest :repositories_count, largest
  first.  The label is the organization :login followed by its
  :platform in parentheses."
  [orgas]
  ;; Ascending sort followed by `reverse` (rather than a descending
  ;; comparator) so that organizations with equal counts keep the same
  ;; relative order as before.
  (let [ranked (reverse (sort-by :repositories_count orgas))]
    (for [{:keys [login platform repositories_count]} (take 10 ranked)]
      [(str login " (" platform ")") repositories_count])))

(defn- top-orgas-by-stars
  "Return at most 10 [label stars] pairs for the organizations with the
  highest total stars count, largest first.

  REPOS is a collection of maps read through :organization_name,
  :repository_url and :stars_count.  Repositories are grouped by
  :organization_name and their :stars_count are summed; repositories
  with no :stars_count contribute nothing to the sum.

  The label is the organization name followed by a platform name in
  parentheses.  The platform is guessed from the :repository_url of the
  first repository of each group: GitHub, SourceHut, or GitLab for
  anything else (including a missing URL)."
  [repos]
  (let [platform-of
        (fn [rurl]
          ;; `str` turns a missing URL into \"\" so `re-find` cannot
          ;; throw.  Patterns are anchored on the host, with escaped
          ;; dots, so a URL merely containing a forge URL somewhere in
          ;; its path or query is not misclassified.
          (let [url (str rurl)]
            (cond (re-find #"^https://github\.com(?:/|$)" url) "GitHub"
                  (re-find #"^https://git\.sr\.ht(?:/|$)" url)  "SourceHut"
                  :else                                         "GitLab")))]
    (->> repos
         (group-by :organization_name)
         (map (fn [[orga orga-repos]]
                {:orga     orga
                 :platform (platform-of (:repository_url (first orga-repos)))
                 ;; `keep` drops nil counts, which `+` would reject.
                 :stars    (reduce + (keep :stars_count orga-repos))}))
         ;; Ascending sort then `reverse`, so ties keep their former order.
         (sort-by :stars)
         reverse
         (take 10)
         (map (fn [{:keys [orga stars platform]}]
                [(str orga " (" platform ")") stars])))))

(defn top-licenses
  "Return at most 10 [license repositories-count] pairs for the licenses
  used by the most repositories in REPOS, most used first.
  Repositories whose :license is nil or empty are ignored."
  [repos]
  (let [licensed   (filter (comp not-empty :license) repos)
        by-license (group-by :license licensed)
        counted    (for [[license members] by-license]
                     [license (count members)])]
    ;; Ascending sort then `reverse`, so ties keep their former order.
    (take 10 (reverse (sort-by second counted)))))

(defn- top-languages
  "Return at most 10 [language repositories-count] pairs for the
  languages used by the most repositories in REPOS, most used first.
  Repositories whose :language is nil or empty are ignored."
  [repos]
  (let [languages (filter not-empty (map :language repos))
        ;; language -> number of repositories using it
        counts    (frequencies languages)]
    ;; Ascending sort then `reverse`, so ties keep their former order;
    ;; `vec` turns each map entry into a plain [language count] vector.
    (->> counts
         (sort-by val)
         reverse
         (take 10)
         (map vec))))

(defn- top-forges
  "Return at most 10 [host repositories-count] pairs for the hosts with
  the most repositories, largest first.  The host is the part of each
  :organization_url that follows https:// up to the next slash, and the
  count is the sum of :repositories_count over the organizations in
  ORGAS sharing that host.  URLs not starting with https:// are grouped
  under a nil host."
  [orgas]
  (let [host-of (fn [orga]
                  (last (re-find #"^https://([^/]+)" (:organization_url orga))))
        by-host (group-by host-of orgas)
        totals  (for [[host members] by-host]
                  [host (reduce + (map :repositories_count members))])]
    ;; Ascending sort then `reverse`, so ties keep their former order.
    (take 10 (reverse (sort-by last totals)))))

(defn- top-ministries
  "Return at most 10 [ministry repositories-count] pairs for the
  ministries with the most repositories, largest first.  The count is
  the sum of :repositories_count over the organizations in ORGAS
  sharing the same :ministry; organizations whose :ministry is nil or
  empty are ignored."
  [orgas]
  (let [attached (filter (comp not-empty :ministry) orgas)
        totals   (map (fn [[ministry members]]
                        [ministry (reduce + (map :repositories_count members))])
                      (group-by :ministry attached))]
    ;; Ascending sort then `reverse`, so ties keep their former order.
    (->> totals
         (sort-by peek)
         reverse
         (take 10))))

(defn- top-topics
  "Return at most 10 [topic occurrences] pairs for the topics appearing
  most often in REPOS, most frequent first.  The :topics value of each
  repository is split on commas; repositories whose :topics is nil or
  empty are ignored."
  [repos]
  (let [topics (mapcat #(string/split % #",")
                       (keep (comp not-empty :topics) repos))
        ;; topic -> number of occurrences
        counts (frequencies topics)]
    ;; Ascending sort then `reverse`, so ties keep their former order.
    (take 10
          (reverse
           (for [[topic occurrences] (sort-by val counts)]
             [topic occurrences])))))

(defn- mean_repos_by_orga
  "Return the average number of repositories per organization, as a
  string with two decimals.

  ORGAS is a collection of maps read through :repositories_count.
  When ORGAS is empty the average is reported as zero instead of
  dividing by zero."
  [orgas]
  (let [orgas-cnt (count orgas)
        mean      (if (zero? orgas-cnt)
                    ;; No organization: avoid ArithmeticException on (/ 0 0).
                    0
                    (/ (reduce + (map :repositories_count orgas))
                       orgas-cnt))]
    (format "%.2f" (float mean))))

(defn- median_repos_by_orga
  "Return the median of the :repositories_count values of ORGAS, as
  computed by `utils/median` and coerced to an integer."
  [orgas]
  (->> orgas
       (map :repositories_count)
       utils/median
       int))

(defn generate-stats-json
  "Compute summary statistics and write them, serialized as JSON, to
  the file stats.json (a relative path).

  The sizes of REPOS, ORGAS, LIBS, DEPS, SILL and PAPILLON are stored
  as counters; the median, the average and the top lists are computed
  from REPOS and ORGAS only."
  [repos orgas libs deps sill papillon]
  (->> {:repos_cnt         (count repos)
        :orgas_cnt         (count orgas)
        :libs_cnt          (count libs)
        :deps_cnt          (count deps)
        :sill_cnt          (count sill)
        :papillon_cnt      (count papillon)
        :median_repos_cnt  (median_repos_by_orga orgas)
        :avg_repos_cnt     (mean_repos_by_orga orgas)
        :top_orgs_by_repos (top-orgas-by-repos orgas)
        :top_orgs_by_stars (top-orgas-by-stars repos)
        :top_licenses      (top-licenses repos)
        :top_languages     (top-languages repos)
        :top_topics        (top-topics repos)
        :top_forges        (top-forges orgas)
        :top_ministries    (top-ministries orgas)}
       json/write-value-as-string
       (spit "stats.json")))