~jomco/straatnaam

d1e9adeae075af3e4dc9e0d6ec49bd991b609851 — Remco van 't Veer 1 year, 2 months ago b3826ff
Only accept datasets which are larger than the original/current

To avoid getting downgraded.
3 files changed, 56 insertions(+), 49 deletions(-)

M src/straatnaam/data.clj
M src/straatnaam/sanity.clj
M test/straatnaam/data_test.clj
M src/straatnaam/data.clj => src/straatnaam/data.clj +10 -1
@@ 50,10 50,19 @@
                             SELECT * FROM " sn ".verblijfsobject_pand")])
  (log/info "version activated:" sn))

(defn- current-sn
  "Look up the schema name of the currently exposed version, i.e. the
  schema of the most recently finished sync in bag_syncs that is
  flagged ok.  Yields nil when no such sync is recorded."
  [db]
  (let [rows (sql/query db "SELECT schema FROM bag_syncs WHERE ok ORDER BY finished_at DESC LIMIT 1")]
    (:schema (first rows))))

(defn version-ok?
  "Returns true when version passes sanity checks."
  [db sn]
  (and (sanity/count-stats-ok? db sn)
  (and (sanity/count-stats-ok? db sn (current-sn db))
       (sanity/common-addresses-available? db sn)))

(defn versions

M src/straatnaam/sanity.clj => src/straatnaam/sanity.clj +39 -43
@@ 23,68 23,64 @@

(defn ratio-vals
  "Return map of ratios between values in given maps a and b.  Missing
  entry in one of the maps is replaced by zero values yielding either
  0 or (-)infinity."
  entry in one of the maps is omitted."
  [a b]
  (reduce (fn [m k]
            (let [a (get a k 0)
                  b (get b k 0)]
              (assoc m k (if (zero? a)
                           (if (neg? b)
                             Double/NEGATIVE_INFINITY
                             Double/POSITIVE_INFINITY)
                           (/ b a)))))
            (let [a (get a k)
                  b (get b k)]
              (if (and a b)
                (assoc m k (/ b a))
                m)))
          {}
          (set (concat (keys a)
                       (keys b)))))

(defn count-by-object-type
  "Counts the rows of the bag table in schema sn per object_type and
  returns the result as a map of object_type to count."
  [db sn]
  (into {}
        ;; each result row becomes an [object_type n] map entry
        (map (juxt :object_type :n))
        (sql/query db [(str "SELECT COUNT(*) AS n, object_type
                               FROM " sn ".bag
                               GROUP BY object_type")])))
;; Baseline row counts keyed by table name.  count-stats-ok? falls
;; back to these when it has no counts for a current version, and
;; count-by-table uses the keys to decide which tables to count.
;; Values based on import of 2023-01-08.  Overridden by tests.
(def ^:dynamic *baseline-count-by-table*
  {"ligplaats"        17985
   "nummeraanduiding" 12187041
   "pand"             20845820
   "standplaats"      54179
   "verblijfsobject"  21684895
   "woonplaats"       3968})

(defn- table-or-view-exists?
  "Returns true when the database metadata lists at least one table
  or view matching name, optionally restricted to schema sn.  When sn
  is omitted, nil is passed to getTables as the schema pattern.

  The ResultSet returned by getTables is closed before returning, so
  it is not left open when the underlying connection outlives this
  call."
  [db name & [sn]]
  (sql/with-db-metadata [md db]
    ;; catalog and types are nil: no catalog restriction, any table type
    (with-open [rs (.getTables md nil sn name nil)]
      (.next rs))))

(defn virgin?
  "Returns true when there is no usable current version: either the
  public.bag table/view does not exist, or it exists but holds no
  rows.  Returns false when public.bag contains data."
  [db]
  (if (table-or-view-exists? db "bag" "public")
    (let [row (first (sql/query db "SELECT COUNT(*) AS n FROM public.bag"))]
      (= (:n row) 0))
    true))

;; Baseline counts keyed by object_type of the bag table.
;; count-stats-ok? compares against these when virgin? reports that
;; there is no current version in public to compare with.
;; Values based on import of 2021-09-14.  Overridden by tests.
(def ^:dynamic *baseline-count-by-object-type*
  {"ligplaats" 13000, "standplaats" 30000, "verblijfsobject" 9537000})
(defn count-by-table
  "Counts the rows of every table named in *baseline-count-by-table*
  that exists in schema sn.  Returns a map of table name to row
  count; tables which do not exist in sn are left out of the map."
  [db sn]
  (let [count-rows (fn [table]
                     (let [rows (sql/query db (str "SELECT COUNT(*) AS n FROM " sn "." table))]
                       (:n (first rows))))]
    (into {}
          ;; keep drops the nil produced for tables absent from sn
          (keep (fn [table]
                  (when (table-or-view-exists? db table sn)
                    [table (count-rows table)])))
          (keys *baseline-count-by-table*))))

(defn count-stats-ok?
  "Are the count stats of the given version sn good compared to the
  current version in public.  Return true if the object_types counts
  in the bag table differs between -5% and 10% percent from the
  current version or between -10% and 20% percent from the hardcoded
  baseline values.  Bad stats are logged."
  [db sn]
  (let [[pred
         baseline] [#(< 0.90 (val %) 1.20)
                    (if (virgin? db)
                      *baseline-count-by-object-type*
                      (count-by-object-type db "public"))]
        new        (count-by-object-type db sn)
        bad-ratios (remove pred (ratio-vals baseline new))]
  current version in public.  Return true if the table counts differ
  between 0% and 20% percent from the current version or from the
  hardcoded baseline values.  Tables which are not in both schemas are
  not considered to allow the introduction of extra imported data in
  newer versions.  Bad stats are logged."
  [db new-sn current-sn]
  (let [current    (when current-sn (count-by-table db current-sn))
        current    (if (seq current) current *baseline-count-by-table*)
        new        (count-by-table db new-sn)
        bad-ratios (remove #(<= 1 (val %) 1.20) (ratio-vals current new))]
    (if (seq bad-ratios)
      (do
        (log/info "difference too big for:"
                  (string/join ", "
                               (map #(let [[k] %]
                                       (str k " (" (get baseline k)
                                            " => " (get new k 0) ")"))
                                       (str k " (" (get current k)
                                            " => " (get new k) ")"))
                                    bad-ratios)))
        false)
      true)))

M test/straatnaam/data_test.clj => test/straatnaam/data_test.clj +7 -5
@@ 58,11 58,13 @@
  (compose-fixtures
   test-db/each-fixture
   (fn [f]
     (binding [sanity/*baseline-count-by-object-type* {"ligplaats"       1
                                                       "standplaats"     1
                                                       "verblijfsobject" 3}
               sanity/*common-addresses*              [["1011AB" 105
                                                        "De Ruijterkade" "Amsterdam"]]]
     (binding [sanity/*baseline-count-by-table* {"ligplaats"        1
                                                 "nummeraanduiding" 5
                                                 "verblijfsobject"  1
                                                 "standplaats"      1
                                                 "pand"             1
                                                 "woonplaats"       3}
               sanity/*common-addresses*        [["1011AB" 105 "De Ruijterkade" "Amsterdam"]]]
       (let [srv (run-jetty handler {:port test-port, :join? false})]
         (try
           (f)