~ihabunek/triglav

c33cf3d512455de7d30f92ebfc02d50fb9611ce5 — Ivan Habunek 6 months ago 113d8b3 master
Rework posta poi loader
3 files changed, 126 insertions(+), 102 deletions(-)

M lib/triglav/poi/sources/posta.ex
M mix.exs
M mix.lock
M lib/triglav/poi/sources/posta.ex => lib/triglav/poi/sources/posta.ex +124 -102
@@ 1,6 1,9 @@
defmodule Triglav.Poi.Sources.Posta do
  @moduledoc """
  Loads and parses post office data from posta.hr.

  NB: There are some XML exports, but they don't include geo coordinates, so we
  read the website instead. See:
  https://www.posta.hr/preuzimanje-podataka-o-postanskim-uredima-6543/6543

  TODO: Add city (and place?) to tags.


@@ 15,9 18,10 @@ defmodule Triglav.Poi.Sources.Posta do

  import Ecto.Query
  import Geo.PostGIS
  import SweetXml
  import Triglav.Query

  require Logger

  plug Tesla.Middleware.Retry,
    delay: :timer.seconds(1),
    max_retries: 5,


@@ 40,12 44,8 @@ defmodule Triglav.Poi.Sources.Posta do

  @impl Source
  def fetch() do
    source = Triglav.Poi.Sources.posta()
    response = get!("https://www.posta.hr/postanskiurediRh_rv.aspx?vrsta=xml")
    coordinates = fetch_coordinates()
    nodes = stream_tags!(response.body, :ured) |> Enum.map(&parse_node(&1, source, coordinates))

    {:ok, nodes}
    response = get!("https://www.posta.hr/interaktivna-karta-postanskih-ureda")
    {:ok, parse_map(response.body)}
  end

  @impl Source


@@ 90,72 90,6 @@ defmodule Triglav.Poi.Sources.Posta do
    |> Repo.all()
  end

  # Builds the POI attribute map for a single `ured` element streamed from the
  # XML export. `coordinates` is looked up by post code and must contain it
  # (Map.fetch!/2 raises otherwise).
  defp parse_node({:ured, ured}, source, coordinates) do
    post_code = get_text(ured, "brojPu")
    official_name = ured |> get_text("nazivPu") |> StringUtils.title_case()
    {lat, lng} = Map.fetch!(coordinates, post_code)
    {street, housenumber} = parse_address(ured)

    source_id = source.id
    display_name = "#{post_code} #{official_name}"

    # The point is stored as {lng, lat}, the reverse of the lookup tuple.
    geometry = %Geo.Point{coordinates: {lng, lat}, srid: 4326}

    tags =
      MapUtils.remove_blank(%{
        "addr:postcode": post_code,
        "addr:street": street,
        "addr:housenumber": housenumber,
        official_name: official_name,
        name: display_name,
        amenity: "post_office",
        opening_hours: parse_opening_hours(ured)
      })

    %{
      source_id: source_id,
      name: display_name,
      ref: post_code,
      geometry: geometry,
      tags: tags
    }
  end

  # Splits the `adresa` text of an `ured` element into `{street, housenumber}`.
  # `housenumber` is nil when the first address part does not end in a number
  # (or "bb").
  defp parse_address(ured) do
    # Drop notes in parentheses before splitting.
    cleaned =
      ured
      |> get_text("adresa")
      |> String.replace(~r"\(.+\)", "")
      |> String.trim()

    parts = cleaned |> String.split(",") |> Enum.map(&String.trim/1)

    # Skip TC references (Trgovački centar) and keep the first remaining part.
    [first | _] = Enum.reject(parts, &String.starts_with?(&1, "TC "))

    # Glue a detached house number suffix: "1 A" -> "1A", "7/b" -> "7b"
    joined = String.replace(first, ~r" (\d+)[\s/]+([a-z])$"i, " \\1\\2")

    case Regex.run(~r"^(.+) (\d+[a-z]*|bb)$"i, joined) do
      nil -> {joined, nil}
      [_, street, housenumber] -> {street, String.downcase(housenumber)}
    end
  end

  # Returns the text of the direct child `name` of `element`, or nil when that
  # text is the empty string.
  defp get_text(element, name) do
    text = xpath(element, ~x"./#{name}/text()"s)
    if text == "", do: nil, else: text
  end

  # Collects the seven per-day time spans (pon..ned, read from the
  # `RV_<day>_od` / `RV_<day>_do` children) and formats them as one
  # opening hours string.
  defp parse_opening_hours(ured) do
    ~w(pon uto sri cet pet sub ned)
    |> Enum.map(fn day -> get_time_span(ured, "RV_#{day}_od", "RV_#{day}_do") end)
    |> format_opening_hours()
  end

  defp format_opening_hours(time_spans) do
    time_spans
    |> Enum.with_index()


@@ 170,9 104,9 @@ defmodule Triglav.Poi.Sources.Posta do
      end
    end)
    |> Enum.reverse()
    |> Enum.map(fn {time_span, days} ->
    |> Enum.map(fn {{open, close}, days} ->
      day_span = format_day_span(List.last(days), List.first(days))
      "#{day_span} #{time_span}"
      "#{day_span} #{open}-#{close}"
    end)
    |> Enum.join("; ")
    |> String.replace("Mo-Su 7:00-24:00", "24/7")


@@ 189,39 123,127 @@ defmodule Triglav.Poi.Sources.Posta do
  defp format_day(5), do: "Sa"
  defp format_day(6), do: "Su"

  def get_time_span(element, name1, name2) do
    value1 = get_text(element, name1)
    value2 = get_text(element, name2)
  @doc """
  Parses the HTML of the interactive map page into a list of node maps.

  Location types and coordinates are extracted from the page first and then
  combined with each entry's content. Logs the number of nodes found.
  """
  def parse_map(html) do
    Logger.info("Parsing post offices")

    nodes = parse_content(html, parse_types(html), parse_coordinates(html))

    Logger.info("Found #{length(nodes)} post offices")
    nodes
  end

    cond do
      is_nil(value1) and is_nil(value2) -> nil
      not is_nil(value1) and not is_nil(value2) -> "#{value1}-#{value2}"
  # Extracts the location type of every map entry from the page's inline
  # JavaScript (`vrsta[<index>] = '<type>';`).
  #
  # Returns a map of integer entry index => type atom. The type codes come from
  # scraped markup, so they are translated through a closed set of known atoms
  # rather than `String.to_atom/1`: atoms are never garbage collected and must
  # not be created from external input. Unrecognised codes map to `:unknown`.
  defp parse_types(html) do
    ~r"vrsta\[(\d+)\] = '(\w+)';"
    |> Regex.scan(html)
    |> Map.new(fn [_, id, type] -> {to_integer(id), type_atom(type)} end)
  end

  # Known type codes; these mirror the clauses of amenity/1.
  defp type_atom("pu"), do: :pu
  defp type_atom("kov"), do: :kov
  defp type_atom("pak"), do: :pak

  # Anything else is reported and collapsed into a single atom, so a changed
  # page cannot grow the atom table.
  defp type_atom(other) do
    Logger.warning("Unknown post office type: #{other}")
    :unknown
  end

  # Reads the `var neighborhoods = [...]` JavaScript array from the page and
  # returns a map of zero-based position => tuple of the floats found inside
  # each `new google.maps.LatLng(...)` call.
  # Raises MatchError when the array is not present in `html`.
  defp parse_coordinates(html) do
    [_, neighborhoods] = Regex.run(~r/var neighborhoods = \[([^\n]+)\];/, html)

    ~r"new google.maps.LatLng\(([^)]+)\)"
    |> Regex.scan(neighborhoods)
    |> Enum.map(fn [_call, pair] ->
      pair
      |> String.trim()
      |> String.split(",")
      |> Enum.map(&to_float/1)
      |> List.to_tuple()
    end)
    |> Enum.with_index()
    |> Map.new(fn {point, position} -> {position, point} end)
  end

  defp fetch_coordinates() do
    response = get!("https://www.posta.hr/mapahp.aspx?lng=_hr")
    [_line, match] = Regex.run(~r/var neighborhoods = \[([^\n]+)\];/, response.body)

    coordinates =
      for [_line, latlng] <- Regex.scan(~r"new google.maps.LatLng\(([^)]+)\)", match) do
        latlng
        |> String.trim()
        |> String.split(",")
        |> Enum.map(&to_float/1)
        |> List.to_tuple()
      end
  defp parse_content(html, types, coordinates) do
    source = Triglav.Poi.Sources.posta()

    post_codes =
      Regex.scan(~r"content\[(\d+)\] = '(.+)';"U, response.body)
      |> Enum.filter(fn [_, _, content] -> content =~ "POŠTANSKI URED" end)
      |> Enum.map(fn [_line, _index, content] ->
        pattern = ~r'<div class="cloud"><h1>POŠTANSKI URED<br /><br />(\d+) '
        [_, post_code] = Regex.run(pattern, content)
        post_code
      end)
    for [_, index, content] <- Regex.scan(~r"content\[(\d+)\] = '(.+)';"U, html) do
      doc = Floki.parse_fragment!(content)
      index = to_integer(index)
      amenity = Map.get(types, index) |> amenity()
      {street, housenumber} = parse_address(doc)

      [post_code_div] = Floki.find(doc, ".cl-postanski")
      [post_code, name] = Floki.text(post_code_div) |> String.split(" ", parts: 2)
      name = StringUtils.title_case(name) |> String.trim()

      # Set ref for post office
      ref = if(amenity == "post_office", do: post_code)

      hours =
        with [hours_div] <- Floki.find(doc, ".rv"),
             children <- Floki.children(hours_div),
             {_, _, ["Radno vrijeme:"]} <- List.first(children) do
          children
          |> Enum.filter(&is_binary/1)
          |> Enum.map(&parse_hours/1)
          |> format_opening_hours()
        else
          _ -> nil
        end

      {lat, lng} = Map.get(coordinates, index)

      %{
        source_id: source.id,
        name: "#{post_code} #{name}",
        ref: ref,
        geometry: %Geo.Point{coordinates: {lng, lat}, srid: 4326},
        tags:
          MapUtils.remove_blank(%{
            "addr:postcode": post_code,
            "addr:street": street,
            "addr:housenumber": housenumber,
            official_name: name,
            name: "#{post_code} #{name}",
            amenity: amenity,
            opening_hours: hours
          })
      }
    end
  end

  # Splits the text of the `.cl-adresa` element(s) in `doc` into
  # `{street, housenumber}`. `housenumber` is nil when the first address part
  # does not end in a number (or "bb").
  defp parse_address(doc) do
    raw = doc |> Floki.find(".cl-adresa") |> Floki.text()

    # Drop notes in parentheses before splitting.
    cleaned = raw |> String.replace(~r"\(.+\)", "") |> String.trim()

    parts = cleaned |> String.split(",") |> Enum.map(&String.trim/1)

    # Skip TC references (Trgovački centar) and keep the first remaining part.
    [first | _] = Enum.reject(parts, &String.starts_with?(&1, "TC "))

    # Glue a detached house number suffix: "1 A" -> "1A", "7/b" -> "7b"
    joined = String.replace(first, ~r" (\d+)[\s/]+([a-z])$"i, " \\1\\2")

    case Regex.run(~r"^(.+) (\d+[a-z]*|bb)$"i, joined) do
      nil -> {joined, nil}
      [_, street, housenumber] -> {street, String.downcase(housenumber)}
    end
  end

  # Maps a location type atom (the value looked up from the `types` map in
  # parse_content/3) to the string stored under the `amenity` tag.
  # NOTE(review): presumably :pu = post office ("poštanski ured"), :kov = post
  # box and :pak = parcel locker — confirm against the posta.hr map source.
  # There is no catch-all clause here, so any other value (including nil for
  # an index missing from `types`) raises FunctionClauseError.
  defp amenity(:pu), do: "post_office"
  defp amenity(:kov), do: "post_box"
  defp amenity(:pak), do: "parcel_locker"

  # Turns one "open-close" text fragment into an `{open, close}` tuple.
  # Surrounding whitespace and then leading/trailing dashes are stripped first;
  # anything that does not split into exactly two parts yields nil.
  defp parse_hours(hours) do
    stripped = hours |> String.trim() |> String.trim("-")

    case String.split(stripped, "-") do
      [open, close] -> {open, close}
      _ -> nil
    end
  end

    Enum.zip(post_codes, coordinates) |> Map.new()
  # Strict string -> integer conversion: the whole string must be consumed by
  # Integer.parse/1, otherwise the match fails and MatchError is raised.
  defp to_integer(str) do
    {value, ""} = Integer.parse(str)
    value
  end

  defp to_float(string) do

M mix.exs => mix.exs +1 -0
@@ 43,6 43,7 @@ defmodule Triglav.MixProject do
      {:ecto_psql_extras, "~> 0.6"},
      {:ecto_sql, "~> 3.4"},
      {:esbuild, "~> 0.4", runtime: Mix.env() == :dev},
      {:floki, "~> 0.36.2"},
      {:geo, "~> 3.0"},
      {:geo_postgis, "~> 3.1"},
      {:hackney, "~> 1.18"},

M mix.lock => mix.lock +1 -0
@@ 21,6 21,7 @@
  "esbuild": {:hex, :esbuild, "0.8.1", "0cbf919f0eccb136d2eeef0df49c4acf55336de864e63594adcea3814f3edf41", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "25fc876a67c13cb0a776e7b5d7974851556baeda2085296c14ab48555ea7560f"},
  "file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
  "finch": {:hex, :finch, "0.12.0", "6bbb3e0bb62dd91cd1217d9682a30f5bfc9b0b74950bf10a0b4d4399c2076892", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "320da3f32459e7dcb77f4271b4f2445ba6c5d32cc3c7cca8e2cff599e24be5a6"},
  "floki": {:hex, :floki, "0.36.2", "a7da0193538c93f937714a6704369711998a51a6164a222d710ebd54020aa7a3", [:mix], [], "hexpm", "a8766c0bc92f074e5cb36c4f9961982eda84c5d2b8e979ca67f5c268ec8ed580"},
  "geo": {:hex, :geo, "3.6.0", "00c9c6338579f67e91cd5950af4ae2eb25cdce0c3398718c232539f61625d0bd", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm", "1dbdebf617183b54bc3c8ad7a36531a9a76ada8ca93f75f573b0ae94006168da"},
  "geo_postgis": {:hex, :geo_postgis, "3.6.0", "dbb7874ac04515235bc8e018a3bf4b1b6f3833a870b3371cfa81c983152969f2", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:geo, "~> 3.6", [hex: :geo, repo: "hexpm", optional: false]}, {:jason, "~> 1.2", [hex: :jason, repo: "hexpm", optional: true]}, {:poison, "~> 2.2 or ~> 3.0 or ~> 4.0 or ~> 5.0", [hex: :poison, repo: "hexpm", optional: true]}, {:postgrex, ">= 0.0.0", [hex: :postgrex, repo: "hexpm", optional: false]}], "hexpm", "0fa927b52a2bb17ad8526657ece025867277148c9206c87e165c85a57e56f5f9"},
  "gettext": {:hex, :gettext, "0.18.2", "7df3ea191bb56c0309c00a783334b288d08a879f53a7014341284635850a6e55", [:mix], [], "hexpm", "f9f537b13d4fdd30f3039d33cb80144c3aa1f8d9698e47d7bcbcc8df93b1f5c5"},