M lib/triglav/poi/sources/posta.ex => lib/triglav/poi/sources/posta.ex +124 -102
@@ 1,6 1,9 @@
defmodule Triglav.Poi.Sources.Posta do
@moduledoc """
Loads and parses post office data from posta.hr.
+
+ NB: There are some XML exports, but they don't include geo coordinates, so we
+ read the website instead. See:
https://www.posta.hr/preuzimanje-podataka-o-postanskim-uredima-6543/6543
TODO: Add city (and place?) to tags.
@@ 15,9 18,10 @@ defmodule Triglav.Poi.Sources.Posta do
import Ecto.Query
import Geo.PostGIS
- import SweetXml
import Triglav.Query
+ require Logger
+
plug Tesla.Middleware.Retry,
delay: :timer.seconds(1),
max_retries: 5,
@@ 40,12 44,8 @@ defmodule Triglav.Poi.Sources.Posta do
@impl Source
def fetch() do
- source = Triglav.Poi.Sources.posta()
- response = get!("https://www.posta.hr/postanskiurediRh_rv.aspx?vrsta=xml")
- coordinates = fetch_coordinates()
- nodes = stream_tags!(response.body, :ured) |> Enum.map(&parse_node(&1, source, coordinates))
-
- {:ok, nodes}
+   # Download the interactive map page and hand its body to parse_map/1, which
+   # extracts the nodes from it.
+   %{body: page} = get!("https://www.posta.hr/interaktivna-karta-postanskih-ureda")
+   {:ok, parse_map(page)}
end
@impl Source
@@ 90,72 90,6 @@ defmodule Triglav.Poi.Sources.Posta do
|> Repo.all()
end
- defp parse_node({:ured, ured}, source, coordinates) do
- post_code = get_text(ured, "brojPu")
- official_name = get_text(ured, "nazivPu") |> StringUtils.title_case()
- {lat, lng} = Map.fetch!(coordinates, post_code)
- {street, housenumber} = parse_address(ured)
-
- %{
- source_id: source.id,
- name: "#{post_code} #{official_name}",
- ref: post_code,
- geometry: %Geo.Point{coordinates: {lng, lat}, srid: 4326},
- tags:
- MapUtils.remove_blank(%{
- "addr:postcode": post_code,
- "addr:street": street,
- "addr:housenumber": housenumber,
- official_name: official_name,
- name: "#{post_code} #{official_name}",
- amenity: "post_office",
- opening_hours: parse_opening_hours(ured)
- })
- }
- end
-
- defp parse_address(ured) do
- address =
- get_text(ured, "adresa")
- # Remove notes in parenthesis
- |> String.replace(~r"\(.+\)", "")
- |> String.trim()
-
- [street_no | _] =
- address
- |> String.split(",")
- |> Enum.map(&String.trim/1)
- # Remove TC references (Trgovački centar)
- |> Enum.reject(&String.starts_with?(&1, "TC "))
-
- # Join street number, e.g. "1 A" -> "1A" and "7/b" -> "7b"
- street_no = String.replace(street_no, ~r" (\d+)[\s/]+([a-z])$"i, " \\1\\2")
-
- case Regex.run(~r"^(.+) (\d+[a-z]*|bb)$"i, street_no) do
- [_, street, housenumber] -> {street, String.downcase(housenumber)}
- nil -> {street_no, nil}
- end
- end
-
- defp get_text(element, name) do
- case xpath(element, ~x"./#{name}/text()"s) do
- "" -> nil
- other -> other
- end
- end
-
- defp parse_opening_hours(ured) do
- format_opening_hours([
- get_time_span(ured, "RV_pon_od", "RV_pon_do"),
- get_time_span(ured, "RV_uto_od", "RV_uto_do"),
- get_time_span(ured, "RV_sri_od", "RV_sri_do"),
- get_time_span(ured, "RV_cet_od", "RV_cet_do"),
- get_time_span(ured, "RV_pet_od", "RV_pet_do"),
- get_time_span(ured, "RV_sub_od", "RV_sub_do"),
- get_time_span(ured, "RV_ned_od", "RV_ned_do")
- ])
- end
-
defp format_opening_hours(time_spans) do
time_spans
|> Enum.with_index()
@@ 170,9 104,9 @@ defmodule Triglav.Poi.Sources.Posta do
end
end)
|> Enum.reverse()
- |> Enum.map(fn {time_span, days} ->
+ |> Enum.map(fn {{open, close}, days} ->
day_span = format_day_span(List.last(days), List.first(days))
- "#{day_span} #{time_span}"
+ "#{day_span} #{open}-#{close}"
end)
|> Enum.join("; ")
|> String.replace("Mo-Su 7:00-24:00", "24/7")
@@ 189,39 123,127 @@ defmodule Triglav.Poi.Sources.Posta do
defp format_day(5), do: "Sa"
defp format_day(6), do: "Su"
- def get_time_span(element, name1, name2) do
- value1 = get_text(element, name1)
- value2 = get_text(element, name2)
+ @doc """
+ Parses the HTML of the interactive post office map into a list of node maps.
+
+ The marker types, coordinates and popup contents are each extracted from
+ `html` and combined per marker. Logs a message before parsing and another
+ one with the number of nodes found.
+ """
+ def parse_map(html) do
+   Logger.info("Parsing post offices")
+
+   html
+   |> parse_content(parse_types(html), parse_coordinates(html))
+   |> tap(&Logger.info("Found #{length(&1)} post offices"))
+ end
- cond do
- is_nil(value1) and is_nil(value2) -> nil
- not is_nil(value1) and not is_nil(value2) -> "#{value1}-#{value2}"
+ # Extracts the type of every map marker from the page script.
+ #
+ # The script contains assignments of the form `vrsta[<index>] = '<type>';`.
+ # Returns a map of integer marker index => type atom, which is later turned
+ # into an amenity value by `amenity/1`.
+ #
+ # The type string comes from a remote page, so it is converted with
+ # `String.to_existing_atom/1` rather than `String.to_atom/1`: atoms are never
+ # garbage collected and must not be created from external input. A type that
+ # does not correspond to an already existing atom raises `ArgumentError`.
+ defp parse_types(html) do
+   ~r"vrsta\[(\d+)\] = '(\w+)';"
+   |> Regex.scan(html)
+   |> Map.new(fn [_, id, type] -> {to_integer(id), String.to_existing_atom(type)} end)
+ end
+
+ # Extracts the marker coordinates from the `var neighborhoods = [...]` array
+ # in the page script.
+ #
+ # Every `new google.maps.LatLng(<a>,<b>)` call in the array becomes a tuple of
+ # floats; the result maps the zero-based position of the call in the array to
+ # its tuple. Raises `MatchError` when the array is not present in `html`.
+ defp parse_coordinates(html) do
+   [_, array_source] = Regex.run(~r/var neighborhoods = \[([^\n]+)\];/, html)
+   scanned = Regex.scan(~r"new google.maps.LatLng\(([^)]+)\)", array_source)
+
+   for {[_call, pair], position} <- Enum.with_index(scanned), into: %{} do
+     point =
+       pair
+       |> String.trim()
+       |> String.split(",")
+       |> Enum.map(&to_float/1)
+       |> List.to_tuple()
+
+     {position, point}
end
end
- defp fetch_coordinates() do
- response = get!("https://www.posta.hr/mapahp.aspx?lng=_hr")
- [_line, match] = Regex.run(~r/var neighborhoods = \[([^\n]+)\];/, response.body)
-
- coordinates =
- for [_line, latlng] <- Regex.scan(~r"new google.maps.LatLng\(([^)]+)\)", match) do
- latlng
- |> String.trim()
- |> String.split(",")
- |> Enum.map(&to_float/1)
- |> List.to_tuple()
- end
+ # Builds one node map per `content[<index>] = '<html>';` assignment found in
+ # the page script.
+ #
+ # `types` maps the marker index to a type atom (see `amenity/1`) and
+ # `coordinates` maps the same index to a `{lat, lng}` tuple. Both are looked
+ # up with `Map.fetch!/2`, so a marker that lacks a type or coordinates raises
+ # a `KeyError` naming the missing index instead of an opaque
+ # `FunctionClauseError` / `MatchError` on `nil`.
+ defp parse_content(html, types, coordinates) do
+   source = Triglav.Poi.Sources.posta()
- post_codes =
- Regex.scan(~r"content\[(\d+)\] = '(.+)';"U, response.body)
- |> Enum.filter(fn [_, _, content] -> content =~ "POŠTANSKI URED" end)
- |> Enum.map(fn [_line, _index, content] ->
- pattern = ~r'<div class="cloud"><h1>POŠTANSKI URED<br /><br />(\d+) '
- [_, post_code] = Regex.run(pattern, content)
- post_code
- end)
+   for [_, index, content] <- Regex.scan(~r"content\[(\d+)\] = '(.+)';"U, html) do
+     doc = Floki.parse_fragment!(content)
+     index = to_integer(index)
+     amenity = types |> Map.fetch!(index) |> amenity()
+     {lat, lng} = Map.fetch!(coordinates, index)
+     {street, housenumber} = parse_address(doc)
+     {post_code, name} = parse_post_code_and_name(doc)
+     full_name = "#{post_code} #{name}"
+
+     %{
+       source_id: source.id,
+       name: full_name,
+       # Only post offices get a ref (their post code); other amenities get nil.
+       ref: if(amenity == "post_office", do: post_code),
+       # Geo points are stored as {x, y}, i.e. {lng, lat}.
+       geometry: %Geo.Point{coordinates: {lng, lat}, srid: 4326},
+       tags:
+         MapUtils.remove_blank(%{
+           "addr:postcode": post_code,
+           "addr:street": street,
+           "addr:housenumber": housenumber,
+           official_name: name,
+           name: full_name,
+           amenity: amenity,
+           opening_hours: parse_opening_hours(doc)
+         })
+     }
+   end
+ end
+
+ # Reads the `.cl-postanski` element, whose text is "<post code> <name>", and
+ # returns `{post_code, name}` with the name title-cased and trimmed.
+ defp parse_post_code_and_name(doc) do
+   [post_code_div] = Floki.find(doc, ".cl-postanski")
+   [post_code, name] = post_code_div |> Floki.text() |> String.split(" ", parts: 2)
+   {post_code, name |> StringUtils.title_case() |> String.trim()}
+ end
+
+ # Returns the formatted opening hours from the `.rv` element, or `nil` when
+ # the element is missing or does not start with the "Radno vrijeme:" heading.
+ # Only the bare text children of the element are treated as time spans.
+ defp parse_opening_hours(doc) do
+   with [hours_div] <- Floki.find(doc, ".rv"),
+        children <- Floki.children(hours_div),
+        {_, _, ["Radno vrijeme:"]} <- List.first(children) do
+     children
+     |> Enum.filter(&is_binary/1)
+     |> Enum.map(&parse_hours/1)
+     |> format_opening_hours()
+   else
+     _ -> nil
+   end
+ end
+
+ # Extracts `{street, housenumber}` from the `.cl-adresa` element of a marker
+ # popup.
+ #
+ # Only the first comma separated part of the address is used, after removing
+ # parenthesised notes and shopping centre ("TC ...") references.
+ # `housenumber` is lower-cased, and is `nil` when the part does not end in a
+ # number or "bb". Returns `{nil, nil}` when no part is left after filtering
+ # (e.g. the address consists only of a "TC ..." reference) instead of raising
+ # a `MatchError`.
+ defp parse_address(doc) do
+   address =
+     doc
+     |> Floki.find(".cl-adresa")
+     |> Floki.text()
+     # Remove notes in parentheses
+     |> String.replace(~r"\(.+\)", "")
+     |> String.trim()
+
+   parts =
+     address
+     |> String.split(",")
+     |> Enum.map(&String.trim/1)
+     # Remove TC references (Trgovački centar)
+     |> Enum.reject(&String.starts_with?(&1, "TC "))
+
+   case parts do
+     [] -> {nil, nil}
+     [street_no | _] -> split_street_and_number(street_no)
+   end
+ end
+
+ # Splits a "<street> <number>" string into `{street, housenumber}`; returns
+ # `{street_no, nil}` when no trailing house number is recognised.
+ defp split_street_and_number(street_no) do
+   # Join street number, e.g. "1 A" -> "1A" and "7/b" -> "7b"
+   street_no = String.replace(street_no, ~r" (\d+)[\s/]+([a-z])$"i, " \\1\\2")
+
+   case Regex.run(~r"^(.+) (\d+[a-z]*|bb)$"i, street_no) do
+     [_, street, housenumber] -> {street, String.downcase(housenumber)}
+     nil -> {street_no, nil}
+   end
+ end
+
+ # Translates a marker type atom into the value used for the `amenity` tag.
+ # Any type other than `:pu`, `:kov` or `:pak` raises `FunctionClauseError`.
+ defp amenity(type) when type in [:pu, :kov, :pak] do
+   Map.fetch!(%{pu: "post_office", kov: "post_box", pak: "parcel_locker"}, type)
+ end
+
+ # Parses a single "<open>-<close>" text into an `{open, close}` tuple.
+ #
+ # Surrounding whitespace and leading/trailing dashes are stripped first.
+ # Returns `nil` unless exactly two dash separated parts remain.
+ defp parse_hours(hours) do
+   span = hours |> String.trim() |> String.trim("-")
+
+   case String.split(span, "-") do
+     [open, close] -> {open, close}
+     _other -> nil
+   end
+ end
- Enum.zip(post_codes, coordinates) |> Map.new()
+ # Converts a string that consists solely of an integer into that integer.
+ # Raises `MatchError` when the string is not an integer or has trailing
+ # characters after the number.
+ defp to_integer(str) do
+   {_int, ""} = parsed = Integer.parse(str)
+   elem(parsed, 0)
end
defp to_float(string) do
M mix.exs => mix.exs +1 -0
@@ 43,6 43,7 @@ defmodule Triglav.MixProject do
{:ecto_psql_extras, "~> 0.6"},
{:ecto_sql, "~> 3.4"},
{:esbuild, "~> 0.4", runtime: Mix.env() == :dev},
+ {:floki, "~> 0.36.2"},
{:geo, "~> 3.0"},
{:geo_postgis, "~> 3.1"},
{:hackney, "~> 1.18"},
M mix.lock => mix.lock +1 -0
@@ 21,6 21,7 @@
"esbuild": {:hex, :esbuild, "0.8.1", "0cbf919f0eccb136d2eeef0df49c4acf55336de864e63594adcea3814f3edf41", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "25fc876a67c13cb0a776e7b5d7974851556baeda2085296c14ab48555ea7560f"},
"file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
"finch": {:hex, :finch, "0.12.0", "6bbb3e0bb62dd91cd1217d9682a30f5bfc9b0b74950bf10a0b4d4399c2076892", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "320da3f32459e7dcb77f4271b4f2445ba6c5d32cc3c7cca8e2cff599e24be5a6"},
+ "floki": {:hex, :floki, "0.36.2", "a7da0193538c93f937714a6704369711998a51a6164a222d710ebd54020aa7a3", [:mix], [], "hexpm", "a8766c0bc92f074e5cb36c4f9961982eda84c5d2b8e979ca67f5c268ec8ed580"},
"geo": {:hex, :geo, "3.6.0", "00c9c6338579f67e91cd5950af4ae2eb25cdce0c3398718c232539f61625d0bd", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm", "1dbdebf617183b54bc3c8ad7a36531a9a76ada8ca93f75f573b0ae94006168da"},
"geo_postgis": {:hex, :geo_postgis, "3.6.0", "dbb7874ac04515235bc8e018a3bf4b1b6f3833a870b3371cfa81c983152969f2", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:geo, "~> 3.6", [hex: :geo, repo: "hexpm", optional: false]}, {:jason, "~> 1.2", [hex: :jason, repo: "hexpm", optional: true]}, {:poison, "~> 2.2 or ~> 3.0 or ~> 4.0 or ~> 5.0", [hex: :poison, repo: "hexpm", optional: true]}, {:postgrex, ">= 0.0.0", [hex: :postgrex, repo: "hexpm", optional: false]}], "hexpm", "0fa927b52a2bb17ad8526657ece025867277148c9206c87e165c85a57e56f5f9"},
"gettext": {:hex, :gettext, "0.18.2", "7df3ea191bb56c0309c00a783334b288d08a879f53a7014341284635850a6e55", [:mix], [], "hexpm", "f9f537b13d4fdd30f3039d33cb80144c3aa1f8d9698e47d7bcbcc8df93b1f5c5"},