~ihabunek/triglav

6332f6317176efc9bffb180caec1f36cf605053a — Ivan Habunek 1 year, 17 days ago 0cfd9d2
Simplify and automate GTFS data import
M .gitignore => .gitignore +3 -0
@@ 38,3 38,6 @@ npm-debug.log

# Local config
/config/*.local.exs

# Temp dir
/tmp

D lib/mix/tasks/triglav/import_gtfs.ex => lib/mix/tasks/triglav/import_gtfs.ex +0 -33
@@ 1,33 0,0 @@
defmodule Mix.Tasks.Triglav.ImportGtfs do
  # Mix task skeleton for the ZET GTFS import: it boots the application, loads
  # the locally stored import state and scrapes the ZET GTFS page for the
  # archive link. Nothing is imported yet.
  use Mix.Task

  alias Triglav.Repo

  @shortdoc "Imports GTFS data for ZET routes"

  @impl Mix.Task
  def run(_args) do
    {:ok, _} = Application.ensure_all_started(:triglav)

    # Loaded but not used yet; the task currently returns the scraped state.
    _db_state = Triglav.DataImport.load_state()
    get_web_state()
  end

  # Fetches the GTFS page, prints the first archive URL match (debug output)
  # and returns the match list, or nil when the page has no such link.
  defp get_web_state() do
    page = get("https://www.zet.hr/odredbe/datoteke-u-gtfs-formatu/669")
    matches = Regex.run(~r/https:\/\/www.zet.hr\/UserDocsImages\/[^"]+/, page)
    IO.inspect(matches, label: "matches")
  end

  # Performs a GET request and returns the body as a string; anything other
  # than an `HTTP/1.1 200 OK` response fails the match.
  defp get(url) do
    request = {to_charlist(url), []}
    {:ok, {{'HTTP/1.1', 200, 'OK'}, _headers, body}} = :httpc.request(:get, request, [], [])
    to_string(body)
  end

  # Streams the response for `url` straight into the file at `target`.
  defp download(url, target) do
    request = {to_charlist(url), []}
    {:ok, :saved_to_file} = :httpc.request(:get, request, [], stream: to_charlist(target))
  end
end

A lib/mix/tasks/triglav/import_zet.ex => lib/mix/tasks/triglav/import_zet.ex +12 -0
@@ 0,0 1,12 @@
defmodule Mix.Tasks.Triglav.ImportZet do
  @moduledoc """
  Imports the latest GTFS data published by ZET.

      mix triglav.import_zet [--force]

  ## Options

    * `--force` - run the import even when the local data is already at the
      latest published version
  """
  use Mix.Task

  @shortdoc "Imports GTFS data for ZET routes"

  @impl Mix.Task
  def run(args) do
    # Unknown switches are ignored, as they were when arguments were not parsed.
    {opts, _argv, _invalid} = OptionParser.parse(args, strict: [force: :boolean])

    # The `:repo_only` flag is set before the application is started.
    Application.put_env(:triglav, :repo_only, true)
    {:ok, _} = Application.ensure_all_started(:triglav)

    Triglav.Import.Zet.run(Keyword.get(opts, :force, false))
  end
end

A lib/triglav/import/zet.ex => lib/triglav/import/zet.ex +130 -0
@@ 0,0 1,130 @@
defmodule Triglav.Import.Zet do
  @moduledoc """
  Imports the latest GTFS data from ZET.

  The archive link is scraped from the ZET GTFS page, the archive is downloaded
  into a `triglav` directory below the system temp dir and unzipped there. The
  `feed_version` found in the downloaded `feed_info.txt` is compared with the
  version stored in `zet.feed_info`; when the published one is newer (or no
  local data exists, or the import is forced) the SQL scripts in `priv/gtfs`
  are run through `psql`.

  Requires the `unzip` and `psql` executables to be available.

  See:
  https://www.zet.hr/odredbe/datoteke-u-gtfs-formatu/669
  """

  alias Triglav.Repo
  alias Triglav.Schemas.Zet.FeedInfo

  @gtfs_page_url "https://www.zet.hr/odredbe/datoteke-u-gtfs-formatu/669"

  # Run in this order: schema first, then data, then indices.
  @sql_scripts ["priv/gtfs/schema.sql", "priv/gtfs/load.sql", "priv/gtfs/indices.sql"]

  # Repo connection option => environment variable read by psql.
  @pg_env_vars [
    hostname: "PGHOST",
    database: "PGDATABASE",
    username: "PGUSER",
    password: "PGPASSWORD",
    port: "PGPORT"
  ]

  @doc """
  Downloads the current ZET feed and imports it when it is newer than the
  locally stored one.

  With `force` set to a truthy value the import runs regardless of versions.
  Raises when the download, decompression or any SQL script fails.
  """
  def run(force \\ false) do
    url = get_download_url()

    filename =
      url
      |> URI.parse()
      |> Map.get(:path)
      |> Path.basename()
      |> String.replace(" ", "")

    temp_dir = Path.join([System.tmp_dir!(), "triglav"])
    File.mkdir_p!(temp_dir)

    IO.puts("Downloading: #{url}")
    target = Path.join([temp_dir, filename])
    download(url, target)

    IO.puts("Decompressing...")
    {_, 0} = System.cmd("unzip", ["-o", target, "-d", temp_dir])

    web_version = read_feed_version(Path.join(temp_dir, "feed_info.txt"))
    local_version = get_local_version()

    IO.puts("Local version: #{local_version}")
    IO.puts("  Web version: #{web_version}")

    if update_needed?(force, local_version, web_version) do
      IO.puts("Updating...")
      env = db_env()
      Enum.each(@sql_scripts, &run_sql(&1, temp_dir, env))
    else
      IO.puts("You already have the latest data. Use --force option to import anyway.")
    end
  end

  defp update_needed?(force, local_version, web_version) do
    force || is_nil(local_version) ||
      String.to_integer(web_version) > String.to_integer(local_version)
  end

  # Reads the `feed_version` value from the first data row of a GTFS
  # feed_info.txt file. Tolerates a UTF-8 BOM, CRLF line endings and quoted
  # cells (including commas inside quotes).
  defp read_feed_version(path) do
    [header, row | _] =
      path
      |> File.read!()
      |> String.trim_leading("\uFEFF")
      |> String.split(["\r\n", "\n"], trim: true)
      |> Enum.map(&split_csv_line/1)

    with index when is_integer(index) <- Enum.find_index(header, &(&1 == "feed_version")),
         version when is_binary(version) <- Enum.at(row, index) do
      version
    else
      _ -> raise "Could not read feed_version from #{path}"
    end
  end

  # Splits on commas that are followed by an even number of double quotes,
  # i.e. commas outside of a quoted cell, then strips whitespace and quotes.
  defp split_csv_line(line) do
    line
    |> String.split(~r/,(?=(?:[^"]*"[^"]*")*[^"]*$)/)
    |> Enum.map(fn cell -> cell |> String.trim() |> String.trim("\"") end)
  end

  # Returns the locally imported feed version, or nil when the zet.feed_info
  # table does not exist or holds no row.
  defp get_local_version() do
    if zet_schema_exists?() do
      case Repo.one(FeedInfo) do
        nil -> nil
        # The schema exposes the `feed_version` column as `:version`.
        %FeedInfo{version: version} -> version
      end
    end
  end

  defp zet_schema_exists?() do
    {:ok, %{rows: [[exists?]]}} =
      Repo.query("""
        SELECT EXISTS (
         SELECT FROM information_schema.tables
         WHERE table_schema = 'zet'
         AND table_name = 'feed_info'
      );
      """)

    exists?
  end

  # Runs one of the application's SQL scripts through psql. `cwd` is the
  # directory with the extracted feed, because load.sql refers to the GTFS
  # files by relative name.
  defp run_sql(path, cwd, env) do
    IO.puts("Running: #{path}")
    script = Application.app_dir(:triglav, path)

    # Without ON_ERROR_STOP psql exits with status 0 even when a statement
    # fails, which would make the exit status check below meaningless.
    {_, 0} = System.cmd("psql", ["-v", "ON_ERROR_STOP=1", "-f", script], cd: cwd, env: env)
  end

  defp get_download_url() do
    html = get(@gtfs_page_url)

    case Regex.run(~r/https:\/\/www\.zet\.hr\/UserDocsImages\/[^"]+/, html) do
      [url | _] -> url
      nil -> raise "Could not find a GTFS download link on #{@gtfs_page_url}"
    end
  end

  # Returns the response body of a successful (status 200) GET request.
  defp get(url) do
    # Ask for a binary body: converting a charlist of raw bytes with
    # to_string/1 would treat every byte as a codepoint and garble non-ASCII
    # text.
    {:ok, {{_http_version, 200, _reason}, _headers, body}} =
      :httpc.request(:get, {to_charlist(url), []}, [], body_format: :binary)

    body
  end

  defp download(url, target) do
    # Encode whitespace in path
    url =
      url
      |> URI.parse()
      |> Map.update!(:path, &URI.encode/1)
      |> URI.to_string()

    # Start from a clean file; a failure to remove a stale download is raised
    # instead of being silently ignored.
    if File.exists?(target), do: File.rm!(target)

    {:ok, :saved_to_file} =
      :httpc.request(:get, {to_charlist(url), []}, [], stream: to_charlist(target))

    IO.puts("Saved to: #{target}")
  end

  # Builds the psql environment from the Repo configuration. Values parsed
  # from `:url` take precedence over separately configured options; all values
  # are stringified because the parsed port is an integer.
  defp db_env() do
    config = Application.fetch_env!(:triglav, Triglav.Repo)

    opts =
      case Keyword.get(config, :url) do
        nil -> config
        url -> Keyword.merge(config, Ecto.Repo.Supervisor.parse_url(url))
      end

    for {key, var} <- @pg_env_vars, value = Keyword.get(opts, key), do: {var, to_string(value)}
  end
end

M lib/triglav/release.ex => lib/triglav/release.ex +7 -2
@@ 15,10 15,15 @@ defmodule Triglav.Release do
  end

  # Release entry point for the OSM import; `force` is passed through to
  # `Triglav.Import.Osm.run/1`.
  # NOTE(review): both `start_app()` and `start_repo()` are listed below; this
  # looks like the old and the new name of the same helper - confirm that only
  # one call is meant to remain.
  def import_osm(force \\ false) do
    start_app()
    start_repo()
    Triglav.Import.Osm.run(force)
  end

  # Release entry point for the ZET GTFS import: calls `start_repo/0` and then
  # runs `Triglav.Import.Zet.run/1`, passing `force` (default `false`) through
  # unchanged.
  def import_zet(force \\ false) do
    start_repo()
    Triglav.Import.Zet.run(force)
  end

  # Returns the value configured under `:ecto_repos` for the application;
  # raises if the key is not set.
  defp repos do
    Application.fetch_env!(@app, :ecto_repos)
  end


@@ 27,7 32,7 @@ defmodule Triglav.Release do
    Application.load(@app)
  end

  defp start_app do
  defp start_repo do
    load_app()
    Application.put_env(@app, :repo_only, true)
    Application.ensure_all_started(@app)

M lib/triglav/schemas/zet/feed_info.ex => lib/triglav/schemas/zet/feed_info.ex +4 -11
@@ 2,21 2,14 @@ defmodule Triglav.Schemas.Zet.FeedInfo do
  use Ecto.Schema

  @primary_key false
  @schema_prefix :gtfs
  @schema_prefix :zet

  # Maps the `feed_info` table. Most fields use `source:` so that the struct
  # key drops the `feed_` column prefix (e.g. `version` reads `feed_version`).
  # NOTE(review): `null: false` is a migration column option; confirm that
  # `field/3` accepts it in the Ecto version this project uses.
  schema "feed_info" do
    field :feed_index, :integer, primary_key: true
    field :publisher_name, :string, source: :feed_publisher_name
    field :publisher_url, :string, source: :feed_publisher_url
    field :timezone, :string, source: :feed_timezone
    field :publisher_name, :string, source: :feed_publisher_name, null: false
    field :publisher_url, :string, source: :feed_publisher_url, null: false
    field :lang, :string, source: :feed_lang
    field :version, :string, source: :feed_version
    field :start_date, :date, source: :feed_start_date
    field :end_date, :date, source: :feed_end_date
    field :id, :string, source: :feed_id
    field :contact_url, :string, source: :feed_contact_url
    field :contact_email, :string, source: :feed_contact_email
    field :download_date, :date, source: :feed_download_date
    field :file, :string, source: :feed_file
    field :version, :string, source: :feed_version
  end
end

M lib/triglav/schemas/zet/route.ex => lib/triglav/schemas/zet/route.ex +1 -3
@@ 1,11 1,10 @@
defmodule Triglav.Schemas.Zet.Route do
  use Ecto.Schema

  @schema_prefix "gtfs"
  @primary_key false
  @schema_prefix :zet

  schema "routes" do
    field :feed_index, :integer, source: :feed_index, primary_key: true
    field :id, :string, source: :route_id, primary_key: true
    field :agency_id, :string, source: :agency_id
    field :short_name, :string, source: :route_short_name


@@ 15,6 14,5 @@ defmodule Triglav.Schemas.Zet.Route do
    field :url, :string, source: :route_url
    field :color, :string, source: :route_color
    field :text_color, :string, source: :route_text_color
    field :sort_order, :integer, source: :route_sort_order
  end
end

M lib/triglav/schemas/zet/trip.ex => lib/triglav/schemas/zet/trip.ex +1 -2
@@ 1,11 1,10 @@
defmodule Triglav.Schemas.Zet.Trip do
  use Ecto.Schema

  @schema_prefix "gtfs"
  @primary_key false
  @schema_prefix :zet

  schema "trips" do
    field :feed_index, :integer, primary_key: true
    field :id, :string, primary_key: true
    field :route_id, :string
    field :service_id, :string

M lib/triglav_web/templates/zet/routes/index.html.eex => lib/triglav_web/templates/zet/routes/index.html.eex +0 -4
@@ 39,10 39,6 @@
        <th>Version</th>
        <td><%= @gtfs_info.version %></td>
      </tr>
      <tr>
        <th>File</th>
        <td><%= @gtfs_info.file %></td>
      </tr>
    </table>

    <table style="margin-left: 1rem">

A priv/gtfs/indices.sql => priv/gtfs/indices.sql +0 -0
A priv/gtfs/load.sql => priv/gtfs/load.sql +8 -0
@@ 0,0 1,8 @@
-- Bulk-loads the GTFS text files into the zet schema with psql's client-side
-- \copy. File names are relative, so psql must be started in the directory
-- that holds the extracted feed.
-- No column lists are given: columns are matched by position, so each file's
-- column order has to match its table definition. HEADER skips the first line.
-- Tables are loaded so that referenced tables come before the tables whose
-- foreign keys point at them (agency -> routes -> trips -> stop_times).
\copy zet.agency FROM 'agency.txt' (FORMAT CSV, HEADER);
\copy zet.stops FROM 'stops.txt' (FORMAT CSV, HEADER);
\copy zet.routes FROM 'routes.txt' (FORMAT CSV, HEADER);
\copy zet.trips FROM 'trips.txt' (FORMAT CSV, HEADER);
\copy zet.stop_times FROM 'stop_times.txt' (FORMAT CSV, HEADER);
\copy zet.calendar FROM 'calendar.txt' (FORMAT CSV, HEADER);
\copy zet.calendar_dates FROM 'calendar_dates.txt' (FORMAT CSV, HEADER);
\copy zet.feed_info FROM 'feed_info.txt' (FORMAT CSV, HEADER);

A priv/gtfs/schema.sql => priv/gtfs/schema.sql +96 -0
@@ 0,0 1,96 @@
-- Start from scratch: CASCADE drops every object inside the zet schema, so
-- all previously imported data is removed before the tables are re-created.
DROP SCHEMA IF EXISTS zet CASCADE;
CREATE SCHEMA zet;

-- Transit agencies; one row per line of agency.txt.
CREATE TABLE zet.agency (
  agency_id text,
  agency_name text NOT NULL,
  agency_url text NOT NULL,
  agency_timezone text NOT NULL,
  agency_lang text,
  agency_phone text,
  agency_fare_url text,
  PRIMARY KEY (agency_id)
);

-- Stops and stations; one row per line of stops.txt.
CREATE TABLE zet.stops (
  stop_id text,
  stop_code text,
  stop_name text,
  stop_desc text,
  stop_lat double precision,
  stop_lon double precision,
  zone_id text,
  stop_url text,
  location_type integer,
  parent_station text,
  PRIMARY KEY (stop_id)
);

-- Routes; each route may point at the agency operating it.
CREATE TABLE zet.routes (
  route_id text,
  agency_id text,
  route_short_name text,
  route_long_name text,
  route_desc text,
  route_type integer NOT NULL,
  route_url text,
  route_color text,
  route_text_color text,
  PRIMARY KEY (route_id),
  FOREIGN KEY (agency_id) REFERENCES zet.agency (agency_id)
);

-- Trips; every trip belongs to a route (referenced by its primary key).
CREATE TABLE zet.trips (
  route_id text NOT NULL,
  service_id text NOT NULL,
  trip_id text NOT NULL,
  trip_headsign text,
  trip_short_name text,
  direction_id boolean,
  block_id text,
  shape_id text,
  PRIMARY KEY (trip_id),
  FOREIGN KEY (route_id) REFERENCES zet.routes
);

-- Scheduled stop events of a trip. Times are stored as intervals rather than
-- times of day. The table has no primary key.
CREATE TABLE zet.stop_times (
  trip_id text NOT NULL,
  arrival_time interval,
  departure_time interval NOT NULL,
  stop_id text NOT NULL,
  stop_sequence integer NOT NULL CHECK (stop_sequence >= 0),
  stop_headsign text,
  pickup_type integer,
  drop_off_type integer,
  shape_dist_traveled double precision,
  FOREIGN KEY (trip_id) REFERENCES zet.trips,
  FOREIGN KEY (stop_id) REFERENCES zet.stops
);

-- Weekly service pattern per service_id, valid from start_date to end_date.
CREATE TABLE zet.calendar (
  service_id text,
  monday boolean NOT NULL,
  tuesday boolean NOT NULL,
  wednesday boolean NOT NULL,
  thursday boolean NOT NULL,
  friday boolean NOT NULL,
  saturday boolean NOT NULL,
  sunday boolean NOT NULL,
  start_date date NOT NULL,
  end_date date NOT NULL,
  PRIMARY KEY (service_id)
);

-- Per-date service exceptions; declared without keys or foreign keys.
CREATE TABLE zet.calendar_dates (
  service_id text NOT NULL,
  date date NOT NULL,
  exception_type integer NOT NULL
);

-- Feed metadata loaded from feed_info.txt, including the feed version.
CREATE TABLE zet.feed_info (
  feed_publisher_name text NOT NULL,
  feed_publisher_url text NOT NULL,
  feed_lang text,
  feed_start_date date,
  feed_end_date date,
  feed_version text
);