~hrbrmstr/metis-jars

0a3e1570265fe698cb2606870c4c4c3c6715396b — boB Rudis 3 years ago b91c192
Update to AthenaJDBC42_2.0.2.jar
11 files changed, 96 insertions(+), 77 deletions(-)

M DESCRIPTION
M R/jdbc.r
M R/metis.r
A R/zzz.R
M README.Rmd
M README.md
D inst/java/AthenaJDBC41-1.1.0.jar
A inst/java/AthenaJDBC42_2.0.2.jar
A inst/java/log4j.properties
M man/athena_connect.Rd
M man/dbConnect-AthenaDriver-method.Rd
M DESCRIPTION => DESCRIPTION +2 -0
@@ 6,6 6,7 @@ Date: 2018-03-19
Authors@R: c(
    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
           comment = c(ORCID = "0000-0001-5670-2640")),
    person("Derek", "Abdine", comment = "Authentication driver update"),
    person("Zachary", "Kurtz", email = "zdkurtz@gmail.com", role = "ctb")
  )
Maintainer: Bob Rudis <bob@rud.is>


@@ 15,6 16,7 @@ Description: Methods are provided to connect to 'Amazon' 'Athena', lookup schema
    is included along with an interface to the 'AWS' command-line utility.
URL: https://github.com/hrbrmstr/metis
BugReports: https://github.com/hrbrmstr/metis/issues
SystemRequirements: JDK 1.8+
License: AGPL
Suggests:
    testthat,

M R/jdbc.r => R/jdbc.r +39 -27
@@ 1,3 1,8 @@
# Lookup table translating log-level names to the integer codes 0-6.
.ll_trans <- c(
  OFF = 0L,
  FATAL = 1L,
  ERROR = 2L,
  WARNING = 3L,
  INFO = 4L,
  DEBUG = 5L,
  TRACE = 6L
)

#' AthenaJDBC
#'
#' @export


@@ 18,8 23,8 @@ setClass(
Athena <- function(identifier.quote = '`') {

  JDBC(
    driverClass = "com.amazonaws.athena.jdbc.AthenaDriver",
    system.file("java", "AthenaJDBC41-1.1.0.jar", package = "metis"),
    driverClass = "com.simba.athena.jdbc.Driver",
    system.file("java", "AthenaJDBC42_2.0.2.jar", package = "metis"),
    identifier.quote = identifier.quote
  ) -> drv



@@ 33,10 38,12 @@ Athena <- function(identifier.quote = '`') {
#' @param region AWS region the Athena tables are in
#' @param s3_staging_dir A write-able bucket on S3 that you have permissions for
#' @param schema_name LOL if only this actually worked with Amazon's hacked Presto driver
#' @param max_error_retries,connection_timeout,socket_timeout,retry_base_delay,retry_max_backoff_time
#' @param max_error_retries,connection_timeout,socket_timeout
#'     technical connection info that you should only muck with if you know what you're doing.
#' @param log_path,log_level The Athena JDBC driver can (shockingly) provide a decent bit
#'     of data in logs. Set this to a temporary directory or somethign log4j can use.
#'     of data in logs. Set this to a temporary directory or something log4j can use. For
#'     `log_level` use the names ("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") or
#'     their corresponding integer values 0-6.
#' @param ... unused
#' @references <https://docs.aws.amazon.com/athena/latest/ug/connect-with-jdbc.html>
#' @export


@@ 45,37 52,42 @@ setMethod(
  "dbConnect",
  "AthenaDriver",

  def = function(drv,
                 provider = "com.amazonaws.athena.jdbc.shaded.com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
                 region = "us-east-1",
                 s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
                 schema_name = "default",
                 max_error_retries = 10,
                 connection_timeout = 10000,
                 socket_timeout = 10000,
                 retry_base_delay = 100,
                 retry_max_backoff_time = 1000,
                 log_path,
                 log_level,
                 ...) {
  def = function(
    drv,
    provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    region = "us-east-1",
    s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
    schema_name = "default",
    max_error_retries = 10,
    connection_timeout = 10000,
    socket_timeout = 10000,
    # retry_base_delay = 100,
    # retry_max_backoff_time = 1000,
    log_path,
    log_level,
    ...) {

    conn_string = sprintf(
      'jdbc:awsathena://athena.%s.amazonaws.com:443/%s', region, schema_name
    )

    if (!(log_level %in% 0:6)) log_level <- .ll_trans[log_level]



    callNextMethod(
      drv,
      conn_string,
      s3_staging_dir = s3_staging_dir,
      schema_name = schema_name,
      max_error_retries = max_error_retries,
      connection_timeout = connection_timeout,
      socket_timeout = socket_timeout,
      retry_base_delay = retry_base_delay,
      retry_max_backoff_time = retry_max_backoff_time,
      log_path = log_path,
      log_level = log_level,
      aws_credentials_provider_class = provider,
      S3OutputLocation = s3_staging_dir,
      Schema = schema_name,
      MaxErrorRetry = max_error_retries,
      ConnectTimeout = connection_timeout,
      SocketTimeout = socket_timeout,
      # retry_base_delay = retry_base_delay,
      # retry_max_backoff_time = retry_max_backoff_time,
      LogPath = log_path,
      LogLevel = log_level,
      AwsCredentialsProviderClass = provider,
      ...
    ) -> jc


M R/metis.r => R/metis.r +18 -16
@@ 9,10 9,11 @@
#' @param max_error_retries the maximum number of retries that the JDBC client attempts to make a request to Athena.
#' @param connection_timeout the maximum amount of time, in milliseconds, to make a successful connection to Athena before an attempt is terminated.
#' @param socket_timeout the maximum amount of time, in milliseconds, to wait for a socket in order to send data to Athena.
#' @param retry_base_delay minimum delay amount, in milliseconds, between retrying attempts to connect Athena.
#' @param retry_max_backoff_time maximum delay amount, in milliseconds, between retrying attempts to connect Athena.
# @param retry_base_delay minimum delay amount, in milliseconds, between retrying attempts to connect Athena.
# @param retry_max_backoff_time maximum delay amount, in milliseconds, between retrying attempts to connect Athena.
#' @param log_path local path of the Athena JDBC driver logs. If no log path is provided, then no log files are created.
#' @param log_level log level of the Athena JDBC driver logs.
#' @param log_level log level of the Athena JDBC driver logs. Use the names
#'     "OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE".
#' @export
#' @examples \dontrun{
#' use_credentials("personal")


@@ 27,21 28,22 @@
#' dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 1")
#'
#' }
athena_connect <- function(default_schema = "default",
                           region = c("us-east-1", "us-east-2", "us-west-2"),
                           s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
                           max_error_retries = 10,
                           connection_timeout = 10000,
                           socket_timeout = 10000,
                           retry_base_delay = 100,
                           retry_max_backoff_time = 1000,
                           log_path = "",
                           log_level = c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE")) {
athena_connect <- function(
  default_schema = "default",
  region = c("us-east-1", "us-east-2", "us-west-2"),
  s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
  max_error_retries = 10,
  connection_timeout = 10000,
  socket_timeout = 10000,
  # retry_base_delay = 100,
  # retry_max_backoff_time = 1000,
  log_path = "",
  log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) {

  athena_jdbc <- Athena()

  region <- match.arg(region, c("us-east-1", "us-east-2", "us-west-2"))
  log_level <- match.arg(log_level, c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE"))
  log_level <- match.arg(log_level, c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"))

  # if (!simple) {
  dbConnect(


@@ 52,8 54,8 @@ athena_connect <- function(default_schema = "default",
    max_error_retries = max_error_retries,
    connection_timeout = connection_timeout,
    socket_timeout = socket_timeout,
    retry_base_delay = retry_base_delay,
    retry_max_backoff_time = retry_max_backoff_time,
    # retry_base_delay = retry_base_delay,
    # retry_max_backoff_time = retry_max_backoff_time,
    log_path = log_path,
    log_level = log_level
  ) -> con

A R/zzz.R => R/zzz.R +4 -0
@@ 0,0 1,4 @@
# Package load hook: registers the package with rJava and puts the files
# shipped in the package's `java` directory on the Java class path.
#
# libname: library directory the package was loaded from.
# pkgname: name of the package being loaded.
.onLoad <- function(libname, pkgname) {
  rJava::.jpackage(pkgname, jars = "*", lib.loc = libname)

  # Resolve the `java` directory relative to the package itself rather than
  # the current working directory, which only pointed at the right place when
  # R happened to be started from the package source root.
  java_dir <- system.file("java", package = pkgname, lib.loc = libname)

  # system.file() returns "" when the directory cannot be found; skip the
  # class-path update in that case instead of passing an empty listing.
  if (nzchar(java_dir)) {
    rJava::.jaddClassPath(dir(java_dir, full.names = TRUE))
  }
}

M README.Rmd => README.Rmd +6 -4
@@ 1,7 1,8 @@
---
output: rmarkdown::github_document
editor_options: 
  chunk_output_type: console
---
![](https://upload.wikimedia.org/wikipedia/commons/thumb/5/53/Winged_goddess_Louvre_F32.jpg/300px-Winged_goddess_Louvre_F32.jpg)

# `metis`



@@ 15,10 16,11 @@ In Greek mythology, Metis was Athena's "helper".

Still fairly beta-quality level but getting there.

The goal will be to get around enough of the "gotchas" that are preventing raw RJDBC Athena
connections from "just working" with `dplyr` v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to not use `dbGetQuery()`.
The goal will be to get around enough of the "gotchas" that are preventing raw RJDBC Athena connections from "just working" with `dplyr` v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to avoid `dbGetQuery()`.

The `AthenaJDBC41-1.1.0.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN.
The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN.

NOTE that the updated driver *REQUIRES JDK 1.8+*.

See the **Usage** section for an example.


M README.md => README.md +15 -15
@@ 1,6 1,4 @@

![](https://upload.wikimedia.org/wikipedia/commons/thumb/5/53/Winged_goddess_Louvre_F32.jpg/300px-Winged_goddess_Louvre_F32.jpg)

# `metis`

Helpers for Accessing and Querying Amazon Athena


@@ 19,10 17,12 @@ v0.6.0+ and also get around the [`fetchSize`
problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/)
without having to avoid `dbGetQuery()`.

The `AthenaJDBC41-1.1.0.jar` JAR file is included out of convenience but
The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but
that will likely move to a separate package as this gets closer to prime
time if this goes on CRAN.

NOTE that the updated driver *REQUIRES JDK 1.8+*.

See the **Usage** section for an example.

## What’s Inside The Tin?


@@ 111,21 111,21 @@ dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 10") %>%

    ## Observations: 10
    ## Variables: 16
    ## $ timestamp             <dttm> 2014-09-30 01:03:00, 2014-09-30 01:03:01, 2014-09-30 01:03:01, 2014-09-30 01:03:01, ...
    ## $ timestamp             <dttm> 2014-09-30 00:00:25, 2014-09-30 00:00:57, 2014-09-30 00:01:06, 2014-09-30 00:01:29, ...
    ## $ elbname               <chr> "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo...
    ## $ requestip             <chr> "253.90.22.60", "253.51.141.83", "245.59.222.144", "241.35.85.250", "246.245.70.48", ...
    ## $ requestport           <dbl> 4095, 14668, 29796, 38607, 32750, 10182, 64948, 51279, 13331, 2700
    ## $ backendip             <chr> "250.133.18.39", "248.214.120.18", "250.38.70.52", "249.45.101.192", "249.28.120.9", ...
    ## $ backendport           <dbl> 8888, 443, 8899, 8888, 8888, 8888, 8888, 8888, 8888, 8000
    ## $ requestprocessingtime <dbl> 7.3e-05, 8.9e-05, 4.5e-05, 4.3e-05, 7.6e-05, 7.3e-05, 7.7e-05, 4.6e-05, 4.9e-05, 5.3e-05
    ## $ backendprocessingtime <dbl> 0.561864, 0.021517, 0.019530, 0.018937, 0.022727, 0.390384, 0.017017, 0.016437, 0.019...
    ## $ clientresponsetime    <dbl> 9.0e-05, 7.0e-05, 3.0e-05, 3.3e-05, 3.2e-05, 8.4e-05, 5.2e-05, 7.1e-05, 6.9e-05, 5.4e-05
    ## $ elbresponsecode       <int> 200, 304, 304, 304, 200, 200, 304, 304, 200, 304
    ## $ backendresponsecode   <int> 200, 200, 403, 200, 200, 400, 200, 200, 200, 200
    ## $ requestip             <chr> "246.247.182.239", "250.128.76.75", "243.157.244.21", "255.172.234.242", "245.27.105....
    ## $ requestport           <dbl> 33998, 33998, 33998, 33998, 33998, 33998, 33998, 14346, 33998, 33998
    ## $ backendip             <chr> "251.173.42.143", "254.201.134.52", "240.175.197.76", "255.212.79.68", "250.102.227.5...
    ## $ backendport           <dbl> 8888, 8888, 8888, 8888, 8888, 8888, 8888, 8000, 8888, 8888
    ## $ requestprocessingtime <dbl> 0.000091, 0.000092, 0.000105, 0.000091, 0.000091, 0.000091, 0.000090, 0.000077, 0.000...
    ## $ backendprocessingtime <dbl> 0.048114, 0.055741, 0.008005, 0.037602, 0.039396, 0.053371, 0.040238, 0.192458, 0.027...
    ## $ clientresponsetime    <dbl> 6.2e-05, 5.0e-05, 4.8e-05, 6.1e-05, 4.7e-05, 6.2e-05, 5.5e-05, 8.3e-05, 5.7e-05, 8.5e-05
    ## $ elbresponsecode       <int> 200, 200, 302, 200, 200, 200, 200, 500, 200, 200
    ## $ backendresponsecode   <int> 404, 200, 200, 200, 200, 200, 400, 500, 200, 200
    ## $ receivedbytes         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ## $ sentbytes             <dbl> 58402, 0, 0, 0, 152213, 58402, 0, 0, 152213, 0
    ## $ sentbytes             <dbl> 2, 2, 0, 2, 2, 2, 2, 28098, 2, 2
    ## $ requestverb           <chr> "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET"
    ## $ url                   <chr> "http://www.abcxyz.com:80/", "http://www.abcxyz.com:80/static/css/hue3.css", "http://...
    ## $ url                   <chr> "http://www.abcxyz.com:80/jobbrowser/?format=json&state=running&user=l29ezwd", "http:...
    ## $ protocol              <chr> "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "...

## Code of Conduct

D inst/java/AthenaJDBC41-1.1.0.jar => inst/java/AthenaJDBC41-1.1.0.jar +0 -0
A inst/java/AthenaJDBC42_2.0.2.jar => inst/java/AthenaJDBC42_2.0.2.jar +0 -0
A inst/java/log4j.properties => inst/java/log4j.properties +1 -0
@@ 0,0 1,1 @@
log4j.rootLogger=WARN

M man/athena_connect.Rd => man/athena_connect.Rd +4 -9
@@ 7,10 7,8 @@
athena_connect(default_schema = "default", region = c("us-east-1",
  "us-east-2", "us-west-2"),
  s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), max_error_retries = 10,
  connection_timeout = 10000, socket_timeout = 10000,
  retry_base_delay = 100, retry_max_backoff_time = 1000, log_path = "",
  log_level = c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL",
  "TRACE"))
  connection_timeout = 10000, socket_timeout = 10000, log_path = "",
  log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"))
}
\arguments{
\item{default_schema}{default schema (you'll still need to fully qualify non-default schema table names)}


@@ 25,13 23,10 @@ athena_connect(default_schema = "default", region = c("us-east-1",

\item{socket_timeout}{the maximum amount of time, in milliseconds, to wait for a socket in order to send data to Athena.}

\item{retry_base_delay}{minimum delay amount, in milliseconds, between retrying attempts to connect Athena.}

\item{retry_max_backoff_time}{maximum delay amount, in milliseconds, between retrying attempts to connect Athena.}

\item{log_path}{local path of the Athena JDBC driver logs. If no log path is provided, then no log files are created.}

\item{log_level}{log level of the Athena JDBC driver logs.}
\item{log_level}{log level of the Athena JDBC driver logs. Use the names
"OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE".}
}
\description{
Handles the up-front JDBC config

M man/dbConnect-AthenaDriver-method.Rd => man/dbConnect-AthenaDriver-method.Rd +7 -6
@@ 6,12 6,11 @@
\title{AthenaJDBC}
\usage{
\S4method{dbConnect}{AthenaDriver}(drv,
  provider = "com.amazonaws.athena.jdbc.shaded.com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
  provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain",
  region = "us-east-1", s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
  schema_name = "default", max_error_retries = 10,
  connection_timeout = 10000, socket_timeout = 10000,
  retry_base_delay = 100, retry_max_backoff_time = 1000, log_path,
  log_level, ...)
  connection_timeout = 10000, socket_timeout = 10000, log_path, log_level,
  ...)
}
\arguments{
\item{provider}{JDBC auth provider (ideally leave default)}


@@ 22,10 21,12 @@

\item{schema_name}{LOL if only this actually worked with Amazon's hacked Presto driver}

\item{max_error_retries, connection_timeout, socket_timeout, retry_base_delay, retry_max_backoff_time}{technical connection info that you should only muck with if you know what you're doing.}
\item{max_error_retries, connection_timeout, socket_timeout}{technical connection info that you should only muck with if you know what you're doing.}

\item{log_path, log_level}{The Athena JDBC driver can (shockingly) provide a decent bit
of data in logs. Set this to a temporary directory or somethign log4j can use.}
of data in logs. Set this to a temporary directory or something log4j can use. For
`log_level` use the names ("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") or
their corresponding integer values 0-6.}

\item{...}{unused}
}