~hrbrmstr/sergeant

9b52fa5eea65f32de125cf404df04dd3b906b30a — boB Rudis 2 years ago f4c1bdc
separated JDBC
29 files changed, 406 insertions(+), 1208 deletions(-)

M DESCRIPTION
M NAMESPACE
M NEWS.md
M R/dplyr.r
D R/jdbc.r
M R/query.r
M R/schemas.R
M R/sergeant-package.r
M R/sergeant.r
M R/utils.r
D R/zzz.R
M README.Rmd
M README.md
D inst/java/slf4j-nop-1.7.25.jar
M man/DrillConnection-class.Rd
D man/DrillJDBC.Rd
D man/DrillJDBCConnection-class.Rd
D man/DrillJDBCDriver-class.Rd
D man/dbConnect-DrillJDBCDriver-method.Rd
D man/dbDataType-DrillJDBCConnection-method.Rd
M man/drill_active.Rd
D man/drill_jdbc.Rd
D man/drill_jdbc_internals.Rd
M man/drill_query.Rd
M man/drill_show_files.Rd
M man/sergeant-exports.Rd
M man/sergeant.Rd
M man/src_drill.Rd
M tests/testthat/test-sergeant.R
M DESCRIPTION => DESCRIPTION +0 -2
@@ 29,12 29,10 @@ Imports:
    readr (>= 1.1.1),
    purrr (>= 0.2.2),
    scales (>= 0.4.1),
    RJDBC (>= 0.2-5),
    utils,
    methods,
    magrittr
Suggests:
    rJava (>= 0.9-8),
    testthat (>= 1.0.2),
    covr (>= 3.0.0)
RoxygenNote: 6.0.1.9000

M NAMESPACE => NAMESPACE +1 -16
@@ 2,26 2,18 @@

S3method(copy_to,src_drill)
S3method(db_data_type,DrillConnection)
S3method(db_data_type,DrillJDBCConnection)
S3method(db_data_type,tbl_drill_jdbc)
S3method(db_desc,src_drill)
S3method(db_explain,DrillConnection)
S3method(db_query_fields,DrillConnection)
S3method(sql_escape_ident,DrillConnection)
S3method(sql_escape_ident,DrillJDBCConnection)
S3method(sql_translate_env,DrillConnection)
S3method(sql_translate_env,DrillJDBCConnection)
S3method(src_tbls,src_dbi)
S3method(src_tbls,src_drill)
S3method(tbl,src_drill)
S3method(tbl,src_drill_jdbc)
export("%>%")
export(Drill)
export(DrillJDBC)
export(drill_active)
export(drill_cancel)
export(drill_connection)
export(drill_jdbc)
export(drill_metrics)
export(drill_mod_storage)
export(drill_options)


@@ 42,12 34,9 @@ export(drill_uplift)
export(drill_use)
export(drill_version)
export(src_drill)
export(src_drill_jdbc)
export(tbl)
exportClasses(DrillConnection)
exportClasses(DrillDriver)
exportClasses(DrillJDBCConnection)
exportClasses(DrillJDBCDriver)
exportClasses(DrillJDBCResult)
exportClasses(DrillResult)
exportMethods(dbClearResult)
exportMethods(dbConnect)


@@ 58,7 47,6 @@ exportMethods(dbGetStatement)
exportMethods(dbHasCompleted)
exportMethods(dbIsValid)
exportMethods(dbListFields)
exportMethods(dbSendQuery)
exportMethods(dbUnloadDriver)
import(DBI)
import(htmltools)


@@ 66,9 54,6 @@ import(httr)
import(jsonlite)
import(methods)
import(utils)
importClassesFrom(RJDBC,JDBCConnection)
importClassesFrom(RJDBC,JDBCDriver)
importClassesFrom(RJDBC,JDBCResult)
importFrom(dbplyr,base_agg)
importFrom(dbplyr,base_scalar)
importFrom(dbplyr,base_win)

M NEWS.md => NEWS.md +9 -0
@@ 1,3 1,12 @@
# sergeant 0.7.0-BETA

- separated out Java-based functions into a sibling package

# sergeant 0.7.0-ALPHA

- enhanced JDBC connectivity
- updated REST API to handle new features in Drill 1.14.0

# sergeant 0.6.0

- Authentication support for DBI/dplyr and `drill_connection()` pure REST interface

M R/dplyr.r => R/dplyr.r +3 -2
@@ 13,7 13,8 @@
#' @param username,password if not `NULL` the credentials for the Drill service.
#' @note This is a DBI wrapper around the Drill REST API.
#' @export
#' @examples \dontrun{
#' @examples
#' try({
#' db <- src_drill("localhost", 8047L)
#'
#' print(db)


@@ 57,7 58,7 @@
#' ##  9     2      Darren Stanz    12 Darren Stanz********     5      D*rr*n St*nz
#' ## 10     4 Jonathan Murraiin    17 Jonathan Murraiin***     0 J*n*th*n M*rr***n
#' ## # ... with more rows, and 3 more variables: rpd <chr>, rnd <dbl>, first_three <chr>
#' }
#' }, silent=TRUE)
src_drill <- function(host  = Sys.getenv("DRILL_HOST", "localhost"),
                      port = as.integer(Sys.getenv("DRILL_PORT", 8047L)),
                      ssl = FALSE, username = NULL, password = NULL) {

D R/jdbc.r => R/jdbc.r +0 -425
@@ 1,425 0,0 @@
.fillStatementParameters <- function(s, l) {
  for (i in 1:length(l)) {
    v <- l[[i]]
    if (is.na(v)) { # map NAs to NULLs (courtesy of Axel Klenk)
      sqlType <- if (is.integer(v)) 4 else if (is.numeric(v)) 8 else 12
      rJava::.jcall(s, "V", "setNull", i, as.integer(sqlType))
    } else if (is.integer(v))
      rJava::.jcall(s, "V", "setInt", i, v[1])
    else if (is.numeric(v))
      rJava::.jcall(s, "V", "setDouble", i, as.double(v)[1])
    else
      rJava::.jcall(s, "V", "setString", i, as.character(v)[1])
  }
}

#' JDBC Driver for Drill database.
#'
#' @keywords internal
#' @export
setClass(
  Class = "DrillJDBCDriver",
  contains = "JDBCDriver"
)

#' Drill JDBC connection class.
#'
#' @export
#' @keywords internal
#' @export
setClass(
  Class = "DrillJDBCConnection",
  contains = "JDBCConnection"
)

#' Connect to Drill JDBC with your own connection string
#'
#' You should really use [drill_jdbc()] as it handles some cruft for
#' you, but you can specify the full JDBC connection string
#'
#' @md
#' @family Drill JDBC API
#' @param drv what you get back from [DrillJDBC()]
#' @param url your Drill connection strinfg
#' @param user,password username & password (leave as-is for no-auth)
#' @param ... additional `name=val` properties which will be set with Java's
#'        `SetProperty` method.
#' @export
setMethod(
  f = "dbConnect",
  signature = "DrillJDBCDriver",
  definition = function(drv, url, user='', password='', ...) {

    rJava::.jcall(
      "java/sql/DriverManager",
      "Ljava/sql/Connection;",
      "getConnection",
      as.character(url)[1],
      as.character(user)[1],
      as.character(password)[1],
      check = FALSE
    ) -> jc

    if (rJava::is.jnull(jc) && !rJava::is.jnull(drv@jdrv)) {
      # ok one reason for this to fail is its interaction with rJava's
      # class loader. In that case we try to load the driver directly.
      oex <- rJava::.jgetEx(TRUE)

      p <- rJava::.jnew("java/util/Properties")

      if (length(user)==1 && nchar(user)) {
        rJava::.jcall(p,"Ljava/lang/Object;","setProperty","user",user)
      }

      if (length(password)==1 && nchar(password)) {
        rJava::.jcall(p,"Ljava/lang/Object;","setProperty","password",password)
      }

      l <- list(...)
      if (length(names(l))) for (n in names(l)) {
        rJava::.jcall(p, "Ljava/lang/Object;", "setProperty", n, as.character(l[[n]]))
      }

      jc <- rJava::.jcall(drv@jdrv, "Ljava/sql/Connection;", "connect", as.character(url)[1], p)

    }

    .verify.JDBC.result(jc, "Unable to connect JDBC to ",url)

    new("DrillJDBCConnection", jc=jc, identifier.quote=drv@identifier.quote)

  },

  valueClass = "DrillJDBCConnection"

)

#' Drill JDBC dbDataType
#'
#' @param dbObj A \code{\linkS4class{DrillJDBCDriver}} object
#' @param obj Any R object
#' @param ... Extra optional parameters
#' @family Drill JDBC API
#' @export
setMethod(
  "dbDataType",
  "DrillJDBCConnection",
  function(dbObj, obj, ...) {
    if (is.integer(obj)) "INTEGER"
    else if (inherits(obj, "Date")) "DATE"
    else if (identical(class(obj), "times")) "TIME"
    else if (inherits(obj, "POSIXct")) "TIMESTAMP"
    else if (is.numeric(obj)) "DOUBLE"
    else "VARCHAR(255)"
  },
  valueClass = "character"
)


#' Drill's JDBC driver main class loader
#'
#' @family Drill JDBC API
#' @export
DrillJDBC <- function() {

  driverClass <-  "org.apache.drill.jdbc.Driver"

  ## expand all paths in the classPath
  classPath <- path.expand(unlist(strsplit(Sys.getenv("DRILL_JDBC_JAR"), .Platform$path.sep)))

  ## this is benign in that it's equivalent to rJava::.jaddClassPath if a JVM is running
  rJava::.jinit(classPath)

  rJava::.jaddClassPath(system.file("java", "RJDBC.jar", package="RJDBC"))
  rJava::.jaddClassPath(system.file("java", "slf4j-nop-1.7.25.jar", package = "sergeant"))

  if (nchar(driverClass) && rJava::is.jnull(rJava::.jfindClass(as.character(driverClass)[1]))) {
    stop("Cannot find JDBC driver class ",driverClass)
  }

  jdrv <- rJava::.jnew(driverClass, check=FALSE)

  rJava::.jcheck(TRUE)

  if (rJava::is.jnull(jdrv)) jdrv <- rJava::.jnull()

  new("DrillJDBCDriver", identifier.quote = "`", jdrv = jdrv)

}

#' Connect to Drill using JDBC
#'
#' The DRILL JDBC driver fully-qualified path must be placed in the
#' \code{DRILL_JDBC_JAR} environment variable. This is best done via \code{~/.Renviron}
#' for interactive work. e.g. \code{DRILL_JDBC_JAR=/usr/local/drill/jars/jdbc-driver/drill-jdbc-all-1.10.0.jar}
#'
#' [src_drill_jdbc()] wraps the JDBC [dbConnect()] connection instantation in
#' [dbplyr::src_dbi()] to return the equivalent of the REST driver's [src_drill()].
#'
#' @param nodes character vector of nodes. If more than one node, you can either have
#'              a single string with the comma-separated node:port pairs pre-made or
#'              pass in a character vector with multiple node:port strings and the
#'              function will make a comma-separated node string for you.
#' @param cluster_id the cluster id from \code{drill-override.conf}
#' @param schema an optional schema name to append to the JDBC connection string
#' @param use_zk are you connecting to a ZooKeeper instance (default: \code{TRUE}) or
#'               connecting to an individual DrillBit.
#' @family Drill JDBC API
#' @return a JDBC connection object
#' @references \url{https://drill.apache.org/docs/using-the-jdbc-driver/#using-the-jdbc-url-for-a-random-drillbit-connection}
#' @export
#' @examples \dontrun{
#' con <- drill_jdbc("localhost:2181", "main")
#' drill_query(con, "SELECT * FROM cp.`employee.json`")
#'
#' # you can also use the connection with RJDBC calls:
#' dbGetQuery(con, "SELECT * FROM cp.`employee.json`")
#'
#' # for local/embedded mode with default configuration info
#' con <- drill_jdbc("localhost:31010", use_zk=FALSE)
#' }
drill_jdbc <- function(nodes = "localhost:2181", cluster_id = NULL,
                       schema = NULL, use_zk = TRUE) {

  try_require("rJava")
  try_require("RJDBC")

  jar_path <- Sys.getenv("DRILL_JDBC_JAR")
  if (!file.exists(jar_path)) {
    stop(sprintf("Cannot locate DRILL JDBC JAR [%s]", jar_path))
  }

  drill_jdbc_drv <- DrillJDBC()

  conn_type <- "drillbit"
  if (use_zk) conn_type <- "zk"

  if (length(nodes) > 1) nodes <- paste0(nodes, collapse=",")

  conn_str <- sprintf("jdbc:drill:%s=%s", conn_type, nodes)

  if (!is.null(cluster_id)) {
    conn_str <- sprintf("%s%s", conn_str, sprintf("/drill/%s", cluster_id))
  }

  if (!is.null(schema)) conn_str <- sprintf("%s;%s", schema)

  message(sprintf("Using [%s]...", conn_str))

  dbConnect(drill_jdbc_drv, conn_str)

}

#' @rdname drill_jdbc
#' @family Drill JDBC API
#' @export
src_drill_jdbc <- function(nodes = "localhost:2181", cluster_id = NULL,
                           schema = NULL, use_zk = TRUE) {

  con <- drill_jdbc(nodes, cluster_id, schema, use_zk)
  src_sql("drill_jdbc", con)

}

#' @rdname drill_jdbc
#' @param src A Drill "src" created with \code{src_drill()}
#' @param from A Drill view or table specification
#' @param ... Extra parameters
#' @family Drill JDBC API
#' @export
tbl.src_drill_jdbc <- function(src, from, ...) {
  tbl_sql("drill_jdbc", src=src, from=from, ...)
}

#' Drill internals
#'
#' @rdname drill_jdbc_internals
#' @keywords internal
#' @export
db_data_type.DrillJDBCConnection <- function(con, fields, ...) {
  data_type <- function(x) {
    switch(
      class(x)[1],
      logical = "BOOLEAN",
      integer = "INTEGER",
      numeric = "DOUBLE",
      factor =  "CHARACTER",
      character = "CHARACTER",
      Date = "DATE",
      POSIXct = "TIMESTAMP",
      stop("Can't map type ", paste(class(x), collapse = "/"),
           " to a supported database type.")
    )
  }
  vapply(fields, data_type, character(1))
}

#' Drill internals
#'
#' @rdname drill_jdbc_internals
#' @keywords internal
#' @export
db_data_type.tbl_drill_jdbc <- db_data_type.DrillJDBCConnection

#' @rdname drill_jdbc_internals
#' @keywords internal
#' @export
setClass("DrillJDBCResult", representation("JDBCResult", jr="jobjRef", md="jobjRef", stat="jobjRef", pull="jobjRef"))

#' @rdname drill_jdbc_internals
#' @keywords internal
#' @export
setMethod(
  f = "dbSendQuery",
  signature = signature(conn="DrillJDBCConnection", statement="character"),
  definition = function(conn, statement, ..., list=NULL) {
    statement <- as.character(statement)[1L]
    ## if the statement starts with {call or {?= call then we use CallableStatement
    if (isTRUE(as.logical(grepl("^\\{(call|\\?= *call)", statement)))) {
      s <- rJava::.jcall(conn@jc, "Ljava/sql/CallableStatement;", "prepareCall", statement, check=FALSE)
      .verify.JDBC.result(s, "Unable to execute JDBC callable statement ",statement)
      if (length(list(...))) .fillStatementParameters(s, list(...))
      if (!is.null(list)) .fillStatementParameters(s, list)
      r <- rJava::.jcall(s, "Ljava/sql/ResultSet;", "executeQuery", check=FALSE)
      .verify.JDBC.result(r, "Unable to retrieve JDBC result set for ",statement)
    } else if (length(list(...)) || length(list)) { ## use prepared statements if there are additional arguments
      s <- rJava::.jcall(conn@jc, "Ljava/sql/PreparedStatement;", "prepareStatement", statement, check=FALSE)
      .verify.JDBC.result(s, "Unable to execute JDBC prepared statement ", statement)
      if (length(list(...))) .fillStatementParameters(s, list(...))
      if (!is.null(list)) .fillStatementParameters(s, list)
      r <- rJava::.jcall(s, "Ljava/sql/ResultSet;", "executeQuery", check=FALSE)
      .verify.JDBC.result(r, "Unable to retrieve JDBC result set for ",statement)
    } else { ## otherwise use a simple statement some DBs fail with the above)
      s <- rJava::.jcall(conn@jc, "Ljava/sql/Statement;", "createStatement")
      .verify.JDBC.result(s, "Unable to create simple JDBC statement ",statement)
      r <- rJava::.jcall(s, "Ljava/sql/ResultSet;", "executeQuery", as.character(statement)[1], check=FALSE)
      .verify.JDBC.result(r, "Unable to retrieve JDBC result set for ",statement)
    }
    md <- rJava::.jcall(r, "Ljava/sql/ResultSetMetaData;", "getMetaData", check=FALSE)
    .verify.JDBC.result(md, "Unable to retrieve JDBC result set meta data for ",statement, " in dbSendQuery")
    new("DrillJDBCResult", jr=r, md=md, stat=s, pull=rJava::.jnull())
  })

#' @rdname drill_jdbc_internals
#' @keywords internal
#' @export
sql_escape_ident.DrillJDBCConnection <- function(con, x) {
  ifelse(grepl(con@identifier.quote, x), sql_quote(x, ' '), sql_quote(x, con@identifier.quote))
}

#' @rdname drill_jdbc_internals
#' @keywords internal
#' @export
sql_translate_env.DrillJDBCConnection <- function(con) {

  x <- con

  dbplyr::sql_variant(

    scalar = dbplyr::sql_translator(
      .parent = dbplyr::base_scalar,
      `!=` = dbplyr::sql_infix("<>"),
      as.numeric = function(x) build_sql("CAST(", x, " AS DOUBLE)"),
      as.character = function(x) build_sql("CAST(", x, " AS CHARACTER)"),
      as.date = function(x) build_sql("CAST(", x, " AS DATE)"),
      as.posixct = function(x) build_sql("CAST(", x, " AS TIMESTAMP)"),
      as.logical = function(x) build_sql("CAST(", x, " AS BOOLEAN)"),
      date_part = function(x, y) build_sql("DATE_PART(", x, ",", y ,")"),
      grepl = function(x, y) build_sql("CONTAINS(", y, ", ", x, ")"),
      gsub = function(x, y, z) build_sql("REGEXP_REPLACE(", z, ", ", x, ",", y ,")"),
      str_replace = function(x, y, z) build_sql("REGEXP_REPLACE(", x, ", ", y, ",", z ,")"),
      trimws = function(x) build_sql("TRIM(both ' ' FROM ", x, ")"),
      cbrt = sql_prefix("CBRT", 1),
      degrees = sql_prefix("DEGREES", 1),
      e = sql_prefix("E", 0),
      row_number = sql_prefix("row_number", 0),
      lshift = sql_prefix("LSHIFT", 2),
      mod = sql_prefix("MOD", 2),
      age = sql_prefix("AGE", 1),
      negative = sql_prefix("NEGATIVE", 1),
      pi = sql_prefix("PI", 0),
      pow = sql_prefix("POW", 2),
      radians = sql_prefix("RADIANS", 1),
      rand = sql_prefix("RAND", 0),
      rshift = sql_prefix("RSHIFT", 2),
      trunc = sql_prefix("TRUNC", 2),
      contains = sql_prefix("CONTAINS", 2),
      convert_to = sql_prefix("CONVERT_TO", 2),
      convert_from = sql_prefix("CONVERT_FROM", 2),
      string_binary = sql_prefix("STRING_BINARY", 1),
      binary_string = sql_prefix("BINARY_STRING", 1),
      to_char = sql_prefix("TO_CHAR", 2),
      to_date = sql_prefix("TO_DATE", 2),
      to_number = sql_prefix("TO_NUMBER", 2),
      char_to_timestamp = sql_prefix("TO_TIMESTAMP", 2),
      double_to_timestamp = sql_prefix("TO_TIMESTAMP", 1),
      char_length = sql_prefix("CHAR_LENGTH", 1),
      flatten = sql_prefix("FLATTEN", 1),
      kvgen = sql_prefix("KVGEN", 1),
      repeated_count = sql_prefix("REPEATED_COUNT", 1),
      repeated_contains = sql_prefix("REPEATED_CONTAINS", 2),
      ilike = sql_prefix("ILIKE", 2),
      init_cap = sql_prefix("INIT_CAP", 1),
      length = sql_prefix("LENGTH", 1),
      lower = sql_prefix("LOWER", 1),
      str_to_lower = sql_prefix("LOWER", 1),
      tolower = sql_prefix("LOWER", 1),
      ltrim = sql_prefix("LTRIM", 2),
      nullif = sql_prefix("NULLIF", 2),
      position = function(x, y) build_sql("POSITION(", x, " IN ", y, ")"),
      regexp_replace = sql_prefix("REGEXP_REPLACE", 3),
      rtrim = sql_prefix("RTRIM", 2),
      rpad = sql_prefix("RPAD", 2),
      rpad_with = sql_prefix("RPAD", 3),
      lpad = sql_prefix("LPAD", 2),
      lpad_with = sql_prefix("LPAD", 3),
      strpos = sql_prefix("STRPOS", 2),
      substr = sql_prefix("SUBSTR", 3),
      str_sub = sql_prefix("SUBSTR", 3),
      trim = function(x, y, z) build_sql("TRIM(", x, " ", y, " FROM ", z, ")"),
      upper = sql_prefix("UPPER", 1),
      str_to_upper = sql_prefix("UPPER", 1),
      toupper = sql_prefix("UPPER", 1)
    ),

    aggregate = dbplyr::sql_translator(
      .parent = dbplyr::base_agg,
      n = function() dbplyr::sql("COUNT(*)"),
      cor = dbplyr::sql_prefix("CORR"),
      cov = dbplyr::sql_prefix("COVAR_SAMP"),
      sd =  dbplyr::sql_prefix("STDDEV_SAMP"),
      var = dbplyr::sql_prefix("VAR_SAMP"),
      n_distinct = function(x) {
        dbplyr::build_sql(dbplyr::sql("COUNT(DISTINCT "), x, dbplyr::sql(")"))
      }
    ),

    window = dbplyr::sql_translator(
      .parent = dbplyr::base_win,
      n = function() { dbplyr::win_over(dbplyr::sql("count(*)"),
                                        partition = dbplyr::win_current_group()) },
      cor = dbplyr::win_recycled("corr"),
      cov = dbplyr::win_recycled("covar_samp"),
      sd =  dbplyr::win_recycled("stddev_samp"),
      var = dbplyr::win_recycled("var_samp"),
      all = dbplyr::win_recycled("bool_and"),
      any = dbplyr::win_recycled("bool_or")
    )

  )

}

#' src tbls
#'
#' "SHOW DATABASES"
#'
#' @rdname drill_jdbc_internals
#' @family Drill JDBC API
#' @keywords internal
#' @param x x
#' @export
src_tbls.src_dbi <- function(x) {
  tmp <- dbGetQuery(x$con, "SHOW DATABASES")
  paste0(unlist(tmp$SCHEMA_NAME, use.names=FALSE), collapse=", ")
}

M R/query.r => R/query.r +5 -3
@@ 17,10 17,11 @@
#' @references \href{https://drill.apache.org/docs/}{Drill documentation}
#' @family Dill direct REST API Interface
#' @export
#' @examples \dontrun{
#' @examples
#' try({
#' drill_connection() %>%
#'   drill_query("SELECT * FROM cp.`employee.json` limit 5")
#' }
#' }, silent=TRUE)
drill_query <- function(drill_con, query, uplift=TRUE, .progress=interactive()) {

  query <- trimws(query)


@@ 30,8 31,9 @@ drill_query <- function(drill_con, query, uplift=TRUE, .progress=interactive()) 

    try_require("rJava")
    try_require("RJDBC")
    try_require("sergeant.caffeinated")

    dplyr::tbl_df(RJDBC::dbGetQuery(drill_con, query))
    dplyr::tbl_df(dbGetQuery(drill_con, query))

  } else {


M R/schemas.R => R/schemas.R +3 -2
@@ 30,9 30,10 @@ drill_use <- function(drill_con, schema_name) {
#' @export
#' @references \href{https://drill.apache.org/docs/}{Drill documentation}
#' @family Dill direct REST API Interface
#' @examples \dontrun{
#' @examples
#' try({
#' drill_connection() %>% drill_show_files("dfs.tmp")
#' }
#' }, silent=TRUE)
drill_show_files <- function(drill_con, schema_spec) {
  query <- sprintf("SHOW FILES IN %s", schema_spec)
  drill_query(drill_con, query, uplift=TRUE) %>%

M R/sergeant-package.r => R/sergeant-package.r +6 -2
@@ 23,7 23,7 @@
#' network or between nodes. Drill uses ZooKeeper to maintain cluster membership and health
#' check information.
#'
#' Methods are provided to work with Drill via the native JDBC & REST APIs along with R
#' Methods are provided to work with Drill via the REST APIs along with R
#' \code{DBI} and \code{dplyr} interfaces.
#'
#' @name sergeant


@@ 40,7 40,6 @@
#' @import utils
#' @import DBI methods
#' @importFrom scales comma
#' @importClassesFrom RJDBC JDBCDriver JDBCConnection JDBCResult
NULL




@@ 57,3 56,8 @@ NULL
#' @export
#' @rdname sergeant-exports
NULL

#' @name tbl
#' @export
#' @rdname sergeant-exports
NULL

M R/sergeant.r => R/sergeant.r +3 -2
@@ 41,9 41,10 @@ drill_connection <- function(host=Sys.getenv("DRILL_HOST", "localhost"),
#' @param drill_con drill server connection object setup by \code{drill_connection()}
#' @export
#' @family Dill direct REST API Interface
#' @examples \dontrun{
#' @examples
#' try({
#' drill_connection() %>% drill_active()
#' }
#' }, silent=TRUE)
drill_active <- function(drill_con) {
  drill_server <- make_server(drill_con)
  !is.null(s_head(drill_server, httr::timeout(2))$result)

M R/utils.r => R/utils.r +0 -11
@@ 1,14 1,3 @@
.verify.JDBC.result <- function (result, ...) {
  if (rJava::is.jnull(result)) {
    x <- rJava::.jgetEx(TRUE)
    if (rJava::is.jnull(x))
      stop(...)
    else
      stop(...," (",rJava::.jcall(x, "S", "getMessage"),")")
  }
}


try_require <- function(package, fun) {
  if (requireNamespace(package, quietly = TRUE)) {
    library(package, character.only = TRUE)

D R/zzz.R => R/zzz.R +0 -3
@@ 1,3 0,0 @@
# .onLoad <- function(libname, pkgname) {
#   if (requireNamespace("rJava")) rJava::.jpackage(pkgname, lib.loc = libname)
# }

M README.Rmd => README.Rmd +23 -120
@@ 1,12 1,14 @@
---
output: github_document
editor_options: 
  chunk_output_type: console
---
<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r, echo = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  comment = "##",
  fig.path = "README-"
)
```


@@ 22,19 24,19 @@ Tools to Transform and Query Data with 'Apache' 'Drill'

## NOTE

Version 0.7.0 re-introduces an `RJDBC` (and, as such, an `rJava` depedency). If you desire this to be put into a sibling package, [cast your vote](https://github.com/hrbrmstr/sergeant/issues/20).
Version 0.7.0 splits off the JDBC interface into a separate package `sergeant.caffeinated` ([GitLab](https://gitlab.com/hrbrmstr/sergeant-caffeinated); [GitHub](https://github.com/hrbrmstr/sergeant-caffeinated)).

## Description

Drill + `sergeant` is (IMO) a nice alternative to Spark + `sparklyr` if you don't need the ML components of Spark (i.e. just need to query "big data" sources, need to interface with parquet, need to combine disparate data source types — json, csv, parquet, rdbms - for aggregation, etc). Drill also has support for spatial queries.
Drill + `sergeant` is (IMO) a streamlined alternative to Spark + `sparklyr` if you don't need the ML components of Spark (i.e. just need to query "big data" sources, need to interface with parquet, need to combine disparate data source types — json, csv, parquet, rdbms - for aggregation, etc). Drill also has support for spatial queries.

I find writing SQL queries to parquet files with Drill on a local linux or macOS workstation to be more performant than doing the data ingestion work with R (especially for large or disperate data sets). I also work with many tiny JSON files on a daily basis and Drill makes it much easier to do so. YMMV.
Using Drill SQL queries that reference parquet files on a local linux or macOS workstation can often be more performant than doing the same data ingestion & wrangling work with R (especially for large or disperate data sets). Drill can often help further streaming workflows that infolve wrangling many tiny JSON files on a daily basis.

You can download Drill from <https://drill.apache.org/download/> (use "Direct File Download"). I use `/usr/local/drill` as the install directory. `drill-embedded` is a super-easy way to get started playing with Drill on a single workstation and most of my workflows can get by using Drill this way. If there is sufficient desire for an automated downloader and a way to start the `drill-embedded` server from within R, please file an issue.
Drill can be obtained from <https://drill.apache.org/download/> (use "Direct File Download"). Drill can also be installed via [Docker](https://drill.apache.org/docs/running-drill-on-docker/). For local installs on Unix-like systems, a common/suggestion location for the Drill directory is  `/usr/local/drill` as the install directory. 

There are a few convenience wrappers for various informational SQL queries (like `drill_version()`). Please file an PR if you add more.
Drill embedded (started using the `$DRILL_BASE_DIR/bin/drill-embedded` script) is a super-easy way to get started playing with Drill on a single workstation and most of many workflows can "get by" using Drill this way.

The package has been written with retrieval of rectangular data sources in mind. If you need/want a version of `drill_query()` that will enable returning of non-rectangular data (which is possible with Drill) then please file an issue.
There are a few convenience wrappers for various informational SQL queries (like `drill_version()`). Please file an PR if you add more.

Some of the more "controlling vs data ops" REST API functions aren't implemented. Please file a PR if you need those.



@@ 45,22 47,11 @@ The following functions are implemented:
- A "just enough" feature complete R `DBI` driver has been implemented using the Drill REST API, mostly to facilitate the `dplyr` interface. Use the `RJDBC` driver interface if you need more `DBI` functionality.
- This also means that SQL functions unique to Drill have also been "implemented" (i.e. made accessible to the `dplyr` interface). If you have custom Drill SQL functions that need to be implemented please file an issue on GitHub. Many should work without it, but some may require a custom interface. 

**`DBI`** (RJDBC)

- `drill_jdbc`:	Connect to Drill using JDBC, enabling use of said idioms. See `RJDBC` for more info.
- NOTE: The DRILL JDBC driver fully-qualified path must be placed in the `DRILL_JDBC_JAR` environment variable. This is best done via `~/.Renviron` for interactive work. i.e. `DRILL_JDBC_JAR=/usr/local/drill/jars/drill-jdbc-all-1.14.0.jar`

**`dplyr`**: (REST)

- `src_drill`: Connect to Drill (using dplyr) + supporting functions

See `dplyr` for the `dplyr` operations (light testing shows they work in basic SQL use-cases but Drill's SQL engine has issues with more complex queries).

**`dplyr`**: (RJDBC)
- `src_drill`: Connect to Drill (using `dplyr`) + supporting functions

- `src_drill_jdbc`: Connect to Drill (using dplyr & RJDBC) + supporting functions

See `dplyr` for the `dplyr` operations (light testing shows they work in basic SQL use-cases but Drill's SQL engine has issues with more complex queries).
Note that a number of Drill SQL functions have been mapped to R functions (e.g. `grepl`) to make it easier to transition from non-database-backed SQL ops to Drill. See the help on `drill_custom_functions` for more info on these helper Drill custom function mappings.

**Drill APIs**:



@@ 86,7 77,7 @@ See `dplyr` for the `dplyr` operations (light testing shows they work in basic S
- `drill_use`:	Change to a particular schema.
- `drill_version`:	Identify the version of Drill running

### Installation
## Installation

```{r eval=FALSE}
devtools::install_github("hrbrmstr/sergeant")


@@ 96,45 87,23 @@ devtools::install_github("hrbrmstr/sergeant")
options(width=120)
```

### Experimental `dplyr` interface

```{r message=FALSE}
library(sergeant)

```
## Usage

```{r echo=TRUE, eval=FALSE}
ds <- src_drill("localhost")  # use localhost if running standalone on same system otherwise the host or IP of your Drill server
ds
```
### `dplyr` interface

```{r echo=FALSE, eval=TRUE}
ds <- src_drill("bigd")
ds
```
```{r dplyr-01, message=FALSE}
library(sergeant)
library(tidyverse)

```{r message=FALSE}
# use localhost if running standalone on same system otherwise the host or IP of your Drill server
ds <- src_drill("localhost")  #ds
db <- tbl(ds, "cp.`employee.json`") 

# without `collect()`:
count(db, gender, marital_status)

# ^^ gets translated to:
# 
# SELECT *
# FROM (SELECT  gender ,  marital_status , COUNT(*) AS  n 
#       FROM  cp.`employee.json` 
#       GROUP BY  gender ,  marital_status )  govketbhqb 
# LIMIT 1000

count(db, gender, marital_status) %>% collect()

# ^^ gets translated to:
# 
# SELECT  gender ,  marital_status , COUNT(*) AS  n 
# FROM  cp.`employee.json` 
# GROUP BY  gender ,  marital_status 

group_by(db, position_title) %>% 
  count(gender) -> tmp2



@@ 145,24 114,8 @@ group_by(db, position_title) %>%
  collect() %>% 
  select(Title=position_title, Gender=full_desc, Count=n)

# ^^ gets translated to:
# 
# SELECT  position_title ,  gender ,  n ,
#         CASE WHEN ( gender  = 'F') THEN ('Female') ELSE ('Male') END AS  full_desc 
# FROM (SELECT  position_title ,  gender , COUNT(*) AS  n 
#       FROM  cp.`employee.json` 
#       GROUP BY  position_title ,  gender )  dcyuypuypb 

arrange(db, desc(employee_id)) %>% print(n=20)

# ^^ gets translated to:
# 
# SELECT *
# FROM (SELECT *
#       FROM  cp.`employee.json` 
#       ORDER BY  employee_id  DESC)  lvpxoaejbc 
# LIMIT 5

mutate(db, position_title=tolower(position_title)) %>%
  mutate(salary=as.numeric(salary)) %>% 
  mutate(gender=ifelse(gender=="F", "Female", "Male")) %>%


@@ 170,44 123,19 @@ mutate(db, position_title=tolower(position_title)) %>%
  group_by(supervisor_id) %>% 
  summarise(underlings_count=n()) %>% 
  collect()

# ^^ gets translated to:
# 
# SELECT  supervisor_id , COUNT(*) AS  underlings_count 
# FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  position_title ,  store_id ,  department_id ,  birth_date ,  hire_date ,  salary ,  supervisor_id ,  education_level ,  gender ,  management_role , CASE WHEN ( marital_status  = 'S') THEN ('Single') ELSE ('Married') END AS  marital_status 
#       FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  position_title ,  store_id ,  department_id ,  birth_date ,  hire_date ,  salary ,  supervisor_id ,  education_level ,  marital_status ,  management_role , CASE WHEN ( gender  = 'F') THEN ('Female') ELSE ('Male') END AS  gender 
#             FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  position_title ,  store_id ,  department_id ,  birth_date ,  hire_date ,  supervisor_id ,  education_level ,  marital_status ,  gender ,  management_role , CAST( salary  AS DOUBLE) AS  salary 
#                   FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  store_id ,  department_id ,  birth_date ,  hire_date ,  salary ,  supervisor_id ,  education_level ,  marital_status ,  gender ,  management_role , LOWER( position_title ) AS  position_title 
#                         FROM  cp.`employee.json` )  cnjsqxeick )  bnbnjrubna )  wavfmhkczv )  zaxeyyicxo 
# GROUP BY  supervisor_id 
```

### Usage

```{r message=FALSE}
library(sergeant)

# current verison
packageVersion("sergeant")
### REST API

```
```{r eval=FALSE}
```{r rest-01}
dc <- drill_connection("localhost") 
```
```{r echo=FALSE}
dc <- drill_connection("bigd") 
```
```{r message=FALSE}

drill_active(dc)

drill_version(dc)

drill_storage(dc)$name
```

Working with the built-in JSON data sets:

```{r}
drill_query(dc, "SELECT * FROM cp.`employee.json` limit 100")

drill_query(dc, "SELECT COUNT(gender) AS gender FROM cp.`employee.json` GROUP BY gender")


@@ 229,7 157,7 @@ Including multiple parquet files in different directories (note the wildcard sup
drill_query(dc, "SELECT * FROM dfs.`/usr/local/drill/sample-data/nations*/nations*.parquet` LIMIT 5")
```

### A preview of the built-in support for spatial ops
### Drill has built-in support for spatial ops

Via: <https://github.com/k255/drill-gis>



@@ 248,31 176,6 @@ select columns[2] as city, columns[4] as lon, columns[3] as lat
            )
")
```

### JDBC

```{r}
library(RJDBC)

# Use this if connecting to a cluster with zookeeper
# con <- drill_jdbc("drill-node:2181", "drillbits1") 

# Use the following if running drill-embedded
```
```{r eval=FALSE}
con <- drill_jdbc("localhost:31010", use_zk=FALSE)
```
```{r echo=FALSE}
con <- drill_jdbc("bigd:31010", use_zk=FALSE)
```
```{r message=FALSE}
drill_query(con, "SELECT * FROM cp.`employee.json`")

# but it can work via JDBC function calls, too
dbGetQuery(con, "SELECT * FROM cp.`employee.json`") %>% 
  tibble::as_tibble()
```

### Test Results

```{r}

M README.md => README.md +339 -379
@@ 1,138 1,149 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1248912.svg)](https://doi.org/10.5281/zenodo.1248912) 
[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/sergeant.svg?branch=master)](https://travis-ci.org/hrbrmstr/sergeant) 
[![Coverage Status](https://codecov.io/gh/hrbrmstr/sergeant/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/sergeant)
[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/sergeant)](https://cran.r-project.org/package=sergeant)

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1248912.svg)](https://doi.org/10.5281/zenodo.1248912)
[![Travis-CI Build
Status](https://travis-ci.org/hrbrmstr/sergeant.svg?branch=master)](https://travis-ci.org/hrbrmstr/sergeant)
[![Coverage
Status](https://codecov.io/gh/hrbrmstr/sergeant/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/sergeant)
[![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/sergeant)](https://cran.r-project.org/package=sergeant)

# 💂 sergeant

Tools to Transform and Query Data with 'Apache' 'Drill'
Tools to Transform and Query Data with ‘Apache’ ‘Drill’

## NOTE

Version 0.7.0 re-introduces an `RJDBC` (and, as such, an `rJava` depedency). If you desire this to be put into a sibling package, [cast your vote](https://github.com/hrbrmstr/sergeant/issues/20).
Version 0.7.0 splits off the JDBC interface into a separate package
`sergeant.caffeinated`
([GitLab](https://gitlab.com/hrbrmstr/sergeant-caffeinated);
[GitHub](https://github.com/hrbrmstr/sergeant-caffeinated)).

## Description

Drill + `sergeant` is (IMO) a nice alternative to Spark + `sparklyr` if you don't need the ML components of Spark (i.e. just need to query "big data" sources, need to interface with parquet, need to combine disparate data source types — json, csv, parquet, rdbms - for aggregation, etc). Drill also has support for spatial queries.
Drill + `sergeant` is (IMO) a streamlined alternative to Spark +
`sparklyr` if you don’t need the ML components of Spark (i.e. just need
to query “big data” sources, need to interface with parquet, need to
combine disparate data source types — json, csv, parquet, rdbms - for
aggregation, etc). Drill also has support for spatial queries.

I find writing SQL queries to parquet files with Drill on a local linux or macOS workstation to be more performant than doing the data ingestion work with R (especially for large or disperate data sets). I also work with many tiny JSON files on a daily basis and Drill makes it much easier to do so. YMMV.
Using Drill SQL queries that reference parquet files on a local linux or
macOS workstation can often be more performant than doing the same data
ingestion & wrangling work with R (especially for large or disperate
data sets). Drill can often help further streaming workflows that
infolve wrangling many tiny JSON files on a daily basis.

You can download Drill from <https://drill.apache.org/download/> (use "Direct File Download"). I use `/usr/local/drill` as the install directory. `drill-embedded` is a super-easy way to get started playing with Drill on a single workstation and most of my workflows can get by using Drill this way. If there is sufficient desire for an automated downloader and a way to start the `drill-embedded` server from within R, please file an issue.
Drill can be obtained from <https://drill.apache.org/download/> (use
“Direct File Download”). Drill can also be installed via
[Docker](https://drill.apache.org/docs/running-drill-on-docker/). For
local installs on Unix-like systems, a common/suggestion location for
the Drill directory is `/usr/local/drill` as the install directory.

There are a few convenience wrappers for various informational SQL queries (like `drill_version()`). Please file an PR if you add more.
Drill embedded (started using the `$DRILL_BASE_DIR/bin/drill-embedded`
script) is a super-easy way to get started playing with Drill on a
single workstation and most of many workflows can “get by” using Drill
this way.

The package has been written with retrieval of rectangular data sources in mind. If you need/want a version of `drill_query()` that will enable returning of non-rectangular data (which is possible with Drill) then please file an issue.
There are a few convenience wrappers for various informational SQL
queries (like `drill_version()`). Please file an PR if you add more.

Some of the more "controlling vs data ops" REST API functions aren't implemented. Please file a PR if you need those.
Some of the more “controlling vs data ops” REST API functions aren’t
implemented. Please file a PR if you need those.

The following functions are implemented:

**`DBI`** (REST)

- A "just enough" feature complete R `DBI` driver has been implemented using the Drill REST API, mostly to facilitate the `dplyr` interface. Use the `RJDBC` driver interface if you need more `DBI` functionality.
- This also means that SQL functions unique to Drill have also been "implemented" (i.e. made accessible to the `dplyr` interface). If you have custom Drill SQL functions that need to be implemented please file an issue on GitHub. Many should work without it, but some may require a custom interface. 

**`DBI`** (RJDBC)

- `drill_jdbc`:	Connect to Drill using JDBC, enabling use of said idioms. See `RJDBC` for more info.
- NOTE: The DRILL JDBC driver fully-qualified path must be placed in the `DRILL_JDBC_JAR` environment variable. This is best done via `~/.Renviron` for interactive work. i.e. `DRILL_JDBC_JAR=/usr/local/drill/jars/drill-jdbc-all-1.14.0.jar`
  - A “just enough” feature complete R `DBI` driver has been implemented
    using the Drill REST API, mostly to facilitate the `dplyr`
    interface. Use the `RJDBC` driver interface if you need more `DBI`
    functionality.
  - This also means that SQL functions unique to Drill have also been
    “implemented” (i.e. made accessible to the `dplyr` interface). If
    you have custom Drill SQL functions that need to be implemented
    please file an issue on GitHub. Many should work without it, but
    some may require a custom interface.

**`dplyr`**: (REST)

- `src_drill`: Connect to Drill (using dplyr) + supporting functions

See `dplyr` for the `dplyr` operations (light testing shows they work in basic SQL use-cases but Drill's SQL engine has issues with more complex queries).

**`dplyr`**: (RJDBC)
  - `src_drill`: Connect to Drill (using `dplyr`) + supporting functions

- `src_drill_jdbc`: Connect to Drill (using dplyr & RJDBC) + supporting functions

See `dplyr` for the `dplyr` operations (light testing shows they work in basic SQL use-cases but Drill's SQL engine has issues with more complex queries).
Note that a number of Drill SQL functions have been mapped to R
functions (e.g. `grepl`) to make it easier to transition from
non-database-backed SQL ops to Drill. See the help on
`drill_custom_functions` for more info on these helper Drill custom
function mappings.

**Drill APIs**:

-   `drill_connection`: Setup parameters for a Drill server/cluster connection
-   `drill_active`: Test whether Drill HTTP REST API server is up
-   `drill_cancel`: Cancel the query that has the given queryid
-   `drill_jdbc`: Connect to Drill using JDBC
-   `drill_metrics`: Get the current memory metrics
-   `drill_options`: List the name, default, and data type of the system and session options
-   `drill_profile`: Get the profile of the query that has the given query id
-   `drill_profiles`: Get the profiles of running and completed queries
-   `drill_query`: Submit a query and return results
-   `drill_set`: Set Drill SYSTEM or SESSION options
-   `drill_settings_reset`: Changes (optionally, all) session settings back to system defaults
-   `drill_show_files`: Show files in a file system schema.
-   `drill_show_schemas`: Returns a list of available schemas.
-   `drill_stats`: Get Drillbit information, such as ports numbers
-   `drill_status`: Get the status of Drill
-   `drill_storage`: Get the list of storage plugin names and configurations
-   `drill_system_reset`: Changes (optionally, all) system settings back to system defaults
-   `drill_threads`: Get information about threads
-   `drill_uplift`: Turn a columnar query results into a type-converted tbl
-   `drill_use`: Change to a particular schema.
-   `drill_version`: Identify the version of Drill running

### Installation
  - `drill_connection`: Setup parameters for a Drill server/cluster
    connection
  - `drill_active`: Test whether Drill HTTP REST API server is up
  - `drill_cancel`: Cancel the query that has the given queryid
  - `drill_jdbc`: Connect to Drill using JDBC
  - `drill_metrics`: Get the current memory metrics
  - `drill_options`: List the name, default, and data type of the system
    and session options
  - `drill_profile`: Get the profile of the query that has the given
    query id
  - `drill_profiles`: Get the profiles of running and completed queries
  - `drill_query`: Submit a query and return results
  - `drill_set`: Set Drill SYSTEM or SESSION options
  - `drill_settings_reset`: Changes (optionally, all) session settings
    back to system defaults
  - `drill_show_files`: Show files in a file system schema.
  - `drill_show_schemas`: Returns a list of available schemas.
  - `drill_stats`: Get Drillbit information, such as ports numbers
  - `drill_status`: Get the status of Drill
  - `drill_storage`: Get the list of storage plugin names and
    configurations
  - `drill_system_reset`: Changes (optionally, all) system settings back
    to system defaults
  - `drill_threads`: Get information about threads
  - `drill_uplift`: Turn a columnar query results into a type-converted
    tbl
  - `drill_use`: Change to a particular schema.
  - `drill_version`: Identify the version of Drill running

## Installation

``` r
devtools::install_github("hrbrmstr/sergeant")
```

### Experimental `dplyr` interface
## Usage

``` r
library(sergeant)
```
### `dplyr` interface

``` r
ds <- src_drill("localhost")  # use localhost if running standalone on same system otherwise the host or IP of your Drill server
ds
```

    #> src:  DrillConnection
    #> tbls: INFORMATION_SCHEMA, cp.default, dfs.d, dfs.default, dfs.h, dfs.natexp, dfs.p, dfs.root, dfs.tmp, sys
library(sergeant)
library(tidyverse)

``` r
# use localhost if running standalone on same system otherwise the host or IP of your Drill server
ds <- src_drill("localhost")  #ds
db <- tbl(ds, "cp.`employee.json`") 

# without `collect()`:
count(db, gender, marital_status)
#> # Source:   lazy query [?? x 3]
#> # Database: DrillConnection
#> # Groups:   gender
#>   marital_status gender     n
#>            <chr>  <chr> <int>
#> 1              S      F   297
#> 2              M      M   278
#> 3              S      M   276
#> 4              M      F   304

# ^^ gets translated to:
# 
# SELECT *
# FROM (SELECT  gender ,  marital_status , COUNT(*) AS  n 
#       FROM  cp.`employee.json` 
#       GROUP BY  gender ,  marital_status )  govketbhqb 
# LIMIT 1000
## # Source:   lazy query [?? x 3]
## # Database: DrillConnection
## # Groups:   gender
##   marital_status gender     n
##   <chr>          <chr>  <int>
## 1 S              F        297
## 2 M              M        278
## 3 S              M        276
## 4 M              F        304

count(db, gender, marital_status) %>% collect()
#> # A tibble: 4 x 3
#> # Groups:   gender [2]
#>   marital_status gender     n
#> *          <chr>  <chr> <int>
#> 1              S      F   297
#> 2              M      M   278
#> 3              S      M   276
#> 4              M      F   304

# ^^ gets translated to:
# 
# SELECT  gender ,  marital_status , COUNT(*) AS  n 
# FROM  cp.`employee.json` 
# GROUP BY  gender ,  marital_status 
## # A tibble: 4 x 3
## # Groups:   gender [2]
##   marital_status gender     n
## * <chr>          <chr>  <int>
## 1 S              F        297
## 2 M              M        278
## 3 S              M        276
## 4 M              F        304

group_by(db, position_title) %>% 
  count(gender) -> tmp2


@@ 143,65 154,49 @@ group_by(db, position_title) %>%
  mutate(full_desc=ifelse(gender=="F", "Female", "Male")) %>% 
  collect() %>% 
  select(Title=position_title, Gender=full_desc, Count=n)
#> # A tibble: 30 x 3
#>                     Title Gender Count
#>  *                  <chr>  <chr> <int>
#>  1              President Female     1
#>  2     VP Country Manager   Male     3
#>  3     VP Country Manager Female     3
#>  4 VP Information Systems Female     1
#>  5     VP Human Resources Female     1
#>  6          Store Manager Female    13
#>  7             VP Finance   Male     1
#>  8          Store Manager   Male    11
#>  9           HQ Marketing Female     2
#> 10 HQ Information Systems Female     4
#> # ... with 20 more rows

# ^^ gets translated to:
# 
# SELECT  position_title ,  gender ,  n ,
#         CASE WHEN ( gender  = 'F') THEN ('Female') ELSE ('Male') END AS  full_desc 
# FROM (SELECT  position_title ,  gender , COUNT(*) AS  n 
#       FROM  cp.`employee.json` 
#       GROUP BY  position_title ,  gender )  dcyuypuypb 
## # A tibble: 30 x 3
##    Title                  Gender Count
##  * <chr>                  <chr>  <int>
##  1 President              Female     1
##  2 VP Country Manager     Male       3
##  3 VP Country Manager     Female     3
##  4 VP Information Systems Female     1
##  5 VP Human Resources     Female     1
##  6 Store Manager          Female    13
##  7 VP Finance             Male       1
##  8 Store Manager          Male      11
##  9 HQ Marketing           Female     2
## 10 HQ Information Systems Female     4
## # ... with 20 more rows

arrange(db, desc(employee_id)) %>% print(n=20)
#> # Source:     table<cp.`employee.json`> [?? x 16]
#> # Database:   DrillConnection
#> # Ordered by: desc(employee_id)
#>    store_id gender department_id birth_date supervisor_id  last_name          position_title  hire_date
#>       <int>  <chr>         <int>     <date>         <int>      <chr>                   <chr>     <dttm>
#>  1       18      F            18 1914-02-02          1140      Stand Store Temporary Stocker 1998-01-01
#>  2       18      M            18 1914-02-02          1140    Burnham Store Temporary Stocker 1998-01-01
#>  3       18      F            18 1914-02-02          1139  Doolittle Store Temporary Stocker 1998-01-01
#>  4       18      M            18 1914-02-02          1139     Pirnie Store Temporary Stocker 1998-01-01
#>  5       18      M            17 1914-02-02          1140     Younce Store Permanent Stocker 1998-01-01
#>  6       18      F            17 1914-02-02          1140    Biltoft Store Permanent Stocker 1998-01-01
#>  7       18      M            17 1914-02-02          1139   Detwiler Store Permanent Stocker 1998-01-01
#>  8       18      F            17 1914-02-02          1139     Ciruli Store Permanent Stocker 1998-01-01
#>  9       18      F            16 1914-02-02          1140     Bishop Store Temporary Checker 1998-01-01
#> 10       18      F            16 1914-02-02          1140  Cutwright Store Temporary Checker 1998-01-01
#> 11       18      F            16 1914-02-02          1139   Anderson Store Temporary Checker 1998-01-01
#> 12       18      F            16 1914-02-02          1139  Swartwood Store Temporary Checker 1998-01-01
#> 13       18      M            15 1914-02-02          1140 Curtsinger Store Permanent Checker 1998-01-01
#> 14       18      F            15 1914-02-02          1140      Quick Store Permanent Checker 1998-01-01
#> 15       18      M            15 1914-02-02          1139      Souza Store Permanent Checker 1998-01-01
#> 16       18      M            15 1914-02-02          1139   Compagno Store Permanent Checker 1998-01-01
#> 17       18      M            11 1961-09-24          1139  Jaramillo  Store Shift Supervisor 1998-01-01
#> 18       18      M            11 1972-05-12            17     Belsey Store Assistant Manager 1998-01-01
#> 19       12      M            18 1914-02-02          1069    Eichorn Store Temporary Stocker 1998-01-01
#> 20       12      F            18 1914-02-02          1069  Geiermann Store Temporary Stocker 1998-01-01
#> # ... with more rows, and 8 more variables: management_role <chr>, salary <dbl>, marital_status <chr>, full_name <chr>,
#> #   employee_id <int>, education_level <chr>, first_name <chr>, position_id <int>

# ^^ gets translated to:
# 
# SELECT *
# FROM (SELECT *
#       FROM  cp.`employee.json` 
#       ORDER BY  employee_id  DESC)  lvpxoaejbc 
# LIMIT 5
## # Source:     table<cp.`employee.json`> [?? x 20]
## # Database:   DrillConnection
## # Ordered by: desc(employee_id)
##    store_id gender department_id birth_date supervisor_id last_name  position_title hire_date           management_role
##       <int> <chr>          <int> <date>             <int> <chr>      <chr>          <dttm>              <chr>          
##  1       18 F                 18 1914-02-02          1140 Stand      Store Tempora… 1998-01-01 00:00:00 Store Temp Sta…
##  2       18 M                 18 1914-02-02          1140 Burnham    Store Tempora… 1998-01-01 00:00:00 Store Temp Sta…
##  3       18 F                 18 1914-02-02          1139 Doolittle  Store Tempora… 1998-01-01 00:00:00 Store Temp Sta…
##  4       18 M                 18 1914-02-02          1139 Pirnie     Store Tempora… 1998-01-01 00:00:00 Store Temp Sta…
##  5       18 M                 17 1914-02-02          1140 Younce     Store Permane… 1998-01-01 00:00:00 Store Full Tim…
##  6       18 F                 17 1914-02-02          1140 Biltoft    Store Permane… 1998-01-01 00:00:00 Store Full Tim…
##  7       18 M                 17 1914-02-02          1139 Detwiler   Store Permane… 1998-01-01 00:00:00 Store Full Tim…
##  8       18 F                 17 1914-02-02          1139 Ciruli     Store Permane… 1998-01-01 00:00:00 Store Full Tim…
##  9       18 F                 16 1914-02-02          1140 Bishop     Store Tempora… 1998-01-01 00:00:00 Store Full Tim…
## 10       18 F                 16 1914-02-02          1140 Cutwright  Store Tempora… 1998-01-01 00:00:00 Store Full Tim…
## 11       18 F                 16 1914-02-02          1139 Anderson   Store Tempora… 1998-01-01 00:00:00 Store Full Tim…
## 12       18 F                 16 1914-02-02          1139 Swartwood  Store Tempora… 1998-01-01 00:00:00 Store Full Tim…
## 13       18 M                 15 1914-02-02          1140 Curtsinger Store Permane… 1998-01-01 00:00:00 Store Full Tim…
## 14       18 F                 15 1914-02-02          1140 Quick      Store Permane… 1998-01-01 00:00:00 Store Full Tim…
## 15       18 M                 15 1914-02-02          1139 Souza      Store Permane… 1998-01-01 00:00:00 Store Full Tim…
## 16       18 M                 15 1914-02-02          1139 Compagno   Store Permane… 1998-01-01 00:00:00 Store Full Tim…
## 17       18 M                 11 1961-09-24          1139 Jaramillo  Store Shift S… 1998-01-01 00:00:00 Store Manageme…
## 18       18 M                 11 1972-05-12            17 Belsey     Store Assista… 1998-01-01 00:00:00 Store Manageme…
## 19       12 M                 18 1914-02-02          1069 Eichorn    Store Tempora… 1998-01-01 00:00:00 Store Temp Sta…
## 20       12 F                 18 1914-02-02          1069 Geiermann  Store Tempora… 1998-01-01 00:00:00 Store Temp Sta…
## # ... with more rows, and 7 more variables: salary <dbl>, marital_status <chr>, full_name <chr>, employee_id <int>,
## #   education_level <chr>, first_name <chr>, position_id <int>

mutate(db, position_title=tolower(position_title)) %>%
  mutate(salary=as.numeric(salary)) %>% 


@@ 210,181 205,160 @@ mutate(db, position_title=tolower(position_title)) %>%
  group_by(supervisor_id) %>% 
  summarise(underlings_count=n()) %>% 
  collect()
#> # A tibble: 112 x 2
#>    supervisor_id underlings_count
#>  *         <int>            <int>
#>  1             0                1
#>  2             1                7
#>  3             5                9
#>  4             4                2
#>  5             2                3
#>  6            20                2
#>  7            21                4
#>  8            22                7
#>  9             6                4
#> 10            36                2
#> # ... with 102 more rows

# ^^ gets translated to:
# 
# SELECT  supervisor_id , COUNT(*) AS  underlings_count 
# FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  position_title ,  store_id ,  department_id ,  birth_date ,  hire_date ,  salary ,  supervisor_id ,  education_level ,  gender ,  management_role , CASE WHEN ( marital_status  = 'S') THEN ('Single') ELSE ('Married') END AS  marital_status 
#       FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  position_title ,  store_id ,  department_id ,  birth_date ,  hire_date ,  salary ,  supervisor_id ,  education_level ,  marital_status ,  management_role , CASE WHEN ( gender  = 'F') THEN ('Female') ELSE ('Male') END AS  gender 
#             FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  position_title ,  store_id ,  department_id ,  birth_date ,  hire_date ,  supervisor_id ,  education_level ,  marital_status ,  gender ,  management_role , CAST( salary  AS DOUBLE) AS  salary 
#                   FROM (SELECT  employee_id ,  full_name ,  first_name ,  last_name ,  position_id ,  store_id ,  department_id ,  birth_date ,  hire_date ,  salary ,  supervisor_id ,  education_level ,  marital_status ,  gender ,  management_role , LOWER( position_title ) AS  position_title 
#                         FROM  cp.`employee.json` )  cnjsqxeick )  bnbnjrubna )  wavfmhkczv )  zaxeyyicxo 
# GROUP BY  supervisor_id 
## # A tibble: 112 x 2
##    supervisor_id underlings_count
##  *         <int>            <int>
##  1             0                1
##  2             1                7
##  3             5                9
##  4             4                2
##  5             2                3
##  6            20                2
##  7            21                4
##  8            22                7
##  9             6                4
## 10            36                2
## # ... with 102 more rows
```

### Usage

``` r
library(sergeant)

# current verison
packageVersion("sergeant")
#> [1] '0.5.2'
```
### REST API

``` r
dc <- drill_connection("localhost") 
```

``` r
drill_active(dc)
#> [1] TRUE
## [1] TRUE

drill_version(dc)
#> [1] "1.11.0"
## [1] "1.13.0"

drill_storage(dc)$name
#> [1] "cp"    "dfs"   "hbase" "hive"  "kudu"  "mongo" "s3"
```

Working with the built-in JSON data sets:
## [1] "cp"    "dfs"   "hbase" "hive"  "kudu"  "mongo" "s3"

``` r
drill_query(dc, "SELECT * FROM cp.`employee.json` limit 100")
#> Parsed with column specification:
#> cols(
#>   store_id = col_integer(),
#>   gender = col_character(),
#>   department_id = col_integer(),
#>   birth_date = col_date(format = ""),
#>   supervisor_id = col_integer(),
#>   last_name = col_character(),
#>   position_title = col_character(),
#>   hire_date = col_datetime(format = ""),
#>   management_role = col_character(),
#>   salary = col_double(),
#>   marital_status = col_character(),
#>   full_name = col_character(),
#>   employee_id = col_integer(),
#>   education_level = col_character(),
#>   first_name = col_character(),
#>   position_id = col_integer()
#> )
#> # A tibble: 100 x 16
#>    store_id gender department_id birth_date supervisor_id last_name         position_title  hire_date   management_role
#>  *    <int>  <chr>         <int>     <date>         <int>     <chr>                  <chr>     <dttm>             <chr>
#>  1        0      F             1 1961-08-26             0    Nowmer              President 1994-12-01 Senior Management
#>  2        0      M             1 1915-07-03             1   Whelply     VP Country Manager 1994-12-01 Senior Management
#>  3        0      M             1 1969-06-20             1    Spence     VP Country Manager 1998-01-01 Senior Management
#>  4        0      F             1 1951-05-10             1 Gutierrez     VP Country Manager 1998-01-01 Senior Management
#>  5        0      F             2 1942-10-08             1   Damstra VP Information Systems 1994-12-01 Senior Management
#>  6        0      F             3 1949-03-27             1  Kanagaki     VP Human Resources 1994-12-01 Senior Management
#>  7        9      F            11 1922-08-10             5   Brunner          Store Manager 1998-01-01  Store Management
#>  8       21      F            11 1979-06-23             5  Blumberg          Store Manager 1998-01-01  Store Management
#>  9        0      M             5 1949-08-26             1     Stanz             VP Finance 1994-12-01 Senior Management
#> 10        1      M            11 1967-06-20             5  Murraiin          Store Manager 1998-01-01  Store Management
#> # ... with 90 more rows, and 7 more variables: salary <dbl>, marital_status <chr>, full_name <chr>, employee_id <int>,
#> #   education_level <chr>, first_name <chr>, position_id <int>
## Parsed with column specification:
## cols(
##   store_id = col_integer(),
##   gender = col_character(),
##   department_id = col_integer(),
##   birth_date = col_date(format = ""),
##   supervisor_id = col_integer(),
##   last_name = col_character(),
##   position_title = col_character(),
##   hire_date = col_datetime(format = ""),
##   management_role = col_character(),
##   salary = col_double(),
##   marital_status = col_character(),
##   full_name = col_character(),
##   employee_id = col_integer(),
##   education_level = col_character(),
##   first_name = col_character(),
##   position_id = col_integer()
## )
## # A tibble: 100 x 16
##    store_id gender department_id birth_date supervisor_id last_name position_title  hire_date           management_role
##  *    <int> <chr>          <int> <date>             <int> <chr>     <chr>           <dttm>              <chr>          
##  1        0 F                  1 1961-08-26             0 Nowmer    President       1994-12-01 00:00:00 Senior Managem…
##  2        0 M                  1 1915-07-03             1 Whelply   VP Country Man… 1994-12-01 00:00:00 Senior Managem…
##  3        0 M                  1 1969-06-20             1 Spence    VP Country Man… 1998-01-01 00:00:00 Senior Managem…
##  4        0 F                  1 1951-05-10             1 Gutierrez VP Country Man… 1998-01-01 00:00:00 Senior Managem…
##  5        0 F                  2 1942-10-08             1 Damstra   VP Information… 1994-12-01 00:00:00 Senior Managem…
##  6        0 F                  3 1949-03-27             1 Kanagaki  VP Human Resou… 1994-12-01 00:00:00 Senior Managem…
##  7        9 F                 11 1922-08-10             5 Brunner   Store Manager   1998-01-01 00:00:00 Store Manageme…
##  8       21 F                 11 1979-06-23             5 Blumberg  Store Manager   1998-01-01 00:00:00 Store Manageme…
##  9        0 M                  5 1949-08-26             1 Stanz     VP Finance      1994-12-01 00:00:00 Senior Managem…
## 10        1 M                 11 1967-06-20             5 Murraiin  Store Manager   1998-01-01 00:00:00 Store Manageme…
## # ... with 90 more rows, and 7 more variables: salary <dbl>, marital_status <chr>, full_name <chr>, employee_id <int>,
## #   education_level <chr>, first_name <chr>, position_id <int>

drill_query(dc, "SELECT COUNT(gender) AS gender FROM cp.`employee.json` GROUP BY gender")
#> Parsed with column specification:
#> cols(
#>   gender = col_integer()
#> )
#> # A tibble: 2 x 1
#>   gender
#> *  <int>
#> 1    601
#> 2    554
## Parsed with column specification:
## cols(
##   gender = col_integer()
## )
## # A tibble: 2 x 1
##   gender
## *  <int>
## 1    601
## 2    554

drill_options(dc)
#> # A tibble: 124 x 4
#>                                              name value   type    kind
#>  *                                          <chr> <chr>  <chr>   <chr>
#>  1                 planner.enable_hash_single_key  TRUE SYSTEM BOOLEAN
#>  2      store.parquet.reader.pagereader.queuesize     2 SYSTEM    LONG
#>  3             planner.enable_limit0_optimization FALSE SYSTEM BOOLEAN
#>  4              store.json.read_numbers_as_double FALSE SYSTEM BOOLEAN
#>  5                planner.enable_constant_folding  TRUE SYSTEM BOOLEAN
#>  6                      store.json.extended_types FALSE SYSTEM BOOLEAN
#>  7   planner.memory.non_blocking_operators_memory    64 SYSTEM    LONG
#>  8                  planner.enable_multiphase_agg  TRUE SYSTEM BOOLEAN
#>  9                  exec.query_profile.debug_mode FALSE SYSTEM BOOLEAN
#> 10 planner.filter.max_selectivity_estimate_factor     1 SYSTEM  DOUBLE
#> # ... with 114 more rows
## # A tibble: 138 x 5
##    name                                              value    accessibleScopes kind    optionScope
##  * <chr>                                             <chr>    <chr>            <chr>   <chr>      
##  1 debug.validate_iterators                          FALSE    ALL              BOOLEAN BOOT       
##  2 debug.validate_vectors                            FALSE    ALL              BOOLEAN BOOT       
##  3 drill.exec.functions.cast_empty_string_to_null    FALSE    ALL              BOOLEAN BOOT       
##  4 drill.exec.hashagg.fallback.enabled               FALSE    ALL              BOOLEAN BOOT       
##  5 drill.exec.memory.operator.output_batch_size      16777216 SYSTEM           LONG    BOOT       
##  6 drill.exec.storage.file.partition.column.label    dir      ALL              STRING  BOOT       
##  7 drill.exec.storage.implicit.filename.column.label filename ALL              STRING  BOOT       
##  8 drill.exec.storage.implicit.filepath.column.label filepath ALL              STRING  BOOT       
##  9 drill.exec.storage.implicit.fqn.column.label      fqn      ALL              STRING  BOOT       
## 10 drill.exec.storage.implicit.suffix.column.label   suffix   ALL              STRING  BOOT       
## # ... with 128 more rows

drill_options(dc, "json")
#> # A tibble: 7 x 4
#>                                                    name value   type    kind
#>                                                   <chr> <chr>  <chr>   <chr>
#> 1                     store.json.read_numbers_as_double FALSE SYSTEM BOOLEAN
#> 2                             store.json.extended_types FALSE SYSTEM BOOLEAN
#> 3                              store.json.writer.uglify FALSE SYSTEM BOOLEAN
#> 4                store.json.reader.skip_invalid_records FALSE SYSTEM BOOLEAN
#> 5 store.json.reader.print_skipped_invalid_record_number FALSE SYSTEM BOOLEAN
#> 6                              store.json.all_text_mode FALSE SYSTEM BOOLEAN
#> 7                    store.json.writer.skip_null_fields  TRUE SYSTEM BOOLEAN
## # A tibble: 9 x 5
##   name                                                  value accessibleScopes kind    optionScope
##   <chr>                                                 <chr> <chr>            <chr>   <chr>      
## 1 store.json.all_text_mode                              FALSE ALL              BOOLEAN BOOT       
## 2 store.json.extended_types                             FALSE ALL              BOOLEAN BOOT       
## 3 store.json.read_numbers_as_double                     FALSE ALL              BOOLEAN BOOT       
## 4 store.json.reader.allow_nan_inf                       TRUE  ALL              BOOLEAN BOOT       
## 5 store.json.reader.print_skipped_invalid_record_number FALSE ALL              BOOLEAN BOOT       
## 6 store.json.reader.skip_invalid_records                FALSE ALL              BOOLEAN BOOT       
## 7 store.json.writer.allow_nan_inf                       TRUE  ALL              BOOLEAN BOOT       
## 8 store.json.writer.skip_null_fields                    TRUE  ALL              BOOLEAN BOOT       
## 9 store.json.writer.uglify                              FALSE ALL              BOOLEAN BOOT
```

Working with parquet files
--------------------------
## Working with parquet files

``` r
drill_query(dc, "SELECT * FROM dfs.`/usr/local/drill/sample-data/nation.parquet` LIMIT 5")
#> Parsed with column specification:
#> cols(
#>   N_COMMENT = col_character(),
#>   N_NAME = col_character(),
#>   N_NATIONKEY = col_integer(),
#>   N_REGIONKEY = col_integer()
#> )
#> # A tibble: 5 x 4
#>              N_COMMENT    N_NAME N_NATIONKEY N_REGIONKEY
#> *                <chr>     <chr>       <int>       <int>
#> 1  haggle. carefully f   ALGERIA           0           0
#> 2 al foxes promise sly ARGENTINA           1           1
#> 3 y alongside of the p    BRAZIL           2           1
#> 4 eas hang ironic, sil    CANADA           3           1
#> 5 y above the carefull     EGYPT           4           4
## Parsed with column specification:
## cols(
##   N_COMMENT = col_character(),
##   N_NAME = col_character(),
##   N_NATIONKEY = col_integer(),
##   N_REGIONKEY = col_integer()
## )
## # A tibble: 5 x 4
##   N_COMMENT            N_NAME    N_NATIONKEY N_REGIONKEY
## * <chr>                <chr>           <int>       <int>
## 1 haggle. carefully f  ALGERIA             0           0
## 2 al foxes promise sly ARGENTINA           1           1
## 3 y alongside of the p BRAZIL              2           1
## 4 eas hang ironic, sil CANADA              3           1
## 5 y above the carefull EGYPT               4           4
```

Including multiple parquet files in different directories (note the wildcard support):
Including multiple parquet files in different directories (note the
wildcard
support):

``` r
drill_query(dc, "SELECT * FROM dfs.`/usr/local/drill/sample-data/nations*/nations*.parquet` LIMIT 5")
#> Parsed with column specification:
#> cols(
#>   N_COMMENT = col_character(),
#>   N_NAME = col_character(),
#>   N_NATIONKEY = col_integer(),
#>   N_REGIONKEY = col_integer(),
#>   dir0 = col_character()
#> )
#> # A tibble: 5 x 5
#>              N_COMMENT    N_NAME N_NATIONKEY N_REGIONKEY      dir0
#> *                <chr>     <chr>       <int>       <int>     <chr>
#> 1  haggle. carefully f   ALGERIA           0           0 nationsSF
#> 2 al foxes promise sly ARGENTINA           1           1 nationsSF
#> 3 y alongside of the p    BRAZIL           2           1 nationsSF
#> 4 eas hang ironic, sil    CANADA           3           1 nationsSF
#> 5 y above the carefull     EGYPT           4           4 nationsSF
## Parsed with column specification:
## cols(
##   N_COMMENT = col_character(),
##   N_NAME = col_character(),
##   N_NATIONKEY = col_integer(),
##   dir0 = col_character(),
##   N_REGIONKEY = col_integer()
## )
## # A tibble: 5 x 5
##   N_COMMENT            N_NAME    N_NATIONKEY dir0      N_REGIONKEY
## * <chr>                <chr>           <int> <chr>           <int>
## 1 haggle. carefully f  ALGERIA             0 nationsSF           0
## 2 al foxes promise sly ARGENTINA           1 nationsSF           1
## 3 y alongside of the p BRAZIL              2 nationsSF           1
## 4 eas hang ironic, sil CANADA              3 nationsSF           1
## 5 y above the carefull EGYPT               4 nationsSF           4
```

### A preview of the built-in support for spatial ops
### Drill has built-in support for spatial ops

Via: <https://github.com/k255/drill-gis>



@@ 402,78 376,22 @@ select columns[2] as city, columns[4] as lon, columns[3] as lat
                )
            )
")
#> Parsed with column specification:
#> cols(
#>   city = col_character(),
#>   lon = col_double(),
#>   lat = col_double()
#> )
#> # A tibble: 7 x 3
#>          city       lon      lat
#> *       <chr>     <dbl>    <dbl>
#> 1     Burbank -121.9316 37.32328
#> 2    San Jose -121.8950 37.33939
#> 3        Lick -121.8458 37.28716
#> 4 Willow Glen -121.8897 37.30855
#> 5 Buena Vista -121.9166 37.32133
#> 6    Parkmoor -121.9308 37.32105
#> 7   Fruitdale -121.9327 37.31086
```

### JDBC

``` r
library(RJDBC)
#> Loading required package: rJava

# Use this if connecting to a cluster with zookeeper
# con <- drill_jdbc("drill-node:2181", "drillbits1") 

# Use the following if running drill-embedded
```

``` r
con <- drill_jdbc("localhost:31010", use_zk=FALSE)
```

    #> Using [jdbc:drill:drillbit=bigd:31010]...

``` r
drill_query(con, "SELECT * FROM cp.`employee.json`")
#> # A tibble: 1,155 x 16
#>    employee_id         full_name first_name last_name position_id         position_title store_id department_id
#>  *       <dbl>             <chr>      <chr>     <chr>       <dbl>                  <chr>    <dbl>         <dbl>
#>  1           1      Sheri Nowmer      Sheri    Nowmer           1              President        0             1
#>  2           2   Derrick Whelply    Derrick   Whelply           2     VP Country Manager        0             1
#>  3           4    Michael Spence    Michael    Spence           2     VP Country Manager        0             1
#>  4           5    Maya Gutierrez       Maya Gutierrez           2     VP Country Manager        0             1
#>  5           6   Roberta Damstra    Roberta   Damstra           3 VP Information Systems        0             2
#>  6           7  Rebecca Kanagaki    Rebecca  Kanagaki           4     VP Human Resources        0             3
#>  7           8       Kim Brunner        Kim   Brunner          11          Store Manager        9            11
#>  8           9   Brenda Blumberg     Brenda  Blumberg          11          Store Manager       21            11
#>  9          10      Darren Stanz     Darren     Stanz           5             VP Finance        0             5
#> 10          11 Jonathan Murraiin   Jonathan  Murraiin          11          Store Manager        1            11
#> # ... with 1,145 more rows, and 8 more variables: birth_date <chr>, hire_date <chr>, salary <dbl>, supervisor_id <dbl>,
#> #   education_level <chr>, marital_status <chr>, gender <chr>, management_role <chr>

# but it can work via JDBC function calls, too
dbGetQuery(con, "SELECT * FROM cp.`employee.json`") %>% 
  tibble::as_tibble()
#> # A tibble: 1,155 x 16
#>    employee_id         full_name first_name last_name position_id         position_title store_id department_id
#>  *       <dbl>             <chr>      <chr>     <chr>       <dbl>                  <chr>    <dbl>         <dbl>
#>  1           1      Sheri Nowmer      Sheri    Nowmer           1              President        0             1
#>  2           2   Derrick Whelply    Derrick   Whelply           2     VP Country Manager        0             1
#>  3           4    Michael Spence    Michael    Spence           2     VP Country Manager        0             1
#>  4           5    Maya Gutierrez       Maya Gutierrez           2     VP Country Manager        0             1
#>  5           6   Roberta Damstra    Roberta   Damstra           3 VP Information Systems        0             2
#>  6           7  Rebecca Kanagaki    Rebecca  Kanagaki           4     VP Human Resources        0             3
#>  7           8       Kim Brunner        Kim   Brunner          11          Store Manager        9            11
#>  8           9   Brenda Blumberg     Brenda  Blumberg          11          Store Manager       21            11
#>  9          10      Darren Stanz     Darren     Stanz           5             VP Finance        0             5
#> 10          11 Jonathan Murraiin   Jonathan  Murraiin          11          Store Manager        1            11
#> # ... with 1,145 more rows, and 8 more variables: birth_date <chr>, hire_date <chr>, salary <dbl>, supervisor_id <dbl>,
#> #   education_level <chr>, marital_status <chr>, gender <chr>, management_role <chr>
## Parsed with column specification:
## cols(
##   city = col_character(),
##   lon = col_double(),
##   lat = col_double()
## )
## # A tibble: 7 x 3
##   city          lon   lat
## * <chr>       <dbl> <dbl>
## 1 Burbank     -122.  37.3
## 2 San Jose    -122.  37.3
## 3 Lick        -122.  37.3
## 4 Willow Glen -122.  37.3
## 5 Buena Vista -122.  37.3
## 6 Parkmoor    -122.  37.3
## 7 Fruitdale   -122.  37.3
```

### Test Results


@@ 481,24 399,66 @@ dbGetQuery(con, "SELECT * FROM cp.`employee.json`") %>%
``` r
library(sergeant)
library(testthat)
#> 
#> Attaching package: 'testthat'
#> The following object is masked from 'package:dplyr':
#> 
#>     matches
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null

date()
#> [1] "Sun Sep 17 13:31:23 2017"
## [1] "Sun Oct 14 08:27:29 2018"

devtools::test()
#> Loading sergeant
#> Testing sergeant
#> dplyr: ...
#> rest: ................
#> 
#> DONE ===================================================================================================================
## Loading sergeant
## Testing sergeant
## ✔ | OK F W S | Context
## 
⠏ |  0       | dplyr API
⠋ |  1       | dplyr API
⠙ |  2       | dplyr API
⠹ |  3       | dplyr API
✔ |  3       | dplyr API [0.3 s]
## 
⠏ |  0       | REST API
⠋ |  1       | REST API
⠙ |  2       | REST API
⠹ |  3       | REST API
⠸ |  4       | REST API
⠼ |  5       | REST API
⠴ |  6       | REST API
⠦ |  7       | REST API
⠧ |  8       | REST API
⠇ |  9       | REST API
⠏ | 10       | REST API
⠋ | 11       | REST API
⠙ | 12       | REST API
⠹ | 13       | REST API
⠸ | 14       | REST API
⠼ | 15       | REST API
⠴ | 16       | REST API
✔ | 16       | REST API [2.2 s]
## 
## ══ Results ═══════════════════════════════════════════════════
## Duration: 2.5 s
## 
## OK:       19
## Failed:   0
## Warnings: 0
## Skipped:  0
```

### Code of Conduct
## sergeant Metrics

| Lang | \# Files |  (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :--- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R    |       12 | 0.92 | 625 | 0.92 |         173 | 0.75 |      562 | 0.87 |
| Rmd  |        1 | 0.08 |  55 | 0.08 |          58 | 0.25 |       86 | 0.13 |

## Code of Conduct

Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
Please note that this project is released with a [Contributor Code of
Conduct](CONDUCT.md). By participating in this project you agree to
abide by its terms.

D inst/java/slf4j-nop-1.7.25.jar => inst/java/slf4j-nop-1.7.25.jar +0 -0
M man/DrillConnection-class.Rd => man/DrillConnection-class.Rd +2 -2
@@ 3,13 3,13 @@
\docType{class}
\name{DrillConnection-class}
\alias{DrillConnection-class}
\alias{dbSendQuery,DrillConnection,ANY-method}
\alias{dbSendQuery,DrillConnection-method}
\alias{dbSendQuery,DrillConnection,character-method}
\alias{dbIsValid,DrillConnection-method}
\alias{dbListFields,DrillConnection,character-method}
\title{Drill connection class.}
\usage{
\S4method{dbSendQuery}{DrillConnection,ANY}(conn, statement, ...)
\S4method{dbSendQuery}{DrillConnection}(conn, statement, ...)

\S4method{dbIsValid}{DrillConnection}(dbObj, ...)


D man/DrillJDBC.Rd => man/DrillJDBC.Rd +0 -18
@@ 1,18 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\name{DrillJDBC}
\alias{DrillJDBC}
\title{Drill's JDBC driver main class loader}
\usage{
DrillJDBC()
}
\description{
Drill's JDBC driver main class loader
}
\seealso{
Other Drill JDBC API: \code{\link{dbConnect,DrillJDBCDriver-method}},
  \code{\link{dbDataType,DrillJDBCConnection-method}},
  \code{\link{db_data_type.DrillJDBCConnection}},
  \code{\link{drill_jdbc}}
}
\concept{Drill JDBC API}

D man/DrillJDBCConnection-class.Rd => man/DrillJDBCConnection-class.Rd +0 -10
@@ 1,10 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\docType{class}
\name{DrillJDBCConnection-class}
\alias{DrillJDBCConnection-class}
\title{Drill JDBC connection class.}
\description{
Drill JDBC connection class.
}
\keyword{internal}

D man/DrillJDBCDriver-class.Rd => man/DrillJDBCDriver-class.Rd +0 -10
@@ 1,10 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\docType{class}
\name{DrillJDBCDriver-class}
\alias{DrillJDBCDriver-class}
\title{JDBC Driver for Drill database.}
\description{
JDBC Driver for Drill database.
}
\keyword{internal}

D man/dbConnect-DrillJDBCDriver-method.Rd => man/dbConnect-DrillJDBCDriver-method.Rd +0 -31
@@ 1,31 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\docType{methods}
\name{dbConnect,DrillJDBCDriver-method}
\alias{dbConnect,DrillJDBCDriver-method}
\title{Connect to Drill JDBC with your own connection string}
\usage{
\S4method{dbConnect}{DrillJDBCDriver}(drv, url, user = "", password = "",
  ...)
}
\arguments{
\item{drv}{what you get back from \code{\link[=DrillJDBC]{DrillJDBC()}}}

\item{url}{your Drill connection strinfg}

\item{user, password}{username & password (leave as-is for no-auth)}

\item{...}{additional \code{name=val} properties which will be set with Java's
\code{SetProperty} method.}
}
\description{
You should really use \code{\link[=drill_jdbc]{drill_jdbc()}} as it handles some cruft for
you, but you can specify the full JDBC connection string
}
\seealso{
Other Drill JDBC API: \code{\link{DrillJDBC}},
  \code{\link{dbDataType,DrillJDBCConnection-method}},
  \code{\link{db_data_type.DrillJDBCConnection}},
  \code{\link{drill_jdbc}}
}
\concept{Drill JDBC API}

D man/dbDataType-DrillJDBCConnection-method.Rd => man/dbDataType-DrillJDBCConnection-method.Rd +0 -26
@@ 1,26 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\docType{methods}
\name{dbDataType,DrillJDBCConnection-method}
\alias{dbDataType,DrillJDBCConnection-method}
\title{Drill JDBC dbDataType}
\usage{
\S4method{dbDataType}{DrillJDBCConnection}(dbObj, obj, ...)
}
\arguments{
\item{dbObj}{A \code{\linkS4class{DrillJDBCDriver}} object}

\item{obj}{Any R object}

\item{...}{Extra optional parameters}
}
\description{
Drill JDBC dbDataType
}
\seealso{
Other Drill JDBC API: \code{\link{DrillJDBC}},
  \code{\link{dbConnect,DrillJDBCDriver-method}},
  \code{\link{db_data_type.DrillJDBCConnection}},
  \code{\link{drill_jdbc}}
}
\concept{Drill JDBC API}

M man/drill_active.Rd => man/drill_active.Rd +2 -2
@@ 13,9 13,9 @@ drill_active(drill_con)
This is a very simple test (performs \code{HEAD /} request on the Drill server/cluster)
}
\examples{
\dontrun{
try({
drill_connection() \%>\% drill_active()
}
}, silent=TRUE)
}
\seealso{
Other Dill direct REST API Interface: \code{\link{drill_cancel}},

D man/drill_jdbc.Rd => man/drill_jdbc.Rd +0 -79
@@ 1,79 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\name{drill_jdbc}
\alias{drill_jdbc}
\alias{src_drill_jdbc}
\alias{tbl.src_drill_jdbc}
\title{Connect to Drill using JDBC}
\usage{
drill_jdbc(nodes = "localhost:2181", cluster_id = NULL, schema = NULL,
  use_zk = TRUE)

src_drill_jdbc(nodes = "localhost:2181", cluster_id = NULL, schema = NULL,
  use_zk = TRUE)

\method{tbl}{src_drill_jdbc}(src, from, ...)
}
\arguments{
\item{nodes}{character vector of nodes. If more than one node, you can either have
a single string with the comma-separated node:port pairs pre-made or
pass in a character vector with multiple node:port strings and the
function will make a comma-separated node string for you.}

\item{cluster_id}{the cluster id from \code{drill-override.conf}}

\item{schema}{an optional schema name to append to the JDBC connection string}

\item{use_zk}{are you connecting to a ZooKeeper instance (default: \code{TRUE}) or
connecting to an individual DrillBit.}

\item{src}{A Drill "src" created with \code{src_drill()}}

\item{from}{A Drill view or table specification}

\item{...}{Extra parameters}
}
\value{
a JDBC connection object
}
\description{
The DRILL JDBC driver fully-qualified path must be placed in the
\code{DRILL_JDBC_JAR} environment variable. This is best done via \code{~/.Renviron}
for interactive work. e.g. \code{DRILL_JDBC_JAR=/usr/local/drill/jars/jdbc-driver/drill-jdbc-all-1.10.0.jar}
}
\details{
[src_drill_jdbc()] wraps the JDBC [dbConnect()] connection instantation in
[dbplyr::src_dbi()] to return the equivalent of the REST driver's [src_drill()].
}
\examples{
\dontrun{
con <- drill_jdbc("localhost:2181", "main")
drill_query(con, "SELECT * FROM cp.`employee.json`")

# you can also use the connection with RJDBC calls:
dbGetQuery(con, "SELECT * FROM cp.`employee.json`")

# for local/embedded mode with default configuration info
con <- drill_jdbc("localhost:31010", use_zk=FALSE)
}
}
\references{
\url{https://drill.apache.org/docs/using-the-jdbc-driver/#using-the-jdbc-url-for-a-random-drillbit-connection}
}
\seealso{
Other Drill JDBC API: \code{\link{DrillJDBC}},
  \code{\link{dbConnect,DrillJDBCDriver-method}},
  \code{\link{dbDataType,DrillJDBCConnection-method}},
  \code{\link{db_data_type.DrillJDBCConnection}}

Other Drill JDBC API: \code{\link{DrillJDBC}},
  \code{\link{dbConnect,DrillJDBCDriver-method}},
  \code{\link{dbDataType,DrillJDBCConnection-method}},
  \code{\link{db_data_type.DrillJDBCConnection}}

Other Drill JDBC API: \code{\link{DrillJDBC}},
  \code{\link{dbConnect,DrillJDBCDriver-method}},
  \code{\link{dbDataType,DrillJDBCConnection-method}},
  \code{\link{db_data_type.DrillJDBCConnection}}
}
\concept{Drill JDBC API}

D man/drill_jdbc_internals.Rd => man/drill_jdbc_internals.Rd +0 -40
@@ 1,40 0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/jdbc.r
\docType{class}
\name{db_data_type.DrillJDBCConnection}
\alias{db_data_type.DrillJDBCConnection}
\alias{db_data_type.tbl_drill_jdbc}
\alias{DrillJDBCResult-class}
\alias{dbSendQuery,DrillJDBCConnection,character-method}
\alias{sql_escape_ident.DrillJDBCConnection}
\alias{sql_translate_env.DrillJDBCConnection}
\alias{src_tbls.src_dbi}
\title{Drill internals}
\usage{
\method{db_data_type}{DrillJDBCConnection}(con, fields, ...)

\method{db_data_type}{tbl_drill_jdbc}(con, fields, ...)

\S4method{dbSendQuery}{DrillJDBCConnection,character}(conn, statement, ...,
  list = NULL)

\method{sql_escape_ident}{DrillJDBCConnection}(con, x)

\method{sql_translate_env}{DrillJDBCConnection}(con)

\method{src_tbls}{src_dbi}(x)
}
\arguments{
\item{x}{x}
}
\description{
"SHOW DATABASES"
}
\seealso{
Other Drill JDBC API: \code{\link{DrillJDBC}},
  \code{\link{dbConnect,DrillJDBCDriver-method}},
  \code{\link{dbDataType,DrillJDBCConnection-method}},
  \code{\link{drill_jdbc}}
}
\concept{Drill JDBC API}
\keyword{internal}

M man/drill_query.Rd => man/drill_query.Rd +2 -2
@@ 27,10 27,10 @@ default printing (which can be helpful if you accidentally execute a query and t
set is huge).
}
\examples{
\dontrun{
try({
drill_connection() \%>\%
  drill_query("SELECT * FROM cp.`employee.json` limit 5")
}
}, silent=TRUE)
}
\references{
\href{https://drill.apache.org/docs/}{Drill documentation}

M man/drill_show_files.Rd => man/drill_show_files.Rd +2 -2
@@ 15,9 15,9 @@ drill_show_files(drill_con, schema_spec)
Show files in a file system schema.
}
\examples{
\dontrun{
try({
drill_connection() \%>\% drill_show_files("dfs.tmp")
}
}, silent=TRUE)
}
\references{
\href{https://drill.apache.org/docs/}{Drill documentation}

M man/sergeant-exports.Rd => man/sergeant-exports.Rd +1 -0
@@ 3,6 3,7 @@
\name{sergeant-exports}
\alias{sergeant-exports}
\alias{\%>\%}
\alias{tbl}
\title{sergeant exported operators}
\description{
The following functions are imported and then re-exported

M man/sergeant.Rd => man/sergeant.Rd +1 -1
@@ 30,7 30,7 @@ Drill can maximize data locality during query execution without moving data over
network or between nodes. Drill uses ZooKeeper to maintain cluster membership and health
check information.

Methods are provided to work with Drill via the native JDBC & REST APIs along with R
Methods are provided to work with Drill via the REST APIs along with R
\code{DBI} and \code{dplyr} interfaces.
}
\references{

M man/src_drill.Rd => man/src_drill.Rd +2 -2
@@ 36,7 36,7 @@ SQL functions that need to be implemented please file an issue on GitHub.
This is a DBI wrapper around the Drill REST API.
}
\examples{
\dontrun{
try({
db <- src_drill("localhost", 8047L)

print(db)


@@ 80,7 80,7 @@ select(emp, full_name) \%>\%
##  9     2      Darren Stanz    12 Darren Stanz********     5      D*rr*n St*nz
## 10     4 Jonathan Murraiin    17 Jonathan Murraiin***     0 J*n*th*n M*rr***n
## # ... with more rows, and 3 more variables: rpd <chr>, rnd <dbl>, first_three <chr>
}
}, silent=TRUE)
}
\seealso{
Other Drill REST `dplyr` API: \code{\link{drill_custom_functions}},

M tests/testthat/test-sergeant.R => tests/testthat/test-sergeant.R +2 -16
@@ 1,6 1,6 @@
test_host <- Sys.getenv("DRILL_TEST_HOST", "localhost")

context("dplyr")
context("dplyr API")
test_that("Core dbplyr ops work", {

 testthat::skip_on_cran()


@@ 16,7 16,7 @@ test_that("Core dbplyr ops work", {

})

context("rest")
context("REST API")
test_that("REST API works", {

 testthat::skip_on_cran()


@@ 51,17 51,3 @@ test_that("REST API works", {


})

# context("jdbc")
# test_that("we can do something", {
#
#  testthat::skip_on_cran()
#
#   dc <- drill_jdbc("localhost:31010", use_zk=FALSE)
#
#   expect_that(dc, is_a("JDBCConnection"))
#
#   expect_that(drill_query(dc, "SELECT * FROM cp.`employee.json`"),
#               is_a("tbl"))
#
# })