~hrbrmstr/metis-tidy

ref: 0dc1c46b446391e7b16d8873cf43164c0a6ab1a6 metis-tidy/README.md -rw-r--r-- 5.8 KiB View raw
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# metis.tidy

Access and Query Amazon Athena via the Tidyverse

## Description

Methods are provided to use the ‘metis’ JDBC/DBI interface via the
Tidyverse (e.g. ‘dbplyr’/‘dplyr’ idioms).

## What’s Inside The Tin?

Lightweight helpers to make it easier to `filter` and `mutate` plus type
support for Athena `BIGINT` (64-bit integers).

## Installation

``` r
devtools::install_git("https://git.sr.ht/~hrbrmstr/metis-tidy")
# OR
devtools::install_gitlab("hrbrmstr/metis-tidy")
# OR
devtools::install_github("hrbrmstr/metis-tidy")
```

## Usage

``` r
library(metis.tidy)

# current verison
packageVersion("metis.tidy")
```

    ## [1] '0.3.0'

### Basic Setup (using an alternate provider)

``` r
library(metis.tidy)
library(tidyverse)

metis::dbConnect(
  metis::Athena(),
  Schema = "sampledb",
  AwsCredentialsProviderClass = "com.simba.athena.amazonaws.auth.PropertiesFileCredentialsProvider",
  AwsCredentialsProviderArguments = path.expand("~/.aws/athenaCredentials.props")
) -> con

elb_logs <- tbl(con, "elb_logs")

glimpse(elb_logs)
```

    ## Observations: ??
    ## Variables: 16
    ## Database: AthenaConnection
    ## $ timestamp             <chr> "2014-09-29T03:00:52.641389Z", "2014-09-29T03:01:23.603288Z", "2014-09-29T03:01:54.6438…
    ## $ elbname               <chr> "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo",…
    ## $ requestip             <chr> "252.225.51.44", "252.43.130.244", "248.52.123.119", "246.118.11.120", "240.201.80.176"…
    ## $ requestport           <int> 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 414…
    ## $ backendip             <chr> "246.185.32.192", "246.5.160.209", "252.12.154.122", "246.191.203.13", "247.27.176.211"…
    ## $ backendport           <int> 8888, 8899, 8000, 8888, 8888, 8888, 8888, 8888, 8000, 8000, 8888, 8888, 8888, 8899, 888…
    ## $ requestprocessingtime <dbl> 0.000095, 0.000090, 0.000087, 0.000089, 0.000090, 0.000093, 0.000092, 0.000094, 0.00010…
    ## $ backendprocessingtime <dbl> 0.035755, 0.048942, 0.050951, 0.046141, 0.039483, 0.052850, 0.032934, 0.046127, 0.04017…
    ## $ clientresponsetime    <dbl> 5.8e-05, 5.5e-05, 5.0e-05, 4.9e-05, 4.8e-05, 5.4e-05, 5.1e-05, 4.8e-05, 4.8e-05, 4.9e-0…
    ## $ elbresponsecode       <chr> "200", "200", "200", "200", "200", "200", "200", "200", "200", "200", "200", "200", "20…
    ## $ backendresponsecode   <chr> "200", "400", "200", "200", "200", "200", "404", "200", "200", "403", "404", "200", "20…
    ## $ receivedbytes         <S3: integer64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
    ## $ sentbytes             <S3: integer64> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
    ## $ requestverb           <chr> "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GE…
    ## $ url                   <chr> "http://www.abcxyz.com:80/jobbrowser/?format=json&state=running&user=248nnm5", "http://…
    ## $ protocol              <chr> "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HT…

#### Using custom Athena functions

``` r
filter(elb_logs, elbresponsecode == "200") %>% 
  mutate(
    tsday = as.Date(substring(timestamp, 1L, 10L)),
    host = url_extract_host(url),
    proto_version = regexp_extract(protocol, "([[:digit:]\\.]+)"),
  ) %>% 
  select(tsday, host, receivedbytes, requestprocessingtime, proto_version) %>% 
  glimpse()
```

    ## Observations: ??
    ## Variables: 5
    ## Database: AthenaConnection
    ## $ tsday                 <date> 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 20…
    ## $ host                  <chr> "www.abcxyz.com", "www.abcxyz.com", "www.abcxyz.com", "www.abcxyz.com", "www.abcxyz.com…
    ## $ receivedbytes         <S3: integer64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
    ## $ requestprocessingtime <dbl> 0.000074, 0.000102, 0.000056, 0.000048, 0.000047, 0.000087, 0.000095, 0.000049, 0.00005…
    ## $ proto_version         <chr> "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.…

#### All the types work. Some are useful.

``` r
tbl(con, sql("
SELECT
  CAST('chr' AS CHAR(4)) achar,
  CAST('varchr' AS VARCHAR) avarchr,
  CAST(SUBSTR(timestamp, 1, 10) AS DATE) AS tsday,
  CAST(100.1 AS DOUBLE) AS justadbl,
  CAST(127 AS TINYINT) AS asmallint,
  CAST(100 AS INTEGER) AS justanint,
  CAST(100000000000000000 AS BIGINT) AS abigint,
  CAST(('GET' = 'GET') AS BOOLEAN) AS is_get,
  ARRAY[1, 2, 3] AS arr,
  ARRAY['1', '2, 3', '4'] AS arr,
  MAP(ARRAY['foo', 'bar'], ARRAY[1, 2]) AS mp,
  CAST(ROW(1, 2.0) AS ROW(x BIGINT, y DOUBLE)) AS rw,
  CAST('{\"a\":1}' AS JSON) js
FROM elb_logs
LIMIT 1
")) %>% 
  glimpse()
```

    ## Observations: ??
    ## Variables: 13
    ## Database: AthenaConnection
    ## $ achar     <chr> "chr "
    ## $ avarchr   <chr> "varchr"
    ## $ tsday     <date> 2014-09-26
    ## $ justadbl  <dbl> 100.1
    ## $ asmallint <int> 127
    ## $ justanint <int> 100
    ## $ abigint   <S3: integer64> 100000000000000000
    ## $ is_get    <lgl> TRUE
    ## $ arr       <chr> "1, 2, 3"
    ## $ arr       <chr> "1, 2, 3, 4"
    ## $ mp        <chr> "{bar=2, foo=1}"
    ## $ rw        <chr> "{x=1, y=2.0}"
    ## $ js        <chr> "\"{\\\"a\\\":1}\""

``` r
cloc::cloc_pkg_md()
```

| Lang | \# Files |  (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :--- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R    |        6 | 0.86 | 156 | 0.75 |          18 | 0.42 |       26 | 0.41 |
| Rmd  |        1 | 0.14 |  53 | 0.25 |          25 | 0.58 |       38 | 0.59 |

## Code of Conduct

Please note that this project is released with a [Contributor Code of
Conduct](CONDUCT.md). By participating in this project you agree to
abide by its terms.