~hrbrmstr/htmlunit

a5e22b9d5ff4d1e9049bd1c9fc1836eb1714f728 — hrbrmstr 3 years ago 9bfb41f
tinytest / 2.40.0 jars
M DESCRIPTION => DESCRIPTION +6 -7
@@ 1,8 1,8 @@
Package: htmlunit
Type: Package
Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library
Version: 0.3.2
Date: 2020-04-09
Version: 0.4.0
Date: 2020-05-09
Authors@R: c(
    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 
           comment = c(ORCID = "0000-0001-5670-2640")),


@@ 25,13 25,12 @@ Encoding: UTF-8
License: Apache License 2.0 | file LICENSE
Imports: 
    magrittr
Suggests:
    testthat,
    covr
Depends:
Suggests: 
    covr, tinytest
Depends: 
    R (>= 3.2.0),
    rJava,
    htmlunitjars (>= 2.39.0),
    htmlunitjars (>= 2.40.0),
    rvest,
    xml2
Roxygen: list(markdown = TRUE)

M NEWS.md => NEWS.md +4 -0
@@ 1,3 1,7 @@
0.4.0
* Switched to {tinytest}
* Updated for 2.40.0 jars

0.3.0
* java 11 compile
* tested against new htmlunit jar release

M README.md => README.md +31 -31
@@ 132,7 132,7 @@ library(tidyverse) # for some data ops; not req'd for pkg

# current version
packageVersion("htmlunit")
## [1] '0.3.2'
## [1] '0.4.0'
```

Something `xml2::read_html()` cannot do, read the table from


@@ 178,20 178,20 @@ colnames(xdf)
## [7] "content_type"   "load_time"      "headers"

select(xdf, method, url, status_code, content_length, load_time)
## # A tibble: 45 x 5
## # A tibble: 59 x 5
##    method url                                                                       status_code content_length load_time
##    <chr>  <chr>                                                                           <int>          <dbl>     <dbl>
##  1 GET    https://rstudio.com/                                                              200          12292       701
##  2 GET    https://dev.visualwebsiteoptimizer.com/j.php?a=450622&u=https%3A%2F%2Frs…         200           2498       349
##  3 GET    https://dev.visualwebsiteoptimizer.com/6.0/va-268e5d055e3477f16578a91cda…         200          55711        91
##  4 GET    https://use.fontawesome.com/releases/v5.0.6/css/all.css                           200           8699       427
##  5 GET    https://d33wubrfki0l68.cloudfront.net/bundles/c5ddb3e999592179708beea702…         200          53046       599
##  6 GET    https://cdn.rawgit.com/noelboss/featherlight/1.7.13/release/featherlight…         200            763       402
##  7 GET    https://d33wubrfki0l68.cloudfront.net/css/4a0f49009a213e6e2207c6f66893f0…         200            505        80
##  8 GET    https://gitcdn.github.io/bootstrap-toggle/2.2.2/css/bootstrap-toggle.min…         200            548       346
##  9 GET    https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-aweso…         200           6663       292
## 10 GET    https://snap.licdn.com/li.lms-analytics/insight.min.js                            200           1576       483
## # … with 35 more rows
##  1 GET    https://rstudio.com/                                                              200          13531       625
##  2 GET    https://use.fontawesome.com/releases/v5.0.6/css/all.css                           200           8699       376
##  3 GET    https://d33wubrfki0l68.cloudfront.net/bundles/c5ddb3e999592179708beea702…         200          53046       563
##  4 GET    https://cdn.rawgit.com/noelboss/featherlight/1.7.13/release/featherlight…         200            763       376
##  5 GET    https://d33wubrfki0l68.cloudfront.net/css/4a0f49009a213e6e2207c6f66893f0…         200            505        73
##  6 GET    https://gitcdn.github.io/bootstrap-toggle/2.2.2/css/bootstrap-toggle.min…         200            548       258
##  7 GET    https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-aweso…         200           6663       247
##  8 GET    https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js         200           3876       364
##  9 GET    https://snap.licdn.com/li.lms-analytics/insight.min.js                            200           1576       455
## 10 GET    https://connect.facebook.net/en_US/fbevents.js                                    200          31766       412
## # … with 49 more rows

group_by(xdf, content_type) %>% 
  summarise(


@@ 201,18 201,18 @@ group_by(xdf, content_type) %>%
## # A tibble: 12 x 3
##    content_type               total_size total_load_time
##    <chr>                           <dbl>           <dbl>
##  1 ""                              45565           0.521
##  2 "application/javascript"       265147           1.78 
##  3 "application/json"               4100           0.687
##  4 "application/x-javascript"     152398           1.97 
##  5 "image/gif"                        35           0.557
##  6 "image/jpeg"                    59772           0.114
##  7 "image/png"                     40634           0.269
##  8 "image/svg+xml"                 10869           0.314
##  9 "text/css"                     118095           2.81 
## 10 "text/html"                     12709           0.798
## 11 "text/javascript"              249573           2.02 
## 12 "text/plain"                       28           0.344
##  1 ""                                  0           1.02 
##  2 "application/javascript"       443531           3.61 
##  3 "application/json"               4176           3.10 
##  4 "application/x-javascript"     161004           1.69 
##  5 "image/gif"                       131           0.561
##  6 "image/jpeg"                    59772           0.105
##  7 "image/png"                     40634           0.234
##  8 "image/svg+xml"                 10869           0.303
##  9 "text/css"                     121175           2.81 
## 10 "text/html"                     14425           1.3  
## 11 "text/javascript"              174172           1.42 
## 12 "text/plain"                       28           0.354
```

### DSL


@@ 221,7 221,7 @@ group_by(xdf, content_type) %>%
wc <- web_client(emulate = "chrome")

wc %>% wc_browser_info()
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 / en-US >
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 / en-US >

wc <- web_client()



@@ 263,7 263,7 @@ wc %>%
  wc_render("text") %>% 
  substr(1, 300) %>% 
  cat()
## USA.gov: The U.S. Government's Official Web Portal | USAGov
## Official Guide to Government Information and Services | USAGov
## Skip to main content
## An official website of the United States government Here's how you know
## 


@@ 275,17 275,17 @@ wc %>%
## All Topics and Services
## Benefits, Grants, Loans
## Government Agencies and Elected Officials
## Jobs and Unemployme
## Jobs and Unemplo
```

### htmlunit Metrics

| Lang  | \# Files |  (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R     |       14 | 0.78 | 351 | 0.76 |         193 | 0.74 |      372 | 0.83 |
| Rmd   |        1 | 0.06 |  41 | 0.09 |          52 | 0.20 |       75 | 0.17 |
| R     |       13 | 0.76 | 320 | 0.75 |         182 | 0.73 |      372 | 0.83 |
| Rmd   |        1 | 0.06 |  41 | 0.10 |          52 | 0.21 |       75 | 0.17 |
| Maven |        1 | 0.06 |  30 | 0.07 |           0 | 0.00 |        1 | 0.00 |
| Java  |        1 | 0.06 |  28 | 0.06 |          12 | 0.05 |        0 | 0.00 |
| Java  |        1 | 0.06 |  28 | 0.07 |          12 | 0.05 |        0 | 0.00 |
| make  |        1 | 0.06 |  10 | 0.02 |           4 | 0.02 |        0 | 0.00 |

## Code of Conduct

M inst/java/htmlunit-1.0-SNAPSHOT.jar => inst/java/htmlunit-1.0-SNAPSHOT.jar +0 -0
R tests/testthat/test-htmlunit.R => inst/tinytest/test_htmlunit.R +29 -33
@@ 1,45 1,41 @@
context("Core htmlunit ops work")
test_that("we can do something", {

  test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"

  w <- web_client()
w <- web_client()

  expect_is(w, "webclient")
  expect_is(wc_browser_info(w), "browserinfo")
# Check class membership with inherits() rather than comparing the whole
# class vector, so the assertions keep passing if these objects ever carry
# additional classes.
expect_true(inherits(w, "webclient"))
expect_true(inherits(wc_browser_info(w), "browserinfo"))

  expect_is(wc_go(w, url = test_url), "webclient")
# Navigate to the test page; the result should be a webclient object.
# inherits() is used instead of an exact class-vector comparison.
expect_true(inherits(wc_go(w, url = test_url), "webclient"))

  expect_equal(wc_url(w), test_url)
  expect_equal(wc_title(w), "")
expect_equal(wc_url(w), test_url)
expect_equal(wc_title(w), "")

  expect_is(wc_render(w, "parsed"), "xml_document")
  expect_is(wc_render(w, "html"), "character")
  expect_is(wc_render(w, "text"), "character")
expect_true(inherits(wc_render(w, "parsed"), "xml_document"))
expect_true(inherits(wc_render(w, "html"), "character"))
expect_true(inherits(wc_render(w, "text"), "character"))

  expect_is(wc_click_on(w, "table"), "webclient")
expect_true(inherits(wc_click_on(w, "table"), "webclient"))

  expect_equal(
    wc_html_nodes(w, "title") %>%  sapply(wc_html_text),
    ""
  )
expect_equal(
  wc_html_nodes(w, "title") %>%  sapply(wc_html_text),
  ""
)

  expect_equal(
    wc_html_nodes(w, "title") %>% sapply(wc_html_name),
    "title"
  )
expect_equal(
  wc_html_nodes(w, "title") %>% sapply(wc_html_name),
  "title"
)

  h <- wc_headers(w)
  expect_true(any(h$value == "GitHub.com"))
h <- wc_headers(w)
expect_true(any(h$value == "GitHub.com"))

  expect_is(
    hu_read_html(url = test_url, ret = "html_document"),
    "xml_document"
  )
  expect_is(
    hu_read_html(url = test_url, ret = "text"),
    "character"
  )
# hu_read_html() should return a parsed document or character data depending
# on `ret`. Both checks use expect_true(inherits()) so the script does not
# rely on expect_inherits() being available in the installed tinytest.
expect_true(
  inherits(
    hu_read_html(url = test_url, ret = "html_document"),
    "xml_document"
  )
)
expect_true(
  inherits(
    hu_read_html(url = test_url, ret = "text"),
    "character"
  )
)


})

A java/htmlunit/deps/commons-lang3-3.10.jar => java/htmlunit/deps/commons-lang3-3.10.jar +0 -0
A java/htmlunit/deps/htmlunit-2.40.0.jar => java/htmlunit/deps/htmlunit-2.40.0.jar +0 -0
A java/htmlunit/deps/htmlunit-core-js-2.40.0.jar => java/htmlunit/deps/htmlunit-core-js-2.40.0.jar +0 -0
A java/htmlunit/deps/jetty-client-9.4.28.v20200408.jar => java/htmlunit/deps/jetty-client-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/jetty-http-9.4.28.v20200408.jar => java/htmlunit/deps/jetty-http-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/jetty-io-9.4.28.v20200408.jar => java/htmlunit/deps/jetty-io-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/jetty-util-9.4.28.v20200408.jar => java/htmlunit/deps/jetty-util-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/jetty-xml-9.4.28.v20200408.jar => java/htmlunit/deps/jetty-xml-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/neko-htmlunit-2.40.0.jar => java/htmlunit/deps/neko-htmlunit-2.40.0.jar +0 -0
A java/htmlunit/deps/salvation-2.7.1.jar => java/htmlunit/deps/salvation-2.7.1.jar +0 -0
A java/htmlunit/deps/websocket-api-9.4.28.v20200408.jar => java/htmlunit/deps/websocket-api-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/websocket-client-9.4.28.v20200408.jar => java/htmlunit/deps/websocket-client-9.4.28.v20200408.jar +0 -0
A java/htmlunit/deps/websocket-common-9.4.28.v20200408.jar => java/htmlunit/deps/websocket-common-9.4.28.v20200408.jar +0 -0
M java/htmlunit/pom.xml => java/htmlunit/pom.xml +1 -1
@@ 25,7 25,7 @@
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.38.0</version>
      <version>2.40.0</version>
    </dependency>
  </dependencies>
</project>

D java/htmlunit/target/classes/is/rud/htmlunit/App$1.class => java/htmlunit/target/classes/is/rud/htmlunit/App$1.class +0 -0
D java/htmlunit/target/classes/is/rud/htmlunit/App.class => java/htmlunit/target/classes/is/rud/htmlunit/App.class +0 -0
M java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar => java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar +0 -0
M java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst => java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst +0 -2
@@ 1,4 1,2 @@
is/rud/htmlunit/App$1.class
is/rud/htmlunit/Zapp.class
is/rud/htmlunit/App.class
is/rud/htmlunit/Zapp$1.class

M java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst => java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst +0 -1
@@ 1,2 1,1 @@
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/App.java
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java

D tests/test-all.R => tests/test-all.R +0 -2
@@ 1,2 0,0 @@
library(testthat)
test_check("htmlunit")

A tests/tinytest.R => tests/tinytest.R +5 -0
@@ 0,0 1,5 @@

# Run the installed package's tinytest suite, but only when the tinytest
# namespace can be loaded; otherwise this script does nothing.
tinytest_available <- requireNamespace("tinytest", quietly = TRUE)
if (tinytest_available) {
  tinytest::test_package("htmlunit")
}