~hrbrmstr/htmlunit

b9dd94a108c3aeaea1e438b91e61c14cda15a4ae — hrbrmstr 3 years ago cef05d7 master
2.43.0
M NEWS.md => NEWS.md +2 -1
@@ 1,6 1,7 @@
0.5.0
* Updated for 2.43.0 jars
* Added `timeout` to `wc_inspect()`
* Added support for Microsoft Edge browser
* Added `timeout`, `css`, and `images` parameters to `wc_inspect()`

0.4.0
* Switched to {tinytest}

M R/hu-read-html.R => R/hu-read-html.R +10 -3
@@ 45,7 45,7 @@
#' hu_read_html(test_url)
#' }
hu_read_html <- function(url,
                         emulate = c("best", "chrome", "firefox", "ie"),
                         emulate = c("best", "chrome", "firefox", "ie", "edge"),
                         ret = c("html_document", "text"),
                         js_delay = 2000L,
                         timeout = 30000L,


@@ 54,7 54,7 @@ hu_read_html <- function(url,
                         download_images = FALSE,
                         options = c("RECOVER", "NOERROR", "NOBLANKS")) {

  emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie"))
  emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
  ret <- match.arg(ret, c("html_document", "text"))

  available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")


@@ 63,12 63,19 @@ hu_read_html <- function(url,
    emulate,
    best = available_browsers$BEST_SUPPORTED,
    chrome = available_browsers$CHROME,
    firefox = available_browsers$FIREFOX_60,
    firefox = available_browsers$FIREFOX,
    edge = available_browsers$EDGE,
    ie = available_browsers$INTERNET_EXPLORER
  ) -> use_browser

  wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser)

  cssErrorHandler <- .jnew("is.rud.htmlunit.RDefaultCssErrorHandler")
  wc$setCssErrorHandler(cssErrorHandler)

  incorrectListenerHandler <- .jnew("is.rud.htmlunit.RIncorrectnessListener")
  wc$setIncorrectnessListener(incorrectListenerHandler)

  res <- wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay)))

  wc_opts <- wc$getOptions()

M R/wc-inspect.R => R/wc-inspect.R +11 -3
@@ 5,16 5,24 @@
#' @md
#' @param url URL to fetch
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
#' @param timeout Sets the timeout (milliseconds) of the webc onnection. Set to zero for an infinite wait.
#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
#'        Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket
#'        connection, the second is for data retrieval. If the time is critical you must allow for twice
#'        the time specified here.
#' @param css,images enable CSS/download images? (default `FALSE`)
#' @export
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L) {
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L, css = FALSE, images = FALSE) {

  app <- J("is.rud.htmlunit.Zapp")

  res <- app$getRequestsFor(url, .jlong(js_delay), .jint(timeout))
  app$getRequestsFor(
    url,
    .jlong(js_delay),
    as.integer(timeout),
    .jnew("java/lang/Boolean", css),
    .jnew("java/lang/Boolean", images)
  ) -> res

  res <- as.list(res)

  lapply(res, function(.x) {

M R/web-client.R => R/web-client.R +4 -3
@@ 14,17 14,18 @@
#' @examples
#' w <- web_client()
#' wc_browser_info(w)
web_client <- function(emulate = c("best", "chrome", "firefox", "ie"),
web_client <- function(emulate = c("best", "chrome", "firefox", "ie", "edge"),
                       proxy_host = NULL, proxy_port = NULL) {

  emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie"))
  emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
  available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")

  switch(
    emulate,
    best = available_browsers$BEST_SUPPORTED,
    chrome = available_browsers$CHROME,
    firefox = available_browsers$FIREFOX_60,
    firefox = available_browsers$FIREFOX,
    edge = available_browsers$EDGE,
    ie = available_browsers$INTERNET_EXPLORER
  ) -> use_browser


A R/zzz.R => R/zzz.R +12 -0
@@ 0,0 1,12 @@
# Reset the JVM's java.util.logging LogManager.
#
# Looks up the LogManager singleton through rJava and calls its `reset()`
# method. Called purely for that side effect; always returns `NULL`
# invisibly.
# NOTE(review): presumably intended to quiet HtmlUnit's log chatter in the
# R console -- confirm against the java.util.logging documentation.
stop_logging <- function() {
  log_manager_class <- rJava::J("java.util.logging.LogManager")
  log_manager <- log_manager_class$getLogManager()
  log_manager$reset()
  invisible(NULL)
}

# Package load hook.
#
# 1. Hands the package over to rJava::.jpackage() with `jars = "*"`, using
#    `libname` as the library location.
# 2. Additionally appends every file found under `<working dir>/inst/java`
#    to the Java class path.
#    NOTE(review): this depends on getwd() at load time; it looks like a
#    development-time aid (running from the package source tree) -- confirm
#    it is still wanted for installed use.
# 3. Resets JVM logging via stop_logging().
#
# @param libname library directory the package is being loaded from
# @param pkgname name of the package being loaded
.onLoad <- function(libname, pkgname) {
  rJava::.jpackage(pkgname, jars = "*", lib.loc = libname)

  source_tree_jar_dir <- file.path(getwd(), "inst/java")
  source_tree_jars <- dir(source_tree_jar_dir, full.names = TRUE)
  rJava::.jaddClassPath(source_tree_jars)

  stop_logging()
}



M README.md => README.md +33 -36
@@ 11,7 11,7 @@ Status](https://travis-ci.org/hrbrmstr/htmlunit.svg?branch=master)](https://trav
[![Coverage
Status](https://codecov.io/gh/hrbrmstr/htmlunit/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/htmlunit)
![Minimal R
Version](https://img.shields.io/badge/R%3E%3D-3.2.0-blue.svg)
Version](https://img.shields.io/badge/R%3E%3D-3.6.0-blue.svg)
![License](https://img.shields.io/badge/License-Apache-blue.svg)

# htmlunit


@@ 132,7 132,7 @@ library(tidyverse) # for some data ops; not req'd for pkg

# current version
packageVersion("htmlunit")
## [1] '0.4.0'
## [1] '0.5.0'
```

Something `xml2::read_html()` cannot do, read the table from


@@ 178,41 178,36 @@ colnames(xdf)
## [7] "content_type"   "load_time"      "headers"

select(xdf, method, url, status_code, content_length, load_time)
## # A tibble: 59 x 5
## # A tibble: 36 x 5
##    method url                                                                       status_code content_length load_time
##    <chr>  <chr>                                                                           <int>          <dbl>     <dbl>
##  1 GET    https://rstudio.com/                                                              200          13531       625
##  2 GET    https://use.fontawesome.com/releases/v5.0.6/css/all.css                           200           8699       376
##  3 GET    https://d33wubrfki0l68.cloudfront.net/bundles/c5ddb3e999592179708beea702…         200          53046       563
##  4 GET    https://cdn.rawgit.com/noelboss/featherlight/1.7.13/release/featherlight…         200            763       376
##  5 GET    https://d33wubrfki0l68.cloudfront.net/css/4a0f49009a213e6e2207c6f66893f0…         200            505        73
##  6 GET    https://gitcdn.github.io/bootstrap-toggle/2.2.2/css/bootstrap-toggle.min…         200            548       258
##  7 GET    https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-aweso…         200           6663       247
##  8 GET    https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js         200           3876       364
##  9 GET    https://snap.licdn.com/li.lms-analytics/insight.min.js                            200           1576       455
## 10 GET    https://connect.facebook.net/en_US/fbevents.js                                    200          31766       412
## # … with 49 more rows
##  1 GET    https://rstudio.com/                                                              200          14621       495
##  2 GET    https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js         200           3576       221
##  3 GET    https://snap.licdn.com/li.lms-analytics/insight.min.js                            200           1576       162
##  4 GET    https://connect.facebook.net/en_US/fbevents.js                                    200          34269       138
##  5 GET    https://connect.facebook.net/signals/config/151855192184380?v=2.9.23&r=s…         200         134841        66
##  6 GET    https://munchkin.marketo.net/munchkin-beta.js                                     200            752       230
##  7 GET    https://munchkin.marketo.net/159/munchkin.js                                      200           4810        27
##  8 GET    https://x.clearbitjs.com/v1/pk_60c5aa2221e3c03eca10fb6876aa6df7/clearbit…         200          86568       483
##  9 GET    https://cdn.segment.com/analytics.js/v1/gO0uTGfCkO4DQpfkRim9mBsjdKrehtnu…         200          62860       243
## 10 GET    https://static.hotjar.com/c/hotjar-1446157.js?sv=6                                200           1708       212
## # … with 26 more rows

group_by(xdf, content_type) %>% 
  summarise(
    total_size = sum(content_length), 
    total_load_time = sum(load_time)/1000
  )
## # A tibble: 12 x 3
##    content_type               total_size total_load_time
##    <chr>                           <dbl>           <dbl>
##  1 ""                                  0           1.02 
##  2 "application/javascript"       443531           3.61 
##  3 "application/json"               4176           3.10 
##  4 "application/x-javascript"     161004           1.69 
##  5 "image/gif"                       131           0.561
##  6 "image/jpeg"                    59772           0.105
##  7 "image/png"                     40634           0.234
##  8 "image/svg+xml"                 10869           0.303
##  9 "text/css"                     121175           2.81 
## 10 "text/html"                     14425           1.3  
## 11 "text/javascript"              174172           1.42 
## 12 "text/plain"                       28           0.354
## # A tibble: 7 x 3
##   content_type             total_size total_load_time
##   <chr>                         <dbl>           <dbl>
## 1 application/javascript       431338           2.58 
## 2 application/json               4118           1.37 
## 3 application/x-javascript     176248           0.623
## 4 image/gif                        35           0.232
## 5 text/html                     16640           1.36 
## 6 text/javascript              254971           0.996
## 7 text/plain                       28           0.189
```

### DSL


@@ 221,7 216,7 @@ group_by(xdf, content_type) %>%
wc <- web_client(emulate = "chrome")

wc %>% wc_browser_info()
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 / en-US >
## < Netscape / 5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 / en-US >

wc <- web_client()



@@ 268,6 263,7 @@ wc %>%
## An official website of the United States government Here's how you know
## 
## 
## Main Navigation
## Search
## Search
## Search


@@ 275,18 271,19 @@ wc %>%
## All Topics and Services
## Benefits, Grants, Loans
## Government Agencies and Elected Officials
## Jobs and Unemplo
```

### htmlunit Metrics

| Lang  | \# Files |  (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R     |       13 | 0.76 | 320 | 0.75 |         182 | 0.73 |      372 | 0.83 |
| Rmd   |        1 | 0.06 |  41 | 0.10 |          52 | 0.21 |       75 | 0.17 |
| Maven |        1 | 0.06 |  30 | 0.07 |           0 | 0.00 |        1 | 0.00 |
| Java  |        1 | 0.06 |  28 | 0.07 |          12 | 0.05 |        0 | 0.00 |
| make  |        1 | 0.06 |  10 | 0.02 |           4 | 0.02 |        0 | 0.00 |
| R     |       14 | 0.70 | 341 | 0.72 |         188 | 0.70 |      377 | 0.82 |
| Java  |        3 | 0.15 |  52 | 0.11 |          23 | 0.09 |        3 | 0.01 |
| Rmd   |        1 | 0.05 |  41 | 0.09 |          52 | 0.19 |       75 | 0.16 |
| Maven |        1 | 0.05 |  30 | 0.06 |           0 | 0.00 |        1 | 0.00 |
| make  |        1 | 0.05 |  10 | 0.02 |           4 | 0.01 |        4 | 0.01 |

clock Package Metrics for htmlunit

## Code of Conduct


M inst/java/htmlunit-1.0-SNAPSHOT.jar => inst/java/htmlunit-1.0-SNAPSHOT.jar +0 -0
M java/htmlunit/Makefile => java/htmlunit/Makefile +8 -4
@@ 1,14 1,18 @@
.PHONY: clean pkg deps run

pkg:
	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn --quiet package
#	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn --quiet package
	JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn --quiet package
	cp target/htmlunit-1.0-SNAPSHOT.jar ../../inst/java

clean:
	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn clean
#	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn clean
	JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn clean

deps:
	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps
#	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps
	JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps

new:
	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
#	JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
	JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false

A java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java => java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java +17 -0
@@ 0,0 1,17 @@
package is.rud.htmlunit;

public class RDefaultCssErrorHandler implements com.gargoylesoftware.css.parser.CSSErrorHandler,
                                                java.io.Serializable {
    @Override
    public void error(final com.gargoylesoftware.css.parser.CSSParseException exception) {
    }

    @Override
    public void fatalError(final com.gargoylesoftware.css.parser.CSSParseException exception) {
    }

    @Override
    public void warning(final com.gargoylesoftware.css.parser.CSSParseException exception) {
    }

}
\ No newline at end of file

A java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java => java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java +13 -0
@@ 0,0 1,13 @@
package is.rud.htmlunit;

/**
 * An {@code IncorrectnessListener} that does nothing.
 *
 * <p>The single callback has an empty body, so every notification passed to
 * this listener is silently discarded. The class is also
 * {@code Serializable}.
 */
public class RIncorrectnessListener
    implements com.gargoylesoftware.htmlunit.IncorrectnessListener, java.io.Serializable {

    /**
     * Intentionally empty: the notification is dropped.
     *
     * @param message the notification text (unused)
     * @param origin  the object that raised the notification (unused)
     */
    @Override
    public void notify(String message, Object origin) {
        // no-op
    }
}

M java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java => java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java +11 -3
@@ 2,20 2,28 @@ package is.rud.htmlunit;

import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.util.*;

import java.util.*;
import java.lang.*;
import java.io.*;

public class Zapp {

  public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout) throws IOException {
  private static com.gargoylesoftware.htmlunit.IncorrectnessListener incorrectnessListener_ = new RIncorrectnessListener();
  private static com.gargoylesoftware.css.parser.CSSErrorHandler cssErrorHandler_ = new RDefaultCssErrorHandler();

  public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout, Boolean css, Boolean images) throws IOException {

    final WebClient webClient = new WebClient(BrowserVersion.CHROME);

    webClient.setCssErrorHandler(cssErrorHandler_);
    webClient.setIncorrectnessListener(incorrectnessListener_);

    WebClientOptions wco = webClient.getOptions();

    wco.setThrowExceptionOnScriptError(false);
    wco.setCssEnabled(true);
    wco.setDownloadImages(true);
    wco.setCssEnabled(css);
    wco.setDownloadImages(images);
    wco.setTimeout(timeout);

    final List<WebResponse> list = new ArrayList<>();

A java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class => java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class +0 -0
A java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class => java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class +0 -0
M java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class => java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class +0 -0
M java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class => java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class +0 -0
M java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar => java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar +0 -0
M java/htmlunit/target/maven-archiver/pom.properties => java/htmlunit/target/maven-archiver/pom.properties +1 -1
@@ 1,5 1,5 @@
#Generated by Maven
#Tue Mar 10 08:03:25 EDT 2020
#Wed Aug 19 08:51:02 EDT 2020
groupId=is.rud.htmlunit
artifactId=htmlunit
version=1.0-SNAPSHOT

M java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst => java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst +2 -0
@@ 1,2 1,4 @@
is/rud/htmlunit/RDefaultCssErrorHandler.class
is/rud/htmlunit/RIncorrectnessListener.class
is/rud/htmlunit/Zapp.class
is/rud/htmlunit/Zapp$1.class

M java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst => java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst +2 -0
@@ 1,1 1,3 @@
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java

M man/hu_read_html.Rd => man/hu_read_html.Rd +1 -1
@@ 6,7 6,7 @@
\usage{
hu_read_html(
  url,
  emulate = c("best", "chrome", "firefox", "ie"),
  emulate = c("best", "chrome", "firefox", "ie", "edge"),
  ret = c("html_document", "text"),
  js_delay = 2000L,
  timeout = 30000L,

M man/wc_inspect.Rd => man/wc_inspect.Rd +10 -2
@@ 4,17 4,25 @@
\alias{wc_inspect}
\title{Perform a "Developer Tools"-like Network Inspection of a URL}
\usage{
wc_inspect(url, js_delay = 5000L, timeout = 30000L)
wc_inspect(
  url,
  js_delay = 5000L,
  timeout = 30000L,
  css = FALSE,
  images = FALSE
)
}
\arguments{
\item{url}{URL to fetch}

\item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)}

\item{timeout}{Sets the timeout (milliseconds) of the webc onnection. Set to zero for an infinite wait.
\item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket
connection, the second is for data retrieval. If the time is critical you must allow for twice
the time specified here.}

\item{css, images}{enable CSS/download images? (default \code{FALSE})}
}
\description{
Retrieves \emph{all} content loaded

M man/web_client.Rd => man/web_client.Rd +2 -2
@@ 6,13 6,13 @@
\title{Create a new HtmlUnit WebClient instance}
\usage{
web_client(
  emulate = c("best", "chrome", "firefox", "ie"),
  emulate = c("best", "chrome", "firefox", "ie", "edge"),
  proxy_host = NULL,
  proxy_port = NULL
)

webclient(
  emulate = c("best", "chrome", "firefox", "ie"),
  emulate = c("best", "chrome", "firefox", "ie", "edge"),
  proxy_host = NULL,
  proxy_port = NULL
)