@@ 1,8 1,8 @@
Package: htmlunit
Type: Package
Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library
-Version: 0.4.0
-Date: 2020-05-09
+Version: 0.5.0
+Date: 2020-07-18
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
@@ 28,11 28,11 @@ Imports:
covr, tinytest
- R (>= 3.2.0),
+ R (>= 3.6.0),
- htmlunitjars (>= 2.40.0),
+ htmlunitjars (>= 2.43.0),
Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
Remotes: gitlab::hrbrmstr/htmlunitjars
M NEWS.md => NEWS.md +4 -0
@@ 1,3 1,7 @@
+* Updated for 2.43.0 jars
+* Added `timeout` to `wc_inspect()`
* Switched to {tinytest}
* Updated for 2.40.0 jars
M R/wc-inspect.R => R/wc-inspect.R +6 -2
@@ 5,12 5,16 @@
#' @md
#' @param url URL to fetch
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
+#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
+#' Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket
+#' connection, the second is for data retrieval. If the time is critical you must allow for twice
+#' the time specified here.
#' @export
-wc_inspect <- function(url, js_delay = 5000L) {
+wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L) {
app <- J("is.rud.htmlunit.Zapp")
- res <- app$getRequestsFor(url, .jlong(js_delay))
+ res <- app$getRequestsFor(url, .jlong(js_delay), .jint(timeout))
res <- as.list(res)
lapply(res, function(.x) {
M inst/java/htmlunit-1.0-SNAPSHOT.jar => inst/java/htmlunit-1.0-SNAPSHOT.jar +0 -0
A java/htmlunit/deps/commons-io-2.7.jar => java/htmlunit/deps/commons-io-2.7.jar +0 -0
A java/htmlunit/deps/commons-lang3-3.11.jar => java/htmlunit/deps/commons-lang3-3.11.jar +0 -0
A java/htmlunit/deps/commons-net-3.7.jar => java/htmlunit/deps/commons-net-3.7.jar +0 -0
A java/htmlunit/deps/commons-text-1.9.jar => java/htmlunit/deps/commons-text-1.9.jar +0 -0
A java/htmlunit/deps/htmlunit-2.43.0.jar => java/htmlunit/deps/htmlunit-2.43.0.jar +0 -0
A java/htmlunit/deps/htmlunit-core-js-2.43.0.jar => java/htmlunit/deps/htmlunit-core-js-2.43.0.jar +0 -0
A java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/neko-htmlunit-2.43.0.jar => java/htmlunit/deps/neko-htmlunit-2.43.0.jar +0 -0
A java/htmlunit/deps/salvation-2.7.2.jar => java/htmlunit/deps/salvation-2.7.2.jar +0 -0
A java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar => java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar => java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar => java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar +0 -0
M java/htmlunit/pom.xml => java/htmlunit/pom.xml +1 -1
@@ 25,7 25,7 @@
- <version>2.40.0</version>
+ <version>2.43.0</version>
M java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java => java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java +2 -2
@@ 8,7 8,7 @@ import java.io.*;
public class Zapp {
- public static List<WebResponse> getRequestsFor(String url, long jsDelay) throws IOException {
+ public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout) throws IOException {
final WebClient webClient = new WebClient(BrowserVersion.CHROME);
@@ 16,7 16,7 @@ public class Zapp {
- wco.setTimeout(30000);
+ wco.setTimeout(timeout);
final List<WebResponse> list = new ArrayList<>();
M java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class => java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class +0 -0
M java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class => java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class +0 -0
M java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar => java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar +0 -0
M man/hu_read_html.Rd => man/hu_read_html.Rd +2 -2
@@ 22,7 22,7 @@ hu_read_html(
\item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"}
\item{ret}{what to return; if \code{html_document} (the default) then the HTML created
-by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_html]{xml2::read_html()}}
+by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_xml]{xml2::read_html()}}
and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes
further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what
\code{HtmlUnit} generated. If you want the HTML code (text) without any further
@@ 47,7 47,7 @@ function is a high-level wrapper designed to do a read of HTML,
it is recommended that you leave this the default \code{FALSE} to save
-\item{options}{options to pass to \code{\link[xml2:read_html]{xml2::read_html()}} if \code{ret} == \code{html_document}.}
+\item{options}{options to pass to \code{\link[xml2:read_xml]{xml2::read_html()}} if \code{ret} == \code{html_document}.}
an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else
M man/wc_inspect.Rd => man/wc_inspect.Rd +6 -1
@@ 4,12 4,17 @@
\title{Perform a "Developer Tools"-like Network Inspection of a URL}
-wc_inspect(url, js_delay = 5000L)
+wc_inspect(url, js_delay = 5000L, timeout = 30000L)
\item{url}{URL to fetch}
\item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)}
+\item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
+Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket
+connection, the second is for data retrieval. If the time is critical you must allow for twice
+the time specified here.}
Retrieves \emph{all} content loaded