~hrbrmstr/htmlunit

cef05d7ddfa99cc1415b6b99bf2eceea28a6cc27 — hrbrmstr 4 years ago a5e22b9
2.43.0
M DESCRIPTION => DESCRIPTION +5 -5
@@ 1,8 1,8 @@
Package: htmlunit
Type: Package
Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library
Version: 0.4.0
Date: 2020-05-09
Version: 0.5.0
Date: 2020-07-18
Authors@R: c(
    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 
           comment = c(ORCID = "0000-0001-5670-2640")),


@@ 28,11 28,11 @@ Imports:
Suggests: 
    covr, tinytest
Depends: 
    R (>= 3.2.0),
    R (>= 3.6.0),
    rJava,
    htmlunitjars (>= 2.40.0),
    htmlunitjars (>= 2.43.0),
    rvest,
    xml2
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
Remotes: gitlab::hrbrmstr/htmlunitjars

M NEWS.md => NEWS.md +4 -0
@@ 1,3 1,7 @@
0.5.0
* Updated for 2.43.0 jars
* Added `timeout` to `wc_inspect()`

0.4.0
* Switched to {tinytest}
* Updated for 2.40.0 jars

M R/wc-inspect.R => R/wc-inspect.R +6 -2
@@ 5,12 5,16 @@
#' @md
#' @param url URL to fetch
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
#'        Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket
#'        connection, the second is for data retrieval. If the time is critical you must allow for twice
#'        the time specified here.
#' @export
wc_inspect <- function(url, js_delay = 5000L) {
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L) {

  app <- J("is.rud.htmlunit.Zapp")

  res <- app$getRequestsFor(url, .jlong(js_delay))
  res <- app$getRequestsFor(url, .jlong(js_delay), .jint(timeout))
  res <- as.list(res)

  lapply(res, function(.x) {

M inst/java/htmlunit-1.0-SNAPSHOT.jar => inst/java/htmlunit-1.0-SNAPSHOT.jar +0 -0
A java/htmlunit/deps/commons-io-2.7.jar => java/htmlunit/deps/commons-io-2.7.jar +0 -0
A java/htmlunit/deps/commons-lang3-3.11.jar => java/htmlunit/deps/commons-lang3-3.11.jar +0 -0
A java/htmlunit/deps/commons-net-3.7.jar => java/htmlunit/deps/commons-net-3.7.jar +0 -0
A java/htmlunit/deps/commons-text-1.9.jar => java/htmlunit/deps/commons-text-1.9.jar +0 -0
A java/htmlunit/deps/htmlunit-2.43.0.jar => java/htmlunit/deps/htmlunit-2.43.0.jar +0 -0
A java/htmlunit/deps/htmlunit-core-js-2.43.0.jar => java/htmlunit/deps/htmlunit-core-js-2.43.0.jar +0 -0
A java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar => java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/neko-htmlunit-2.43.0.jar => java/htmlunit/deps/neko-htmlunit-2.43.0.jar +0 -0
A java/htmlunit/deps/salvation-2.7.2.jar => java/htmlunit/deps/salvation-2.7.2.jar +0 -0
A java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar => java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar => java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar +0 -0
A java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar => java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar +0 -0
M java/htmlunit/pom.xml => java/htmlunit/pom.xml +1 -1
@@ 25,7 25,7 @@
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.40.0</version>
      <version>2.43.0</version>
    </dependency>
  </dependencies>
</project>

M java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java => java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java +2 -2
@@ 8,7 8,7 @@ import java.io.*;

public class Zapp {

  public static List<WebResponse> getRequestsFor(String url, long jsDelay) throws IOException {
  public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout) throws IOException {

    final WebClient webClient = new WebClient(BrowserVersion.CHROME);



@@ 16,7 16,7 @@ public class Zapp {
    wco.setThrowExceptionOnScriptError(false);
    wco.setCssEnabled(true);
    wco.setDownloadImages(true);
    wco.setTimeout(30000);
    wco.setTimeout(timeout);

    final List<WebResponse> list = new ArrayList<>();


M java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class => java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class +0 -0
M java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class => java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class +0 -0
M java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar => java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar +0 -0
M man/hu_read_html.Rd => man/hu_read_html.Rd +2 -2
@@ 22,7 22,7 @@ hu_read_html(
\item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"}

\item{ret}{what to return; if \code{html_document} (the default) then the HTML created
by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_html]{xml2::read_html()}}
by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_xml]{xml2::read_html()}}
and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes
further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what
\code{HtmlUnit} generated. If you want the HTML code (text) without any further


@@ 47,7 47,7 @@ function is a high-level wrapper designed to do a read of HTML,
it is recommended that you leave this the default \code{FALSE} to save
time/bandwidth.}

\item{options}{options to pass to \code{\link[xml2:read_html]{xml2::read_html()}} if \code{ret} == \code{html_document}.}
\item{options}{options to pass to \code{\link[xml2:read_xml]{xml2::read_html()}} if \code{ret} == \code{html_document}.}
}
\value{
an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else

M man/wc_inspect.Rd => man/wc_inspect.Rd +6 -1
@@ 4,12 4,17 @@
\alias{wc_inspect}
\title{Perform a "Developer Tools"-like Network Inspection of a URL}
\usage{
wc_inspect(url, js_delay = 5000L)
wc_inspect(url, js_delay = 5000L, timeout = 30000L)
}
\arguments{
\item{url}{URL to fetch}

\item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)}

\item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket
connection, the second is for data retrieval. If the time is critical you must allow for twice
the time specified here.}
}
\description{
Retrieves \emph{all} content loaded