~hrbrmstr/reapr

c8ee6d10dd032cf130583ed34fb0008480d11b1b — hrbrmstr 3 years ago 0285591
going public
M DESCRIPTION => DESCRIPTION +12 -2
@@ 8,7 8,15 @@ Authors@R: c(
           comment = c(ORCID = "0000-0001-5670-2640"))
  )
Maintainer: Bob Rudis <bob@rud.is>
Description: This will eventually be a clever description about web scraping with reapr.
Description: There's no longer a need to fear getting at the gnarly bits of web pages.
    For the vast majority of web scraping tasks, the 'rvest' package does a 
    phenomenal job providing just enough of what you need to get by. But, if you 
    want more of the details of the site you're scraping, some handy shortcuts to
    page elements in use and the ability to not have to think too hard about 
    serialization during scraping tasks, then you may be interested in reaping
    more than harvesting. Tools are provided to interact with web site content
    and metadata at a more granular level than 'rvest' but at a higher level
    than 'httr'/'curl'.
URL: https://gitlab.com/hrbrmstr/reapr
BugReports: https://gitlab.com/hrbrmstr/reapr/issues
NeedsCompilation: yes


@@ 26,6 34,8 @@ Imports:
    selectr,
    magrittr,
    curl,
    methods
    methods,
    xslt,
    stats
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1

M NAMESPACE => NAMESPACE +25 -0
@@ 1,14 1,39 @@
# Generated by roxygen2: do not edit by hand

S3method(as.data.frame,reapr_taglist)
S3method(print,reapr_doc)
S3method(print,reapr_raw_tbl)
S3method(print,reapr_tbl_cell)
S3method(print,reapr_tbl_row)
S3method(reap_node,default)
S3method(reap_node,reapr_doc)
S3method(reap_nodes,default)
S3method(reap_nodes,reapr_doc)
S3method(reap_table,reapr_doc)
S3method(reap_table,xml_document)
S3method(reap_table,xml_node)
S3method(reap_table,xml_nodeset)
export("%>%")
export(add_response_url_from)
export(as_tibble.reapr_taglist)
export(mill)
export(reap_attr)
export(reap_attrs)
export(reap_children)
export(reap_name)
export(reap_node)
export(reap_nodes)
export(reap_table)
export(reap_text)
export(reap_url)
import(httr)
import(selectr)
import(xml2)
import(xslt)
importFrom(curl,nslookup)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(magrittr,"%>%")
importFrom(methods,is)
importFrom(stats,terms)
useDynLib(reapr, .registration=TRUE)

A R/add-response-url.R => R/add-response-url.R +29 -0
@@ 0,0 1,29 @@
#' Add a `reapr_doc` response prefix URL to a data frame
#'
#' @param xdf a data frame
#' @param x a `reapr_doc`
#' @export
#' @examples
#' x <- reap_url("http://books.toscrape.com/")
#'
#' # good ol' R
#' add_response_url_from(
#'   as.data.frame(x$tag$a),
#'   x
#' )
#'
#' \dontrun{
#' # piping
#' as_tibble(x$tag$a) %>%
#'   add_response_url_from(x)
#' }
add_response_url_from <- function(xdf, x) {

  stopifnot(is.data.frame(xdf))
  stopifnot(inherits(x, "reapr_doc"))

  xdf[["prefix_url"]] <- x$response$url %||% NA_character_

  xdf

}
\ No newline at end of file

A R/as-data-frame-taglist.R => R/as-data-frame-taglist.R +31 -0
@@ 0,0 1,31 @@
#' Turn a `reapr_taglist` into a data frame (tibble)
#'
#' Takes a taglist from a `reapr_doc` `tag` slot and turns it
#' into a data frame with normalized column names
#'
#' @param x a `reapr_taglist`
#' @param ... ignored
#' @param trim trim front/back whitespace? Default: `TRUE`
#' @param stringsAsFactors always `FALSE`
#' @export
#' @examples
#' x <- reap_url("http://r-project.org/")
#' as.data.frame(x$tag$meta)
as.data.frame.reapr_taglist <- function(x, ..., trim = TRUE, stringsAsFactors = FALSE) {

  map_df(x, function(.x) {
    tg <- as.list(unlist(xml2::xml_attrs(.x)))
    tg$elem_content <- xml2::xml_text(.x, trim=trim) %||% NA_character_
    as.data.frame(tg, stringsAsFactors=FALSE)
  }) -> xdf

  class(xdf) <- c("tbl_df", "tbl", "data.frame")

  xdf

}

#' @rdname as.data.frame.reapr_taglist
#' @export
as_tibble.reapr_taglist <- as.data.frame.reapr_taglist


A R/mill.R => R/mill.R +39 -0
@@ 0,0 1,39 @@
#' Turn a `reapr_doc` into plain text without cruft
#'
#' Plain text extraction is accomplished via the following idiom: first,
#' an attempt is made to use an XSLT style sheet to select only the best
#' target nodes for extraction. On some malformed HTML content this
#' results in an empty document. When that occurs, a less conservative
#' approach is taken with a simple XPath designed to capture all text in
#' the `<body>` that is not inside `<script>`, `<style>`, or `<noscript>`
#' tags. This is imperfect
#' but does provide fairly decent results when the preferred method fails.
#'
#' @param x a `reapr_doc`
#' @return a character vector of plain text with no HTML
#' @export
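#' @examples
#' \dontrun{
#' # a minimal sketch; the URL is illustrative, any HTML page works
#' x <- reap_url("http://books.toscrape.com/")
#' mill(x)
#' }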
mill <- function(x) {

  stopifnot(inherits(x, "reapr_doc"))

  proc <- xml2::read_xml(system.file("xslt/mill.xslt", package="reapr"))

  x <- validate_parsed_content(x)

  out <- xslt::xml_xslt(x$parsed_html, proc)
  out <- xml2::xml_text(out, trim=TRUE)

  if (!nzchar(out)) {
    xml2::xml_find_all(
      x$parsed_html,
      "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]"
    ) -> out
    out <- paste0(xml2::xml_text(out, trim=TRUE), collapse="\n")
  }

  out <- trimws(unlist(strsplit(out, "[\r\n]+")))
  out <- out[out != ""]
  out <- paste0(out, collapse = "\n")

  out

}
\ No newline at end of file

M R/read-url.R => R/read-url.R +9 -2
@@ 21,7 21,7 @@
#'     - extraction of the plaintext webpage `<title>` (if any)
#'     - generation of a dynamic list of tags in the document which can be
#'       fed directly to HTML/XML search/retrieval functions (which may speed
#'       up node discover)
#'       up node discovery)
#'     - extraction of the text of all comments in the HTML document
#'     - inclusion of the full `httr::response` object with the returned object
#'     - extraction of the time it took to make the complete request


@@ 37,6 37,8 @@
#' @param ... other parameters passed on to [httr::GET()]
#' @return a `reapr_doc` object
#' @export
#' @examples
#' x <- reap_url("http://books.toscrape.com/")
reap_url <- function(url, encoding = "", ...) {

  encoding <- trimws(encoding)


@@ 70,7 72,12 @@ reap_url <- function(url, encoding = "", ...) {
      xml2::xml_find_all(parsed_html, tag)
    }),
    tags_in_doc
  ) -> env$tag
  ) -> env[["tag"]]

  lapply(env[["tag"]], function(.x) {
    class(.x) <- c("reapr_taglist", class(.x))
    .x
  }) -> env[["tag"]]

  env$doc_comments <- xml2::xml_text(xml2::xml_find_all(parsed_html, "//*/comment()"))


A R/reap-bits.R => R/reap-bits.R +52 -0
@@ 0,0 1,52 @@
#' Reap text, names and attributes from HTML
#'
#' You need to pass in anything the underlying [xml2::xml_text()],
#' [xml2::xml_name()], [xml2::xml_children()], [xml2::xml_attrs()],
#' or [xml2::xml_attr()] expect. These are merely convenience wrappers
#' so you don't have to `library(xml2)`.
#'
#' You _can_ pass in the full `$parsed_html` document from a `reapr_doc` if you
#' wish, but you should really be working with the output of
#' [reap_nodes()] or [reap_node()] or the pre-extracted tags in the `$tag`
#' element of a `reapr_doc`.
#'
#' @param x anything the underlying `xml2` functions can take
#' @param trim if `TRUE` then trim whitespace. Unlike the `rvest` counterparts
#'        this defaults to `TRUE`.
#' @export
#' @examples
#' x <- reap_url("http://r-project.org/")
#' reap_text(x$tag$div)
#' reap_nodes(x, ".//*") %>% reap_name()
#' x$tag$div %>% reap_children()
#' reap_attrs(x$tag$div)
reap_text <- function(x, trim = TRUE) {
  xml2::xml_text(x, trim = trim)
}

#' @rdname reap_text
#' @export
reap_name <- function(x) {
  xml2::xml_name(x)
}

#' @rdname reap_text
#' @export
reap_children <- function(x) {
  xml2::xml_children(x)
}

#' @rdname reap_text
#' @export
reap_attrs <- function(x) {
  xml2::xml_attrs(x)
}

#' @rdname reap_text
#' @param name attribute name to retrieve
#' @param otherwise what to return if `name` doesn't exist in a given node
#' @export
reap_attr <- function(x, name, otherwise = NA_character_) {
  xml2::xml_attr(x, name, default = otherwise)
}


A R/reap-nodes.R => R/reap-nodes.R +71 -0
@@ 0,0 1,71 @@
#' Reap nodes from a reaped HTML document
#'
#' Provides similar functionality to `rvest::html_nodes()` except that, when
#' a `reapr_doc` is passed in, it will test for the validity of
#' the pre-parsed HTML content and regenerate the parse tree if the pointer
#' is invalid. Another major difference is that it prefers XPath queries over
#' CSS selectors so the `xpath` and `css` named (yet positional) parameters
#' are in a different order than in their `rvest` cousins.
#'
#' @param x A `reapr_doc` or anything `rvest::html_nodes()` takes.
#' @param xpath,css either an XPath query (string) or CSS selector; NOTE the
#'        order difference.
#' @export
#' @examples
#' x <- reap_url("http://r-project.org/")
#' reap_text(x$tag$div)
#' reap_nodes(x, ".//*") %>% reap_name()
#' x$tag$div %>% reap_children()
#' reap_attrs(x$tag$div)
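#' reap_nodes(x, css = "div") # CSS selectors go in the second slot or by name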
reap_nodes <- function(x, xpath, css) {
  UseMethod("reap_nodes")
}

#' @export
reap_nodes.reapr_doc <- function(x, xpath, css) {
  x <- validate_parsed_content(x)
  reap_nodes(x$parsed_html, xpath, css)
}


#' @export
reap_nodes.default <- function(x, xpath, css) {
  xml2::xml_find_all(x, make_selector(xpath, css))
}

#' @export
#' @rdname reap_nodes
reap_node <- function(x, xpath, css) {
  UseMethod("reap_node")
}

#' @export
reap_node.reapr_doc <- function(x, xpath, css) {
  x <- validate_parsed_content(x)
  xml2::xml_find_first(x$parsed_html, make_selector(xpath, css))
}

#' @export
reap_node.default <- function(x, xpath, css) {
  xml2::xml_find_first(x, make_selector(xpath, css))
}

make_selector <- function(xpath, css) {
  if (missing(css) && missing(xpath))
    stop("Please supply one of css or xpath", call. = FALSE)
  if (!missing(css) && !missing(xpath))
    stop("Please supply css or xpath, not both", call. = FALSE)

  if (!missing(css)) {
    if (!is.character(css) || length(css) != 1)
      stop("`css` must be a string")

    selectr::css_to_xpath(css, prefix = ".//")
  } else {
    if (!is.character(xpath) || length(xpath) != 1)
      stop("`xpath` must be a string")

    xpath
  }
}


A R/reap-table.R => R/reap-table.R +207 -0
@@ 0,0 1,207 @@
#' Extract data from HTML tables
#'
#' This behaves differently than [rvest::html_table()]. It does an
#' aggressive fill by default when `colspan`s or `rowspan`s are detected
#' and does not make any attempt to go beyond providing a basic data frame
#' of the HTML table. See `Details` for more information.
#'
#' The functionality provided in [rvest::html_table()] is double-plus good so
#' the intent of this function was not to subvert it. Rather, [reap_table()]
#' was designed to give you more direct R-access to the underlying structure
#' of an HTML table so you can wrangle it as you please. In "raw" mode,
#' you get a list with attributes enabling you to work with the table structure,
#' cell values and entity attributes with R idioms vs XPath queries.
#'
#' @note When passing in a `reapr_doc` object, the pre-parsed HTML will be
#'       tested for validity and re-generated if the external pointer is
#'       invalid.
#' @param x a `reapr_doc` or anything you're used to passing to [rvest::html_table()]
#' @param raw if `TRUE` then a `list` with rows and cells will be returned. Each
#'        cell has the value in the source HTML table but also has an `hattr`
#'        attribute (short for "html entity attribute") which contains all the
#'        attributes (if any) of the table cell. Each row in the list also has an `hattr`
#'        attribute holding its attributes (if any). This structure may be useful
#'        for doing more involved extractions of weirdly formed HTML tables
#'        without having to muck with XPath queries. Default: `FALSE`
#' @param trim if `TRUE` trim cell whitespace. Default: `TRUE`.
#' @export
#' @examples
#' x <- reap_url("https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom")
#'
#' # take advantage of the pre-processing reap_url() does:
#' tbl <- reap_table(x$tag$table[[10]])
#' tbl_raw <- reap_table(x$tag$table[[10]], raw=TRUE)
#'
#' # get all of 'em:
#' tbls <- reap_table(x)
#'
#' # find a specific one:
#' reap_node(x, ".//table[contains(., 'Other identity and at least one UK identity')]") %>%
#'   reap_table() -> tbl
reap_table <- function(x, raw=FALSE, trim=TRUE) UseMethod("reap_table")

#' @export
reap_table.reapr_doc <- function(x, raw=FALSE, trim=TRUE) {
  x <- validate_parsed_content(x)
  reap_table(x$parsed_html, raw, trim)
}

#' @export
reap_table.xml_document <- function(x, raw=FALSE, trim=TRUE) {
  tbls <- xml2::xml_find_all(x, ".//table")
  lapply(tbls, reap_table, raw, trim)
}

#' @export
reap_table.xml_nodeset <- function(x, raw=FALSE, trim=TRUE) {
  lapply(x, reap_table, raw, trim)
}

#' @export
reap_table.xml_node <- function(x, raw=FALSE, trim=TRUE) {

  stopifnot(xml2::xml_name(x) == "table")

  trs <- xml_find_all(x, ".//tr")

  lapply(trs, function(.x) {
    xml_find_all(.x, ".//td|.//th") %>%
      lapply(function(.x) {
        val <- xml_text(.x, trim=trim)
        attr(val, "hattr") <- xml_attrs(.x)
        class(val) <- c("reapr_tbl_cell", "list")
        val
      }) -> row
    attr(row, "hattr") <- xml_attrs(.x)
    class(row) <- c("reapr_tbl_row", "list")
    row
  }) -> tblist

  attr(tblist, "hattr") <- xml_attrs(x)

  class(tblist) <- c("reapr_raw_tbl", "list")

  if (raw) return(tblist)

  row_count <- length(tblist)
  col_count <- max(sapply(tblist, length))

  mtbl <- matrix(data = NA_character_, nrow=row_count, ncol=col_count)

  for (ridx in seq_along(tblist)) {

    row <- tblist[[ridx]]

    cofs <- 0
    for (cidx in seq_along(row)) {

      col <- row[[cidx]] # actual value @ index in what was in the HTML
      if (trim) col <- trimws(col)

      cattrs <- attr(col, "hattr")
      cspan <- as.integer(cattrs["colspan"] %na% 1) - 1 # doing a range later so 1=0, 2=1 for range addition
      rspan <- as.integer(cattrs["rowspan"] %na% 1) - 1

      # move over until not NA (implies a rowspan somewhere above current row)
      repeat {
        if ((cidx + cofs) > col_count) {
          cofs <- cofs - 1
          break
        }
        if (is.na(mtbl[ridx, cidx+cofs])) break # current position has NA so we can stop
        cofs <- cofs + 1 # move over one
      }

      # cat("VAL: ", trimws(col), "\n", sep="")
      # cat(" RC: ", row_count, "; ", ridx, ":", ridx+rspan, "\n", sep="")
      # cat(" CC: ", length(row), "; ", (cidx+cofs), ":", (cidx+cofs+cspan), "\n\n", sep="")

      if ((cofs+cspan) > length(row)) break
      if ((cidx+cofs+cspan) > col_count) cspan <- 0

      mtbl[ridx:(ridx+rspan), (cidx+cofs):(cidx+cofs+cspan)] <- col
      cofs <- cofs + cspan

    }

  }

  xdf <- as.data.frame(mtbl, stringsAsFactors = FALSE)
  class(xdf) <- c("tbl_df", "tbl", "data.frame")
  xdf

}


# truncate strings longer than `n` characters and add a "..." suffix
# (used by the raw-table print methods below)
elip <- function(x, n=10) {
  sapply(
    x,
    function(.x) if (nchar(.x) > n) sprintf("%s...", substr(.x, 1, n-1)) else .x,
    USE.NAMES = FALSE
  )
}

#' Print for reapr table elements
#'
#' @param x reapr raw table
#' @param ... ignored
#' @param indent how much to indent this element
#' @keywords internal
#' @export
print.reapr_raw_tbl <- function(x, ..., indent = 0) {
  h <- attr(x, "hattr")
  if (length(h) == 0) {
    cat("<table (noattrs)>\n")
  } else {
    cat(
      paste0(rep(" ", indent), collapse=""),
      "<table ",
      paste0(sprintf("%s=%s", names(h), shQuote(elip(h))), collapse = " "),
      ">\n",
      sep=""
    )
  }
  for (row in seq_along(x)) {
    print(x[[row]], indent = indent + 2)
  }
}

#' @rdname print.reapr_raw_tbl
#' @keywords internal
#' @export
print.reapr_tbl_row <- function(x, ..., indent = 0) {
  h <- attr(x, "hattr")
  if (length(h) == 0) {
    cat(paste0(rep(" ", indent), collapse=""), "<row (noattrs)>\n", sep="")
  } else {
    cat(
      paste0(rep(" ", indent), collapse=""),
      "<row ",
      paste0(sprintf("%s=%s", names(h), shQuote(elip(h))), collapse = " "),
      ">\n",
      sep=""
    )
  }
  for (cell in seq_along(x)) {
    print(x[[cell]], indent = indent + 2)
  }
}

#' @rdname print.reapr_raw_tbl
#' @keywords internal
#' @export
print.reapr_tbl_cell <- function(x, ..., indent = 0) {
  h <- attr(x, "hattr")
  if (length(h) == 0) {
    cat(paste0(rep(" ", indent), collapse=""), "<cell (noattrs)>\n", sep="")
  } else {
    h <- as.list(h)
    cat(
      paste0(rep(" ", indent), collapse=""),
      "<cell ",
      paste0(sprintf("%s=%s", names(h), shQuote(elip(h))), collapse = " "),
      ">\n",
      sep=""
    )
  }
}

M R/reapr-package.R => R/reapr-package.R +11 -2
@@ 1,6 1,14 @@
#' Reap Information from Websites
#'
#' This will eventually be a clever description about web scraping with reapr.
#' There's no longer a need to fear getting at the gnarly bits of web pages.
#' For the vast majority of web scraping tasks, the 'rvest' package does a
#' phenomenal job providing just enough of what you need to get by. But, if you
#' want more of the details of the site you're scraping, some handy shortcuts to
#' page elements in use and the ability to not have to think too hard about
#' serialization during scraping tasks, then you may be interested in reaping
#' more than harvesting. Tools are provided to interact with web site content
#' and metadata at a more granular level than 'rvest' but at a higher level
#' than 'httr'/'curl'.
#'
#' - URL: <https://gitlab.com/hrbrmstr/reapr>
#' - BugReports: <https://gitlab.com/hrbrmstr/reapr/issues>


@@ 9,9 17,10 @@
#' @name reapr
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import httr xml2 selectr
#' @import httr xml2 selectr xslt
#' @importFrom curl nslookup
#' @importFrom methods is
#' @importFrom jsonlite fromJSON toJSON
#' @importFrom stats terms
#' @useDynLib reapr, .registration=TRUE
NULL

M R/util.R => R/util.R +10 -18
@@ 18,29 18,17 @@ validate_parsed_content <- function(env) {
      xml2::xml_find_all(parsed_html, tag)
    }),
    tags_in_doc
  ) -> env$tag
  ) -> env[["tag"]]

  lapply(env[["tag"]], function(.x) {
    class(.x) <- c("reapr_taglist", class(.x))
    .x
  }) -> env[["tag"]]

  env

}

# make_selector <- function (css, xpath) {
#   if (missing(css) && missing(xpath))
#     stop("Please supply one of css or xpath", call. = FALSE)
#   if (!missing(css) && !missing(xpath))
#     stop("Please supply css or xpath, not both", call. = FALSE)
#   if (!missing(css)) {
#     if (!is.character(css) && length(css) == 1)
#       stop("`css` must be a string")
#     selectr::css_to_xpath(css, prefix = ".//")
#   }
#   else {
#     if (!is.character(xpath) && length(xpath) == 1)
#       stop("`xpath` must be a string")
#     xpath
#   }
# }

lock_environment <- function(env, exclude="") {
  for (nm in ls(env)) {
    if (!(nm %in% exclude)) lockBinding(as.name(nm), env)


@@ 54,6 42,10 @@ set_names <- function(x, nms) {
  x
}

# return `b` when `a` is `NA` (used for colspan/rowspan defaults in reap_table())
`%na%` <- function(a, b) {
  if (is.na(a)) b else a
}

# return `b` when `a` is `NULL`
`%||%` <- function(a, b) {
  if (is.null(a)) b else a
}

A R/utils-mappers.R => R/utils-mappers.R +127 -0
@@ 0,0 1,127 @@
# tiny purrr::map() stand-in: `.f` can be a function, a one-sided formula, or
# a numeric/character extractor; `.default` fills in zero-length results
map <- function(.x, .f, ..., .default) {

  default_exists <- !missing(.default)

  if (inherits(.f, "formula")) {
    .body <- dimnames(attr(terms(.f), "factors"))[[1]]
    .f <- function(.x, . = .x) {}
    body(.f) <- as.expression(parse(text=.body))
  }

  nm <- names(.x)

  if (inherits(.f, "function")) {

    lapply(.x, function(x) {
      res <- .f(x, ...)
      if ((length(res) == 0) & default_exists) res <- .default
      res
    }) -> out

  } else if (is.numeric(.f) | is.character(.f)) {

    lapply(.x, function(x) {
      res <- try(x[[.f]], silent = TRUE)
      if (inherits(res, "try-error")) res <- NULL
      if ((length(res) == 0) & default_exists) res <- .default
      res
    }) -> out

  }

  if (length(nm) > 0) out <- set_names(out, nm)

  out

}

# map() then row-bind the results into a data frame
map_df <- function(.x, .f, ..., .id=NULL) {

  res <- map(.x, .f, ...)
  out <- bind_rows(res, .id=.id)
  out

}

# this has limitations and is more like 75% of dplyr::bind_rows()
# this is also orders of magnitude slower than dplyr::bind_rows()
bind_rows <- function(..., .id = NULL) {

  res <- list(...)

  if (length(res) == 1) res <- res[[1]]

  cols <- unique(unlist(lapply(res, names), use.names = FALSE))

  if (!is.null(.id)) {
    inthere <- cols[.id %in% cols]
    if (length(inthere) > 0) {
      .id <- make.unique(c(inthere, .id))[2]
    }
  }

  id_vals <- if (is.null(names(res))) 1:length(res) else names(res)

  saf <- default.stringsAsFactors()
  options(stringsAsFactors = FALSE)
  on.exit(options(stringsAsFactors = saf))

  idx <- 1
  do.call(
    rbind.data.frame,
    lapply(res, function(.x) {
      x_names <- names(.x)
      moar_names <- setdiff(cols, x_names)
      if (length(moar_names) > 0) {
        for (i in 1:length(moar_names)) {
          .x[[moar_names[i]]] <- rep(NA, length(.x[[1]]))
        }
      }
      if (!is.null(.id)) {
        .x[[.id]] <- id_vals[idx]
        idx <<- idx + 1
      }
      .x
    })
  ) -> out

  rownames(out) <- NULL

  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}

bind_cols <- function(...) {

  res <- list(...)

  row_mismatch <- sapply(res, nrow) != nrow(res[[1]]) # sapply: lists can't be compared with `!=`

  if (any(row_mismatch)) {
    first_mismatch_pos <- which(row_mismatch)[1]
    stop(paste0("Argument ", first_mismatch_pos,
                " must be length ", nrow(res[[1]]),
                ", not ", nrow(res[[first_mismatch_pos]])))
  }

  if (length(res) == 1) res <- res[[1]]

  col_names <- unlist(lapply(res, names), use.names = FALSE)
  col_names <- make.unique(col_names, sep = "")

  saf <- default.stringsAsFactors()
  options(stringsAsFactors = FALSE)
  on.exit(options(stringsAsFactors = saf))

  out <- do.call(cbind.data.frame, res)

  names(out) <- col_names
  rownames(out) <- NULL

  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}

M README.Rmd => README.Rmd +145 -2
@@ 1,7 1,7 @@
---
output: rmarkdown::github_document
editor_options: 
  chunk_output_type: inline
  chunk_output_type: console
---
```{r pkg-knitr-opts, include=FALSE}
knitr::opts_chunk$set(collapse=TRUE, fig.retina=2, message=FALSE, warning=FALSE)


@@ 18,17 18,43 @@ Reap Information from Websites

## Description

This will eventually be a clever description about web scraping with reapr.
There's no longer a need to fear getting at the gnarly bits of web pages.
For the vast majority of web scraping tasks, the 'rvest' package does a
phenomenal job providing just enough of what you need to get by. But, if you
want more of the details of the site you're scraping, some handy shortcuts to
page elements in use and the ability to not have to think too hard about
serialization during scraping tasks, then you may be interested in reaping
more than harvesting. Tools are provided to interact with web site content
and metadata at a more granular level than 'rvest' but at a higher level
than 'httr'/'curl'.

## NOTE

This is very much a WIP but there are enough basic features to let others kick the tyres
and see what's woefully busted or in need of attention.

## What's Inside The Tin

The following functions are implemented:

- `reap_url`:	Read HTML content from a URL
- `mill`:	Turn a 'reapr_doc' into plain text without cruft
- `reapr`:	Reap Information from Websites
- `reap_attr`:	Reap text, names and attributes from HTML
- `reap_attrs`:	Reap text, names and attributes from HTML
- `reap_children`:	Reap text, names and attributes from HTML
- `reap_name`:	Reap text, names and attributes from HTML
- `reap_node`:	Reap nodes from a reaped HTML document
- `reap_nodes`:	Reap nodes from a reaped HTML document
- `reap_table`:	Extract data from HTML tables
- `reap_text`:	Reap text, names and attributes from HTML
- `add_response_url_from`:	Add a 'reapr_doc' response prefix URL to a data frame

## Installation

```{r install-ex, eval=FALSE}
devtools::install_git("https://git.sr.ht/~hrbrmstr/reapr")
# or 
devtools::install_git("https://gitlab.com/hrbrmstr/reapr.git")
# or
devtools::install_github("hrbrmstr/reapr")


@@ 38,6 64,8 @@ devtools::install_github("hrbrmstr/reapr")

```{r lib-ex}
library(reapr)
library(hrbrthemes) # sr.ht/~hrbrmstr/hrbrthemes | git[la|hu]b.com/hrbrmstr/hrbrthemes
library(tidyverse) # for some examples only

# current version
packageVersion("reapr")


@@ 52,6 80,121 @@ x <- reap_url("http://rud.is/b")
x
```

The formatted object print-output shows much of what you get with a reaped URL.

`reapr::reap_url()`:

- Uses `httr::GET()` to make web connections and retrieve content. This enables
  it to behave more like an actual (non-javascript-enabled) browser. You can
  pass anything `httr::GET()` can handle to `...` (e.g. `httr::user_agent()`)
  to have as much granular control over the interaction as possible.
- Returns a richer set of data. After the `httr::response` object is obtained
  many tasks are performed including:
    - timestamping the URL crawl
    - extraction of the asked-for URL and the final URL (in the case of redirects)
    - extraction of the IP address of the target server
    - extraction of both plaintext and parsed (`xml_document`) HTML
    - extraction of the plaintext webpage `<title>` (if any)
    - generation of a dynamic list of tags in the document which can be
      fed directly to HTML/XML search/retrieval functions (which may speed
      up node discovery)
    - extraction of the text of all comments in the HTML document
    - inclusion of the full `httr::response` object with the returned object
    - extraction of the time it took to make the complete request

Finally, it works with other package member functions to check the validity
of the parsed `xml_document` and auto-regen the parse (since it has the full
content available to it) prior to any other operations. This also makes a `reapr_doc`
object _serializable_ without having to spend your own cycles on that.

If you need more, or need the above in different ways, please file issues.
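
For a sense of what that richer object carries, here is a minimal sketch of
poking at a few of the `reapr_doc` slots used throughout this package (output
not shown; the URL is illustrative):

```{r doc-slots, eval=FALSE}
x <- reap_url("http://books.toscrape.com/")

x$response$status_code # the full httr::response object rides along
x$parsed_html          # the parsed xml_document
x$doc_comments         # text of every HTML comment in the page
x$tag$a                # the pre-computed tag "bags" (more below)
```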

## Pre-computed Tags

On document retrieval, `reapr` automagically builds convenient R-accessible lists of
all the tags in the retrieved document. They aren't recursive, but they are convenient
"bags" of tags to use when you don't feel like crafting that perfect XPath.

Let's see what tags RStudio favors most on their Shiny home page:

```{r}
x <- reap_url("https://shiny.rstudio.com/articles/")

x

enframe(sort(lengths(x$tag))) %>%
  mutate(name = factor(name, levels = name)) %>%
  ggplot(aes(value, name)) +
  geom_segment(aes(xend = 0, yend = name), size = 3, color = "goldenrod") +
  labs(
    x = "Tag frequency", y = NULL,
    title = "HTML Tag Distribution on RStudio's Shiny Homepage"
  ) +
  scale_x_comma(position = "top") +
  theme_ft_rc(grid = "X") +
  theme(axis.text.y = element_text(family = "mono"))
```

Lots and lots of `<div>`s!

```{r}
x$tag$div
```

Let's take a look at the article titles:

```{r results = 'asis'}
as.data.frame(x$tag$div) %>% 
  filter(class == "article-title") %>% 
  select(`Shiny Articles`=elem_content) %>% 
  knitr::kable()
```

No XPath or CSS selectors!

Let's abandon the `tidyverse` for base R piping for a minute and do something similar to extract and convert the index of [CRAN Task Views](https://cloud.r-project.org/web/views/) to a markdown list (which will conveniently render here). Again, no XPath or CSS selectors required once we read in the URL:

```{r results='asis'}
x <- reap_url("https://cloud.r-project.org/web/views/")

as.data.frame(x$tag$a) %>% 
  add_response_url_from(x) %>% 
  subset(!grepl("^http[s]://", href)) %>% 
  transform(href = sprintf("- [%s](%s%s)", elem_content, prefix_url, href)) %>% 
  .[, "href", drop=TRUE] %>% 
  paste0(collapse = "\n") %>% 
  cat()
```

This functionality is not a panacea since they are just bags of tags, but it may save you some time and frustration.

## Tables

Unlike `rvest` with its magical and wonderful `html_table()`, `reapr` provides more raw control
over the content of `<table>` elements. Let's look at the "population change over time" table from the Wikipedia page on the demography of the UK (<https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom>):

```{r}
x <- reap_url("https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom")

reap_node(x, ".//table[contains(., 'Intercensal')]") %>% 
  reap_table()
```

As you can see, it doesn't do the cleanup work for you and has no way to even say there's a header. That's because you can do that with `rvest::html_table()`. The equivalent `reapr` function gives you the raw table and handles `colspan` and `rowspan` insanity by adding the missing cells and filling in the gaps. You can use `docxtractr::assign_colnames()` to make a given row the column titles and `docxtractr::mcga()` or `janitor::clean_names()` to turn them into proper R names, then `readr::type_convert()` to finish the task, as sketched below.
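
A minimal sketch of that cleanup pipeline, assuming the header lives in the
second row (the row index is illustrative; adjust for the table at hand):

```{r tbl-cleanup, eval=FALSE}
reap_node(x, ".//table[contains(., 'Intercensal')]") %>%
  reap_table() %>%
  docxtractr::assign_colnames(2) %>% # promote row 2 to column names
  janitor::clean_names() %>%         # make them proper R names
  readr::type_convert()              # guess proper column types
```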

While that may seem overkill for this example (it is), it wouldn't be if the table were more gnarly (I'm working on an example for that which will replace this one when it's done).

For truly gnarly tables you can get an overview of the structure (without the data frame conversion):

```{r}
reap_node(x, ".//table[contains(., 'Intercensal')]") %>% 
  reap_table(raw = TRUE) -> raw_tbl

raw_tbl
```

And work with the `list` it gives back (which contains all the HTML element attributes as R attributes so you can pull data stored in them if need be).
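
For example, a minimal sketch of pulling those attributes back out (the cell
indices are illustrative):

```{r hattr-ex, eval=FALSE}
attr(raw_tbl, "hattr")           # the <table> element's attributes
attr(raw_tbl[[1]], "hattr")      # the first row's attributes
attr(raw_tbl[[1]][[3]], "hattr") # attributes of the third cell in row one
```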

## reapr Metrics

```{r cloc, echo=FALSE}

M README.md => README.md +532 -7
@@ 11,18 11,45 @@ Reap Information from Websites

## Description

This will eventually be a clever description about web scraping with
reapr.
There’s no longer a need to fear getting at the gnarly bits of web pages.
For the vast majority of web scraping tasks, the ‘rvest’ package does a
phenomenal job providing just enough of what you need to get by. But, if
you want more of the details of the site you’re scraping, some handy
shortcuts to page elements in use and the ability to not have to think
too hard about serialization during scraping tasks, then you may be
interested in reaping more than harvesting. Tools are provided to
interact with web site content and metadata at a more granular level than
‘rvest’ but at a higher level than ‘httr’/‘curl’.

## NOTE

This is very much a WIP but there are enough basic features to let
others kick the tyres and see what’s woefully busted or in need of
attention.

## What’s Inside The Tin

The following functions are implemented:

  - `reap_url`: Read HTML content from a URL
  - `mill`: Turn a ‘reapr\_doc’ into plain text without cruft
  - `reapr`: Reap Information from Websites
  - `reap_attr`: Reap text, names and attributes from HTML
  - `reap_attrs`: Reap text, names and attributes from HTML
  - `reap_children`: Reap text, names and attributes from HTML
  - `reap_name`: Reap text, names and attributes from HTML
  - `reap_node`: Reap nodes from a reaped HTML document
  - `reap_nodes`: Reap nodes from a reaped HTML document
  - `reap_table`: Extract data from HTML tables
  - `reap_text`: Reap text, names and attributes from HTML
  - `add_response_url_from`: Add a ‘reapr\_doc’ response prefix URL to a
    data frame

## Installation

``` r
devtools::install_git("https://git.sr.ht/~hrbrmstr/reapr")
# or 
devtools::install_git("https://gitlab.com/hrbrmstr/reapr.git")
# or
devtools::install_github("hrbrmstr/reapr")


@@ 32,6 59,8 @@ devtools::install_github("hrbrmstr/reapr")

``` r
library(reapr)
library(hrbrthemes) # sr.ht/~hrbrmstr/hrbrthemes | git[la|hu]b.com/hrbrmstr/hrbrthemes
library(tidyverse) # for some examples only

# current version
packageVersion("reapr")


@@ 47,7 76,7 @@ x
##                Title: rud.is | "In God we trust. All others must bring data"
##         Original URL: http://rud.is/b
##            Final URL: https://rud.is/b/
##           Crawl-Date: 2019-01-16 13:24:44
##           Crawl-Date: 2019-01-17 19:51:09
##               Status: 200
##         Content-Type: text/html; charset=UTF-8
##                 Size: 50 kB


@@ 59,16 88,512 @@ x
##                       header[9], p[10], li[19], meta[20], div[31],
##                       script[40], span[49], link[53], a[94]
##           # Comments: 17
##   Total Request Time: 1.949s
##   Total Request Time: 2.093s
```

The formatted object print-output shows much of what you get with a
reaped URL.

`reapr::reap_url()`:

  - Uses `httr::GET()` to make web connections and retrieve content.
    This enables it to behave more like an actual
    (non-javascript-enabled) browser. You can pass anything
    `httr::GET()` can handle to `...` (e.g. `httr::user_agent()`) to
    have as much granular control over the interaction as possible.
  - Returns a richer set of data. After the `httr::response` object is
    obtained many tasks are performed including:
      - timestamping the URL crawl
      - extraction of the asked-for URL and the final URL (in the case
        of redirects)
      - extraction of the IP address of the target server
      - extraction of both plaintext and parsed (`xml_document`) HTML
      - extraction of the plaintext webpage `<title>` (if any)
      - generation of a dynamic list of tags in the document which can be
        fed directly to HTML/XML search/retrieval functions (which may
        speed up node discovery)
      - extraction of the text of all comments in the HTML document
      - inclusion of the full `httr::response` object with the returned
        object
      - extraction of the time it took to make the complete request

Finally, it works with other package member functions to check the
validity of the parsed `xml_document` and auto-regen the parse (since it
has the full content available to it) prior to any other operations.
This also makes a `reapr_doc` object *serializable* without having to
spend your own cycles on that.

If you need more, or need the above in different ways, please file issues.
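
For a sense of what that richer object carries, here is a minimal sketch of
poking at a few of the `reapr_doc` slots used throughout this package (output
not shown; the URL is illustrative):

``` r
x <- reap_url("http://books.toscrape.com/")

x$response$status_code # the full httr::response object rides along
x$parsed_html          # the parsed xml_document
x$doc_comments         # text of every HTML comment in the page
x$tag$a                # the pre-computed tag "bags" (more below)
```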

## Pre-computed Tags

On document retrieval, `reapr` automagically builds convenient
R-accessible lists of all the tags in the retrieved document. They
aren’t recursive, but they are convenient “bags” of tags to use when
you don’t feel like crafting that perfect XPath.

Let’s see what tags RStudio favors most on their Shiny home page:

``` r
x <- reap_url("https://shiny.rstudio.com/articles/")

x
##                Title: Shiny - Articles
##         Original URL: https://shiny.rstudio.com/articles/
##            Final URL: https://shiny.rstudio.com/articles/
##           Crawl-Date: 2019-01-17 19:51:10
##               Status: 200
##         Content-Type: text/html
##                 Size: 79 kB
##           IP Address: 13.35.78.118
##                 Tags: body[1], h1[1], head[1], html[1], title[1], meta[4], link[8],
##                       script[10], span[43], a[276], div[465]
##           # Comments: 25
##   Total Request Time: 0.191s

enframe(sort(lengths(x$tag))) %>%
  mutate(name = factor(name, levels = name)) %>%
  ggplot(aes(value, name)) +
  geom_segment(aes(xend = 0, yend = name), size = 3, color = "goldenrod") +
  labs(
    x = "Tag frequency", y = NULL,
    title = "HTML Tag Distribution on RStudio's Shiny Homepage"
  ) +
  scale_x_comma(position = "top") +
  theme_ft_rc(grid = "X") +
  theme(axis.text.y = element_text(family = "mono"))
```

<img src="README_files/figure-gfm/unnamed-chunk-1-1.png" width="672" />

Lots and lots of `<div>`s\!

``` r
x$tag$div
## {xml_nodeset (465)}
##  [1] <div id="app" class="shrinkHeader alwaysShrinkHeader">\n  <div id="main">\n    <!-- rstudio header -->\n    <div ...
##  [2] <div id="main">\n    <!-- rstudio header -->\n    <div id="rStudioHeader">\n      <div class="band">\n        <d ...
##  [3] <div id="rStudioHeader">\n      <div class="band">\n        <div class="innards bandContent">\n          <div>\n ...
##  [4] <div class="band">\n        <div class="innards bandContent">\n          <div>\n            <a class="productNam ...
##  [5] <div class="innards bandContent">\n          <div>\n            <a class="productName" href="/">Shiny</a>\n      ...
##  [6] <div>\n            <a class="productName" href="/">Shiny</a>\n            <div class="rStudio">\n<span>from </sp ...
##  [7] <div class="rStudio">\n<span>from </span> <a href="https://www.rstudio.com/"><div class="rStudioLogo"></div></a> ...
##  [8] <div class="rStudioLogo"></div>
##  [9] <div id="menu">\n            <div id="menuToggler"></div>\n            <div id="menuItems" class="">\n           ...
## [10] <div id="menuToggler"></div>
## [11] <div id="menuItems" class="">\n              <a class="menuItem" href="/tutorial/">Get Started</a>\n             ...
## [12] <div class="mainContent pushFooter">\n\n  <div class="band">\n    <a name="top"></a>\n    <div class="bandConten ...
## [13] <div class="band">\n    <a name="top"></a>\n    <div class="bandContent">\n      <h1>Articles</h1>\n    </div>\n ...
## [14] <div class="bandContent">\n      <h1>Articles</h1>\n    </div>
## [15] <div class="band articlesBand">\n    <div class="bandContent">\n      <div class="articles-outline splitColumns  ...
## [16] <div class="bandContent">\n      <div class="articles-outline splitColumns withMobileMargins">\n\n        \n     ...
## [17] <div class="articles-outline splitColumns withMobileMargins">\n\n        \n          <div class="column25 start" ...
## [18] <div class="column25 start">\n            <div class="section-title">Start</div>\n            \n              <d ...
## [19] <div class="section-title">Start</div>
## [20] <div class="subsection-group">\n                <div class="subsection-group-title"></div>\n                \n   ...
## ...
```

Let’s take a look at the article titles:

``` r
as.data.frame(x$tag$div) %>% 
  filter(class == "article-title") %>% 
  select(`Shiny Articles`=elem_content) %>% 
  knitr::kable()
```

| Shiny Articles                                                                      |
| :---------------------------------------------------------------------------------- |
| The basic parts of a Shiny app                                                      |
| How to build a Shiny app                                                            |
| How to launch a Shiny app                                                           |
| How to get help                                                                     |
| The Shiny Cheat sheet                                                               |
| App formats and launching apps                                                      |
| Two-file Shiny apps                                                                 |
| Introduction to R Markdown                                                          |
| Introduction to interactive documents                                               |
| R Markdown integration in the RStudio IDE                                           |
| The R Markdown Cheat sheet                                                          |
| Setting Output args via Render functions                                            |
| Generating downloadable reports                                                     |
| Dashboards                                                                          |
| Shiny Gadgets                                                                       |
| Designing Gadget UI                                                                 |
| Reactivity - An overview                                                            |
| Stop reactions with isolate()                                                       |
| Execution scheduling                                                                |
| How to understand reactivity in R                                                   |
| Learn about your user with session$clientData                                       |
| Database basics - dplyr and DBI                                                     |
| SQL injection prevention                                                            |
| Using the pool package (basics)                                                     |
| Using the pool package (advanced)                                                   |
| Using dplyr and pool to query a database                                            |
| Persistent data storage in Shiny apps                                               |
| Application layout guide                                                            |
| Display modes                                                                       |
| Tabsets                                                                             |
| Customize your UI with HTML                                                         |
| Build your entire UI with HTML                                                      |
| Build a dynamic UI that reacts to user input                                        |
| HTML Templates                                                                      |
| Shiny HTML Tags Glossary                                                            |
| Progress indicators                                                                 |
| Modal dialogs                                                                       |
| Notifications                                                                       |
| Themes                                                                              |
| Render images in a Shiny app                                                        |
| Displaying and customizing static tables                                            |
| How to use DataTables in a Shiny App                                                |
| Using Action Buttons                                                                |
| Using sliders                                                                       |
| Help users download data from your app                                              |
| Help users upload files to your app                                                 |
| Using selectize input                                                               |
| Interactive plots                                                                   |
| Selecting rows of data                                                              |
| Interactive plots - advanced                                                        |
| htmlwidgets                                                                         |
| JavaScript actions packaged for Shiny apps                                          |
| How to build a JavaScript based widget                                              |
| How to add functionality to JavaScript widgets                                      |
| How to send messages from the browser to the server and back using Shiny            |
| How to develop an interactive, dynamic help system for your app with introJS        |
| How to create custom input bindings                                                 |
| Putting everything together to create an interactive dashboard                      |
| Style your apps with CSS                                                            |
| Build custom input objects                                                          |
| Build custom output objects                                                         |
| Add Google Analytics to a Shiny app                                                 |
| Packaging JavaScript code for Shiny                                                 |
| Communicating with Shiny via JavaScript                                             |
| JavaScript Events in Shiny                                                          |
| Debugging Shiny applications                                                        |
| Upgrading to a new version of R                                                     |
| Handling missing inputs with req(…)                                                 |
| Scoping rules for Shiny apps                                                        |
| Reconnecting to Shiny apps                                                          |
| Sanitizing error messages                                                           |
| Write error messages for your UI with validate                                      |
| Unicode characters in Shiny apps                                                    |
| shinytest                                                                           |
| Modularizing Shiny app code                                                         |
| Shiny App Usage Tracking                                                            |
| Add Google Analytics to a Shiny app                                                 |
| Plot Caching                                                                        |
| Profiling your Shiny app                                                            |
| Performance                                                                         |
| Improving scalability with async programming                                        |
| Scaling and Performance Tuning with shinyapps.io                                    |
| Scaling and Performance Tuning with Shiny Server Pro and RStudio Connect            |
| Deploying Shiny apps to the web                                                     |
| Shinyapps.io - Getting started                                                      |
| Shinyapps.io - Authentication and Authorization Model                               |
| Shinyapps.io - Setting up custom domains                                            |
| Shinyapps.io - Sharing data across sessions                                         |
| Shinyapps.io - Migrating authentication                                             |
| Shiny Server - Introduction                                                         |
| Shiny Server and Shiny Server Pro - Allowing different libraries for different apps |
| Shiny Server Pro and RStudio Connect - Creating user privileges                     |
| Shiny Server Pro and RStudio Connect - Administrating deployed Shiny applications   |
| Sharing apps to run locally                                                         |
| Save your app as a function                                                         |
| Bookmarking state                                                                   |
| Advanced bookmarking                                                                |
| Bookmarking and modules                                                             |

No XPath or CSS selectors\!

Let’s abandon the `tidyverse` for base R piping for a minute and do
something similar to extract and convert the index of [CRAN Task
Views](https://cloud.r-project.org/web/views/) to a markdown list (which
will conveniently render here). Again, no XPath or CSS selectors
required once we read in the URL:

``` r
x <- reap_url("https://cloud.r-project.org/web/views/")

as.data.frame(x$tag$a) %>% 
  add_response_url_from(x) %>% 
  subset(!grepl("^http[s]://", href)) %>% 
  transform(href = sprintf("- [%s](%s%s)", elem_content, prefix_url, href)) %>% 
  .[, "href", drop=TRUE] %>% 
  paste0(collapse = "\n") %>% 
  cat()
```

  - [Bayesian](https://cloud.r-project.org/web/views/Bayesian.html)
  - [ChemPhys](https://cloud.r-project.org/web/views/ChemPhys.html)
  - [ClinicalTrials](https://cloud.r-project.org/web/views/ClinicalTrials.html)
  - [Cluster](https://cloud.r-project.org/web/views/Cluster.html)
  - [Databases](https://cloud.r-project.org/web/views/Databases.html)
  - [DifferentialEquations](https://cloud.r-project.org/web/views/DifferentialEquations.html)
  - [Distributions](https://cloud.r-project.org/web/views/Distributions.html)
  - [Econometrics](https://cloud.r-project.org/web/views/Econometrics.html)
  - [Environmetrics](https://cloud.r-project.org/web/views/Environmetrics.html)
  - [ExperimentalDesign](https://cloud.r-project.org/web/views/ExperimentalDesign.html)
  - [ExtremeValue](https://cloud.r-project.org/web/views/ExtremeValue.html)
  - [Finance](https://cloud.r-project.org/web/views/Finance.html)
  - [FunctionalData](https://cloud.r-project.org/web/views/FunctionalData.html)
  - [Genetics](https://cloud.r-project.org/web/views/Genetics.html)
  - [Graphics](https://cloud.r-project.org/web/views/Graphics.html)
  - [HighPerformanceComputing](https://cloud.r-project.org/web/views/HighPerformanceComputing.html)
  - [Hydrology](https://cloud.r-project.org/web/views/Hydrology.html)
  - [MachineLearning](https://cloud.r-project.org/web/views/MachineLearning.html)
  - [MedicalImaging](https://cloud.r-project.org/web/views/MedicalImaging.html)
  - [MetaAnalysis](https://cloud.r-project.org/web/views/MetaAnalysis.html)
  - [MissingData](https://cloud.r-project.org/web/views/MissingData.html)
  - [ModelDeployment](https://cloud.r-project.org/web/views/ModelDeployment.html)
  - [Multivariate](https://cloud.r-project.org/web/views/Multivariate.html)
  - [NaturalLanguageProcessing](https://cloud.r-project.org/web/views/NaturalLanguageProcessing.html)
  - [NumericalMathematics](https://cloud.r-project.org/web/views/NumericalMathematics.html)
  - [OfficialStatistics](https://cloud.r-project.org/web/views/OfficialStatistics.html)
  - [Optimization](https://cloud.r-project.org/web/views/Optimization.html)
  - [Pharmacokinetics](https://cloud.r-project.org/web/views/Pharmacokinetics.html)
  - [Phylogenetics](https://cloud.r-project.org/web/views/Phylogenetics.html)
  - [Psychometrics](https://cloud.r-project.org/web/views/Psychometrics.html)
  - [ReproducibleResearch](https://cloud.r-project.org/web/views/ReproducibleResearch.html)
  - [Robust](https://cloud.r-project.org/web/views/Robust.html)
  - [SocialSciences](https://cloud.r-project.org/web/views/SocialSciences.html)
  - [Spatial](https://cloud.r-project.org/web/views/Spatial.html)
  - [SpatioTemporal](https://cloud.r-project.org/web/views/SpatioTemporal.html)
  - [Survival](https://cloud.r-project.org/web/views/Survival.html)
  - [TimeSeries](https://cloud.r-project.org/web/views/TimeSeries.html)
  - [WebTechnologies](https://cloud.r-project.org/web/views/WebTechnologies.html)
  - [gR](https://cloud.r-project.org/web/views/gR.html)

This functionality is not a panacea since they are just bags of tags,
but it may save you some time and frustration.

## Tables

Unlike `rvest` with its magical and wonderful `html_table()`, `reapr`
provides more raw control over the content of `<table>` elements. Let’s
look at the “population change over time” table from the Wikipedia page
on the demography of the UK
(<https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom>):

``` r
x <- reap_url("https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom")

reap_node(x, ".//table[contains(., 'Intercensal')]") %>% 
  reap_table()
## # A tibble: 18 x 8
##    V1         V2             V3             V4            V5            V6            V7            V8                  
##    <chr>      <chr>          <chr>          <chr>         <chr>         <chr>         <chr>         <chr>               
##  1 Intercens… Populationat … Average annua… Average annu… Average annu… Average annu… Average annu… Populationdensityat…
##  2 Intercens… Populationat … Overallchange  Births        Deaths        Net naturalc… Netmigration* Populationdensityat…
##  3 1851–1861  27,368,800     154,910        Unknown       Unknown       Unknown       Unknown       87                  
##  4 1861–1871  28,917,900     256,680        Unknown       Unknown       Unknown       Unknown       92                  
##  5 1871–1881  31,484,700     344,980        Unknown       Unknown       Unknown       Unknown       100                 
##  6 1881–1891  34,934,500     286,790        Unknown       Unknown       Unknown       Unknown       111                 
##  7 1891–1901  37,802,400     373,580        Unknown       Unknown       Unknown       Unknown       120                 
##  8 1901–1911  38,237,000     385,000        1,091,000     624,000       467,000       −82,000       156                 
##  9 1911–1921  42,082,000     195,000        975,000       689,000       286,000       −92,000       172                 
## 10 1921–1931  44,027,000     201,000        824,000       555,000       268,000       −67,000       180                 
## 11 1931–1951  46,038,000     213,000        793,000       603,000       190,000       22,000        188                 
## 12 1951–1961  50,225,000     258,000        839,000       593,000       246,000       12,000        205                 
## 13 1961–1971  52,807,000     312,000        962,000       638,000       324,000       −12,000       216                 
## 14 1971–1981  55,928,000     42,000         736,000       666,000       69,000        −27,000       229                 
## 15 1981–1991  56,357,000     108,000        757,000       655,000       103,000       5,000         231                 
## 16 1991–2001  57,439,000     161,000        731,000       631,000       100,000       61,000        235                 
## 17 2001–2011  59,113,000     324,000        722,000       588,000       134,000       191,000       242                 
## 18 2011–2021  63,182,000     N/A            N/A           N/A           N/A           N/A           259
```

As you can see, it doesn’t do the cleanup work for you and has no way to
even say there’s a header. That’s because you can do that with
`rvest::html_table()`. The equivalent `reapr` function gives you the raw
table and handles `colspan` and `rowspan` insanity by adding the missing
cells and filling in the gaps. You can use
`docxtractr::assign_colnames()` to make a given row the column titles
and `docxtractr::mcga()` or `janitor::clean_names()` to turn them into proper
R names, then `readr::type_convert()` to finish the task, as sketched below.
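
A minimal sketch of that cleanup pipeline, assuming the header lives in the
second row (the row index is illustrative; adjust for the table at hand):

``` r
reap_node(x, ".//table[contains(., 'Intercensal')]") %>%
  reap_table() %>%
  docxtractr::assign_colnames(2) %>% # promote row 2 to column names
  janitor::clean_names() %>%         # make them proper R names
  readr::type_convert()              # guess proper column types
```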

While that may seem overkill for this example (it is), it wouldn’t be if
the table were more gnarly (I’m working on an example for that which
will replace this one when it’s done).

For truly gnarly tables you can get an overview of the structure
(without the data frame conversion):

``` r
reap_node(x, ".//table[contains(., 'Intercensal')]") %>% 
  reap_table(raw = TRUE) -> raw_tbl

raw_tbl
## <table class='wikitable...'>
##   <row (noattrs)>
##     <cell rowspan='2'>
##     <cell rowspan='2'>
##     <cell colspan='5'>
##     <cell rowspan='2'>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell style='backgroun...' class='unknown t...'>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##   <row (noattrs)>
##     <cell (noattrs)>
##     <cell (noattrs)>
##     <cell data-sort-value='' style='backgroun...' class='table-na'>
##     <cell data-sort-value='' style='backgroun...' class='table-na'>
##     <cell data-sort-value='' style='backgroun...' class='table-na'>
##     <cell data-sort-value='' style='backgroun...' class='table-na'>
##     <cell data-sort-value='' style='backgroun...' class='table-na'>
##     <cell (noattrs)>
```

You can then work with the `list` it gives back, which carries all of
the HTML element attributes as R attributes, so you can pull data
stored in them if need be.
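
For instance (a sketch: the indexing assumes the row/cell `list` shape
shown above, and `hattr` is the attribute stash `reap_table(raw = TRUE)`
attaches to each row and cell):

``` r
first_row <- raw_tbl[[1]]

attr(first_row, "hattr")      # the row's own HTML attributes (if any)
attr(first_row[[3]], "hattr") # e.g. the colspan='5' header cell
```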

## reapr Metrics

| Lang | \# Files |  (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :--- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R    |       13 | 0.81 | 405 | 0.87 |         148 | 0.72 |      251 | 0.69 |
| Rmd  |        1 | 0.06 |  44 | 0.09 |          53 | 0.26 |      110 | 0.30 |
| C    |        2 | 0.12 |  17 | 0.04 |           5 | 0.02 |        4 | 0.01 |

## Code of Conduct

Please note that this project is released with a [Contributor Code of
Conduct](CONDUCT.md). By participating in this project you agree to
abide by its terms.

A README_files/figure-gfm/unnamed-chunk-1-1.png => README_files/figure-gfm/unnamed-chunk-1-1.png +0 -0
A inst/xslt/mill.xslt => inst/xslt/mill.xslt +124 -0
@@ 0,0 1,124 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:output method="xml"/>

  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="head"/>
  <xsl:template match="script"/>
  <xsl:template match="style"/>
  <xsl:template match="img"/>
  <xsl:template match="header"/>
  <xsl:template match="footer"/>
  <xsl:template match="link"/>
  <xsl:template match="iframe"/>
  <xsl:template match="form"/>
  <xsl:template match="figure"/>
  <xsl:template match="object"/>
  <xsl:template match="input"/>
  <xsl:template match="textarea"/>
  <xsl:template match="option"/>
  <xsl:template match="select"/>
  <xsl:template match="code"/>
  <xsl:template match="cite"/>
  <xsl:template match="a"/>
  <xsl:template match="comment()"/>

  <xsl:template match="@style[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display:none')]"/>
  <xsl:template match="@style[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display: none')]"/>

  <xsl:template match="*[@class='ad']"/>
  <xsl:template match="*[@id='ad']"/>

  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]"/>
  <xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]"/>

  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]"/>
  <xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]"/>

</xsl:stylesheet>

A man/add_response_url_from.Rd => man/add_response_url_from.Rd +31 -0
@@ 0,0 1,31 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/add-response-url.R
\name{add_response_url_from}
\alias{add_response_url_from}
\title{Add a \code{reapr_doc} response prefix URL to a data frame}
\usage{
add_response_url_from(xdf, x)
}
\arguments{
\item{xdf}{a data frame}

\item{x}{a \code{reapr_doc}}
}
\description{
Add a \code{reapr_doc} response prefix URL to a data frame
}
\examples{
x <- reap_url("http://books.toscrape.com/")

# good ol' R
add_response_url_from(
  as.data.frame(x$tag$a),
  x
)

\dontrun{
# piping
as_tibble(x$tag$a) \%>\%
  add_response_url_from(x)
}
}

A man/as.data.frame.reapr_taglist.Rd => man/as.data.frame.reapr_taglist.Rd +29 -0
@@ 0,0 1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/as-data-frame-taglist.R
\name{as.data.frame.reapr_taglist}
\alias{as.data.frame.reapr_taglist}
\alias{as_tibble.reapr_taglist}
\title{Turn a \code{reapr_taglist} into a data frame (tibble)}
\usage{
\method{as.data.frame}{reapr_taglist}(x, ..., trim = TRUE,
  stringsAsFactors = FALSE)

as_tibble.reapr_taglist(x, ..., trim = TRUE, stringsAsFactors = FALSE)
}
\arguments{
\item{x}{a \code{reapr_taglist}}

\item{...}{ignored}

\item{trim}{trim front/back whitespace? Default: \code{TRUE}}

\item{stringsAsFactors}{always \code{FALSE}}
}
\description{
Takes a taglist from a \code{reapr_doc} \code{tag} slot and turns it
into a data frame with normalized column names
}
\examples{
x <- reap_url("http://r-project.org/")
as.data.frame(x$tag$meta)
}

A man/mill.Rd => man/mill.Rd +23 -0
@@ 0,0 1,23 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mill.R
\name{mill}
\alias{mill}
\title{Turn a \code{reapr_doc} into plain text without cruft}
\usage{
mill(x)
}
\arguments{
\item{x}{a \code{reapr_doc}}
}
\value{
a character vector of plain text with no HTML
}
\description{
Plain text extraction is accomplished via the following idiom: first,
an attempt is made to use an XSLT style sheet to select only the best
target nodes for extraction. On some malformed HTML content this
results in an empty document. When that occurs a less conservative
approach is taken with a simple XPath query that is designed to capture
all elements under \code{<body>} that are not \code{<script>} tags. This is imperfect
but does provide fairly decent results when the preferred method fails.
}
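% A sketch of typical usage (the URL below is illustrative):
\examples{
\dontrun{
x <- reap_url("http://books.toscrape.com/")
mill(x)
}
}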

A man/print.reapr_raw_tbl.Rd => man/print.reapr_raw_tbl.Rd +25 -0
@@ 0,0 1,25 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/reap-table.R
\name{print.reapr_raw_tbl}
\alias{print.reapr_raw_tbl}
\alias{print.reapr_tbl_row}
\alias{print.reapr_tbl_cell}
\title{Print for reapr table elements}
\usage{
\method{print}{reapr_raw_tbl}(x, ..., indent = 0)

\method{print}{reapr_tbl_row}(x, ..., indent = 0)

\method{print}{reapr_tbl_cell}(x, ..., indent = 0)
}
\arguments{
\item{x}{reapr raw table}

\item{...}{ignored}

\item{indent}{how much to indent this element}
}
\description{
Print for reapr table elements
}
\keyword{internal}

A man/reap_nodes.Rd => man/reap_nodes.Rd +32 -0
@@ 0,0 1,32 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/reap-nodes.R
\name{reap_nodes}
\alias{reap_nodes}
\alias{reap_node}
\title{Reap nodes from a reaped HTML document}
\usage{
reap_nodes(x, xpath, css)

reap_node(x, xpath, css)
}
\arguments{
\item{x}{A \code{reapr_doc} or anything \code{rvest::html_nodes()} takes.}

\item{xpath, css}{either an XPath query (string) or CSS selector; NOTE the
order difference.}
}
\description{
Provides similar functionality to \code{rvest::html_nodes()} except that, when
a \code{reapr_doc} is passed in, it will first test the validity of
the pre-parsed HTML content and regenerate the parse tree if the pointer
is invalid. Another major difference is that it prefers XPath queries over
CSS selectors, so the \code{xpath} and \code{css} named (yet positional) parameters
are in a different order than in their \code{rvest} cousins.
}
\examples{
x <- reap_url("http://r-project.org/")
reap_text(x$tag$div)
reap_nodes(x, ".//*") \%>\% reap_name()
x$tag$div \%>\% reap_children()
reap_attrs(x$tag$div)
}

A man/reap_table.Rd => man/reap_table.Rd +54 -0
@@ 0,0 1,54 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/reap-table.R
\name{reap_table}
\alias{reap_table}
\title{Extract data from HTML tables}
\usage{
reap_table(x, raw = FALSE, trim = TRUE)
}
\arguments{
\item{x}{a \code{reapr_doc} or anything you're used to passing to \code{\link[rvest:html_table]{rvest::html_table()}}}

\item{raw}{if \code{TRUE} then a \code{list} with rows and cells will be returned. Each
cell has the value in the source HTML table but also has an \code{hattr}
attribute (short for "html entity attribute") which contains all the
attributes (if any) of the table cell. Each row in the list also has an \code{hattr}
attribute holding its attributes (if any). This structure may be useful
for doing more involved extractions of weirdly formed HTML tables
without having to muck with XPath queries. Default: \code{FALSE}}

\item{trim}{if \code{TRUE} trim cell whitespace. Default: \code{TRUE}.}
}
\description{
This behaves differently than \code{\link[rvest:html_table]{rvest::html_table()}}. It does an
aggressive fill by default when \code{colspan}s or \code{rowspan}s are detected
and does not make any attempt to go beyond providing a basic data frame
of the HTML table. See \code{Details} for more information.
}
\details{
The functionality provided in \code{\link[rvest:html_table]{rvest::html_table()}} is double-plus good so
the intent of this function was not to subvert it. Rather, \code{\link[=reap_table]{reap_table()}}
was designed to give you more direct R-access to the underlying structure
of an HTML table so you can wrangle it as you please. In "raw" mode,
you get a list with attributes enabling you to work with the table structure,
cell values and entity attributes with R idioms vs XPath queries.
}
\note{
When passing in a \code{reapr_doc} object, the pre-parsed HTML will be
tested for validity and re-generated if the external pointer is
invalid.
}
\examples{
x <- reap_url("https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom")

# take advantage of the pre-processing reap_url() does:
tbl <- reap_table(x$tag$table[[10]])
tbl_raw <- reap_table(x$tag$table[[10]], raw=TRUE)

# get all of 'em:
tbls <- reap_table(x)

# find a specific one:
reap_node(x, ".//table[contains(., 'Other identity and at least one UK identity')]") \%>\%
  reap_table() -> tbl
}

A man/reap_text.Rd => man/reap_text.Rd +49 -0
@@ 0,0 1,49 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/reap-bits.R
\name{reap_text}
\alias{reap_text}
\alias{reap_name}
\alias{reap_children}
\alias{reap_attrs}
\alias{reap_attr}
\title{Reap text, names and attributes from HTML}
\usage{
reap_text(x, trim = TRUE)

reap_name(x)

reap_children(x)

reap_attrs(x)

reap_attr(x, name, otherwise = NA_character_)
}
\arguments{
\item{x}{anything the underlying \code{xml2} functions can take}

\item{trim}{if \code{TRUE} then trim whitespace. Unlike the \code{rvest} counterparts
this defaults to \code{TRUE}.}

\item{name}{attribute name to retrieve}

\item{otherwise}{what to return if \code{name} doesn't exist in a given node}
}
\description{
You need to pass in anything the underlying \code{\link[xml2:xml_text]{xml2::xml_text()}},
\code{\link[xml2:xml_name]{xml2::xml_name()}}, \code{\link[xml2:xml_children]{xml2::xml_children()}}, \code{\link[xml2:xml_attrs]{xml2::xml_attrs()}},
or \code{\link[xml2:xml_attr]{xml2::xml_attr()}} expect. These are merely convenience wrappers
so you don't have to \code{library(xml2)}.
}
\details{
You \emph{can} pass in a \code{reapr_doc} \code{$parsed_html} full document if you
wish but you should really be working with the output of
\code{\link[=reap_nodes]{reap_nodes()}} or \code{\link[=reap_node]{reap_node()}} or the pre-extracted tags in the \code{$tag}
element of a \code{reapr_doc}.
}
\examples{
x <- reap_url("http://r-project.org/")
reap_text(x$tag$div)
reap_nodes(x, ".//*") \%>\% reap_name()
x$tag$div \%>\% reap_children()
reap_attrs(x$tag$div)
}

M man/reap_url.Rd => man/reap_url.Rd +4 -1
@@ 41,7 41,7 @@ many tasks are performed including:
\item extraction of the plaintext webpage \code{<title>} (if any)
\item generation of a dynamic list of tags in the document which can be
fed directly to HTML/XML search/retrieval functions (which may speed
up node discover)
up node discovery)
\item extraction of the text of all comments in the HTML document
\item inclusion of the full \code{httr::response} object with the returned object
\item extraction of the time it took to make the complete request


@@ 53,3 53,6 @@ of the parsed \code{xml_document} and auto-regen the parse (since it has the ful
content available to it) prior to any other operations. This also makes \code{reapr_doc}
objects \emph{serializable} without having to spend your own cycles on that.
}
\examples{
x <- reap_url("http://books.toscrape.com/")
}

M man/reapr.Rd => man/reapr.Rd +9 -1
@@ 6,7 6,15 @@
\alias{reapr-package}
\title{Reap Information from Websites}
\description{
This will eventually be a clever description about web scraping with reapr.
There's no longer need to fear getting at the gnarly bits of web pages.
For the vast majority of web scraping tasks, the 'rvest' package does a
phenomenal job providing just enough of what you need to get by. But, if you
want more of the details of the site you're scraping, some handy shortcuts to
page elements in use and the ability to not have to think too hard about
serialization during scraping tasks, then you may be interested in reaping
more than harvesting. Tools are provided to interact with web sites content
and metadata more granular level than 'rvest' but at a higher level than
'httr'/'curl'.
}
\details{
\itemize{