~hrbrmstr/spiderbar

00d5c5bc4c0a54e7bda692c08e4230649245cc83 — hrbrmstr 1 year, 3 months ago fb27ce1 master
tinytest, spelling, some tidying up
17 files changed, 84 insertions(+), 83 deletions(-)

M .Rbuildignore
A CRAN-RELEASE
M DESCRIPTION
M NEWS.md
M R/can-fetch.r
M R/crawl-delay.r
M R/robxp.r
M README.md
A cran-comments.md
R tests/testthat/test-spiderbar.R => inst/tinytest/test_spiderbar.R
M man/can_fetch.Rd
M man/crawl_delays.Rd
M man/robxp.Rd
M man/sitemaps.Rd
M man/spiderbar.Rd
D tests/test-all.R
A tests/tinytest.R
M .Rbuildignore => .Rbuildignore +1 -0
@@ 13,3 13,4 @@
^appveyor\.yml$
^codecov\.yml$
^cran-comments\.md$
^CRAN-RELEASE$

A CRAN-RELEASE => CRAN-RELEASE +2 -0
@@ 0,0 1,2 @@
This package was submitted to CRAN on 2020-05-29.
Once it is accepted, delete this file and tag the release (commit fb27ce1dec).

M DESCRIPTION => DESCRIPTION +6 -6
@@ 14,14 14,14 @@ NeedsCompilation: yes
URL: https://gitlab.com/hrbrmstr/spiderbar
BugReports: https://gitlab.com/hrbrmstr/spiderbar/issues
License: MIT + file LICENSE
Suggests:
    testthat,
Suggests: 
    covr,
    robotstxt
Depends:
    robotstxt, 
    tinytest
Depends: 
    R (>= 3.2.0)
Encoding: UTF-8
Imports:
Imports: 
    Rcpp
RoxygenNote: 6.1.1
RoxygenNote: 7.1.0
LinkingTo: Rcpp

M NEWS.md => NEWS.md +6 -0
@@ 1,3 1,9 @@
0.2.3
* fix by Peter Meissner for fetching case
* custom print method now returns the object
* fixed spelling
* ensured there's a roxygen return for every function

0.2.0
* Added crawl delay extraction
* Made all examples local so CRAN doesn't have to hit actual websites

M R/can-fetch.r => R/can-fetch.r +1 -0
@@ 9,6 9,7 @@
#' @param path path to test
#' @param user_agent user agent to test
#' @export
#' @return logical vector indicating whether you have permission to fetch the content
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
#'              package="spiderbar")), collapse="\n")

M R/crawl-delay.r => R/crawl-delay.r +1 -1
@@ 1,4 1,4 @@
#' Retrive all agent crawl delay values in a `robxp` `robots.txt` object
#' Retrieve all agent crawl delay values in a `robxp` `robots.txt` object
#'
#' @md
#' @param obj `robxp` object

M R/robxp.r => R/robxp.r +5 -3
@@ 8,10 8,11 @@
#'        a `connection` object that will be passed to [readLines()], the result of which
#'        will be concatenated into a single string and parsed and the connection will be closed.
#' @export
#' @return a classed object holding an external pointer to parsed robots.txt data
#' @examples
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
#'                package="spiderbar")), collapse="\n")
#' rt <- robxp(imdb)
# imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
#                package="spiderbar")), collapse="\n")
# rt <- robxp(imdb)
robxp <- function(x) {

  if (inherits(x, "connection")) {


@@ 38,4 39,5 @@ robxp <- function(x) {
#' @export
print.robxp <- function(x, ...) {
  cat("<Robots Exclusion Protocol Object>")
  invisible(x)
}
\ No newline at end of file
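
The NEWS entry "custom print method now returns the object" corresponds to the `invisible(x)` added above: returning the input invisibly is the usual convention for S3 `print()` methods, so an explicit `x <- print(x)` still hands back the object and nothing gets echoed twice. A minimal sketch of the idiom using a hypothetical class (not part of spiderbar):

``` r
# hypothetical "widget" class, only to illustrate the invisible-return
# convention used by print.robxp() above
new_widget <- function(n) structure(list(n = n), class = "widget")

print.widget <- function(x, ...) {
  cat("<widget with", x$n, "parts>\n")
  invisible(x)   # hand the object back without printing it a second time
}

w <- new_widget(3)
w                 # auto-prints once
w2 <- print(w)    # object comes back; no duplicate output
identical(w, w2)  # TRUE
```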

M README.md => README.md +14 -37
@@ 5,7 5,7 @@ developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.re
[![Signed
by](https://img.shields.io/badge/Keybase-Verified-brightgreen.svg)](https://keybase.io/hrbrmstr)
![Signed commit
%](https://img.shields.io/badge/Signed_Commits-100%25-lightgrey.svg)
%](https://img.shields.io/badge/Signed_Commits-89%25-lightgrey.svg)
[![Linux build
Status](https://travis-ci.org/hrbrmstr/spiderbar.svg?branch=master)](https://travis-ci.org/hrbrmstr/spiderbar)
[![Windows build


@@ 40,7 40,7 @@ processing these ‘robots.txt’ files.
The following functions are implemented:

  - `can_fetch`: Test URL paths against a robxp robots.txt object
  - `crawl_delays`: Retrive all agent crawl delay values in a robxp
  - `crawl_delays`: Retrieve all agent crawl delay values in a robxp
    robots.txt object
  - `print.robxp`: Custom printer for ‘robxp’ objects
  - `robxp`: Parse a ‘robots.txt’ file & create a ‘robxp’ object


@@ 50,7 50,7 @@ The following functions are implemented:
## Installation

``` r
install.packages("spiderbar", repos = "https://cinc.rud.is")
install.packages("spiderbar", repos = c("https://cinc.rud.is", "https://cloud.r-project.org/"))
# or
remotes::install_git("https://git.rud.is/hrbrmstr/spiderbar.git")
# or


@@ 74,7 74,7 @@ library(robotstxt)

# current version
packageVersion("spiderbar")
## [1] '0.2.2'
## [1] '0.2.3'

# use helpers from the robotstxt package



@@ 99,46 99,23 @@ can_fetch(gh_rt, "/humans.txt", "*") # TRUE
## [1] TRUE

can_fetch(gh_rt, "/login", "*") # FALSE
## [1] FALSE
## [1] TRUE

can_fetch(gh_rt, "/oembed", "CCBot") # FALSE
## [1] FALSE
## [1] TRUE

can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
## [1]  TRUE FALSE FALSE
## [1] TRUE TRUE TRUE

crawl_delays(gh_rt)
```

<div class="kable-table">

| agent             | crawl\_delay |
| :---------------- | -----------: |
| yandex            |          \-1 |
| twitterbot        |          \-1 |
| ccbot             |          \-1 |
| mail.ru\_bot      |          \-1 |
| telefonica        |          \-1 |
| slurp             |          \-1 |
| seznambot         |          \-1 |
| sanddollar        |          \-1 |
| coccoc            |          \-1 |
| ia\_archiver      |          \-1 |
| swiftbot          |          \-1 |
| red-app-gsa-p-one |          \-1 |
| naverbot          |          \-1 |
| msnbot            |          \-1 |
| teoma             |          \-1 |
| \*                |          \-1 |
| intuitgsacrawler  |          \-1 |
| bingbot           |          \-1 |
| daumoa            |          \-1 |
| googlebot         |          \-1 |
| httrack           |          \-1 |
| duckduckbot       |          \-1 |
| etaospider        |          \-1 |
| rogerbot          |          \-1 |
| dotbot            |          \-1 |
| agent | crawl\_delay |
| :---- | -----------: |
| baidu |            1 |
| \*    |          \-1 |

</div>
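
The shrunken table reflects whatever GitHub's robots.txt served at README render time, which now carries a `Crawl-delay` only for `baidu`; per the `crawl_delays()` note, every listed agent without an explicit delay reports `-1`. A quick sketch against an inline robots.txt (hypothetical rules, not GitHub's) shows the same shape:

``` r
library(spiderbar)

# hypothetical robots.txt: one agent with an explicit Crawl-delay, one without
rt <- robxp("User-agent: baidu\nCrawl-delay: 1\n\nUser-agent: *\nDisallow: /login\n")

crawl_delays(rt)
# expected: a data frame with baidu = 1 and * = -1 (no delay set)
```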



@@ 167,9 144,9 @@ sitemaps(imdb_rt)

| Lang         | \# Files |  (%) |  LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :----------- | -------: | ---: | ---: | ---: | ----------: | ---: | -------: | ---: |
| C++          |        9 | 0.38 | 1763 | 0.78 |         257 | 0.55 |      258 | 0.38 |
| C/C++ Header |        7 | 0.29 |  395 | 0.18 |         152 | 0.33 |      280 | 0.42 |
| R            |        7 | 0.29 |   68 | 0.03 |          26 | 0.06 |      101 | 0.15 |
| C++          |        9 | 0.39 | 1763 | 0.79 |         257 | 0.56 |      258 | 0.38 |
| C/C++ Header |        7 | 0.30 |  395 | 0.18 |         152 | 0.33 |      280 | 0.42 |
| R            |        6 | 0.26 |   47 | 0.02 |          18 | 0.04 |      101 | 0.15 |
| Rmd          |        1 | 0.04 |   23 | 0.01 |          31 | 0.07 |       33 | 0.05 |

## Code of Conduct

A cran-comments.md => cran-comments.md +12 -0
@@ 0,0 1,12 @@
## Test environments
* local R installation, R 4.0.1
* ubuntu 16.04 (on travis-ci), R 4.0.1
* win-builder (devel)

## R CMD check results

0 errors | 0 warnings | 1 note

* This is an update release to fix a parsing edge case
* Ensured all functions have a @return block
* Fixed spelling mistakes

R tests/testthat/test-spiderbar.R => inst/tinytest/test_spiderbar.R +19 -21
@@ 1,31 1,29 @@
context("basic functionality")
test_that("parsing and fetch testing and sitemaps work", {
library(spiderbar)

  cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
  rt1 <- robxp(cdc)
cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
rt1 <- robxp(cdc)

  expect_that(rt1, is_a("robxp"))
expect_true(inherits(rt1, "robxp"))

  expect_that(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE))
  expect_that(can_fetch(rt1, "/_borders", "*"), equals(FALSE))
expect_true(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"))
expect_false(can_fetch(rt1, "/_borders", "*"))

  imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
  rt2 <- robxp(imdb)
  cd <- crawl_delays(rt2)
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
rt2 <- robxp(imdb)
cd <- crawl_delays(rt2)

  expect_that(cd, is_a("data.frame"))
  expect_equal(sort(cd$crawl_delay), sort(c(0.1, 3.0, -1.0)))
expect_true(inherits(cd, "data.frame"))
expect_equal(sort(cd$crawl_delay), sort(c(0.1, 3.0, -1.0)))

  imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
  rt2 <- robxp(imdb)
imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
rt2 <- robxp(imdb)

  gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
  rt3 <- robxp(gh)
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
rt3 <- robxp(gh)

  rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))
rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))

  expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
  expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
  expect_equal(sitemaps(rt3), character(0))
expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
expect_equal(sitemaps(rt3), character(0))

})
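
The conversion above is mostly mechanical: drop the `context()`/`test_that()` wrappers, un-indent the body, then swap testthat expectations for their tinytest counterparts (`expect_that(x, is_a(...))` becomes `expect_true(inherits(x, ...))`, `equals(TRUE)`/`equals(FALSE)` become `expect_true()`/`expect_false()`, and `expect_equal()` carries over unchanged). A minimal sketch of that mapping on a throwaway robots.txt (the rules and paths here are made up):

``` r
library(spiderbar)
library(tinytest)

rt <- robxp("User-agent: *\nDisallow: /private\n")

# testthat: expect_that(rt, is_a("robxp"))
expect_true(inherits(rt, "robxp"))

# testthat: expect_that(can_fetch(rt, "/private", "*"), equals(FALSE))
expect_false(can_fetch(rt, "/private", "*"))

# testthat: expect_that(can_fetch(rt, "/index.html", "*"), equals(TRUE))
expect_true(can_fetch(rt, "/index.html", "*"))
```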

M man/can_fetch.Rd => man/can_fetch.Rd +4 -1
@@ 13,6 13,9 @@ can_fetch(obj, path = "/", user_agent = "*")

\item{user_agent}{user agent to test}
}
\value{
logical vector indicating whether you have permission to fetch the content
}
\description{
Provide a character vector of URL paths plus optional user agent and this function will
return a logical vector indicating whether you have permission to fetch the content


@@ 20,7 23,7 @@ at the respective path.
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
             package="spiderbar")), collapse="\\n")
             package="spiderbar")), collapse="\n")
gh_rt <- robxp(gh)

can_fetch(gh_rt, "/humans.txt", "*") # TRUE

M man/crawl_delays.Rd => man/crawl_delays.Rd +4 -4
@@ 2,7 2,7 @@
% Please edit documentation in R/crawl-delay.r
\name{crawl_delays}
\alias{crawl_delays}
\title{Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
\title{Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
\usage{
crawl_delays(obj)
}


@@ 13,19 13,19 @@ crawl_delays(obj)
data frame of agents and their crawl delays
}
\description{
Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object
Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object
}
\note{
\code{-1} will be returned for any listed agent \emph{without} a crawl delay setting
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
             package="spiderbar")), collapse="\\n")
             package="spiderbar")), collapse="\n")
gh_rt <- robxp(gh)
crawl_delays(gh_rt)

imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
               package="spiderbar")), collapse="\\n")
               package="spiderbar")), collapse="\n")
imdb_rt <- robxp(imdb)
crawl_delays(imdb_rt)
}

M man/robxp.Rd => man/robxp.Rd +3 -5
@@ 12,12 12,10 @@ _or_ a length >1 character vector that will concatenated into a single string _o
a `connection` object that will be passed to [readLines()], the result of which
will be concatenated into a single string and parsed and the connection will be closed.}
}
\value{
a classed object holding an external pointer to parsed robots.txt data
}
\description{
This function takes in a single element character vector and parses it into
a `robxp` object.
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
               package="spiderbar")), collapse="\\n")
rt <- robxp(imdb)
}
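
As the updated help page describes, `robxp()` accepts a single string, a length > 1 character vector (concatenated for you), or a `connection` (read via `readLines()` and then closed). A small sketch of all three forms, using the bundled IMDb sample file:

``` r
library(spiderbar)

rt_path <- system.file("extdata", "imdb-robots.txt", package = "spiderbar")

# 1. a single string
rt1 <- robxp(paste0(readLines(rt_path), collapse = "\n"))

# 2. a character vector with more than one element (concatenated internally)
rt2 <- robxp(readLines(rt_path))

# 3. a connection, which robxp() reads and closes for you
rt3 <- robxp(file(rt_path))
```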

M man/sitemaps.Rd => man/sitemaps.Rd +1 -1
@@ 17,7 17,7 @@ Retrieve a character vector of sitemaps from a parsed robots.txt object
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
               package="rep")), collapse="\\n")
               package="rep")), collapse="\n")
rt <- robxp(imdb)
sitemaps(rt)
}

M man/spiderbar.Rd => man/spiderbar.Rd +0 -1
@@ 3,7 3,6 @@
\docType{package}
\name{spiderbar}
\alias{spiderbar}
\alias{spiderbar-package}
\title{Parse and Test Robots Exclusion Protocol Files and Rules}
\description{
The 'Robots Exclusion Protocol' (\url{https://www.robotstxt.org/orig.html}) documents a set

D tests/test-all.R => tests/test-all.R +0 -3
@@ 1,3 0,0 @@
library(testthat)
library(robotstxt)
test_check("spiderbar")

A tests/tinytest.R => tests/tinytest.R +5 -0
@@ 0,0 1,5 @@

if ( requireNamespace("tinytest", quietly=TRUE) ){
  tinytest::test_package("spiderbar")
}
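
With that driver in place, `R CMD check` runs the files now living in `inst/tinytest/` automatically. They can also be run by hand; the directory path in the second call assumes you are in the package root of a source checkout with spiderbar already installed:

``` r
# against the installed package (what the tests/tinytest.R driver above does)
tinytest::test_package("spiderbar")

# or point tinytest at the test directory of a source checkout
tinytest::run_test_dir("inst/tinytest")
```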