~hrbrmstr/spiderbar

2aa63dad75d45f7f1d66af68179fe2228b2c1a31 — Bob Rudis 3 years ago 4277d70
pre-CRAN flight check
M .Rbuildignore => .Rbuildignore +1 -0
@@ 11,3 11,4 @@
^CONDUCT\.md$
^appveyor\.yml$
^codecov\.yml$
^cran-comments\.md$

M NAMESPACE => NAMESPACE +1 -0
@@ 4,5 4,6 @@ S3method(print,robxp)
export(can_fetch)
export(crawl_delays)
export(robxp)
export(sitemaps)
importFrom(Rcpp,sourceCpp)
useDynLib(rep, .registration=TRUE)

M R/RcppExports.R => R/RcppExports.R +22 -0
@@ 17,6 17,28 @@ rep_crawl_delays <- function(xp) {
    .Call(`_rep_rep_crawl_delays`, xp)
}

#' Retrieve a character vector of sitemaps from a parsed robots.txt object
#'
#' @md
#' @param xp A `robxp` object
#' @return character vector of all sitemaps found in the parsed `robots.txt` file
#' @export
#' @examples
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
#' rt <- robxp(imdb)
#' sitemaps(rt)
sitemaps <- function(xp) {
    .Call(`_rep_sitemaps`, xp)
}

#' Return a parsed `robots.txt` object as a single string
#'
#' @noRd
#'
rep_as_string <- function(xp) {
    .Call(`_rep_rep_as_string`, xp)
}

#' Path allowed
#'
#' @noRd
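
A quick sketch of the newly exported `sitemaps()` accessor, using the sample `robots.txt` files bundled under `inst/extdata` (the same ones the roxygen examples above rely on); the expected value is taken from the README output further down in this commit.

``` r
# Minimal sketch: parse the bundled IMDb sample and pull its sitemap entries
imdb <- paste0(
  readLines(system.file("extdata", "imdb-robots.txt", package = "rep")),
  collapse = "\n"
)
rt <- robxp(imdb)

sitemaps(rt)
## per the README below: [1] "http://www.imdb.com/sitemap_US_index.xml.gz"
```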

M R/can-fetch.r => R/can-fetch.r +10 -3
@@ 1,4 1,8 @@
#' Test URL path against robots.txt
#' Test URL paths against a `robxp` `robots.txt` object
#'
#' Provide a character vector of URL paths plus an optional user agent and this function will
#' return a logical vector indicating whether you have permission to fetch the content
#' at each respective path.
#'
#' @md
#' @param obj `robxp` object


@@ 8,13 12,16 @@
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\n")
#' gh_rt <- robxp(gh)
#'
#' can_fetch(gh_rt, "/humans.txt", "*") # TRUE
#' can_fetch(gh_rt, "/login", "*") # FALSE
#' can_fetch(gh_rt, "/oembed", "CCBot") # FALSE
can_fetch <- function(obj, path="/", user_agent="*") {
#'
#' can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
can_fetch <- function(obj, path = "/", user_agent = "*") {

  if (inherits(obj, "robxp")) {
    rep_path_allowed(obj, path, user_agent)
    vapply(path, rep_path_allowed, logical(1), x=obj, agent=user_agent, USE.NAMES=FALSE)
  } else {
    return(NULL)
  }

R R/cd.r => R/crawl-delay.r +2 -2
@@ 1,9 1,9 @@
#' Get all agent crawl delay values
#' Retrieve all agent crawl delay values in a `robxp` `robots.txt` object
#'
#' @md
#' @param obj `robxp` object
#' @return data frame of agents and their crawl delays
#' @note `-1` will be returned for any listed agent without a crawl delay setting
#' @note `-1` will be returned for any listed agent _without_ a crawl delay setting
#' @export
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\n")

M R/rep-package.R => R/rep-package.R +2 -2
@@ 1,9 1,9 @@
#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules
#'
#' The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set
#' The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set
#' of standards for allowing or excluding robot/spider crawling of different areas of
#' site content. Tools are provided which wrap The 'rep-cpp` <https://github.com/seomoz/rep-cpp>
#' C++ library for processing these 'robots.txt' files.
#' C++ library for processing these `robots.txt` files.
#'
#' @md
#' @name rep

R R/rep.r => R/robxp.r +17 -3
@@ 1,12 1,25 @@
#' Create a robots.txt object
#' Parse a `robots.txt` file & create a `robxp` object
#'
#' @param x atomic character vector containing a complete robots.txt file
#' This function takes in a single element character vector and parses it into
#' a `robxp` object.
#'
#' @param x either an atomic character vector containing a complete `robots.txt` file
#'        _or_ a length >1 character vector that will be concatenated into a single string _or_
#'        a `connection` object that will be passed to [readLines()], the result of which
#'        will be concatenated into a single string and parsed; the connection will be closed.
#' @export
#' @examples
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
#' rt <- robxp(imdb)
robxp <- function(x) {

  if (inherits(x, "connection")) {
    y <- readLines(x, warn = FALSE)
    close(x)
    x <- y
  }
  if (is.character(x)) if (length(x) > 1) x <- paste0(x, collapse="\n")

  robxp <- rep_parse(x)
  class(robxp) <- c("robxp")



@@ 15,9 28,10 @@ robxp <- function(x) {
}


#' Custom printer for 'robexp' objects
#' Custom printer for `robxp` objects
#'
#' @md
#' @keywords internal
#' @param x object to print
#' @param ... unused
#' @export
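
The updated `robxp()` above now accepts three input forms; a minimal sketch mirroring the new tests at the end of this commit:

``` r
path <- system.file("extdata", "imdb-robots.txt", package = "rep")

rt_a <- robxp(paste0(readLines(path), collapse = "\n"))  # single string
rt_b <- robxp(readLines(path))                           # length > 1 character vector
rt_c <- robxp(file(path))                                # connection; closed after reading
```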

M README.Rmd => README.Rmd +22 -8
@@ 2,9 2,9 @@
output: rmarkdown::github_document
---

[Travis-CI Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master) | 
[AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hrbrmstr/rep?branch=master&svg=true) | 
[Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg)
[![Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master)](https://travis-ci.org/hrbrmstr/rep)
[![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/rep)
![Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg)

# rep



@@ 12,7 12,7 @@ Tools to Parse and Test Robots Exclusion Protocol Files and Rules

## Description

The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap The 'rep-cpp` <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files.
The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the `rep-cpp` (<https://github.com/seomoz/rep-cpp>) C++ library for processing these `robots.txt` files.

- [`rep-cpp`](https://github.com/seomoz/rep-cpp)
- [`url-cpp`](https://github.com/seomoz/url-cpp)


@@ 21,10 21,10 @@ The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a

The following functions are implemented:

- `can_fetch`:	Test URL path against robots.txt
- `crawl_delays`:	Get all agent crawl delay values
- `print.robxp`:	Custom printer for 'robexp' objects
- `robxp`:	Create a robots.txt object
- `robxp`:	Parse a 'robots.txt' file & create a 'robxp' object
- `can_fetch`:	Test URL paths against a 'robxp' 'robots.txt' object
- `crawl_delays`:	Retrieve all agent crawl delay values in a 'robxp' 'robots.txt' object
- `sitemaps`:	Retrieve a character vector of sitemaps from a parsed robots.txt object

## Installation



@@ 45,23 45,37 @@ library(robotstxt)
# current version
packageVersion("rep")

# use helpers from the robotstxt package

rt <- robxp(get_robotstxt("https://cdc.gov"))

print(rt)

# or 

rt <- robxp(url("https://cdc.gov/robots.txt"))

can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")

can_fetch(rt, "/_borders", "*")

gh_rt <- robxp(robotstxt::get_robotstxt("github.com"))

can_fetch(gh_rt, "/humans.txt", "*") # TRUE

can_fetch(gh_rt, "/login", "*") # FALSE

can_fetch(gh_rt, "/oembed", "CCBot") # FALSE

can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))

crawl_delays(gh_rt)

imdb_rt <- robxp(robotstxt::get_robotstxt("imdb.com"))

crawl_delays(imdb_rt)

sitemaps(imdb_rt)
```

## Test Results

M README.md => README.md +28 -8
@@ 1,5 1,5 @@

[Travis-CI Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master) | [AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hrbrmstr/rep?branch=master&svg=true) | [Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg)
[![Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master)](https://travis-ci.org/hrbrmstr/rep) [![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/rep) ![Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg)

rep
===


@@ 9,7 9,7 @@ Tools to Parse and Test Robots Exclusion Protocol Files and Rules
Description
-----------

The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap The 'rep-cpp\` <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files.
The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the `rep-cpp` (<https://github.com/seomoz/rep-cpp>) C++ library for processing these `robots.txt` files.

-   [`rep-cpp`](https://github.com/seomoz/rep-cpp)
-   [`url-cpp`](https://github.com/seomoz/url-cpp)


@@ 19,10 19,10 @@ Tools

The following functions are implemented:

-   `can_fetch`: Test URL path against robots.txt
-   `crawl_delays`: Get all agent crawl delay values
-   `print.robxp`: Custom printer for 'robexp' objects
-   `robxp`: Create a robots.txt object
-   `robxp`: Parse a 'robots.txt' file & create a 'robxp' object
-   `can_fetch`: Test URL paths against a 'robxp' 'robots.txt' object
-   `crawl_delays`: Retrieve all agent crawl delay values in a 'robxp' 'robots.txt' object
-   `sitemaps`: Retrieve a character vector of sitemaps from a parsed robots.txt object

Installation
------------


@@ 45,6 45,8 @@ packageVersion("rep")
    ## [1] '0.2.0'

``` r
# use helpers from the robotstxt package

rt <- robxp(get_robotstxt("https://cdc.gov"))

print(rt)


@@ 53,6 55,10 @@ print(rt)
    ## <Robots Exclusion Protocol Object>

``` r
# or 

rt <- robxp(url("https://cdc.gov/robots.txt"))

can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")
```



@@ 66,6 72,7 @@ can_fetch(rt, "/_borders", "*")

``` r
gh_rt <- robxp(robotstxt::get_robotstxt("github.com"))

can_fetch(gh_rt, "/humans.txt", "*") # TRUE
```



@@ 84,6 91,12 @@ can_fetch(gh_rt, "/oembed", "CCBot") # FALSE
    ## [1] FALSE

``` r
can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
```

    ## [1]  TRUE FALSE FALSE

``` r
crawl_delays(gh_rt)
```



@@ 116,6 129,7 @@ crawl_delays(gh_rt)

``` r
imdb_rt <- robxp(robotstxt::get_robotstxt("imdb.com"))

crawl_delays(imdb_rt)
```



@@ 124,6 138,12 @@ crawl_delays(imdb_rt)
    ## 2 scoutjet         3.0
    ## 3        *        -1.0

``` r
sitemaps(imdb_rt)
```

    ## [1] "http://www.imdb.com/sitemap_US_index.xml.gz"

Test Results
------------



@@ 134,14 154,14 @@ library(testthat)
date()
```

    ## [1] "Sat Sep 23 09:14:02 2017"
    ## [1] "Sat Sep 23 13:07:16 2017"

``` r
test_dir("tests/")
```

    ## testthat results ========================================================================================================
    ## OK: 5 SKIPPED: 0 FAILED: 0
    ## OK: 8 SKIPPED: 0 FAILED: 0
    ## 
    ## DONE ===================================================================================================================


A cran-comments.md => cran-comments.md +25 -0
@@ 0,0 1,25 @@
## Test environments
* local OS X install, R 3.4.1
* ubuntu 14.04 (on travis-ci), R oldrel, release & devel
* appveyor (windows)
* win-builder (devel and release)
* rhub (Windows)

## R CMD check results

0 errors | 0 warnings | 1 note

* This is a new release.

## Reverse dependencies

This is a new release, so there are no reverse dependencies.

---

* Code coverage is provided via codecov.io: https://codecov.io/gh/hrbrmstr/rep
* Travis-CI build/test results are at https://travis-ci.org/hrbrmstr/rep
* Appveyor build/test results are at https://ci.appveyor.com/project/hrbrmstr/rep
* No external network calls are made for the robots.txt tests or examples as there
  are four files in the inst/extdata folder which are used instead.
* The README.md generation does exercise the external URL tests.

M man/can_fetch.Rd => man/can_fetch.Rd +7 -2
@@ 2,7 2,7 @@
% Please edit documentation in R/can-fetch.r
\name{can_fetch}
\alias{can_fetch}
\title{Test URL path against robots.txt}
\title{Test URL paths against a \code{robxp} \code{robots.txt} object}
\usage{
can_fetch(obj, path = "/", user_agent = "*")
}


@@ 14,12 14,17 @@ can_fetch(obj, path = "/", user_agent = "*")
\item{user_agent}{user agent to test}
}
\description{
Test URL path against robots.txt
Provide a character vector of URL paths plus an optional user agent and this function will
return a logical vector indicating whether you have permission to fetch the content
at each respective path.
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\\n")
gh_rt <- robxp(gh)

can_fetch(gh_rt, "/humans.txt", "*") # TRUE
can_fetch(gh_rt, "/login", "*") # FALSE
can_fetch(gh_rt, "/oembed", "CCBot") # FALSE

can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
}

M man/crawl_delays.Rd => man/crawl_delays.Rd +4 -4
@@ 1,8 1,8 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cd.r
% Please edit documentation in R/crawl-delay.r
\name{crawl_delays}
\alias{crawl_delays}
\title{Get all agent crawl delay values}
\title{Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
\usage{
crawl_delays(obj)
}


@@ 13,10 13,10 @@ crawl_delays(obj)
data frame of agents and their crawl delays
}
\description{
Get all agent crawl delay values
Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object
}
\note{
\code{-1} will be returned for any listed agent without a crawl delay setting
\code{-1} will be returned for any listed agent \emph{without} a crawl delay setting
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\\n")

M man/print.robxp.Rd => man/print.robxp.Rd +4 -3
@@ 1,8 1,8 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep.r
% Please edit documentation in R/robxp.r
\name{print.robxp}
\alias{print.robxp}
\title{Custom printer for 'robexp' objects}
\title{Custom printer for `robxp` objects}
\usage{
\method{print}{robxp}(x, ...)
}


@@ 12,5 12,6 @@
\item{...}{unused}
}
\description{
Custom printer for 'robexp' objects
Custom printer for `robxp` objects
}
\keyword{internal}

M man/rep.Rd => man/rep.Rd +2 -3
@@ 6,10 6,9 @@
\alias{rep-package}
\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules}
\description{
The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set
The 'Robots Exclusion Protocol' (\url{http://www.robotstxt.org/orig.html}) documents a set
of standards for allowing or excluding robot/spider crawling of different areas of
site content. Tools are provided which wrap The 'rep-cpp` \url{https://github.com/seomoz/rep-cpp}
C++ library for processing these 'robots.txt' files.
site content. Tools are provided which wrap the \code{rep-cpp} (\url{https://github.com/seomoz/rep-cpp})
C++ library for processing these \code{robots.txt} files.
}
\author{
Bob Rudis (bob@rud.is)

M man/robxp.Rd => man/robxp.Rd +8 -4
@@ 1,16 1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep.r
% Please edit documentation in R/robxp.r
\name{robxp}
\alias{robxp}
\title{Create a robots.txt object}
\title{Parse a `robots.txt` file & create a `robxp` object}
\usage{
robxp(x)
}
\arguments{
\item{x}{atomic character vector containing a complete robots.txt file}
\item{x}{either an atomic character vector containing a complete `robots.txt` file
_or_ a length >1 character vector that will be concatenated into a single string _or_
a `connection` object that will be passed to [readLines()], the result of which
will be concatenated into a single string and parsed; the connection will be closed.}
}
\description{
Create a robots.txt object
This function takes in a single element character vector and parses it into
a `robxp` object.
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\\n")

A man/sitemaps.Rd => man/sitemaps.Rd +22 -0
@@ 0,0 1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{sitemaps}
\alias{sitemaps}
\title{Retrieve a character vector of sitemaps from a parsed robots.txt object}
\usage{
sitemaps(xp)
}
\arguments{
\item{xp}{A \code{robxp} object}
}
\value{
character vector of all sitemaps found in the parsed \code{robots.txt} file
}
\description{
Retrieve a character vector of sitemaps from a parsed robots.txt object
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\\n")
rt <- robxp(imdb)
sitemaps(rt)
}

M src/RcppExports.cpp => src/RcppExports.cpp +24 -0
@@ 27,6 27,28 @@ BEGIN_RCPP
    return rcpp_result_gen;
END_RCPP
}
// sitemaps
std::vector<std::string> sitemaps(SEXP xp);
RcppExport SEXP _rep_sitemaps(SEXP xpSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP);
    rcpp_result_gen = Rcpp::wrap(sitemaps(xp));
    return rcpp_result_gen;
END_RCPP
}
// rep_as_string
std::string rep_as_string(SEXP xp);
RcppExport SEXP _rep_rep_as_string(SEXP xpSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP);
    rcpp_result_gen = Rcpp::wrap(rep_as_string(xp));
    return rcpp_result_gen;
END_RCPP
}
// rep_path_allowed
bool rep_path_allowed(SEXP xp, std::string path, std::string agent);
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {


@@ 44,6 66,8 @@ END_RCPP
static const R_CallMethodDef CallEntries[] = {
    {"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1},
    {"_rep_rep_crawl_delays", (DL_FUNC) &_rep_rep_crawl_delays, 1},
    {"_rep_sitemaps", (DL_FUNC) &_rep_sitemaps, 1},
    {"_rep_rep_as_string", (DL_FUNC) &_rep_rep_as_string, 1},
    {"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3},
    {NULL, NULL, 0}
};

M src/repmain.cpp => src/repmain.cpp +29 -0
@@ 40,6 40,35 @@ DataFrame rep_crawl_delays(SEXP xp) {

}

//' Retrieve a character vector of sitemaps from a parsed robots.txt object
//'
//' @md
//' @param xp A `robxp` object
//' @return character vector of all sitemaps found in the parsed `robots.txt` file
//' @export
//' @examples
//' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
//' rt <- robxp(imdb)
//' sitemaps(rt)
// [[Rcpp::export]]
std::vector<std::string> sitemaps(SEXP xp) {

  Rcpp::XPtr<Rep::Robots> ptr(xp);
  return(ptr->sitemaps());

}

//' Return a parsed `robots.txt` object as a single string
//'
//' @noRd
//'
// [[Rcpp::export]]
std::string rep_as_string(SEXP xp) {

  Rcpp::XPtr<Rep::Robots> ptr(xp);
  return(ptr->str());

}

//' Path allowed
//'

M tests/testthat/test-rep.R => tests/testthat/test-rep.R +19 -7
@@ 1,19 1,31 @@
context("basic functionality")
test_that("parsing and testing works", {
test_that("parsing and fetch testing and sitemaps work", {

  cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="rep")), collapse="\n")
  rt <- robxp(cdc)
  rt1 <- robxp(cdc)

  expect_that(rt, is_a("robxp"))
  expect_that(rt1, is_a("robxp"))

  expect_that(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE))
  expect_that(can_fetch(rt, "/_borders", "*"), equals(FALSE))
  expect_that(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE))
  expect_that(can_fetch(rt1, "/_borders", "*"), equals(FALSE))

  imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
  rt <- robxp(imdb)
  cd <- crawl_delays(rt)
  rt2 <- robxp(imdb)
  cd <- crawl_delays(rt2)

  expect_that(cd, is_a("data.frame"))
  expect_equal(cd$crawl_delay, c(0.1, 3.0, -1.0))

  imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="rep"))
  rt2 <- robxp(imdb)

  gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\n")
  rt3 <- robxp(gh)

  rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="rep")))

  expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
  expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
  expect_equal(sitemaps(rt3), character(0))

})