~hrbrmstr/spiderbar

cb6f9b99897de977479e10d96cec9fb888b02588 — boB Rudis 3 years ago 2aa63da
package rename as requested by CRAN
17 files changed, 65 insertions(+), 63 deletions(-)

M DESCRIPTION
M NAMESPACE
M R/RcppExports.R
M R/can-fetch.r
M R/crawl-delay.r
M R/robxp.r
R R/{rep-package.R => spiderbar-package.R}
M README.Rmd
M README.md
M man/can_fetch.Rd
M man/crawl_delays.Rd
M man/robxp.Rd
R man/{rep.Rd => spiderbar.Rd}
R rep.Rproj => spiderbar.Rproj
M src/RcppExports.cpp
M tests/test-all.R
R tests/testthat/{test-rep.R => test-spiderbar.R}
M DESCRIPTION => DESCRIPTION +5 -5
@@ 1,18 1,18 @@
Package: rep
Package: spiderbar
Type: Package
Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules
Title: Parse and Test Robots Exclusion Protocol Files and Rules
Version: 0.2.0
Date: 2017-09-23
Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut]
Maintainer: Bob Rudis <bob@rud.is>
Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents
    a set of standards for allowing or excluding robot/spider crawling of different areas of
    site content. Tools are provided which wrap The 'rep-cpp` <https://github.com/seomoz/rep-cpp>
    site content. Tools are provided which wrap The 'rep-cpp' <https://github.com/seomoz/rep-cpp>
    C++ library for processing these 'robots.txt' files.
SystemRequirements: C++11
NeedsCompilation: yes
URL: https://github.com/hrbrmstr/rep
BugReports: https://github.com/hrbrmstr/rep/issues
URL: https://github.com/hrbrmstr/spiderbar
BugReports: https://github.com/hrbrmstr/spiderbar/issues
License: MIT + file LICENSE
Suggests:
    testthat,
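
The Package, Title, URL, and BugReports fields above are what CRAN and client tooling read, so the rename has to land in DESCRIPTION first. As a minimal sanity check (a sketch, assuming the renamed package has been installed locally), the new values surface through `utils::packageDescription()`:

```r
# Reads the installed DESCRIPTION, so the renamed fields should appear here.
desc <- utils::packageDescription("spiderbar")
desc$Package      # "spiderbar"
desc$URL          # "https://github.com/hrbrmstr/spiderbar"
desc$BugReports   # "https://github.com/hrbrmstr/spiderbar/issues"
```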

M NAMESPACE => NAMESPACE +1 -1
@@ 6,4 6,4 @@ export(crawl_delays)
export(robxp)
export(sitemaps)
importFrom(Rcpp,sourceCpp)
useDynLib(rep, .registration=TRUE)
useDynLib(spiderbar, .registration=TRUE)
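
The first argument of `useDynLib()` names the shared library, so it must track the package rename; the exported R API itself is untouched. A quick post-rename check (a sketch, assuming the package installs and loads cleanly):

```r
library(spiderbar)
# Only the namespace/shared-library name changed; the same four functions
# are exported as before the rename.
ls("package:spiderbar")
#> "can_fetch"    "crawl_delays" "robxp"        "sitemaps"
```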

M R/RcppExports.R => R/RcppExports.R +5 -5
@@ 6,7 6,7 @@
#' @noRd
#'
rep_parse <- function(content) {
    .Call(`_rep_rep_parse`, content)
    .Call(`_spiderbar_rep_parse`, content)
}

#' Get delays


@@ 14,7 14,7 @@ rep_parse <- function(content) {
#' @noRd
#'
rep_crawl_delays <- function(xp) {
    .Call(`_rep_rep_crawl_delays`, xp)
    .Call(`_spiderbar_rep_crawl_delays`, xp)
}

#' Retrieve a character vector of sitemaps from a parsed robots.txt object


@@ 28,7 28,7 @@ rep_crawl_delays <- function(xp) {
#' rt <- robxp(imdb)
#' sitemaps(rt)
sitemaps <- function(xp) {
    .Call(`_rep_sitemaps`, xp)
    .Call(`_spiderbar_sitemaps`, xp)
}

#' Retrieve a character vector of sitemaps from a parsed robots.txt object


@@ 36,7 36,7 @@ sitemaps <- function(xp) {
#' @noRd
#'
rep_as_string <- function(xp) {
    .Call(`_rep_rep_as_string`, xp)
    .Call(`_spiderbar_rep_as_string`, xp)
}

#' Path allowed


@@ 44,6 44,6 @@ rep_as_string <- function(xp) {
#' @noRd
#'
rep_path_allowed <- function(xp, path, agent = "*") {
    .Call(`_rep_rep_path_allowed`, xp, path, agent)
    .Call(`_spiderbar_rep_path_allowed`, xp, path, agent)
}
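
`R/RcppExports.R` (and its C++ counterpart further down) is generated code: the `_spiderbar_*` strings passed to `.Call()` are native symbol names derived from the package name, which is why every wrapper changes in lockstep. Rather than hand-editing, the file is normally regenerated after the rename; a sketch, assuming the working directory is the package source root:

```r
# Rewrites R/RcppExports.R and src/RcppExports.cpp, deriving the
# `_<pkgname>_` symbol prefixes from the Package field in DESCRIPTION.
Rcpp::compileAttributes()
```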


M R/can-fetch.r => R/can-fetch.r +1 -1
@@ 10,7 10,7 @@
#' @param user_agent user agent to test
#' @export
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\n")
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
#' gh_rt <- robxp(gh)
#'
#' can_fetch(gh_rt, "/humans.txt", "*") # TRUE

M R/crawl-delay.r => R/crawl-delay.r +2 -2
@@ 6,11 6,11 @@
#' @note `-1` will be returned for any listed agent _without_ a crawl delay setting
#' @export
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\n")
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
#' gh_rt <- robxp(gh)
#' crawl_delays(gh_rt)
#'
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
#' imdb_rt <- robxp(imdb)
#' crawl_delays(imdb_rt)
crawl_delays <- function(obj) {

M R/robxp.r => R/robxp.r +1 -1
@@ 9,7 9,7 @@
#'        will be concatenated into a single string and parsed and the connection will be closed.
#' @export
#' @examples
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
#' rt <- robxp(imdb)
robxp <- function(x) {
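
The changes in `R/can-fetch.r`, `R/crawl-delay.r`, and `R/robxp.r` are confined to the `package=` argument in the roxygen examples, since the bundled `extdata` files now install under the new name. Pulled together, the renamed examples amount to this workflow (a sketch using only calls shown in this commit):

```r
library(spiderbar)

# Read a bundled robots.txt into a single string and parse it.
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
                                   package = "spiderbar")), collapse = "\n")
gh_rt <- robxp(gh)

can_fetch(gh_rt, "/humans.txt", "*")  # TRUE per the example above
crawl_delays(gh_rt)                   # per-agent delays; -1 means no Crawl-delay set
```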


R R/rep-package.R => R/spiderbar-package.R +4 -4
@@ 1,14 1,14 @@
#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules
#' Parse and Test Robots Exclusion Protocol Files and Rules
#'
#' The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set
#' of standards for allowing or excluding robot/spider crawling of different areas of
#' site content. Tools are provided which wrap The 'rep-cpp` <https://github.com/seomoz/rep-cpp>
#' site content. Tools are provided which wrap The `rep-cpp` <https://github.com/seomoz/rep-cpp>
#' C++ library for processing these `robots.txt`` files.
#'
#' @md
#' @name rep
#' @name spiderbar
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @useDynLib rep, .registration=TRUE
#' @useDynLib spiderbar, .registration=TRUE
#' @importFrom Rcpp sourceCpp
NULL
\ No newline at end of file
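
This package-level block is the source of several other files in the commit: roxygen2 turns `@name spiderbar` into `man/spiderbar.Rd` and `@useDynLib spiderbar, .registration=TRUE` into the `useDynLib()` directive in NAMESPACE. Regenerating after the edit is the usual route; a sketch, assuming devtools/roxygen2 are available and NAMESPACE is roxygen-managed (which the `@useDynLib` tag suggests):

```r
# Re-runs roxygen2 over the package, rewriting NAMESPACE and man/*.Rd
# (including the rep.Rd -> spiderbar.Rd rename recorded later in this commit).
devtools::document()
```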

M README.Rmd => README.Rmd +8 -8
@@ 2,13 2,13 @@
output: rmarkdown::github_document
---

[![Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master)](https://travis-ci.org/hrbrmstr/rep)
[![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/rep)
![Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg)
<!-- [![Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master)](https://travis-ci.org/hrbrmstr/rep) -->
<!-- [![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/rep) -->
<!-- ![Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg) -->

# rep
# spiderbar

Tools to Parse and Test Robots Exclusion Protocol Files and Rules
Parse and Test Robots Exclusion Protocol Files and Rules

## Description



@@ 29,7 29,7 @@ The following functions are implemented:
## Installation

```{r eval=FALSE}
devtools::install_github("hrbrmstr/rep")
devtools::install_github("hrbrmstr/spiderbar")
```

```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}


@@ 39,11 39,11 @@ options(width=120)
## Usage

```{r message=FALSE, warning=FALSE, error=FALSE}
library(rep)
library(spiderbar)
library(robotstxt)

# current verison
packageVersion("rep")
packageVersion("spiderbar")

# use helpers from the robotstxt package


M README.md => README.md +10 -9
@@ 1,10 1,11 @@

[![Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master)](https://travis-ci.org/hrbrmstr/rep) [![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/rep) ![Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg)
<!-- [![Build Status](https://travis-ci.org/hrbrmstr/rep.svg?branch=master)](https://travis-ci.org/hrbrmstr/rep) -->
<!-- [![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/rep) -->
<!-- ![Coverage Status](https://img.shields.io/codecov/c/github/hrbrmstr/rep/master.svg) -->
spiderbar
=========

rep
===

Tools to Parse and Test Robots Exclusion Protocol Files and Rules
Parse and Test Robots Exclusion Protocol Files and Rules

Description
-----------


@@ 28,18 29,18 @@ Installation
------------

``` r
devtools::install_github("hrbrmstr/rep")
devtools::install_github("hrbrmstr/spiderbar")
```

Usage
-----

``` r
library(rep)
library(spiderbar)
library(robotstxt)

# current verison
packageVersion("rep")
packageVersion("spiderbar")
```

    ## [1] '0.2.0'


@@ 154,7 155,7 @@ library(testthat)
date()
```

    ## [1] "Sat Sep 23 13:07:16 2017"
    ## [1] "Sun Sep 24 08:28:30 2017"

``` r
test_dir("tests/")
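
`README.md` is the knit output of `README.Rmd`, which is why it picks up the same renames plus a refreshed `date()` stamp from the re-render. The usage section loads spiderbar next to the robotstxt package; a sketch of that combination (assuming `robotstxt::get_robotstxt()` returns the robots.txt body as a single character string, and that a network connection is available):

```r
library(spiderbar)
library(robotstxt)

# Fetch a live robots.txt with robotstxt, then parse and query it with spiderbar.
imdb_txt <- get_robotstxt("imdb.com")
imdb_rt  <- robxp(as.character(imdb_txt))

can_fetch(imdb_rt, "/search/", "*")
crawl_delays(imdb_rt)
sitemaps(imdb_rt)
```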

M man/can_fetch.Rd => man/can_fetch.Rd +1 -1
@@ 19,7 19,7 @@ return a logical vector indicating whether you have permission to fetch the cont
at the respective path.
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\\n")
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\\n")
gh_rt <- robxp(gh)

can_fetch(gh_rt, "/humans.txt", "*") # TRUE

M man/crawl_delays.Rd => man/crawl_delays.Rd +2 -2
@@ 19,11 19,11 @@ Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object
\code{-1} will be returned for any listed agent \emph{without} a crawl delay setting
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\\n")
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\\n")
gh_rt <- robxp(gh)
crawl_delays(gh_rt)

imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\\n")
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\\n")
imdb_rt <- robxp(imdb)
crawl_delays(imdb_rt)
}

M man/robxp.Rd => man/robxp.Rd +1 -1
@@ 17,6 17,6 @@ This function takes in a single element character vector and parses it into
a `robxp` object.
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\\n")
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\\n")
rt <- robxp(imdb)
}

R man/rep.Rd => man/spiderbar.Rd +7 -6
@@ 1,14 1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep-package.R
% Please edit documentation in R/spiderbar-package.R
\docType{package}
\name{rep}
\alias{rep}
\alias{rep-package}
\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules}
\name{spiderbar}
\alias{spiderbar}
\alias{spiderbar-package}
\title{Parse and Test Robots Exclusion Protocol Files and Rules}
\description{
The 'Robots Exclusion Protocol' (\url{http://www.robotstxt.org/orig.html}) documents a set
of standards for allowing or excluding robot/spider crawling of different areas of
site content. Tools are provided which wrap The 'rep-cpp\code{<https://github.com/seomoz/rep-cpp> C++ library for processing these}robots.txt`` files.
site content. Tools are provided which wrap The \code{rep-cpp} \url{https://github.com/seomoz/rep-cpp}
C++ library for processing these `robots.txt`` files.
}
\author{
Bob Rudis (bob@rud.is)

R rep.Rproj => spiderbar.Rproj +0 -0
M src/RcppExports.cpp => src/RcppExports.cpp +11 -11
@@ 7,7 7,7 @@ using namespace Rcpp;

// rep_parse
SEXP rep_parse(std::string content);
RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) {
RcppExport SEXP _spiderbar_rep_parse(SEXP contentSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;


@@ 18,7 18,7 @@ END_RCPP
}
// rep_crawl_delays
DataFrame rep_crawl_delays(SEXP xp);
RcppExport SEXP _rep_rep_crawl_delays(SEXP xpSEXP) {
RcppExport SEXP _spiderbar_rep_crawl_delays(SEXP xpSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;


@@ 29,7 29,7 @@ END_RCPP
}
// sitemaps
std::vector<std::string> sitemaps(SEXP xp);
RcppExport SEXP _rep_sitemaps(SEXP xpSEXP) {
RcppExport SEXP _spiderbar_sitemaps(SEXP xpSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;


@@ 40,7 40,7 @@ END_RCPP
}
// rep_as_string
std::string rep_as_string(SEXP xp);
RcppExport SEXP _rep_rep_as_string(SEXP xpSEXP) {
RcppExport SEXP _spiderbar_rep_as_string(SEXP xpSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;


@@ 51,7 51,7 @@ END_RCPP
}
// rep_path_allowed
bool rep_path_allowed(SEXP xp, std::string path, std::string agent);
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {
RcppExport SEXP _spiderbar_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;


@@ 64,15 64,15 @@ END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
    {"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1},
    {"_rep_rep_crawl_delays", (DL_FUNC) &_rep_rep_crawl_delays, 1},
    {"_rep_sitemaps", (DL_FUNC) &_rep_sitemaps, 1},
    {"_rep_rep_as_string", (DL_FUNC) &_rep_rep_as_string, 1},
    {"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3},
    {"_spiderbar_rep_parse", (DL_FUNC) &_spiderbar_rep_parse, 1},
    {"_spiderbar_rep_crawl_delays", (DL_FUNC) &_spiderbar_rep_crawl_delays, 1},
    {"_spiderbar_sitemaps", (DL_FUNC) &_spiderbar_sitemaps, 1},
    {"_spiderbar_rep_as_string", (DL_FUNC) &_spiderbar_rep_as_string, 1},
    {"_spiderbar_rep_path_allowed", (DL_FUNC) &_spiderbar_rep_path_allowed, 3},
    {NULL, NULL, 0}
};

RcppExport void R_init_rep(DllInfo *dll) {
RcppExport void R_init_spiderbar(DllInfo *dll) {
    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
    R_useDynamicSymbols(dll, FALSE);
}
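
The compiled side has to agree with the R side: on load, R looks for `R_init_spiderbar()` (the name is derived from the package), and the `.Call()` wrappers in `R/RcppExports.R` resolve the registered `_spiderbar_*` routines, so both the init function and the `CallEntries` table carry the new prefix. A quick check that the renamed symbols are registered (a sketch, assuming the package is installed and its DLL loads under the name "spiderbar"):

```r
library(spiderbar)
# Lists the .Call routines registered by R_init_spiderbar().
getDLLRegisteredRoutines("spiderbar")$.Call
# Or resolve a single renamed symbol directly:
getNativeSymbolInfo("_spiderbar_rep_parse", PACKAGE = "spiderbar")
```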

M tests/test-all.R => tests/test-all.R +1 -1
@@ 1,3 1,3 @@
library(testthat)
library(robotstxt)
test_check("rep")
test_check("spiderbar")
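
`testthat::test_check()` loads the installed package by name, hence the one-line change here. During development the same suite can be run from the source tree; a sketch, assuming devtools is available:

```r
# Runs the files under tests/testthat/ against the source tree via load_all().
devtools::test()
# The CRAN-style entry point used in tests/test-all.R, after R CMD INSTALL:
# testthat::test_check("spiderbar")
```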

R tests/testthat/test-rep.R => tests/testthat/test-spiderbar.R +5 -5
@@ 1,7 1,7 @@
context("basic functionality")
test_that("parsing and fetch testing and sitemaps work", {

  cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="rep")), collapse="\n")
  cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
  rt1 <- robxp(cdc)

  expect_that(rt1, is_a("robxp"))


@@ 9,20 9,20 @@ test_that("parsing and fetch testing and sitemaps work", {
  expect_that(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE))
  expect_that(can_fetch(rt1, "/_borders", "*"), equals(FALSE))

  imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="rep")), collapse="\n")
  imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
  rt2 <- robxp(imdb)
  cd <- crawl_delays(rt2)

  expect_that(cd, is_a("data.frame"))
  expect_equal(cd$crawl_delay, c(0.1, 3.0, -1.0))

  imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="rep"))
  imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
  rt2 <- robxp(imdb)

  gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="rep")), collapse="\n")
  gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
  rt3 <- robxp(gh)

  rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="rep")))
  rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))

  expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
  expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")