M .Rbuildignore => .Rbuildignore +1 -0
@@ 13,3 13,4 @@
^appveyor\.yml$
^codecov\.yml$
^cran-comments\.md$
+^CRAN-RELEASE$
A CRAN-RELEASE => CRAN-RELEASE +2 -0
@@ 0,0 1,2 @@
+This package was submitted to CRAN on 2020-05-29.
+Once it is accepted, delete this file and tag the release (commit fb27ce1dec).
M DESCRIPTION => DESCRIPTION +6 -6
@@ 14,14 14,14 @@ NeedsCompilation: yes
URL: https://gitlab.com/hrbrmstr/spiderbar
BugReports: https://gitlab.com/hrbrmstr/spiderbar/issues
License: MIT + file LICENSE
-Suggests:
- testthat,
+Suggests:
covr,
- robotstxt
-Depends:
+ robotstxt,
+ tinytest
+Depends:
R (>= 3.2.0)
Encoding: UTF-8
-Imports:
+Imports:
Rcpp
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
LinkingTo: Rcpp
M NEWS.md => NEWS.md +6 -0
@@ 1,3 1,9 @@
+0.2.3
+* fix by Peter Meissner for fetching case
+* custom print method now returns the object
+* fixed spelling
+* ensured there's a roxygen return for every function
+
0.2.0
* Added crawl delay extraction
* Made all examples local so CRAN doesn't have to hit actual websites
M R/can-fetch.r => R/can-fetch.r +1 -0
@@ 9,6 9,7 @@
#' @param path path to test
#' @param user_agent user agent to test
#' @export
+#' @return logical vector indicating whether you have permission to fetch the content
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
#' package="spiderbar")), collapse="\n")
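The new `@return` line documents that `can_fetch()` returns one logical per path. A quick sketch of that vectorized behavior, reusing the bundled GitHub robots.txt from the package's `extdata` (the specific TRUE/FALSE values depend on the bundled file and the parsing fix in this release):

``` r
library(spiderbar)

# parse the sample robots.txt shipped with the package
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
                                   package = "spiderbar")), collapse = "\n")
gh_rt <- robxp(gh)

# one logical element per tested path (default user agent is "*")
can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
```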
M R/crawl-delay.r => R/crawl-delay.r +1 -1
@@ 1,4 1,4 @@
-#' Retrive all agent crawl delay values in a `robxp` `robots.txt` object
+#' Retrieve all agent crawl delay values in a `robxp` `robots.txt` object
#'
#' @md
#' @param obj `robxp` object
M R/robxp.r => R/robxp.r +5 -3
@@ 8,10 8,11 @@
#' a `connection` object that will be passed to [readLines()], the result of which
#' will be concatenated into a single string and parsed and the connection will be closed.
#' @export
+#' @return a classed object holding an external pointer to parsed robots.txt data
#' @examples
-#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
-#' package="spiderbar")), collapse="\n")
-#' rt <- robxp(imdb)
+# imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
+# package="spiderbar")), collapse="\n")
+# rt <- robxp(imdb)
robxp <- function(x) {
if (inherits(x, "connection")) {
@@ 38,4 39,5 @@ robxp <- function(x) {
#' @export
print.robxp <- function(x, ...) {
cat("<Robots Exclusion Protocol Object>")
+ invisible(x)
}
\ No newline at end of file
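The added `invisible(x)` follows the usual S3 convention that a `print()` method returns its argument invisibly, so the object still flows through assignment or piping after being printed (this is the NEWS item "custom print method now returns the object"). A minimal sketch of the pattern, using a placeholder class name:

``` r
# generic shape of a well-behaved S3 print method (placeholder class for illustration)
print.myclass <- function(x, ...) {
  cat("<myclass object>\n")
  invisible(x)  # return the input so `y <- print(obj)` and pipelines keep working
}
```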
M README.md => README.md +14 -37
@@ 5,7 5,7 @@ developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.re
[](https://keybase.io/hrbrmstr)

+%](https://img.shields.io/badge/Signed_Commits-89%25-lightgrey.svg)
[](https://travis-ci.org/hrbrmstr/spiderbar)
[![Windows build
@@ 40,7 40,7 @@ processing these ‘robots.txt’ files.
The following functions are implemented:
- `can_fetch`: Test URL paths against a robxp robots.txt object
- - `crawl_delays`: Retrive all agent crawl delay values in a robxp
+ - `crawl_delays`: Retrieve all agent crawl delay values in a robxp
robots.txt object
  - `print.robxp`: Custom printer for ‘robxp’ objects
- `robxp`: Parse a ‘robots.txt’ file & create a ‘robxp’ object
@@ 50,7 50,7 @@ The following functions are implemented:
## Installation
``` r
-install.packages("spiderbar", repos = "https://cinc.rud.is")
+install.packages("spiderbar", repos = c("https://cinc.rud.is", "https://cloud.r-project.org/"))
# or
remotes::install_git("https://git.rud.is/hrbrmstr/spiderbar.git")
# or
@@ 74,7 74,7 @@ library(robotstxt)
# current version
packageVersion("spiderbar")
-## [1] '0.2.2'
+## [1] '0.2.3'
# use helpers from the robotstxt package
@@ 99,46 99,23 @@ can_fetch(gh_rt, "/humans.txt", "*") # TRUE
## [1] TRUE
can_fetch(gh_rt, "/login", "*") # FALSE
-## [1] FALSE
+## [1] TRUE
can_fetch(gh_rt, "/oembed", "CCBot") # FALSE
-## [1] FALSE
+## [1] TRUE
can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
-## [1] TRUE FALSE FALSE
+## [1] TRUE TRUE TRUE
crawl_delays(gh_rt)
```
<div class="kable-table">
-| agent | crawl\_delay |
-| :---------------- | -----------: |
-| yandex | \-1 |
-| twitterbot | \-1 |
-| ccbot | \-1 |
-| mail.ru\_bot | \-1 |
-| telefonica | \-1 |
-| slurp | \-1 |
-| seznambot | \-1 |
-| sanddollar | \-1 |
-| coccoc | \-1 |
-| ia\_archiver | \-1 |
-| swiftbot | \-1 |
-| red-app-gsa-p-one | \-1 |
-| naverbot | \-1 |
-| msnbot | \-1 |
-| teoma | \-1 |
-| \* | \-1 |
-| intuitgsacrawler | \-1 |
-| bingbot | \-1 |
-| daumoa | \-1 |
-| googlebot | \-1 |
-| httrack | \-1 |
-| duckduckbot | \-1 |
-| etaospider | \-1 |
-| rogerbot | \-1 |
-| dotbot | \-1 |
+| agent | crawl\_delay |
+| :---- | -----------: |
+| baidu | 1 |
+| \* | \-1 |
</div>
@@ 167,9 144,9 @@ sitemaps(imdb_rt)
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :----------- | -------: | ---: | ---: | ---: | ----------: | ---: | -------: | ---: |
-| C++ | 9 | 0.38 | 1763 | 0.78 | 257 | 0.55 | 258 | 0.38 |
-| C/C++ Header | 7 | 0.29 | 395 | 0.18 | 152 | 0.33 | 280 | 0.42 |
-| R | 7 | 0.29 | 68 | 0.03 | 26 | 0.06 | 101 | 0.15 |
+| C++ | 9 | 0.39 | 1763 | 0.79 | 257 | 0.56 | 258 | 0.38 |
+| C/C++ Header | 7 | 0.30 | 395 | 0.18 | 152 | 0.33 | 280 | 0.42 |
+| R | 6 | 0.26 | 47 | 0.02 | 18 | 0.04 | 101 | 0.15 |
| Rmd | 1 | 0.04 | 23 | 0.01 | 31 | 0.07 | 33 | 0.05 |
## Code of Conduct
A cran-comments.md => cran-comments.md +12 -0
@@ 0,0 1,12 @@
+## Test environments
+* local R installation, R 4.0.1
+* ubuntu 16.04 (on travis-ci), R 4.0.1
+* win-builder (devel)
+
+## R CMD check results
+
+0 errors | 0 warnings | 1 note
+
+* This is an update release to fix a parsing edge case
+* Ensured all functions have a @return block
+* Fixed spelling mistakes
R tests/testthat/test-spiderbar.R => inst/tinytest/test_spiderbar.R +19 -21
@@ 1,31 1,29 @@
-context("basic functionality")
-test_that("parsing and fetch testing and sitemaps work", {
+library(spiderbar)
- cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
- rt1 <- robxp(cdc)
+cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
+rt1 <- robxp(cdc)
- expect_that(rt1, is_a("robxp"))
+expect_true(inherits(rt1, "robxp"))
- expect_that(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE))
- expect_that(can_fetch(rt1, "/_borders", "*"), equals(FALSE))
+expect_true(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"))
+expect_false(can_fetch(rt1, "/_borders", "*"))
- imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
- rt2 <- robxp(imdb)
- cd <- crawl_delays(rt2)
+imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
+rt2 <- robxp(imdb)
+cd <- crawl_delays(rt2)
- expect_that(cd, is_a("data.frame"))
- expect_equal(sort(cd$crawl_delay), sort(c(0.1, 3.0, -1.0)))
+expect_true(inherits(cd, "data.frame"))
+expect_equal(sort(cd$crawl_delay), sort(c(0.1, 3.0, -1.0)))
- imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
- rt2 <- robxp(imdb)
+imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
+rt2 <- robxp(imdb)
- gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
- rt3 <- robxp(gh)
+gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
+rt3 <- robxp(gh)
- rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))
+rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))
- expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
- expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
- expect_equal(sitemaps(rt3), character(0))
+expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
+expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
+expect_equal(sitemaps(rt3), character(0))
-})
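With the testthat wrapper gone, the converted expectations run top to bottom as a plain R script. During development the file can be exercised directly with tinytest (a sketch, assuming the commands are run from a source checkout with spiderbar installed):

``` r
# run just the converted file, or the whole test directory, interactively
tinytest::run_test_file("inst/tinytest/test_spiderbar.R")
tinytest::run_test_dir("inst/tinytest")
```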
M man/can_fetch.Rd => man/can_fetch.Rd +4 -1
@@ 13,6 13,9 @@ can_fetch(obj, path = "/", user_agent = "*")
\item{user_agent}{user agent to test}
}
+\value{
+logical vector indicating whether you have permission to fetch the content
+}
\description{
Provide a character vector of URL paths plus optional user agent and this function will
return a logical vector indicating whether you have permission to fetch the content
@@ 20,7 23,7 @@ at the respective path.
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
- package="spiderbar")), collapse="\\n")
+ package="spiderbar")), collapse="\n")
gh_rt <- robxp(gh)
can_fetch(gh_rt, "/humans.txt", "*") # TRUE
M man/crawl_delays.Rd => man/crawl_delays.Rd +4 -4
@@ 2,7 2,7 @@
% Please edit documentation in R/crawl-delay.r
\name{crawl_delays}
\alias{crawl_delays}
-\title{Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
+\title{Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
\usage{
crawl_delays(obj)
}
@@ 13,19 13,19 @@ crawl_delays(obj)
data frame of agents and their crawl delays
}
\description{
-Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object
+Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object
}
\note{
\code{-1} will be returned for any listed agent \emph{without} a crawl delay setting
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
- package="spiderbar")), collapse="\\n")
+ package="spiderbar")), collapse="\n")
gh_rt <- robxp(gh)
crawl_delays(gh_rt)
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
- package="spiderbar")), collapse="\\n")
+ package="spiderbar")), collapse="\n")
imdb_rt <- robxp(imdb)
crawl_delays(imdb_rt)
}
M man/robxp.Rd => man/robxp.Rd +3 -5
@@ 12,12 12,10 @@ _or_ a length >1 character vector that will concatenated into a single string _o
a `connection` object that will be passed to [readLines()], the result of which
will be concatenated into a single string and parsed and the connection will be closed.}
}
+\value{
+a classed object holding an external pointer to parsed robots.txt data
+}
\description{
This function takes in a single element character vector and parses it into
a `robxp` object.
}
-\examples{
-imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
- package="spiderbar")), collapse="\\n")
-rt <- robxp(imdb)
-}
M man/sitemaps.Rd => man/sitemaps.Rd +1 -1
@@ 17,7 17,7 @@ Retrieve a character vector of sitemaps from a parsed robots.txt object
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
- package="rep")), collapse="\\n")
+ package="rep")), collapse="\n")
rt <- robxp(imdb)
sitemaps(rt)
}
M man/spiderbar.Rd => man/spiderbar.Rd +0 -1
@@ 3,7 3,6 @@
\docType{package}
\name{spiderbar}
\alias{spiderbar}
-\alias{spiderbar-package}
\title{Parse and Test Robots Exclusion Protocol Files and Rules}
\description{
The 'Robots Exclusion Protocol' (\url{https://www.robotstxt.org/orig.html}) documents a set
D tests/test-all.R => tests/test-all.R +0 -3
@@ 1,3 0,0 @@
-library(testthat)
-library(robotstxt)
-test_check("spiderbar")
A tests/tinytest.R => tests/tinytest.R +5 -0
@@ 0,0 1,5 @@
+
+if ( requireNamespace("tinytest", quietly=TRUE) ){
+ tinytest::test_package("spiderbar")
+}
+
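The `requireNamespace()` guard keeps `R CMD check` from failing on systems where tinytest (a Suggests dependency) isn't installed. Once the package itself is installed, the same entry point can be used interactively and its result inspected; a sketch, assuming tinytest is available:

``` r
# run the installed package's tests and look at the results object
results <- tinytest::test_package("spiderbar")
tinytest::all_pass(results)   # single TRUE/FALSE
as.data.frame(results)        # one row per expectation, for drilling into failures
```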