27643119f4cb1b32f455283ce1f9456eb92c5455 — hrbrmstr 4 months ago 2e4f5c1
addresses 2019-08-18 CRAN comments
8 files changed, 180 insertions(+), 203 deletions(-)

M .Rbuildignore
M DESCRIPTION
M R/spiderbar-package.R
M README.Rmd
M README.md
D cran-comments.md
M man/spiderbar.Rd
M src/robots.cpp
M .Rbuildignore => .Rbuildignore +1 -0
@@ 5,6 5,7 @@
 ^README\.*html$
 ^NOTES\.*Rmd$
 ^NOTES\.*html$
+^README\.md$
 ^\.codecov\.yml$
 ^README_files$
 ^doc$

M DESCRIPTION => DESCRIPTION +7 -6
@@ 1,18 1,18 @@
 Package: spiderbar
 Type: Package
 Title: Parse and Test Robots Exclusion Protocol Files and Rules
-Version: 0.2.1
-Date: 2017-11-17
+Version: 0.2.2
+Date: 2019-08-18
 Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut]
 Maintainer: Bob Rudis <bob@rud.is>
-Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents
+Description: The 'Robots Exclusion Protocol' <https://www.robotstxt.org/orig.html> documents
     a set of standards for allowing or excluding robot/spider crawling of different areas of
     site content. Tools are provided which wrap The 'rep-cpp' <https://github.com/seomoz/rep-cpp>
     C++ library for processing these 'robots.txt' files.
 SystemRequirements: C++11
 NeedsCompilation: yes
-URL: https://github.com/hrbrmstr/spiderbar
-BugReports: https://github.com/hrbrmstr/spiderbar/issues
+URL: https://gitlab.com/hrbrmstr/spiderbar
+BugReports: https://gitlab.com/hrbrmstr/spiderbar/issues
 License: MIT + file LICENSE
 Suggests:
     testthat,


@@ 20,7 20,8 @@ Suggests:
     robotstxt
 Depends:
     R (>= 3.2.0)
+Encoding: UTF-8
 Imports:
     Rcpp
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.1
 LinkingTo: Rcpp

M R/spiderbar-package.R => R/spiderbar-package.R +1 -1
@@ 1,6 1,6 @@
 #' Parse and Test Robots Exclusion Protocol Files and Rules
 #'
-#' The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set
+#' The 'Robots Exclusion Protocol' (<https://www.robotstxt.org/orig.html>) documents a set
 #' of standards for allowing or excluding robot/spider crawling of different areas of
 #' site content. Tools are provided which wrap The `rep-cpp` <https://github.com/seomoz/rep-cpp>
 #' C++ library for processing these `robots.txt` files.

M README.Rmd => README.Rmd +24 -31
@@ 1,39 1,37 @@
 ---
-output: rmarkdown::github_document
+output: 
+  rmarkdown::github_document:
+    df_print: kable
+editor_options: 
+  chunk_output_type: console
 ---
+```{r pkg-knitr-opts, include=FALSE}
+hrbrpkghelpr::global_opts()
+```
 
-[![Build Status](https://travis-ci.org/hrbrmstr/spiderbar.svg?branch=master)](https://travis-ci.org/hrbrmstr/spiderbar)
-[![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/spiderbar)
-![Coverage Status](http://img.shields.io/codecov/c/github/hrbrmstr/spiderbar/master.svg)
-
-# spiderbar
-
-Parse and Test Robots Exclusion Protocol Files and Rules
-
-## Description
+```{r badges, results='asis', echo=FALSE, cache=FALSE}
+hrbrpkghelpr::stinking_badges()
+```
 
-The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap The `rep-cpp` (<https://github.com/seomoz/rep-cpp>) C++ library for processing these `robots.txt` files.
+```{r description, results='asis', echo=FALSE, cache=FALSE}
+hrbrpkghelpr::yank_title_and_description()
+```
 
 - [`rep-cpp`](https://github.com/seomoz/rep-cpp)
 - [`url-cpp`](https://github.com/seomoz/url-cpp)
 
-## Tools
+## What's Inside the Tin
 
 The following functions are implemented:
 
-- `robxp`:	Parse a 'robots.txt' file & create a 'robxp' object
-- `can_fetch`:	Test URL paths against a 'robxp' 'robots.txt' object
-- `crawl_delays`:	Retrive all agent crawl delay values in a 'robxp' 'robots.txt' object
-- `sitemaps`:	Retrieve a character vector of sitemaps from a parsed robots.txt object
+```{r ingredients, results='asis', echo=FALSE, cache=FALSE}
+hrbrpkghelpr::describe_ingredients()
+```
 
 ## Installation
 
-```{r eval=FALSE}
-devtools::install_github("hrbrmstr/spiderbar")
-```
-
-```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
-options(width=120)
+```{r install-ex, results='asis', echo=FALSE, cache=FALSE}
+hrbrpkghelpr::install_block()
 ```
 
 ## Usage


@@ 78,17 76,12 @@ crawl_delays(imdb_rt)
 sitemaps(imdb_rt)
 ```
 
-## Test Results
-
-```{r message=FALSE, warning=FALSE, error=FALSE}
-library(rep)
-library(testthat)
-
-date()
+## spiderbar Metrics
 
-test_dir("tests/")
+```{r cloc, echo=FALSE}
+cloc::cloc_pkg_md()
 ```
 
 ## Code of Conduct
 
-Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
\ No newline at end of file
+Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.
\ No newline at end of file

M README.md => README.md +104 -104
@@ 1,38 1,64 @@
 
-[![Build Status](https://travis-ci.org/hrbrmstr/spiderbar.svg?branch=master)](https://travis-ci.org/hrbrmstr/spiderbar) [![Build status](https://ci.appveyor.com/api/projects/status/dakiw5y0xpq1m3bk?svg=true)](https://ci.appveyor.com/project/hrbrmstr/spiderbar) ![Coverage Status](http://img.shields.io/codecov/c/github/hrbrmstr/spiderbar/master.svg)
-
-spiderbar
-=========
+[![Project Status: Active – The project has reached a stable, usable
+state and is being actively
+developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
+[![Signed
+by](https://img.shields.io/badge/Keybase-Verified-brightgreen.svg)](https://keybase.io/hrbrmstr)
+![Signed commit
+%](https://img.shields.io/badge/Signed_Commits-0%25-lightgrey.svg)
+[![Linux build
+Status](https://travis-ci.org/hrbrmstr/spiderbar.svg?branch=master)](https://travis-ci.org/hrbrmstr/spiderbar)
+[![Windows build
+status](https://ci.appveyor.com/api/projects/status/github/hrbrmstr/spiderbar?svg=true)](https://ci.appveyor.com/project/hrbrmstr/spiderbar)
+[![Coverage
+Status](https://codecov.io/gh/hrbrmstr/spiderbar/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/spiderbar)
+[![cran
+checks](https://cranchecks.info/badges/worst/spiderbar)](https://cranchecks.info/pkgs/spiderbar)
+[![CRAN
+status](https://www.r-pkg.org/badges/version/spiderbar)](https://www.r-pkg.org/pkg/spiderbar)
+![Minimal R
+Version](https://img.shields.io/badge/R%3E%3D-3.2.0-blue.svg)
+![License](https://img.shields.io/badge/License-MIT-blue.svg)
+
+# spiderbar
 
 Parse and Test Robots Exclusion Protocol Files and Rules
 
-Description
------------
+## Description
 
-The 'Robots Exclusion Protocol' (<http://www.robotstxt.org/orig.html>) documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap The `rep-cpp` (<https://github.com/seomoz/rep-cpp>) C++ library for processing these `robots.txt` files.
+The ‘Robots Exclusion Protocol’ <https://www.robotstxt.org/orig.html>
+documents a set of standards for allowing or excluding robot/spider
+crawling of different areas of site content. Tools are provided which
+wrap The ‘rep-cpp’ <https://github.com/seomoz/rep-cpp> C++ library for
+processing these ‘robots.txt’ files.
 
--   [`rep-cpp`](https://github.com/seomoz/rep-cpp)
--   [`url-cpp`](https://github.com/seomoz/url-cpp)
+  - [`rep-cpp`](https://github.com/seomoz/rep-cpp)
+  - [`url-cpp`](https://github.com/seomoz/url-cpp)
 
-Tools
------
+## What’s Inside the Tin
 
 The following functions are implemented:
 
--   `robxp`: Parse a 'robots.txt' file & create a 'robxp' object
--   `can_fetch`: Test URL paths against a 'robxp' 'robots.txt' object
--   `crawl_delays`: Retrive all agent crawl delay values in a 'robxp' 'robots.txt' object
--   `sitemaps`: Retrieve a character vector of sitemaps from a parsed robots.txt object
+  - `can_fetch`: Test URL paths against a robxp robots.txt object
+  - `crawl_delays`: Retrieve all agent crawl delay values in a robxp
+    robots.txt object
+  - `print.robxp`: Custom printer for ‘robxp’ objects
+  - `robxp`: Parse a ‘robots.txt’ file & create a ‘robxp’ object
+  - `sitemaps`: Retrieve a character vector of sitemaps from a parsed
+    robots.txt object
 
-Installation
-------------
+## Installation
 
 ``` r
-devtools::install_github("hrbrmstr/spiderbar")
+remotes::install_gitlab("hrbrmstr/spiderbar")
+# or
+remotes::install_github("hrbrmstr/spiderbar")
 ```
 
-Usage
------
+NOTE: To use the ‘remotes’ install options you will need to have the
+[{remotes} package](https://github.com/r-lib/remotes) installed.
+
+## Usage
 
 ``` r
 library(spiderbar)


@@ 40,132 66,106 @@ library(robotstxt)
 
 # current version
 packageVersion("spiderbar")
-```
+## [1] '0.2.2'
 
-    ## [1] '0.2.0'
-
-``` r
 # use helpers from the robotstxt package
 
 rt <- robxp(get_robotstxt("https://cdc.gov"))
 
 print(rt)
-```
-
-    ## <Robots Exclusion Protocol Object>
+## <Robots Exclusion Protocol Object>
 
-``` r
 # or 
 
 rt <- robxp(url("https://cdc.gov/robots.txt"))
 
 can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")
-```
+## [1] TRUE
 
-    ## [1] TRUE
-
-``` r
 can_fetch(rt, "/_borders", "*")
-```
+## [1] FALSE
 
-    ## [1] FALSE
-
-``` r
 gh_rt <- robxp(robotstxt::get_robotstxt("github.com"))
 
 can_fetch(gh_rt, "/humans.txt", "*") # TRUE
-```
-
-    ## [1] TRUE
+## [1] TRUE
 
-``` r
 can_fetch(gh_rt, "/login", "*") # FALSE
-```
+## [1] FALSE
 
-    ## [1] FALSE
-
-``` r
 can_fetch(gh_rt, "/oembed", "CCBot") # FALSE
-```
+## [1] FALSE
 
-    ## [1] FALSE
-
-``` r
 can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
-```
-
-    ## [1]  TRUE FALSE FALSE
+## [1]  TRUE FALSE FALSE
 
-``` r
 crawl_delays(gh_rt)
 ```
 
-    ##                agent crawl_delay
-    ## 1             yandex          -1
-    ## 2         twitterbot          -1
-    ## 3              ccbot          -1
-    ## 4        mail.ru_bot          -1
-    ## 5         telefonica          -1
-    ## 6              slurp          -1
-    ## 7          seznambot          -1
-    ## 8         sanddollar          -1
-    ## 9             coccoc          -1
-    ## 10       ia_archiver          -1
-    ## 11          swiftbot          -1
-    ## 12 red-app-gsa-p-one          -1
-    ## 13          naverbot          -1
-    ## 14            msnbot          -1
-    ## 15             teoma          -1
-    ## 16                 *          -1
-    ## 17  intuitgsacrawler          -1
-    ## 18           bingbot          -1
-    ## 19            daumoa          -1
-    ## 20         googlebot          -1
-    ## 21           httrack          -1
-    ## 22       duckduckbot          -1
-    ## 23        etaospider          -1
-    ## 24          rogerbot          -1
-    ## 25            dotbot          -1
+<div class="kable-table">
+
+| agent             | crawl\_delay |
+| :---------------- | -----------: |
+| yandex            |          \-1 |
+| twitterbot        |          \-1 |
+| ccbot             |          \-1 |
+| mail.ru\_bot      |          \-1 |
+| telefonica        |          \-1 |
+| slurp             |          \-1 |
+| seznambot         |          \-1 |
+| sanddollar        |          \-1 |
+| coccoc            |          \-1 |
+| ia\_archiver      |          \-1 |
+| swiftbot          |          \-1 |
+| red-app-gsa-p-one |          \-1 |
+| naverbot          |          \-1 |
+| msnbot            |          \-1 |
+| teoma             |          \-1 |
+| \*                |          \-1 |
+| intuitgsacrawler  |          \-1 |
+| bingbot           |          \-1 |
+| daumoa            |          \-1 |
+| googlebot         |          \-1 |
+| httrack           |          \-1 |
+| duckduckbot       |          \-1 |
+| etaospider        |          \-1 |
+| rogerbot          |          \-1 |
+| dotbot            |          \-1 |
+
+</div>
 
 ``` r
+
 imdb_rt <- robxp(robotstxt::get_robotstxt("imdb.com"))
 
 crawl_delays(imdb_rt)
 ```
 
-    ##      agent crawl_delay
-    ## 1    slurp         0.1
-    ## 2 scoutjet         3.0
-    ## 3        *        -1.0
+<div class="kable-table">
 
-``` r
-sitemaps(imdb_rt)
-```
-
-    ## [1] "http://www.imdb.com/sitemap_US_index.xml.gz"
+| agent | crawl\_delay |
+| :---- | -----------: |
+| \*    |          \-1 |
 
-Test Results
-------------
+</div>
 
 ``` r
-library(rep)
-library(testthat)
 
-date()
+sitemaps(imdb_rt)
+## character(0)
 ```
 
-    ## [1] "Sun Sep 24 08:39:04 2017"
-
-``` r
-test_dir("tests/")
-```
+## spiderbar Metrics
 
-    ## testthat results ========================================================================================================
-    ## OK: 8 SKIPPED: 0 FAILED: 0
-    ## 
-    ## DONE ===================================================================================================================
+| Lang         | \# Files |  (%) |  LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
+| :----------- | -------: | ---: | ---: | ---: | ----------: | ---: | -------: | ---: |
+| C++          |        9 | 0.38 | 1763 | 0.78 |         257 | 0.55 |      258 | 0.38 |
+| C/C++ Header |        7 | 0.29 |  395 | 0.18 |         152 | 0.33 |      280 | 0.42 |
+| R            |        7 | 0.29 |   68 | 0.03 |          26 | 0.06 |      101 | 0.15 |
+| Rmd          |        1 | 0.04 |   23 | 0.01 |          31 | 0.07 |       33 | 0.05 |
 
-Code of Conduct
----------------
+## Code of Conduct
 
-Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
+Please note that this project is released with a Contributor Code of
+Conduct. By participating in this project you agree to abide by its
+terms.

D cran-comments.md => cran-comments.md +0 -27
@@ 1,27 0,0 @@
-## Test environments
-* local OS X install, R 3.4.1
-* ubuntu 14.04 (on travis-ci), R oldrel, release & devel
-* appveyor (windows)
-* win-builder (devel and release)
-* rhub (Windows)
-
-## R CMD check results
-
-0 errors | 0 warnings | 1 note
-
-* This is a new release.
-
-## Reverse dependencies
-
-This is a new release, so there are no reverse dependencies.
-
----
-
-* Package name, Title & Description were changed/fided 
-  as requested by CRAN (Swetlana Herbrandt)
-* Code coverage is provided via codecov.io: https://codecov.io/gh/hrbrmstr/spiderbar
-* Travis-CI build/test results are at https://travis-ci.org/hrbrmstr/spiderbar
-* Appveyor build/test results are at https://ci.appveyor.com/project/hrbrmstr/spiderbar
-* No external network calls are made for the robots.txt tests or examples as there
-  are four files in the inst/extdata folder which are used instead.
-* The README.md generation does exercise the external URL tests.

M man/spiderbar.Rd => man/spiderbar.Rd +1 -1
@@ 6,7 6,7 @@
 \alias{spiderbar-package}
 \title{Parse and Test Robots Exclusion Protocol Files and Rules}
 \description{
-The 'Robots Exclusion Protocol' (\url{http://www.robotstxt.org/orig.html}) documents a set
+The 'Robots Exclusion Protocol' (\url{https://www.robotstxt.org/orig.html}) documents a set
 of standards for allowing or excluding robot/spider crawling of different areas of
 site content. Tools are provided which wrap The \code{rep-cpp} \url{https://github.com/seomoz/rep-cpp}
 C++ library for processing these `robots.txt` files.

M src/robots.cpp => src/robots.cpp +42 -33
@@ 14,45 14,54 @@
 namespace Rep
 {
 
-    void Robots::strip(std::string& string)
-    {
-        string.erase(string.begin(), std::find_if(string.begin(), string.end(),
-            std::not1(std::ptr_fun<int, int>(std::isspace))));
-        string.erase(std::find_if(string.rbegin(), string.rend(),
-            std::not1(std::ptr_fun<int, int>(std::isspace))).base(), string.end());
-    }
+  void Robots::strip(std::string& string) {
 
-    bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
-    {
-        while (getline(stream, key))
-        {
-            size_t index = key.find('#');
-            if (index != std::string::npos)
-            {
-                key.resize(index);
-            }
+    string.erase(
+      string.begin(),
+      std::find_if(
+          string.begin(), string.end(),
+          [](int c) { return(!std::isspace(c)); }
+      )
+    );
 
-            // Find the colon and divide it into key and value, skipping malformed lines
-            index = key.find(':');
-            if (index == std::string::npos)
-            {
-                continue;
-            }
+    string.erase(
+      std::find_if(
+        string.rbegin(), string.rend(),
+        [](int c) { return(!std::isspace(c)); }
+      ).base(), string.end()
+    );
 
-            value.assign(key.begin() + index + 1, key.end());
-            key.resize(index);
+  }
 
-            // Strip whitespace off of each
-            strip(key);
-            strip(value);
+  bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) {
 
-            // Lowercase the key
-            std::transform(key.begin(), key.end(), key.begin(), ::tolower);
+      while (getline(stream, key)) {
 
-            return true;
-        }
-        return false;
-    }
+          size_t index = key.find('#');
+
+          if (index != std::string::npos) key.resize(index);
+
+          // Find the colon and divide it into key and value, skipping malformed lines
+          index = key.find(':');
+          if (index == std::string::npos) continue;
+
+          value.assign(key.begin() + index + 1, key.end());
+          key.resize(index);
+
+          // Strip whitespace off of each
+          strip(key);
+          strip(value);
+
+          // Lowercase the key
+          std::transform(key.begin(), key.end(), key.begin(), ::tolower);
+
+          return true;
+
+      }
+
+      return false;
+
+  }
 
     Robots::Robots(const std::string& content): agents_(), sitemaps_(), default_(agents_["*"])
     {