~hrbrmstr/spiderbar

4848ff5e681f65c5a1ba5bb1b79b70974346e23f — boB Rudis 4 years ago 2de5d5c
custom function to retrieve all crawl_delay settings for all user agents
7 files changed, 73 insertions(+), 1 deletions(-)

M NAMESPACE
M R/RcppExports.R
A R/cd.r
A man/crawl_delays.Rd
M src/RcppExports.cpp
M src/repmain.cpp
M src/robots.h
M NAMESPACE => NAMESPACE +1 -0
@@ 2,6 2,7 @@

S3method(print,robxp)
export(can_fetch)
export(crawl_delays)
export(robxp)
importFrom(Rcpp,sourceCpp)
useDynLib(rep, .registration=TRUE)

M R/RcppExports.R => R/RcppExports.R +8 -0
@@ 9,6 9,14 @@ rep_parse <- function(content) {
    .Call(`_rep_rep_parse`, content)
}

#' Get delays
#'
#' Auto-generated Rcpp wrapper (RcppExports.R): forwards to the compiled
#' `rep_crawl_delays` routine in src/repmain.cpp via `.Call()`.
#' Internal only (`@noRd`); users reach this through `crawl_delays()`.
#'
#' @noRd
#'
rep_crawl_delays <- function(xp) {
    .Call(`_rep_rep_crawl_delays`, xp)
}

#' Path allowed
#'
#' @noRd

A R/cd.r => R/cd.r +15 -0
@@ 0,0 1,15 @@
#' Get all crawl_delay values
#'
#' Retrieves the `crawl_delay` setting for every user agent present in a
#' parsed `robots.txt` object.
#'
#' @md
#' @param obj `robxp` object
#' @return numeric vector of crawl delay values (one per user agent), or
#'   `NULL` if `obj` is not a `robxp` object
#' @export
crawl_delays <- function(obj) {

  # Guard clause: anything other than a parsed robots.txt object yields
  # NULL rather than an error (defensive, matches package style).
  if (!inherits(obj, "robxp")) return(NULL)

  rep_crawl_delays(obj)

}
\ No newline at end of file

A man/crawl_delays.Rd => man/crawl_delays.Rd +14 -0
@@ 0,0 1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cd.r
\name{crawl_delays}
\alias{crawl_delays}
\title{Get all crawl_delay}
\usage{
crawl_delays(obj)
}
\arguments{
\item{obj}{\code{robxp} object}
}
\description{
Get all crawl_delay
}

M src/RcppExports.cpp => src/RcppExports.cpp +12 -0
@@ 16,6 16,17 @@ BEGIN_RCPP
    return rcpp_result_gen;
END_RCPP
}
// rep_crawl_delays
// Auto-generated Rcpp glue (RcppExports.cpp): unwraps the SEXP argument,
// calls the exported C++ function, and wraps its std::vector<float>
// result back into an R object. Registered in CallEntries below.
std::vector<float> rep_crawl_delays(SEXP xp);
RcppExport SEXP _rep_rep_crawl_delays(SEXP xpSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP);
    rcpp_result_gen = Rcpp::wrap(rep_crawl_delays(xp));
    return rcpp_result_gen;
END_RCPP
}
// rep_path_allowed
bool rep_path_allowed(SEXP xp, std::string path, std::string agent);
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {


@@ 32,6 43,7 @@ END_RCPP

static const R_CallMethodDef CallEntries[] = {
    {"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1},
    {"_rep_rep_crawl_delays", (DL_FUNC) &_rep_rep_crawl_delays, 1},
    {"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3},
    {NULL, NULL, 0}
};

M src/repmain.cpp => src/repmain.cpp +22 -0
@@ 2,6 2,8 @@
using namespace Rcpp;

#include "url.h"
#include "agent.h"
#include "directive.h"
#include "robots.h"

//' Parse robots.txt


@@ 14,6 16,26 @@ SEXP rep_parse(std::string content) {
  return(ptr);
}

//' Get delays
//'
//' Collect the crawl_delay value of every user agent in a parsed
//' robots.txt. `xp` is the external pointer produced by rep_parse().
//'
//' @noRd
//'
// [[Rcpp::export]]
std::vector<float> rep_crawl_delays(SEXP xp) {

  // Re-wrap the external pointer; does not take (additional) ownership.
  Rcpp::XPtr<Rep::Robots> ptr(xp);

  std::vector<float> vals;
  vals.reserve(ptr->agents_.size());

  // const ref: the original `auto kv` copied each (name, Agent) pair —
  // including the whole Agent — on every iteration.
  for (const auto& kv : ptr->agents_) {
    vals.push_back(kv.second.delay());
  }

  return(vals);

}


//' Path allowed
//'

M src/robots.h => src/robots.h +1 -1
@@ 15,6 15,7 @@ namespace Rep
    public:
        typedef std::unordered_map<std::string, Agent> agent_map_t;
        typedef std::vector<std::string> sitemaps_t;
        agent_map_t agents_;

        /**
         * Create a robots.txt from a utf-8-encoded string.


@@ 60,7 61,6 @@ namespace Rep
        static bool getpair(
            std::istringstream& stream, std::string& key, std::string& value);

        agent_map_t agents_;
        sitemaps_t sitemaps_;
        Agent& default_;
    };