A => .Rbuildignore +11 -0
@@ 1,11 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^\.travis\.yml$
+^README\.Rmd$
+^README\.html$
+^NOTES\.Rmd$
+^NOTES\.html$
+^\.codecov\.yml$
+^README_files$
+^doc$
+^tmp$
A => .codecov.yml +1 -0
@@ 1,1 @@
+comment: false
A => .gitignore +8 -0
@@ 1,8 @@
+.DS_Store
+.Rproj.user
+.Rhistory
+.RData
+.Rproj
+src/*.o
+src/*.so
+src/*.dll
A => .travis.yml +6 -0
@@ 1,6 @@
+language: R
+sudo: false
+cache: packages
+
+after_success:
+- Rscript -e 'covr::codecov()'
A => DESCRIPTION +28 -0
@@ 1,28 @@
+Package: urlscan
+Type: Package
+Title: Analyze Websites and Resources They Request
+Version: 0.1.0
+Date: 2018-03-23
+Authors@R: c(
+ person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
+ comment = c(ORCID = "0000-0001-5670-2640"))
+ )
+Maintainer: Bob Rudis <bob@rud.is>
+Description: The <urlscan.io> service provides an 'API' enabling analysis of
+ websites and the resources they request. Much like the 'Inspector' of your
+ browser, <urlscan.io> will let you take a look at the individual resources
+ that are requested when a site is loaded. Tools are provided to search
+  public <urlscan.io> scan submissions.
+URL: https://github.com/hrbrmstr/urlscan
+BugReports: https://github.com/hrbrmstr/urlscan/issues
+Encoding: UTF-8
+License: AGPL-3
+Suggests:
+ testthat,
+ covr
+Depends:
+ R (>= 3.2.0)
+Imports:
+ httr,
+ jsonlite
+RoxygenNote: 6.0.1.9000
A => NAMESPACE +8 -0
@@ 1,8 @@
+# Generated by roxygen2: do not edit by hand
+
+export(urlscan_search)
+importFrom(httr,GET)
+importFrom(httr,content)
+importFrom(httr,stop_for_status)
+importFrom(httr,user_agent)
+importFrom(jsonlite,fromJSON)
A => NEWS.md +2 -0
@@ 1,2 @@
+0.1.0
+* Initial release
A => R/urlscan-package.R +15 -0
@@ 1,15 @@
+#' Analyze Websites and Resources They Request
+#'
+#' The <urlscan.io> service provides an 'API' enabling analysis of
+#' websites and the resources they request. Much like the 'Inspector' of your
+#' browser, <urlscan.io> will let you take a look at the individual resources
+#' that are requested when a site is loaded. Tools are provided to search
+#' public <urlscan.io> scan submissions.
+#'
+#' @md
+#' @name urlscan
+#' @docType package
+#' @author Bob Rudis (bob@@rud.is)
+#' @importFrom httr GET user_agent content stop_for_status
+#' @importFrom jsonlite fromJSON
+NULL
A => R/urlscan.R +51 -0
@@ 1,51 @@
+#' Perform a urlscan.io query
+#'
+#' urlscan.io uses an Elasticsearch back-end and enables querying by a number
+#' of fields, including:
+#'
+#' - `domain`: Domain (or a subdomain of it) is contacted in one of the requests
+#' - `page.domain`: Domain (or a subdomain of it) is the first domain to be contacted
+#' - `ip`: The IP or subnet are contacted in one request
+#' - `asn`: The autonomous system (AS) was contacted (_must_ use `AS` prefix!) (comma-separated for more than one)
+#' - `asnname`: The autonomous system with this name was contacted (comma-separated for more than one)
+#' - `filename`: This filename was requested
+#' - `hash`: A resource with this SHA256 hash was downloaded
+#' - `server`: The page contacts a host running this web server
+#' - `task.method`: one of "`manual`" or "`api`"; show manual (user) or API submissions
+#'
+#' The fields `ip`, `domain`, `url`, `asn`, `asnname`, `country` and `server` can also be prefixed with `page.`
+#' to only match the value for the first request/response (e.g. `page.server:nginx AND page.domain:de`).
+#' Furthermore, you can concatenate search-terms with `AND`, `OR`, etc.
+#'
+#' @md
+#' @param query query to run
+#' @param size number of results to return (default is `100`)
+#' @param offset offset of first result (for pagination) (default is `0`)
+#' @param sort sorting, specified via `$sort_field:$sort_order`. Default: `_score`
+#' @references <https://urlscan.io/about-api/#search>
+#' @note Search can only find **public** scans, there is no way to search for private scans.
+#' @export
+urlscan_search <- function(query, size=100, offset=0, sort=NULL) {
+
+ httr::GET(
+ url = "https://urlscan.io/api/v1/search/",
+ query = list(
+ q = query,
+ size = size,
+ offset = offset,
+ sort = sort
+ ),
+ httr::user_agent("urlscan #rstats package : https://github.com/hrbrmstr/urlscan")
+ ) -> res
+
+ httr::stop_for_status(res)
+
+ res <- httr::content(res, as="text")
+
+ res <- jsonlite::fromJSON(res)
+
+ class(res) <- c("urlscan", "list")
+
+ res
+
+}
\ No newline at end of file
A => R/util.R +12 -0
@@ 1,12 @@
+#' #' Turn urlscan object into a data frame
+#' #'
+#' #' param x `urlscan` object
+#' #' param ... unused
+#' #' export
+#' as.data.frame.urlscan <- function(x, ...) {
+#'
+#' res <- x$results
+#' class(res) <- c("tbl_df", "tbl", "data.frame")
+#' res
+#'
+#' }
\ No newline at end of file
A => README.Rmd +59 -0
@@ 1,59 @@
+---
+output: rmarkdown::github_document
+---
+
+# urlscan
+
+Analyze Websites and Resources They Request
+
+## Description
+
+WIP
+
+The <urlscan.io> service provides an 'API' enabling analysis of
+websites and the resources they request. Much like the 'Inspector' of your
+browser, <urlscan.io> will let you take a look at the individual resources
+that are requested when a site is loaded. Tools are provided to search
+public <urlscan.io> scan submissions.
+
+## What's Inside The Tin
+
+The following functions are implemented:
+
+- `urlscan_search`: Perform a urlscan.io query
+
+## Installation
+
+```{r eval=FALSE}
+devtools::install_github("hrbrmstr/urlscan")
+```
+
+```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
+options(width=120)
+```
+
+## Usage
+
+```{r message=FALSE, warning=FALSE, error=FALSE}
+library(urlscan)
+
+# current version
+packageVersion("urlscan")
+```
+
+```{r}
+library(tidyverse)
+
+x <- urlscan_search("domain:r-project.org")
+
+bind_cols(
+ select(x$results$task, -options) %>%
+ mutate(user_agent = x$results$task$options$useragent)
+ , x$results$stats, x$results$page
+) %>%
+ tbl_df() -> xdf
+
+xdf
+
+glimpse(xdf)
+```
\ No newline at end of file
A => README.md +110 -0
@@ 1,110 @@
+
+# urlscan
+
+Analyze Websites and Resources They Request
+
+## Description
+
+WIP
+
+The \<urlscan.io\> service provides an ‘API’ enabling analysis of
+websites and the resources they request. Much like the ‘Inspector’ of
+your browser, \<urlscan.io\> will let you take a look at the individual
+resources that are requested when a site is loaded. Tools are provided
+to search public \<urlscan.io\> scan submissions.
+
+## What’s Inside The Tin
+
+The following functions are implemented:
+
+ - `urlscan_search`: Perform a urlscan.io query
+
+## Installation
+
+``` r
+devtools::install_github("hrbrmstr/urlscan")
+```
+
+## Usage
+
+``` r
+library(urlscan)
+
+# current version
+packageVersion("urlscan")
+```
+
+ ## [1] '0.1.0'
+
+``` r
+library(tidyverse)
+```
+
+ ## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ──
+
+ ## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
+ ## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
+ ## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
+ ## ✔ readr 1.1.1 ✔ forcats 0.2.0
+
+ ## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
+ ## ✖ dplyr::filter() masks stats::filter()
+ ## ✖ dplyr::lag() masks stats::lag()
+
+``` r
+x <- urlscan_search("domain:r-project.org")
+
+bind_cols(
+ select(x$results$task, -options) %>%
+ mutate(user_agent = x$results$task$options$useragent)
+ , x$results$stats, x$results$page
+) %>%
+ tbl_df() -> xdf
+
+xdf
+```
+
+ ## # A tibble: 12 x 20
+ ## visibility method time source url user_agent uniqIPs consoleMsgs dataLength encodedDataLeng… requests country
+ ## <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int> <chr>
+ ## 1 public manual 2017… web https… Mozilla/5.0… 1 0 12758 676 2 AT
+ ## 2 public manual 2017… web https… Mozilla/5.0… 1 0 14396 676 2 AT
+ ## 3 public manual 2017… web https… Mozilla/5.0… 2 0 286138 97317 6 AT
+ ## 4 public manual 2017… web https… <NA> 1 0 0 0 1 AT
+ ## 5 public manual 2017… web https… <NA> 1 0 0 0 1 AT
+ ## 6 public manual 2017… web https… <NA> 1 0 5284 1813 2 AT
+ ## 7 public manual 2017… web https… <NA> 1 0 5284 1813 2 AT
+ ## 8 public manual 2017… web https… <NA> 1 0 4297 1640 2 AT
+ ## 9 public manual 2017… web https… <NA> 1 0 14722 6288 9 AT
+ ## 10 public manual 2017… web https… <NA> 2 0 285893 97695 6 AT
+ ## 11 public automa… 2017… hacke… https… <NA> 1 0 343270 101327 4 AT
+ ## 12 public automa… 2017… hacke… https… <NA> 1 0 345452 101840 4 AT
+ ## # ... with 8 more variables: server <chr>, city <chr>, domain <chr>, ip <chr>, asnname <chr>, asn <chr>, url1 <chr>,
+ ## # ptr <chr>
+
+``` r
+glimpse(xdf)
+```
+
+ ## Observations: 12
+ ## Variables: 20
+ ## $ visibility <chr> "public", "public", "public", "public", "public", "public", "public", "public", "public",...
+ ## $ method <chr> "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual",...
+ ## $ time <chr> "2017-12-29T17:23:39.785Z", "2017-12-20T15:52:22.902Z", "2017-11-10T13:40:19.991Z", "2017...
+ ## $ source <chr> "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "hackernews", "hack...
+ ## $ url <chr> "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec...
+ ## $ user_agent <chr> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) C...
+ ## $ uniqIPs <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1
+ ## $ consoleMsgs <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ ## $ dataLength <int> 12758, 14396, 286138, 0, 0, 5284, 5284, 4297, 14722, 285893, 343270, 345452
+ ## $ encodedDataLength <int> 676, 676, 97317, 0, 0, 1813, 1813, 1640, 6288, 97695, 101327, 101840
+ ## $ requests <int> 2, 2, 6, 1, 1, 2, 2, 2, 9, 6, 4, 4
+ ## $ country <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT"
+ ## $ server <chr> "Apache/2.4.10 (Debian)", "Apache/2.4.10 (Debian)", "Apache/2.4.10 (Debian)", "Apache/2.4...
+ ## $ city <chr> "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna",...
+ ## $ domain <chr> "cran.r-project.org", "cran.r-project.org", "www.r-project.org", "cran.r-project.org", "c...
+ ## $ ip <chr> "137.208.57.37", "137.208.57.37", "137.208.57.37", "137.208.57.37", "137.208.57.37", "137...
+ ## $ asnname <chr> "Welthandelsplatz 1, AT", "Welthandelsplatz 1, AT", "Welthandelsplatz 1, AT", "Welthandel...
+ ## $ asn <chr> "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776",...
+ ## $ url1 <chr> "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec...
+ ## $ ptr <chr> "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "...
A => man/urlscan.Rd +17 -0
@@ 1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/urlscan-package.R
+\docType{package}
+\name{urlscan}
+\alias{urlscan}
+\alias{urlscan-package}
+\title{Analyze Websites and Resources They Request}
+\description{
+The <urlscan.io> service provides an 'API' enabling analysis of
+websites and the resources they request. Much like the 'Inspector' of your
+browser, <urlscan.io> will let you take a look at the individual resources
+that are requested when a site is loaded. Tools are provided to search
+public <urlscan.io> scan submissions.
+}
+\author{
+Bob Rudis (bob@rud.is)
+}
A => man/urlscan_search.Rd +44 -0
@@ 1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/urlscan.R
+\name{urlscan_search}
+\alias{urlscan_search}
+\title{Perform a urlscan.io query}
+\usage{
+urlscan_search(query, size = 100, offset = 0, sort = NULL)
+}
+\arguments{
+\item{query}{query to run}
+
+\item{size}{number of results to return (default is \code{100})}
+
+\item{offset}{offset of first result (for pagination) (default is \code{0})}
+
+\item{sort}{sorting, specified via \code{$sort_field:$sort_order}. Default: \code{_score}}
+}
+\description{
+urlscan.io uses an Elasticsearch back-end and enables querying by a number
+of fields, including:
+}
+\details{
+\itemize{
+\item \code{domain}: Domain (or a subdomain of it) is contacted in one of the requests
+\item \code{page.domain}: Domain (or a subdomain of it) is the first domain to be contacted
+\item \code{ip}: The IP or subnet are contacted in one request
+\item \code{asn}: The autonomous system (AS) was contacted (\emph{must} use \code{AS} prefix!) (comma-separated for more than one)
+\item \code{asnname}: The autonomous system with this name was contacted (comma-separated for more than one)
+\item \code{filename}: This filename was requested
+\item \code{hash}: A resource with this SHA256 hash was downloaded
+\item \code{server}: The page contacts a host running this web server
+\item \code{task.method}: one of "\code{manual}" or "\code{api}"; show manual (user) or API submissions
+}
+
+The fields \code{ip}, \code{domain}, \code{url}, \code{asn}, \code{asnname}, \code{country} and \code{server} can also be prefixed with \code{page.}
+to only match the value for the first request/response (e.g. \code{page.server:nginx AND page.domain:de}).
+Furthermore, you can concatenate search-terms with \code{AND}, \code{OR}, etc.
+}
+\note{
+Search can only find \strong{public} scans, there is no way to search for private scans.
+}
+\references{
+\url{https://urlscan.io/about-api/#search}
+}
A => tests/test-all.R +2 -0
@@ 1,2 @@
+library(testthat)
+test_check("urlscan")
A => tests/testthat/test-urlscan.R +6 -0
@@ 1,6 @@
+context("minimal package functionality")
+test_that("we can do something", {
+
+ #expect_that(some_function(), is_a("data.frame"))
+
+})
A => urlscan.Rproj +21 -0
@@ 1,21 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageBuildArgs: --resave-data
+PackageRoxygenize: rd,collate,namespace