From a60e6c38011f19dada7811352d7fa791e0fd5b3b Mon Sep 17 00:00:00 2001 From: boB Rudis Date: Fri, 23 Mar 2018 10:31:32 -0400 Subject: [PATCH] initial commit --- .Rbuildignore | 11 ++++ .codecov.yml | 1 + .gitignore | 8 +++ .travis.yml | 6 ++ DESCRIPTION | 28 +++++++++ NAMESPACE | 8 +++ NEWS.md | 2 + R/urlscan-package.R | 15 +++++ R/urlscan.R | 51 ++++++++++++++++ R/util.R | 12 ++++ README.Rmd | 59 ++++++++++++++++++ README.md | 110 ++++++++++++++++++++++++++++++++++ man/urlscan.Rd | 17 ++++++ man/urlscan_search.Rd | 44 ++++++++++++++ tests/test-all.R | 2 + tests/testthat/test-urlscan.R | 6 ++ urlscan.Rproj | 21 +++++++ 17 files changed, 401 insertions(+) create mode 100644 .Rbuildignore create mode 100644 .codecov.yml create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 DESCRIPTION create mode 100644 NAMESPACE create mode 100644 NEWS.md create mode 100644 R/urlscan-package.R create mode 100644 R/urlscan.R create mode 100644 R/util.R create mode 100644 README.Rmd create mode 100644 README.md create mode 100644 man/urlscan.Rd create mode 100644 man/urlscan_search.Rd create mode 100644 tests/test-all.R create mode 100644 tests/testthat/test-urlscan.R create mode 100644 urlscan.Rproj diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..70baf05 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,11 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.travis\.yml$ +^README\.*Rmd$ +^README\.*html$ +^NOTES\.*Rmd$ +^NOTES\.*html$ +^\.codecov\.yml$ +^README_files$ +^doc$ +^tmp$ diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..69cb760 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1 @@ +comment: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cce1f17 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.DS_Store +.Rproj.user +.Rhistory +.RData +.Rproj +src/*.o +src/*.so +src/*.dll diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f93993f --- /dev/null +++ b/.travis.yml @@ 
-0,0 +1,6 @@ +language: R +sudo: false +cache: packages + +after_success: +- Rscript -e 'covr::codecov()' diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..ca5a370 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,28 @@ +Package: urlscan +Type: Package +Title: Analyze Websites and Resources They Request +Version: 0.1.0 +Date: 2018-03-23 +Authors@R: c( + person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-5670-2640")) + ) +Maintainer: Bob Rudis <bob@rud.is> +Description: The <urlscan.io> service provides an 'API' enabling analysis of + websites and the resources they request. Much like the 'Inspector' of your + browser, <urlscan.io> will let you take a look at the individual resources + that are requested when a site is loaded. Tools are provided to search + public <urlscan.io> scan submissions. +URL: https://github.com/hrbrmstr/urlscan +BugReports: https://github.com/hrbrmstr/urlscan/issues +Encoding: UTF-8 +License: AGPL +Suggests: + testthat, + covr +Depends: + R (>= 3.2.0) +Imports: + httr, + jsonlite +RoxygenNote: 6.0.1.9000 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..b1fed46 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,8 @@ +# Generated by roxygen2: do not edit by hand + +export(urlscan_search) +importFrom(httr,GET) +importFrom(httr,content) +importFrom(httr,stop_for_status) +importFrom(httr,user_agent) +importFrom(jsonlite,fromJSON) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..9b4679b --- /dev/null +++ b/NEWS.md @@ -0,0 +1,2 @@ +0.1.0 +* Initial release diff --git a/R/urlscan-package.R b/R/urlscan-package.R new file mode 100644 index 0000000..adea753 --- /dev/null +++ b/R/urlscan-package.R @@ -0,0 +1,15 @@ +#' Analyze Websites and Resources They Request +#' +#' The <urlscan.io> service provides an 'API' enabling analysis of +#' websites and the resources they request. Much like the 'Inspector' of your +#' browser, <urlscan.io> will let you take a look at the individual resources +#' that are requested when a site is loaded. 
Tools are provided to search +#' public <urlscan.io> scan submissions. +#' +#' @md +#' @name urlscan +#' @docType package +#' @author Bob Rudis (bob@@rud.is) +#' @importFrom httr GET user_agent content stop_for_status +#' @importFrom jsonlite fromJSON +NULL diff --git a/R/urlscan.R b/R/urlscan.R new file mode 100644 index 0000000..0895dd8 --- /dev/null +++ b/R/urlscan.R @@ -0,0 +1,51 @@ +#' Perform a urlscan.io query +#' +#' urlscan.io uses an Elasticsearch back-end and enables querying by a number +#' of fields, including: +#' +#' - `domain`: Domain (or a subdomain of it) is contacted in one of the requests +#' - `page.domain`: Domain (or a subdomain of it) is the first domain to be contacted +#' - `ip`: The IP or subnet are contacted in one request +#' - `asn`: The autonomous system (AS) was contacted (_must_ use `AS` prefix!) (comma-separated for more than one) +#' - `asnname`: The autonomous system with this name was contacted (comma-separated for more than one) +#' - `filename`: This filename was requested +#' - `hash`: A resource with this SHA256 hash was downloaded +#' - `server`: The page contacted a host running this web server +#' - `task.method`: one of "`manual`" or "`api`"; show manual (user) or API submissions +#' +#' The fields `ip`, `domain`, `url`, `asn`, `asnname`, `country` and `server` can also be prefixed with `page.` +#' to only match the value for the first request/response (e.g. `page.server:nginx AND page.domain:de`). +#' Furthermore, you can concatenate search-terms with `AND`, `OR`, etc. +#' +#' @md +#' @param query query to run +#' @param size number of results to return (default is `100`) +#' @param offset offset of first result (for pagination) (default is `0`) +#' @param sort sorting, specified via `$sort_field:$sort_order`. Default: `_score` +#' @references <https://urlscan.io/about-api/#search> +#' @note Search can only find **public** scans, there is no way to search for private scans. 
+#' @export +urlscan_search <- function(query, size=100, offset=0, sort=NULL) { + + httr::GET( + url = "https://urlscan.io/api/v1/search/", + query = list( + q = query, + size = size, + offset = offset, + sort = sort + ), + httr::user_agent("urlscan #rstats package : https://github.com/hrbrmstr/urlscan") + ) -> res + + httr::stop_for_status(res) + + res <- httr::content(res, as="text") + + res <- jsonlite::fromJSON(res) + + class(res) <- c("urlscan", "list") + + res + +} \ No newline at end of file diff --git a/R/util.R b/R/util.R new file mode 100644 index 0000000..c13c58c --- /dev/null +++ b/R/util.R @@ -0,0 +1,12 @@ +#' #' Turn urlscan object into a data frame +#' #' +#' #' param x `urlscan` object +#' #' param ... unused +#' #' export +#' as.data.frame.urlscan <- function(x, ...) { +#' +#' res <- x$results +#' class(res) <- c("tbl_df", "tbl", "data.frame") +#' res +#' +#' } \ No newline at end of file diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..3784e34 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,59 @@ +--- +output: rmarkdown::github_document +--- + +# urlscan + +Analyze Websites and Resources They Request + +## Description + +WIP + +The <urlscan.io> service provides an 'API' enabling analysis of +websites and the resources they request. Much like the 'Inspector' of your +browser, <urlscan.io> will let you take a look at the individual resources +that are requested when a site is loaded. Tools are provided to search +public <urlscan.io> scan submissions. 
+ +## What's Inside The Tin + +The following functions are implemented: + +- `urlscan_search`: Perform a urlscan.io query + +## Installation + +```{r eval=FALSE} +devtools::install_github("hrbrmstr/urlscan") +``` + +```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} +options(width=120) +``` + +## Usage + +```{r message=FALSE, warning=FALSE, error=FALSE} +library(urlscan) + +# current version +packageVersion("urlscan") +``` + +```{r} +library(tidyverse) + +x <- urlscan_search("domain:r-project.org") + +bind_cols( + select(x$results$task, -options) %>% + mutate(user_agent = x$results$task$options$useragent) + , x$results$stats, x$results$page +) %>% + tbl_df() -> xdf + +xdf + +glimpse(xdf) +``` \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..235830b --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ + +# urlscan + +Analyze Websites and Resources They Request + +## Description + +WIP + +The \<urlscan.io\> service provides an ‘API’ enabling analysis of +websites and the resources they request. Much like the ‘Inspector’ of +your browser, \<urlscan.io\> will let you take a look at the individual +resources that are requested when a site is loaded. Tools are provided +to search public \<urlscan.io\> scan submissions. 
+ +## What’s Inside The Tin + +The following functions are implemented: + + - `urlscan_search`: Perform a urlscan.io query + +## Installation + +``` r +devtools::install_github("hrbrmstr/urlscan") +``` + +## Usage + +``` r +library(urlscan) + +# current version +packageVersion("urlscan") +``` + + ## [1] '0.1.0' + +``` r +library(tidyverse) +``` + + ## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ── + + ## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4 + ## ✔ tibble 1.4.2 ✔ dplyr 0.7.4 + ## ✔ tidyr 0.7.2 ✔ stringr 1.2.0 + ## ✔ readr 1.1.1 ✔ forcats 0.2.0 + + ## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ── + ## ✖ dplyr::filter() masks stats::filter() + ## ✖ dplyr::lag() masks stats::lag() + +``` r +x <- urlscan_search("domain:r-project.org") + +bind_cols( + select(x$results$task, -options) %>% + mutate(user_agent = x$results$task$options$useragent) + , x$results$stats, x$results$page +) %>% + tbl_df() -> xdf + +xdf +``` + + ## # A tibble: 12 x 20 + ## visibility method time source url user_agent uniqIPs consoleMsgs dataLength encodedDataLeng… requests country + ## + ## 1 public manual 2017… web https… Mozilla/5.0… 1 0 12758 676 2 AT + ## 2 public manual 2017… web https… Mozilla/5.0… 1 0 14396 676 2 AT + ## 3 public manual 2017… web https… Mozilla/5.0… 2 0 286138 97317 6 AT + ## 4 public manual 2017… web https… 1 0 0 0 1 AT + ## 5 public manual 2017… web https… 1 0 0 0 1 AT + ## 6 public manual 2017… web https… 1 0 5284 1813 2 AT + ## 7 public manual 2017… web https… 1 0 5284 1813 2 AT + ## 8 public manual 2017… web https… 1 0 4297 1640 2 AT + ## 9 public manual 2017… web https… 1 0 14722 6288 9 AT + ## 10 public manual 2017… web https… 2 0 285893 97695 6 AT + ## 11 public automa… 2017… hacke… https… 1 0 343270 101327 4 AT + ## 12 public automa… 2017… hacke… https… 1 0 345452 101840 4 AT + ## # ... 
with 8 more variables: server , city , domain , ip , asnname , asn , url1 , + ## # ptr + +``` r +glimpse(xdf) +``` + + ## Observations: 12 + ## Variables: 20 + ## $ visibility "public", "public", "public", "public", "public", "public", "public", "public", "public",... + ## $ method "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual",... + ## $ time "2017-12-29T17:23:39.785Z", "2017-12-20T15:52:22.902Z", "2017-11-10T13:40:19.991Z", "2017... + ## $ source "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "hackernews", "hack... + ## $ url "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec... + ## $ user_agent "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) C... + ## $ uniqIPs 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1 + ## $ consoleMsgs 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ## $ dataLength 12758, 14396, 286138, 0, 0, 5284, 5284, 4297, 14722, 285893, 343270, 345452 + ## $ encodedDataLength 676, 676, 97317, 0, 0, 1813, 1813, 1640, 6288, 97695, 101327, 101840 + ## $ requests 2, 2, 6, 1, 1, 2, 2, 2, 9, 6, 4, 4 + ## $ country "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT" + ## $ server "Apache/2.4.10 (Debian)", "Apache/2.4.10 (Debian)", "Apache/2.4.10 (Debian)", "Apache/2.4... + ## $ city "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna",... + ## $ domain "cran.r-project.org", "cran.r-project.org", "www.r-project.org", "cran.r-project.org", "c... + ## $ ip "137.208.57.37", "137.208.57.37", "137.208.57.37", "137.208.57.37", "137.208.57.37", "137... + ## $ asnname "Welthandelsplatz 1, AT", "Welthandelsplatz 1, AT", "Welthandelsplatz 1, AT", "Welthandel... + ## $ asn "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776",... + ## $ url1 "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec... 
+ ## $ ptr "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "... diff --git a/man/urlscan.Rd b/man/urlscan.Rd new file mode 100644 index 0000000..3cf3c07 --- /dev/null +++ b/man/urlscan.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/urlscan-package.R +\docType{package} +\name{urlscan} +\alias{urlscan} +\alias{urlscan-package} +\title{Analyze Websites and Resources They Request} +\description{ +The service provides an 'API' enabling analysis of +websites and the resources they request. Much like the 'Inspector' of your +browser, will let you take a look at the individual resources +that are requested when a site is loaded. Tools are provided to search +public scan submissions. +} +\author{ +Bob Rudis (bob@rud.is) +} diff --git a/man/urlscan_search.Rd b/man/urlscan_search.Rd new file mode 100644 index 0000000..c715882 --- /dev/null +++ b/man/urlscan_search.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/urlscan.R +\name{urlscan_search} +\alias{urlscan_search} +\title{Perform a urlscan.io query} +\usage{ +urlscan_search(query, size = 100, offset = 0, sort = NULL) +} +\arguments{ +\item{query}{query to run} + +\item{size}{number of results to return (default is \code{100})} + +\item{offset}{offset of first result (for pagination) (default is \code{0})} + +\item{sort}{sorting, specified via \code{$sort_field:$sort_order}. Default: \code{_score}} +} +\description{ +urlscan.io uses an Elasticsearch back-end and enables querying by a number +of fields, including: +} +\details{ +\itemize{ +\item \code{domain}: Domain (or a subdomain of it) is contacted in one of the requests +\item \code{page.domain}: Domain (or a subdomain of it) is the first domain to be contacted +\item \code{ip}: The IP or subnet are contacted in one request +\item \code{asn}: The autonomous system (AS) was contacted (\emph{must} use \code{AS} prefix!) 
(comma-separated for more than one) +\item \code{asname}: The autonomous system with this name was contacted (comma-separated for more than one) +\item \code{filename}: This filename was requested +\item \code{hash}: A resource with this SHA256 hash was downloaded +\item \code{server}: The page contact a host running this web server +\item \code{task.method}: one of "\code{manual}" or "\code{api}"; show manual (user) or API submissions +} + +The fields \code{ip}, \code{domain}, \code{url}, \code{asn}, \code{asnname}, \code{country} and \code{server} can also be prefixed with \code{page.} +to only match the value for the first request/response (e.g. \code{page.server:nginx AND page.domain:de}). +Furthermore, you can concatenate search-terms with \code{AND}, \code{OR}, etc. +} +\note{ +Search can only find \strong{public} scans, there is no way to search for private scans. +} +\references{ +\url{https://urlscan.io/about-api/#search} +} diff --git a/tests/test-all.R b/tests/test-all.R new file mode 100644 index 0000000..2b2bc31 --- /dev/null +++ b/tests/test-all.R @@ -0,0 +1,2 @@ +library(testthat) +test_check("urlscan") diff --git a/tests/testthat/test-urlscan.R b/tests/testthat/test-urlscan.R new file mode 100644 index 0000000..0c22968 --- /dev/null +++ b/tests/testthat/test-urlscan.R @@ -0,0 +1,6 @@ +context("minimal package functionality") +test_that("we can do something", { + + #expect_that(some_function(), is_a("data.frame")) + +}) diff --git a/urlscan.Rproj b/urlscan.Rproj new file mode 100644 index 0000000..446d9e1 --- /dev/null +++ b/urlscan.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --resave-data 
+PackageRoxygenize: rd,collate,namespace -- 2.26.2