~hrbrmstr/urlscan

a60e6c38011f19dada7811352d7fa791e0fd5b3b — boB Rudis 2 years ago
initial commit
A  => .Rbuildignore +11 -0
@@ 1,11 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.travis\.yml$
^README\.*Rmd$
^README\.*html$
^NOTES\.*Rmd$
^NOTES\.*html$
^\.codecov\.yml$
^README_files$
^doc$
^tmp$

A  => .codecov.yml +1 -0
@@ 1,1 @@
comment: false

A  => .gitignore +8 -0
@@ 1,8 @@
.DS_Store
.Rproj.user
.Rhistory
.RData
.Rproj
src/*.o
src/*.so
src/*.dll

A  => .travis.yml +6 -0
@@ 1,6 @@
language: R
sudo: false
cache: packages

after_success:
- Rscript -e 'covr::codecov()'

A  => DESCRIPTION +28 -0
@@ 1,28 @@
Package: urlscan
Type: Package
Title: Analyze Websites and Resources They Request
Version: 0.1.0
Date: 2018-03-23
Authors@R: c(
    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 
           comment = c(ORCID = "0000-0001-5670-2640"))
  )
Maintainer: Bob Rudis <bob@rud.is>
Description: The <urlscan.io> service provides an 'API' enabling analysis of 
    websites and the resources they request. Much like the 'Inspector' of your 
    browser, <urlscan.io> will let you take a look at the individual resources 
    that are requested when a site is loaded. Tools are provided to search
    public <urlscan.io> scan submissions.
URL: https://github.com/hrbrmstr/urlscan
BugReports: https://github.com/hrbrmstr/urlscan/issues
Encoding: UTF-8
License: AGPL-3
Suggests:
    testthat,
    covr
Depends:
    R (>= 3.2.0)
Imports:
    httr,
    jsonlite
RoxygenNote: 6.0.1.9000

A  => NAMESPACE +8 -0
@@ 1,8 @@
# Generated by roxygen2: do not edit by hand

export(urlscan_search)
importFrom(httr,GET)
importFrom(httr,content)
importFrom(httr,stop_for_status)
importFrom(httr,user_agent)
importFrom(jsonlite,fromJSON)

A  => NEWS.md +2 -0
@@ 1,2 @@
0.1.0 
* Initial release

A  => R/urlscan-package.R +15 -0
@@ 1,15 @@
#' Analyze Websites and Resources They Request
#' 
#' The <urlscan.io> service provides an 'API' enabling analysis of 
#' websites and the resources they request. Much like the 'Inspector' of your 
#' browser, <urlscan.io> will let you take a look at the individual resources 
#' that are requested when a site is loaded. Tools are provided to search
#' public <urlscan.io> scan submissions.
#'
#' @md
#' @name urlscan
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @importFrom httr GET user_agent content stop_for_status
#' @importFrom jsonlite fromJSON
NULL

A  => R/urlscan.R +51 -0
@@ 1,51 @@
#' Perform a urlscan.io query
#'
#' urlscan.io uses an Elasticsearch back-end and enables querying by a number
#' of fields, including:
#'
#' - `domain`: Domain (or a subdomain of it) is contacted in one of the requests
#' - `page.domain`: Domain (or a subdomain of it) is the first domain to be contacted
#' - `ip`: The IP or subnet are contacted in one request
#' - `asn`: The autonomous system (AS) was contacted (_must_ use `AS` prefix!) (comma-separated for more than one)
#' - `asname`: The autonomous system with this name was contacted (comma-separated for more than one)
#' - `filename`: This filename was requested
#' - `hash`: A resource with this SHA256 hash was downloaded
#' - `server`: The page contacts a host running this web server
#' - `task.method`: one of "`manual`" or "`api`"; show manual (user) or API submissions
#'
#' The fields `ip`, `domain`, `url`, `asn`, `asnname`, `country` and `server` can also be prefixed with `page.`
#' to only match the value for the first request/response (e.g. `page.server:nginx AND page.domain:de`).
#' Furthermore, you can concatenate search-terms with `AND`, `OR`, etc.
#'
#' @md
#' @param query query to run
#' @param size number of results to return (default is `100`)
#' @param offset offset of first result (for pagination) (default is `0`)
#' @param sort sorting, specified via `$sort_field:$sort_order`. Default: `_score`
#' @return a `urlscan` object: the parsed JSON response as a `list`
#' @references <https://urlscan.io/about-api/#search>
#' @note Search can only find **public** scans, there is no way to search for private scans.
#' @export
urlscan_search <- function(query, size=100, offset=0, sort=NULL) {

  # NULL components of `query` are dropped by httr, so `sort` is only sent
  # when the caller supplies one (the API then falls back to `_score`)
  res <- httr::GET(
    url = "https://urlscan.io/api/v1/search/",
    query = list(
      q = query,
      size = size,
      offset = offset,
      sort = sort
    ),
    httr::user_agent("urlscan #rstats package : https://github.com/hrbrmstr/urlscan")
  )

  # turn HTTP 4xx/5xx responses into R errors
  httr::stop_for_status(res)

  res <- httr::content(res, as="text")
  res <- jsonlite::fromJSON(res)

  # lightweight S3 class so downstream methods can dispatch on the result
  class(res) <- c("urlscan", "list")

  res

}
\ No newline at end of file

A  => R/util.R +12 -0
@@ 1,12 @@
# NOTE(review): disabled draft of an as.data.frame() S3 method for `urlscan`
# objects. The whole definition is commented out with `#'` prefixes and the
# roxygen tags below are missing their `@` markers, so roxygen2 ignores it.
# When enabled it would coerce the `results` element of a search response
# into a tibble-classed data frame — confirm the response shape before
# re-enabling.
#' #' Turn urlscan object into a data frame
#' #'
#' #' param x `urlscan` object
#' #' param ... unused
#' #' export
#' as.data.frame.urlscan <- function(x, ...) {
#'
#'   res <- x$results
#'   class(res) <- c("tbl_df", "tbl", "data.frame")
#'   res
#'
#' }
\ No newline at end of file

A  => README.Rmd +59 -0
@@ 1,59 @@
---
output: rmarkdown::github_document
---

# urlscan

Analyze Websites and Resources They Request

## Description

WIP

The <urlscan.io> service provides an 'API' enabling analysis of 
websites and the resources they request. Much like the 'Inspector' of your 
browser, <urlscan.io> will let you take a look at the individual resources 
that are requested when a site is loaded. Tools are provided to search
public <urlscan.io> scan submissions.

## What's Inside The Tin

The following functions are implemented:

- `urlscan_search`: Perform a urlscan.io query

## Installation

```{r eval=FALSE}
devtools::install_github("hrbrmstr/urlscan")
```

```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
options(width=120)
```

## Usage

```{r message=FALSE, warning=FALSE, error=FALSE}
library(urlscan)

# current version
packageVersion("urlscan")
```

```{r}
library(tidyverse)

x <- urlscan_search("domain:r-project.org")

bind_cols(
  select(x$results$task, -options) %>% 
    mutate(user_agent = x$results$task$options$useragent)
  , x$results$stats, x$results$page
) %>% 
  tbl_df() -> xdf

xdf

glimpse(xdf)
```
\ No newline at end of file

A  => README.md +110 -0
@@ 1,110 @@

# urlscan

Analyze Websites and Resources They Request

## Description

WIP

The \<urlscan.io\> service provides an ‘API’ enabling analysis of
websites and the resources they request. Much like the ‘Inspector’ of
your browser, \<urlscan.io\> will let you take a look at the individual
resources that are requested when a site is loaded. Tools are provided
to search public \<urlscan.io\> scan submissions.

## What’s Inside The Tin

The following functions are implemented:

  - `urlscan_search`: Perform a urlscan.io query

## Installation

``` r
devtools::install_github("hrbrmstr/urlscan")
```

## Usage

``` r
library(urlscan)

# current version
packageVersion("urlscan")
```

    ## [1] '0.1.0'

``` r
library(tidyverse)
```

    ## ── Attaching packages ────────────────────────────────────── tidyverse 1.2.1 ──

    ## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
    ## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
    ## ✔ tidyr   0.7.2          ✔ stringr 1.2.0     
    ## ✔ readr   1.1.1          ✔ forcats 0.2.0

    ## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
    ## ✖ dplyr::filter() masks stats::filter()
    ## ✖ dplyr::lag()    masks stats::lag()

``` r
x <- urlscan_search("domain:r-project.org")

bind_cols(
  select(x$results$task, -options) %>% 
    mutate(user_agent = x$results$task$options$useragent)
  , x$results$stats, x$results$page
) %>% 
  tbl_df() -> xdf

xdf
```

    ## # A tibble: 12 x 20
    ##    visibility method  time  source url    user_agent   uniqIPs consoleMsgs dataLength encodedDataLeng… requests country
    ##    <chr>      <chr>   <chr> <chr>  <chr>  <chr>          <int>       <int>      <int>            <int>    <int> <chr>  
    ##  1 public     manual  2017… web    https… Mozilla/5.0…       1           0      12758              676        2 AT     
    ##  2 public     manual  2017… web    https… Mozilla/5.0…       1           0      14396              676        2 AT     
    ##  3 public     manual  2017… web    https… Mozilla/5.0…       2           0     286138            97317        6 AT     
    ##  4 public     manual  2017… web    https… <NA>               1           0          0                0        1 AT     
    ##  5 public     manual  2017… web    https… <NA>               1           0          0                0        1 AT     
    ##  6 public     manual  2017… web    https… <NA>               1           0       5284             1813        2 AT     
    ##  7 public     manual  2017… web    https… <NA>               1           0       5284             1813        2 AT     
    ##  8 public     manual  2017… web    https… <NA>               1           0       4297             1640        2 AT     
    ##  9 public     manual  2017… web    https… <NA>               1           0      14722             6288        9 AT     
    ## 10 public     manual  2017… web    https… <NA>               2           0     285893            97695        6 AT     
    ## 11 public     automa… 2017… hacke… https… <NA>               1           0     343270           101327        4 AT     
    ## 12 public     automa… 2017… hacke… https… <NA>               1           0     345452           101840        4 AT     
    ## # ... with 8 more variables: server <chr>, city <chr>, domain <chr>, ip <chr>, asnname <chr>, asn <chr>, url1 <chr>,
    ## #   ptr <chr>

``` r
glimpse(xdf)
```

    ## Observations: 12
    ## Variables: 20
    ## $ visibility        <chr> "public", "public", "public", "public", "public", "public", "public", "public", "public",...
    ## $ method            <chr> "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual",...
    ## $ time              <chr> "2017-12-29T17:23:39.785Z", "2017-12-20T15:52:22.902Z", "2017-11-10T13:40:19.991Z", "2017...
    ## $ source            <chr> "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "hackernews", "hack...
    ## $ url               <chr> "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec...
    ## $ user_agent        <chr> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) C...
    ## $ uniqIPs           <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1
    ## $ consoleMsgs       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ## $ dataLength        <int> 12758, 14396, 286138, 0, 0, 5284, 5284, 4297, 14722, 285893, 343270, 345452
    ## $ encodedDataLength <int> 676, 676, 97317, 0, 0, 1813, 1813, 1640, 6288, 97695, 101327, 101840
    ## $ requests          <int> 2, 2, 6, 1, 1, 2, 2, 2, 9, 6, 4, 4
    ## $ country           <chr> "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "AT"
    ## $ server            <chr> "Apache/2.4.10 (Debian)", "Apache/2.4.10 (Debian)", "Apache/2.4.10 (Debian)", "Apache/2.4...
    ## $ city              <chr> "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna", "Vienna",...
    ## $ domain            <chr> "cran.r-project.org", "cran.r-project.org", "www.r-project.org", "cran.r-project.org", "c...
    ## $ ip                <chr> "137.208.57.37", "137.208.57.37", "137.208.57.37", "137.208.57.37", "137.208.57.37", "137...
    ## $ asnname           <chr> "Welthandelsplatz 1, AT", "Welthandelsplatz 1, AT", "Welthandelsplatz 1, AT", "Welthandel...
    ## $ asn               <chr> "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776",...
    ## $ url1              <chr> "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec...
    ## $ ptr               <chr> "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "...

A  => man/urlscan.Rd +17 -0
@@ 1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/urlscan-package.R
\docType{package}
\name{urlscan}
\alias{urlscan}
\alias{urlscan-package}
\title{Analyze Websites and Resources They Request}
\description{
The <urlscan.io> service provides an 'API' enabling analysis of
websites and the resources they request. Much like the 'Inspector' of your
browser, <urlscan.io> will let you take a look at the individual resources
that are requested when a site is loaded. Tools are provided to search
public <urlscan.io> scan submissions.
}
\author{
Bob Rudis (bob@rud.is)
}

A  => man/urlscan_search.Rd +44 -0
@@ 1,44 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/urlscan.R
\name{urlscan_search}
\alias{urlscan_search}
\title{Perform a urlscan.io query}
\usage{
urlscan_search(query, size = 100, offset = 0, sort = NULL)
}
\arguments{
\item{query}{query to run}

\item{size}{number of results to return (default is \code{100})}

\item{offset}{offset of first result (for pagination) (default is \code{0})}

\item{sort}{sorting, specified via \code{$sort_field:$sort_order}. Default: \code{_score}}
}
\description{
urlscan.io uses an Elasticsearch back-end and enables querying by a number
of fields, including:
}
\details{
\itemize{
\item \code{domain}: Domain (or a subdomain of it) is contacted in one of the requests
\item \code{page.domain}: Domain (or a subdomain of it) is the first domain to be contacted
\item \code{ip}: The IP or subnet are contacted in one request
\item \code{asn}: The autonomous system (AS) was contacted (\emph{must} use \code{AS} prefix!) (comma-separated for more than one)
\item \code{asname}: The autonomous system with this name was contacted (comma-separated for more than one)
\item \code{filename}: This filename was requested
\item \code{hash}: A resource with this SHA256 hash was downloaded
\item \code{server}: The page contacts a host running this web server
\item \code{task.method}: one of "\code{manual}" or "\code{api}"; show manual (user) or API submissions
}

The fields \code{ip}, \code{domain}, \code{url}, \code{asn}, \code{asnname}, \code{country} and \code{server} can also be prefixed with \code{page.}
to only match the value for the first request/response (e.g. \code{page.server:nginx AND page.domain:de}).
Furthermore, you can concatenate search-terms with \code{AND}, \code{OR}, etc.
}
\note{
Search can only find \strong{public} scans, there is no way to search for private scans.
}
\references{
\url{https://urlscan.io/about-api/#search}
}

A  => tests/test-all.R +2 -0
@@ 1,2 @@
# Standard testthat runner: attach both testthat and the package under
# test, then discover and run everything under tests/testthat/.
library(testthat)
library(urlscan)

test_check("urlscan")

A  => tests/testthat/test-urlscan.R +6 -0
@@ 1,6 @@
# Placeholder test file: establishes the testthat context with no real
# expectations yet.
context("minimal package functionality")

test_that("we can do something", {
  # placeholder — replace with real expectations, e.g.:
  # expect_s3_class(urlscan_search("domain:r-project.org"), "urlscan")
})

A  => urlscan.Rproj +21 -0
@@ 1,21 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageBuildArgs: --resave-data
PackageRoxygenize: rd,collate,namespace