~hrbrmstr/urlscan

4c870355025d226167c1e505557423b4d395b607 — boB Rudis 2 years ago a60e6c3
urlscan result API function added
7 files changed, 190 insertions(+), 12 deletions(-)

M DESCRIPTION
M NAMESPACE
A R/results.R
M R/urlscan-package.R
M README.Rmd
M README.md
A man/urlscan_result.Rd
M DESCRIPTION => DESCRIPTION +2 -1
@@ 24,5 24,6 @@ Depends:
    R (>= 3.2.0)
Imports:
    httr,
    jsonlite
    jsonlite,
    magick
RoxygenNote: 6.0.1.9000

M NAMESPACE => NAMESPACE +4 -0
@@ 1,8 1,12 @@
# Generated by roxygen2: do not edit by hand

export(urlscan_result)
export(urlscan_search)
importFrom(httr,GET)
importFrom(httr,content)
importFrom(httr,status_code)
importFrom(httr,stop_for_status)
importFrom(httr,user_agent)
importFrom(httr,warn_for_status)
importFrom(jsonlite,fromJSON)
importFrom(magick,image_read)

A R/results.R => R/results.R +58 -0
@@ 0,0 1,58 @@
#' Retrieve detailed results for a given scan ID
#'
#' Fetches the JSON result document for a scan from the `urlscan.io` result
#' API and, optionally, the DOM snapshot and the screenshot for the same scan.
#'
#' @md
#' @param scan_id scan id (UUID); must resolve to a single, non-missing,
#'        non-empty character string
#' @param include_dom (logical) include the website DOM? (default: `FALSE`)
#' @param include_shot (logical) include the website screen shot? (default: `FALSE`)
#' @return `list` with `scan_result` task, page, content lists, fetch data,
#'         connection metadata and computed stats.\cr
#'         \cr
#'         The list can also include `dom` if
#'         `include_dom` is `TRUE`. If so, `dom` will be an `httr` `response` object
#'         since the data could be binary. Use `httr` tools to process it.\cr
#'         \cr
#'         The list can also include `screenshot` if `include_shot` is `TRUE` and
#'         a screenshot was available.
#' @export
urlscan_result <- function(scan_id, include_dom=FALSE, include_shot=FALSE) {

  # Fail fast with a clear message instead of letting a NULL / NA / vector id
  # reach sprintf() and httr::GET(), where it surfaces as an obscure error
  # (or as a request for ".../result/NA").
  scan_id <- as.character(scan_id)
  if (length(scan_id) != 1L || is.na(scan_id) || !nzchar(scan_id)) {
    stop("`scan_id` must be a single, non-empty character string", call. = FALSE)
  }

  # Same user agent is sent with every request made by this function.
  ua <- httr::user_agent(
    "urlscan #rstats package : https://github.com/hrbrmstr/urlscan"
  )

  res <- httr::GET(
    url = sprintf("https://urlscan.io/api/v1/result/%s", scan_id),
    ua
  )

  # The main result is mandatory: any HTTP error aborts the call.
  httr::stop_for_status(res)

  # The payload is JSON; pass the encoding explicitly so decoding does not
  # depend on httr guessing it from the response headers.
  scan_result <- jsonlite::fromJSON(
    httr::content(res, as = "text", encoding = "UTF-8")
  )

  class(scan_result) <- c("urlscan_result", "list")

  out <- list(scan_result = scan_result)

  if (include_dom) {

    # Returned as the raw httr response (status is intentionally not checked
    # here); callers inspect and decode it themselves.
    out$dom <- httr::GET(
      url = sprintf("https://urlscan.io/dom/%s", scan_id),
      ua
    )

  }

  if (include_shot) {

    shot <- httr::GET(
      url = sprintf("https://urlscan.io/screenshots/%s.png", scan_id),
      ua
    )

    # Only attach a screenshot when one was actually served; otherwise the
    # `screenshot` element is simply absent from the result.
    if (httr::status_code(shot) == 200) {
      out$screenshot <- magick::image_read(shot$content)
    }

  }

  out

}
\ No newline at end of file

M R/urlscan-package.R => R/urlscan-package.R +6 -5
@@ 1,8 1,8 @@
#' Analyze Websites and Resources They Request
#' 
#' The <urlscan.io> service provides an 'API' enabling analysis of 
#' websites and the resources they request. Much like the 'Inspector' of your 
#' browser, <urlscan.io> will let you take a look at the individual resources 
#'
#' The <urlscan.io> service provides an 'API' enabling analysis of
#' websites and the resources they request. Much like the 'Inspector' of your
#' browser, <urlscan.io> will let you take a look at the individual resources
#' that are requested when a site is loaded. Tools are provided to search
#' public <urlscan.io> scan submissions.
#'


@@ 10,6 10,7 @@
#' @name urlscan
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @importFrom httr GET user_agent content stop_for_status
#' @importFrom httr GET user_agent content stop_for_status warn_for_status status_code
#' @importFrom jsonlite fromJSON
#' @importFrom magick image_read
NULL

M README.Rmd => README.Rmd +10 -1
@@ 49,11 49,20 @@ x <- urlscan_search("domain:r-project.org")
bind_cols(
  select(x$results$task, -options) %>% 
    mutate(user_agent = x$results$task$options$useragent)
  , x$results$stats, x$results$page
  ,x$results$stats, 
  x$results$page
) %>% 
  mutate(id = x$results$`_id`) %>% 
  mutate(result_api_url = x$results$result) %>% 
  tbl_df() -> xdf

xdf

glimpse(xdf)

ures <- urlscan_result(xdf$id[2], TRUE, TRUE)

str(ures$scan_result, 2)

ures$screenshot
```
\ No newline at end of file

M README.md => README.md +81 -5
@@ 57,14 57,17 @@ x <- urlscan_search("domain:r-project.org")
bind_cols(
  select(x$results$task, -options) %>% 
    mutate(user_agent = x$results$task$options$useragent)
  , x$results$stats, x$results$page
  ,x$results$stats, 
  x$results$page
) %>% 
  mutate(id = x$results$`_id`) %>% 
  mutate(result_api_url = x$results$result) %>% 
  tbl_df() -> xdf

xdf
```

    ## # A tibble: 12 x 20
    ## # A tibble: 12 x 22
    ##    visibility method  time  source url    user_agent   uniqIPs consoleMsgs dataLength encodedDataLeng… requests country
    ##    <chr>      <chr>   <chr> <chr>  <chr>  <chr>          <int>       <int>      <int>            <int>    <int> <chr>  
    ##  1 public     manual  2017… web    https… Mozilla/5.0…       1           0      12758              676        2 AT     


@@ 79,15 82,15 @@ xdf
    ## 10 public     manual  2017… web    https… <NA>               2           0     285893            97695        6 AT     
    ## 11 public     automa… 2017… hacke… https… <NA>               1           0     343270           101327        4 AT     
    ## 12 public     automa… 2017… hacke… https… <NA>               1           0     345452           101840        4 AT     
    ## # ... with 8 more variables: server <chr>, city <chr>, domain <chr>, ip <chr>, asnname <chr>, asn <chr>, url1 <chr>,
    ## #   ptr <chr>
    ## # ... with 10 more variables: server <chr>, city <chr>, domain <chr>, ip <chr>, asnname <chr>, asn <chr>, url1 <chr>,
    ## #   ptr <chr>, id <chr>, result_api_url <chr>

``` r
glimpse(xdf)
```

    ## Observations: 12
    ## Variables: 20
    ## Variables: 22
    ## $ visibility        <chr> "public", "public", "public", "public", "public", "public", "public", "public", "public",...
    ## $ method            <chr> "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual",...
    ## $ time              <chr> "2017-12-29T17:23:39.785Z", "2017-12-20T15:52:22.902Z", "2017-11-10T13:40:19.991Z", "2017...


@@ 108,3 111,76 @@ glimpse(xdf)
    ## $ asn               <chr> "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776",...
    ## $ url1              <chr> "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec...
    ## $ ptr               <chr> "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "...
    ## $ id                <chr> "d134c3b7-f306-4c7b-b2cb-c0f900793083", "075778b6-20f6-45a9-bb76-a80ac9bae1d2", "fbacb280...
    ## $ result_api_url    <chr> "https://urlscan.io/api/v1/result/d134c3b7-f306-4c7b-b2cb-c0f900793083", "https://urlscan...

``` r
ures <- urlscan_result(xdf$id[2], TRUE, TRUE)

str(ures$scan_result, 2)
```

    ## List of 6
    ##  $ data :List of 6
    ##   ..$ requests:'data.frame': 2 obs. of  3 variables:
    ##   ..$ cookies : list()
    ##   ..$ console : list()
    ##   ..$ links   : list()
    ##   ..$ timing  :List of 6
    ##   ..$ globals :'data.frame': 2 obs. of  2 variables:
    ##  $ stats:List of 14
    ##   ..$ resourceStats   :'data.frame': 2 obs. of  9 variables:
    ##   ..$ protocolStats   :'data.frame': 1 obs. of  7 variables:
    ##   ..$ tlsStats        :'data.frame': 1 obs. of  7 variables:
    ##   ..$ serverStats     :'data.frame': 1 obs. of  6 variables:
    ##   ..$ domainStats     :'data.frame': 1 obs. of  9 variables:
    ##   ..$ regDomainStats  :'data.frame': 1 obs. of  9 variables:
    ##   ..$ secureRequests  : int 2
    ##   ..$ securePercentage: int 100
    ##   ..$ IPv6Percentage  : int 0
    ##   ..$ uniqCountries   : int 1
    ##   ..$ totalLinks      : int 0
    ##   ..$ malicious       : int 0
    ##   ..$ adBlocked       : int 0
    ##   ..$ ipStats         :'data.frame': 1 obs. of  14 variables:
    ##  $ meta :List of 1
    ##   ..$ processors:List of 8
    ##  $ task :List of 11
    ##   ..$ uuid         : chr "075778b6-20f6-45a9-bb76-a80ac9bae1d2"
    ##   ..$ time         : chr "2017-12-20T15:52:22.902Z"
    ##   ..$ url          : chr "https://cran.r-project.org/web/packages/e1071/"
    ##   ..$ visibility   : chr "public"
    ##   ..$ options      :List of 1
    ##   ..$ method       : chr "manual"
    ##   ..$ source       : chr "web"
    ##   ..$ userAgent    : chr "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    ##   ..$ reportURL    : chr "https://urlscan.io/result/075778b6-20f6-45a9-bb76-a80ac9bae1d2/"
    ##   ..$ screenshotURL: chr "https://urlscan.io/screenshots/075778b6-20f6-45a9-bb76-a80ac9bae1d2.png"
    ##   ..$ domURL       : chr "https://urlscan.io/dom/075778b6-20f6-45a9-bb76-a80ac9bae1d2/"
    ##  $ page :List of 9
    ##   ..$ url    : chr "https://cran.r-project.org/web/packages/e1071/"
    ##   ..$ domain : chr "cran.r-project.org"
    ##   ..$ country: chr "AT"
    ##   ..$ city   : chr "Vienna"
    ##   ..$ server : chr "Apache/2.4.10 (Debian)"
    ##   ..$ ip     : chr "137.208.57.37"
    ##   ..$ ptr    : chr "cran.wu-wien.ac.at"
    ##   ..$ asn    : chr "AS1776"
    ##   ..$ asnname: chr "Welthandelsplatz 1, AT"
    ##  $ lists:List of 9
    ##   ..$ ips         : chr "137.208.57.37"
    ##   ..$ countries   : chr "AT"
    ##   ..$ asns        : chr "1776"
    ##   ..$ domains     : chr "cran.r-project.org"
    ##   ..$ servers     : chr "Apache/2.4.10 (Debian)"
    ##   ..$ urls        : chr [1:2] "https://cran.r-project.org/web/packages/e1071/" "https://cran.r-project.org/web/CRAN_web.css"
    ##   ..$ linkDomains : list()
    ##   ..$ certificates:'data.frame': 1 obs. of  5 variables:
    ##   ..$ hashes      : chr [1:2] "48f7615c35fe15989530b1df31256a02340bed62069275c534a4222791eb23b2" "6a738f3da9f1203b5d765088a4ff4e4ac36c59fad008f450b808354d9625bc51"
    ##  - attr(*, "class")= chr [1:2] "urlscan_result" "list"

``` r
ures$screenshot
```

![](/var/folders/1w/2d82v7ts3gs98tc6v772h8s40000gp/T//RtmpdPM7M1/file16085246d6ab0.png)<!-- -->

A man/urlscan_result.Rd => man/urlscan_result.Rd +29 -0
@@ 0,0 1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/results.R
\name{urlscan_result}
\alias{urlscan_result}
\title{Retrieve detailed results for a given scan ID}
\usage{
urlscan_result(scan_id, include_dom = FALSE, include_shot = FALSE)
}
\arguments{
\item{scan_id}{scan id (UUID)}

\item{include_dom}{(logical) include the website DOM? (default: \code{FALSE})}

\item{include_shot}{(logical) include the website screen shot? (default: \code{FALSE})}
}
\value{
\code{list} with \code{scan_result} task, page, content lists, fetch data,
connection metadata and computed stats.\cr
\cr
The list can also include \code{dom} if
\code{include_dom} is \code{TRUE}. If so, \code{dom} will be an \code{httr} \code{response} object
since the data could be binary. Use \code{httr} tools to process it.\cr
\cr
The list can also include \code{screenshot} if \code{include_shot} is \code{TRUE} and
a screenshot was available.
}
\description{
Retrieve detailed results for a given scan ID
}