M DESCRIPTION => DESCRIPTION +2 -1
@@ 24,5 24,6 @@ Depends:
R (>= 3.2.0)
Imports:
httr,
- jsonlite
+ jsonlite,
+ magick
RoxygenNote: 6.0.1.9000
M NAMESPACE => NAMESPACE +4 -0
@@ 1,8 1,12 @@
# Generated by roxygen2: do not edit by hand
+export(urlscan_result)
export(urlscan_search)
importFrom(httr,GET)
importFrom(httr,content)
+importFrom(httr,status_code)
importFrom(httr,stop_for_status)
importFrom(httr,user_agent)
+importFrom(httr,warn_for_status)
importFrom(jsonlite,fromJSON)
+importFrom(magick,image_read)
A R/results.R => R/results.R +58 -0
@@ 0,0 1,58 @@
+#' Retrieve detailed results for a given scan ID
+#'
+#' @md
+#' @param scan_id scan id (UUID)
+#' @param include_dom (logical) include the website DOM? (default: `FALSE`)
+#' @param include_shot (logical) include the website screen shot? (default: `FALSE`)
+#' @return `list` with `scan_result` task, page, content lists, fetch data,
+#' connection metadata and computed stats.\cr
+#' \cr
+#' The list can also include `dom` if
+#' `include_dom` is `TRUE`. If so, `dom` will be an `httr` `response` object
+#' since the data could be binary. Use `httr` tools to process it.\cr
+#' \cr
+#' The list can also include `screenshot` if `include_shot` is `TRUE` and
+#' a screenshot was available.
+#' @export
+urlscan_result <- function(scan_id, include_dom=FALSE, include_shot=FALSE) {
+  # Fail early: sprintf() would otherwise build several URLs or a literal "NA" one.
+  stopifnot(is.character(scan_id), length(scan_id) == 1L, !is.na(scan_id))
+
+  # One user agent object shared by every request made below.
+  ua <- httr::user_agent("urlscan #rstats package : https://github.com/hrbrmstr/urlscan")
+
+  res <- httr::GET(
+    url = sprintf("https://urlscan.io/api/v1/result/%s", scan_id),
+    ua
+  )
+
+  httr::stop_for_status(res)
+
+  # Decode explicitly as UTF-8 rather than letting httr guess from the headers.
+  res <- httr::content(res, as = "text", encoding = "UTF-8")
+
+  res <- jsonlite::fromJSON(res)
+
+  class(res) <- c("urlscan_result", "list")
+
+  out <- list(scan_result = res)
+
+  if (include_dom) {
+    # Kept as the raw httr response object since the DOM data could be binary.
+    out$dom <- httr::GET(
+      url = sprintf("https://urlscan.io/dom/%s", scan_id),
+      ua
+    )
+  }
+
+  if (include_shot) {
+    shot <- httr::GET(
+      url = sprintf("https://urlscan.io/screenshots/%s.png", scan_id),
+      ua
+    )
+    # Attach the screenshot only when the request succeeded (HTTP 200).
+    if (httr::status_code(shot) == 200) out$screenshot <- magick::image_read(shot$content)
+  }
+
+  out
+}
\ No newline at end of file
M R/urlscan-package.R => R/urlscan-package.R +6 -5
@@ 1,8 1,8 @@
#' Analyze Websites and Resources They Request
-#'
-#' The <urlscan.io> service provides an 'API' enabling analysis of
-#' websites and the resources they request. Much like the 'Inspector' of your
-#' browser, <urlscan.io> will let you take a look at the individual resources
+#'
+#' The <urlscan.io> service provides an 'API' enabling analysis of
+#' websites and the resources they request. Much like the 'Inspector' of your
+#' browser, <urlscan.io> will let you take a look at the individual resources
#' that are requested when a site is loaded. Tools are provided to search
#' public <urlscan.io> scan submissions.
#'
@@ 10,6 10,7 @@
#' @name urlscan
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
-#' @importFrom httr GET user_agent content stop_for_status
+#' @importFrom httr GET user_agent content stop_for_status warn_for_status status_code
#' @importFrom jsonlite fromJSON
+#' @importFrom magick image_read
NULL
M README.Rmd => README.Rmd +10 -1
@@ 49,11 49,20 @@ x <- urlscan_search("domain:r-project.org")
bind_cols(
select(x$results$task, -options) %>%
mutate(user_agent = x$results$task$options$useragent)
- , x$results$stats, x$results$page
+ ,x$results$stats,
+ x$results$page
) %>%
+ mutate(id = x$results$`_id`) %>%
+ mutate(result_api_url = x$results$result) %>%
tbl_df() -> xdf
xdf
glimpse(xdf)
+
+ures <- urlscan_result(xdf$id[2], TRUE, TRUE)
+
+str(ures$scan_result, 2)
+
+ures$screenshot
```
\ No newline at end of file
M README.md => README.md +81 -5
@@ 57,14 57,17 @@ x <- urlscan_search("domain:r-project.org")
bind_cols(
select(x$results$task, -options) %>%
mutate(user_agent = x$results$task$options$useragent)
- , x$results$stats, x$results$page
+ ,x$results$stats,
+ x$results$page
) %>%
+ mutate(id = x$results$`_id`) %>%
+ mutate(result_api_url = x$results$result) %>%
tbl_df() -> xdf
xdf
```
- ## # A tibble: 12 x 20
+ ## # A tibble: 12 x 22
## visibility method time source url user_agent uniqIPs consoleMsgs dataLength encodedDataLeng… requests country
## <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int> <chr>
## 1 public manual 2017… web https… Mozilla/5.0… 1 0 12758 676 2 AT
@@ 79,15 82,15 @@ xdf
## 10 public manual 2017… web https… <NA> 2 0 285893 97695 6 AT
## 11 public automa… 2017… hacke… https… <NA> 1 0 343270 101327 4 AT
## 12 public automa… 2017… hacke… https… <NA> 1 0 345452 101840 4 AT
- ## # ... with 8 more variables: server <chr>, city <chr>, domain <chr>, ip <chr>, asnname <chr>, asn <chr>, url1 <chr>,
- ## # ptr <chr>
+ ## # ... with 10 more variables: server <chr>, city <chr>, domain <chr>, ip <chr>, asnname <chr>, asn <chr>, url1 <chr>,
+ ## # ptr <chr>, id <chr>, result_api_url <chr>
``` r
glimpse(xdf)
```
## Observations: 12
- ## Variables: 20
+ ## Variables: 22
## $ visibility <chr> "public", "public", "public", "public", "public", "public", "public", "public", "public",...
## $ method <chr> "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual", "manual",...
## $ time <chr> "2017-12-29T17:23:39.785Z", "2017-12-20T15:52:22.902Z", "2017-11-10T13:40:19.991Z", "2017...
@@ 108,3 111,76 @@ glimpse(xdf)
## $ asn <chr> "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776", "AS1776",...
## $ url1 <chr> "https://cran.r-project.org/web/packages/randomForest/index.html", "https://cran.r-projec...
## $ ptr <chr> "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "cran.wu-wien.ac.at", "...
+ ## $ id <chr> "d134c3b7-f306-4c7b-b2cb-c0f900793083", "075778b6-20f6-45a9-bb76-a80ac9bae1d2", "fbacb280...
+ ## $ result_api_url <chr> "https://urlscan.io/api/v1/result/d134c3b7-f306-4c7b-b2cb-c0f900793083", "https://urlscan...
+
+``` r
+ures <- urlscan_result(xdf$id[2], TRUE, TRUE)
+
+str(ures$scan_result, 2)
+```
+
+ ## List of 6
+ ## $ data :List of 6
+ ## ..$ requests:'data.frame': 2 obs. of 3 variables:
+ ## ..$ cookies : list()
+ ## ..$ console : list()
+ ## ..$ links : list()
+ ## ..$ timing :List of 6
+ ## ..$ globals :'data.frame': 2 obs. of 2 variables:
+ ## $ stats:List of 14
+ ## ..$ resourceStats :'data.frame': 2 obs. of 9 variables:
+ ## ..$ protocolStats :'data.frame': 1 obs. of 7 variables:
+ ## ..$ tlsStats :'data.frame': 1 obs. of 7 variables:
+ ## ..$ serverStats :'data.frame': 1 obs. of 6 variables:
+ ## ..$ domainStats :'data.frame': 1 obs. of 9 variables:
+ ## ..$ regDomainStats :'data.frame': 1 obs. of 9 variables:
+ ## ..$ secureRequests : int 2
+ ## ..$ securePercentage: int 100
+ ## ..$ IPv6Percentage : int 0
+ ## ..$ uniqCountries : int 1
+ ## ..$ totalLinks : int 0
+ ## ..$ malicious : int 0
+ ## ..$ adBlocked : int 0
+ ## ..$ ipStats :'data.frame': 1 obs. of 14 variables:
+ ## $ meta :List of 1
+ ## ..$ processors:List of 8
+ ## $ task :List of 11
+ ## ..$ uuid : chr "075778b6-20f6-45a9-bb76-a80ac9bae1d2"
+ ## ..$ time : chr "2017-12-20T15:52:22.902Z"
+ ## ..$ url : chr "https://cran.r-project.org/web/packages/e1071/"
+ ## ..$ visibility : chr "public"
+ ## ..$ options :List of 1
+ ## ..$ method : chr "manual"
+ ## ..$ source : chr "web"
+ ## ..$ userAgent : chr "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
+ ## ..$ reportURL : chr "https://urlscan.io/result/075778b6-20f6-45a9-bb76-a80ac9bae1d2/"
+ ## ..$ screenshotURL: chr "https://urlscan.io/screenshots/075778b6-20f6-45a9-bb76-a80ac9bae1d2.png"
+ ## ..$ domURL : chr "https://urlscan.io/dom/075778b6-20f6-45a9-bb76-a80ac9bae1d2/"
+ ## $ page :List of 9
+ ## ..$ url : chr "https://cran.r-project.org/web/packages/e1071/"
+ ## ..$ domain : chr "cran.r-project.org"
+ ## ..$ country: chr "AT"
+ ## ..$ city : chr "Vienna"
+ ## ..$ server : chr "Apache/2.4.10 (Debian)"
+ ## ..$ ip : chr "137.208.57.37"
+ ## ..$ ptr : chr "cran.wu-wien.ac.at"
+ ## ..$ asn : chr "AS1776"
+ ## ..$ asnname: chr "Welthandelsplatz 1, AT"
+ ## $ lists:List of 9
+ ## ..$ ips : chr "137.208.57.37"
+ ## ..$ countries : chr "AT"
+ ## ..$ asns : chr "1776"
+ ## ..$ domains : chr "cran.r-project.org"
+ ## ..$ servers : chr "Apache/2.4.10 (Debian)"
+ ## ..$ urls : chr [1:2] "https://cran.r-project.org/web/packages/e1071/" "https://cran.r-project.org/web/CRAN_web.css"
+ ## ..$ linkDomains : list()
+ ## ..$ certificates:'data.frame': 1 obs. of 5 variables:
+ ## ..$ hashes : chr [1:2] "48f7615c35fe15989530b1df31256a02340bed62069275c534a4222791eb23b2" "6a738f3da9f1203b5d765088a4ff4e4ac36c59fad008f450b808354d9625bc51"
+ ## - attr(*, "class")= chr [1:2] "urlscan_result" "list"
+
+``` r
+ures$screenshot
+```
+
+<!-- -->
A man/urlscan_result.Rd => man/urlscan_result.Rd +29 -0
@@ 0,0 1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/results.R
+\name{urlscan_result}
+\alias{urlscan_result}
+\title{Retrieve detailed results for a given scan ID}
+\usage{
+urlscan_result(scan_id, include_dom = FALSE, include_shot = FALSE)
+}
+\arguments{
+\item{scan_id}{scan id (UUID)}
+
+\item{include_dom}{(logical) include the website DOM? (default: \code{FALSE})}
+
+\item{include_shot}{(logical) include the website screen shot? (default: \code{FALSE})}
+}
+\value{
+\code{list} with \code{scan_result} task, page, content lists, fetch data,
+connection metadata and computed stats.\cr
+\cr
+The list can also include \code{dom} if
+\code{include_dom} is \code{TRUE}. If so, \code{dom} will be an \code{httr} \code{response} object
+since the data could be binary. Use \code{httr} tools to process it.\cr
+\cr
+The list can also include \code{screenshot} if \code{include_shot} is \code{TRUE} and
+a screenshot was available.
+}
+\description{
+Retrieve detailed results for a given scan ID
+}