From 1148b0466a748fc5b8d85ff6d55b08818d236512 Mon Sep 17 00:00:00 2001 From: hrbrmstr Date: Sun, 21 Jul 2019 13:54:54 -0400 Subject: [PATCH] ALTREP --- DESCRIPTION | 4 ++-- NEWS.md | 4 ++++ R/create.R | 5 ++++- README.Rmd | 12 ++++++++++++ README.md | 27 +++++++++++++++++++++------ man/tdigest.Rd | 4 +++- src/tdigest-main.c | 16 +++++++++++++--- tests/testthat/test-tdigest.R | 10 ++++++++++ 8 files changed, 69 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 877b765..4021453 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tdigest Type: Package Title: Wicked Fast, Accurate Quantiles Using 't-Digests' -Version: 0.2.0 +Version: 0.3.0 Date: 2019-07-21 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), @@ -31,7 +31,7 @@ Suggests: testthat, covr Depends: - R (>= 3.2.0) + R (>= 3.5.0) Imports: magrittr, stats diff --git a/NEWS.md b/NEWS.md index 1f1e39e..160d7c5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +0.3.0 +* ALTREP-aware +* `length()` and `[` implemented for `tdigest` objects + 0.2.0 * Added input vaildity checks * Added `quantile()` function S3 implementation for `tdigest` objects diff --git a/R/create.R b/R/create.R index 2428725..ce2ea6a 100644 --- a/R/create.R +++ b/R/create.R @@ -11,7 +11,9 @@ #' The accuracy of quantile estimates produced by t-digests can be orders of #' magnitude more accurate than those produced by previous digest algorithms. #' -#' @param vec vector (will be converted to `double` if not already double) +#' @param vec vector (will be converted to `double` if not already double). NOTE that this +#' is ALTREP-aware and will not materialize the passed-in object in order to +#' add the values to the t-Digest. #' @param compression the input compression value; should be >= 1.0; this #' will control how aggressively the TDigest compresses data together. #' The original t-Digest paper suggests using a value of 100 for a good @@ -129,6 +131,7 @@ td_add <- function(td, val, count) { stopifnot(inherits(td, "tdigest")) stopifnot(!is_null_xptr(td)) val <- as.double(val[1]) + stopifnot(!is.na(val)) count <- as.double(count[1]) .Call("Rtd_add", tdig=td, val=val, count=count, PACKAGE="tdigest") } diff --git a/README.Rmd b/README.Rmd index be4016e..0d02744 100644 --- a/README.Rmd +++ b/README.Rmd @@ -88,6 +88,18 @@ tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1)) quantile(td) ``` +#### ALTREP-aware + +```{r} +N <- 1000000 +x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0 + +td <- tdigest(x.altrep) +td[0.1] +td[0.5] +length(td) +``` + #### Proof it's faster ```{r} diff --git a/README.md b/README.md index fb29e10..a4ce750 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,21 @@ quantile(td) ## [1] 0.00000 24.74751 49.99666 75.24783 100.00000 ``` +#### ALTREP-aware + +``` r +N <- 1000000 +x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0 + +td <- tdigest(x.altrep) +td[0.1] +## [1] 93051 +td[0.5] +## [1] 491472.5 +length(td) +## [1] 1000000 +``` + #### Proof it’s faster ``` r @@ -130,18 +145,18 @@ microbenchmark::microbenchmark( r_quantile = quantile(x, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1)) ) ## Unit: microseconds -## expr min lq mean median uq max neval cld -## tdigest 8.994 10.5205 20.97971 12.20 33.1645 37.448 100 a -## r_quantile 52181.822 53123.4685 55258.30131 53593.93 56231.9310 104215.008 100 b +## expr min lq mean median uq max neval cld +## tdigest 8.227 9.4895 21.66915 12.509 33.6245 69.111 100 a +## r_quantile 53792.878 54695.9560 56684.11386 55361.924 57719.2745 99458.184 100 b ``` ## tdigest Metrics | Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) | | :----------- | -------: | --: | --: | ---: | ----------: | ---: | -------: | ---: | -| C | 3 | 0.3 | 337 | 0.67 | 45 | 0.38 | 26 | 0.11 | -| R | 5 | 0.5 | 128 | 0.25 | 28 | 0.23 | 133 | 0.58 | -| Rmd | 1 | 0.1 | 30 | 0.06 | 37 | 0.31 | 42 | 0.18 | +| C | 3 | 0.3 | 347 | 0.66 | 45 | 0.36 | 26 | 0.11 | +| R | 5 | 0.5 | 136 | 0.26 | 31 | 0.25 | 135 | 0.58 | +| Rmd | 1 | 0.1 | 36 | 0.07 | 40 | 0.32 | 45 | 0.19 | | C/C++ Header | 1 | 0.1 | 10 | 0.02 | 10 | 0.08 | 28 | 0.12 | ## Code of Conduct diff --git a/man/tdigest.Rd b/man/tdigest.Rd index d0fbe05..e2a073d 100644 --- a/man/tdigest.Rd +++ b/man/tdigest.Rd @@ -10,7 +10,9 @@ tdigest(vec, compression = 100) \method{print}{tdigest}(x, ...) } \arguments{ -\item{vec}{vector (will be converted to \code{double} if not already double)} +\item{vec}{vector (will be converted to \code{double} if not already double). NOTE that this +is ALTREP-aware and will not materialize the passed-in object in order to +add the values to the t-Digest.} \item{compression}{the input compression value; should be >= 1.0; this will control how aggressively the TDigest compresses data together. diff --git a/src/tdigest-main.c b/src/tdigest-main.c index 2034e12..e29a41a 100644 --- a/src/tdigest-main.c +++ b/src/tdigest-main.c @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include @@ -45,9 +47,17 @@ SEXP Rtdig(SEXP vec, SEXP compression) { SEXP ptr; td_histogram_t *t = td_new(asReal(compression)); if (t) { - R_xlen_t n = xlength(vec); - for (R_xlen_t i = 0; i < n; i++) { - td_add(t, REAL(vec)[i], 1); + R_xlen_t n = Rf_xlength(vec); + if (ALTREP(vec)) { + ITERATE_BY_REGION(vec, x, i, nbatch, double, REAL, { + for (R_xlen_t k = 0; k < nbatch; k++) { + if (!ISNAN(x[k])) td_add(t, x[k], 1); + } + }); + } else { + for (R_xlen_t i = 0; i < n; i++) { + if (!ISNAN(REAL(vec)[i])) td_add(t, REAL(vec)[i], 1); + } } ptr = R_MakeExternalPtr(t, install("tdigest"), R_NilValue); R_RegisterCFinalizerEx(ptr, td_finalizer, TRUE); diff --git a/tests/testthat/test-tdigest.R b/tests/testthat/test-tdigest.R index ce30d7e..07360b3 100644 --- a/tests/testthat/test-tdigest.R +++ b/tests/testthat/test-tdigest.R @@ -53,3 +53,13 @@ expect_equal( c(0, 25, 51, 76, 100), tolerance = 3 ) + +context("ALTREP test") + +N <- 1000000 +x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0 + +td <- tdigest(x.altrep) +expect_equal(as.integer(td[0.1]), 93051) +expect_equal(as.integer(td[0.5]), 491472) +expect_equal(length(td), 1000000) -- 2.45.2