M DESCRIPTION => DESCRIPTION +2 -2
@@ 1,7 1,7 @@
Package: tdigest
Type: Package
Title: Wicked Fast, Accurate Quantiles Using 't-Digests'
-Version: 0.2.0
+Version: 0.3.0
Date: 2019-07-21
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
@@ 31,7 31,7 @@ Suggests:
testthat,
covr
Depends:
- R (>= 3.2.0)
+ R (>= 3.5.0)
Imports:
magrittr,
stats
M NEWS.md => NEWS.md +4 -0
@@ 1,3 1,7 @@
+0.3.0
+* ALTREP-aware
+* `length()` and `[` implemented for `tdigest` objects
+
0.2.0
* Added input vaildity checks
* Added `quantile()` function S3 implementation for `tdigest` objects
M R/create.R => R/create.R +4 -1
@@ 11,7 11,9 @@
#' The accuracy of quantile estimates produced by t-digests can be orders of
#' magnitude more accurate than those produced by previous digest algorithms.
#'
-#' @param vec vector (will be converted to `double` if not already double)
+#' @param vec vector (will be converted to `double` if not already double). NOTE that this
+#' is ALTREP-aware and will not materialize the passed-in object in order to
+#' add the values to the t-Digest.
#' @param compression the input compression value; should be >= 1.0; this
#' will control how aggressively the TDigest compresses data together.
#' The original t-Digest paper suggests using a value of 100 for a good
@@ 129,6 131,7 @@ td_add <- function(td, val, count) {
stopifnot(inherits(td, "tdigest"))
stopifnot(!is_null_xptr(td))
val <- as.double(val[1])
+ stopifnot(!is.na(val))
count <- as.double(count[1])
.Call("Rtd_add", tdig=td, val=val, count=count, PACKAGE="tdigest")
}
M README.Rmd => README.Rmd +12 -0
@@ 88,6 88,18 @@ tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
quantile(td)
```
+#### ALTREP-aware
+
+```{r}
+N <- 1000000
+x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0
+
+td <- tdigest(x.altrep)
+td[0.1]
+td[0.5]
+length(td)
+```
+
#### Proof it's faster
```{r}
M README.md => README.md +21 -6
@@ 122,6 122,21 @@ quantile(td)
## [1] 0.00000 24.74751 49.99666 75.24783 100.00000
```
+#### ALTREP-aware
+
+``` r
+N <- 1000000
+x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0
+
+td <- tdigest(x.altrep)
+td[0.1]
+## [1] 93051
+td[0.5]
+## [1] 491472.5
+length(td)
+## [1] 1000000
+```
+
#### Proof it’s faster
``` r
@@ 130,18 145,18 @@ microbenchmark::microbenchmark(
r_quantile = quantile(x, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
)
## Unit: microseconds
-## expr min lq mean median uq max neval cld
-## tdigest 8.994 10.5205 20.97971 12.20 33.1645 37.448 100 a
-## r_quantile 52181.822 53123.4685 55258.30131 53593.93 56231.9310 104215.008 100 b
+## expr min lq mean median uq max neval cld
+## tdigest 8.227 9.4895 21.66915 12.509 33.6245 69.111 100 a
+## r_quantile 53792.878 54695.9560 56684.11386 55361.924 57719.2745 99458.184 100 b
```
## tdigest Metrics
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :----------- | -------: | --: | --: | ---: | ----------: | ---: | -------: | ---: |
-| C | 3 | 0.3 | 337 | 0.67 | 45 | 0.38 | 26 | 0.11 |
-| R | 5 | 0.5 | 128 | 0.25 | 28 | 0.23 | 133 | 0.58 |
-| Rmd | 1 | 0.1 | 30 | 0.06 | 37 | 0.31 | 42 | 0.18 |
+| C | 3 | 0.3 | 347 | 0.66 | 45 | 0.36 | 26 | 0.11 |
+| R | 5 | 0.5 | 136 | 0.26 | 31 | 0.25 | 135 | 0.58 |
+| Rmd | 1 | 0.1 | 36 | 0.07 | 40 | 0.32 | 45 | 0.19 |
| C/C++ Header | 1 | 0.1 | 10 | 0.02 | 10 | 0.08 | 28 | 0.12 |
## Code of Conduct
M man/tdigest.Rd => man/tdigest.Rd +3 -1
@@ 10,7 10,9 @@ tdigest(vec, compression = 100)
\method{print}{tdigest}(x, ...)
}
\arguments{
-\item{vec}{vector (will be converted to \code{double} if not already double)}
+\item{vec}{vector (will be converted to \code{double} if not already double). NOTE that this
+is ALTREP-aware and will not materialize the passed-in object in order to
+add the values to the t-Digest.}
\item{compression}{the input compression value; should be >= 1.0; this
will control how aggressively the TDigest compresses data together.
M src/tdigest-main.c => src/tdigest-main.c +13 -3
@@ 1,6 1,8 @@
#include <R.h>
#include <Rinternals.h>
#include <Rdefines.h>
+#include <R_ext/Altrep.h>
+#include <R_ext/Itermacros.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
@@ 45,9 47,17 @@ SEXP Rtdig(SEXP vec, SEXP compression) {
SEXP ptr;
td_histogram_t *t = td_new(asReal(compression));
if (t) {
- R_xlen_t n = xlength(vec);
- for (R_xlen_t i = 0; i < n; i++) {
- td_add(t, REAL(vec)[i], 1);
+ R_xlen_t n = Rf_xlength(vec);
+ if (ALTREP(vec)) {
+ ITERATE_BY_REGION(vec, x, i, nbatch, double, REAL, {
+ for (R_xlen_t k = 0; k < nbatch; k++) {
+ if (!ISNAN(x[k])) td_add(t, x[k], 1);
+ }
+ });
+ } else {
+ for (R_xlen_t i = 0; i < n; i++) {
+ if (!ISNAN(REAL(vec)[i])) td_add(t, REAL(vec)[i], 1);
+ }
}
ptr = R_MakeExternalPtr(t, install("tdigest"), R_NilValue);
R_RegisterCFinalizerEx(ptr, td_finalizer, TRUE);
M tests/testthat/test-tdigest.R => tests/testthat/test-tdigest.R +10 -0
@@ 53,3 53,13 @@ expect_equal(
c(0, 25, 51, 76, 100),
tolerance = 3
)
+
+context("ALTREP test")
+
+N <- 1000000
+x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0
+
+td <- tdigest(x.altrep)
+expect_equal(as.integer(td[0.1]), 93051)
+expect_equal(as.integer(td[0.5]), 491472)
+expect_equal(length(td), 1000000)