~hrbrmstr/tdigest

1148b0466a748fc5b8d85ff6d55b08818d236512 — hrbrmstr 4 years ago 7169ba2
ALTREP
8 files changed, 69 insertions(+), 13 deletions(-)

M DESCRIPTION
M NEWS.md
M R/create.R
M README.Rmd
M README.md
M man/tdigest.Rd
M src/tdigest-main.c
M tests/testthat/test-tdigest.R
M DESCRIPTION => DESCRIPTION +2 -2
@@ 1,7 1,7 @@
Package: tdigest
Type: Package
Title: Wicked Fast, Accurate Quantiles Using 't-Digests'
Version: 0.2.0
Version: 0.3.0
Date: 2019-07-21
Authors@R: c(
    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 


@@ 31,7 31,7 @@ Suggests:
    testthat,
    covr
Depends:
    R (>= 3.2.0)
    R (>= 3.5.0)
Imports: 
    magrittr,
    stats

M NEWS.md => NEWS.md +4 -0
@@ 1,3 1,7 @@
0.3.0
* ALTREP-aware
* `length()` and `[` implemented for `tdigest` objects

0.2.0
* Added input vaildity checks
* Added `quantile()` function S3 implementation for `tdigest` objects

M R/create.R => R/create.R +4 -1
@@ 11,7 11,9 @@
#' The accuracy of quantile estimates produced by t-digests can be orders of
#' magnitude more accurate than those produced by previous digest algorithms.
#'
#' @param vec vector (will be converted to `double` if not already double)
#' @param vec vector (will be converted to `double` if not already double). NOTE that this
#'        is ALTREP-aware and will not materialize the passed-in object in order to
#'        add the values to the t-Digest.
#' @param compression the input compression value; should be >= 1.0; this
#'        will control how aggressively the TDigest compresses data together.
#'        The original t-Digest paper suggests using a value of 100 for a good


@@ 129,6 131,7 @@ td_add <- function(td, val, count) {
  stopifnot(inherits(td, "tdigest"))
  stopifnot(!is_null_xptr(td))
  val <- as.double(val[1])
  stopifnot(!is.na(val))
  count <- as.double(count[1])
  .Call("Rtd_add", tdig=td, val=val, count=count, PACKAGE="tdigest")
}

M README.Rmd => README.Rmd +12 -0
@@ 88,6 88,18 @@ tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
quantile(td)
```

#### ALTREP-aware

```{r}
N <- 1000000
x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0

td <- tdigest(x.altrep)
td[0.1]
td[0.5]
length(td)
```

#### Proof it's faster

```{r}

M README.md => README.md +21 -6
@@ 122,6 122,21 @@ quantile(td)
## [1]   0.00000  24.74751  49.99666  75.24783 100.00000
```

#### ALTREP-aware

``` r
N <- 1000000
x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0

td <- tdigest(x.altrep)
td[0.1]
## [1] 93051
td[0.5]
## [1] 491472.5
length(td)
## [1] 1000000
```

#### Proof it’s faster

``` r


@@ 130,18 145,18 @@ microbenchmark::microbenchmark(
  r_quantile = quantile(x, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
)
## Unit: microseconds
##        expr       min         lq        mean   median         uq        max neval cld
##     tdigest     8.994    10.5205    20.97971    12.20    33.1645     37.448   100  a 
##  r_quantile 52181.822 53123.4685 55258.30131 53593.93 56231.9310 104215.008   100   b
##        expr       min         lq        mean    median         uq       max neval cld
##     tdigest     8.227     9.4895    21.66915    12.509    33.6245    69.111   100  a 
##  r_quantile 53792.878 54695.9560 56684.11386 55361.924 57719.2745 99458.184   100   b
```

## tdigest Metrics

| Lang         | \# Files | (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
| :----------- | -------: | --: | --: | ---: | ----------: | ---: | -------: | ---: |
| C            |        3 | 0.3 | 337 | 0.67 |          45 | 0.38 |       26 | 0.11 |
| R            |        5 | 0.5 | 128 | 0.25 |          28 | 0.23 |      133 | 0.58 |
| Rmd          |        1 | 0.1 |  30 | 0.06 |          37 | 0.31 |       42 | 0.18 |
| C            |        3 | 0.3 | 347 | 0.66 |          45 | 0.36 |       26 | 0.11 |
| R            |        5 | 0.5 | 136 | 0.26 |          31 | 0.25 |      135 | 0.58 |
| Rmd          |        1 | 0.1 |  36 | 0.07 |          40 | 0.32 |       45 | 0.19 |
| C/C++ Header |        1 | 0.1 |  10 | 0.02 |          10 | 0.08 |       28 | 0.12 |

## Code of Conduct

M man/tdigest.Rd => man/tdigest.Rd +3 -1
@@ 10,7 10,9 @@ tdigest(vec, compression = 100)
\method{print}{tdigest}(x, ...)
}
\arguments{
\item{vec}{vector (will be converted to \code{double} if not already double)}
\item{vec}{vector (will be converted to \code{double} if not already double). NOTE that this
is ALTREP-aware and will not materialize the passed-in object in order to
add the values to the t-Digest.}

\item{compression}{the input compression value; should be >= 1.0; this
will control how aggressively the TDigest compresses data together.

M src/tdigest-main.c => src/tdigest-main.c +13 -3
@@ 1,6 1,8 @@
#include <R.h>
#include <Rinternals.h>
#include <Rdefines.h>
#include <R_ext/Altrep.h>
#include <R_ext/Itermacros.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>


@@ 45,9 47,17 @@ SEXP Rtdig(SEXP vec, SEXP compression) {
  SEXP ptr;
  td_histogram_t *t = td_new(asReal(compression));
  if (t) {
    R_xlen_t n = xlength(vec);
    for (R_xlen_t i = 0; i < n; i++) {
      td_add(t, REAL(vec)[i], 1);
    R_xlen_t n = Rf_xlength(vec);
    if (ALTREP(vec)) {
      ITERATE_BY_REGION(vec, x, i, nbatch, double, REAL, {
        for (R_xlen_t k = 0; k < nbatch; k++) {
          if (!ISNAN(x[k])) td_add(t, x[k], 1);
        }
      });
    } else {
      for (R_xlen_t i = 0; i < n; i++) {
        if (!ISNAN(REAL(vec)[i])) td_add(t, REAL(vec)[i], 1);
      }
    }
    ptr = R_MakeExternalPtr(t, install("tdigest"), R_NilValue);
    R_RegisterCFinalizerEx(ptr, td_finalizer, TRUE);

M tests/testthat/test-tdigest.R => tests/testthat/test-tdigest.R +10 -0
@@ 53,3 53,13 @@ expect_equal(
  c(0, 25, 51, 76, 100),
  tolerance = 3
)

context("ALTREP test")

N <- 1000000
x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0

td <- tdigest(x.altrep)
expect_equal(as.integer(td[0.1]), 93051)
expect_equal(as.integer(td[0.5]), 491472)
expect_equal(length(td), 1000000)