M .Rbuildignore => .Rbuildignore +1 -0
@@ 9,6 9,7 @@
^NOTES\.*html$
^\.codecov\.yml$
^README_files$
+^README_cache$
^doc$
^docs$
^tmp$
M DESCRIPTION => DESCRIPTION +2 -2
@@ 1,8 1,8 @@
Package: tdigest
Type: Package
Title: Wicked Fast, Accurate Quantiles Using t-Digests
-Version: 0.3.0
-Date: 2019-07-25
+Version: 0.4.0
+Date: 2019-08-20
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
M NAMESPACE => NAMESPACE +2 -0
@@ 1,10 1,12 @@
# Generated by roxygen2: do not edit by hand
S3method("[",tdigest)
+S3method(as.list,tdigest)
S3method(length,tdigest)
S3method(print,tdigest)
S3method(quantile,tdigest)
export("%>%")
+export(as_tdigest)
export(is_tdigest)
export(td_add)
export(td_create)
M NEWS.md => NEWS.md +3 -0
@@ 1,3 1,6 @@
+0.4.0
+* Serialization & deserialization of tdigest objects
+
0.3.0
* ALTREP-aware
* `length()` and `[` implemented for `tdigest` objects
M R/create.R => R/create.R +29 -0
@@ 208,3 208,32 @@ length.tdigest <- function(x) {
if ((i<=0) || (i>1)) return(NULL)
td_value_at(x, i)
}
+
+#' Serialize a tdigest object to an R list or unserialize a serialized tdigest
+#' list back into a tdigest object
+#'
+#' These functions make it possible to create & populate a tdigest, serialize it out,
+#' read it in at a later time, and continue populating it, enabling compact
+#' distribution accumulation & storage for large, "continuous" datasets.
+#'
+#' @param x a tdigest object or a tdigest_list object
+#' @param ... unused
+#' @export
+#' @examples
+#' set.seed(1492)
+#' x <- sample(0:100, 1000000, replace = TRUE)
+#' td <- tdigest(x, 1000)
+#' as_tdigest(as.list(td))
+as.list.tdigest <- function(x, ...) {
+ stopifnot(inherits(x, "tdigest"))
+  out <- .Call("Rg_toR", x)
+  # NOTE(review): Rg_toR returns NULL when the external pointer is stale
+  # (e.g. a tdigest restored via saveRDS/readRDS); `class(NULL) <-` then
+  # errors cryptically — consider an explicit is.null() check with a
+  # clearer message. TODO confirm intended behavior for stale pointers.
+  class(out) <- c("tdigest_list", "list")
+  out
+}
+
+#' @rdname as.list.tdigest
+#' @export
+as_tdigest <- function(x) {
+ stopifnot(inherits(x, "tdigest_list"))
+ .Call("Rg_fromR", x)
+}
M README.Rmd => README.Rmd +25 -4
@@ 56,7 56,7 @@ packageVersion("tdigest")
### Basic (Low-level interface)
-```{r}
+```{r basic}
td <- td_create(10)
td
@@ 76,7 76,7 @@ quantile(td)
#### Bigger (and Vectorised)
-```{r}
+```{r bigger}
td <- tdigest(c(0, 10), 10)
is_tdigest(td)
@@ 95,9 95,30 @@ tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
quantile(td)
```
+#### Serialization
+
+These [de]serialization functions make it possible to create & populate a tdigest,
+serialize it out, read it in at a later time, and continue populating it, enabling
+compact distribution accumulation & storage for large, "continuous" datasets.
+
+```{r serialize}
+set.seed(1492)
+x <- sample(0:100, 1000000, replace = TRUE)
+td <- tdigest(x, 1000)
+
+tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
+
+str(in_r <- as.list(td), 1)
+
+td2 <- as_tdigest(in_r)
+tquantile(td2, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
+
+identical(in_r, as.list(td2))
+```
+
#### ALTREP-aware
-```{r}
+```{r altrep}
N <- 1000000
x.altrep <- seq_len(N) # this is an ALTREP in R version >= 3.5.0
@@ 109,7 130,7 @@ length(td)
#### Proof it's faster
-```{r}
+```{r benchmark, cache=TRUE}
microbenchmark::microbenchmark(
tdigest = tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1)),
r_quantile = quantile(x, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
M README.md => README.md +46 -8
@@ 46,6 46,8 @@ Ertl](https://arxiv.org/abs/1902.04023) for more details on t-Digests.
The following functions are implemented:
+ - `as.list.tdigest`: Serialize a tdigest object to an R list or
+ unserialize a serialized tdigest list back into a tdigest object
- `td_add`: Add a value to the t-Digest with the specified count
- `td_create`: Allocate a new histogram
- `td_merge`: Merge one t-Digest into another
@@ 80,7 82,7 @@ library(tdigest)
# current version
packageVersion("tdigest")
-## [1] '0.3.0'
+## [1] '0.4.0'
```
### Basic (Low-level interface)
@@ 138,6 140,42 @@ quantile(td)
## [1] 0.00000 24.74751 49.99666 75.24783 100.00000
```
+#### Serialization
+
+These \[de\]serialization functions make it possible to create &
+populate a tdigest, serialize it out, read it in at a later time, and
+continue populating it, enabling compact distribution accumulation &
+storage for large, “continuous” datasets.
+
+``` r
+set.seed(1492)
+x <- sample(0:100, 1000000, replace = TRUE)
+td <- tdigest(x, 1000)
+
+tquantile(td, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
+## [1] 0.0000000 0.8099857 9.6725790 19.7533723 29.7448283 39.7544675 49.9966628 60.0235148 70.2067574
+## [10] 80.3090454 90.2594642 99.4269454 100.0000000
+
+str(in_r <- as.list(td), 1)
+## List of 7
+## $ compression : num 1000
+## $ cap : int 6010
+## $ merged_nodes : int 226
+## $ unmerged_nodes: int 0
+## $ merged_count : num 1e+06
+## $ unmerged_count: num 0
+## $ nodes :List of 2
+## - attr(*, "class")= chr [1:2] "tdigest_list" "list"
+
+td2 <- as_tdigest(in_r)
+tquantile(td2, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
+## [1] 0.0000000 0.8099857 9.6725790 19.7533723 29.7448283 39.7544675 49.9966628 60.0235148 70.2067574
+## [10] 80.3090454 90.2594642 99.4269454 100.0000000
+
+identical(in_r, as.list(td2))
+## [1] TRUE
+```
+
#### ALTREP-aware
``` r
@@ 161,19 199,19 @@ microbenchmark::microbenchmark(
r_quantile = quantile(x, c(0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1))
)
## Unit: microseconds
-## expr min lq mean median uq max neval cld
-## tdigest 7.868 9.23 19.94667 10.203 32.59 34.672 100 a
-## r_quantile 52673.523 53300.41 55414.58517 53616.418 56251.37 108089.810 100 b
+## expr min lq mean median uq max neval cld
+## tdigest 8.02 9.175 20.2545 10.185 32.682 43.003 100 a
+## r_quantile 52657.60 53307.742 55924.6932 54093.988 56487.027 108778.946 100 b
```
## tdigest Metrics
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :----------- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
-| C | 3 | 0.27 | 350 | 0.65 | 46 | 0.36 | 26 | 0.11 |
-| R | 6 | 0.55 | 140 | 0.26 | 31 | 0.24 | 139 | 0.57 |
-| Rmd | 1 | 0.09 | 36 | 0.07 | 40 | 0.31 | 52 | 0.21 |
-| C/C++ Header | 1 | 0.09 | 10 | 0.02 | 10 | 0.08 | 26 | 0.11 |
+| C | 3 | 0.27 | 484 | 0.68 | 77 | 0.44 | 46 | 0.16 |
+| R | 6 | 0.55 | 157 | 0.22 | 35 | 0.20 | 156 | 0.54 |
+| Rmd | 1 | 0.09 | 44 | 0.06 | 47 | 0.27 | 58 | 0.20 |
+| C/C++ Header | 1 | 0.09 | 24 | 0.03 | 16 | 0.09 | 30 | 0.10 |
## Code of Conduct
A man/as.list.tdigest.Rd => man/as.list.tdigest.Rd +28 -0
@@ 0,0 1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create.R
+\name{as.list.tdigest}
+\alias{as.list.tdigest}
+\alias{as_tdigest}
+\title{Serialize a tdigest object to an R list or unserialize a serialized tdigest
+list back into a tdigest object}
+\usage{
+\method{as.list}{tdigest}(x, ...)
+
+as_tdigest(x)
+}
+\arguments{
+\item{x}{a tdigest object or a tdigest_list object}
+
+\item{...}{unused}
+}
+\description{
+These functions make it possible to create & populate a tdigest, serialize it out,
+read it in at a later time, and continue populating it, enabling compact
+distribution accumulation & storage for large, "continuous" datasets.
+}
+\examples{
+set.seed(1492)
+x <- sample(0:100, 1000000, replace = TRUE)
+td <- tdigest(x, 1000)
+as_tdigest(as.list(td))
+}
M src/init.c => src/init.c +33 -18
@@ 3,10 3,6 @@
#include <stdlib.h> // for NULL
#include <R_ext/Rdynload.h>
-/* FIXME:
- Check these declarations against the C/Fortran source code.
-*/
-
/* .Call calls */
extern SEXP is_null_xptr_(SEXP);
extern SEXP Rtd_add(SEXP, SEXP, SEXP);
@@ 17,22 13,41 @@ extern SEXP Rtd_total_count(SEXP);
extern SEXP Rtd_value_at(SEXP, SEXP);
extern SEXP Rtdig(SEXP, SEXP);
extern SEXP Rtquant(SEXP, SEXP);
+extern SEXP Rg_compression(SEXP);
+extern SEXP Rg_cap(SEXP);
+extern SEXP Rg_merged_nodes(SEXP);
+extern SEXP Rg_unmerged_nodes(SEXP);
+extern SEXP Rg_merged_count(SEXP);
+extern SEXP Rg_unmerged_count(SEXP);
+extern SEXP Rg_nodes_mean(SEXP);
+extern SEXP Rg_nodes_count(SEXP);
+extern SEXP Rg_toR(SEXP);
+extern SEXP Rg_fromR(SEXP);
static const R_CallMethodDef CallEntries[] = {
- {"is_null_xptr_", (DL_FUNC) &is_null_xptr_, 1},
- {"Rtd_add", (DL_FUNC) &Rtd_add, 3},
- {"Rtd_create", (DL_FUNC) &Rtd_create, 1},
- {"Rtd_merge", (DL_FUNC) &Rtd_merge, 2},
- {"Rtd_quantile_of", (DL_FUNC) &Rtd_quantile_of, 2},
- {"Rtd_total_count", (DL_FUNC) &Rtd_total_count, 1},
- {"Rtd_value_at", (DL_FUNC) &Rtd_value_at, 2},
- {"Rtdig", (DL_FUNC) &Rtdig, 2},
- {"Rtquant", (DL_FUNC) &Rtquant, 2},
- {NULL, NULL, 0}
+ {"is_null_xptr_", (DL_FUNC) &is_null_xptr_, 1},
+ {"Rtd_add", (DL_FUNC) &Rtd_add, 3},
+ {"Rtd_create", (DL_FUNC) &Rtd_create, 1},
+ {"Rtd_merge", (DL_FUNC) &Rtd_merge, 2},
+ {"Rtd_quantile_of", (DL_FUNC) &Rtd_quantile_of, 2},
+ {"Rtd_total_count", (DL_FUNC) &Rtd_total_count, 1},
+ {"Rtd_value_at", (DL_FUNC) &Rtd_value_at, 2},
+ {"Rtdig", (DL_FUNC) &Rtdig, 2},
+ {"Rtquant", (DL_FUNC) &Rtquant, 2},
+ {"Rg_compression", (DL_FUNC) &Rg_compression, 1},
+ {"Rg_cap", (DL_FUNC) &Rg_cap, 1},
+ {"Rg_merged_nodes", (DL_FUNC) &Rg_merged_nodes, 1},
+ {"Rg_unmerged_nodes", (DL_FUNC) &Rg_unmerged_nodes, 1},
+ {"Rg_merged_count", (DL_FUNC) &Rg_merged_count, 1},
+ {"Rg_unmerged_count", (DL_FUNC) &Rg_unmerged_count, 1},
+ {"Rg_nodes_mean", (DL_FUNC) &Rg_nodes_mean, 1},
+ {"Rg_nodes_count", (DL_FUNC) &Rg_nodes_count, 1},
+ {"Rg_toR", (DL_FUNC) &Rg_toR, 1},
+ {"Rg_fromR", (DL_FUNC) &Rg_fromR, 1},
+ {NULL, NULL, 0}
};
-void R_init_tdigest(DllInfo *dll)
-{
- R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
- R_useDynamicSymbols(dll, FALSE);
+void R_init_tdigest(DllInfo *dll) {
+ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
}
M src/tdigest-main.c => src/tdigest-main.c +168 -0
@@ 11,6 11,17 @@
#include "tdigest.h"
+#define O_COMPRESSION 0
+#define O_CAP 1
+#define O_MNODES 2
+#define O_UMNODES 3
+#define O_MCOUNT 4
+#define O_UMCOUNT 5
+#define O_OUT_NODES 6
+
+#define O_MEANS 0
+#define O_COUNTS 1
+
// next 2 ƒ() via <https://github.com/randy3k/xptr/blob/master/src/xptr.c>
void check_is_xptr(SEXP s) {
@@ 125,3 136,160 @@ SEXP Rtd_merge(SEXP from, SEXP into) {
}
return(R_NilValue);
}
+
+SEXP Rg_compression(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ return(f ? ScalarReal(f->compression) : R_NilValue);
+}
+
+SEXP Rg_cap(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ return(f ? ScalarInteger(f->cap) : R_NilValue);
+}
+
+SEXP Rg_merged_nodes(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ return(f ? ScalarInteger(f->merged_nodes) : R_NilValue);
+}
+
+SEXP Rg_unmerged_nodes(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ return(f ? ScalarInteger(f->unmerged_nodes) : R_NilValue);
+}
+
+SEXP Rg_merged_count(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ return(f ? ScalarReal(f->merged_count) : R_NilValue);
+}
+
+SEXP Rg_unmerged_count(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ return(f ? ScalarReal(f->unmerged_count) : R_NilValue);
+}
+
+SEXP Rg_nodes_mean(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ if (f) {
+ int N = f->merged_nodes + f->unmerged_nodes;
+ SEXP out = PROTECT(allocVector(REALSXP, N));
+ for (int i=0; i<N; i++) {
+ REAL(out)[i] = f->nodes[i].mean;
+ }
+ UNPROTECT(1);
+ return(out);
+ } else {
+ return(R_NilValue);
+ }
+}
+
+SEXP Rg_nodes_count(SEXP from) {
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+ if (f) {
+ int N = f->merged_nodes + f->unmerged_nodes;
+ SEXP out = PROTECT(allocVector(REALSXP, N));
+ for (int i=0; i<N; i++) {
+ REAL(out)[i] = f->nodes[i].count;
+ }
+ UNPROTECT(1);
+ return(out);
+ } else {
+ return(R_NilValue);
+ }
+}
+
+SEXP Rg_toR(SEXP from) {
+
+ td_histogram_t *f = (td_histogram_t *)R_ExternalPtrAddr(from);
+
+ if (f) {
+
+ SEXP o_cap = PROTECT(ScalarInteger(f->cap));
+ SEXP o_compression = PROTECT(ScalarReal(f->compression));
+ SEXP o_mcount = PROTECT(ScalarReal(f->merged_count));
+ SEXP o_umcount = PROTECT(ScalarReal(f->unmerged_count));
+ SEXP o_mnodes = PROTECT(ScalarInteger(f->merged_nodes));
+ SEXP o_umnodes = PROTECT(ScalarInteger(f->unmerged_nodes));
+
+ int N = f->merged_nodes + f->unmerged_nodes;
+
+ SEXP o_means = PROTECT(allocVector(REALSXP, N));
+ SEXP o_counts = PROTECT(allocVector(REALSXP, N));
+
+ for (int i=0; i<N; i++) {
+ REAL(o_means)[i] = f->nodes[i].mean;
+ REAL(o_counts)[i] = f->nodes[i].count;
+ }
+
+ const char *names[] = {
+ "compression",
+ "cap",
+ "merged_nodes",
+ "unmerged_nodes",
+ "merged_count",
+ "unmerged_count",
+ "nodes",
+ ""
+ };
+
+ SEXP out = PROTECT(mkNamed(VECSXP, names));
+
+ SET_VECTOR_ELT(out, O_COMPRESSION, o_compression);
+ SET_VECTOR_ELT(out, O_CAP, o_cap);
+ SET_VECTOR_ELT(out, O_MNODES, o_mnodes);
+ SET_VECTOR_ELT(out, O_UMNODES, o_umnodes);
+ SET_VECTOR_ELT(out, O_MCOUNT, o_mcount);
+ SET_VECTOR_ELT(out, O_UMCOUNT, o_umcount);
+
+  const char *node_names[] = {
+    "means",
+    "counts",
+    ""
+  };
+
+  SEXP out_nodes = PROTECT(mkNamed(VECSXP, node_names));
+
+  /* element order must agree with O_MEANS (0) / O_COUNTS (1) read back by
+     Rg_fromR; previously the names were listed as counts,means while means
+     was stored at slot 0, so the serialized list's names were swapped */
+  SET_VECTOR_ELT(out_nodes, O_MEANS, o_means);
+  SET_VECTOR_ELT(out_nodes, O_COUNTS, o_counts);
+
+ SET_VECTOR_ELT(out, O_OUT_NODES, out_nodes);
+
+ UNPROTECT(10);
+
+ return(out);
+
+ } else {
+ return(R_NilValue);
+ }
+
+}
+
+SEXP Rg_fromR(SEXP td_list) {
+
+ SEXP out = PROTECT(Rtd_create(VECTOR_ELT(td_list, O_COMPRESSION)));
+
+ td_histogram_t *t = (td_histogram_t *)R_ExternalPtrAddr(out);
+
+ t->compression = asReal(VECTOR_ELT(td_list, O_COMPRESSION));
+ t->cap = asInteger(VECTOR_ELT(td_list, O_CAP));
+ t->merged_nodes = asInteger(VECTOR_ELT(td_list, O_MNODES));
+ t->unmerged_nodes = asInteger(VECTOR_ELT(td_list, O_UMNODES));
+ t->merged_count = asReal(VECTOR_ELT(td_list, O_MCOUNT));
+ t->unmerged_count = asReal(VECTOR_ELT(td_list, O_UMCOUNT));
+
+  /* NOTE(review): td_list is user-supplied; if merged_nodes + unmerged_nodes
+     exceeds the capacity Rtd_create allocated from `compression`, the node
+     copy loop below writes past the buffer — consider validating N <= t->cap
+     (and the lengths of the means/counts vectors) before copying. */
+  int N = t->merged_nodes + t->unmerged_nodes;
+
+ SEXP node_list = VECTOR_ELT(td_list, O_OUT_NODES);
+
+ SEXP o_means = VECTOR_ELT(node_list, O_MEANS);
+ SEXP o_counts = VECTOR_ELT(node_list, O_COUNTS);
+
+ for (int i=0; i<N; i++) {
+ t->nodes[i].count = REAL(o_counts)[i];
+ t->nodes[i].mean = REAL(o_means)[i];
+ }
+
+ UNPROTECT(1);
+
+ return(out);
+
+}
M src/tdigest.c => src/tdigest.c +49 -47
@@ 5,48 5,52 @@
#include "tdigest.h"
-#define MM_PI 3.14159265358979323846
-
-typedef struct node {
- double mean;
- double count;
-} node_t;
-
void bbzero(void *to, size_t count) {
memset(to, 0, count);
}
-struct td_histogram {
- // compression is a setting used to configure the size of centroids when merged.
- double compression;
-
- // cap is the total size of nodes
- int cap;
- // merged_nodes is the number of merged nodes at the front of nodes.
- int merged_nodes;
- // unmerged_nodes is the number of buffered nodes.
- int unmerged_nodes;
-
- double merged_count;
- double unmerged_count;
-
- node_t nodes[];
-};
+// #define MM_PI 3.14159265358979323846
+//
+// typedef struct node {
+// double mean;
+// double count;
+// } node_t;
+//
+// void bbzero(void *to, size_t count) {
+// memset(to, 0, count);
+// }
+//
+// struct td_histogram {
+// // compression is a setting used to configure the size of centroids when merged.
+// double compression;
+//
+// // cap is the total size of nodes
+// int cap;
+// // merged_nodes is the number of merged nodes at the front of nodes.
+// int merged_nodes;
+// // unmerged_nodes is the number of buffered nodes.
+// int unmerged_nodes;
+//
+// double merged_count;
+// double unmerged_count;
+//
+// node_t nodes[];
+// };
static bool is_very_small(double val) {
- return !(val > .000000001 || val < -.000000001);
+ return !(val > .000000001 || val < -.000000001);
}
static int cap_from_compression(double compression) {
- return (6 * (int)(compression)) + 10;
+ return (6 * (int)(compression)) + 10;
}
static bool should_merge(td_histogram_t *h) {
- return ((h->merged_nodes + h->unmerged_nodes) == h->cap);
+ return ((h->merged_nodes + h->unmerged_nodes) == h->cap);
}
static int next_node(td_histogram_t *h) {
- return h->merged_nodes + h->unmerged_nodes;
+ return h->merged_nodes + h->unmerged_nodes;
}
static void merge(td_histogram_t *h);
@@ 55,11 59,9 @@ static void merge(td_histogram_t *h);
// Constructors
////////////////////////////////////////////////////////////////////////////////
-
-
static size_t td_required_buf_size(double compression) {
- return sizeof(td_histogram_t) +
- (cap_from_compression(compression) * sizeof(node_t));
+ return sizeof(td_histogram_t) +
+ (cap_from_compression(compression) * sizeof(node_t));
}
// td_init will initialize a td_histogram_t inside buf which is buf_size bytes.
@@ 69,25 71,25 @@ static size_t td_required_buf_size(double compression) {
// In general use td_required_buf_size to figure out what size buffer to
// pass.
static td_histogram_t *td_init(double compression, size_t buf_size, char *buf) {
- td_histogram_t *h = (td_histogram_t *)(buf);
- if (!h) {
- return NULL;
- }
- bbzero((void *)(h), buf_size);
- *h = (td_histogram_t) {
- .compression = compression,
- .cap = (buf_size - sizeof(td_histogram_t)) / sizeof(node_t),
- .merged_nodes = 0,
- .merged_count = 0,
- .unmerged_nodes = 0,
- .unmerged_count = 0,
- };
- return h;
+ td_histogram_t *h = (td_histogram_t *)(buf);
+ if (!h) {
+ return NULL;
+ }
+ bbzero((void *)(h), buf_size);
+ *h = (td_histogram_t) {
+ .compression = compression,
+ .cap = (buf_size - sizeof(td_histogram_t)) / sizeof(node_t),
+ .merged_nodes = 0,
+ .merged_count = 0,
+ .unmerged_nodes = 0,
+ .unmerged_count = 0,
+ };
+ return h;
}
td_histogram_t *td_new(double compression) {
- size_t memsize = td_required_buf_size(compression);
- return td_init(compression, memsize, (char *)(malloc(memsize)));
+ size_t memsize = td_required_buf_size(compression);
+ return td_init(compression, memsize, (char *)(malloc(memsize)));
}
void td_free(td_histogram_t *h) {
M src/tdigest.h => src/tdigest.h +24 -0
@@ 19,6 19,30 @@
#include <stdlib.h>
+#define MM_PI 3.14159265358979323846
+
+typedef struct node {
+ double mean;
+ double count;
+} node_t;
+
+struct td_histogram {
+ // compression is a setting used to configure the size of centroids when merged.
+ double compression;
+
+ // cap is the total size of nodes
+ int cap;
+ // merged_nodes is the number of merged nodes at the front of nodes.
+ int merged_nodes;
+ // unmerged_nodes is the number of buffered nodes.
+ int unmerged_nodes;
+
+ double merged_count;
+ double unmerged_count;
+
+ node_t nodes[];
+};
+
typedef struct td_histogram td_histogram_t;
// td_new allocates a new histogram.
M tests/testthat/test-tdigest.R => tests/testthat/test-tdigest.R +9 -0
@@ 63,3 63,12 @@ td <- tdigest(x.altrep)
expect_equal(as.integer(td[0.1]), 93051)
expect_equal(as.integer(td[0.5]), 491472)
expect_equal(length(td), 1000000)
+
+context("Serialization test")
+
+set.seed(1492)
+x <- sample(0:100, 1000000, replace = TRUE)
+td <- tdigest(x, 1000)
+a <- as.list(td)
+b <- as.list(as_tdigest(a))
+expect_true(identical(a, b))