~javiljoen/lttb-numpy

6d279912301ff753a290339efd899d9d87eb32dc — Jack Viljoen 2 years ago cfc86f5
Make validation of input data configurable

By changing the `validators` argument of `lttb.downsample()` from the
default list of validation functions, the data can be checked against
stricter or looser requirements.
For example, if the data is known to always satisfy the requirement for
the x-values to be strictly monotonic, some CPU cycles can be saved by
removing `x_is_strictly_increasing()` from the list.

This change is backwards-compatible.

It would be better to make the `validators` argument keyword-only, but
that would prevent this library from working on Python 2.7.
6 files changed, 215 insertions(+), 7 deletions(-)

M README.rst
M src/lttb/lttb.py
A src/lttb/validators.py
M tests/test_lttb.py
A tests/test_validators.py
M tox.ini
M README.rst => README.rst +17 -0
@@ 31,6 31,23 @@ This is what it looks like, downsampled to 100 points:
.. image:: https://github.com/javiljoen/lttb.py/raw/master/tests/timeseries.png


Input validation
----------------

By default, `downsample()` checks that the data is of the right shape
and that the values in the first column are strictly increasing.
These checks can be skipped (e.g. if you know that your data will always meet these constraints),
or additional checks can be added (e.g. that the time values must be evenly spaced),
by passing in a different list of validation functions, e.g.:

.. code:: python

   small_data = lttb.downsample(data, n_out=20, validators=[])

   from lttb.validators import has_two_columns, x_is_regular
   small_data = lttb.downsample(data, n_out=20, validators=[has_two_columns, x_is_regular])


Installation
============


M src/lttb/lttb.py => src/lttb/lttb.py +23 -7
@@ 1,5 1,9 @@
import numpy as np

from .validators import has_two_columns, validate, x_is_strictly_increasing

# Validators that ``downsample()`` applies when the caller does not supply its
# own ``validators`` argument.
# NOTE: this list object itself is bound as the default argument value, so
# mutating it in place changes the checks run by later default calls.
default_validators = [has_two_columns, x_is_strictly_increasing]


def _areas_of_triangles(a, bs, c):
    """Calculate areas of triangles from duples of vertex coordinates.


@@ 18,7 22,7 @@ def _areas_of_triangles(a, bs, c):
    )


def downsample(data, n_out):
def downsample(data, n_out, validators=default_validators):
    """Downsample ``data`` to ``n_out`` points using the LTTB algorithm.

    Reference


@@ 26,23 30,35 @@ def downsample(data, n_out):
    Sveinn Steinarsson. 2013. Downsampling Time Series for Visual
    Representation. MSc thesis. University of Iceland.

    Parameters
    ----------
    data : numpy.array
        A 2-dimensional array with time values in the first column
    n_out : int
        Number of data points to downsample to
    validators : sequence of callables, optional
        Validation functions that take an array as argument and
        raise ``ValueError`` if the array fails some criterion

    Constraints
    -----------
      - ncols(data) == 2
      - 3 <= n_out <= nrows(data)
      - ``data`` should be sorted on the first column.
      - the first column of ``data`` should be strictly monotonic.

    Returns
    -------
    numpy.array
        Array of shape (n_out, 2)

    Raises
    ------
    ValueError
        If ``data`` fails the validation checks,
        or if ``n_out`` falls outside the valid range.
    """
    # Validate input
    if data.shape[1] != 2:
        raise ValueError("data should have 2 columns")

    if np.any(data[1:, 0] <= data[:-1, 0]):
        raise ValueError("data should be sorted on first column")
    validate(data, validators)

    if n_out > data.shape[0]:
        raise ValueError("n_out must be <= number of rows in data")

A src/lttb/validators.py => src/lttb/validators.py +56 -0
@@ 0,0 1,56 @@
import numpy as np


def has_two_columns(data):
    """Raise ``ValueError`` unless ``data`` is a 2D array with 2 columns.

    Parameters
    ----------
    data : numpy.array
        Array whose shape is checked

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional, or if its second dimension
        is not of length 2
    """
    shape = data.shape

    if len(shape) != 2:
        raise ValueError("data is not a 2D array")

    ncols = shape[1]
    if ncols != 2:
        raise ValueError("data does not have 2 columns")


def x_is_sorted(data):
    """Raise ``ValueError`` if any x-value is smaller than its predecessor.

    The x-values are the first column of ``data``; repeated values are
    accepted.
    """
    xs = data[:, 0]
    earlier, later = xs[:-1], xs[1:]

    if np.any(later < earlier):
        raise ValueError("data is not sorted on the first column")


def x_is_strictly_increasing(data):
    """Raise ``ValueError`` unless every x-value exceeds its predecessor.

    The x-values are the first column of ``data``; repeated values are
    rejected as well as decreasing ones.
    """
    xs = data[:, 0]
    earlier, later = xs[:-1], xs[1:]

    if np.any(later <= earlier):
        raise ValueError("first column is not strictly increasing")


def x_is_regular(data, rtol=1e-5):
    """Raise ``ValueError`` unless the x-values are evenly spaced.

    The differences between consecutive values in the first column are
    compared with a relative tolerance instead of for bit-exact equality.
    Evenly spaced floating-point values (e.g. ``np.linspace(0, 1, 11)``)
    have differences that vary by rounding error and would otherwise be
    rejected.

    Parameters
    ----------
    data : numpy.array
        A 2-dimensional array with the x-values in the first column
    rtol : float, optional
        Relative tolerance, with respect to the first interval, within
        which all intervals must agree.
        Pass ``0`` to require exactly equal intervals.

    Raises
    ------
    ValueError
        If the intervals between consecutive x-values differ by more
        than the tolerance, or if ``data`` has fewer than two rows
        (i.e. there is no interval to check).
    """
    steps = np.diff(data[:, 0])

    # Fewer than two rows means there is no interval at all; such input was
    # rejected before and still is. ``atol=0`` keeps the check purely
    # relative, so very small spacings are not trivially treated as equal.
    if steps.size == 0 or not np.allclose(steps, steps[0], rtol=rtol, atol=0):
        raise ValueError("first column is not regularly spaced")


def validate(data, validators):
    """Run every validator on ``data`` and report all failures at once.

    No validator is skipped because an earlier one failed: each
    ``ValueError`` is caught, and the collected messages are joined with
    ``"; "`` into the message of a single ``ValueError``.

    Parameters
    ----------
    data : numpy.array
        Data to validate
    validators : sequence of callables
        Validation functions that take an array as argument and
        raise ``ValueError`` if the array fails some criterion

    Raises
    ------
    ValueError
        If at least one of the validators raises a ``ValueError``
        for ``data``
    """
    messages = []

    for check in validators:
        try:
            check(data)
        except ValueError as failure:
            messages.append(str(failure))

    if not messages:
        return

    raise ValueError("; ".join(messages))

M tests/test_lttb.py => tests/test_lttb.py +9 -0
@@ 1,4 1,5 @@
import numpy as np
import pytest
from hypothesis import assume, given
from hypothesis.strategies import builds, integers



@@ 53,3 54,11 @@ def test_downsampling_random_data_retains_variation(data, n_out):
    var_in = np.var(data[:, 1])
    var_out = np.var(out[:, 1])
    assert var_out >= 0.95 * var_in


@pytest.mark.parametrize("n_out", [2, 7])
def test_invalid_n_out_raises_error(n_out):
    """``downsample`` rejects an ``n_out`` of 2 or 7 for six-row input."""
    six_rows = gen_valid_data(6)

    with pytest.raises(ValueError):
        lttb.downsample(six_rows, n_out)

A tests/test_validators.py => tests/test_validators.py +107 -0
@@ 0,0 1,107 @@
import numpy as np
import pytest

from lttb.validators import (
    has_two_columns,
    validate,
    x_is_regular,
    x_is_sorted,
    x_is_strictly_increasing,
)


@pytest.fixture(scope="module")
def valid_data():
    """Ten-row, two-column array: x evenly spaced from 1 to 10, y random."""
    n_points = 10
    y_values = np.random.standard_normal(n_points)
    x_values = np.linspace(1, n_points, n_points)
    stacked = np.array([x_values, y_values])
    return stacked.T


def test_has_two_columns_passes_for_2d_array_with_2_columns(valid_data):
    """A valid two-column array passes, i.e. the validator returns ``None``."""
    outcome = has_two_columns(valid_data)
    assert outcome is None


def test_has_two_columns_fails_for_1d_array():
    """A flat vector is rejected."""
    vector = np.random.standard_normal(10)

    with pytest.raises(ValueError):
        has_two_columns(vector)


def test_has_two_columns_fails_for_3d_array():
    """A 3D array is rejected even though its last axis has length 2."""
    shape = (3, 10, 2)
    cube = np.random.standard_normal(shape)

    with pytest.raises(ValueError):
        has_two_columns(cube)


def test_has_two_columns_fails_for_2d_array_with_1_column():
    """A 2D array with a single column is rejected."""
    shape = (10, 1)
    single_column = np.random.standard_normal(shape)

    with pytest.raises(ValueError):
        has_two_columns(single_column)


def test_has_two_columns_fails_for_2d_array_with_3_columns():
    """A 2D array with three columns is rejected."""
    shape = (10, 3)
    three_columns = np.random.standard_normal(shape)

    with pytest.raises(ValueError):
        has_two_columns(three_columns)


def test_x_is_strictly_increasing_passes_for_valid_data(valid_data):
    """Strictly increasing x-values pass, i.e. the validator returns ``None``."""
    outcome = x_is_strictly_increasing(valid_data)
    assert outcome is None


def test_x_is_strictly_increasing_fails_with_repeated_xs():
    """Repeated x-values are rejected by the strict check."""
    xs = [1, 1, 2, 2]
    ys = np.random.standard_normal(4)
    repeated = np.array([xs, ys]).T

    with pytest.raises(ValueError):
        x_is_strictly_increasing(repeated)


def test_x_is_sorted_passes_for_valid_data(valid_data):
    """Sorted x-values pass, i.e. the validator returns ``None``."""
    outcome = x_is_sorted(valid_data)
    assert outcome is None


def test_x_is_sorted_passes_with_repeated_xs():
    """Repeated x-values still count as sorted."""
    xs = [1, 1, 2, 2]
    ys = np.random.standard_normal(4)
    repeated = np.array([xs, ys]).T

    outcome = x_is_sorted(repeated)
    assert outcome is None


def test_x_is_sorted_fails_if_xs_not_sorted():
    """Decreasing x-values are rejected."""
    xs = [1, 4, 3, 2]
    ys = np.random.standard_normal(4)
    shuffled = np.array([xs, ys]).T

    with pytest.raises(ValueError):
        x_is_sorted(shuffled)


def test_x_is_regular_passes_with_valid_data(valid_data):
    """Evenly spaced x-values pass, i.e. the validator returns ``None``."""
    outcome = x_is_regular(valid_data)
    assert outcome is None


def test_x_is_regular_fails_if_x_intervals_are_not_constant():
    """X-values with intervals of 1, 2 and 5 are rejected."""
    xs = [1, 2, 4, 9]
    ys = np.random.standard_normal(4)
    uneven = np.array([xs, ys]).T

    with pytest.raises(ValueError):
        x_is_regular(uneven)


def test_validate_multiple_criteria_passes_for_valid_data(valid_data):
    """``validate`` does not raise when the data satisfies every validator."""
    criteria = [has_two_columns, x_is_regular]
    validate(valid_data, criteria)


def test_validate_raises_with_multiple_messages():
    """``validate`` runs every validator and joins all failure messages.

    The data violates all three validators, so the raised ``ValueError``
    must carry the three messages, in validator order, separated by "; ".
    """
    data = np.random.standard_normal((4, 3))  # 3 columns
    data[:, 0] = [1, 4, 2, 9]  # unsorted, irregularly spaced x values
    validators = [has_two_columns, x_is_sorted, x_is_regular]

    with pytest.raises(ValueError) as exc:
        validate(data, validators)

    # Compare the complete message: ``exc.match`` only performs a regex
    # search and would also accept extra text around the expected message.
    assert str(exc.value) == (
        "data does not have 2 columns; "
        "data is not sorted on the first column; "
        "first column is not regularly spaced"
    )

M tox.ini => tox.ini +3 -0
@@ 30,3 30,6 @@ source = lttb
max-line-length = 80
select = C,E,F,W,B,B950
ignore = E203,E501,W503

[isort]
profile = black