~javiljoen/lttb-numpy

24de93fe69861526cc2daa5b2427e279a3cf15c2 — Jack Viljoen 2 years ago 6d27991
Add check for NaN values in input data

This check has also been added to the list of default validators.
This change is not backwards-compatible,
but the previous behaviour was that `downsample()` would preferentially
select the NaN values for inclusion in the output
(presumably because it considers these points to be maximally distant
from the anchors in the neighbouring bins),
so users with missing data would either have gotten suboptimal results
(a higher proportion of missing data in the output array)
or they might have implemented their own up-front checks.
So in the unlikely case where this breaks someone's workflow,
this now requires them to make an explicit decision whether to
throw an error, exclude missing values up-front,
or accept output in which nulls are overrepresented.
6 files changed, 56 insertions(+), 9 deletions(-)

M README.rst
M src/lttb/lttb.py
M src/lttb/validators.py
M tests/test_lttb.py
M tests/test_validators.py
M tox.ini
M README.rst => README.rst +3 -2
@@ 34,8 34,9 @@ This is what it looks like, downsampled to 100 points:
Input validation
----------------

By default, `downsample()` checks that the data is of the right shape
and that the values in the first column are strictly increasing.
By default, `downsample()` checks that the data is of the right shape,
that the values in the first column are strictly increasing,
and that there are no missing (NaN) values in the data.
These checks can be skipped (e.g. if you know that your data will always meet these constraints),
or additional checks can be added (e.g. that the time values must be evenly spaced),
by passing in a different list of validation functions, e.g.:

M src/lttb/lttb.py => src/lttb/lttb.py +8 -3
@@ 1,8 1,13 @@
import numpy as np

from .validators import has_two_columns, validate, x_is_strictly_increasing

default_validators = [has_two_columns, x_is_strictly_increasing]
from .validators import (
    contains_no_nans,
    has_two_columns,
    validate,
    x_is_strictly_increasing,
)

default_validators = [has_two_columns, contains_no_nans, x_is_strictly_increasing]


def _areas_of_triangles(a, bs, c):

M src/lttb/validators.py => src/lttb/validators.py +5 -0
@@ 24,6 24,11 @@ def x_is_regular(data):
        raise ValueError("first column is not regularly spaced")


def contains_no_nans(data):
    """Validator: reject input that contains any NaN value.

    Returns ``None`` when no element of ``data`` is NaN.

    Raises:
        ValueError: if at least one element of ``data`` is NaN.
    """
    nan_mask = np.isnan(data)
    if not np.any(nan_mask):
        return
    raise ValueError("data contains NaN values")


def validate(data, validators):
    """Checks an array against each of the given validators.


M tests/test_lttb.py => tests/test_lttb.py +15 -0
@@ 62,3 62,18 @@ def test_invalid_n_out_raises_error(n_out):

    with pytest.raises(ValueError):
        lttb.downsample(data, n_out)


def test_downsample_with_default_validators_raises_error_with_multiple_messages():
    """Input failing several default checks yields one ValueError listing them all."""
    bad_data = np.random.standard_normal((4, 3))  # three columns instead of two
    bad_data[:, 0] = [1, 2, 2, 3]  # repeated x value, so not strictly increasing
    bad_data[2, 1] = np.nan  # missing y value

    # The expected message lists each failed check, separated by "; ".
    expected_message = (
        "data does not have 2 columns; "
        "data contains NaN values; "
        "first column is not strictly increasing"
    )

    with pytest.raises(ValueError) as excinfo:
        lttb.downsample(bad_data, 3)

    assert excinfo.match(expected_message)

M tests/test_validators.py => tests/test_validators.py +24 -3
@@ 2,6 2,7 @@ import numpy as np
import pytest

from lttb.validators import (
    contains_no_nans,
    has_two_columns,
    validate,
    x_is_regular,


@@ 88,14 89,33 @@ def test_x_is_regular_fails_if_x_intervals_are_not_constant():
        x_is_regular(data)


def test_contains_no_nans_passes_with_valid_data(valid_data):
    """The ``valid_data`` fixture passes the NaN check, which returns ``None``."""
    outcome = contains_no_nans(valid_data)
    assert outcome is None


def test_contains_no_nans_fails_if_nan_in_xs():
    """A NaN in the first (x) column makes the validator raise ``ValueError``."""
    xs = [0, 1, 2, np.nan]
    ys = [0.0, 1.0, 2.0, 3.0]
    data = np.array([xs, ys]).T  # transpose so xs and ys become columns

    with pytest.raises(ValueError):
        contains_no_nans(data)


def test_contains_no_nans_fails_if_nan_in_ys():
    """NaNs in the second (y) column make the validator raise ``ValueError``."""
    xs = [0, 1, 2, 3]
    ys = [1.0, np.nan, 2.6, np.nan]
    data = np.array([xs, ys]).T  # transpose so xs and ys become columns

    with pytest.raises(ValueError):
        contains_no_nans(data)


def test_validate_multiple_criteria_passes_for_valid_data(valid_data):
    """``validate`` does not raise for the ``valid_data`` fixture with two validators."""
    checks = [has_two_columns, x_is_regular]
    validate(valid_data, checks)


def test_validate_raises_with_multiple_messages(valid_data):
def test_validate_raises_with_multiple_messages():
    data = np.random.standard_normal((4, 3))  # 3 columns
    data[:, 0] = [1, 4, 2, 9]  # unsorted x values
    validators = [has_two_columns, x_is_sorted, x_is_regular]
    data[2, 1] = np.nan  # missing y value
    validators = [has_two_columns, x_is_sorted, x_is_regular, contains_no_nans]

    with pytest.raises(ValueError) as exc:
        validate(data, validators)


@@ 103,5 123,6 @@ def test_validate_raises_with_multiple_messages(valid_data):
    assert exc.match(
        "data does not have 2 columns; "
        "data is not sorted on the first column; "
        "first column is not regularly spaced"
        "first column is not regularly spaced; "
        "data contains NaN values"
    )

M tox.ini => tox.ini +1 -1
@@ 19,7 19,7 @@ deps =

commands =
    coverage run -m pytest
    coverage report
    coverage report --fail-under=100
    coverage erase

[coverage:run]