You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
96 lines
3.2 KiB
96 lines
3.2 KiB
from itertools import product
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from dask.dataframe.utils import assert_eq
|
|
import dask.dataframe as dd
|
|
|
|
|
|
def resample(df, freq, how='mean', **kwargs):
    """Resample ``df`` at ``freq`` and apply the aggregation named ``how``.

    Parameters
    ----------
    df : object exposing a ``resample`` method
        The tests in this module pass pandas and dask series/frames.
    freq : str
        Frequency forwarded to ``df.resample``.
    how : str, default 'mean'
        Name of the zero-argument method to call on the resampler.
    **kwargs
        Forwarded unchanged to ``df.resample`` (e.g. ``closed``, ``label``).

    Returns
    -------
    Whatever the selected resampler method returns.
    """
    resampler = df.resample(freq, **kwargs)
    aggregate = getattr(resampler, how)
    return aggregate()
|
|
|
|
|
|
@pytest.mark.parametrize(
    ['obj', 'method', 'npartitions', 'freq', 'closed', 'label'],
    list(product(['series', 'frame'],
                 ['count', 'mean', 'ohlc'],
                 [2, 5],
                 # '30min' replaces the deprecated '30T' minute alias; both
                 # denote the same 30-minute offset.
                 # NOTE(review): 'M' is kept as-is; newer pandas prefers 'ME',
                 # but that alias is not accepted by older releases -- confirm
                 # the minimum supported pandas before changing it.
                 ['30min', 'h', 'd', 'w', 'M'],
                 ['right', 'left'],
                 ['right', 'left'])))
def test_series_resample(obj, method, npartitions, freq, closed, label):
    """Compare dask and pandas resampling for series and single-column frames.

    Every combination of container kind, aggregation method, partition count,
    frequency, ``closed`` side and ``label`` side is exercised.  The dask
    result must equal the pandas result (dtype differences are ignored), and
    the outer divisions of the dask result must coincide with the first and
    last index values of the pandas result.
    """
    # Two hourly ranges joined together, leaving a gap between mid-February
    # and mid-April in the index.
    index = pd.date_range('1-1-2000', '2-15-2000', freq='h')
    index = index.union(pd.date_range('4-15-2000', '5-15-2000', freq='h'))

    if obj == 'series':
        ps = pd.Series(range(len(index)), index=index)
    elif obj == 'frame':
        ps = pd.DataFrame({'a': range(len(index))}, index=index)
    else:
        # Fail loudly instead of hitting an unbound ``ps`` further down.
        raise ValueError('unexpected obj parameter: %r' % (obj,))
    ds = dd.from_pandas(ps, npartitions=npartitions)

    result = resample(ds, freq, how=method, closed=closed, label=label)
    expected = resample(ps, freq, how=method, closed=closed, label=label)
    assert_eq(result, expected, check_dtype=False)

    divisions = result.divisions

    assert expected.index[0] == divisions[0]
    assert expected.index[-1] == divisions[-1]
|
|
|
|
|
|
def test_resample_agg():
    """``Resampler.agg`` on a dask series matches pandas.

    Checked for a single aggregation name and for a list of names, both at a
    '10min' frequency on an hourly series split into two partitions.
    """
    hourly = pd.date_range('2000-01-01', '2000-02-15', freq='h')
    ps = pd.Series(range(len(hourly)), index=hourly)
    ds = dd.from_pandas(ps, npartitions=2)

    # Same order as before: the scalar spec first, then the list spec.
    for spec in ('mean', ['mean', 'min']):
        actual = ds.resample('10min').agg(spec)
        wanted = ps.resample('10min').agg(spec)
        assert_eq(actual, wanted)
|
|
|
|
|
|
def test_resample_agg_passes_kwargs():
    """Keyword arguments given to ``agg`` reach the aggregation callable.

    The callable ignores its data and returns its ``bar`` argument, so every
    value in the result equals 2 only if ``bar=2`` was actually forwarded.
    """
    hourly = pd.date_range('2000-01-01', '2000-02-15', freq='h')
    ps = pd.Series(range(len(hourly)), index=hourly)
    ds = dd.from_pandas(ps, npartitions=2)

    def foo(series, bar=1, *args, **kwargs):
        # Echo back ``bar``; the default of 1 would make the check below fail.
        return bar

    dask_result = ds.resample('2h').agg(foo, bar=2)
    pandas_result = ps.resample('2h').agg(foo, bar=2)
    assert_eq(dask_result, pandas_result)

    all_twos = (ds.resample('2h').agg(foo, bar=2) == 2).compute().all()
    assert all_twos
|
|
|
|
|
|
def test_series_resample_not_implemented():
    """Resampling at a frequency that does not evenly divide a day is rejected.

    The test expects ``NotImplementedError`` when a five-partition dask series
    with a one-minute index is resampled at 57 minutes.
    """
    # 'min' is used instead of the deprecated 'T' minute alias; both denote
    # the same offset.
    index = pd.date_range(start='2012-01-02', periods=100, freq='min')
    s = pd.Series(range(len(index)), index=index)
    ds = dd.from_pandas(s, npartitions=5)

    # Frequency doesn't evenly divide day
    with pytest.raises(NotImplementedError):
        resample(ds, '57min')
|
|
|
|
|
|
def test_unknown_divisions_error():
    """Resampling a frame built with ``sort=False`` raises a ``ValueError``.

    The error message is expected to mention 'divisions'.
    """
    df = pd.DataFrame({'x': [1, 2, 3]})
    ddf = dd.from_pandas(df, npartitions=2, sort=False)

    # ``pytest.raises`` fails the test when nothing is raised, unlike a bare
    # ``assert False`` inside ``try``, which is stripped under ``python -O``.
    with pytest.raises(ValueError, match='divisions'):
        ddf.x.resample('1m').mean()
|
|
|
|
|
|
def test_resample_index_name():
    """The index name survives a dask ``resample('D').mean()``.

    A frame indexed by a column named 'date' is resampled daily and the head
    of the result must still report "date" as its index name.
    """
    import numpy as np
    from datetime import datetime, timedelta

    start = datetime.now()
    days = pd.date_range(start, start + timedelta(days=20), freq='D')
    data = np.random.randint(1, high=100, size=len(days))

    # Build the frame and promote the 'date' column to the index in one go.
    df = pd.DataFrame({'date': days, 'values': data}).set_index('date')
    ddf = dd.from_pandas(df, npartitions=4)

    resampled_head = ddf.resample('D').mean().head()
    assert resampled_head.index.name == "date"
|