You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
211 lines
8.0 KiB
211 lines
8.0 KiB
from __future__ import absolute_import, division, print_function
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from ..core import tokenize, DataFrame
|
|
from .io import from_delayed
|
|
from ...delayed import delayed
|
|
from ...utils import random_state_data
|
|
|
|
__all__ = ['make_timeseries']
|
|
|
|
|
|
def make_float(n, rstate):
    """ Draw ``n`` random floats from ``rstate``, rescaled into [-1, 1) """
    unit_samples = rstate.rand(n)  # uniform on [0, 1)
    return unit_samples * 2 - 1
|
|
|
|
|
|
def make_int(n, rstate):
    """ Draw ``n`` Poisson-distributed integers (mean 1000) from ``rstate`` """
    mean = 1000
    return rstate.poisson(mean, size=n)
|
|
|
|
|
|
# Pool of first names, one per letter of the alphabet, sampled by
# ``make_string`` and ``make_categorical``.
names = [
    'Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank',
    'George', 'Hannah', 'Ingrid', 'Jerry', 'Kevin', 'Laura',
    'Michael', 'Norbert', 'Oliver', 'Patricia', 'Quinn', 'Ray',
    'Sarah', 'Tim', 'Ursula', 'Victor', 'Wendy', 'Xavier',
    'Yvonne', 'Zelda',
]
|
|
|
|
|
|
def make_string(n, rstate):
    """ Pick ``n`` entries from the module-level ``names`` list using ``rstate`` """
    picked = rstate.choice(names, size=n)
    return picked
|
|
|
|
|
|
def make_categorical(n, rstate):
    """ Build a ``pd.Categorical`` of length ``n`` whose categories are ``names``

    Integer codes are drawn from ``rstate`` in ``[0, len(names))`` and mapped
    onto the module-level ``names`` list.
    """
    codes = rstate.randint(0, len(names), size=n)
    return pd.Categorical.from_codes(codes, names)
|
|
|
|
|
|
# Dispatch table: requested column dtype -> generator taking ``(n, rstate)``.
# ``str`` and ``object`` share the same string generator.
make = {
    float: make_float,
    int: make_int,
    str: make_string,
    object: make_string,
    'category': make_categorical,
}
|
|
|
|
|
|
def make_timeseries_part(start, end, dtypes, freq, state_data):
    """ Build one partition of random timeseries data as a pandas DataFrame

    Parameters
    ----------
    start: datetime (or datetime-like string)
        First timestamp of the partition
    end: datetime (or datetime-like string)
        Upper bound of the partition.  A row that falls exactly on ``end``
        is dropped, so the result covers the half-open range [start, end)
    dtypes: dict
        Mapping of column names to keys of the module-level ``make`` table
    freq: string or timedelta
        Spacing between timestamps, passed through to ``pd.date_range``
    state_data: int or array-like
        Seed material handed to ``np.random.RandomState``

    Returns
    -------
    pd.DataFrame
        Indexed by a DatetimeIndex named ``'timestamp'``, with one column per
        entry of ``dtypes`` in sorted column order
    """
    index = pd.date_range(start=start, end=end, freq=freq, name='timestamp')
    state = np.random.RandomState(state_data)
    columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()}
    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
    # ``end`` may arrive as a string, which does not reliably compare equal
    # to a Timestamp, so normalise it first.  The length guard keeps an empty
    # range (start after end) from raising IndexError on ``index[-1]``.
    if len(df) and df.index[-1] == pd.Timestamp(end):
        df = df.iloc[:-1]
    return df
|
|
|
|
|
|
def make_timeseries(start='2000-01-01',
                    end='2000-12-31',
                    dtypes={'name': str, 'id': int, 'x': float, 'y': float},
                    freq='10s',
                    partition_freq='1M',
                    seed=None):
    """ Create timeseries dataframe with random data

    Parameters
    ----------
    start: datetime (or datetime-like string)
        Start of time series
    end: datetime (or datetime-like string)
        End of time series
    dtypes: dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq: string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq: string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed: int (optional)
        Randomstate seed

    Raises
    ------
    ValueError
        If ``start``, ``end`` and ``partition_freq`` yield fewer than two
        division points, i.e. no complete partition

    Examples
    --------
    >>> import dask.dataframe as dd
    >>> df = dd.demo.make_timeseries('2000', '2010',
    ...                              {'value': float, 'name': str, 'id': int},
    ...                              freq='2H', partition_freq='1D', seed=1)
    >>> df.head()  # doctest: +SKIP
                           id      name     value
    2000-01-01 00:00:00   969     Jerry -0.309014
    2000-01-01 02:00:00  1010       Ray -0.760675
    2000-01-01 04:00:00  1016  Patricia -0.063261
    2000-01-01 06:00:00   960   Charlie  0.788245
    2000-01-01 08:00:00  1031     Kevin  0.466002
    """
    divisions = list(pd.date_range(start=start, end=end,
                                   freq=partition_freq))
    npartitions = len(divisions) - 1
    # Without at least one [divisions[i], divisions[i + 1]) interval there is
    # no seed to build the metadata frame from; fail with a clear message
    # instead of an obscure error further down.
    if npartitions < 1:
        raise ValueError("The range from %r to %r with partition_freq=%r "
                         "produces %d division point(s); at least 2 are "
                         "needed to form a partition"
                         % (start, end, partition_freq, len(divisions)))
    state_data = random_state_data(npartitions, seed)
    name = 'make-timeseries-' + tokenize(start, end, dtypes, freq,
                                         partition_freq, state_data)
    # One task per pair of consecutive divisions, each with its own seed
    dsk = {(name, i): (make_timeseries_part, divisions[i], divisions[i + 1],
                       dtypes, freq, state_data[i])
           for i in range(npartitions)}
    # Tiny concrete frame passed to DataFrame as the metadata argument
    head = make_timeseries_part('2000', '2000', dtypes, '1H', state_data[0])
    return DataFrame(dsk, name, head, divisions)
|
|
|
|
|
|
def generate_day(date, open, high, low, close, volume,
                 freq=pd.Timedelta(seconds=60), random_state=None):
    """ Generate a day of financial data from open/close high/low values

    A random walk is sampled at five points per ``freq`` interval between
    09:00 and 16:00 on ``date``.  The walk is rescaled so that it starts at
    ``open``, ends at ``close`` and spans ``low`` to ``high``; it is then
    rounded to three decimals and resampled down to ``freq``.

    Parameters
    ----------
    date: pd.Timestamp (or anything accepted by ``pd.Timestamp``)
        The day to generate
    open, high, low, close: number
        Daily prices that the generated series must reproduce
    volume: number
        Currently unused
    freq: pd.Timedelta (or anything accepted by ``pd.Timedelta``)
        Sampling interval of the returned frame
    random_state: int or np.random.RandomState, optional
        Seed or generator used for the random walk

    Returns
    -------
    pd.DataFrame
        Columns ``open``, ``close``, ``high`` and ``low``, indexed by
        timestamp at frequency ``freq``

    Raises
    ------
    ValueError
        If ``high`` is below ``max(open, close)`` or ``low`` is above
        ``min(open, close)`` (beyond floating point tolerance)
    """
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)
    if not isinstance(date, pd.Timestamp):
        date = pd.Timestamp(date)
    if not isinstance(freq, pd.Timedelta):
        freq = pd.Timedelta(freq)

    mx = max(close, open)
    mn = min(close, open)
    # The retry loop below only exits once the walk touches both ``high`` and
    # ``low``.  That can never happen if they lie strictly inside the
    # open/close range, so reject such input rather than spin forever.
    if not ((high >= mx or np.isclose(high, mx)) and
            (low <= mn or np.isclose(low, mn))):
        raise ValueError("Inconsistent prices: need low <= min(open, close) "
                         "and max(open, close) <= high, got open=%r, high=%r, "
                         "low=%r, close=%r" % (open, high, low, close))

    time = pd.date_range(date + pd.Timedelta(hours=9),
                         date + pd.Timedelta(hours=12 + 4),
                         freq=freq / 5, name='timestamp')
    n = len(time)
    while True:
        values = (random_state.random_sample(n) - 0.5).cumsum()
        values *= (high - low) / (values.max() - values.min())  # scale
        values += np.linspace(open - values[0], close - values[-1],
                              len(values))  # endpoints
        assert np.allclose(open, values[0])
        assert np.allclose(close, values[-1])

        # Stretch the excursions beyond the open/close band so the extremes
        # land exactly on ``high`` and ``low``
        ind = values > mx
        values[ind] = (values[ind] - mx) * (high - mx) / (values.max() - mx) + mx
        ind = values < mn
        values[ind] = (values[ind] - mn) * (low - mn) / (values.min() - mn) + mn
        # The process fails if min/max are the same as open close.  This is rare
        if (np.allclose(values.max(), high) and np.allclose(values.min(), low)):
            break

    s = pd.Series(values.round(3), index=time)
    rs = s.resample(freq)
    # TODO: add in volume
    return pd.DataFrame({'open': rs.first(),
                         'close': rs.last(),
                         'high': rs.max(),
                         'low': rs.min()})
|
|
|
|
|
|
def daily_stock(symbol, start, stop, freq=pd.Timedelta(seconds=1),
                data_source='yahoo', random_state=None):
    """ Create artificial stock data

    This data matches daily open/high/low/close values from Yahoo! Finance, but
    interpolates values within each day with random values.  This makes the
    results look natural without requiring the downloading of large volumes of
    data.  This is useful for education and benchmarking.

    Parameters
    ----------
    symbol: string
        A stock symbol like "GOOG" or "F"
    start: date, str, or pd.Timestamp
        The start date, input will be fed into pd.Timestamp for normalization
    stop: date, str, or pd.Timestamp
        The stop date, input will be fed into pd.Timestamp for normalization
    freq: timedelta, str, or pd.Timedelta
        The frequency of sampling
    data_source: str, optional
        defaults to 'yahoo'.  See pandas_datareader.data.DataReader for options
    random_state: int, np.random.RandomState object
        random seed, defaults to randomly chosen

    Raises
    ------
    ValueError
        If the data source returns no day with a complete set of values

    Examples
    --------
    >>> import dask.dataframe as dd  # doctest: +SKIP
    >>> df = dd.demo.daily_stock('GOOG', '2010', '2011', freq='1s')  # doctest: +SKIP
    >>> df  # doctest: +SKIP
    Dask DataFrame Structure:
                           close     high      low     open
    npartitions=252
    2010-01-04 09:00:00  float64  float64  float64  float64
    2010-01-05 09:00:00      ...      ...      ...      ...
    ...                      ...      ...      ...      ...
    2010-12-31 09:00:00      ...      ...      ...      ...
    2010-12-31 16:00:00      ...      ...      ...      ...
    Dask Name: from-delayed, 504 tasks

    >>> df.head()  # doctest: +SKIP
                           close     high      low     open
    timestamp
    2010-01-04 09:00:00  626.944  626.964  626.944  626.951
    2010-01-04 09:00:01  626.906  626.931  626.906  626.931
    2010-01-04 09:00:02  626.901  626.911  626.901  626.905
    2010-01-04 09:00:03  626.920  626.920  626.905  626.905
    2010-01-04 09:00:04  626.894  626.917  626.894  626.906
    """
    from pandas_datareader import data
    df = data.DataReader(symbol, data_source, start, stop)
    seeds = random_state_data(len(df), random_state=random_state)
    parts = []
    divisions = []
    last_day = None  # index label of the last day that produced a partition
    for (day, row), seed in zip(df.iterrows(), seeds):
        # Days with any missing value are skipped entirely
        if row.isnull().any():
            continue
        part = delayed(generate_day)(day, row.loc['Open'], row.loc['High'],
                                     row.loc['Low'], row.loc['Close'],
                                     row.loc['Volume'],
                                     freq=freq, random_state=seed)
        parts.append(part)
        divisions.append(day + pd.Timedelta(hours=9))
        last_day = day

    if last_day is None:
        raise ValueError("No complete daily data found for symbol %r between "
                         "%r and %r" % (symbol, start, stop))

    # Close the final partition at 16:00 of the last day that was actually
    # generated, not of a trailing day that was skipped for missing values
    divisions.append(last_day + pd.Timedelta(hours=12 + 4))

    # Small concrete frame passed to from_delayed as ``meta``
    meta = generate_day('2000-01-01', 1, 2, 0, 1, 100)

    return from_delayed(parts, meta=meta, divisions=divisions)
|