# ORPA-pyOpenRPA/Resources/WPy64-3720/python-3.7.2.amd64/Lib/site-packages/dask/dataframe/io/demo.py

from __future__ import absolute_import, division, print_function

import pandas as pd
import numpy as np

from ..core import tokenize, DataFrame
from .io import from_delayed
from ...delayed import delayed
from ...utils import random_state_data

__all__ = ['make_timeseries']


# Per-dtype column generators: each takes a length and a RandomState and
# returns that many random values.

def make_float(n, rstate):
    return rstate.rand(n) * 2 - 1


def make_int(n, rstate):
    return rstate.poisson(1000, size=n)


names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank', 'George',
         'Hannah', 'Ingrid', 'Jerry', 'Kevin', 'Laura', 'Michael', 'Norbert',
         'Oliver', 'Patricia', 'Quinn', 'Ray', 'Sarah', 'Tim', 'Ursula',
         'Victor', 'Wendy', 'Xavier', 'Yvonne', 'Zelda']


def make_string(n, rstate):
    return rstate.choice(names, size=n)


def make_categorical(n, rstate):
    return pd.Categorical.from_codes(rstate.randint(0, len(names), size=n),
                                     names)


# Dispatch table mapping a requested dtype to its generator.
make = {float: make_float,
        int: make_int,
        str: make_string,
        object: make_string,
        'category': make_categorical}
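
# Illustrative sketch, not part of the original module: the generators above
# can be exercised directly with a seeded RandomState, e.g.
#
#     rs = np.random.RandomState(0)
#     make[float](5, rs)        # 5 floats drawn uniformly from [-1, 1)
#     make['category'](5, rs)   # pd.Categorical of 5 entries drawn from `names`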


def make_timeseries_part(start, end, dtypes, freq, state_data):
    # Build one partition: a pandas frame indexed by timestamp, with one
    # randomly generated column per requested dtype.
    index = pd.date_range(start=start, end=end, freq=freq, name='timestamp')
    state = np.random.RandomState(state_data)
    columns = dict((k, make[dt](len(index), state)) for k, dt in dtypes.items())
    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
    if df.index[-1] == end:
        # Drop the end timestamp so adjacent partitions do not overlap.
        df = df.iloc[:-1]
    return df
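
# Illustrative sketch (assumption, not in the original source): this is the
# per-division call that make_timeseries() schedules lazily, e.g.
#
#     part = make_timeseries_part('2000-01-01', '2000-01-02',
#                                 {'id': int, 'x': float}, '1H', 42)
#     # -> 24-row pandas frame indexed hourly, with the trailing midnight
#     #    timestamp trimmed off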


def make_timeseries(start='2000-01-01',
                    end='2000-12-31',
                    dtypes={'name': str, 'id': int, 'x': float, 'y': float},
                    freq='10s',
                    partition_freq='1M',
                    seed=None):
    """ Create timeseries dataframe with random data

    Parameters
    ----------
    start: datetime (or datetime-like string)
        Start of time series
    end: datetime (or datetime-like string)
        End of time series
    dtypes: dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq: string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq: string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed: int (optional)
        Randomstate seed

    Examples
    --------
    >>> import dask.dataframe as dd
    >>> df = dd.demo.make_timeseries('2000', '2010',
    ...                              {'value': float, 'name': str, 'id': int},
    ...                              freq='2H', partition_freq='1D', seed=1)
    >>> df.head()  # doctest: +SKIP
                           id      name     value
    2000-01-01 00:00:00   969     Jerry -0.309014
    2000-01-01 02:00:00  1010       Ray -0.760675
    2000-01-01 04:00:00  1016  Patricia -0.063261
    2000-01-01 06:00:00   960   Charlie  0.788245
    2000-01-01 08:00:00  1031     Kevin  0.466002
    """
    divisions = list(pd.date_range(start=start, end=end,
                                   freq=partition_freq))
    state_data = random_state_data(len(divisions) - 1, seed)
    name = 'make-timeseries-' + tokenize(start, end, dtypes, freq,
                                         partition_freq, state_data)
    dsk = {(name, i): (make_timeseries_part, divisions[i], divisions[i + 1],
                       dtypes, freq, state_data[i])
           for i in range(len(divisions) - 1)}
    head = make_timeseries_part('2000', '2000', dtypes, '1H', state_data[0])
    return DataFrame(dsk, name, head, divisions)
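
# Usage note (sketch, values chosen here for illustration only):
#
#     df = make_timeseries('2000-01-01', '2000-03-31',
#                          {'id': int, 'x': float},
#                          freq='1H', partition_freq='1M', seed=42)
#     df.npartitions   # one partition per partition_freq interval
#     df.head()        # small pandas frame computed from the first partition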


def generate_day(date, open, high, low, close, volume,
                 freq=pd.Timedelta(seconds=60), random_state=None):
    """ Generate a day of financial data from open/close high/low values """
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)
    if not isinstance(date, pd.Timestamp):
        date = pd.Timestamp(date)
    if not isinstance(freq, pd.Timedelta):
        freq = pd.Timedelta(freq)

    # Sample five times finer than the requested frequency between 09:00 and
    # 16:00, then resample back down to build OHLC bars.
    time = pd.date_range(date + pd.Timedelta(hours=9),
                         date + pd.Timedelta(hours=12 + 4),
                         freq=freq / 5, name='timestamp')
    n = len(time)
    while True:
        # Random walk, rescaled to the day's range and pinned to open/close.
        values = (random_state.random_sample(n) - 0.5).cumsum()
        values *= (high - low) / (values.max() - values.min())  # scale
        values += np.linspace(open - values[0], close - values[-1],
                              len(values))  # endpoints
        assert np.allclose(open, values[0])
        assert np.allclose(close, values[-1])

        # Squash excursions outside the open/close band back into [low, high].
        mx = max(close, open)
        mn = min(close, open)
        ind = values > mx
        values[ind] = (values[ind] - mx) * (high - mx) / (values.max() - mx) + mx
        ind = values < mn
        values[ind] = (values[ind] - mn) * (low - mn) / (values.min() - mn) + mn

        # The process fails if min/max are the same as open/close.  This is rare.
        if (np.allclose(values.max(), high) and
                np.allclose(values.min(), low)):
            break

    s = pd.Series(values.round(3), index=time)
    rs = s.resample(freq)
    # TODO: add in volume
    return pd.DataFrame({'open': rs.first(),
                         'close': rs.last(),
                         'high': rs.max(),
                         'low': rs.min()})
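
# Illustrative sketch, not in the original module: generate_day can be used on
# its own to synthesise one trading day of OHLC bars, e.g.
#
#     day = generate_day('2010-01-04', open=100.0, high=101.5, low=99.0,
#                        close=100.7, volume=1e6,
#                        freq=pd.Timedelta(seconds=60), random_state=0)
#     # day is a pandas DataFrame indexed from 09:00 to 16:00 at 60 s bars,
#     # with 'open', 'close', 'high', 'low' columns (volume is not modelled yet).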


def daily_stock(symbol, start, stop, freq=pd.Timedelta(seconds=1),
                data_source='yahoo', random_state=None):
    """ Create artificial stock data

    This data matches daily open/high/low/close values from Yahoo! Finance,
    but interpolates values within each day with random values.  This makes
    the results look natural without requiring the downloading of large
    volumes of data.  This is useful for education and benchmarking.

    Parameters
    ----------
    symbol: string
        A stock symbol like "GOOG" or "F"
    start: date, str, or pd.Timestamp
        The start date, input will be fed into pd.Timestamp for normalization
    stop: date, str, or pd.Timestamp
        The end date, input will be fed into pd.Timestamp for normalization
    freq: timedelta, str, or pd.Timedelta
        The frequency of sampling
    data_source: str, optional
        defaults to 'yahoo'.  See pandas_datareader.data.DataReader for options
    random_state: int, np.random.RandomState object
        random seed, defaults to randomly chosen

    Examples
    --------
    >>> import dask.dataframe as dd  # doctest: +SKIP
    >>> df = dd.demo.daily_stock('GOOG', '2010', '2011', freq='1s')  # doctest: +SKIP
    >>> df  # doctest: +SKIP
    Dask DataFrame Structure:
                           close     high      low     open
    npartitions=252
    2010-01-04 09:00:00  float64  float64  float64  float64
    2010-01-05 09:00:00      ...      ...      ...      ...
    ...                      ...      ...      ...      ...
    2010-12-31 09:00:00      ...      ...      ...      ...
    2010-12-31 16:00:00      ...      ...      ...      ...
    Dask Name: from-delayed, 504 tasks

    >>> df.head()  # doctest: +SKIP
                           close     high      low     open
    timestamp
    2010-01-04 09:00:00  626.944  626.964  626.944  626.951
    2010-01-04 09:00:01  626.906  626.931  626.906  626.931
    2010-01-04 09:00:02  626.901  626.911  626.901  626.905
    2010-01-04 09:00:03  626.920  626.920  626.905  626.905
    2010-01-04 09:00:04  626.894  626.917  626.894  626.906
    """
    from pandas_datareader import data
    df = data.DataReader(symbol, data_source, start, stop)
    seeds = random_state_data(len(df), random_state=random_state)
    parts = []
    divisions = []
    for i, seed in zip(range(len(df)), seeds):
        s = df.iloc[i]
        if s.isnull().any():
            continue
        part = delayed(generate_day)(s.name, s.loc['Open'], s.loc['High'],
                                     s.loc['Low'], s.loc['Close'],
                                     s.loc['Volume'],
                                     freq=freq, random_state=seed)
        parts.append(part)
        divisions.append(s.name + pd.Timedelta(hours=9))

    divisions.append(s.name + pd.Timedelta(hours=12 + 4))
    meta = generate_day('2000-01-01', 1, 2, 0, 1, 100)

    return from_delayed(parts, meta=meta, divisions=divisions)
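
# Usage note (sketch): each non-null daily row contributes one delayed
# generate_day() call covering 09:00-16:00 of that date, so the resulting
# dask DataFrame has one partition per trading day; the 504 tasks in the
# docstring example correspond roughly to one generate_day call plus one
# from-delayed wrapper for each of the 252 days.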