from __future__ import absolute_import, print_function, division

import random

from .utils import import_required


def timeseries(
    start='2000-01-01',
    end='2000-01-31',
    freq='1s',
    partition_freq='1d',
    dtypes=None,
    seed=None,
):
    """ Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        Defaults to ``{'name': str, 'id': int, 'x': float, 'y': float}``
        when not given.
    seed : int (optional)
        Randomstate seed

    Returns
    -------
    The object returned by ``dask.dataframe.io.demo.make_timeseries``

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    """
    # Imported lazily so that importing this module does not pull in
    # dask.dataframe (and its dependencies) until it is actually needed.
    from dask.dataframe.io.demo import make_timeseries

    # Build the default mapping per call rather than sharing one mutable
    # dict object across every invocation through the signature.
    if dtypes is None:
        dtypes = {'name': str, 'id': int, 'x': float, 'y': float}

    return make_timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=dtypes,
    )


def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """ Generate data for a single partition of a dask bag

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.schema.Field``
    schema_description: Callable[Field] -> dict
        Called with the constructed ``Field`` to describe one record
    records_per_partition: int
        Number of records to yield
    seed: object
        Seed forwarded to ``mimesis.schema.Field``

    Yields
    ------
    One generated record per iteration, ``records_per_partition`` in total

    See Also
    --------
    _make_mimesis
    """
    from mimesis.schema import Schema, Field

    provider = Field(seed=seed, **field)
    schema = Schema(schema=lambda: schema_description(provider))
    # Records are created one at a time so the partition is produced lazily.
    for _ in range(records_per_partition):
        yield schema.create(iterations=1)[0]


def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
        Number of partitions
    records_per_partition: int
        Number of records in each partition
    seed: int, None
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    # The seed feeds both the per-partition seeds and the graph name below,
    # so an unseeded call draws a fresh value instead of reusing ``None``.
    if seed is None:
        seed = random.random()

    # One derived seed per partition.
    seeds = db.core.random_state_data_python(npartitions, seed)

    name = 'mimesis-' + tokenize(field, schema, npartitions,
                                 records_per_partition, seed)
    dsk = {
        (name, i): (_generate_mimesis, field, schema,
                    records_per_partition, partition_seed)
        for i, partition_seed in enumerate(seeds)
    }

    return db.Bag(dsk, name, npartitions)


def make_people(npartitions=10, records_per_partition=1000, seed=None,
                locale='en'):
    """ Make a dataset of random people

    This makes a Dask Bag with dictionary records of randomly generated
    people.  This requires the optional library ``mimesis`` to generate
    records.

    Each record has the keys ``'age'``, ``'name'`` (a ``(name, surname)``
    tuple), ``'occupation'``, ``'telephone'``, ``'address'`` (a dict with
    ``'address'`` and ``'city'``) and ``'credit-card'`` (a dict with
    ``'number'`` and ``'expiration-date'``).

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    import_required('mimesis',
                    'The mimesis module is required for this function.  Try:\n'
                    '  pip install mimesis')

    def schema(field):
        # Describe a single person record using the mimesis field provider.
        return {
            'age': field('person.age'),
            'name': (field('person.name'), field('person.surname')),
            'occupation': field('person.occupation'),
            'telephone': field('person.telephone'),
            'address': {
                'address': field('address.address'),
                'city': field('address.city'),
            },
            'credit-card': {
                'number': field('payment.credit_card_number'),
                'expiration-date': field('payment.credit_card_expiration_date'),
            },
        }

    return _make_mimesis({'locale': locale}, schema, npartitions,
                         records_per_partition, seed)