# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
from functools import wraps, partial
import operator
from operator import getitem
from pprint import pformat
import warnings
from toolz import merge, first, unique, partition_all, remove
import pandas as pd
import numpy as np
from numbers import Number, Integral
try:
from chest import Chest as Cache
except ImportError:
Cache = dict
from .. import array as da
from .. import core
from ..utils import partial_by_order, Dispatch, IndexCallable
from .. import threaded
from ..compatibility import (apply, operator_div, bind_method, string_types,
Iterator, Sequence)
from ..context import globalmethod
from ..utils import (random_state_data, pseudorandom, derived_from, funcname,
memory_repr, put_lines, M, key_split, OperatorMethodMixin,
is_arraylike, typename)
from ..array.core import Array, normalize_arg
from ..blockwise import blockwise, Blockwise
from ..base import DaskMethodsMixin, tokenize, dont_optimize, is_dask_collection
from ..sizeof import sizeof
from ..delayed import delayed, Delayed, unpack_collections
from ..highlevelgraph import HighLevelGraph
from . import methods
from .accessor import DatetimeAccessor, StringAccessor
from .categorical import CategoricalAccessor, categorize
from .hashing import hash_pandas_object
from .optimize import optimize
from .utils import (meta_nonempty, make_meta, insert_meta_param_description,
raise_on_meta_error, clear_known_categories,
is_categorical_dtype, has_known_categories, PANDAS_VERSION,
index_summary, is_dataframe_like, is_series_like,
is_index_like)
no_default = '__no_default__'
if PANDAS_VERSION >= '0.20.0':
from pandas.util import cache_readonly
pd.set_option('compute.use_numexpr', False)
else:
from pandas.util.decorators import cache_readonly
pd.computation.expressions.set_use_numexpr(False)
def _concat(args):
if not args:
return args
if isinstance(first(core.flatten(args)), np.ndarray):
return da.core.concatenate3(args)
if not has_parallel_type(args[0]):
try:
return pd.Series(args)
except Exception:
return args
# We filter out empty partitions here because pandas frequently has
# inconsistent dtypes in results between empty and non-empty frames.
# Ideally this would be handled locally for each operation, but in practice
# this seems easier. TODO: don't do this.
args2 = [i for i in args if len(i)]
return args[0] if not args2 else methods.concat(args2, uniform=True)
def finalize(results):
return _concat(results)
class Scalar(DaskMethodsMixin, OperatorMethodMixin):
""" A Dask object to represent a pandas scalar"""
def __init__(self, dsk, name, meta, divisions=None):
# divisions is ignored, only present to be compatible with other
# objects.
if not isinstance(dsk, HighLevelGraph):
dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[])
self.dask = dsk
self._name = name
meta = make_meta(meta)
if is_dataframe_like(meta) or is_series_like(meta) or is_index_like(meta):
raise TypeError("Expected meta to specify scalar, got "
"{0}".format(typename(type(meta))))
self._meta = meta
def __dask_graph__(self):
return self.dask
def __dask_keys__(self):
return [self.key]
def __dask_tokenize__(self):
return self._name
def __dask_layers__(self):
return (self.key,)
__dask_optimize__ = globalmethod(optimize, key='dataframe_optimize',
falsey=dont_optimize)
__dask_scheduler__ = staticmethod(threaded.get)
def __dask_postcompute__(self):
return first, ()
def __dask_postpersist__(self):
return Scalar, (self._name, self._meta, self.divisions)
@property
def _meta_nonempty(self):
return self._meta
@property
def dtype(self):
return self._meta.dtype
def __dir__(self):
o = set(dir(type(self)))
o.update(self.__dict__)
if not hasattr(self._meta, 'dtype'):
o.remove('dtype') # dtype only in `dir` if available
return list(o)
@property
def divisions(self):
"""Dummy divisions to be compat with Series and DataFrame"""
return [None, None]
def __repr__(self):
name = self._name if len(self._name) < 10 else self._name[:7] + '...'
if hasattr(self._meta, 'dtype'):
extra = ', dtype=%s' % self._meta.dtype
else:
extra = ', type=%s' % type(self._meta).__name__
return "dd.Scalar<%s%s>" % (name, extra)
def __array__(self):
# array interface is required to support pandas instance + Scalar
# Otherwise, the above operation results in a pd.Series of Scalar (object dtype)
return np.asarray(self.compute())
@property
def _args(self):
return (self.dask, self._name, self._meta)
def __getstate__(self):
return self._args
def __setstate__(self, state):
self.dask, self._name, self._meta = state
@property
def key(self):
return (self._name, 0)
@classmethod
def _get_unary_operator(cls, op):
def f(self):
name = funcname(op) + '-' + tokenize(self)
dsk = {(name, 0): (op, (self._name, 0))}
meta = op(self._meta_nonempty)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return Scalar(graph, name, meta)
return f
@classmethod
def _get_binary_operator(cls, op, inv=False):
return lambda self, other: _scalar_binary(op, self, other, inv=inv)
def to_delayed(self, optimize_graph=True):
"""Convert into a ``dask.delayed`` object.
Parameters
----------
optimize_graph : bool, optional
If True [default], the graph is optimized before converting into
``dask.delayed`` objects.
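Examples
--------
A minimal sketch, assuming ``ddf`` is an existing dask DataFrame with a
numeric column ``x`` (so that ``ddf.x.sum()`` produces a ``Scalar``):
>>> total = ddf.x.sum().to_delayed()  # doctest: +SKIP
>>> total.compute()  # doctest: +SKIP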
"""
dsk = self.__dask_graph__()
if optimize_graph:
dsk = self.__dask_optimize__(dsk, self.__dask_keys__())
name = 'delayed-' + self._name
dsk = HighLevelGraph.from_collections(name, dsk, dependencies=())
return Delayed(self.key, dsk)
def _scalar_binary(op, self, other, inv=False):
name = '{0}-{1}'.format(funcname(op), tokenize(self, other))
dependencies = [self]
dsk = {}
return_type = get_parallel_type(other)
if isinstance(other, Scalar):
dependencies.append(other)
other_key = (other._name, 0)
elif is_dask_collection(other):
return NotImplemented
else:
other_key = other
if inv:
dsk.update({(name, 0): (op, other_key, (self._name, 0))})
else:
dsk.update({(name, 0): (op, (self._name, 0), other_key)})
other_meta = make_meta(other)
other_meta_nonempty = meta_nonempty(other_meta)
if inv:
meta = op(other_meta_nonempty, self._meta_nonempty)
else:
meta = op(self._meta_nonempty, other_meta_nonempty)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies)
if return_type is not Scalar:
return return_type(graph, name, meta,
[other.index.min(), other.index.max()])
else:
return Scalar(graph, name, meta)
class _Frame(DaskMethodsMixin, OperatorMethodMixin):
""" Superclass for DataFrame and Series
Parameters
----------
dsk: dict
The dask graph to compute this DataFrame
name: str
The key prefix that specifies which keys in the dask comprise this
particular DataFrame / Series
meta: pandas.DataFrame, pandas.Series, or pandas.Index
An empty pandas object with names, dtypes, and indices matching the
expected output.
divisions: tuple of index values
Values along which we partition our blocks on the index
"""
def __init__(self, dsk, name, meta, divisions):
if not isinstance(dsk, HighLevelGraph):
dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[])
self.dask = dsk
self._name = name
meta = make_meta(meta)
if not isinstance(meta, self._partition_type):
raise TypeError("Expected meta to specify type {0}, got type "
"{1}".format(typename(self._partition_type),
typename(type(meta))))
self._meta = meta
self.divisions = tuple(divisions)
def __dask_graph__(self):
return self.dask
def __dask_keys__(self):
return [(self._name, i) for i in range(self.npartitions)]
def __dask_layers__(self):
return (self._name,)
def __dask_tokenize__(self):
return self._name
__dask_optimize__ = globalmethod(optimize, key='dataframe_optimize',
falsey=dont_optimize)
__dask_scheduler__ = staticmethod(threaded.get)
def __dask_postcompute__(self):
return finalize, ()
def __dask_postpersist__(self):
return type(self), (self._name, self._meta, self.divisions)
@property
def _constructor(self):
return new_dd_object
@property
def npartitions(self):
"""Return number of partitions"""
return len(self.divisions) - 1
@property
def size(self):
"""Size of the Series or DataFrame as a Delayed object.
Examples
--------
>>> series.size # doctest: +SKIP
dd.Scalar<size-ag..., dtype=int64>
"""
return self.reduction(methods.size, np.sum, token='size', meta=int,
split_every=False)
@property
def _meta_nonempty(self):
""" A non-empty version of `_meta` with fake data."""
return meta_nonempty(self._meta)
@property
def _args(self):
return (self.dask, self._name, self._meta, self.divisions)
def __getstate__(self):
return self._args
def __setstate__(self, state):
self.dask, self._name, self._meta, self.divisions = state
def copy(self):
""" Make a copy of the dataframe
This is strictly a shallow copy of the underlying computational graph.
It does not affect the underlying data
"""
return new_dd_object(self.dask, self._name,
self._meta, self.divisions)
def __array__(self, dtype=None, **kwargs):
self._computed = self.compute()
x = np.array(self._computed)
return x
def __array_wrap__(self, array, context=None):
raise NotImplementedError
def __array_ufunc__(self, numpy_ufunc, method, *inputs, **kwargs):
out = kwargs.get('out', ())
for x in inputs + out:
# ufuncs work with 0-dimensional NumPy ndarrays
# so we don't want to raise NotImplemented
if isinstance(x, np.ndarray) and x.shape == ():
continue
elif not isinstance(x, (Number, Scalar, _Frame, Array,
pd.DataFrame, pd.Series, pd.Index)):
return NotImplemented
if method == '__call__':
if numpy_ufunc.signature is not None:
return NotImplemented
if numpy_ufunc.nout > 1:
# ufuncs with multiple output values
# are not yet supported for frames
return NotImplemented
else:
return elemwise(numpy_ufunc, *inputs, **kwargs)
else:
# ufunc methods are not yet supported for frames
return NotImplemented
@property
def _elemwise(self):
return elemwise
def _repr_data(self):
raise NotImplementedError
@property
def _repr_divisions(self):
name = "npartitions={0}".format(self.npartitions)
if self.known_divisions:
divisions = pd.Index(self.divisions, name=name)
else:
# avoid being converted to NaN
divisions = pd.Index([''] * (self.npartitions + 1), name=name)
return divisions
def __repr__(self):
data = self._repr_data().to_string(max_rows=5, show_dimensions=False)
return """Dask {klass} Structure:
{data}
Dask Name: {name}, {task} tasks""".format(klass=self.__class__.__name__,
data=data, name=key_split(self._name),
task=len(self.dask))
@property
def index(self):
"""Return dask Index instance"""
return self.map_partitions(getattr, 'index', token=self._name + '-index',
meta=self._meta.index)
def reset_index(self, drop=False):
"""Reset the index to the default index.
Note that unlike in ``pandas``, the reset ``dask.dataframe`` index will
not be monotonically increasing from 0. Instead, it will restart at 0
for each partition (e.g. ``index1 = [0, ..., 10], index2 = [0, ...]``).
This is due to the inability to statically know the full length of the
index.
For DataFrame with multi-level index, returns a new DataFrame with
labeling information in the columns under the index names, defaulting
to 'level_0', 'level_1', etc. if any are None. For a standard index,
the index name will be used (if set), otherwise a default 'index' or
'level_0' (if 'index' is already taken) will be used.
Parameters
----------
drop : boolean, default False
Do not try to insert index into dataframe columns.
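Examples
--------
A sketch of the per-partition restart described above, assuming ``ddf``
is a multi-partition dask DataFrame built with ``dd.from_pandas``; the
resulting index restarts at 0 within each partition:
>>> ddf.reset_index(drop=True).compute()  # doctest: +SKIP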
"""
return self.map_partitions(M.reset_index, drop=drop).clear_divisions()
@property
def known_divisions(self):
"""Whether divisions are already known"""
return len(self.divisions) > 0 and self.divisions[0] is not None
def clear_divisions(self):
""" Forget division information """
divisions = (None,) * (self.npartitions + 1)
return type(self)(self.dask, self._name, self._meta, divisions)
def get_partition(self, n):
"""Get a dask DataFrame/Series representing the `nth` partition."""
if 0 <= n < self.npartitions:
name = 'get-partition-%s-%s' % (str(n), self._name)
divisions = self.divisions[n:n + 2]
layer = {(name, 0): (self._name, n)}
graph = HighLevelGraph.from_collections(name, layer, dependencies=[self])
return new_dd_object(graph, name, self._meta, divisions)
else:
msg = "n must be 0 <= n < {0}".format(self.npartitions)
raise ValueError(msg)
@derived_from(pd.DataFrame)
def drop_duplicates(self, split_every=None, split_out=1, **kwargs):
# Let pandas error on bad inputs
self._meta_nonempty.drop_duplicates(**kwargs)
if 'subset' in kwargs and kwargs['subset'] is not None:
split_out_setup = split_out_on_cols
split_out_setup_kwargs = {'cols': kwargs['subset']}
else:
split_out_setup = split_out_setup_kwargs = None
if kwargs.get('keep', True) is False:
raise NotImplementedError("drop_duplicates with keep=False")
chunk = M.drop_duplicates
return aca(self, chunk=chunk, aggregate=chunk, meta=self._meta,
token='drop-duplicates', split_every=split_every,
split_out=split_out, split_out_setup=split_out_setup,
split_out_setup_kwargs=split_out_setup_kwargs, **kwargs)
def __len__(self):
return self.reduction(len, np.sum, token='len', meta=int,
split_every=False).compute()
def __bool__(self):
raise ValueError("The truth value of a {0} is ambiguous. "
"Use a.any() or a.all()."
.format(self.__class__.__name__))
__nonzero__ = __bool__ # python 2
def _scalarfunc(self, cast_type):
def wrapper():
raise TypeError("cannot convert the series to "
"{0}".format(str(cast_type)))
return wrapper
def __float__(self):
return self._scalarfunc(float)
def __int__(self):
return self._scalarfunc(int)
__long__ = __int__ # python 2
def __complex__(self):
return self._scalarfunc(complex)
@insert_meta_param_description(pad=12)
def map_partitions(self, func, *args, **kwargs):
""" Apply Python function on each DataFrame partition.
Note that the index and divisions are assumed to remain unchanged.
Parameters
----------
func : function
Function applied to each partition.
args, kwargs :
Arguments and keywords to pass to the function. The partition will
be the first argument, and these will be passed *after*. Arguments
and keywords may contain ``Scalar``, ``Delayed`` or regular
python objects.
$META
Examples
--------
Given a DataFrame, Series, or Index, such as:
>>> import dask.dataframe as dd
>>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
... 'y': [1., 2., 3., 4., 5.]})
>>> ddf = dd.from_pandas(df, npartitions=2)
One can use ``map_partitions`` to apply a function on each partition.
Extra arguments and keywords can optionally be provided, and will be
passed to the function after the partition.
Here we apply a function with arguments and keywords to a DataFrame,
resulting in a Series:
>>> def myadd(df, a, b=1):
... return df.x + df.y + a + b
>>> res = ddf.map_partitions(myadd, 1, b=2)
>>> res.dtype
dtype('float64')
By default, dask tries to infer the output metadata by running your
provided function on some fake data. This works well in many cases, but
can sometimes be expensive, or even fail. To avoid this, you can
manually specify the output metadata with the ``meta`` keyword. This
can be specified in many forms, for more information see
``dask.dataframe.utils.make_meta``.
Here we specify the output is a Series with no name, and dtype
``float64``:
>>> res = ddf.map_partitions(myadd, 1, b=2, meta=(None, 'f8'))
Here we map a function that takes in a DataFrame, and returns a
DataFrame with a new column:
>>> res = ddf.map_partitions(lambda df: df.assign(z=df.x * df.y))
>>> res.dtypes
x int64
y float64
z float64
dtype: object
As before, the output metadata can also be specified manually. This
time we pass in a ``dict``, as the output is a DataFrame:
>>> res = ddf.map_partitions(lambda df: df.assign(z=df.x * df.y),
... meta={'x': 'i8', 'y': 'f8', 'z': 'f8'})
In the case where the metadata doesn't change, you can also pass in
the object itself directly:
>>> res = ddf.map_partitions(lambda df: df.head(), meta=df)
Also note that the index and divisions are assumed to remain unchanged.
If the function you're mapping changes the index/divisions, you'll need
to clear them afterwards:
>>> ddf.map_partitions(func).clear_divisions() # doctest: +SKIP
"""
return map_partitions(func, self, *args, **kwargs)
@insert_meta_param_description(pad=12)
def map_overlap(self, func, before, after, *args, **kwargs):
"""Apply a function to each partition, sharing rows with adjacent partitions.
This can be useful for implementing windowing functions such as
``df.rolling(...).mean()`` or ``df.diff()``.
Parameters
----------
func : function
Function applied to each partition.
before : int
The number of rows to prepend to partition ``i`` from the end of
partition ``i - 1``.
after : int
The number of rows to append to partition ``i`` from the beginning
of partition ``i + 1``.
args, kwargs :
Arguments and keywords to pass to the function. The partition will
be the first argument, and these will be passed *after*.
$META
Notes
-----
Given positive integers ``before`` and ``after``, and a function
``func``, ``map_overlap`` does the following:
1. Prepend ``before`` rows to each partition ``i`` from the end of
partition ``i - 1``. The first partition has no rows prepended.
2. Append ``after`` rows to each partition ``i`` from the beginning of
partition ``i + 1``. The last partition has no rows appended.
3. Apply ``func`` to each partition, passing in any extra ``args`` and
``kwargs`` if provided.
4. Trim ``before`` rows from the beginning of all but the first
partition.
5. Trim ``after`` rows from the end of all but the last partition.
Note that the index and divisions are assumed to remain unchanged.
Examples
--------
Given a DataFrame, Series, or Index, such as:
>>> import dask.dataframe as dd
>>> df = pd.DataFrame({'x': [1, 2, 4, 7, 11],
... 'y': [1., 2., 3., 4., 5.]})
>>> ddf = dd.from_pandas(df, npartitions=2)
A rolling sum with a trailing moving window of size 2 can be computed by
overlapping 2 rows before each partition, and then mapping calls to
``df.rolling(2).sum()``:
>>> ddf.compute()
x y
0 1 1.0
1 2 2.0
2 4 3.0
3 7 4.0
4 11 5.0
>>> ddf.map_overlap(lambda df: df.rolling(2).sum(), 2, 0).compute()
x y
0 NaN NaN
1 3.0 3.0
2 6.0 5.0
3 11.0 7.0
4 18.0 9.0
The pandas ``diff`` method computes a discrete difference shifted by a
number of periods (can be positive or negative). This can be
implemented by mapping calls to ``df.diff`` to each partition after
prepending/appending that many rows, depending on sign:
>>> def diff(df, periods=1):
... before, after = (periods, 0) if periods > 0 else (0, -periods)
... return df.map_overlap(lambda df, periods=1: df.diff(periods),
... before, after, periods=periods)
>>> diff(ddf, 1).compute()
x y
0 NaN NaN
1 1.0 1.0
2 2.0 1.0
3 3.0 1.0
4 4.0 1.0
If you have a ``DatetimeIndex``, you can use a ``pd.Timedelta`` for time-
based windows.
>>> ts = pd.Series(range(10), index=pd.date_range('2017', periods=10))
>>> dts = dd.from_pandas(ts, npartitions=2)
>>> dts.map_overlap(lambda df: df.rolling('2D').sum(),
... pd.Timedelta('2D'), 0).compute()
2017-01-01 0.0
2017-01-02 1.0
2017-01-03 3.0
2017-01-04 5.0
2017-01-05 7.0
2017-01-06 9.0
2017-01-07 11.0
2017-01-08 13.0
2017-01-09 15.0
2017-01-10 17.0
dtype: float64
"""
from .rolling import map_overlap
return map_overlap(func, self, before, after, *args, **kwargs)
@insert_meta_param_description(pad=12)
def reduction(self, chunk, aggregate=None, combine=None, meta=no_default,
token=None, split_every=None, chunk_kwargs=None,
aggregate_kwargs=None, combine_kwargs=None, **kwargs):
"""Generic row-wise reductions.
Parameters
----------
chunk : callable
Function to operate on each partition. Should return a
``pandas.DataFrame``, ``pandas.Series``, or a scalar.
aggregate : callable, optional
Function to operate on the concatenated result of ``chunk``. If not
specified, defaults to ``chunk``. Used to do the final aggregation
in a tree reduction.
The input to ``aggregate`` depends on the output of ``chunk``.
If the output of ``chunk`` is a:
- scalar: Input is a Series, with one row per partition.
- Series: Input is a DataFrame, with one row per partition. Columns
are the rows in the output series.
- DataFrame: Input is a DataFrame, with one row per partition.
Columns are the columns in the output dataframes.
Should return a ``pandas.DataFrame``, ``pandas.Series``, or a
scalar.
combine : callable, optional
Function to operate on intermediate concatenated results of
``chunk`` in a tree-reduction. If not provided, defaults to
``aggregate``. The input/output requirements should match that of
``aggregate`` described above.
$META
token : str, optional
The name to use for the output keys.
split_every : int, optional
Group partitions into groups of this size while performing a
tree-reduction. If set to False, no tree-reduction will be used,
and all intermediates will be concatenated and passed to
``aggregate``. Default is 8.
chunk_kwargs : dict, optional
Keyword arguments to pass on to ``chunk`` only.
aggregate_kwargs : dict, optional
Keyword arguments to pass on to ``aggregate`` only.
combine_kwargs : dict, optional
Keyword arguments to pass on to ``combine`` only.
kwargs :
All remaining keywords will be passed to ``chunk``, ``combine``,
and ``aggregate``.
Examples
--------
>>> import pandas as pd
>>> import dask.dataframe as dd
>>> df = pd.DataFrame({'x': range(50), 'y': range(50, 100)})
>>> ddf = dd.from_pandas(df, npartitions=4)
Count the number of rows in a DataFrame. To do this, count the number
of rows in each partition, then sum the results:
>>> res = ddf.reduction(lambda x: x.count(),
... aggregate=lambda x: x.sum())
>>> res.compute()
x 50
y 50
dtype: int64
Count the number of rows in a Series with elements greater than or
equal to a value (provided via a keyword).
>>> def count_greater(x, value=0):
... return (x >= value).sum()
>>> res = ddf.x.reduction(count_greater, aggregate=lambda x: x.sum(),
... chunk_kwargs={'value': 25})
>>> res.compute()
25
Aggregate both the sum and count of a Series at the same time:
>>> def sum_and_count(x):
... return pd.Series({'count': x.count(), 'sum': x.sum()},
... index=['count', 'sum'])
>>> res = ddf.x.reduction(sum_and_count, aggregate=lambda x: x.sum())
>>> res.compute()
count 50
sum 1225
dtype: int64
Doing the same, but for a DataFrame. Here ``chunk`` returns a
DataFrame, meaning the input to ``aggregate`` is a DataFrame with an
index with non-unique entries for both 'x' and 'y'. We groupby the
index, and sum each group to get the final result.
>>> def sum_and_count(x):
... return pd.DataFrame({'count': x.count(), 'sum': x.sum()},
... columns=['count', 'sum'])
>>> res = ddf.reduction(sum_and_count,
... aggregate=lambda x: x.groupby(level=0).sum())
>>> res.compute()
count sum
x 50 1225
y 50 3725
"""
if aggregate is None:
aggregate = chunk
if combine is None:
if combine_kwargs:
raise ValueError("`combine_kwargs` provided with no `combine`")
combine = aggregate
combine_kwargs = aggregate_kwargs
chunk_kwargs = chunk_kwargs.copy() if chunk_kwargs else {}
chunk_kwargs['aca_chunk'] = chunk
combine_kwargs = combine_kwargs.copy() if combine_kwargs else {}
combine_kwargs['aca_combine'] = combine
aggregate_kwargs = aggregate_kwargs.copy() if aggregate_kwargs else {}
aggregate_kwargs['aca_aggregate'] = aggregate
return aca(self, chunk=_reduction_chunk, aggregate=_reduction_aggregate,
combine=_reduction_combine, meta=meta, token=token,
split_every=split_every, chunk_kwargs=chunk_kwargs,
aggregate_kwargs=aggregate_kwargs,
combine_kwargs=combine_kwargs, **kwargs)
@derived_from(pd.DataFrame)
def pipe(self, func, *args, **kwargs):
# Taken from pandas:
# https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
if isinstance(func, tuple):
func, target = func
if target in kwargs:
raise ValueError('%s is both the pipe target and a keyword '
'argument' % target)
kwargs[target] = self
return func(*args, **kwargs)
else:
return func(self, *args, **kwargs)
def random_split(self, frac, random_state=None):
""" Pseudorandomly split dataframe into different pieces row-wise
Parameters
----------
frac : list
List of floats that should sum to one.
random_state: int or np.random.RandomState
If an int, create a new RandomState with this as the seed;
otherwise draw from the passed RandomState.
Examples
--------
50/50 split
>>> a, b = df.random_split([0.5, 0.5]) # doctest: +SKIP
80/10/10 split, consistent random_state
>>> a, b, c = df.random_split([0.8, 0.1, 0.1], random_state=123) # doctest: +SKIP
See Also
--------
dask.DataFrame.sample
"""
if not np.allclose(sum(frac), 1):
raise ValueError("frac should sum to 1")
state_data = random_state_data(self.npartitions, random_state)
token = tokenize(self, frac, random_state)
name = 'split-' + token
layer = {(name, i): (pd_split, (self._name, i), frac, state)
for i, state in enumerate(state_data)}
out = []
for i in range(len(frac)):
name2 = 'split-%d-%s' % (i, token)
dsk2 = {(name2, j): (getitem, (name, j), i)
for j in range(self.npartitions)}
graph = HighLevelGraph.from_collections(name2, merge(dsk2, layer), dependencies=[self])
out_df = type(self)(graph, name2, self._meta, self.divisions)
out.append(out_df)
return out
def head(self, n=5, npartitions=1, compute=True):
""" First n rows of the dataset
Parameters
----------
n : int, optional
The number of rows to return. Default is 5.
npartitions : int, optional
Elements are only taken from the first ``npartitions``, with a
default of 1. If there are fewer than ``n`` rows in the first
``npartitions``, a warning will be raised and any rows found will be
returned. Pass -1 to use all partitions.
compute : bool, optional
Whether to compute the result, default is True.
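Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame; the second
call searches all partitions for the first 10 rows:
>>> ddf.head()  # doctest: +SKIP
>>> ddf.head(10, npartitions=-1)  # doctest: +SKIP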
"""
if npartitions <= -1:
npartitions = self.npartitions
if npartitions > self.npartitions:
msg = "only {} partitions, head received {}"
raise ValueError(msg.format(self.npartitions, npartitions))
name = 'head-%d-%d-%s' % (npartitions, n, self._name)
if npartitions > 1:
name_p = 'head-partial-%d-%s' % (n, self._name)
dsk = {}
for i in range(npartitions):
dsk[(name_p, i)] = (M.head, (self._name, i), n)
concat = (_concat, [(name_p, i) for i in range(npartitions)])
dsk[(name, 0)] = (safe_head, concat, n)
else:
dsk = {(name, 0): (safe_head, (self._name, 0), n)}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
result = new_dd_object(graph, name, self._meta,
[self.divisions[0], self.divisions[npartitions]])
if compute:
result = result.compute()
return result
def tail(self, n=5, compute=True):
""" Last n rows of the dataset
Caveat: this only checks the last n rows of the last partition.
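Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame:
>>> ddf.tail(3)  # doctest: +SKIP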
"""
name = 'tail-%d-%s' % (n, self._name)
dsk = {(name, 0): (M.tail, (self._name, self.npartitions - 1), n)}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
result = new_dd_object(graph, name, self._meta, self.divisions[-2:])
if compute:
result = result.compute()
return result
@property
def loc(self):
""" Purely label-location based indexer for selection by label.
>>> df.loc["b"] # doctest: +SKIP
>>> df.loc["b":"d"] # doctest: +SKIP
"""
from .indexing import _LocIndexer
return _LocIndexer(self)
def _partitions(self, index):
if not isinstance(index, tuple):
index = (index,)
from ..array.slicing import normalize_index
index = normalize_index(index, (self.npartitions,))
index = tuple(slice(k, k + 1) if isinstance(k, Number) else k
for k in index)
name = 'blocks-' + tokenize(self, index)
new_keys = np.array(self.__dask_keys__(), dtype=object)[index].tolist()
divisions = [self.divisions[i] for _, i in new_keys] + [self.divisions[new_keys[-1][1] + 1]]
dsk = {(name, i): tuple(key) for i, key in enumerate(new_keys)}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return new_dd_object(graph, name, self._meta, divisions)
@property
def partitions(self):
""" Slice dataframe by partitions
This allows partitionwise slicing of a Dask DataFrame. You can perform
normal NumPy-style slicing, but rather than slicing elements of the
array, you slice along partitions; for example, ``df.partitions[:5]``
produces a new Dask DataFrame of the first five partitions.
Examples
--------
>>> df.partitions[0] # doctest: +SKIP
>>> df.partitions[:3] # doctest: +SKIP
>>> df.partitions[::10] # doctest: +SKIP
Returns
-------
A Dask DataFrame
"""
return IndexCallable(self._partitions)
# Note: iloc is implemented only on DataFrame
def repartition(self, divisions=None, npartitions=None, freq=None, force=False):
""" Repartition dataframe along new divisions
Parameters
----------
divisions : list, optional
List of partitions to be used. If specified npartitions will be
ignored.
npartitions : int, optional
Number of partitions of output. Only used if divisions isn't
specified.
freq : str, pd.Timedelta
A period on which to partition timeseries data like ``'7D'`` or
``'12h'`` or ``pd.Timedelta(hours=12)``. Assumes a datetime index.
force : bool, default False
Allows the expansion of the existing divisions.
If False then the new divisions lower and upper bounds must be
the same as the old divisions.
Examples
--------
>>> df = df.repartition(npartitions=10) # doctest: +SKIP
>>> df = df.repartition(divisions=[0, 5, 10, 20]) # doctest: +SKIP
>>> df = df.repartition(freq='7d') # doctest: +SKIP
"""
if npartitions is not None and divisions is not None:
warnings.warn("When providing both npartitions and divisions to "
"repartition only npartitions is used.")
if npartitions is not None:
return repartition_npartitions(self, npartitions)
elif divisions is not None:
return repartition(self, divisions, force=force)
elif freq is not None:
return repartition_freq(self, freq=freq)
else:
raise ValueError(
"Provide either divisions= or npartitions= to repartition")
@derived_from(pd.DataFrame)
def fillna(self, value=None, method=None, limit=None, axis=None):
axis = self._validate_axis(axis)
if method is None and limit is not None:
raise NotImplementedError("fillna with set limit and method=None")
if isinstance(value, _Frame):
test_value = value._meta_nonempty.values[0]
else:
test_value = value
meta = self._meta_nonempty.fillna(value=test_value, method=method,
limit=limit, axis=axis)
if axis == 1 or method is None:
# Control whether or not dask's partition alignment happens.
# We don't want it for a pandas Series, but we do want it for
# a dask Series.
if is_series_like(value):
args = ()
kwargs = {'value': value}
else:
args = (value,)
kwargs = {}
return self.map_partitions(M.fillna, *args, method=method,
limit=limit, axis=axis, meta=meta,
**kwargs)
if method in ('pad', 'ffill'):
method = 'ffill'
skip_check = 0
before, after = 1 if limit is None else limit, 0
else:
method = 'bfill'
skip_check = self.npartitions - 1
before, after = 0, 1 if limit is None else limit
if limit is None:
name = 'fillna-chunk-' + tokenize(self, method)
dsk = {(name, i): (methods.fillna_check, (self._name, i),
method, i != skip_check)
for i in range(self.npartitions)}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
parts = new_dd_object(graph, name, meta, self.divisions)
else:
parts = self
return parts.map_overlap(M.fillna, before, after, method=method,
limit=limit, meta=meta)
@derived_from(pd.DataFrame)
def ffill(self, axis=None, limit=None):
return self.fillna(method='ffill', limit=limit, axis=axis)
@derived_from(pd.DataFrame)
def bfill(self, axis=None, limit=None):
return self.fillna(method='bfill', limit=limit, axis=axis)
def sample(self, n=None, frac=None, replace=False, random_state=None):
""" Random sample of items
Parameters
----------
n : int, optional
Number of items to return. Not supported by dask; use ``frac``
instead.
frac : float, optional
Fraction of axis items to return.
replace : boolean, optional
Sample with or without replacement. Default = False.
random_state : int or ``np.random.RandomState``
If an int, we create a new RandomState with this as the seed;
otherwise we draw from the passed RandomState.
See Also
--------
DataFrame.random_split
pandas.DataFrame.sample
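Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame; the result is
itself a lazy dask collection:
>>> sampled = ddf.sample(frac=0.1, random_state=42)  # doctest: +SKIP
>>> sampled.compute()  # doctest: +SKIP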
"""
if n is not None:
msg = ("sample does not support the number of sampled items "
"parameter, 'n'. Please use the 'frac' parameter instead.")
if isinstance(n, Number) and 0 <= n <= 1:
warnings.warn(msg)
frac = n
else:
raise ValueError(msg)
if frac is None:
raise ValueError("frac must not be None")
if random_state is None:
random_state = np.random.RandomState()
name = 'sample-' + tokenize(self, frac, replace, random_state)
state_data = random_state_data(self.npartitions, random_state)
dsk = {(name, i): (methods.sample, (self._name, i), state, frac, replace)
for i, state in enumerate(state_data)}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return new_dd_object(graph, name, self._meta, self.divisions)
def to_dask_array(self, lengths=None):
"""Convert a dask DataFrame to a dask array.
Parameters
----------
lengths : bool or Sequence of ints, optional
How to determine the chunks sizes for the output array.
By default, the output array will have unknown chunk lengths
along the first axis, which can cause some later operations
to fail.
* True : immediately compute the length of each partition
* Sequence : a sequence of integers to use for the chunk sizes
on the first axis. These values are *not* validated for
correctness, beyond ensuring that the number of items
matches the number of partitions.
Returns
-------
A dask Array constructed from the partitions of this object.
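Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame with numeric
columns; passing ``lengths=True`` computes partition lengths so the
resulting array has known chunk sizes along the first axis:
>>> x = ddf.to_dask_array(lengths=True)  # doctest: +SKIP
>>> x.chunks  # doctest: +SKIP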
"""
from dask.array.core import normalize_chunks
if lengths is True:
lengths = tuple(self.map_partitions(len).compute())
arr = self.map_partitions(np.array)
if isinstance(lengths, Sequence):
lengths = tuple(lengths)
if len(lengths) != self.npartitions:
raise ValueError(
"The number of items in 'lengths' does not match "
"the number of partitions. "
"{} != {}".format(len(lengths), self.npartitions)
)
if self.ndim == 1:
chunks = normalize_chunks((lengths,))
else:
chunks = normalize_chunks((lengths, (len(self.columns),)))
arr._chunks = chunks
elif lengths is not None:
raise ValueError("Unexpected value for 'lengths': '{}'".format(lengths))
return arr
def to_hdf(self, path_or_buf, key, mode='a', append=False, **kwargs):
""" See dd.to_hdf docstring for more information """
from .io import to_hdf
return to_hdf(self, path_or_buf, key, mode, append, **kwargs)
def to_csv(self, filename, **kwargs):
""" See dd.to_csv docstring for more information """
from .io import to_csv
return to_csv(self, filename, **kwargs)
def to_json(self, filename, *args, **kwargs):
""" See dd.to_json docstring for more information """
from .io import to_json
return to_json(self, filename, *args, **kwargs)
def to_delayed(self, optimize_graph=True):
"""Convert into a list of ``dask.delayed`` objects, one per partition.
Parameters
----------
optimize_graph : bool, optional
If True [default], the graph is optimized before converting into
``dask.delayed`` objects.
Examples
--------
>>> partitions = df.to_delayed() # doctest: +SKIP
See Also
--------
dask.dataframe.from_delayed
"""
keys = self.__dask_keys__()
graph = self.__dask_graph__()
if optimize_graph:
graph = self.__dask_optimize__(graph, self.__dask_keys__())
name = 'delayed-' + self._name
graph = HighLevelGraph.from_collections(name, graph, dependencies=())
return [Delayed(k, graph) for k in keys]
@classmethod
def _get_unary_operator(cls, op):
return lambda self: elemwise(op, self)
@classmethod
def _get_binary_operator(cls, op, inv=False):
if inv:
return lambda self, other: elemwise(op, other, self)
else:
return lambda self, other: elemwise(op, self, other)
def rolling(self, window, min_periods=None, freq=None, center=False,
win_type=None, axis=0):
"""Provides rolling transformations.
Parameters
----------
window : int, str, offset
Size of the moving window. This is the number of observations used
for calculating the statistic. The window size must not be so large
as to span more than one adjacent partition. If using an offset
or offset alias like '5D', the data must have a ``DatetimeIndex``
.. versionchanged:: 0.15.0
Now accepts offsets and string offset aliases
min_periods : int, default None
Minimum number of observations in window required to have a value
(otherwise result is NA).
center : boolean, default False
Set the labels at the center of the window.
win_type : string, default None
Provide a window type. The recognized window types are identical
to pandas.
axis : int, default 0
Returns
-------
a Rolling object on which to call a method to compute a statistic
Notes
-----
The `freq` argument is not supported.
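Examples
--------
A sketch, assuming ``ddf`` has a numeric column ``x`` (the second call
additionally assumes the data has a ``DatetimeIndex``):
>>> ddf.x.rolling(3).mean()  # doctest: +SKIP
>>> ddf.x.rolling('2D').sum()  # doctest: +SKIP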
"""
from dask.dataframe.rolling import Rolling
if isinstance(window, Integral):
if window < 0:
raise ValueError('window must be >= 0')
if min_periods is not None:
if not isinstance(min_periods, Integral):
raise ValueError('min_periods must be an integer')
if min_periods < 0:
raise ValueError('min_periods must be >= 0')
return Rolling(self, window=window, min_periods=min_periods,
freq=freq, center=center, win_type=win_type, axis=axis)
@derived_from(pd.DataFrame)
def diff(self, periods=1, axis=0):
axis = self._validate_axis(axis)
if not isinstance(periods, Integral):
raise TypeError("periods must be an integer")
if axis == 1:
return self.map_partitions(M.diff, token='diff', periods=periods,
axis=1)
before, after = (periods, 0) if periods > 0 else (0, -periods)
return self.map_overlap(M.diff, before, after, token='diff',
periods=periods)
@derived_from(pd.DataFrame)
def shift(self, periods=1, freq=None, axis=0):
axis = self._validate_axis(axis)
if not isinstance(periods, Integral):
raise TypeError("periods must be an integer")
if axis == 1:
return self.map_partitions(M.shift, token='shift', periods=periods,
freq=freq, axis=1)
if freq is None:
before, after = (periods, 0) if periods > 0 else (0, -periods)
return self.map_overlap(M.shift, before, after, token='shift',
periods=periods)
# Let pandas error on invalid arguments
meta = self._meta_nonempty.shift(periods, freq=freq)
out = self.map_partitions(M.shift, token='shift', periods=periods,
freq=freq, meta=meta)
return maybe_shift_divisions(out, periods, freq=freq)
def _reduction_agg(self, name, axis=None, skipna=True,
split_every=False, out=None):
axis = self._validate_axis(axis)
meta = getattr(self._meta_nonempty, name)(axis=axis, skipna=skipna)
token = self._token_prefix + name
method = getattr(M, name)
if axis == 1:
result = self.map_partitions(method, meta=meta,
token=token, skipna=skipna, axis=axis)
return handle_out(out, result)
else:
result = self.reduction(method, meta=meta, token=token,
skipna=skipna, axis=axis,
split_every=split_every)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return handle_out(out, result)
@derived_from(pd.DataFrame)
def abs(self):
_raise_if_object_series(self, "abs")
meta = self._meta_nonempty.abs()
return self.map_partitions(M.abs, meta=meta)
@derived_from(pd.DataFrame)
def all(self, axis=None, skipna=True, split_every=False, out=None):
return self._reduction_agg('all', axis=axis, skipna=skipna,
split_every=split_every, out=out)
@derived_from(pd.DataFrame)
def any(self, axis=None, skipna=True, split_every=False, out=None):
return self._reduction_agg('any', axis=axis, skipna=skipna,
split_every=split_every, out=out)
@derived_from(pd.DataFrame)
def sum(self, axis=None, skipna=True, split_every=False, dtype=None,
out=None, min_count=None):
result = self._reduction_agg('sum', axis=axis, skipna=skipna,
split_every=split_every, out=out)
if min_count:
return result.where(self.notnull().sum(axis=axis) >= min_count,
other=np.NaN)
else:
return result
@derived_from(pd.DataFrame)
def prod(self, axis=None, skipna=True, split_every=False, dtype=None,
out=None, min_count=None):
result = self._reduction_agg('prod', axis=axis, skipna=skipna,
split_every=split_every, out=out)
if min_count:
return result.where(self.notnull().sum(axis=axis) >= min_count,
other=np.NaN)
else:
return result
@derived_from(pd.DataFrame)
def max(self, axis=None, skipna=True, split_every=False, out=None):
return self._reduction_agg('max', axis=axis, skipna=skipna,
split_every=split_every, out=out)
@derived_from(pd.DataFrame)
def min(self, axis=None, skipna=True, split_every=False, out=None):
return self._reduction_agg('min', axis=axis, skipna=skipna,
split_every=split_every, out=out)
@derived_from(pd.DataFrame)
def idxmax(self, axis=None, skipna=True, split_every=False):
fn = 'idxmax'
axis = self._validate_axis(axis)
meta = self._meta_nonempty.idxmax(axis=axis, skipna=skipna)
if axis == 1:
return map_partitions(M.idxmax, self, meta=meta,
token=self._token_prefix + fn,
skipna=skipna, axis=axis)
else:
scalar = not is_series_like(meta)
result = aca([self], chunk=idxmaxmin_chunk, aggregate=idxmaxmin_agg,
combine=idxmaxmin_combine, meta=meta,
aggregate_kwargs={'scalar': scalar},
token=self._token_prefix + fn, split_every=split_every,
skipna=skipna, fn=fn)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return result
@derived_from(pd.DataFrame)
def idxmin(self, axis=None, skipna=True, split_every=False):
fn = 'idxmin'
axis = self._validate_axis(axis)
meta = self._meta_nonempty.idxmax(axis=axis)
if axis == 1:
return map_partitions(M.idxmin, self, meta=meta,
token=self._token_prefix + fn,
skipna=skipna, axis=axis)
else:
scalar = not is_series_like(meta)
result = aca([self], chunk=idxmaxmin_chunk, aggregate=idxmaxmin_agg,
combine=idxmaxmin_combine, meta=meta,
aggregate_kwargs={'scalar': scalar},
token=self._token_prefix + fn, split_every=split_every,
skipna=skipna, fn=fn)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return result
@derived_from(pd.DataFrame)
def count(self, axis=None, split_every=False):
axis = self._validate_axis(axis)
token = self._token_prefix + 'count'
if axis == 1:
meta = self._meta_nonempty.count(axis=axis)
return self.map_partitions(M.count, meta=meta, token=token,
axis=axis)
else:
meta = self._meta_nonempty.count()
result = self.reduction(M.count, aggregate=M.sum, meta=meta,
token=token, split_every=split_every)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return result
@derived_from(pd.DataFrame)
def mean(self, axis=None, skipna=True, split_every=False, dtype=None, out=None):
axis = self._validate_axis(axis)
_raise_if_object_series(self, "mean")
meta = self._meta_nonempty.mean(axis=axis, skipna=skipna)
if axis == 1:
result = map_partitions(M.mean, self, meta=meta,
token=self._token_prefix + 'mean',
axis=axis, skipna=skipna)
return handle_out(out, result)
else:
num = self._get_numeric_data()
s = num.sum(skipna=skipna, split_every=split_every)
n = num.count(split_every=split_every)
name = self._token_prefix + 'mean-%s' % tokenize(self, axis, skipna)
result = map_partitions(methods.mean_aggregate, s, n,
token=name, meta=meta)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return handle_out(out, result)
@derived_from(pd.DataFrame)
def var(self, axis=None, skipna=True, ddof=1, split_every=False, dtype=None, out=None):
axis = self._validate_axis(axis)
_raise_if_object_series(self, "var")
meta = self._meta_nonempty.var(axis=axis, skipna=skipna)
if axis == 1:
result = map_partitions(M.var, self, meta=meta,
token=self._token_prefix + 'var',
axis=axis, skipna=skipna, ddof=ddof)
return handle_out(out, result)
else:
num = self._get_numeric_data()
x = 1.0 * num.sum(skipna=skipna, split_every=split_every)
x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every)
n = num.count(split_every=split_every)
name = self._token_prefix + 'var'
result = map_partitions(methods.var_aggregate, x2, x, n,
token=name, meta=meta, ddof=ddof)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return handle_out(out, result)
@derived_from(pd.DataFrame)
def std(self, axis=None, skipna=True, ddof=1, split_every=False, dtype=None, out=None):
axis = self._validate_axis(axis)
_raise_if_object_series(self, "std")
meta = self._meta_nonempty.std(axis=axis, skipna=skipna)
if axis == 1:
result = map_partitions(M.std, self, meta=meta,
token=self._token_prefix + 'std',
axis=axis, skipna=skipna, ddof=ddof)
return handle_out(out, result)
else:
v = self.var(skipna=skipna, ddof=ddof, split_every=split_every)
name = self._token_prefix + 'std'
result = map_partitions(np.sqrt, v, meta=meta, token=name)
return handle_out(out, result)
@derived_from(pd.DataFrame)
def sem(self, axis=None, skipna=None, ddof=1, split_every=False):
axis = self._validate_axis(axis)
_raise_if_object_series(self, "sem")
meta = self._meta_nonempty.sem(axis=axis, skipna=skipna, ddof=ddof)
if axis == 1:
return map_partitions(M.sem, self, meta=meta,
token=self._token_prefix + 'sem',
axis=axis, skipna=skipna, ddof=ddof)
else:
num = self._get_numeric_data()
v = num.var(skipna=skipna, ddof=ddof, split_every=split_every)
n = num.count(split_every=split_every)
name = self._token_prefix + 'sem'
result = map_partitions(np.sqrt, v / n, meta=meta, token=name)
if isinstance(self, DataFrame):
result.divisions = (min(self.columns), max(self.columns))
return result
def quantile(self, q=0.5, axis=0):
""" Approximate row-wise and precise column-wise quantiles of DataFrame
Parameters
----------
q : list/array of floats, default 0.5 (50%)
Iterable of numbers ranging from 0 to 1 for the desired quantiles
axis : {0, 1, 'index', 'columns'} (default 0)
0 or 'index' for row-wise, 1 or 'columns' for column-wise
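Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame with numeric
columns; a scalar ``q`` yields a Series, a list of quantiles a DataFrame:
>>> ddf.quantile(0.5).compute()  # doctest: +SKIP
>>> ddf.quantile([0.25, 0.75]).compute()  # doctest: +SKIP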
"""
axis = self._validate_axis(axis)
keyname = 'quantiles-concat--' + tokenize(self, q, axis)
if axis == 1:
if isinstance(q, list):
# Not supported, the result will have current index as columns
raise ValueError("'q' must be scalar when axis=1 is specified")
return map_partitions(M.quantile, self, q, axis,
token=keyname, meta=(q, 'f8'))
else:
_raise_if_object_series(self, "quantile")
meta = self._meta.quantile(q, axis=axis)
num = self._get_numeric_data()
quantiles = tuple(quantile(self[c], q) for c in num.columns)
qnames = [(_q._name, 0) for _q in quantiles]
if isinstance(quantiles[0], Scalar):
layer = {(keyname, 0): (pd.Series, qnames, num.columns, None, meta.name)}
graph = HighLevelGraph.from_collections(keyname, layer, dependencies=quantiles)
divisions = (min(num.columns), max(num.columns))
return Series(graph, keyname, meta, divisions)
else:
layer = {(keyname, 0): (methods.concat, qnames, 1)}
graph = HighLevelGraph.from_collections(keyname, layer, dependencies=quantiles)
return DataFrame(graph, keyname, meta, quantiles[0].divisions)
@derived_from(pd.DataFrame)
def describe(self, split_every=False, percentiles=None):
# currently, only numeric describe is supported
num = self._get_numeric_data()
if self.ndim == 2 and len(num.columns) == 0:
raise ValueError("DataFrame contains only non-numeric data.")
elif self.ndim == 1 and self.dtype == 'object':
raise ValueError("Cannot compute ``describe`` on object dtype.")
if percentiles is None:
percentiles = [0.25, 0.5, 0.75]
else:
percentiles = list(set(sorted(percentiles + [0.5])))
stats = [num.count(split_every=split_every),
num.mean(split_every=split_every),
num.std(split_every=split_every),
num.min(split_every=split_every),
num.quantile(percentiles),
num.max(split_every=split_every)]
stats_names = [(s._name, 0) for s in stats]
name = 'describe--' + tokenize(self, split_every)
layer = {(name, 0): (methods.describe_aggregate, stats_names)}
graph = HighLevelGraph.from_collections(name, layer, dependencies=stats)
return new_dd_object(graph, name, num._meta, divisions=[None, None])
def _cum_agg(self, op_name, chunk, aggregate, axis, skipna=True,
chunk_kwargs=None, out=None):
""" Wrapper for cumulative operation """
axis = self._validate_axis(axis)
if axis == 1:
name = '{0}{1}(axis=1)'.format(self._token_prefix, op_name)
result = self.map_partitions(chunk, token=name, **chunk_kwargs)
return handle_out(out, result)
else:
# cumulate each partition
name1 = '{0}{1}-map'.format(self._token_prefix, op_name)
cumpart = map_partitions(chunk, self, token=name1, meta=self,
**chunk_kwargs)
name2 = '{0}{1}-take-last'.format(self._token_prefix, op_name)
cumlast = map_partitions(_take_last, cumpart, skipna,
meta=pd.Series([]), token=name2)
suffix = tokenize(self)
name = '{0}{1}-{2}'.format(self._token_prefix, op_name, suffix)
cname = '{0}{1}-cum-last-{2}'.format(self._token_prefix, op_name,
suffix)
# aggregate cumulated partitions with the previous partition's last element
layer = {}
layer[(name, 0)] = (cumpart._name, 0)
for i in range(1, self.npartitions):
# store each cumulative step to graph to reduce computation
if i == 1:
layer[(cname, i)] = (cumlast._name, i - 1)
else:
# aggregate with previous cumulation results
layer[(cname, i)] = (aggregate, (cname, i - 1), (cumlast._name, i - 1))
layer[(name, i)] = (aggregate, (cumpart._name, i), (cname, i))
graph = HighLevelGraph.from_collections(cname, layer, dependencies=[cumpart, cumlast])
result = new_dd_object(graph, name, chunk(self._meta), self.divisions)
return handle_out(out, result)
@derived_from(pd.DataFrame)
def cumsum(self, axis=None, skipna=True, dtype=None, out=None):
return self._cum_agg('cumsum',
chunk=M.cumsum,
aggregate=operator.add,
axis=axis, skipna=skipna,
chunk_kwargs=dict(axis=axis, skipna=skipna),
out=out)
@derived_from(pd.DataFrame)
def cumprod(self, axis=None, skipna=True, dtype=None, out=None):
return self._cum_agg('cumprod',
chunk=M.cumprod,
aggregate=operator.mul,
axis=axis, skipna=skipna,
chunk_kwargs=dict(axis=axis, skipna=skipna),
out=out)
@derived_from(pd.DataFrame)
def cummax(self, axis=None, skipna=True, out=None):
return self._cum_agg('cummax',
chunk=M.cummax,
aggregate=methods.cummax_aggregate,
axis=axis, skipna=skipna,
chunk_kwargs=dict(axis=axis, skipna=skipna),
out=out)
@derived_from(pd.DataFrame)
def cummin(self, axis=None, skipna=True, out=None):
return self._cum_agg('cummin',
chunk=M.cummin,
aggregate=methods.cummin_aggregate,
axis=axis, skipna=skipna,
chunk_kwargs=dict(axis=axis, skipna=skipna),
out=out)
@derived_from(pd.DataFrame)
def where(self, cond, other=np.nan):
# cond and other may be dask instances; passing them to
# map_partitions as keyword arguments would prevent partition alignment
return map_partitions(M.where, self, cond, other)
@derived_from(pd.DataFrame)
def mask(self, cond, other=np.nan):
return map_partitions(M.mask, self, cond, other)
@derived_from(pd.DataFrame)
def notnull(self):
return self.map_partitions(M.notnull)
@derived_from(pd.DataFrame)
def isnull(self):
return self.map_partitions(M.isnull)
@derived_from(pd.DataFrame)
def isna(self):
if hasattr(pd, 'isna'):
return self.map_partitions(M.isna)
else:
raise NotImplementedError("Need more recent version of Pandas "
"to support isna. "
"Please use isnull instead.")
@derived_from(pd.DataFrame)
def isin(self, values):
return elemwise(M.isin, self, list(values))
@derived_from(pd.DataFrame)
def astype(self, dtype):
# XXX: Pandas will segfault for empty dataframes when setting
# categorical dtypes. This operation isn't allowed currently anyway. We
# get the metadata with a non-empty frame to throw the error instead of
# segfaulting.
if is_dataframe_like(self._meta) and is_categorical_dtype(dtype):
meta = self._meta_nonempty.astype(dtype)
else:
meta = self._meta.astype(dtype)
if hasattr(dtype, 'items'):
# Pandas < 0.21.0, no `categories` attribute, so unknown
# Pandas >= 0.21.0, known if `categories` attribute is not None
set_unknown = [k for k, v in dtype.items()
if (is_categorical_dtype(v) and
getattr(v, 'categories', None) is None)]
meta = clear_known_categories(meta, cols=set_unknown)
elif (is_categorical_dtype(dtype) and
getattr(dtype, 'categories', None) is None):
meta = clear_known_categories(meta)
return self.map_partitions(M.astype, dtype=dtype, meta=meta)
@derived_from(pd.Series)
def append(self, other):
# because DataFrame.append will override this method, wrap it with
# the pd.Series.append docstring
from .multi import concat
if isinstance(other, (list, dict)):
msg = "append doesn't support list or dict input"
raise NotImplementedError(msg)
return concat([self, other], join='outer', interleave_partitions=False)
@derived_from(pd.DataFrame)
def align(self, other, join='outer', axis=None, fill_value=None):
meta1, meta2 = _emulate(M.align, self, other, join, axis=axis,
fill_value=fill_value)
aligned = self.map_partitions(M.align, other, join=join, axis=axis,
fill_value=fill_value)
token = tokenize(self, other, join, axis, fill_value)
name1 = 'align1-' + token
dsk1 = {(name1, i): (getitem, key, 0)
for i, key in enumerate(aligned.__dask_keys__())}
dsk1.update(aligned.dask)
result1 = new_dd_object(dsk1, name1, meta1, aligned.divisions)
name2 = 'align2-' + token
dsk2 = {(name2, i): (getitem, key, 1)
for i, key in enumerate(aligned.__dask_keys__())}
dsk2.update(aligned.dask)
result2 = new_dd_object(dsk2, name2, meta2, aligned.divisions)
return result1, result2
@derived_from(pd.DataFrame)
def combine(self, other, func, fill_value=None, overwrite=True):
return self.map_partitions(M.combine, other, func,
fill_value=fill_value, overwrite=overwrite)
@derived_from(pd.DataFrame)
def combine_first(self, other):
return self.map_partitions(M.combine_first, other)
@classmethod
def _bind_operator_method(cls, name, op):
""" bind operator method like DataFrame.add to this class """
raise NotImplementedError
@derived_from(pd.DataFrame)
def resample(self, rule, closed=None, label=None):
from .tseries.resample import Resampler
return Resampler(self, rule, closed=closed, label=label)
@derived_from(pd.DataFrame)
def first(self, offset):
# Let pandas error on bad args
self._meta_nonempty.first(offset)
if not self.known_divisions:
raise ValueError("`first` is not implemented for unknown divisions")
offset = pd.tseries.frequencies.to_offset(offset)
date = self.divisions[0] + offset
end = self.loc._get_partitions(date)
include_right = offset.isAnchored() or not hasattr(offset, '_inc')
if end == self.npartitions - 1:
divs = self.divisions
else:
divs = self.divisions[:end + 1] + (date,)
name = 'first-' + tokenize(self, offset)
dsk = {(name, i): (self._name, i) for i in range(end)}
dsk[(name, end)] = (methods.boundary_slice, (self._name, end),
None, date, include_right, True, 'loc')
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return new_dd_object(graph, name, self, divs)
@derived_from(pd.DataFrame)
def last(self, offset):
# Let pandas error on bad args
self._meta_nonempty.last(offset)
if not self.known_divisions:
raise ValueError("`last` is not implemented for unknown divisions")
offset = pd.tseries.frequencies.to_offset(offset)
date = self.divisions[-1] - offset
start = self.loc._get_partitions(date)
if start == 0:
divs = self.divisions
else:
divs = (date,) + self.divisions[start + 1:]
name = 'last-' + tokenize(self, offset)
dsk = {(name, i + 1): (self._name, j + 1)
for i, j in enumerate(range(start, self.npartitions))}
dsk[(name, 0)] = (methods.boundary_slice, (self._name, start),
date, None, True, False, 'loc')
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return new_dd_object(graph, name, self, divs)
def nunique_approx(self, split_every=None):
"""Approximate number of unique rows.
This method uses the HyperLogLog algorithm for cardinality
estimation to compute the approximate number of unique rows.
The approximate error is 0.406%.
Parameters
----------
split_every : int, optional
Group partitions into groups of this size while performing a
tree-reduction. If set to False, no tree-reduction will be used.
Default is 8.
Returns
-------
a float representing the approximate number of unique rows
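Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame:
>>> ddf.nunique_approx().compute()  # doctest: +SKIP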
"""
from . import hyperloglog # here to avoid circular import issues
return aca([self], chunk=hyperloglog.compute_hll_array,
combine=hyperloglog.reduce_state,
aggregate=hyperloglog.estimate_count,
split_every=split_every, b=16, meta=float)
@property
def values(self):
""" Return a dask.array of the values of this dataframe
Warning: This creates a dask.array without precise shape information.
Operations that depend on shape information, like slicing or reshaping,
will not work.
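Examples
--------
A sketch, assuming ``ddf`` is an existing dask DataFrame; the result is
a dask array with unknown chunk sizes along the first axis:
>>> arr = ddf.values  # doctest: +SKIP
>>> arr.compute()  # doctest: +SKIP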
"""
return self.map_partitions(methods.values)
def _raise_if_object_series(x, funcname):
"""
Utility function to raise an error if an object column does not support
a certain operation like `mean`.
"""
if isinstance(x, Series) and hasattr(x, "dtype") and x.dtype == object:
raise ValueError("`%s` not supported with object series" % funcname)
class Series(_Frame):
""" Parallel Pandas Series
Do not use this class directly. Instead use functions like
``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.
Parameters
----------
dsk: dict
The dask graph to compute this Series
_name: str
The key prefix that specifies which keys in the dask comprise this
particular Series
meta: pandas.Series
An empty ``pandas.Series`` with names, dtypes, and index matching the
expected output.
divisions: tuple of index values
Values along which we partition our blocks on the index
See Also
--------
dask.dataframe.DataFrame
"""
_partition_type = pd.Series
_token_prefix = 'series-'
def __array_wrap__(self, array, context=None):
if isinstance(context, tuple) and len(context) > 0:
if isinstance(context[1][0], np.ndarray) and context[1][0].shape == ():
index = None
else:
index = context[1][0].index
return pd.Series(array, index=index, name=self.name)
@property
def name(self):
return self._meta.name
@name.setter
def name(self, name):
self._meta.name = name
renamed = _rename_dask(self, name)
# update myself
self.dask = renamed.dask
self._name = renamed._name
@property
def ndim(self):
""" Return dimensionality """
return 1
@property
def shape(self):
"""
Return a tuple representing the dimensionality of a Series.
The single element of the tuple is a Delayed result.
Examples
--------
>>> series.shape # doctest: +SKIP
# (dd.Scalar<size-ag..., dtype=int64>,)
"""
return (self.size,)
@property
def dtype(self):
""" Return data type """
return self._meta.dtype
@cache_readonly
def dt(self):
""" Namespace of datetime methods """
return DatetimeAccessor(self)
@cache_readonly
def cat(self):
return CategoricalAccessor(self)
@cache_readonly
def str(self):
""" Namespace for string methods """
return StringAccessor(self)
def __dir__(self):
o = set(dir(type(self)))
o.update(self.__dict__)
# Remove the `cat` and `str` accessors if not available. We can't
# decide this statically for the `dt` accessor, as it works on
# datetime-like things as well.
for accessor in ['cat', 'str']:
if not hasattr(self._meta, accessor):
o.remove(accessor)
return list(o)
@property
def nbytes(self):
""" Number of bytes """
return self.reduction(methods.nbytes, np.sum, token='nbytes',
meta=int, split_every=False)
def _repr_data(self):
return _repr_data_series(self._meta, self._repr_divisions)
def __repr__(self):
""" have to overwrite footer """
if self.name is not None:
footer = "Name: {name}, dtype: {dtype}".format(name=self.name,
dtype=self.dtype)
else:
footer = "dtype: {dtype}".format(dtype=self.dtype)
return """Dask {klass} Structure:
{data}
{footer}
Dask Name: {name}, {task} tasks""".format(klass=self.__class__.__name__,
data=self.to_string(),
footer=footer,
name=key_split(self._name),
task=len(self.dask))
def rename(self, index=None, inplace=False, sorted_index=False):
"""Alter Series index labels or name
Function / dict values must be unique (1-to-1). Labels not contained in
a dict / Series will be left as-is. Extra labels listed don't throw an
error.
Alternatively, change ``Series.name`` with a scalar value.
Parameters
----------
index : scalar, hashable sequence, dict-like or callable, optional
If dict-like or callable, the transformation is applied to the
index. Scalar or hashable sequence-like will alter the
``Series.name`` attribute.
inplace : boolean, default False
Whether to return a new Series or modify this one inplace.
sorted_index : bool, default False
If true, the output ``Series`` will have known divisions inferred
from the input series and the transformation. Ignored for
non-callable/dict-like ``index`` or when the input series has
unknown divisions. Note that this may only be set to ``True`` if
you know that the transformed index is monotonically increasing. Dask
will check that transformed divisions are monotonic, but cannot
check all the values between divisions, so incorrectly setting this
can result in bugs.
Returns
-------
renamed : Series
See Also
--------
pandas.Series.rename
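Examples
--------
A minimal sketch, assuming ``s`` is an existing dask Series with an
integer index:
>>> s2 = s.rename('y') # doctest: +SKIP
>>> s2 = s.rename(lambda x: x + 1) # doctest: +SKIP
>>> s2 = s.rename(lambda x: x + 1, sorted_index=True) # doctest: +SKIP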
"""
from pandas.api.types import is_scalar, is_list_like, is_dict_like
if is_scalar(index) or (is_list_like(index) and not is_dict_like(index)):
res = self if inplace else self.copy()
res.name = index
else:
res = self.map_partitions(M.rename, index)
if self.known_divisions:
if sorted_index and (callable(index) or is_dict_like(index)):
old = pd.Series(range(self.npartitions + 1),
index=self.divisions)
new = old.rename(index).index
if not new.is_monotonic_increasing:
msg = ("sorted_index=True, but the transformed index "
"isn't monotonic_increasing")
raise ValueError(msg)
res.divisions = tuple(new.tolist())
else:
res = res.clear_divisions()
if inplace:
self.dask = res.dask
self._name = res._name
self.divisions = res.divisions
self._meta = res._meta
res = self
return res
@derived_from(pd.Series)
def round(self, decimals=0):
return elemwise(M.round, self, decimals)
@derived_from(pd.DataFrame)
def to_timestamp(self, freq=None, how='start', axis=0):
df = elemwise(M.to_timestamp, self, freq, how, axis)
df.divisions = tuple(pd.Index(self.divisions).to_timestamp())
return df
def quantile(self, q=0.5):
""" Approximate quantiles of Series
Parameters
----------
q : list/array of floats, default 0.5 (50%)
    Iterable of numbers ranging from 0 to 1 for the desired quantiles
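Examples
--------
A minimal sketch, assuming ``s`` is an existing numeric dask Series; a
scalar ``q`` yields a ``dd.Scalar`` while a list yields a ``dd.Series``:
>>> s.quantile(0.5) # doctest: +SKIP
>>> s.quantile([0.25, 0.5, 0.75]) # doctest: +SKIP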
"""
return quantile(self, q)
def _repartition_quantiles(self, npartitions, upsample=1.0):
""" Approximate quantiles of Series used for repartitioning
"""
from .partitionquantiles import partition_quantiles
return partition_quantiles(self, npartitions, upsample=upsample)
def __getitem__(self, key):
if isinstance(key, Series) and self.divisions == key.divisions:
name = 'index-%s' % tokenize(self, key)
dsk = partitionwise_graph(operator.getitem, name, self, key)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self, key])
return Series(graph, name, self._meta, self.divisions)
raise NotImplementedError()
@derived_from(pd.DataFrame)
def _get_numeric_data(self, how='any', subset=None):
return self
@derived_from(pd.Series)
def iteritems(self):
for i in range(self.npartitions):
s = self.get_partition(i).compute()
for item in s.iteritems():
yield item
@classmethod
def _validate_axis(cls, axis=0):
if axis not in (0, 'index', None):
raise ValueError('No axis named {0}'.format(axis))
# convert to numeric axis
return {None: 0, 'index': 0}.get(axis, axis)
@derived_from(pd.Series)
def groupby(self, by=None, **kwargs):
from dask.dataframe.groupby import SeriesGroupBy
return SeriesGroupBy(self, by=by, **kwargs)
@derived_from(pd.Series)
def count(self, split_every=False):
return super(Series, self).count(split_every=split_every)
def unique(self, split_every=None, split_out=1):
"""
Return Series of unique values in the object. Includes NA values.
Returns
-------
uniques : Series
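Examples
--------
A minimal sketch, assuming ``s`` is an existing dask Series:
>>> s.unique().compute() # doctest: +SKIP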
"""
return aca(self, chunk=methods.unique, aggregate=methods.unique,
meta=self._meta, token='unique', split_every=split_every,
series_name=self.name, split_out=split_out)
@derived_from(pd.Series)
def nunique(self, split_every=None):
return self.drop_duplicates(split_every=split_every).count()
@derived_from(pd.Series)
def value_counts(self, split_every=None, split_out=1):
return aca(self, chunk=M.value_counts,
aggregate=methods.value_counts_aggregate,
combine=methods.value_counts_combine,
meta=self._meta.value_counts(), token='value-counts',
split_every=split_every, split_out=split_out,
split_out_setup=split_out_on_index)
@derived_from(pd.Series)
def nlargest(self, n=5, split_every=None):
return aca(self, chunk=M.nlargest, aggregate=M.nlargest,
meta=self._meta, token='series-nlargest',
split_every=split_every, n=n)
@derived_from(pd.Series)
def nsmallest(self, n=5, split_every=None):
return aca(self, chunk=M.nsmallest, aggregate=M.nsmallest,
meta=self._meta, token='series-nsmallest',
split_every=split_every, n=n)
@derived_from(pd.Series)
def isin(self, values):
return elemwise(M.isin, self, list(values))
@insert_meta_param_description(pad=12)
@derived_from(pd.Series)
def map(self, arg, na_action=None, meta=no_default):
if not (isinstance(arg, dict) or
callable(arg) or
is_series_like(arg) and not is_dask_collection(arg)):
raise TypeError("arg must be pandas.Series, dict or callable."
" Got {0}".format(type(arg)))
name = 'map-' + tokenize(self, arg, na_action)
dsk = {(name, i): (M.map, k, arg, na_action) for i, k in
enumerate(self.__dask_keys__())}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
if meta is no_default:
meta = _emulate(M.map, self, arg, na_action=na_action, udf=True)
else:
meta = make_meta(meta, index=getattr(make_meta(self), 'index', None))
return Series(graph, name, meta, self.divisions)
@derived_from(pd.Series)
def dropna(self):
return self.map_partitions(M.dropna)
@derived_from(pd.Series)
def between(self, left, right, inclusive=True):
return self.map_partitions(M.between, left=left,
right=right, inclusive=inclusive)
@derived_from(pd.Series)
def clip(self, lower=None, upper=None, out=None):
if out is not None:
raise ValueError("'out' must be None")
# np.clip may pass out
return self.map_partitions(M.clip, lower=lower, upper=upper)
@derived_from(pd.Series)
def clip_lower(self, threshold):
return self.map_partitions(M.clip_lower, threshold=threshold)
@derived_from(pd.Series)
def clip_upper(self, threshold):
return self.map_partitions(M.clip_upper, threshold=threshold)
@derived_from(pd.Series)
def align(self, other, join='outer', axis=None, fill_value=None):
return super(Series, self).align(other, join=join, axis=axis,
fill_value=fill_value)
@derived_from(pd.Series)
def combine(self, other, func, fill_value=None):
return self.map_partitions(M.combine, other, func,
fill_value=fill_value)
@derived_from(pd.Series)
def squeeze(self):
return self
@derived_from(pd.Series)
def combine_first(self, other):
return self.map_partitions(M.combine_first, other)
def to_bag(self, index=False):
""" Craeate a Dask Bag from a Series """
from .io import to_bag
return to_bag(self, index)
@derived_from(pd.Series)
def to_frame(self, name=None):
return self.map_partitions(M.to_frame, name,
meta=self._meta.to_frame(name))
@derived_from(pd.Series)
def to_string(self, max_rows=5):
# option_context doesn't affect
return self._repr_data().to_string(max_rows=max_rows)
@classmethod
def _bind_operator_method(cls, name, op):
""" bind operator method like Series.add to this class """
def meth(self, other, level=None, fill_value=None, axis=0):
if level is not None:
raise NotImplementedError('level must be None')
axis = self._validate_axis(axis)
meta = _emulate(op, self, other, axis=axis, fill_value=fill_value)
return map_partitions(op, self, other, meta=meta,
axis=axis, fill_value=fill_value)
meth.__doc__ = op.__doc__
bind_method(cls, name, meth)
@classmethod
def _bind_comparison_method(cls, name, comparison):
""" bind comparison method like Series.eq to this class """
def meth(self, other, level=None, fill_value=None, axis=0):
if level is not None:
raise NotImplementedError('level must be None')
axis = self._validate_axis(axis)
if fill_value is None:
return elemwise(comparison, self, other, axis=axis)
else:
op = partial(comparison, fill_value=fill_value)
return elemwise(op, self, other, axis=axis)
meth.__doc__ = comparison.__doc__
bind_method(cls, name, meth)
@insert_meta_param_description(pad=12)
def apply(self, func, convert_dtype=True, meta=no_default, args=(), **kwds):
""" Parallel version of pandas.Series.apply
Parameters
----------
func : function
Function to apply
convert_dtype : boolean, default True
Try to find better dtype for elementwise function results.
If False, leave as dtype=object.
$META
args : tuple
Positional arguments to pass to function in addition to the value.
Additional keyword arguments will be passed as keywords to the function.
Returns
-------
applied : Series or DataFrame if func returns a Series.
Examples
--------
>>> import dask.dataframe as dd
>>> s = pd.Series(range(5), name='x')
>>> ds = dd.from_pandas(s, npartitions=2)
Apply a function elementwise across the Series, passing in extra
arguments in ``args`` and ``kwargs``:
>>> def myadd(x, a, b=1):
... return x + a + b
>>> res = ds.apply(myadd, args=(2,), b=1.5)
By default, dask tries to infer the output metadata by running your
provided function on some fake data. This works well in many cases, but
can sometimes be expensive, or even fail. To avoid this, you can
manually specify the output metadata with the ``meta`` keyword. This
can be specified in many forms, for more information see
``dask.dataframe.utils.make_meta``.
Here we specify the output is a Series with name ``'x'``, and dtype
``float64``:
>>> res = ds.apply(myadd, args=(2,), b=1.5, meta=('x', 'f8'))
In the case where the metadata doesn't change, you can also pass in
the object itself directly:
>>> res = ds.apply(lambda x: x + 1, meta=ds)
See Also
--------
dask.Series.map_partitions
"""
if meta is no_default:
msg = ("`meta` is not specified, inferred from partial data. "
"Please provide `meta` if the result is unexpected.\n"
" Before: .apply(func)\n"
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n"
" or: .apply(func, meta=('x', 'f8')) for series result")
warnings.warn(msg)
meta = _emulate(M.apply, self._meta_nonempty, func,
convert_dtype=convert_dtype,
args=args, udf=True, **kwds)
return map_partitions(M.apply, self, func,
convert_dtype, args, meta=meta, **kwds)
@derived_from(pd.Series)
def cov(self, other, min_periods=None, split_every=False):
from .multi import concat
if not isinstance(other, Series):
raise TypeError("other must be a dask.dataframe.Series")
df = concat([self, other], axis=1)
return cov_corr(df, min_periods, scalar=True, split_every=split_every)
@derived_from(pd.Series)
def corr(self, other, method='pearson', min_periods=None,
split_every=False):
from .multi import concat
if not isinstance(other, Series):
raise TypeError("other must be a dask.dataframe.Series")
if method != 'pearson':
raise NotImplementedError("Only Pearson correlation has been "
"implemented")
df = concat([self, other], axis=1)
return cov_corr(df, min_periods, corr=True, scalar=True,
split_every=split_every)
@derived_from(pd.Series)
def autocorr(self, lag=1, split_every=False):
if not isinstance(lag, Integral):
raise TypeError("lag must be an integer")
return self.corr(self if lag == 0 else self.shift(lag),
split_every=split_every)
@derived_from(pd.Series)
def memory_usage(self, index=True, deep=False):
result = self.map_partitions(M.memory_usage, index=index, deep=deep)
return delayed(sum)(result.to_delayed())
class Index(Series):
_partition_type = pd.Index
_token_prefix = 'index-'
_dt_attributes = {'nanosecond', 'microsecond', 'millisecond', 'dayofyear',
'minute', 'hour', 'day', 'dayofweek', 'second', 'week',
'weekday', 'weekofyear', 'month', 'quarter', 'year'}
_cat_attributes = {'known', 'as_known', 'as_unknown', 'add_categories',
'categories', 'remove_categories', 'reorder_categories',
'as_ordered', 'codes', 'remove_unused_categories',
'set_categories', 'as_unordered', 'ordered',
'rename_categories'}
def __getattr__(self, key):
if is_categorical_dtype(self.dtype) and key in self._cat_attributes:
return getattr(self.cat, key)
elif key in self._dt_attributes:
return getattr(self.dt, key)
raise AttributeError("'Index' object has no attribute %r" % key)
def __dir__(self):
out = super(Index, self).__dir__()
out.extend(self._dt_attributes)
if is_categorical_dtype(self.dtype):
out.extend(self._cat_attributes)
return out
@property
def index(self):
msg = "'{0}' object has no attribute 'index'"
raise AttributeError(msg.format(self.__class__.__name__))
def __array_wrap__(self, array, context=None):
return pd.Index(array, name=self.name)
def head(self, n=5, compute=True):
""" First n items of the Index.
Caveat: this only checks the first partition.
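Examples
--------
A minimal sketch, assuming ``ddf`` is an existing dask DataFrame:
>>> ddf.index.head(3) # doctest: +SKIP
>>> ddf.index.head(3, compute=False) # doctest: +SKIP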
"""
name = 'head-%d-%s' % (n, self._name)
dsk = {(name, 0): (operator.getitem, (self._name, 0), slice(0, n))}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
result = new_dd_object(graph, name, self._meta, self.divisions[:2])
if compute:
result = result.compute()
return result
@derived_from(pd.Index)
def max(self, split_every=False):
return self.reduction(M.max, meta=self._meta_nonempty.max(),
token=self._token_prefix + 'max',
split_every=split_every)
@derived_from(pd.Index)
def min(self, split_every=False):
return self.reduction(M.min, meta=self._meta_nonempty.min(),
token=self._token_prefix + 'min',
split_every=split_every)
def count(self, split_every=False):
return self.reduction(methods.index_count, np.sum,
token='index-count', meta=int,
split_every=split_every)
@derived_from(pd.Index)
def shift(self, periods=1, freq=None):
if isinstance(self._meta, pd.PeriodIndex):
if freq is not None:
raise ValueError("PeriodIndex doesn't accept `freq` argument")
meta = self._meta_nonempty.shift(periods)
out = self.map_partitions(M.shift, periods, meta=meta,
token='shift')
else:
# Pandas will raise for other index types that don't implement shift
meta = self._meta_nonempty.shift(periods, freq=freq)
out = self.map_partitions(M.shift, periods, token='shift',
meta=meta, freq=freq)
if freq is None:
freq = meta.freq
return maybe_shift_divisions(out, periods, freq=freq)
@derived_from(pd.Index)
def to_series(self):
return self.map_partitions(M.to_series,
meta=self._meta.to_series())
class DataFrame(_Frame):
"""
Parallel Pandas DataFrame
Do not use this class directly. Instead use functions like
``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.
Parameters
----------
dsk: dict
The dask graph to compute this DataFrame
name: str
The key prefix that specifies which keys in the dask comprise this
particular DataFrame
meta: pandas.DataFrame
An empty ``pandas.DataFrame`` with names, dtypes, and index matching
the expected output.
divisions: tuple of index values
Values along which we partition our blocks on the index
"""
_partition_type = pd.DataFrame
_token_prefix = 'dataframe-'
def __array_wrap__(self, array, context=None):
if isinstance(context, tuple) and len(context) > 0:
if isinstance(context[1][0], np.ndarray) and context[1][0].shape == ():
index = None
else:
index = context[1][0].index
return pd.DataFrame(array, index=index, columns=self.columns)
@property
def columns(self):
return self._meta.columns
@columns.setter
def columns(self, columns):
renamed = _rename_dask(self, columns)
self._meta = renamed._meta
self._name = renamed._name
self.dask = renamed.dask
@property
def iloc(self):
"""Purely integer-location based indexing for selection by position.
Only indexing the column positions is supported. Trying to select
row positions will raise a ValueError.
See :ref:`dataframe.indexing` for more.
Examples
--------
>>> df.iloc[:, [2, 0, 1]] # doctest: +SKIP
"""
from .indexing import _iLocIndexer
return _iLocIndexer(self)
def __getitem__(self, key):
name = 'getitem-%s' % tokenize(self, key)
if np.isscalar(key) or isinstance(key, (tuple, string_types)):
if isinstance(self._meta.index, (pd.DatetimeIndex, pd.PeriodIndex)):
if key not in self._meta.columns:
return self.loc[key]
# error is raised from pandas
meta = self._meta[_extract_meta(key)]
dsk = partitionwise_graph(operator.getitem, name, self, key)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return new_dd_object(graph, name, meta, self.divisions)
elif isinstance(key, slice):
from pandas.api.types import is_float_dtype
is_integer_slice = any(isinstance(i, Integral)
for i in (key.start, key.step, key.stop))
# Slicing with integer labels is always iloc based except for a
# float indexer for some reason
if is_integer_slice and not is_float_dtype(self.index.dtype):
# Row-wise integer slicing is unsupported; iloc raises a clear error here
return self.iloc[key]
else:
return self.loc[key]
if (isinstance(key, (np.ndarray, list)) or (
not is_dask_collection(key) and (is_series_like(key) or is_index_like(key)))):
# error is raised from pandas
meta = self._meta[_extract_meta(key)]
dsk = partitionwise_graph(operator.getitem, name, self, key)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self])
return new_dd_object(graph, name, meta, self.divisions)
if isinstance(key, Series):
# do not perform dummy calculation, as columns will not be changed.
#
if self.divisions != key.divisions:
from .multi import _maybe_align_partitions
self, key = _maybe_align_partitions([self, key])
dsk = partitionwise_graph(operator.getitem, name, self, key)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self, key])
return new_dd_object(graph, name, self, self.divisions)
raise NotImplementedError(key)
def __setitem__(self, key, value):
if isinstance(key, (tuple, list)) and isinstance(value, DataFrame):
df = self.assign(**{k: value[c]
for k, c in zip(key, value.columns)})
elif isinstance(key, pd.Index) and not isinstance(value, DataFrame):
key = list(key)
df = self.assign(**{k: value for k in key})
else:
df = self.assign(**{key: value})
self.dask = df.dask
self._name = df._name
self._meta = df._meta
self.divisions = df.divisions
def __delitem__(self, key):
result = self.drop([key], axis=1)
self.dask = result.dask
self._name = result._name
self._meta = result._meta
def __setattr__(self, key, value):
try:
columns = object.__getattribute__(self, '_meta').columns
except AttributeError:
columns = ()
if key in columns:
self[key] = value
else:
object.__setattr__(self, key, value)
def __getattr__(self, key):
if key in self.columns:
return self[key]
else:
raise AttributeError("'DataFrame' object has no attribute %r" % key)
def __dir__(self):
o = set(dir(type(self)))
o.update(self.__dict__)
o.update(c for c in self.columns if
(isinstance(c, pd.compat.string_types) and
pd.compat.isidentifier(c)))
return list(o)
def _ipython_key_completions_(self):
return self.columns.tolist()
@property
def ndim(self):
""" Return dimensionality """
return 2
@property
def shape(self):
"""
Return a tuple representing the dimensionality of the DataFrame.
The number of rows is a Delayed result. The number of columns
is a concrete integer.
Examples
--------
>>> df.shape # doctest: +SKIP
(Delayed('int-07f06075-5ecc-4d77-817e-63c69a9188a8'), 2)
"""
col_size = len(self.columns)
row_size = delayed(int)(self.size / col_size)
return (row_size, col_size)
@property
def dtypes(self):
""" Return data types """
return self._meta.dtypes
@derived_from(pd.DataFrame)
def get_dtype_counts(self):
return self._meta.get_dtype_counts()
@derived_from(pd.DataFrame)
def get_ftype_counts(self):
return self._meta.get_ftype_counts()
@derived_from(pd.DataFrame)
def select_dtypes(self, include=None, exclude=None):
cs = self._meta.select_dtypes(include=include, exclude=exclude).columns
return self[list(cs)]
def set_index(self, other, drop=True, sorted=False, npartitions=None,
divisions=None, inplace=False, **kwargs):
"""Set the DataFrame index (row labels) using an existing column
This realigns the dataset to be sorted by a new column. This can have a
significant impact on performance, because joins, groupbys, lookups, etc.
are all much faster on that column. However, this performance increase
comes at a cost: sorting a parallel dataset requires expensive shuffles.
Often we ``set_index`` once directly after data ingest and filtering and
then perform many cheap computations off of the sorted dataset.
This function operates exactly like ``pandas.set_index`` except with
different performance costs (it is much more expensive). Under normal
operation this function does an initial pass over the index column to
compute approximate quantiles to serve as future divisions. It then passes
over the data a second time, splitting up each input partition into several
pieces and sharing those pieces to all of the output partitions now in
sorted order.
In some cases we can alleviate those costs, for example if your dataset is
sorted already then we can avoid making many small pieces or if you know
good values to split the new index column then we can avoid the initial
pass over the data. For example if your new index is a datetime index and
your data is already sorted by day then this entire operation can be done
for free. You can control these options with the following parameters.
Parameters
----------
df: Dask DataFrame
index: string or Dask Series
npartitions: int, None, or 'auto'
The ideal number of output partitions. If None use the same as
the input. If 'auto' then decide by memory use.
shuffle: string, optional
Either ``'disk'`` for single-node operation or ``'tasks'`` for
distributed operation. Will be inferred by your current scheduler.
sorted: bool, optional
If the index column is already sorted in increasing order.
Defaults to False
divisions: list, optional
Known values on which to separate index values of the partitions.
See https://docs.dask.org/en/latest/dataframe-design.html#partitions
Defaults to computing this with a single pass over the data. Note
that if ``sorted=True``, specified divisions are assumed to match
the existing partitions in the data. If this is untrue, you should
leave divisions empty and call ``repartition`` after ``set_index``.
inplace : bool, optional
Modifying the DataFrame in place is not supported by Dask.
Defaults to False.
compute: bool
Whether or not to trigger an immediate computation. Defaults to False.
Examples
--------
>>> df2 = df.set_index('x') # doctest: +SKIP
>>> df2 = df.set_index(df.x) # doctest: +SKIP
>>> df2 = df.set_index(df.timestamp, sorted=True) # doctest: +SKIP
A common case is when we have a datetime column that we know to be
sorted and is cleanly divided by day. We can set this index for free
by specifying both that the column is pre-sorted and the particular
divisions along which it is separated
>>> import pandas as pd
>>> divisions = pd.date_range('2000', '2010', freq='1D')
>>> df2 = df.set_index('timestamp', sorted=True, divisions=divisions) # doctest: +SKIP
"""
if inplace:
raise NotImplementedError("The inplace= keyword is not supported")
pre_sorted = sorted
del sorted
if divisions is not None:
check_divisions(divisions)
if pre_sorted:
from .shuffle import set_sorted_index
return set_sorted_index(self, other, drop=drop, divisions=divisions,
**kwargs)
else:
from .shuffle import set_index
return set_index(self, other, drop=drop, npartitions=npartitions,
divisions=divisions, **kwargs)
@derived_from(pd.DataFrame)
def nlargest(self, n=5, columns=None, split_every=None):
token = 'dataframe-nlargest'
return aca(self, chunk=M.nlargest, aggregate=M.nlargest,
meta=self._meta, token=token, split_every=split_every,
n=n, columns=columns)
@derived_from(pd.DataFrame)
def nsmallest(self, n=5, columns=None, split_every=None):
token = 'dataframe-nsmallest'
return aca(self, chunk=M.nsmallest, aggregate=M.nsmallest,
meta=self._meta, token=token, split_every=split_every,
n=n, columns=columns)
@derived_from(pd.DataFrame)
def groupby(self, by=None, **kwargs):
from dask.dataframe.groupby import DataFrameGroupBy
return DataFrameGroupBy(self, by=by, **kwargs)
@wraps(categorize)
def categorize(self, columns=None, index=None, split_every=None, **kwargs):
return categorize(self, columns=columns, index=index,
split_every=split_every, **kwargs)
@derived_from(pd.DataFrame)
def assign(self, **kwargs):
for k, v in kwargs.items():
if not (isinstance(v, Scalar) or is_series_like(v) or
callable(v) or pd.api.types.is_scalar(v)):
raise TypeError("Column assignment doesn't support type "
"{0}".format(type(v).__name__))
if callable(v):
kwargs[k] = v(self)
pairs = list(sum(kwargs.items(), ()))
# Figure out columns of the output
df2 = self._meta.assign(**_extract_meta(kwargs))
return elemwise(methods.assign, self, *pairs, meta=df2)
@derived_from(pd.DataFrame, ua_args=['index'])
def rename(self, index=None, columns=None):
if index is not None:
raise ValueError("Cannot rename index.")
# *args here is index, columns but columns arg is already used
return self.map_partitions(M.rename, None, columns=columns)
def query(self, expr, **kwargs):
""" Filter dataframe with complex expression
Blocked version of pd.DataFrame.query
This is like the sequential version except that this will also happen
in many threads. This may conflict with ``numexpr`` which will use
multiple threads itself. We recommend that you set numexpr to use a
single thread:
import numexpr
numexpr.set_nthreads(1)
See also
--------
pandas.DataFrame.query
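Examples
--------
A minimal sketch, assuming ``ddf`` is an existing dask DataFrame with
numeric columns ``x`` and ``y``:
>>> filtered = ddf.query('x > 0') # doctest: +SKIP
>>> filtered = ddf.query('x + y > 10') # doctest: +SKIP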
"""
return self.map_partitions(M.query, expr, **kwargs)
@derived_from(pd.DataFrame)
def eval(self, expr, inplace=None, **kwargs):
if inplace is None:
if PANDAS_VERSION >= '0.21.0':
inplace = False
if '=' in expr and inplace in (True, None):
raise NotImplementedError("Inplace eval not supported."
" Please use inplace=False")
meta = self._meta.eval(expr, inplace=inplace, **kwargs)
return self.map_partitions(M.eval, expr, meta=meta, inplace=inplace, **kwargs)
@derived_from(pd.DataFrame)
def dropna(self, how='any', subset=None):
return self.map_partitions(M.dropna, how=how, subset=subset)
@derived_from(pd.DataFrame)
def clip(self, lower=None, upper=None, out=None):
if out is not None:
raise ValueError("'out' must be None")
return self.map_partitions(M.clip, lower=lower, upper=upper)
@derived_from(pd.DataFrame)
def clip_lower(self, threshold):
return self.map_partitions(M.clip_lower, threshold=threshold)
@derived_from(pd.DataFrame)
def clip_upper(self, threshold):
return self.map_partitions(M.clip_upper, threshold=threshold)
@derived_from(pd.DataFrame)
def squeeze(self, axis=None):
if axis in [None, 1]:
if len(self.columns) == 1:
return self[self.columns[0]]
else:
return self
elif axis == 0:
raise NotImplementedError("{0} does not support "
"squeeze along axis 0".format(type(self)))
elif axis not in [0, 1, None]:
raise ValueError('No axis {0} for object type {1}'.format(
axis, type(self)))
@derived_from(pd.DataFrame)
def to_timestamp(self, freq=None, how='start', axis=0):
df = elemwise(M.to_timestamp, self, freq, how, axis)
df.divisions = tuple(pd.Index(self.divisions).to_timestamp())
return df
def to_bag(self, index=False):
"""Convert to a dask Bag of tuples of each row.
Parameters
----------
index : bool, optional
If True, the index is included as the first element of each tuple.
Default is False.
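Examples
--------
A minimal sketch, assuming ``ddf`` is an existing dask DataFrame:
>>> bag = ddf.to_bag() # doctest: +SKIP
>>> bag = ddf.to_bag(index=True) # doctest: +SKIP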
"""
from .io import to_bag
return to_bag(self, index)
def to_parquet(self, path, *args, **kwargs):
""" See dd.to_parquet docstring for more information """
from .io import to_parquet
return to_parquet(self, path, *args, **kwargs)
@derived_from(pd.DataFrame)
def to_string(self, max_rows=5):
# option_context doesn't affect
return self._repr_data().to_string(max_rows=max_rows,
show_dimensions=False)
def _get_numeric_data(self, how='any', subset=None):
# calculate columns to avoid unnecessary calculation
numerics = self._meta._get_numeric_data()
if len(numerics.columns) < len(self.columns):
name = self._token_prefix + '-get_numeric_data'
return self.map_partitions(M._get_numeric_data,
meta=numerics, token=name)
else:
# use myself if all numerics
return self
@classmethod
def _validate_axis(cls, axis=0):
if axis not in (0, 1, 'index', 'columns', None):
raise ValueError('No axis named {0}'.format(axis))
# convert to numeric axis
return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
@derived_from(pd.DataFrame)
def drop(self, labels, axis=0, errors='raise'):
axis = self._validate_axis(axis)
if axis == 1:
return self.map_partitions(M.drop, labels, axis=axis, errors=errors)
raise NotImplementedError("Drop currently only works for axis=1")
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, suffixes=('_x', '_y'),
indicator=False, npartitions=None, shuffle=None):
"""Merge the DataFrame with another DataFrame
This will merge the two datasets, either on the indices, a certain column
in each dataset or the index in one dataset and the column in another.
Parameters
----------
right: dask.dataframe.DataFrame
how : {'left', 'right', 'outer', 'inner'}, default: 'inner'
How to handle the operation of the two objects:
- left: use calling frame's index (or column if on is specified)
- right: use other frame's index
- outer: form union of calling frame's index (or column if on is
specified) with other frame's index, and sort it
lexicographically
- inner: form intersection of calling frame's index (or column if
on is specified) with other frame's index, preserving the order
of the calling's one
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If on is None and not merging on indexes then this
defaults to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
Column to join on in the left DataFrame. Unlike in pandas,
arrays and lists are only supported if their length is 1.
right_on : label or list, or array-like
Column to join on in the right DataFrame. Unlike in pandas,
arrays and lists are only supported if their length is 1.
left_index : boolean, default False
Use the index from the left DataFrame as the join key.
right_index : boolean, default False
Use the index from the right DataFrame as the join key.
suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and
right side, respectively
indicator : boolean or string, default False
If True, adds a column to output DataFrame called "_merge" with
information on the source of each row. If string, column with
information on source of each row will be added to output DataFrame,
and column will be named value of string. Information column is
Categorical-type and takes on a value of "left_only" for observations
whose merge key only appears in `left` DataFrame, "right_only" for
observations whose merge key only appears in `right` DataFrame,
and "both" if the observations merge key is found in both.
npartitions: int, None, or 'auto'
The ideal number of output partitions. This is only utilised when
performing a hash_join (merging on columns only). If `None`
npartitions = max(lhs.npartitions, rhs.npartitions)
shuffle: {'disk', 'tasks'}, optional
Either ``'disk'`` for single-node operation or ``'tasks'`` for
distributed operation. Will be inferred by your current scheduler.
Notes
-----
There are three ways to join dataframes:
1. Joining on indices. In this case the divisions are
aligned using the function ``dask.dataframe.multi.align_partitions``.
Afterwards, each partition is merged with the pandas merge function.
2. Joining one on index and one on column. In this case the divisions of
dataframe merged by index (:math:`d_i`) are used to divide the column
merged dataframe (:math:`d_c`) one using
``dask.dataframe.multi.rearrange_by_divisions``. In this case the
merged dataframe (:math:`d_m`) has the exact same divisions
as (:math:`d_i`). This can lead to issues if you merge multiple rows from
(:math:`d_c`) to one row in (:math:`d_i`).
3. Joining both on columns. In this case a hash join is performed using
``dask.dataframe.multi.hash_join``.
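Examples
--------
A minimal sketch, assuming ``left`` and ``right`` are existing dask
DataFrames that share an ``'id'`` column:
>>> merged = left.merge(right, on='id', how='inner') # doctest: +SKIP
>>> merged = left.merge(right, left_index=True, right_index=True) # doctest: +SKIP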
"""
if not is_dataframe_like(right):
raise ValueError('right must be DataFrame')
from .multi import merge
return merge(self, right, how=how, on=on, left_on=left_on,
right_on=right_on, left_index=left_index,
right_index=right_index, suffixes=suffixes,
npartitions=npartitions, indicator=indicator,
shuffle=shuffle)
@derived_from(pd.DataFrame)
def join(self, other, on=None, how='left',
lsuffix='', rsuffix='', npartitions=None, shuffle=None):
if not is_dataframe_like(other):
raise ValueError('other must be DataFrame')
from .multi import merge
return merge(self, other, how=how,
left_index=on is None, right_index=True,
left_on=on, suffixes=[lsuffix, rsuffix],
npartitions=npartitions, shuffle=shuffle)
@derived_from(pd.DataFrame)
def append(self, other):
if isinstance(other, Series):
msg = ('Unable to append a dd.Series to a dd.DataFrame. '
'Use a pd.Series to append it as a row.')
raise ValueError(msg)
elif is_series_like(other):
other = other.to_frame().T
return super(DataFrame, self).append(other)
@derived_from(pd.DataFrame)
def iterrows(self):
for i in range(self.npartitions):
df = self.get_partition(i).compute()
for row in df.iterrows():
yield row
@derived_from(pd.DataFrame)
def itertuples(self):
for i in range(self.npartitions):
df = self.get_partition(i).compute()
for row in df.itertuples():
yield row
@classmethod
def _bind_operator_method(cls, name, op):
""" bind operator method like DataFrame.add to this class """
# name must be explicitly passed for div method whose name is truediv
def meth(self, other, axis='columns', level=None, fill_value=None):
if level is not None:
raise NotImplementedError('level must be None')
axis = self._validate_axis(axis)
if axis in (1, 'columns'):
# When axis=1 and other is a series, `other` is transposed
# and the operator is applied broadcast across rows. This
# isn't supported with dd.Series.
if isinstance(other, Series):
msg = 'Unable to {0} dd.Series with axis=1'.format(name)
raise ValueError(msg)
elif is_series_like(other):
# Special case for pd.Series to avoid unwanted partitioning
# of other. We pass it in as a kwarg to prevent this.
meta = _emulate(op, self, other=other, axis=axis,
fill_value=fill_value)
return map_partitions(op, self, other=other, meta=meta,
axis=axis, fill_value=fill_value)
meta = _emulate(op, self, other, axis=axis, fill_value=fill_value)
return map_partitions(op, self, other, meta=meta,
axis=axis, fill_value=fill_value)
meth.__doc__ = op.__doc__
bind_method(cls, name, meth)
@classmethod
def _bind_comparison_method(cls, name, comparison):
""" bind comparison method like DataFrame.eq to this class """
def meth(self, other, axis='columns', level=None):
if level is not None:
raise NotImplementedError('level must be None')
axis = self._validate_axis(axis)
return elemwise(comparison, self, other, axis=axis)
meth.__doc__ = comparison.__doc__
bind_method(cls, name, meth)
@insert_meta_param_description(pad=12)
def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
args=(), meta=no_default, **kwds):
""" Parallel version of pandas.DataFrame.apply
This mimics the pandas version except for the following:
1. Only ``axis=1`` is supported (and must be specified explicitly).
2. The user should provide output metadata via the `meta` keyword.
Parameters
----------
func : function
Function to apply to each column/row
axis : {0 or 'index', 1 or 'columns'}, default 0
- 0 or 'index': apply function to each column (NOT SUPPORTED)
- 1 or 'columns': apply function to each row
$META
args : tuple
Positional arguments to pass to function in addition to the array/series
Additional keyword arguments will be passed as keywords to the function
Returns
-------
applied : Series or DataFrame
Examples
--------
>>> import dask.dataframe as dd
>>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
... 'y': [1., 2., 3., 4., 5.]})
>>> ddf = dd.from_pandas(df, npartitions=2)
Apply a function row-wise, passing in extra arguments in ``args`` and
``kwargs``:
>>> def myadd(row, a, b=1):
... return row.sum() + a + b
>>> res = ddf.apply(myadd, axis=1, args=(2,), b=1.5)
By default, dask tries to infer the output metadata by running your
provided function on some fake data. This works well in many cases, but
can sometimes be expensive, or even fail. To avoid this, you can
manually specify the output metadata with the ``meta`` keyword. This
can be specified in many forms, for more information see
``dask.dataframe.utils.make_meta``.
Here we specify the output is a Series with name ``'x'``, and dtype
``float64``:
>>> res = ddf.apply(myadd, axis=1, args=(2,), b=1.5, meta=('x', 'f8'))
In the case where the metadata doesn't change, you can also pass in
the object itself directly:
>>> res = ddf.apply(lambda row: row + 1, axis=1, meta=ddf)
See Also
--------
dask.DataFrame.map_partitions
"""
axis = self._validate_axis(axis)
pandas_kwargs = {
'axis': axis,
'broadcast': broadcast,
'raw': raw,
'reduce': None,
}
if PANDAS_VERSION >= '0.23.0':
kwds.setdefault('result_type', None)
kwds.update(pandas_kwargs)
if axis == 0:
msg = ("dd.DataFrame.apply only supports axis=1\n"
" Try: df.apply(func, axis=1)")
raise NotImplementedError(msg)
if meta is no_default:
msg = ("`meta` is not specified, inferred from partial data. "
"Please provide `meta` if the result is unexpected.\n"
" Before: .apply(func)\n"
" After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n"
" or: .apply(func, meta=('x', 'f8')) for series result")
warnings.warn(msg)
meta = _emulate(M.apply, self._meta_nonempty, func,
args=args, udf=True, **kwds)
return map_partitions(M.apply, self, func, args=args, meta=meta, **kwds)
@derived_from(pd.DataFrame)
def applymap(self, func, meta='__no_default__'):
return elemwise(M.applymap, self, func, meta=meta)
@derived_from(pd.DataFrame)
def round(self, decimals=0):
return elemwise(M.round, self, decimals)
@derived_from(pd.DataFrame)
def cov(self, min_periods=None, split_every=False):
return cov_corr(self, min_periods, split_every=split_every)
@derived_from(pd.DataFrame)
def corr(self, method='pearson', min_periods=None, split_every=False):
if method != 'pearson':
raise NotImplementedError("Only Pearson correlation has been "
"implemented")
return cov_corr(self, min_periods, True, split_every=split_every)
def info(self, buf=None, verbose=False, memory_usage=False):
"""
Concise summary of a Dask DataFrame.
"""
if buf is None:
import sys
buf = sys.stdout
lines = [str(type(self))]
if len(self.columns) == 0:
lines.append('Index: 0 entries')
lines.append('Empty %s' % type(self).__name__)
put_lines(buf, lines)
return
# Group and execute the required computations
computations = {}
if verbose:
computations.update({'index': self.index, 'count': self.count()})
if memory_usage:
computations.update({'memory_usage': self.map_partitions(M.memory_usage, index=True)})
computations = dict(zip(computations.keys(), da.compute(*computations.values())))
if verbose:
index = computations['index']
counts = computations['count']
lines.append(index_summary(index))
lines.append('Data columns (total {} columns):'.format(len(self.columns)))
if PANDAS_VERSION >= '0.20.0':
from pandas.io.formats.printing import pprint_thing
else:
from pandas.formats.printing import pprint_thing
space = max([len(pprint_thing(k)) for k in self.columns]) + 3
column_template = '{!s:<%d} {} non-null {}' % space
column_info = [column_template.format(pprint_thing(x[0]), x[1], x[2])
for x in zip(self.columns, counts, self.dtypes)]
else:
column_info = [index_summary(self.columns, name='Columns')]
lines.extend(column_info)
dtype_counts = ['%s(%d)' % k for k in sorted(self.dtypes.value_counts().iteritems(), key=str)]
lines.append('dtypes: {}'.format(', '.join(dtype_counts)))
if memory_usage:
memory_int = computations['memory_usage'].sum()
lines.append('memory usage: {}\n'.format(memory_repr(memory_int)))
put_lines(buf, lines)
@derived_from(pd.DataFrame)
def memory_usage(self, index=True, deep=False):
result = self.map_partitions(M.memory_usage, index=index, deep=deep)
result = result.groupby(result.index).sum()
return result
def pivot_table(self, index=None, columns=None,
values=None, aggfunc='mean'):
"""
Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
must have category dtype to infer result's ``columns``.
``index``, ``columns``, ``values`` and ``aggfunc`` must all be scalar.
Parameters
----------
values : scalar
column to aggregate
index : scalar
column to be index
columns : scalar
column to be columns
aggfunc : {'mean', 'sum', 'count'}, default 'mean'
Returns
-------
table : DataFrame
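Examples
--------
A minimal sketch, assuming ``ddf`` is an existing dask DataFrame whose
``'b'`` column has a known categorical dtype:
>>> table = ddf.pivot_table(index='a', columns='b',
...                         values='c', aggfunc='sum') # doctest: +SKIP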
"""
from .reshape import pivot_table
return pivot_table(self, index=index, columns=columns, values=values,
aggfunc=aggfunc)
def to_records(self, index=False):
from .io import to_records
return to_records(self)
@derived_from(pd.DataFrame)
def to_html(self, max_rows=5):
# pd.Series doesn't have html repr
data = self._repr_data().to_html(max_rows=max_rows,
show_dimensions=False)
return self._HTML_FMT.format(data=data, name=key_split(self._name),
task=len(self.dask))
def _repr_data(self):
meta = self._meta
index = self._repr_divisions
series_list = [_repr_data_series(s, index=index) for _, s in meta.iteritems()]
return pd.concat(series_list, axis=1)
_HTML_FMT = """<div><strong>Dask DataFrame Structure:</strong></div>
{data}
<div>Dask Name: {name}, {task} tasks</div>"""
def _repr_html_(self):
data = self._repr_data().to_html(
max_rows=5,
show_dimensions=False,
notebook=True
)
return self._HTML_FMT.format(data=data, name=key_split(self._name),
task=len(self.dask))
def _select_columns_or_index(self, columns_or_index):
"""
Parameters
----------
columns_or_index
Column or index name, or a list of these
Returns
-------
dd.DataFrame
Dask DataFrame with columns corresponding to each column or
index level in columns_or_index. If included, the column
corresponding to the index level is named _index
"""
# Ensure columns_or_index is a list
columns_or_index = (columns_or_index
if isinstance(columns_or_index, list)
else [columns_or_index])
column_names = [n for n in columns_or_index if self._is_column_label_reference(n)]
selected_df = self[column_names]
if self._contains_index_name(columns_or_index):
# Index name was included
selected_df = selected_df.assign(_index=self.index)
return selected_df
def _is_column_label_reference(self, key):
"""
Test whether a key is a column label reference
To be considered a column label reference, `key` must match the name of at
least one column.
"""
return (not is_dask_collection(key) and
(np.isscalar(key) or isinstance(key, tuple)) and
key in self.columns)
def _is_index_level_reference(self, key):
"""
Test whether a key is an index level reference
To be considered an index level reference, `key` must match the index name
and must NOT match the name of any column.
"""
return (self.index.name is not None and
not is_dask_collection(key) and
(np.isscalar(key) or isinstance(key, tuple)) and
key == self.index.name and
key not in self.columns)
def _contains_index_name(self, columns_or_index):
"""
Test whether the input contains a reference to the index of the DataFrame
"""
if isinstance(columns_or_index, list):
return any(self._is_index_level_reference(n) for n in columns_or_index)
else:
return self._is_index_level_reference(columns_or_index)
# bind operators
for op in [operator.abs, operator.add, operator.and_, operator_div,
operator.eq, operator.gt, operator.ge, operator.inv,
operator.lt, operator.le, operator.mod, operator.mul,
operator.ne, operator.neg, operator.or_, operator.pow,
operator.sub, operator.truediv, operator.floordiv, operator.xor]:
_Frame._bind_operator(op)
Scalar._bind_operator(op)
for name in ['add', 'sub', 'mul', 'div',
'truediv', 'floordiv', 'mod', 'pow',
'radd', 'rsub', 'rmul', 'rdiv',
'rtruediv', 'rfloordiv', 'rmod', 'rpow']:
meth = getattr(pd.DataFrame, name)
DataFrame._bind_operator_method(name, meth)
meth = getattr(pd.Series, name)
Series._bind_operator_method(name, meth)
for name in ['lt', 'gt', 'le', 'ge', 'ne', 'eq']:
meth = getattr(pd.DataFrame, name)
DataFrame._bind_comparison_method(name, meth)
meth = getattr(pd.Series, name)
Series._bind_comparison_method(name, meth)
def is_broadcastable(dfs, s):
"""
Whether ``s`` is a single-partition Series that can be broadcast against the dataframes in ``dfs``
"""
return (isinstance(s, Series) and
s.npartitions == 1 and
s.known_divisions and
any(s.divisions == (min(df.columns), max(df.columns))
for df in dfs if isinstance(df, DataFrame)))
def elemwise(op, *args, **kwargs):
""" Elementwise operation for Dask dataframes
Parameters
----------
op: callable
Function to apply across input dataframes
*args: DataFrames, Series, Scalars, Arrays
The arguments of the operation
**kwargs: scalars
meta: pd.DataFrame, pd.Series (optional)
Valid metadata for the operation. Will evaluate on a small piece of
data if not provided.
Examples
--------
>>> elemwise(operator.add, df.x, df.y) # doctest: +SKIP
"""
meta = kwargs.pop('meta', no_default)
out = kwargs.pop('out', None)
_name = funcname(op) + '-' + tokenize(op, *args, **kwargs)
args = _maybe_from_pandas(args)
from .multi import _maybe_align_partitions
args = _maybe_align_partitions(args)
dasks = [arg for arg in args if isinstance(arg, (_Frame, Scalar, Array))]
dfs = [df for df in dasks if isinstance(df, _Frame)]
# Clean up dask arrays if present
for i, a in enumerate(dasks):
if not isinstance(a, Array):
continue
# Ensure that they have similar-ish chunk structure
if not all(not a.chunks or len(a.chunks[0]) == df.npartitions for df in dfs):
msg = ("When combining dask arrays with dataframes they must "
"match chunking exactly. Operation: %s" % funcname(op))
raise ValueError(msg)
# Rechunk to have a single chunk along all other axes
if a.ndim > 1:
a = a.rechunk({i + 1: d for i, d in enumerate(a.shape[1:])})
dasks[i] = a
divisions = dfs[0].divisions
_is_broadcastable = partial(is_broadcastable, dfs)
dfs = list(remove(_is_broadcastable, dfs))
other = [(i, arg) for i, arg in enumerate(args)
if not isinstance(arg, (_Frame, Scalar, Array))]
# adjust the key length of Scalar
dsk = partitionwise_graph(op, _name, *args, **kwargs)
graph = HighLevelGraph.from_collections(_name, dsk, dependencies=dasks)
if meta is no_default:
if len(dfs) >= 2 and not all(hasattr(d, 'npartitions') for d in dasks):
# should not occur in current funcs
msg = 'elemwise with 2 or more DataFrames and Scalar is not supported'
raise NotImplementedError(msg)
# For broadcastable series, use no rows.
parts = [d._meta if _is_broadcastable(d) or isinstance(d, Array)
else d._meta_nonempty for d in dasks]
with raise_on_meta_error(funcname(op)):
meta = partial_by_order(*parts, function=op, other=other)
result = new_dd_object(graph, _name, meta, divisions)
return handle_out(out, result)
def handle_out(out, result):
""" Handle out parameters
If out is a dask.DataFrame, dask.Series or dask.Scalar then
this overwrites the contents of it with the result
"""
if isinstance(out, tuple):
if len(out) == 1:
out = out[0]
elif len(out) > 1:
raise NotImplementedError("The out parameter is not fully supported")
else:
out = None
if out is not None and type(out) != type(result):
raise TypeError(
"Mismatched types between result and out parameter. "
"out=%s, result=%s" % (str(type(out)), str(type(result))))
if isinstance(out, DataFrame):
if len(out.columns) != len(result.columns):
raise ValueError(
"Mismatched columns count between result and out parameter. "
"out=%s, result=%s" % (str(len(out.columns)), str(len(result.columns))))
if isinstance(out, (Series, DataFrame, Scalar)):
out._meta = result._meta
out._name = result._name
out.dask = result.dask
if not isinstance(out, Scalar):
out.divisions = result.divisions
elif out is not None:
msg = ("The out parameter is not fully supported."
" Received type %s, expected %s " % ( type(out).__name__, type(result).__name__))
raise NotImplementedError(msg)
else:
return result
def _maybe_from_pandas(dfs):
from .io import from_pandas
dfs = [from_pandas(df, 1)
if (is_series_like(df) or is_dataframe_like(df)) and not is_dask_collection(df)
else df for df in dfs]
return dfs
def hash_shard(df, nparts, split_out_setup=None, split_out_setup_kwargs=None):
if split_out_setup:
h = split_out_setup(df, **(split_out_setup_kwargs or {}))
else:
h = df
h = hash_pandas_object(h, index=False)
if is_series_like(h):
h = h._values
h %= nparts
return {i: df.iloc[h == i] for i in range(nparts)}
def split_evenly(df, k):
""" Split dataframe into k roughly equal parts """
divisions = np.linspace(0, len(df), k + 1).astype(int)
return {i: df.iloc[divisions[i]: divisions[i + 1]] for i in range(k)}
def split_out_on_index(df):
h = df.index
if isinstance(h, pd.MultiIndex):
h = pd.DataFrame([], index=h).reset_index()
return h
def split_out_on_cols(df, cols=None):
return df[cols]
@insert_meta_param_description
def apply_concat_apply(args, chunk=None, aggregate=None, combine=None,
meta=no_default, token=None, chunk_kwargs=None,
aggregate_kwargs=None, combine_kwargs=None,
split_every=None, split_out=None, split_out_setup=None,
split_out_setup_kwargs=None, **kwargs):
"""Apply a function to blocks, then concat, then apply again
Parameters
----------
args :
Positional arguments for the `chunk` function. All `dask.dataframe`
objects should be partitioned and indexed equivalently.
chunk : function [block-per-arg] -> block
Function to operate on each block of data
aggregate : function concatenated-block -> block
Function to operate on the concatenated result of chunk
combine : function concatenated-block -> block, optional
Function to operate on intermediate concatenated results of chunk
in a tree-reduction. If not provided, defaults to aggregate.
$META
token : str, optional
The name to use for the output keys.
chunk_kwargs : dict, optional
Keywords for the chunk function only.
aggregate_kwargs : dict, optional
Keywords for the aggregate function only.
combine_kwargs : dict, optional
Keywords for the combine function only.
split_every : int, optional
Group partitions into groups of this size while performing a
tree-reduction. If set to False, no tree-reduction will be used,
and all intermediates will be concatenated and passed to ``aggregate``.
Default is 8.
split_out : int, optional
Number of output partitions. Split occurs after first chunk reduction.
split_out_setup : callable, optional
If provided, this function is called on each chunk before performing
the hash-split. It should return a pandas object, where each row
(excluding the index) is hashed. If not provided, the chunk is hashed
as is.
split_out_setup_kwargs : dict, optional
Keywords for the `split_out_setup` function only.
kwargs :
All remaining keywords will be passed to ``chunk``, ``aggregate``, and
``combine``.
Examples
--------
>>> def chunk(a_block, b_block):
... pass
>>> def agg(df):
... pass
>>> apply_concat_apply([a, b], chunk=chunk, aggregate=agg) # doctest: +SKIP
"""
if chunk_kwargs is None:
chunk_kwargs = dict()
if aggregate_kwargs is None:
aggregate_kwargs = dict()
chunk_kwargs.update(kwargs)
aggregate_kwargs.update(kwargs)
if combine is None:
if combine_kwargs:
raise ValueError("`combine_kwargs` provided with no `combine`")
combine = aggregate
combine_kwargs = aggregate_kwargs
else:
if combine_kwargs is None:
combine_kwargs = dict()
combine_kwargs.update(kwargs)
if not isinstance(args, (tuple, list)):
args = [args]
dfs = [arg for arg in args if isinstance(arg, _Frame)]
npartitions = set(arg.npartitions for arg in dfs)
if len(npartitions) > 1:
raise ValueError("All arguments must have same number of partitions")
npartitions = npartitions.pop()
if split_every is None:
split_every = 8
elif split_every is False:
split_every = npartitions
elif split_every < 2 or not isinstance(split_every, Integral):
raise ValueError("split_every must be an integer >= 2")
token_key = tokenize(token or (chunk, aggregate), meta, args,
chunk_kwargs, aggregate_kwargs, combine_kwargs,
split_every, split_out, split_out_setup,
split_out_setup_kwargs)
# Chunk
a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
dsk = {(a, 0, i, 0): (chunk, key)
for i, key in enumerate(args[0].__dask_keys__())}
else:
dsk = {(a, 0, i, 0): (apply, chunk,
[(x._name, i) if isinstance(x, _Frame)
else x for x in args], chunk_kwargs)
for i in range(npartitions)}
# Split
if split_out and split_out > 1:
split_prefix = 'split-%s' % token_key
shard_prefix = 'shard-%s' % token_key
for i in range(npartitions):
dsk[(split_prefix, i)] = (hash_shard, (a, 0, i, 0), split_out,
split_out_setup, split_out_setup_kwargs)
for j in range(split_out):
dsk[(shard_prefix, 0, i, j)] = (getitem, (split_prefix, i), j)
a = shard_prefix
else:
split_out = 1
# Combine
b = '{0}-combine-{1}'.format(token or funcname(combine), token_key)
k = npartitions
depth = 0
while k > split_every:
for part_i, inds in enumerate(partition_all(split_every, range(k))):
for j in range(split_out):
conc = (_concat, [(a, depth, i, j) for i in inds])
if combine_kwargs:
dsk[(b, depth + 1, part_i, j)] = (apply, combine, [conc], combine_kwargs)
else:
dsk[(b, depth + 1, part_i, j)] = (combine, conc)
k = part_i + 1
a = b
depth += 1
# Aggregate
for j in range(split_out):
b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key)
conc = (_concat, [(a, depth, i, j) for i in range(k)])
if aggregate_kwargs:
dsk[(b, j)] = (apply, aggregate, [conc], aggregate_kwargs)
else:
dsk[(b, j)] = (aggregate, conc)
if meta is no_default:
meta_chunk = _emulate(chunk, *args, udf=True, **chunk_kwargs)
meta = _emulate(aggregate, _concat([meta_chunk]), udf=True,
**aggregate_kwargs)
meta = make_meta(meta, index=(getattr(make_meta(dfs[0]), 'index', None)
if dfs else None))
graph = HighLevelGraph.from_collections(b, dsk, dependencies=dfs)
divisions = [None] * (split_out + 1)
return new_dd_object(graph, b, meta, divisions)
aca = apply_concat_apply
def _extract_meta(x, nonempty=False):
"""
Extract internal cache data (``_meta``) from dd.DataFrame / dd.Series
"""
if isinstance(x, (Scalar, _Frame)):
return x._meta_nonempty if nonempty else x._meta
elif isinstance(x, list):
return [_extract_meta(_x, nonempty) for _x in x]
elif isinstance(x, tuple):
return tuple([_extract_meta(_x, nonempty) for _x in x])
elif isinstance(x, dict):
res = {}
for k in x:
res[k] = _extract_meta(x[k], nonempty)
return res
else:
return x
def _emulate(func, *args, **kwargs):
"""
Apply a function using args / kwargs. If arguments contain dd.DataFrame /
dd.Series, the internal cache (``_meta``) is used for the calculation
"""
with raise_on_meta_error(funcname(func), udf=kwargs.pop('udf', False)):
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
@insert_meta_param_description
def map_partitions(func, *args, **kwargs):
""" Apply Python function on each DataFrame partition.
Parameters
----------
func : function
Function applied to each partition.
args, kwargs :
Arguments and keywords to pass to the function. At least one of the
args should be a Dask.dataframe. Arguments and keywords may contain
``Scalar``, ``Delayed`` or regular python objects.
$META
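Examples
--------
A minimal, illustrative sketch (``ddf`` is assumed to be an existing dask
DataFrame with numeric columns ``x`` and ``y``):

>>> def add_total(df):
...     return df.assign(total=df.x + df.y)
>>> result = map_partitions(add_total, ddf)  # doctest: +SKIP

If ``meta`` is not provided it is inferred by running ``func`` on the
partitions' metadata via ``_emulate``.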
"""
meta = kwargs.pop('meta', no_default)
name = kwargs.pop('token', None)
# Normalize keyword arguments
kwargs2 = {k: normalize_arg(v) for k, v in kwargs.items()}
assert callable(func)
if name is not None:
token = tokenize(meta, *args, **kwargs2)
else:
name = funcname(func)
token = tokenize(func, meta, *args, **kwargs2)
name = '{0}-{1}'.format(name, token)
from .multi import _maybe_align_partitions
args = _maybe_from_pandas(args)
args = _maybe_align_partitions(args)
dfs = [df for df in args if isinstance(df, _Frame)]
meta_index = getattr(make_meta(dfs[0]), 'index', None) if dfs else None
if meta is no_default:
meta = _emulate(func, *args, udf=True, **kwargs2)
else:
meta = make_meta(meta, index=meta_index)
if all(isinstance(arg, Scalar) for arg in args):
layer = {(name, 0):
(apply, func, (tuple, [(arg._name, 0) for arg in args]), kwargs)}
graph = HighLevelGraph.from_collections(name, layer, dependencies=args)
return Scalar(graph, name, meta)
elif not (has_parallel_type(meta) or is_arraylike(meta)):
# If `meta` is not a pandas object, the concatenated results will be a
# different type
meta = make_meta(_concat([meta]), index=meta_index)
# Ensure meta is empty series
meta = make_meta(meta)
args2 = []
dependencies = []
for arg in args:
if isinstance(arg, _Frame):
args2.append(arg)
dependencies.append(arg)
continue
if not is_dask_collection(arg) and sizeof(arg) > 1e6:
arg = delayed(arg)
arg2, collections = unpack_collections(arg)
if collections:
args2.append(arg2)
dependencies.extend(collections)
else:
args2.append(arg)
kwargs3 = {}
for k, v in kwargs2.items():
v, collections = unpack_collections(v)
dependencies.extend(collections)
kwargs3[k] = v
dsk = partitionwise_graph(
apply_and_enforce,
name,
*args2,
dependencies=dependencies,
_func=func,
_meta=meta,
**kwargs3
)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies)
return new_dd_object(graph, name, meta, dfs[0].divisions)
def apply_and_enforce(*args, **kwargs):
"""Apply a function, and enforce the output to match meta
Ensures the output has the same columns, even if empty."""
func = kwargs.pop('_func')
meta = kwargs.pop('_meta')
df = func(*args, **kwargs)
if is_dataframe_like(df) or is_series_like(df) or is_index_like(df):
if not len(df):
return meta
if is_dataframe_like(df):
# Need nan_to_num otherwise nan comparison gives False
if not np.array_equal(np.nan_to_num(meta.columns),
np.nan_to_num(df.columns)):
raise ValueError("The columns in the computed data do not match"
" the columns in the provided metadata")
else:
c = meta.columns
else:
c = meta.name
return _rename(c, df)
return df
def _rename(columns, df):
"""
Rename columns of pd.DataFrame or name of pd.Series.
Not for dd.DataFrame or dd.Series.
Parameters
----------
columns : tuple, string, pd.DataFrame or pd.Series
Column names, Series name or pandas instance which has the
target column names / name.
df : pd.DataFrame or pd.Series
target DataFrame / Series to be renamed
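Examples
--------
Illustrative only:

>>> _rename(['x', 'y'], pd.DataFrame({'a': [1], 'b': [2]}))  # doctest: +SKIP
   x  y
0  1  2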
"""
assert not isinstance(df, _Frame)
if columns is no_default:
return df
if isinstance(columns, Iterator):
columns = list(columns)
if is_dataframe_like(df):
if is_dataframe_like(columns):
columns = columns.columns
if not isinstance(columns, pd.Index):
columns = pd.Index(columns)
if (len(columns) == len(df.columns) and
type(columns) is type(df.columns) and
columns.equals(df.columns)):
# if target is identical, rename is not necessary
return df
# deep=False doesn't copy any data/indices, so this is cheap
df = df.copy(deep=False)
df.columns = columns
return df
elif is_series_like(df) or is_index_like(df):
if is_series_like(columns) or is_index_like(columns):
columns = columns.name
if df.name == columns:
return df
return df.rename(columns)
# map_partitions may pass other types
return df
def _rename_dask(df, names):
"""
Destructively rename columns of dd.DataFrame or name of dd.Series.
Not for pd.DataFrame or pd.Series.
Internally used to overwrite dd.DataFrame.columns and dd.Series.name.
We can't use map_partitions because it applies the function first and then renames.
Parameters
----------
df : dd.DataFrame or dd.Series
target DataFrame / Series to be renamed
names : tuple, string
Column names/Series name
"""
assert isinstance(df, _Frame)
metadata = _rename(names, df._meta)
name = 'rename-{0}'.format(tokenize(df, metadata))
dsk = partitionwise_graph(_rename, name, metadata, df)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
return new_dd_object(graph, name, metadata, df.divisions)
def quantile(df, q):
"""Approximate quantiles of Series.
Parameters
----------
q : list/array of floats
Iterable of numbers ranging from 0 to 1 for the desired quantiles
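Examples
--------
Illustrative sketch (``s`` is assumed to be a numeric dask Series):

>>> approx = quantile(s, [0.25, 0.5, 0.75])  # doctest: +SKIP

The result is a dask ``Series`` with one value per requested quantile
(or a ``Scalar`` when ``q`` is a single float).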
"""
assert isinstance(df, Series)
from dask.array.percentile import _percentile, merge_percentiles
# currently, only Series has quantile method
if isinstance(df, Index):
meta = pd.Series(df._meta_nonempty).quantile(q)
else:
meta = df._meta_nonempty.quantile(q)
if is_series_like(meta):
# Index.quantile(list-like) must be pd.Series, not pd.Index
df_name = df.name
finalize_tsk = lambda tsk: (pd.Series, tsk, q, None, df_name)
return_type = Series
else:
finalize_tsk = lambda tsk: (getitem, tsk, 0)
return_type = Scalar
q = [q]
# pandas uses quantile in [0, 1]
# numpy / everyone else uses [0, 100]
qs = np.asarray(q) * 100
token = tokenize(df, qs)
if len(qs) == 0:
name = 'quantiles-' + token
empty_index = pd.Index([], dtype=float)
return Series({(name, 0): pd.Series([], name=df.name, index=empty_index)},
name, df._meta, [None, None])
else:
new_divisions = [np.min(q), np.max(q)]
df = df.dropna()
name = 'quantiles-1-' + token
val_dsk = {(name, i): (_percentile, (getattr, key, 'values'), qs)
for i, key in enumerate(df.__dask_keys__())}
name3 = 'quantiles-3-' + token
merge_dsk = {(name3, 0): finalize_tsk((merge_percentiles, qs,
[qs] * df.npartitions,
sorted(val_dsk)))}
dsk = merge(val_dsk, merge_dsk)
graph = HighLevelGraph.from_collections(name3, dsk, dependencies=[df])
return return_type(graph, name3, meta, new_divisions)
def cov_corr(df, min_periods=None, corr=False, scalar=False, split_every=False):
"""DataFrame covariance and pearson correlation.
Computes pairwise covariance or correlation of columns, excluding NA/null
values.
Parameters
----------
df : DataFrame
min_periods : int, optional
Minimum number of observations required per pair of columns
to have a valid result.
corr : bool, optional
If True, compute the Pearson correlation. If False [default], compute
the covariance.
scalar : bool, optional
If True, compute covariance between two variables as a scalar. Only
valid if `df` has 2 columns. If False [default], compute the entire
covariance/correlation matrix.
split_every : int, optional
Group partitions into groups of this size while performing a
tree-reduction. If set to False, no tree-reduction will be used.
Default is False.
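Examples
--------
Illustrative sketch (``ddf`` is assumed to be a numeric dask DataFrame):

>>> cov_matrix = cov_corr(ddf)              # doctest: +SKIP
>>> corr_matrix = cov_corr(ddf, corr=True)  # doctest: +SKIP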
"""
if min_periods is None:
min_periods = 2
elif min_periods < 2:
raise ValueError("min_periods must be >= 2")
if split_every is False:
split_every = df.npartitions
elif split_every < 2 or not isinstance(split_every, Integral):
raise ValueError("split_every must be an integer >= 2")
df = df._get_numeric_data()
if scalar and len(df.columns) != 2:
raise ValueError("scalar only valid for 2 column dataframe")
token = tokenize(df, min_periods, scalar, split_every)
funcname = 'corr' if corr else 'cov'
a = '{0}-chunk-{1}'.format(funcname, df._name)
dsk = {(a, i): (cov_corr_chunk, f, corr)
for (i, f) in enumerate(df.__dask_keys__())}
prefix = '{0}-combine-{1}-'.format(funcname, df._name)
k = df.npartitions
b = a
depth = 0
while k > split_every:
b = prefix + str(depth)
for part_i, inds in enumerate(partition_all(split_every, range(k))):
dsk[(b, part_i)] = (cov_corr_combine, [(a, i) for i in inds], corr)
k = part_i + 1
a = b
depth += 1
name = '{0}-{1}'.format(funcname, token)
dsk[(name, 0)] = (cov_corr_agg, [(a, i) for i in range(k)],
df.columns, min_periods, corr, scalar)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
if scalar:
return Scalar(graph, name, 'f8')
meta = make_meta([(c, 'f8') for c in df.columns], index=df.columns)
return DataFrame(graph, name, meta, (df.columns[0], df.columns[-1]))
def cov_corr_chunk(df, corr=False):
"""Chunk part of a covariance or correlation computation
"""
shape = (df.shape[1], df.shape[1])
sums = np.zeros(shape)
counts = np.zeros(shape)
df = df.astype('float64', copy=False)
for idx, col in enumerate(df):
mask = df[col].notnull()
sums[idx] = df[mask].sum().values
counts[idx] = df[mask].count().values
cov = df.cov().values
dtype = [('sum', sums.dtype), ('count', counts.dtype), ('cov', cov.dtype)]
if corr:
with warnings.catch_warnings(record=True):
warnings.simplefilter("always")
mu = (sums / counts).T
m = np.zeros(shape)
mask = df.isnull().values
for idx, x in enumerate(df):
mu_discrepancy = np.subtract.outer(df[x], mu[idx]) ** 2
mu_discrepancy[mask] = np.nan
m[idx] = np.nansum(mu_discrepancy, axis=0)
m = m.T
dtype.append(('m', m.dtype))
out = np.empty(counts.shape, dtype=dtype)
out['sum'] = sums
out['count'] = counts
out['cov'] = cov * (counts - 1)
if corr:
out['m'] = m
return out
def cov_corr_combine(data, corr=False):
data = np.concatenate(data).reshape((len(data),) + data[0].shape)
sums = np.nan_to_num(data['sum'])
counts = data['count']
cum_sums = np.cumsum(sums, 0)
cum_counts = np.cumsum(counts, 0)
s1 = cum_sums[:-1]
s2 = sums[1:]
n1 = cum_counts[:-1]
n2 = counts[1:]
with np.errstate(invalid='ignore'):
d = (s2 / n2) - (s1 / n1)
C = (np.nansum((n1 * n2) / (n1 + n2) * (d * d.transpose((0, 2, 1))), 0) +
np.nansum(data['cov'], 0))
out = np.empty(C.shape, dtype=data.dtype)
out['sum'] = cum_sums[-1]
out['count'] = cum_counts[-1]
out['cov'] = C
if corr:
nobs = np.where(cum_counts[-1], cum_counts[-1], np.nan)
mu = cum_sums[-1] / nobs
counts_na = np.where(counts, counts, np.nan)
m = np.nansum(data['m'] + counts * (sums / counts_na - mu) ** 2,
axis=0)
out['m'] = m
return out
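# Note (added for clarity): the combine step above applies the standard pairwise
# co-moment update elementwise per column pair.  With per-pair sums s1, s2,
# counts n1, n2 and co-moments C1, C2 (stored as cov * (count - 1)):
#
#     C = C1 + C2 + (n1 * n2) / (n1 + n2) * (d * d.T),  where d = s2/n2 - s1/n1
#
# which is what the np.nansum expression in cov_corr_combine computes over the
# stacked chunks.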
def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False):
out = cov_corr_combine(data, corr)
counts = out['count']
C = out['cov']
C[counts < min_periods] = np.nan
if corr:
m2 = out['m']
den = np.sqrt(m2 * m2.T)
else:
den = np.where(counts, counts, np.nan) - 1
with np.errstate(invalid='ignore', divide='ignore'):
mat = C / den
if scalar:
return mat[0, 1]
return pd.DataFrame(mat, columns=cols, index=cols)
def pd_split(df, p, random_state=None):
""" Split DataFrame into multiple pieces pseudorandomly
>>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
... 'b': [2, 3, 4, 5, 6, 7]})
>>> a, b = pd_split(df, [0.5, 0.5], random_state=123) # roughly 50/50 split
>>> a
a b
1 2 3
2 3 4
5 6 7
>>> b
a b
0 1 2
3 4 5
4 5 6
"""
p = list(p)
index = pseudorandom(len(df), p, random_state)
return [df.iloc[index == i] for i in range(len(p))]
def _take_last(a, skipna=True):
"""
Take the last row (as a Series) of a DataFrame, or the last value of a Series,
optionally skipping NaN.
Parameters
----------
a : pd.DataFrame or pd.Series
skipna : bool, default True
Whether to exclude NaN
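Examples
--------
Illustrative only:

>>> _take_last(pd.DataFrame({'a': [1.0, np.nan], 'b': [2.0, 3.0]}))  # doctest: +SKIP
a    1.0
b    3.0
dtype: float64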
"""
if skipna is False:
return a.iloc[-1]
else:
# take the last valid value excluding NaN; the NaN location may differ
# from column to column
group_dummy = np.ones(len(a.index))
last_row = a.groupby(group_dummy).last()
if isinstance(a, pd.DataFrame): # TODO: handle explicit pandas reference
return pd.Series(last_row.values[0], index=a.columns)
else:
return last_row.values[0]
def check_divisions(divisions):
if not isinstance(divisions, (list, tuple)):
raise ValueError('New division must be list or tuple')
divisions = list(divisions)
if divisions != sorted(divisions):
raise ValueError('New division must be sorted')
if len(divisions[:-1]) != len(list(unique(divisions[:-1]))):
msg = 'New division must be unique, except for the last element'
raise ValueError(msg)
def repartition_divisions(a, b, name, out1, out2, force=False):
""" dask graph to repartition dataframe by new divisions
Parameters
----------
a : tuple
old divisions
b : tuple, list
new divisions
name : str
name of old dataframe
out1 : str
name of temporary splits
out2 : str
name of new dataframe
force : bool, default False
Allows the expansion of the existing divisions.
If False then the new divisions' lower and upper bounds must be
the same as the old divisions.
Examples
--------
>>> repartition_divisions([1, 3, 7], [1, 4, 6, 7], 'a', 'b', 'c') # doctest: +SKIP
{('b', 0): (<function boundary_slice at ...>, ('a', 0), 1, 3, False),
('b', 1): (<function boundary_slice at ...>, ('a', 1), 3, 4, False),
('b', 2): (<function boundary_slice at ...>, ('a', 1), 4, 6, False),
('b', 3): (<function boundary_slice at ...>, ('a', 1), 6, 7, False)
('c', 0): (<function concat at ...>,
(<type 'list'>, [('b', 0), ('b', 1)])),
('c', 1): ('b', 2),
('c', 2): ('b', 3)}
"""
check_divisions(b)
if len(b) < 2:
# minimum division is 2 elements, like [0, 0]
raise ValueError('New division must have at least 2 elements')
if force:
if a[0] < b[0]:
msg = ('left side of the new division must be equal or smaller '
'than old division')
raise ValueError(msg)
if a[-1] > b[-1]:
msg = ('right side of the new division must be equal or larger '
'than old division')
raise ValueError(msg)
else:
if a[0] != b[0]:
msg = 'left side of old and new divisions are different'
raise ValueError(msg)
if a[-1] != b[-1]:
msg = 'right side of old and new divisions are different'
raise ValueError(msg)
def _is_single_last_div(x):
"""Whether last division only contains single label"""
return len(x) >= 2 and x[-1] == x[-2]
c = [a[0]]
d = dict()
low = a[0]
i, j = 1, 1 # indices for old/new divisions
k = 0 # index for temp divisions
last_elem = _is_single_last_div(a)
# process through old division
# left part of new division can be processed in this loop
while (i < len(a) and j < len(b)):
if a[i] < b[j]:
# tuple is something like:
# (methods.boundary_slice, ('from_pandas-#', 0), 3, 4, False))
d[(out1, k)] = (methods.boundary_slice, (name, i - 1), low, a[i], False)
low = a[i]
i += 1
elif a[i] > b[j]:
d[(out1, k)] = (methods.boundary_slice, (name, i - 1), low, b[j], False)
low = b[j]
j += 1
else:
d[(out1, k)] = (methods.boundary_slice, (name, i - 1), low, b[j], False)
low = b[j]
if len(a) == i + 1 or a[i] < a[i + 1]:
j += 1
i += 1
c.append(low)
k += 1
# right part of new division can remain
if a[-1] < b[-1] or b[-1] == b[-2]:
for _j in range(j, len(b)):
# always use right-most of old division
# because it may contain last element
m = len(a) - 2
d[(out1, k)] = (methods.boundary_slice, (name, m), low, b[_j], False)
low = b[_j]
c.append(low)
k += 1
else:
# even if new division is processed through,
# right-most element of old division can remain
if last_elem and i < len(a):
d[(out1, k)] = (methods.boundary_slice, (name, i - 1), a[i], a[i], False)
k += 1
c.append(a[-1])
# replace last element of tuple with True
d[(out1, k - 1)] = d[(out1, k - 1)][:-1] + (True,)
i, j = 0, 1
last_elem = _is_single_last_div(c)
while j < len(b):
tmp = []
while c[i] < b[j]:
tmp.append((out1, i))
i += 1
while last_elem and c[i] == b[-1] and (b[-1] != b[-2] or j == len(b) - 1) and i < k:
# append if last split is not included
tmp.append((out1, i))
i += 1
if len(tmp) == 0:
# dummy slice to return empty DataFrame or Series,
# which retain original data attributes (columns / name)
d[(out2, j - 1)] = (methods.boundary_slice, (name, 0), a[0], a[0], False)
elif len(tmp) == 1:
d[(out2, j - 1)] = tmp[0]
else:
if not tmp:
raise ValueError('check for duplicate partitions\nold:\n%s\n\n'
'new:\n%s\n\ncombined:\n%s'
% (pformat(a), pformat(b), pformat(c)))
d[(out2, j - 1)] = (methods.concat, tmp)
j += 1
return d
def repartition_freq(df, freq=None):
""" Repartition a timeseries dataframe by a new frequency """
if not isinstance(df.divisions[0], pd.Timestamp):
raise TypeError("Can only repartition on frequency for timeseries")
try:
start = df.divisions[0].ceil(freq)
except ValueError:
start = df.divisions[0]
divisions = pd.date_range(start=start,
end=df.divisions[-1],
freq=freq).tolist()
if not len(divisions):
divisions = [df.divisions[0], df.divisions[-1]]
else:
if divisions[-1] != df.divisions[-1]:
divisions.append(df.divisions[-1])
if divisions[0] != df.divisions[0]:
divisions = [df.divisions[0]] + divisions
return df.repartition(divisions=divisions)
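# Hedged usage sketch (assuming ``ddf`` is a dask DataFrame indexed by
# timestamps with known divisions): the first division is snapped up to the
# requested frequency and the original end point is re-appended so no data is
# dropped.
#
#     weekly = repartition_freq(ddf, freq='7D')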
def repartition_npartitions(df, npartitions):
""" Repartition dataframe to a smaller number of partitions """
new_name = 'repartition-%d-%s' % (npartitions, tokenize(df))
if df.npartitions == npartitions:
return df
elif df.npartitions > npartitions:
npartitions_ratio = df.npartitions / npartitions
new_partitions_boundaries = [int(new_partition_index * npartitions_ratio)
for new_partition_index in range(npartitions + 1)]
dsk = {}
for new_partition_index in range(npartitions):
value = (methods.concat,
[(df._name, old_partition_index) for old_partition_index in
range(new_partitions_boundaries[new_partition_index],
new_partitions_boundaries[new_partition_index + 1])])
dsk[new_name, new_partition_index] = value
divisions = [df.divisions[new_partition_index]
for new_partition_index in new_partitions_boundaries]
graph = HighLevelGraph.from_collections(new_name, dsk, dependencies=[df])
return new_dd_object(graph, new_name, df._meta, divisions)
else:
original_divisions = divisions = pd.Series(df.divisions)
if (df.known_divisions and (np.issubdtype(divisions.dtype, np.datetime64) or
np.issubdtype(divisions.dtype, np.number))):
if np.issubdtype(divisions.dtype, np.datetime64):
divisions = divisions.values.astype('float64')
if is_series_like(divisions):
divisions = divisions.values
n = len(divisions)
divisions = np.interp(x=np.linspace(0, n, npartitions + 1),
xp=np.linspace(0, n, n),
fp=divisions)
if np.issubdtype(original_divisions.dtype, np.datetime64):
divisions = pd.Series(divisions).astype(original_divisions.dtype).tolist()
elif np.issubdtype(original_divisions.dtype, np.integer):
divisions = divisions.astype(original_divisions.dtype)
if isinstance(divisions, np.ndarray):
divisions = divisions.tolist()
divisions = list(divisions)
divisions[0] = df.divisions[0]
divisions[-1] = df.divisions[-1]
return df.repartition(divisions=divisions)
else:
ratio = npartitions / df.npartitions
split_name = 'split-%s' % tokenize(df, npartitions)
dsk = {}
last = 0
j = 0
for i in range(df.npartitions):
new = last + ratio
if i == df.npartitions - 1:
k = npartitions - j
else:
k = int(new - last)
dsk[(split_name, i)] = (split_evenly, (df._name, i), k)
for jj in range(k):
dsk[(new_name, j)] = (getitem, (split_name, i), jj)
j += 1
last = new
divisions = [None] * (npartitions + 1)
graph = HighLevelGraph.from_collections(new_name, dsk, dependencies=[df])
return new_dd_object(graph, new_name, df._meta, divisions)
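# Worked example of the coalescing branch above (added for clarity): reducing
# 5 partitions to 2 gives npartitions_ratio = 2.5 and
# new_partitions_boundaries = [0, 2, 5], so the first output partition
# concatenates input partitions 0-1 and the second concatenates partitions 2-4.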
def repartition(df, divisions=None, force=False):
""" Repartition dataframe along new divisions
Dask.DataFrame objects are partitioned along their index. Often when
multiple dataframes interact we need to align these partitionings. The
``repartition`` function constructs a new DataFrame object holding the same
data but partitioned on different values. It does this by performing a
sequence of ``loc`` and ``concat`` calls to split and merge the previous
generation of partitions.
Parameters
----------
divisions : list
List of partitions to be used
force : bool, default False
Allows the expansion of the existing divisions.
If False then the new divisions' lower and upper bounds must be
the same as the old divisions.
Examples
--------
>>> df = df.repartition([0, 5, 10, 20]) # doctest: +SKIP
Also works on Pandas objects
>>> ddf = dd.repartition(df, [0, 5, 10, 20]) # doctest: +SKIP
"""
token = tokenize(df, divisions)
if isinstance(df, _Frame):
tmp = 'repartition-split-' + token
out = 'repartition-merge-' + token
dsk = repartition_divisions(df.divisions, divisions,
df._name, tmp, out, force=force)
graph = HighLevelGraph.from_collections(out, dsk, dependencies=[df])
return new_dd_object(graph, out, df._meta, divisions)
elif is_dataframe_like(df) or is_series_like(df):
name = 'repartition-dataframe-' + token
from .utils import shard_df_on_index
dfs = shard_df_on_index(df, divisions[1:-1])
dsk = dict(((name, i), df) for i, df in enumerate(dfs))
return new_dd_object(dsk, name, df, divisions)
raise ValueError('Data must be DataFrame or Series')
def _reduction_chunk(x, aca_chunk=None, **kwargs):
o = aca_chunk(x, **kwargs)
# Return a dataframe so that the concatenated version is also a dataframe
return o.to_frame().T if is_series_like(o) else o
def _reduction_combine(x, aca_combine=None, **kwargs):
if isinstance(x, list):
x = pd.Series(x)
o = aca_combine(x, **kwargs)
# Return a dataframe so that the concatenated version is also a dataframe
return o.to_frame().T if is_series_like(o) else o
def _reduction_aggregate(x, aca_aggregate=None, **kwargs):
if isinstance(x, list):
x = pd.Series(x)
return aca_aggregate(x, **kwargs)
def idxmaxmin_chunk(x, fn=None, skipna=True):
minmax = 'max' if fn == 'idxmax' else 'min'
if len(x) > 0:
idx = getattr(x, fn)(skipna=skipna)
value = getattr(x, minmax)(skipna=skipna)
else:
idx = value = pd.Series([], dtype='i8')
if is_series_like(idx):
return pd.DataFrame({'idx': idx, 'value': value})
return pd.DataFrame({'idx': [idx], 'value': [value]})
def idxmaxmin_row(x, fn=None, skipna=True):
minmax = 'max' if fn == 'idxmax' else 'min'
if len(x) > 0:
x = x.set_index('idx')
idx = [getattr(x.value, fn)(skipna=skipna)]
value = [getattr(x.value, minmax)(skipna=skipna)]
else:
idx = value = pd.Series([], dtype='i8')
return pd.DataFrame({'idx': idx, 'value': value})
def idxmaxmin_combine(x, fn=None, skipna=True):
if len(x) == 0:
return x
return (x.groupby(level=0)
.apply(idxmaxmin_row, fn=fn, skipna=skipna)
.reset_index(level=1, drop=True))
def idxmaxmin_agg(x, fn=None, skipna=True, scalar=False):
res = idxmaxmin_combine(x, fn, skipna=skipna)['idx']
if len(res) == 0:
raise ValueError("attempt to get argmax of an empty sequence")
if scalar:
return res[0]
res.name = None
return res
def safe_head(df, n):
r = df.head(n=n)
if len(r) != n:
msg = ("Insufficient elements for `head`. {0} elements "
"requested, only {1} elements available. Try passing larger "
"`npartitions` to `head`.")
warnings.warn(msg.format(n, len(r)))
return r
def maybe_shift_divisions(df, periods, freq):
"""Maybe shift divisions by periods of size freq
Used to shift the divisions for the `shift` method. If freq is a fixed
size (not anchored or relative), the divisions are shifted
appropriately. Otherwise the divisions are cleared.
Parameters
----------
df : dd.DataFrame, dd.Series, or dd.Index
periods : int
The number of periods to shift.
freq : DateOffset, timedelta, or time rule string
The frequency to shift by.
"""
if isinstance(freq, str):
freq = pd.tseries.frequencies.to_offset(freq)
if (isinstance(freq, pd.DateOffset) and
(freq.isAnchored() or not hasattr(freq, 'delta'))):
# Can't infer divisions on relative or anchored offsets, as
# divisions may now split identical index value.
# (e.g. index_partitions = [[1, 2, 3], [3, 4, 5]])
return df.clear_divisions()
if df.known_divisions:
divs = pd.Series(range(len(df.divisions)), index=df.divisions)
divisions = divs.shift(periods, freq=freq).index
return type(df)(df.dask, df._name, df._meta, divisions)
return df
@wraps(pd.to_datetime)
def to_datetime(arg, **kwargs):
meta = pd.Series([pd.Timestamp('2000')])
return map_partitions(pd.to_datetime, arg, meta=meta, **kwargs)
@wraps(pd.to_timedelta)
def to_timedelta(arg, unit='ns', errors='raise'):
meta = pd.Series([pd.Timedelta(1, unit=unit)])
return map_partitions(pd.to_timedelta, arg, unit=unit, errors=errors,
meta=meta)
if hasattr(pd, 'isna'):
@wraps(pd.isna)
def isna(arg):
return map_partitions(pd.isna, arg)
def _repr_data_series(s, index):
"""A helper for creating the ``_repr_data`` property"""
npartitions = len(index) - 1
if is_categorical_dtype(s):
if has_known_categories(s):
dtype = 'category[known]'
else:
dtype = 'category[unknown]'
else:
dtype = str(s.dtype)
return pd.Series([dtype] + ['...'] * npartitions, index=index, name=s.name)
get_parallel_type = Dispatch('get_parallel_type')
@get_parallel_type.register(pd.Series)
def get_parallel_type_series(_):
return Series
@get_parallel_type.register(pd.DataFrame)
def get_parallel_type_dataframe(_):
return DataFrame
@get_parallel_type.register(pd.Index)
def get_parallel_type_index(_):
return Index
@get_parallel_type.register(object)
def get_parallel_type_object(o):
return Scalar
@get_parallel_type.register(_Frame)
def get_parallel_type_frame(o):
return get_parallel_type(o._meta)
def parallel_types():
return tuple(k for k, v in get_parallel_type._lookup.items()
if v is not get_parallel_type_object)
def has_parallel_type(x):
""" Does this object have a dask dataframe equivalent? """
get_parallel_type(x) # trigger lazy registration
return isinstance(x, parallel_types())
def new_dd_object(dsk, name, meta, divisions):
"""Generic constructor for dask.dataframe objects.
Decides the appropriate output class based on the type of `meta` provided.
"""
if has_parallel_type(meta):
return get_parallel_type(meta)(dsk, name, meta, divisions)
elif is_arraylike(meta):
import dask.array as da
chunks = (((np.nan,) * (len(divisions) - 1),) +
tuple((d,) for d in meta.shape[1:]))
if len(chunks) > 1:
layer = dsk.layers[name]
if isinstance(layer, Blockwise):
layer.new_axes['j'] = chunks[1][0]
layer.output_indices = layer.output_indices + ('j',)
else:
suffix = (0,) * (len(chunks) - 1)
for i in range(len(chunks[0])):
layer[(name, i) + suffix] = layer.pop((name, i))
return da.Array(dsk, name=name, chunks=chunks, dtype=meta.dtype)
else:
return get_parallel_type(meta)(dsk, name, meta, divisions)
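# Hedged sketch (``graph`` and ``name`` below are assumed to already exist):
# the output class is chosen from ``meta``, so a pandas DataFrame meta yields a
# dask DataFrame, a Series meta a dask Series, an Index meta a dask Index, and
# an array-like meta falls through to a dask Array.
#
#     ddf = new_dd_object(graph, name, pd.DataFrame({'x': [1]}), [None, None])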
def partitionwise_graph(func, name, *args, **kwargs):
"""
Apply a function partition-wise across arguments to create a layer of a graph
This applies a function, ``func``, in an embarrassingly parallel fashion
across partitions/chunks in the provided arguments. It handles DataFrames,
Arrays, and scalars smoothly, and relies on the ``blockwise`` machinery
to provide a nicely symbolic graph.
It is most commonly used in other graph-building functions to create the
appropriate layer of the resulting dataframe.
Parameters
----------
func: callable
name: str
descriptive name for the operation
*args:
**kwargs:
Returns
-------
out: Blockwise graph
Examples
--------
>>> subgraph = partitionwise_graph(function, name, x, y, z=123) # doctest: +SKIP
>>> layer = partitionwise_graph(function, name, df, x, z=123) # doctest: +SKIP
>>> graph = HighLevelGraph.from_collections(name, layer, dependencies=[df, x]) # doctest: +SKIP
>>> result = new_dd_object(graph, name, metadata, df.divisions) # doctest: +SKIP
See Also
--------
map_partitions
"""
pairs = []
numblocks = {}
for arg in args:
if isinstance(arg, _Frame):
pairs.extend([arg._name, 'i'])
numblocks[arg._name] = (arg.npartitions,)
elif isinstance(arg, Scalar):
pairs.extend([arg._name, 'i'])
numblocks[arg._name] = (1,)
elif isinstance(arg, Array):
if arg.ndim == 1:
pairs.extend([arg.name, 'i'])
elif arg.ndim == 0:
pairs.extend([arg.name, ''])
elif arg.ndim == 2:
pairs.extend([arg.name, 'ij'])
else:
raise ValueError("Can't add multi-dimensional array to dataframes")
numblocks[arg._name] = arg.numblocks
else:
pairs.extend([arg, None])
return blockwise(func, name, 'i', *pairs, numblocks=numblocks, concatenate=True, **kwargs)