from __future__ import absolute_import, division, print_function

import operator
from functools import partial, wraps
from itertools import product, repeat
from math import factorial, log, ceil

import numpy as np
from numbers import Integral

from toolz import compose, partition_all, get, accumulate, pluck

from . import chunk
from .core import _concatenate2, Array, handle_out
from .blockwise import blockwise
from ..blockwise import lol_tuples
from .creation import arange
from .ufunc import sqrt
from .utils import validate_axis
from .wrap import zeros, ones
from .numpy_compat import ma_divide, divide as np_divide
from ..compatibility import getargspec, builtins
from ..base import tokenize
from ..highlevelgraph import HighLevelGraph
from ..utils import ignoring, funcname, Dispatch, deepmap
from .. import config

# Generic functions to support chunks of different types
empty_lookup = Dispatch('empty')
empty_lookup.register((object, np.ndarray), np.empty)
empty_lookup.register(np.ma.masked_array, np.ma.empty)
divide_lookup = Dispatch('divide')
divide_lookup.register((object, np.ndarray), np_divide)
divide_lookup.register(np.ma.masked_array, ma_divide)


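# ``divide`` dispatches on whichever operand has the higher
# ``__array_priority__``, so that e.g. mixing a ``np.ma.masked_array``
# (which outranks a plain ``np.ndarray``) with an ordinary array uses
# ``ma_divide``.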
def divide(a, b, dtype=None):
    key = lambda x: getattr(x, '__array_priority__', float('-inf'))
    f = divide_lookup.dispatch(type(builtins.max(a, b, key=key)))
    return f(a, b, dtype=dtype)


def reduction(x, chunk, aggregate, axis=None, keepdims=False, dtype=None,
              split_every=None, combine=None, name=None, out=None,
              concatenate=True, output_size=1):
    """ General version of reductions

    Parameters
    ----------
    x: Array
        Data being reduced along one or more axes
    chunk: callable(x_chunk, axis, keepdims)
        First function to be executed when resolving the dask graph.
        This function is applied in parallel to all original chunks of x.
        See below for function parameters.
    combine: callable(x_chunk, axis, keepdims), optional
        Function used for intermediate recursive aggregation (see
        split_every below). If omitted, it defaults to aggregate.
        If the reduction can be performed in fewer than 3 steps, it will
        not be invoked at all.
    aggregate: callable(x_chunk, axis, keepdims)
        Last function to be executed when resolving the dask graph,
        producing the final output. It is always invoked, even when the
        reduced Array has a single chunk along the reduced axes.
    axis: int or sequence of ints, optional
        Axis or axes to aggregate upon. If omitted, aggregate along all axes.
    keepdims: boolean, optional
        Whether the reduction function should preserve the reduced axes,
        leaving them at size ``output_size``, or remove them.
    dtype: np.dtype
        dtype of the output. This low-level function requires it; a
        ``ValueError`` is raised if it is omitted.
    split_every: int >= 2 or dict(axis: int), optional
        Determines the depth of the recursive aggregation. If set to a
        value greater than or equal to the number of input chunks, the
        aggregation will be performed in two steps, one ``chunk`` function
        per input chunk and a single ``aggregate`` function at the end. If
        set to less than that, an intermediate ``combine`` function will
        be used, so that any one ``combine`` or ``aggregate`` function has
        no more than ``split_every`` inputs. The depth of the aggregation
        graph will be
        :math:`\log_{split\_every}(\text{input chunks along reduced axes})`.
        Setting to a low value can reduce cache size and network
        transfers, at the cost of more CPU and a larger dask graph.

        Omit to let dask heuristically decide a good default. A default
        can also be set globally with the ``split_every`` key in
        :mod:`dask.config`.
    name: str, optional
        Prefix of the keys of the intermediate and output nodes. If
        omitted it defaults to the function names.
    out: Array, optional
        Another dask array whose contents will be replaced. Omit to
        create a new one. Note that, unlike in numpy, this setting gives
        no performance benefits whatsoever, but can still be useful if one
        needs to preserve the references to a previously existing Array.
    concatenate: bool, optional
        If True (the default), the outputs of the ``chunk``/``combine``
        functions are concatenated into a single np.array before being
        passed to the ``combine``/``aggregate`` functions. If False, the
        input of ``combine`` and ``aggregate`` will be either a list of
        the raw outputs of the previous step or a single output, and the
        function will have to concatenate it itself. It can be useful to
        set this to False if the chunk and/or combine steps do not produce
        np.arrays.
    output_size: int >= 1, optional
        Size of the output of the ``aggregate`` function along the reduced
        axes. Ignored if keepdims is False.

    Returns
    -------
    dask array

    **Function Parameters**

    x_chunk: numpy.ndarray
        Individual input chunk. For ``chunk`` functions, it is one of the
        original chunks of x. For ``combine`` and ``aggregate`` functions,
        it's the concatenation of the outputs produced by the previous
        ``chunk`` or ``combine`` functions. If concatenate=False, it's a
        list of the raw outputs from the previous functions.
    axis: tuple
        Normalized list of axes to reduce upon, e.g. ``(0,)``.
        Scalar, negative, and None axes have been normalized away.
        Note that some numpy reduction functions cannot reduce along
        multiple axes at once and strictly require an int in input. Such
        functions have to be wrapped to cope.
    keepdims: bool
        Whether the reduction function should preserve the reduced axes
        or remove them.
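
    Examples
    --------
    A sketch of how ``sum`` below is built on this machinery (the real
    wrapper also infers the output dtype from ``a.dtype``):

    >>> reduction(x, chunk.sum, chunk.sum, axis=axis,
    ...           keepdims=keepdims, dtype='f8')  # doctest: +SKIP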
"""
|
||
|
if axis is None:
|
||
|
axis = tuple(range(x.ndim))
|
||
|
if isinstance(axis, Integral):
|
||
|
axis = (axis,)
|
||
|
axis = validate_axis(axis, x.ndim)
|
||
|
|
||
|
if dtype is None:
|
||
|
raise ValueError("Must specify dtype")
|
||
|
if 'dtype' in getargspec(chunk).args:
|
||
|
chunk = partial(chunk, dtype=dtype)
|
||
|
if 'dtype' in getargspec(aggregate).args:
|
||
|
aggregate = partial(aggregate, dtype=dtype)
|
||
|
|
||
|
# Map chunk across all blocks
|
||
|
inds = tuple(range(x.ndim))
|
||
|
# The dtype of `tmp` doesn't actually matter, and may be incorrect.
|
||
|
tmp = blockwise(chunk, inds, x, inds, axis=axis, keepdims=True, dtype=x.dtype)
|
||
|
tmp._chunks = tuple((output_size, ) * len(c) if i in axis else c
|
||
|
for i, c in enumerate(tmp.chunks))
|
||
|
result = _tree_reduce(tmp, aggregate, axis, keepdims, dtype, split_every,
|
||
|
combine, name=name, concatenate=concatenate)
|
||
|
if keepdims and output_size != 1:
|
||
|
result._chunks = tuple((output_size, ) if i in axis else c
|
||
|
for i, c in enumerate(tmp.chunks))
|
||
|
return handle_out(out, result)
|
||
|
|
||
|
|
||
|
def _tree_reduce(x, aggregate, axis, keepdims, dtype, split_every=None,
                 combine=None, name=None, concatenate=True):
    """ Perform the tree reduction step of a reduction.

    Lower level; users should use ``reduction`` or ``arg_reduction``
    directly.
    """
    # Normalize split_every
    split_every = split_every or config.get('split_every', 4)
    if isinstance(split_every, dict):
        split_every = dict((k, split_every.get(k, 2)) for k in axis)
    elif isinstance(split_every, Integral):
        n = builtins.max(int(split_every ** (1 / (len(axis) or 1))), 2)
        split_every = dict.fromkeys(axis, n)
    else:
        raise ValueError("split_every must be an int or a dict")

    # Reduce across intermediates
    depth = 1
    for i, n in enumerate(x.numblocks):
        if i in split_every and split_every[i] != 1:
            depth = int(builtins.max(depth, ceil(log(n, split_every[i]))))
    func = partial(combine or aggregate, axis=axis, keepdims=True)
    if concatenate:
        func = compose(func, partial(_concatenate2, axes=axis))
    for i in range(depth - 1):
        x = partial_reduce(func, x, split_every, True, dtype=dtype,
                           name=(name or funcname(combine or aggregate))
                           + '-partial')
    func = partial(aggregate, axis=axis, keepdims=keepdims)
    if concatenate:
        func = compose(func, partial(_concatenate2, axes=axis))
    return partial_reduce(func, x, split_every, keepdims=keepdims,
                          dtype=dtype,
                          name=(name or funcname(aggregate)) + '-aggregate')


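# Illustration: in ``_tree_reduce``, 32 input chunks along one axis with
# ``split_every=4`` give ``depth = ceil(log(32, 4)) = 3``, i.e. two rounds
# of ``combine`` (32 -> 8 -> 2 blocks) followed by one final ``aggregate``.

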
def partial_reduce(func, x, split_every, keepdims=False, dtype=None,
                   name=None):
    """ Partial reduction across multiple axes.

    Parameters
    ----------
    func : function
    x : Array
    split_every : dict
        Maximum reduction block sizes in each dimension.

    Examples
    --------
    Reduce across axes 0 and 2, merging a maximum of 1 block in the 0th
    dimension, and 3 blocks in the 2nd dimension:

    >>> partial_reduce(np.min, x, {0: 1, 2: 3})  # doctest: +SKIP
    """
    name = (name or funcname(func)) + '-' + tokenize(func, x, split_every,
                                                     keepdims, dtype)
    parts = [list(partition_all(split_every.get(i, 1), range(n))) for (i, n)
             in enumerate(x.numblocks)]
    keys = product(*map(range, map(len, parts)))
    out_chunks = [tuple(1 for p in partition_all(split_every[i], c)) if i
                  in split_every else c for (i, c) in enumerate(x.chunks)]
    if not keepdims:
        out_axis = [i for i in range(x.ndim) if i not in split_every]
        getter = lambda k: get(out_axis, k)
        keys = map(getter, keys)
        out_chunks = list(getter(out_chunks))
    dsk = {}
    for k, p in zip(keys, product(*parts)):
        decided = dict((i, j[0]) for (i, j) in enumerate(p) if len(j) == 1)
        dummy = dict(i for i in enumerate(p) if i[0] not in decided)
        g = lol_tuples((x.name,), range(x.ndim), decided, dummy)
        dsk[(name,) + k] = (func, g)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
    return Array(graph, name, out_chunks, dtype=dtype)


@wraps(chunk.sum)
def sum(a, axis=None, dtype=None, keepdims=False, split_every=None, out=None):
    if dtype is not None:
        dt = dtype
    else:
        # Infer the output dtype by reducing a dummy one-element array of
        # a's dtype (e.g. small integer dtypes promote when summed).
        dt = getattr(np.empty((1,), dtype=a.dtype).sum(), 'dtype', object)
    result = reduction(a, chunk.sum, chunk.sum, axis=axis, keepdims=keepdims,
                       dtype=dt, split_every=split_every, out=out)
    return result


@wraps(chunk.prod)
def prod(a, axis=None, dtype=None, keepdims=False, split_every=None,
         out=None):
    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(np.empty((1,), dtype=a.dtype).prod(), 'dtype', object)
    return reduction(a, chunk.prod, chunk.prod, axis=axis, keepdims=keepdims,
                     dtype=dt, split_every=split_every, out=out)


@wraps(chunk.min)
def min(a, axis=None, keepdims=False, split_every=None, out=None):
    return reduction(a, chunk.min, chunk.min, axis=axis, keepdims=keepdims,
                     dtype=a.dtype, split_every=split_every, out=out)


@wraps(chunk.max)
def max(a, axis=None, keepdims=False, split_every=None, out=None):
    return reduction(a, chunk.max, chunk.max, axis=axis, keepdims=keepdims,
                     dtype=a.dtype, split_every=split_every, out=out)


@wraps(chunk.any)
def any(a, axis=None, keepdims=False, split_every=None, out=None):
    return reduction(a, chunk.any, chunk.any, axis=axis, keepdims=keepdims,
                     dtype='bool', split_every=split_every, out=out)


@wraps(chunk.all)
def all(a, axis=None, keepdims=False, split_every=None, out=None):
    return reduction(a, chunk.all, chunk.all, axis=axis, keepdims=keepdims,
                     dtype='bool', split_every=split_every, out=out)


@wraps(chunk.nansum)
def nansum(a, axis=None, dtype=None, keepdims=False, split_every=None,
           out=None):
    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(chunk.nansum(np.empty((1,), dtype=a.dtype)), 'dtype',
                     object)
    return reduction(a, chunk.nansum, chunk.sum, axis=axis,
                     keepdims=keepdims, dtype=dt, split_every=split_every,
                     out=out)


with ignoring(AttributeError):
    @wraps(chunk.nanprod)
    def nanprod(a, axis=None, dtype=None, keepdims=False, split_every=None,
                out=None):
        if dtype is not None:
            dt = dtype
        else:
            dt = getattr(chunk.nansum(np.empty((1,), dtype=a.dtype)),
                         'dtype', object)
        return reduction(a, chunk.nanprod, chunk.prod, axis=axis,
                         keepdims=keepdims, dtype=dt,
                         split_every=split_every, out=out)


@wraps(chunk.nancumsum)
def nancumsum(x, axis, dtype=None, out=None):
    return cumreduction(chunk.nancumsum, operator.add, 0, x, axis, dtype,
                        out=out)


@wraps(chunk.nancumprod)
def nancumprod(x, axis, dtype=None, out=None):
    return cumreduction(chunk.nancumprod, operator.mul, 1, x, axis, dtype,
                        out=out)


@wraps(chunk.nanmin)
def nanmin(a, axis=None, keepdims=False, split_every=None, out=None):
    return reduction(a, chunk.nanmin, chunk.nanmin, axis=axis,
                     keepdims=keepdims, dtype=a.dtype,
                     split_every=split_every, out=out)


@wraps(chunk.nanmax)
def nanmax(a, axis=None, keepdims=False, split_every=None, out=None):
    return reduction(a, chunk.nanmax, chunk.nanmax, axis=axis,
                     keepdims=keepdims, dtype=a.dtype,
                     split_every=split_every, out=out)


def numel(x, **kwargs):
    """ A reduction to count the number of elements """
    return chunk.sum(np.ones_like(x), **kwargs)


def nannumel(x, **kwargs):
    """ A reduction to count the number of non-NaN elements """
    return chunk.sum(~np.isnan(x), **kwargs)


def mean_chunk(x, sum=chunk.sum, numel=numel, dtype='f8', **kwargs):
    n = numel(x, dtype=dtype, **kwargs)
    total = sum(x, dtype=dtype, **kwargs)
    return {'n': n, 'total': total}


def mean_combine(pairs, sum=chunk.sum, numel=numel, dtype='f8', axis=None,
                 **kwargs):
    if not isinstance(pairs, list):
        pairs = [pairs]
    ns = deepmap(lambda pair: pair['n'], pairs)
    totals = deepmap(lambda pair: pair['total'], pairs)
    n = _concatenate2(ns, axes=axis).sum(axis=axis, **kwargs)
    total = _concatenate2(totals, axes=axis).sum(axis=axis, **kwargs)
    return {'n': n, 'total': total}


def mean_agg(pairs, dtype='f8', axis=None, **kwargs):
    ns = deepmap(lambda pair: pair['n'], pairs)
    totals = deepmap(lambda pair: pair['total'], pairs)
    n = _concatenate2(ns, axes=axis).sum(axis=axis, dtype=dtype, **kwargs)
    total = _concatenate2(totals, axes=axis).sum(axis=axis, dtype=dtype,
                                                 **kwargs)

    return divide(total, n, dtype=dtype)


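# Note how the mean machinery above never ships raw data between steps: each
# chunk emits a ``{'n': count, 'total': sum}`` dict, ``mean_combine`` sums
# the partial counts and totals, and ``mean_agg`` performs the final
# ``total / n`` division.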
@wraps(chunk.mean)
def mean(a, axis=None, dtype=None, keepdims=False, split_every=None,
         out=None):
    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(np.mean(np.empty(shape=(1,), dtype=a.dtype)), 'dtype',
                     object)
    return reduction(a, mean_chunk, mean_agg, axis=axis, keepdims=keepdims,
                     dtype=dt, split_every=split_every, combine=mean_combine,
                     out=out, concatenate=False)


def nanmean(a, axis=None, dtype=None, keepdims=False, split_every=None,
            out=None):
    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(np.mean(np.empty(shape=(1,), dtype=a.dtype)), 'dtype',
                     object)
    return reduction(a, partial(mean_chunk, sum=chunk.nansum, numel=nannumel),
                     mean_agg, axis=axis, keepdims=keepdims, dtype=dt,
                     split_every=split_every, out=out,
                     concatenate=False,
                     combine=partial(mean_combine, sum=chunk.nansum,
                                     numel=nannumel))


with ignoring(AttributeError):
    nanmean = wraps(chunk.nanmean)(nanmean)


def moment_chunk(A, order=2, sum=chunk.sum, numel=numel, dtype='f8',
                 **kwargs):
    total = sum(A, dtype=dtype, **kwargs)
    n = numel(A, **kwargs).astype(np.int64)
    u = total / n
    xs = [sum((A - u)**i, dtype=dtype, **kwargs) for i in range(2, order + 1)]
    M = np.stack(xs, axis=-1)
    return {'total': total, 'n': n, 'M': M}


def _moment_helper(Ms, ns, inner_term, order, sum, axis, kwargs):
    M = (Ms[..., order - 2].sum(axis=axis, **kwargs)
         + sum(ns * inner_term ** order, axis=axis, **kwargs))
    for k in range(1, order - 1):
        coeff = factorial(order) / (factorial(k) * factorial(order - k))
        M += coeff * sum(Ms[..., order - k - 2] * inner_term**k, axis=axis,
                         **kwargs)
    return M


def moment_combine(pairs, order=2, ddof=0, dtype='f8', sum=np.sum, axis=None,
                   **kwargs):
    if not isinstance(pairs, list):
        pairs = [pairs]
    totals = _concatenate2(deepmap(lambda pair: pair['total'], pairs),
                           axes=axis)
    ns = _concatenate2(deepmap(lambda pair: pair['n'], pairs), axes=axis)
    Ms = _concatenate2(deepmap(lambda pair: pair['M'], pairs), axes=axis)

    kwargs['dtype'] = dtype
    kwargs['keepdims'] = True

    total = totals.sum(axis=axis, **kwargs)
    n = ns.sum(axis=axis, **kwargs)
    mu = divide(total, n, dtype=dtype)
    inner_term = divide(totals, ns, dtype=dtype) - mu

    xs = [_moment_helper(Ms, ns, inner_term, o, sum, axis, kwargs)
          for o in range(2, order + 1)]
    M = np.stack(xs, axis=-1)
    return {'total': total, 'n': n, 'M': M}


def moment_agg(pairs, order=2, ddof=0, dtype='f8', sum=np.sum, axis=None,
               **kwargs):
    if not isinstance(pairs, list):
        pairs = [pairs]
    totals = _concatenate2(deepmap(lambda pair: pair['total'], pairs),
                           axes=axis)
    ns = _concatenate2(deepmap(lambda pair: pair['n'], pairs), axes=axis)
    Ms = _concatenate2(deepmap(lambda pair: pair['M'], pairs), axes=axis)

    kwargs['dtype'] = dtype
    # To properly handle ndarrays, the original dimensions need to be kept
    # for part of the calculation.
    keepdim_kw = kwargs.copy()
    keepdim_kw['keepdims'] = True

    n = ns.sum(axis=axis, **keepdim_kw)
    mu = divide(totals.sum(axis=axis, **keepdim_kw), n, dtype=dtype)
    inner_term = divide(totals, ns, dtype=dtype) - mu

    M = _moment_helper(Ms, ns, inner_term, order, sum, axis, kwargs)
    return divide(M, n.sum(axis=axis, **kwargs) - ddof, dtype=dtype)


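# The combine/aggregate steps above merge per-chunk central moments with the
# standard pairwise update formula: writing ``d_j = mean_j - mean`` for the
# distance of each partial mean from the merged mean, the merged p-th moment
# is
#
#     M_p = sum_j M_p,j + sum_j n_j * d_j**p
#           + sum_{k=1}^{p-2} C(p, k) * sum_j M_{p-k},j * d_j**k
#
# ``_moment_helper`` computes exactly this, with ``coeff`` as the binomial
# coefficient C(p, k) and ``inner_term`` holding the ``d_j`` values.

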
def moment(a, order, axis=None, dtype=None, keepdims=False, ddof=0,
           split_every=None, out=None):
    if not isinstance(order, Integral) or order < 0:
        raise ValueError("Order must be an integer >= 0")

    if order < 2:
        reduced = a.sum(axis=axis)   # get reduced shape and chunks
        if order == 0:
            # When order equals 0, the result is 1, by definition.
            return ones(reduced.shape, chunks=reduced.chunks, dtype='f8')
        # By definition the first moment about the mean is 0.
        return zeros(reduced.shape, chunks=reduced.chunks, dtype='f8')

    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(np.var(np.ones(shape=(1,), dtype=a.dtype)), 'dtype',
                     object)
    return reduction(a, partial(moment_chunk, order=order),
                     partial(moment_agg, order=order, ddof=ddof),
                     axis=axis, keepdims=keepdims,
                     dtype=dt, split_every=split_every, out=out,
                     concatenate=False,
                     combine=partial(moment_combine, order=order))


@wraps(chunk.var)
def var(a, axis=None, dtype=None, keepdims=False, ddof=0, split_every=None,
        out=None):
    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(np.var(np.ones(shape=(1,), dtype=a.dtype)), 'dtype',
                     object)
    return reduction(a, moment_chunk, partial(moment_agg, ddof=ddof),
                     axis=axis, keepdims=keepdims, dtype=dt,
                     split_every=split_every, combine=moment_combine,
                     name='var', out=out, concatenate=False)


def nanvar(a, axis=None, dtype=None, keepdims=False, ddof=0, split_every=None,
           out=None):
    if dtype is not None:
        dt = dtype
    else:
        dt = getattr(np.var(np.ones(shape=(1,), dtype=a.dtype)), 'dtype',
                     object)
    return reduction(a, partial(moment_chunk, sum=chunk.nansum,
                                numel=nannumel),
                     partial(moment_agg, sum=np.nansum, ddof=ddof), axis=axis,
                     keepdims=keepdims, dtype=dt, split_every=split_every,
                     combine=partial(moment_combine, sum=np.nansum), out=out,
                     concatenate=False)


with ignoring(AttributeError):
    nanvar = wraps(chunk.nanvar)(nanvar)


@wraps(chunk.std)
def std(a, axis=None, dtype=None, keepdims=False, ddof=0, split_every=None,
        out=None):
    result = sqrt(a.var(axis=axis, dtype=dtype, keepdims=keepdims, ddof=ddof,
                        split_every=split_every, out=out))
    if dtype and dtype != result.dtype:
        result = result.astype(dtype)
    return result


def nanstd(a, axis=None, dtype=None, keepdims=False, ddof=0, split_every=None,
           out=None):
    result = sqrt(nanvar(a, axis=axis, dtype=dtype, keepdims=keepdims,
                         ddof=ddof, split_every=split_every, out=out))
    if dtype and dtype != result.dtype:
        result = result.astype(dtype)
    return result


with ignoring(AttributeError):
    nanstd = wraps(chunk.nanstd)(nanstd)


def _arg_combine(data, axis, argfunc, keepdims=False):
    """ Merge intermediate results from ``arg_*`` functions """
    axis = None if len(axis) == data.ndim or data.ndim == 1 else axis[0]
    vals = data['vals']
    arg = data['arg']
    if axis is None:
        local_args = argfunc(vals, axis=axis, keepdims=keepdims)
        vals = vals.ravel()[local_args]
        arg = arg.ravel()[local_args]
    else:
        local_args = argfunc(vals, axis=axis)
        inds = np.ogrid[tuple(map(slice, local_args.shape))]
        inds.insert(axis, local_args)
        inds = tuple(inds)
        vals = vals[inds]
        arg = arg[inds]
        if keepdims:
            vals = np.expand_dims(vals, axis)
            arg = np.expand_dims(arg, axis)
    return arg, vals


def arg_chunk(func, argfunc, x, axis, offset_info):
    arg_axis = None if len(axis) == x.ndim or x.ndim == 1 else axis[0]
    vals = func(x, axis=arg_axis, keepdims=True)
    arg = argfunc(x, axis=arg_axis, keepdims=True)
    if arg_axis is None:
        # Reducing over all axes: translate the flat local index into a
        # flat index into the whole array.
        offset, total_shape = offset_info
        ind = np.unravel_index(arg.ravel()[0], x.shape)
        total_ind = tuple(o + i for (o, i) in zip(offset, ind))
        arg[:] = np.ravel_multi_index(total_ind, total_shape)
    else:
        # Reducing over a single axis: shift by this chunk's offset along it.
        arg += offset_info

    if isinstance(vals, np.ma.masked_array):
        if 'min' in argfunc.__name__:
            fill_value = np.ma.minimum_fill_value(vals)
        else:
            fill_value = np.ma.maximum_fill_value(vals)
        vals = np.ma.filled(vals, fill_value)

    # Values and their global indices travel together in a structured array.
    result = np.empty(shape=vals.shape, dtype=[('vals', vals.dtype),
                                               ('arg', arg.dtype)])
    result['vals'] = vals
    result['arg'] = arg
    return result


def arg_combine(func, argfunc, data, axis=None, **kwargs):
    arg, vals = _arg_combine(data, axis, argfunc, keepdims=True)
    result = np.empty(shape=vals.shape, dtype=[('vals', vals.dtype),
                                               ('arg', arg.dtype)])
    result['vals'] = vals
    result['arg'] = arg
    return result


def arg_agg(func, argfunc, data, axis=None, **kwargs):
    return _arg_combine(data, axis, argfunc, keepdims=False)[0]


def nanarg_agg(func, argfunc, data, axis=None, **kwargs):
    arg, vals = _arg_combine(data, axis, argfunc, keepdims=False)
    if np.any(np.isnan(vals)):
        raise ValueError("All NaN slice encountered")
    return arg


def arg_reduction(x, chunk, combine, agg, axis=None, split_every=None,
                  out=None):
    """ Generic function for argreduction.

    Parameters
    ----------
    x : Array
    chunk : callable
        Partialed ``arg_chunk``.
    combine : callable
        Partialed ``arg_combine``.
    agg : callable
        Partialed ``arg_agg``.
    axis : int, optional
    split_every : int or dict, optional
    """
    if axis is None:
        axis = tuple(range(x.ndim))
        ravel = True
    elif isinstance(axis, Integral):
        axis = validate_axis(axis, x.ndim)
        axis = (axis,)
        ravel = x.ndim == 1
    else:
        raise TypeError("axis must be either `None` or int, "
                        "got '{0}'".format(axis))

    for ax in axis:
        chunks = x.chunks[ax]
        if len(chunks) > 1 and np.isnan(chunks).any():
            raise ValueError(
                "Arg-reductions do not work with arrays that have "
                "unknown chunksizes. At some point in your computation "
                "this array lost chunking information"
            )

    # Map chunk across all blocks
    name = 'arg-reduce-{0}'.format(tokenize(axis, x, chunk,
                                            combine, split_every))
    old = x.name
    keys = list(product(*map(range, x.numblocks)))
    offsets = list(product(*(accumulate(operator.add, bd[:-1], 0)
                             for bd in x.chunks)))
    if ravel:
        offset_info = zip(offsets, repeat(x.shape))
    else:
        offset_info = pluck(axis[0], offsets)

    chunks = tuple((1, ) * len(c) if i in axis else c for (i, c)
                   in enumerate(x.chunks))
    dsk = dict(((name,) + k, (chunk, (old,) + k, axis, off)) for (k, off)
               in zip(keys, offset_info))
    # The dtype of `tmp` doesn't actually matter, just need to provide
    # something
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
    tmp = Array(graph, name, chunks, dtype=x.dtype)
    # dtype of the final indices, as returned by np.argmin
    dtype = np.argmin([1]).dtype
    result = _tree_reduce(tmp, agg, axis, False, dtype, split_every, combine)
    return handle_out(out, result)


def make_arg_reduction(func, argfunc, is_nan_func=False):
    """ Create an argreduction callable

    Parameters
    ----------
    func : callable
        The reduction (e.g. ``min``)
    argfunc : callable
        The argreduction (e.g. ``argmin``)
    is_nan_func : bool, optional
        Whether ``argfunc`` is a NaN-skipping variant; if so, the
        aggregate step raises ``ValueError`` on all-NaN slices.
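
    Examples
    --------
    This is how the module-level ``argmin`` is defined below:

    >>> argmin = make_arg_reduction(chunk.min, chunk.argmin)  # doctest: +SKIP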
"""
|
||
|
chunk = partial(arg_chunk, func, argfunc)
|
||
|
combine = partial(arg_combine, func, argfunc)
|
||
|
if is_nan_func:
|
||
|
agg = partial(nanarg_agg, func, argfunc)
|
||
|
else:
|
||
|
agg = partial(arg_agg, func, argfunc)
|
||
|
|
||
|
@wraps(argfunc)
|
||
|
def _(x, axis=None, split_every=None, out=None):
|
||
|
return arg_reduction(x, chunk, combine, agg, axis,
|
||
|
split_every=split_every, out=out)
|
||
|
|
||
|
return _
|
||
|
|
||
|
|
||
|
def _nanargmin(x, axis, **kwargs):
    try:
        return chunk.nanargmin(x, axis, **kwargs)
    except ValueError:
        # An all-NaN chunk makes nanargmin raise; substitute +inf so a local
        # winner exists. The all-NaN check happens later, in nanarg_agg.
        return chunk.nanargmin(np.where(np.isnan(x), np.inf, x), axis,
                               **kwargs)


def _nanargmax(x, axis, **kwargs):
    try:
        return chunk.nanargmax(x, axis, **kwargs)
    except ValueError:
        return chunk.nanargmax(np.where(np.isnan(x), -np.inf, x), axis,
                               **kwargs)


argmin = make_arg_reduction(chunk.min, chunk.argmin)
argmax = make_arg_reduction(chunk.max, chunk.argmax)
nanargmin = make_arg_reduction(chunk.nanmin, _nanargmin, True)
nanargmax = make_arg_reduction(chunk.nanmax, _nanargmax, True)


def cumreduction(func, binop, ident, x, axis=None, dtype=None, out=None):
    """ Generic function for cumulative reduction

    Parameters
    ----------
    func: callable
        Cumulative function like np.cumsum or np.cumprod
    binop: callable
        Associated binary operator like ``np.cumsum->add`` or
        ``np.cumprod->mul``
    ident: Number
        Associated identity like ``np.cumsum->0`` or ``np.cumprod->1``
    x: dask Array
    axis: int
    dtype: dtype

    Returns
    -------
    dask array

    See also
    --------
    cumsum
    cumprod
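
    Examples
    --------
    ``nancumsum`` above is defined through this machinery:

    >>> cumreduction(chunk.nancumsum, operator.add, 0, x, axis,
    ...              dtype)  # doctest: +SKIP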
"""
|
||
|
if axis is None:
|
||
|
x = x.flatten()
|
||
|
axis = 0
|
||
|
if dtype is None:
|
||
|
dtype = getattr(func(np.empty((0,), dtype=x.dtype)), 'dtype', object)
|
||
|
assert isinstance(axis, Integral)
|
||
|
axis = validate_axis(axis, x.ndim)
|
||
|
|
||
|
m = x.map_blocks(func, axis=axis, dtype=dtype)
|
||
|
|
||
|
name = '{0}-{1}'.format(func.__name__, tokenize(func, axis, binop,
|
||
|
ident, x, dtype))
|
||
|
n = x.numblocks[axis]
|
||
|
full = slice(None, None, None)
|
||
|
slc = (full,) * axis + (slice(-1, None),) + (full,) * (x.ndim - axis - 1)
|
||
|
|
||
|
indices = list(product(*[range(nb) if i != axis else [0]
|
||
|
for i, nb in enumerate(x.numblocks)]))
|
||
|
dsk = dict()
|
||
|
for ind in indices:
|
||
|
shape = tuple(x.chunks[i][ii] if i != axis else 1
|
||
|
for i, ii in enumerate(ind))
|
||
|
dsk[(name, 'extra') + ind] = (np.full, shape, ident, m.dtype)
|
||
|
dsk[(name,) + ind] = (m.name,) + ind
|
||
|
|
||
|
for i in range(1, n):
|
||
|
last_indices = indices
|
||
|
indices = list(product(*[range(nb) if ii != axis else [i]
|
||
|
for ii, nb in enumerate(x.numblocks)]))
|
||
|
for old, ind in zip(last_indices, indices):
|
||
|
this_slice = (name, 'extra') + ind
|
||
|
dsk[this_slice] = (binop, (name, 'extra') + old,
|
||
|
(operator.getitem, (m.name,) + old, slc))
|
||
|
dsk[(name,) + ind] = (binop, this_slice, (m.name,) + ind)
|
||
|
|
||
|
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[m])
|
||
|
result = Array(graph, name, x.chunks, m.dtype)
|
||
|
return handle_out(out, result)
|
||
|
|
||
|
|
||
|
def _cumsum_merge(a, b):
    if isinstance(a, np.ma.masked_array) or isinstance(b, np.ma.masked_array):
        # Combine the raw data but keep b's mask: positions masked in the
        # newer block stay masked in the merged result.
        values = np.ma.getdata(a) + np.ma.getdata(b)
        return np.ma.masked_array(values, mask=np.ma.getmaskarray(b))
    return a + b


def _cumprod_merge(a, b):
    if isinstance(a, np.ma.masked_array) or isinstance(b, np.ma.masked_array):
        values = np.ma.getdata(a) * np.ma.getdata(b)
        return np.ma.masked_array(values, mask=np.ma.getmaskarray(b))
    return a * b


@wraps(np.cumsum)
def cumsum(x, axis=None, dtype=None, out=None):
    return cumreduction(np.cumsum, _cumsum_merge, 0, x, axis, dtype, out=out)


@wraps(np.cumprod)
def cumprod(x, axis=None, dtype=None, out=None):
    return cumreduction(np.cumprod, _cumprod_merge, 1, x, axis, dtype,
                        out=out)


def topk(a, k, axis=-1, split_every=None):
    """ Extract the k largest elements from a on the given axis,
    and return them sorted from largest to smallest.
    If k is negative, extract the -k smallest elements instead,
    and return them sorted from smallest to largest.

    This performs best when ``k`` is much smaller than the chunk size. All
    results will be returned in a single chunk along the given axis.

    Parameters
    ----------
    a: Array
        Data being sorted
    k: int
    axis: int, optional
    split_every: int >= 2, optional
        See :func:`reduction`. This parameter becomes very important when
        k is on the same order of magnitude as the chunk size or more, as
        it prevents getting the whole or a significant portion of the
        input array in memory all at once, with a negative impact on
        network transfer too when running on distributed.

    Returns
    -------
    Selection of a with size abs(k) along the given axis.

    Examples
    --------
    >>> import dask.array as da
    >>> x = np.array([5, 1, 3, 6])
    >>> d = da.from_array(x, chunks=2)
    >>> d.topk(2).compute()
    array([6, 5])
    >>> d.topk(-2).compute()
    array([1, 3])
    """
    axis = validate_axis(axis, a.ndim)

    # chunk and combine steps of the reduction, which recursively invoke
    # np.partition to pick the top/bottom k elements from the previous step.
    # The selection is not sorted internally.
    chunk_combine = partial(chunk.topk, k=k)
    # aggregate step of the reduction. Internally invokes the chunk/combine
    # function, then sorts the results internally.
    aggregate = partial(chunk.topk_aggregate, k=k)

    return reduction(
        a, chunk=chunk_combine, combine=chunk_combine, aggregate=aggregate,
        axis=axis, keepdims=True, dtype=a.dtype, split_every=split_every,
        output_size=abs(k))


def argtopk(a, k, axis=-1, split_every=None):
    """ Extract the indices of the k largest elements from a on the given
    axis, and return them sorted from largest to smallest. If k is negative,
    extract the indices of the -k smallest elements instead, and return them
    sorted from smallest to largest.

    This performs best when ``k`` is much smaller than the chunk size. All
    results will be returned in a single chunk along the given axis.

    Parameters
    ----------
    a: Array
        Data being sorted
    k: int
    axis: int, optional
    split_every: int >= 2, optional
        See :func:`topk`. The performance considerations for topk also
        apply here.

    Returns
    -------
    Selection of np.intp indices of a with size abs(k) along the given axis.

    Examples
    --------
    >>> import dask.array as da
    >>> x = np.array([5, 1, 3, 6])
    >>> d = da.from_array(x, chunks=2)
    >>> d.argtopk(2).compute()
    array([3, 0])
    >>> d.argtopk(-2).compute()
    array([1, 2])
    """
    axis = validate_axis(axis, a.ndim)

    # Generate nodes where every chunk is a tuple of (a, original index of a)
    idx = arange(a.shape[axis], chunks=(a.chunks[axis], ), dtype=np.intp)
    idx = idx[tuple(slice(None) if i == axis else np.newaxis
                    for i in range(a.ndim))]
    a_plus_idx = a.map_blocks(chunk.argtopk_preprocess, idx,
                              dtype=object)

    # chunk and combine steps of the reduction. They take as input a tuple
    # of (a, original indices of a) and return another tuple containing the
    # top k elements of a and the matching original indices. The selection
    # is not sorted internally, as in np.argpartition.
    chunk_combine = partial(chunk.argtopk, k=k)
    # aggregate step of the reduction. Internally invokes the chunk/combine
    # function, then sorts the results internally, drops a and returns the
    # index only.
    aggregate = partial(chunk.argtopk_aggregate, k=k)

    return reduction(
        a_plus_idx, chunk=chunk_combine, combine=chunk_combine,
        aggregate=aggregate, axis=axis, keepdims=True, dtype=np.intp,
        split_every=split_every, concatenate=False, output_size=abs(k))