You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ORPA-pyOpenRPA/Resources/WPy64-3720/python-3.7.2.amd64/Lib/site-packages/dask/array/percentile.py

175 lines
6.4 KiB

from __future__ import absolute_import, division, print_function
from functools import wraps
from numbers import Number
import numpy as np
from toolz import merge, merge_sorted
from .core import Array
from ..base import tokenize
from ..highlevelgraph import HighLevelGraph
from ..compatibility import Iterator
@wraps(np.percentile)
def _percentile(a, q, interpolation='linear'):
n = len(a)
if not len(a):
return None, n
if isinstance(q, Iterator):
q = list(q)
if a.dtype.name == 'category':
result = np.percentile(a.codes, q, interpolation=interpolation)
import pandas as pd
return pd.Categorical.from_codes(result, a.categories, a.ordered), n
if np.issubdtype(a.dtype, np.datetime64):
a2 = a.astype('i8')
result = np.percentile(a2, q, interpolation=interpolation)
return result.astype(a.dtype), n
if not np.issubdtype(a.dtype, np.number):
interpolation = 'nearest'
return np.percentile(a, q, interpolation=interpolation), n
def percentile(a, q, interpolation='linear'):
""" Approximate percentile of 1-D array
See :func:`numpy.percentile` for more information
"""
if not a.ndim == 1:
raise NotImplementedError(
"Percentiles only implemented for 1-d arrays")
if isinstance(q, Number):
q = [q]
q = np.array(q)
token = tokenize(a, list(q), interpolation)
name = 'percentile_chunk-' + token
dsk = dict(((name, i), (_percentile, (key), q, interpolation))
for i, key in enumerate(a.__dask_keys__()))
name2 = 'percentile-' + token
dsk2 = {(name2, 0): (merge_percentiles, q, [q] * len(a.chunks[0]),
sorted(dsk), interpolation)}
dtype = a.dtype
if np.issubdtype(dtype, np.integer):
dtype = (np.array([], dtype=dtype) / 0.5).dtype
dsk = merge(dsk, dsk2)
graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a])
return Array(graph, name2, chunks=((len(q),),), dtype=dtype)
def merge_percentiles(finalq, qs, vals, interpolation='lower', Ns=None):
""" Combine several percentile calculations of different data.
Parameters
----------
finalq : numpy.array
Percentiles to compute (must use same scale as ``qs``).
qs : sequence of :class:`numpy.array`s
Percentiles calculated on different sets of data.
vals : sequence of :class:`numpy.array`s
Resulting values associated with percentiles ``qs``.
Ns : sequence of integers
The number of data elements associated with each data set.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
Specify the type of interpolation to use to calculate final
percentiles. For more information, see :func:`numpy.percentile`.
Examples
--------
>>> finalq = [10, 20, 30, 40, 50, 60, 70, 80]
>>> qs = [[20, 40, 60, 80], [20, 40, 60, 80]]
>>> vals = [np.array([1, 2, 3, 4]), np.array([10, 11, 12, 13])]
>>> Ns = [100, 100] # Both original arrays had 100 elements
>>> merge_percentiles(finalq, qs, vals, Ns=Ns)
array([ 1, 2, 3, 4, 10, 11, 12, 13])
"""
if isinstance(finalq, Iterator):
finalq = list(finalq)
finalq = np.array(finalq)
qs = list(map(list, qs))
vals = list(vals)
if Ns is None:
vals, Ns = zip(*vals)
Ns = list(Ns)
L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
if not L:
raise ValueError("No non-trivial arrays found")
qs, vals, Ns = L
# TODO: Perform this check above in percentile once dtype checking is easy
# Here we silently change meaning
if vals[0].dtype.name == 'category':
result = merge_percentiles(finalq, qs, [v.codes for v in vals], interpolation, Ns)
import pandas as pd
return pd.Categorical.from_codes(result, vals[0].categories, vals[0].ordered)
if not np.issubdtype(vals[0].dtype, np.number):
interpolation = 'nearest'
if len(vals) != len(qs) or len(Ns) != len(qs):
raise ValueError('qs, vals, and Ns parameters must be the same length')
# transform qs and Ns into number of observations between percentiles
counts = []
for q, N in zip(qs, Ns):
count = np.empty(len(q))
count[1:] = np.diff(q)
count[0] = q[0]
count *= N
counts.append(count)
# Sort by calculated percentile values, then number of observations.
# >95% of the time in this function is spent in `merge_sorted` below.
# An alternative that uses numpy sort is shown. It is sometimes
# comparable to, but typically slower than, `merge_sorted`.
#
# >>> A = np.concatenate(map(np.array, map(zip, vals, counts)))
# >>> A.sort(0, kind='mergesort')
combined_vals_counts = merge_sorted(*map(zip, vals, counts))
combined_vals, combined_counts = zip(*combined_vals_counts)
combined_vals = np.array(combined_vals)
combined_counts = np.array(combined_counts)
# percentile-like, but scaled by total number of observations
combined_q = np.cumsum(combined_counts)
# rescale finalq percentiles to match combined_q
desired_q = finalq * sum(Ns)
# the behavior of different interpolation methods should be
# investigated further.
if interpolation == 'linear':
rv = np.interp(desired_q, combined_q, combined_vals)
else:
left = np.searchsorted(combined_q, desired_q, side='left')
right = np.searchsorted(combined_q, desired_q, side='right') - 1
np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index
lower = np.minimum(left, right)
upper = np.maximum(left, right)
if interpolation == 'lower':
rv = combined_vals[lower]
elif interpolation == 'higher':
rv = combined_vals[upper]
elif interpolation == 'midpoint':
rv = 0.5 * (combined_vals[lower] + combined_vals[upper])
elif interpolation == 'nearest':
lower_residual = np.abs(combined_q[lower] - desired_q)
upper_residual = np.abs(combined_q[upper] - desired_q)
mask = lower_residual > upper_residual
index = lower # alias; we no longer need lower
index[mask] = upper[mask]
rv = combined_vals[index]
else:
raise ValueError("interpolation can only be 'linear', 'lower', "
"'higher', 'midpoint', or 'nearest'")
return rv