from __future__ import print_function, absolute_import, division

import warnings

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype
from toolz import partition

from .utils import PANDAS_VERSION
from ..utils import Dispatch

if PANDAS_VERSION >= '0.20.0':
    from pandas.api.types import union_categoricals
else:
    from pandas.types.concat import union_categoricals

if PANDAS_VERSION >= '0.23':
    concat_kwargs = {'sort': False}
else:
    concat_kwargs = {}


# ---------------------------------
# indexing
# ---------------------------------


def loc(df, iindexer, cindexer=None):
    """
    .loc for known divisions
    """
    if cindexer is None:
        return df.loc[iindexer]
    else:
        return df.loc[iindexer, cindexer]


def iloc(df, cindexer=None):
    return df.iloc[:, cindexer]


def try_loc(df, iindexer, cindexer=None):
    """
    .loc for unknown divisions
    """
    try:
        return loc(df, iindexer, cindexer)
    except KeyError:
        return df.head(0).loc[:, cindexer]


def boundary_slice(df, start, stop, right_boundary=True, left_boundary=True,
                   kind='loc'):
    """Index slice start/stop. Can switch include/exclude boundaries.

    Examples
    --------
    >>> df = pd.DataFrame({'x': [10, 20, 30, 40, 50]}, index=[1, 2, 2, 3, 4])
    >>> boundary_slice(df, 2, None)
        x
    2  20
    2  30
    3  40
    4  50
    >>> boundary_slice(df, 1, 3)
        x
    1  10
    2  20
    2  30
    3  40
    >>> boundary_slice(df, 1, 3, right_boundary=False)
        x
    1  10
    2  20
    2  30

    Empty input DataFrames are returned unchanged:

    >>> df_empty = pd.DataFrame()
    >>> boundary_slice(df_empty, 1, 3)
    Empty DataFrame
    Columns: []
    Index: []
    """
    if df.empty:
        return df

    if kind == 'loc' and not df.index.is_monotonic:
        # Pandas treats missing keys differently for label-slicing
        # on monotonic vs. non-monotonic indexes.
        # If the index is monotonic, `df.loc[start:stop]` is fine.
        # If it's not, `df.loc[start:stop]` raises when `start` is missing.
        if start is not None:
            if left_boundary:
                df = df[df.index >= start]
            else:
                df = df[df.index > start]
        if stop is not None:
            if right_boundary:
                df = df[df.index <= stop]
            else:
                df = df[df.index < stop]
        return df
    else:
        result = getattr(df, kind)[start:stop]
        if not right_boundary:
            right_index = result.index.get_slice_bound(stop, 'left', kind)
            result = result.iloc[:right_index]
        if not left_boundary:
            left_index = result.index.get_slice_bound(start, 'right', kind)
            result = result.iloc[left_index:]
        return result


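# Illustrative sketch (not from the original source): on a non-monotonic
# index, plain ``df.loc[start:stop]`` can raise a KeyError when a boundary
# label is missing, while ``boundary_slice`` falls back to boolean filtering:
#
#     >>> df2 = pd.DataFrame({'x': [1, 2, 3]}, index=[3, 1, 2])
#     >>> boundary_slice(df2, 1, 2)
#        x
#     1  2
#     2  3

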
def index_count(x):
    # Workaround since Index doesn't implement `.count`
    return pd.notnull(x).sum()


def mean_aggregate(s, n):
    try:
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            return s / n
    except ZeroDivisionError:
        return np.float64(np.nan)


def var_aggregate(x2, x, n, ddof):
    try:
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            result = (x2 / n) - (x / n)**2
            if ddof != 0:
                result = result * n / (n - ddof)
            return result
    except ZeroDivisionError:
        return np.float64(np.nan)


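# Illustrative sketch of how the two aggregates above combine per-partition
# sums (the values here are hypothetical, not from the original source). For
# the data [1, 2, 3, 4, 5], the global sum is 15, the sum of squares is 55,
# and the count is 5:
#
#     >>> mean_aggregate(15.0, 5)
#     3.0
#     >>> var_aggregate(55.0, 15.0, 5, ddof=1)  # (55/5 - (15/5)**2) * 5/4
#     2.5

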
def describe_aggregate(values):
    assert len(values) == 6
    count, mean, std, min, q, max = values
    typ = pd.DataFrame if isinstance(count, pd.Series) else pd.Series
    part1 = typ([count, mean, std, min],
                index=['count', 'mean', 'std', 'min'])
    q.index = ['{0:g}%'.format(l * 100) for l in q.index.tolist()]
    part3 = typ([max], index=['max'])
    return pd.concat([part1, q, part3], **concat_kwargs)


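# The '{0:g}%' format above relabels the quantile index so that, for
# example, [0.25, 0.5, 0.75] becomes ['25%', '50%', '75%'], matching the
# row labels that ``Series.describe`` produces.

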
def cummin_aggregate(x, y):
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x.where((x < y) | x.isnull(), y, axis=x.ndim - 1)
    else:  # scalar
        return x if x < y else y


def cummax_aggregate(x, y):
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x.where((x > y) | x.isnull(), y, axis=x.ndim - 1)
    else:  # scalar
        return x if x > y else y


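# Illustrative sketch (hypothetical values, not from the original source):
# when combining partitions, ``x`` is a partition's own cumulative result
# and ``y`` is the running extremum carried over from earlier partitions:
#
#     >>> cummin_aggregate(pd.Series([3, 1, 1]), 2)  # prior minimum was 2
#     0    2
#     1    1
#     2    1
#     dtype: int64

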
def assign(df, *pairs):
    kwargs = dict(partition(2, pairs))
    return df.assign(**kwargs)


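# Illustrative: the flat argument list is chunked into (name, value) pairs
# by ``toolz.partition``, so ``assign(df, 'a', 1, 'b', 2)`` is equivalent
# to ``df.assign(a=1, b=2)``.

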
def unique(x, series_name=None):
    # unique returns np.ndarray; it must be wrapped in a Series
    return pd.Series(x.unique(), name=series_name)


def value_counts_combine(x):
    return x.groupby(level=0).sum()


def value_counts_aggregate(x):
    return x.groupby(level=0).sum().sort_values(ascending=False)


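# Illustrative sketch (hypothetical counts, not from the original source):
# each partition contributes a partial ``value_counts``; grouping on the
# index and summing merges them, and the final aggregate sorts descending:
#
#     >>> part1 = pd.Series([2, 1], index=['a', 'b'])
#     >>> part2 = pd.Series([2, 3], index=['a', 'c'])
#     >>> value_counts_aggregate(pd.concat([part1, part2]))
#     a    4
#     c    3
#     b    1
#     dtype: int64

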
def nbytes(x):
    return x.nbytes


def size(x):
    return x.size


def values(df):
    return df.values


def sample(df, state, frac, replace):
    rs = np.random.RandomState(state)
    return df.sample(random_state=rs, frac=frac, replace=replace) if len(df) > 0 else df


def drop_columns(df, columns, dtype):
    df = df.drop(columns, axis=1)
    df.columns = df.columns.astype(dtype)
    return df


def fillna_check(df, method, check=True):
    out = df.fillna(method=method)
    if check and out.isnull().values.all(axis=0).any():
        raise ValueError("All NaN partition encountered in `fillna`. Try "
                         "using ``df.repartition`` to increase the partition "
                         "size, or specify `limit` in `fillna`.")
    return out


# ---------------------------------
# reshape
# ---------------------------------


def pivot_agg(df):
    return df.groupby(level=0).sum()


def pivot_sum(df, index, columns, values):
    return pd.pivot_table(df, index=index, columns=columns,
                          values=values, aggfunc='sum')


def pivot_count(df, index, columns, values):
    # We cannot determine the dtype until all partitions are concatenated,
    # so make the dtype deterministic by always coercing to np.float64.
    return pd.pivot_table(df, index=index, columns=columns,
                          values=values, aggfunc='count').astype(np.float64)


# ---------------------------------
# concat
# ---------------------------------

if PANDAS_VERSION < '0.20.0':
    def _get_level_values(x, n):
        return x.get_level_values(n)
else:
    def _get_level_values(x, n):
        return x._get_level_values(n)


concat_dispatch = Dispatch('concat')


def concat(dfs, axis=0, join='outer', uniform=False, filter_warning=True):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    filter_warning : bool, optional
        Whether to ignore pandas ``FutureWarning``s raised during
        concatenation. Default is True.
    """
    if len(dfs) == 1:
        return dfs[0]
    else:
        func = concat_dispatch.dispatch(type(dfs[0]))
        return func(dfs, axis=axis, join=join, uniform=uniform,
                    filter_warning=filter_warning)


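# Usage sketch (illustrative, not from the original source): unlike a bare
# ``pd.concat``, the dispatched implementation below unions categorical
# dtypes across partitions instead of coercing them to object:
#
#     >>> a = pd.Series(['x', 'y'], dtype='category')
#     >>> b = pd.Series(['y', 'z'], dtype='category')
#     >>> concat([a, b]).cat.categories.tolist()
#     ['x', 'y', 'z']

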
@concat_dispatch.register((pd.DataFrame, pd.Series, pd.Index))
def concat_pandas(dfs, axis=0, join='outer', uniform=False, filter_warning=True):
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join, **concat_kwargs)

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                   for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)
            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter('ignore', FutureWarning)
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join,
                                     **concat_kwargs).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            # this should be aligned, so no need to filter warning
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join, **concat_kwargs)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter("ignore", FutureWarning)
                out = pd.concat(dfs3, join=join, **concat_kwargs)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        with warnings.catch_warnings():
            if filter_warning:
                warnings.simplefilter('ignore', FutureWarning)
            out = pd.concat(dfs2, join=join, **concat_kwargs)

    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out