You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
259 lines
8.6 KiB
259 lines
8.6 KiB
from __future__ import absolute_import, division, print_function
|
|
|
|
from collections import defaultdict
|
|
import pandas as pd
|
|
from toolz import partition_all
|
|
from numbers import Integral
|
|
|
|
from ..base import tokenize, compute_as_if_collection
|
|
from .accessor import Accessor
|
|
from .utils import (has_known_categories, clear_known_categories, is_scalar,
|
|
is_categorical_dtype)
|
|
|
|
|
|
def _categorize_block(df, categories, index):
|
|
""" Categorize a dataframe with given categories
|
|
|
|
df: DataFrame
|
|
categories: dict mapping column name to iterable of categories
|
|
"""
|
|
df = df.copy()
|
|
for col, vals in categories.items():
|
|
if is_categorical_dtype(df[col]):
|
|
df[col] = df[col].cat.set_categories(vals)
|
|
else:
|
|
df[col] = pd.Categorical(df[col], categories=vals, ordered=False)
|
|
if index is not None:
|
|
if is_categorical_dtype(df.index):
|
|
ind = df.index.set_categories(index)
|
|
else:
|
|
ind = pd.Categorical(df.index, categories=index, ordered=False)
|
|
ind.name = df.index.name
|
|
df.index = ind
|
|
return df
|
|
|
|
|
|
def _get_categories(df, columns, index):
|
|
res = {}
|
|
for col in columns:
|
|
x = df[col]
|
|
if is_categorical_dtype(x):
|
|
res[col] = pd.Series(x.cat.categories)
|
|
else:
|
|
res[col] = x.dropna().drop_duplicates()
|
|
if index:
|
|
if is_categorical_dtype(df.index):
|
|
return res, df.index.categories
|
|
return res, df.index.dropna().drop_duplicates()
|
|
return res, None
|
|
|
|
|
|
def _get_categories_agg(parts):
|
|
res = defaultdict(list)
|
|
res_ind = []
|
|
for p in parts:
|
|
for k, v in p[0].items():
|
|
res[k].append(v)
|
|
res_ind.append(p[1])
|
|
res = {k: pd.concat(v, ignore_index=True).drop_duplicates()
|
|
for k, v in res.items()}
|
|
if res_ind[0] is None:
|
|
return res, None
|
|
return res, res_ind[0].append(res_ind[1:]).drop_duplicates()
|
|
|
|
|
|
def categorize(df, columns=None, index=None, split_every=None, **kwargs):
|
|
"""Convert columns of the DataFrame to category dtype.
|
|
|
|
Parameters
|
|
----------
|
|
columns : list, optional
|
|
A list of column names to convert to categoricals. By default any
|
|
column with an object dtype is converted to a categorical, and any
|
|
unknown categoricals are made known.
|
|
index : bool, optional
|
|
Whether to categorize the index. By default, object indices are
|
|
converted to categorical, and unknown categorical indices are made
|
|
known. Set True to always categorize the index, False to never.
|
|
split_every : int, optional
|
|
Group partitions into groups of this size while performing a
|
|
tree-reduction. If set to False, no tree-reduction will be used.
|
|
Default is 16.
|
|
kwargs
|
|
Keyword arguments are passed on to compute.
|
|
"""
|
|
meta = df._meta
|
|
if columns is None:
|
|
columns = list(meta.select_dtypes(['object', 'category']).columns)
|
|
elif is_scalar(columns):
|
|
columns = [columns]
|
|
|
|
# Filter out known categorical columns
|
|
columns = [c for c in columns if not (is_categorical_dtype(meta[c]) and
|
|
has_known_categories(meta[c]))]
|
|
|
|
if index is not False:
|
|
if is_categorical_dtype(meta.index):
|
|
index = not has_known_categories(meta.index)
|
|
elif index is None:
|
|
index = meta.index.dtype == object
|
|
|
|
# Nothing to do
|
|
if not len(columns) and index is False:
|
|
return df
|
|
|
|
if split_every is None:
|
|
split_every = 16
|
|
elif split_every is False:
|
|
split_every = df.npartitions
|
|
elif not isinstance(split_every, Integral) or split_every < 2:
|
|
raise ValueError("split_every must be an integer >= 2")
|
|
|
|
token = tokenize(df, columns, index, split_every)
|
|
a = 'get-categories-chunk-' + token
|
|
dsk = {(a, i): (_get_categories, key, columns, index)
|
|
for (i, key) in enumerate(df.__dask_keys__())}
|
|
|
|
prefix = 'get-categories-agg-' + token
|
|
k = df.npartitions
|
|
depth = 0
|
|
while k > split_every:
|
|
b = prefix + str(depth)
|
|
for part_i, inds in enumerate(partition_all(split_every, range(k))):
|
|
dsk[(b, part_i)] = (_get_categories_agg, [(a, i) for i in inds])
|
|
k = part_i + 1
|
|
a = b
|
|
depth += 1
|
|
|
|
dsk[(prefix, 0)] = (_get_categories_agg, [(a, i) for i in range(k)])
|
|
dsk.update(df.dask)
|
|
|
|
# Compute the categories
|
|
categories, index = compute_as_if_collection(type(df), dsk, (prefix, 0),
|
|
**kwargs)
|
|
|
|
# Categorize each partition
|
|
return df.map_partitions(_categorize_block, categories, index)
|
|
|
|
|
|
class CategoricalAccessor(Accessor):
|
|
"""
|
|
Accessor object for categorical properties of the Series values.
|
|
|
|
Examples
|
|
--------
|
|
>>> s.cat.categories # doctest: +SKIP
|
|
|
|
Notes
|
|
-----
|
|
Attributes that depend only on metadata are eager
|
|
|
|
* categories
|
|
* ordered
|
|
|
|
Attributes depending on the entire dataset are lazy
|
|
|
|
* codes
|
|
* ...
|
|
|
|
So `df.a.cat.categories` <=> `df.a._meta.cat.categories`
|
|
So `df.a.cat.codes` <=> `df.a.map_partitions(lambda x: x.cat.codes)`
|
|
"""
|
|
_accessor = pd.Series.cat
|
|
_accessor_name = 'cat'
|
|
|
|
def _validate(self, series):
|
|
if not is_categorical_dtype(series.dtype):
|
|
raise AttributeError("Can only use .cat accessor with a "
|
|
"'category' dtype")
|
|
|
|
@property
|
|
def known(self):
|
|
"""Whether the categories are fully known"""
|
|
return has_known_categories(self._series)
|
|
|
|
def as_known(self, **kwargs):
|
|
"""Ensure the categories in this series are known.
|
|
|
|
If the categories are known, this is a no-op. If unknown, the
|
|
categories are computed, and a new series with known categories is
|
|
returned.
|
|
|
|
Parameters
|
|
----------
|
|
kwargs
|
|
Keywords to pass on to the call to `compute`.
|
|
"""
|
|
if self.known:
|
|
return self._series
|
|
categories = self._property_map('categories').unique().compute(**kwargs)
|
|
return self.set_categories(categories.values)
|
|
|
|
def as_unknown(self):
|
|
"""Ensure the categories in this series are unknown"""
|
|
if not self.known:
|
|
return self._series
|
|
out = self._series.copy()
|
|
out._meta = clear_known_categories(out._meta)
|
|
return out
|
|
|
|
@property
|
|
def ordered(self):
|
|
return self._delegate_property(self._series._meta, 'cat', 'ordered')
|
|
|
|
@property
|
|
def categories(self):
|
|
"""The categories of this categorical.
|
|
|
|
If categories are unknown, an error is raised"""
|
|
if not self.known:
|
|
msg = ("`df.column.cat.categories` with unknown categories is not "
|
|
"supported. Please use `column.cat.as_known()` or "
|
|
"`df.categorize()` beforehand to ensure known categories")
|
|
raise NotImplementedError(msg)
|
|
return self._delegate_property(self._series._meta, 'cat', 'categories')
|
|
|
|
@property
|
|
def codes(self):
|
|
"""The codes of this categorical.
|
|
|
|
If categories are unknown, an error is raised"""
|
|
if not self.known:
|
|
msg = ("`df.column.cat.codes` with unknown categories is not "
|
|
"supported. Please use `column.cat.as_known()` or "
|
|
"`df.categorize()` beforehand to ensure known categories")
|
|
raise NotImplementedError(msg)
|
|
return self._property_map('codes')
|
|
|
|
def remove_unused_categories(self):
|
|
"""
|
|
Removes categories which are not used
|
|
|
|
Notes
|
|
-----
|
|
This method requires a full scan of the data to compute the
|
|
unique values, which can be expensive.
|
|
"""
|
|
# get the set of used categories
|
|
present = self._series.dropna().unique()
|
|
present = pd.Index(present.compute())
|
|
|
|
if isinstance(self._series._meta, pd.CategoricalIndex):
|
|
meta_cat = self._series._meta
|
|
else:
|
|
meta_cat = self._series._meta.cat
|
|
|
|
# Reorder to keep cat:code relationship, filtering unused (-1)
|
|
ordered, mask = present.reindex(meta_cat.categories)
|
|
if mask is None:
|
|
# PANDAS-23963: old and new categories match.
|
|
return self._series
|
|
|
|
new_categories = ordered[mask != -1]
|
|
meta = meta_cat.set_categories(new_categories, ordered=meta_cat.ordered)
|
|
return self._series.map_partitions(self._delegate_method, 'cat',
|
|
'set_categories', (),
|
|
{'new_categories': new_categories},
|
|
meta=meta,
|
|
token='cat-set_categories')
|