# NOTE(review): the six lines that preceded this comment were repository
# web-UI scrape residue (topic-picker help text and line/size counters),
# not part of the module; they are commented out here so the file parses.
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd

from . import methods
from .core import Series, DataFrame, map_partitions, apply_concat_apply
from .utils import (
    is_categorical_dtype, is_scalar, has_known_categories, PANDAS_VERSION
)
###############################################################
# Dummies
###############################################################

def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False,
                dtype=np.uint8):
    """
    Convert categorical variable into dummy/indicator variables.

    Data must have category dtype to infer result's ``columns``.

    Parameters
    ----------
    data : Series, or DataFrame
        For Series, the dtype must be categorical.
        For DataFrame, at least one column must be categorical.
    prefix : string, list of strings, or dict of strings, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix.`
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not. Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.18.2

    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.

    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.
        Only valid if pandas is 0.23.0 or newer.

        .. versionadded:: 0.18.2

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    Dask's version only works with Categorical data, as this is the only way to
    know the output shape without computing all the data.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> s = dd.from_pandas(pd.Series(list('abca')), npartitions=2)
    >>> dd.get_dummies(s)
    Traceback (most recent call last):
        ...
    NotImplementedError: `get_dummies` with non-categorical dtypes is not supported...

    With categorical data:

    >>> s = dd.from_pandas(pd.Series(list('abca'), dtype='category'), npartitions=2)
    >>> dd.get_dummies(s)  # doctest: +NORMALIZE_WHITESPACE
    Dask DataFrame Structure:
                       a      b      c
    npartitions=2
    0              uint8  uint8  uint8
    2                ...    ...    ...
    3                ...    ...    ...
    Dask Name: get_dummies, 4 tasks
    >>> dd.get_dummies(s).compute()
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    See Also
    --------
    pandas.get_dummies
    """
    # The ``dtype`` keyword only exists in pandas >= 0.23.0; forward it when
    # available, otherwise reject any non-default value.
    if PANDAS_VERSION >= "0.23.0":
        # dtype added to pandas
        extra_kwargs = {'dtype': dtype}
    else:
        if dtype != np.uint8:
            # User specified something other than the default.
            raise ValueError("Your version of pandas is '{}'. "
                             "The 'dtype' keyword was added in pandas "
                             "0.23.0.".format(PANDAS_VERSION))
        extra_kwargs = {}

    # Plain pandas objects are handed straight to pandas.
    if isinstance(data, (pd.Series, pd.DataFrame)):
        return pd.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep,
                              dummy_na=dummy_na, columns=columns,
                              sparse=sparse, drop_first=drop_first,
                              **extra_kwargs)

    not_cat_msg = ("`get_dummies` with non-categorical dtypes is not "
                   "supported. Please use `df.categorize()` beforehand to "
                   "convert to categorical dtype.")

    unknown_cat_msg = ("`get_dummies` with unknown categories is not "
                       "supported. Please use `column.cat.as_known()` or "
                       "`df.categorize()` beforehand to ensure known "
                       "categories")

    # Dask objects must be categorical with *known* categories, since the
    # output columns are derived from the categories without computing.
    if isinstance(data, Series):
        if not is_categorical_dtype(data):
            raise NotImplementedError(not_cat_msg)
        if not has_known_categories(data):
            raise NotImplementedError(unknown_cat_msg)
    elif isinstance(data, DataFrame):
        if columns is None:
            if (data.dtypes == 'object').any():
                raise NotImplementedError(not_cat_msg)
            # Default to every categorical column of the (empty) meta frame.
            columns = data._meta.select_dtypes(include=['category']).columns
        elif not all(is_categorical_dtype(data[c]) for c in columns):
            raise NotImplementedError(not_cat_msg)

        if not all(has_known_categories(data[c]) for c in columns):
            raise NotImplementedError(unknown_cat_msg)

    # All pandas keyword arguments, packaged once (after ``columns`` has been
    # resolved) so meta construction and the mapped call stay in sync.
    package_kwargs = dict(prefix=prefix, prefix_sep=prefix_sep,
                          dummy_na=dummy_na, columns=columns, sparse=sparse,
                          drop_first=drop_first)
    package_kwargs.update(extra_kwargs)

    # We explicitly create `meta` on `data._meta` (the empty version) to
    # work around https://github.com/pandas-dev/pandas/issues/21993
    meta = pd.get_dummies(data._meta, **package_kwargs)

    return map_partitions(pd.get_dummies, data, meta=meta, **package_kwargs)
###############################################################
# Pivot table
###############################################################

def pivot_table(df, index=None, columns=None,
                values=None, aggfunc='mean'):
    """
    Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
    must have category dtype to infer result's ``columns``.
    ``index``, ``columns``, ``values`` and ``aggfunc`` must be all scalar.

    Parameters
    ----------
    df : DataFrame
    values : scalar
        column to aggregate
    index : scalar
        column to be index
    columns : scalar
        column to be columns
    aggfunc : {'mean', 'sum', 'count'}, default 'mean'

    Returns
    -------
    table : DataFrame

    Raises
    ------
    ValueError
        If any argument is not a scalar column label, or ``columns`` is not
        categorical with known categories, or ``aggfunc`` is unsupported.
    """
    # Validate eagerly: the output columns are inferred from the categories
    # of ``df[columns]`` without computing any data, so ``columns`` must be
    # categorical with known categories.
    if not is_scalar(index) or index is None:
        raise ValueError("'index' must be the name of an existing column")
    if not is_scalar(columns) or columns is None:
        raise ValueError("'columns' must be the name of an existing column")
    if not is_categorical_dtype(df[columns]):
        raise ValueError("'columns' must be category dtype")
    if not has_known_categories(df[columns]):
        raise ValueError("'columns' must have known categories. Please use "
                         "`df[columns].cat.as_known()` beforehand to ensure "
                         "known categories")
    if not is_scalar(values) or values is None:
        raise ValueError("'values' must be the name of an existing column")
    if not is_scalar(aggfunc) or aggfunc not in ('mean', 'sum', 'count'):
        raise ValueError("aggfunc must be either 'mean', 'sum' or 'count'")

    # _emulate can't work for empty data
    # the result must have CategoricalIndex columns
    new_columns = pd.CategoricalIndex(df[columns].cat.categories, name=columns)
    meta = pd.DataFrame(columns=new_columns, dtype=np.float64)
    meta.index.name = index

    kwargs = {'index': index, 'columns': columns, 'values': values}

    def _pivot_aca(chunk, token):
        # One tree-reduction over the partitions with the given per-chunk op.
        return apply_concat_apply([df],
                                  chunk=chunk,
                                  aggregate=methods.pivot_agg,
                                  meta=meta,
                                  token=token,
                                  chunk_kwargs=kwargs)

    # Only build the task graph(s) actually required by ``aggfunc``; the
    # original built both unconditionally even though 'sum' and 'count'
    # each need only one of them.  'mean' is computed as sum / count.
    if aggfunc == 'sum':
        return _pivot_aca(methods.pivot_sum, 'pivot_table_sum')
    if aggfunc == 'count':
        return _pivot_aca(methods.pivot_count, 'pivot_table_count')
    pv_sum = _pivot_aca(methods.pivot_sum, 'pivot_table_sum')
    pv_count = _pivot_aca(methods.pivot_count, 'pivot_table_count')
    return pv_sum / pv_count
###############################################################
# Melt
###############################################################

def melt(frame, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None):
    """
    Unpivot a DataFrame from wide format to long format, partition-wise.

    Thin wrapper that applies ``pandas.melt`` to every partition of
    ``frame`` with the given keyword arguments; output metadata is
    inferred by ``map_partitions``.

    See Also
    --------
    pandas.melt
    """
    from dask.dataframe.core import no_default

    melt_kwargs = dict(id_vars=id_vars,
                       value_vars=value_vars,
                       var_name=var_name,
                       value_name=value_name,
                       col_level=col_level)
    return frame.map_partitions(pd.melt, meta=no_default, token='melt',
                                **melt_kwargs)