# ORPA-pyOpenRPA/WPy32-3720/python-3.7.2/Lib/site-packages/dask/dataframe/reshape.py

from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
from .core import Series, DataFrame, map_partitions, apply_concat_apply
from . import methods
from .utils import (
    is_categorical_dtype, is_scalar, has_known_categories, PANDAS_VERSION
)

###############################################################
# Dummies
###############################################################
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False,
                dtype=np.uint8):
    """
    Convert categorical variable into dummy/indicator variables.

    Data must have category dtype to infer result's ``columns``.

    Parameters
    ----------
    data : Series or DataFrame
        For Series, the dtype must be categorical.
        For DataFrame, at least one column must be categorical.
    prefix : string, list of strings, or dict of strings, default None
        String to append to DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not. Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.18.2

    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.
        Only valid if pandas is 0.23.0 or newer.

        .. versionadded:: 0.18.2

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    Dask's version only works with Categorical data, as this is the only way to
    know the output shape without computing all the data.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> s = dd.from_pandas(pd.Series(list('abca')), npartitions=2)
    >>> dd.get_dummies(s)
    Traceback (most recent call last):
        ...
    NotImplementedError: `get_dummies` with non-categorical dtypes is not supported...

    With categorical data:

    >>> s = dd.from_pandas(pd.Series(list('abca'), dtype='category'), npartitions=2)
    >>> dd.get_dummies(s)  # doctest: +NORMALIZE_WHITESPACE
    Dask DataFrame Structure:
                       a      b      c
    npartitions=2
    0              uint8  uint8  uint8
    2                ...    ...    ...
    3                ...    ...    ...
    Dask Name: get_dummies, 4 tasks
    >>> dd.get_dummies(s).compute()
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    See Also
    --------
    pandas.get_dummies
    """
    if PANDAS_VERSION >= "0.23.0":
        # The `dtype` keyword was added to pandas.get_dummies in 0.23.0.
        kwargs = {'dtype': dtype}
    elif dtype != np.uint8:
        # User specified something other than the default.
        raise ValueError("Your version of pandas is '{}'. "
                         "The 'dtype' keyword was added in pandas "
                         "0.23.0.".format(PANDAS_VERSION))
    else:
        kwargs = {}

    if isinstance(data, (pd.Series, pd.DataFrame)):
        # Plain pandas objects are dispatched straight to pandas.
        return pd.get_dummies(data, prefix=prefix,
                              prefix_sep=prefix_sep, dummy_na=dummy_na,
                              columns=columns, sparse=sparse,
                              drop_first=drop_first,
                              **kwargs)

    not_cat_msg = ("`get_dummies` with non-categorical dtypes is not "
                   "supported. Please use `df.categorize()` beforehand to "
                   "convert to categorical dtype.")

    unknown_cat_msg = ("`get_dummies` with unknown categories is not "
                       "supported. Please use `column.cat.as_known()` or "
                       "`df.categorize()` beforehand to ensure known "
                       "categories")

    if isinstance(data, Series):
        if not is_categorical_dtype(data):
            raise NotImplementedError(not_cat_msg)
        if not has_known_categories(data):
            raise NotImplementedError(unknown_cat_msg)
    elif isinstance(data, DataFrame):
        if columns is None:
            if (data.dtypes == 'object').any():
                raise NotImplementedError(not_cat_msg)
            columns = data._meta.select_dtypes(include=['category']).columns
        else:
            if not all(is_categorical_dtype(data[c]) for c in columns):
                raise NotImplementedError(not_cat_msg)

        if not all(has_known_categories(data[c]) for c in columns):
            raise NotImplementedError(unknown_cat_msg)

    # We explicitly create `meta` on `data._meta` (the empty version) to
    # work around https://github.com/pandas-dev/pandas/issues/21993
    meta = pd.get_dummies(data._meta, prefix=prefix,
                          prefix_sep=prefix_sep, dummy_na=dummy_na,
                          columns=columns, sparse=sparse, drop_first=drop_first,
                          **kwargs)

    return map_partitions(pd.get_dummies, data, prefix=prefix,
                          prefix_sep=prefix_sep, dummy_na=dummy_na,
                          columns=columns, sparse=sparse,
                          drop_first=drop_first,
                          meta=meta, **kwargs)
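
# A minimal usage sketch (not part of the library; the column names 'x' and
# 'y' are illustrative). Because `get_dummies` only accepts categorical
# columns with known categories, a typical call categorizes first:
#
#     import pandas as pd
#     import dask.dataframe as dd
#
#     pdf = pd.DataFrame({'x': list('abab'), 'y': [1, 2, 3, 4]})
#     ddf = dd.from_pandas(pdf, npartitions=2).categorize(columns=['x'])
#     dummies = dd.get_dummies(ddf, columns=['x'])
#     print(dummies.compute())  # columns: y, x_a, x_b
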
###############################################################
# Pivot table
###############################################################
def pivot_table(df, index=None, columns=None,
                values=None, aggfunc='mean'):
    """
    Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
    must have category dtype to infer result's ``columns``.
    ``index``, ``columns``, ``values`` and ``aggfunc`` must all be scalar.

    Parameters
    ----------
    df : DataFrame
    values : scalar
        column to aggregate
    index : scalar
        column to be index
    columns : scalar
        column to be columns
    aggfunc : {'mean', 'sum', 'count'}, default 'mean'

    Returns
    -------
    table : DataFrame
    """
    if not is_scalar(index) or index is None:
        raise ValueError("'index' must be the name of an existing column")
    if not is_scalar(columns) or columns is None:
        raise ValueError("'columns' must be the name of an existing column")
    if not is_categorical_dtype(df[columns]):
        raise ValueError("'columns' must be category dtype")
    if not has_known_categories(df[columns]):
        raise ValueError("'columns' must have known categories. Please use "
                         "`df[columns].cat.as_known()` beforehand to ensure "
                         "known categories")
    if not is_scalar(values) or values is None:
        raise ValueError("'values' must be the name of an existing column")
    if not is_scalar(aggfunc) or aggfunc not in ('mean', 'sum', 'count'):
        raise ValueError("aggfunc must be either 'mean', 'sum' or 'count'")

    # _emulate can't work for empty data
    # the result must have CategoricalIndex columns
    new_columns = pd.CategoricalIndex(df[columns].cat.categories, name=columns)
    meta = pd.DataFrame(columns=new_columns, dtype=np.float64)
    meta.index.name = index

    kwargs = {'index': index, 'columns': columns, 'values': values}

    pv_sum = apply_concat_apply([df],
                                chunk=methods.pivot_sum,
                                aggregate=methods.pivot_agg,
                                meta=meta,
                                token='pivot_table_sum',
                                chunk_kwargs=kwargs)
    pv_count = apply_concat_apply([df],
                                  chunk=methods.pivot_count,
                                  aggregate=methods.pivot_agg,
                                  meta=meta,
                                  token='pivot_table_count',
                                  chunk_kwargs=kwargs)

    if aggfunc == 'sum':
        return pv_sum
    elif aggfunc == 'count':
        return pv_count
    elif aggfunc == 'mean':
        # Mean is built lazily as elementwise sum / count of the two tables.
        return pv_sum / pv_count
    else:
        # Unreachable: aggfunc was validated above.
        raise ValueError
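
# A minimal usage sketch (illustrative only; the column names 'i', 'k' and
# 'v' are hypothetical). The `columns` target must be categorical with known
# categories, which `pd.Categorical` provides here:
#
#     import pandas as pd
#     import dask.dataframe as dd
#
#     pdf = pd.DataFrame({'i': [0, 0, 1, 1],
#                         'k': pd.Categorical(list('abab')),
#                         'v': [1.0, 2.0, 3.0, 4.0]})
#     ddf = dd.from_pandas(pdf, npartitions=2)
#     table = dd.pivot_table(ddf, index='i', columns='k', values='v',
#                            aggfunc='mean')
#     print(table.compute())  # one row per 'i', one column per category of 'k'
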
###############################################################
# Melt
###############################################################
def melt(frame, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None):
    """
    Unpivot a DataFrame from wide format to long format.

    This applies ``pandas.melt`` to each partition; see ``pandas.melt``
    for a description of the parameters.
    """
    from dask.dataframe.core import no_default

    return frame.map_partitions(pd.melt, meta=no_default, id_vars=id_vars,
                                value_vars=value_vars,
                                var_name=var_name, value_name=value_name,
                                col_level=col_level, token='melt')
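
# A minimal usage sketch for `melt` (illustrative column names). Since the
# implementation is a straight `map_partitions(pd.melt, ...)`, it behaves
# like pandas.melt applied partition-wise:
#
#     import pandas as pd
#     import dask.dataframe as dd
#
#     pdf = pd.DataFrame({'id': [1, 2], 'a': [10, 20], 'b': [30, 40]})
#     ddf = dd.from_pandas(pdf, npartitions=2)
#     long = dd.melt(ddf, id_vars=['id'], value_vars=['a', 'b'])
#     print(long.compute())  # columns: id, variable, value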