ORPA-pyOpenRPA/Resources/WPy64-3720/python-3.7.2.amd64/Lib/site-packages/dask/dataframe/hashing.py

from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd

from .utils import PANDAS_VERSION
# In pandas 0.19.2, a function to hash pandas objects was introduced. Object
# arrays are assumed to be strings, and are hashed with a cython
# implementation of siphash. However, the version in 0.19.2 hashes
# categoricals based on their integer codes, instead of taking into account
# the values they represent. This is fixed in pandas > 0.19.2. To support
# versions 0.19.0 and up, we do the following:
#
# - For versions > 0.19.2, we use the provided `hash_pandas_object` function.
# - For 0.19.0 through 0.19.2, we copy the definition of `hash_pandas_object`
#   from pandas master (will be released as 0.20.0).
#   - For 0.19.0 and 0.19.1, we use python's `hash` builtin to hash strings.
#   - For 0.19.2, we use the `hash_object_array` method provided in pandas
#     (an implementation of siphash).
#
# When dask drops support for pandas <= 0.19.2, all this can be removed.

# XXX: pandas uses release branches > 0.19.0, which doesn't play well with
# versioneer, since the release tags aren't ancestors of master. As such, we
# need to use this hacky awfulness to check if we're > 0.19.2.

if PANDAS_VERSION >= '0.20.0':
    from pandas.util import hash_pandas_object
elif (PANDAS_VERSION not in ('0.19.1', '0.19.2') and
      PANDAS_VERSION > '0.19.0+460'):
    from pandas.tools.hashing import hash_pandas_object
else:
    from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
                                     is_datetime64_dtype,
                                     is_timedelta64_dtype)
    from pandas.lib import is_bool_array

    if PANDAS_VERSION == '0.19.2':
        from pandas._hash import hash_object_array
    else:  # 0.19.0 and 0.19.1
        def hash_object_array(x, hash_key, encoding):
            return np.array([hash(i) for i in x], dtype=np.uint64)

    # 16 byte long hashing key
    _default_hash_key = '0123456789123456'
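
    # Hedged usage sketch (output elided; the concrete uint64 values depend
    # on which branch above supplied the implementation):
    #
    #     >>> hash_pandas_object(pd.Series(['a', 'b', 'c']))  # doctest: +SKIP
    #     0    ...
    #     1    ...
    #     2    ...
    #     dtype: uint64
    #
    # Note that the 0.19.0/0.19.1 fallback uses python's builtin `hash`, so
    # on Python 3 its string hashes vary per process unless PYTHONHASHSEED
    # is fixed.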

    def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                           categorize=True):
        """Return a Series of uint64 hashes, one per element/row of `obj`."""
        if hash_key is None:
            hash_key = _default_hash_key

        def adder(h, hashed_to_add):
            # In-place combine: h = h * 3 + hashed_to_add (uint64 wraparound)
            h = np.multiply(h, np.uint(3), h)
            return np.add(h, hashed_to_add, h)

        if isinstance(obj, pd.Index):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            h = pd.Series(h, index=obj, dtype='uint64')
        elif isinstance(obj, pd.Series):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            if index:
                h = adder(h, hash_pandas_object(obj.index,
                                                index=False,
                                                encoding=encoding,
                                                hash_key=hash_key,
                                                categorize=categorize).values)
            h = pd.Series(h, index=obj.index, dtype='uint64')
        elif isinstance(obj, pd.DataFrame):
            cols = obj.iteritems()
            first_series = next(cols)[1]
            h = hash_array(first_series.values, encoding,
                           hash_key, categorize).astype('uint64')
            for _, col in cols:
                h = adder(h, hash_array(col.values, encoding, hash_key,
                                        categorize))
            if index:
                h = adder(h, hash_pandas_object(obj.index,
                                                index=False,
                                                encoding=encoding,
                                                hash_key=hash_key,
                                                categorize=categorize).values)
            h = pd.Series(h, index=obj.index, dtype='uint64')
        else:
            raise TypeError("Unexpected type for hashing %s" % type(obj))
        return h
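
    # Worked sketch of the combine above (symbols h1, h2, hi are hypothetical
    # per-column and per-index hash arrays): for a two-column frame the row
    # hashes end up as (h1 * 3 + h2) * 3 + hi, evaluated in place on uint64
    # arrays, so any overflow wraps modulo 2**64.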

    def _hash_categorical(c, encoding, hash_key):
        hashed = hash_array(c.categories.values, encoding, hash_key,
                            categorize=False)
        mask = c.isnull()
        if len(hashed):
            result = hashed.take(c.codes)
        else:
            result = np.zeros(len(mask), dtype='uint64')
        if mask.any():
            result[mask] = np.iinfo(np.uint64).max
        return result
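
    # Sketch of the remap above (h_a, h_b are hypothetical category hashes):
    # for categories ['a', 'b'] and codes [0, 1, 1, -1], `take` spreads the
    # two hashes to [h_a, h_b, h_b, h_b] (code -1 wraps to the last entry),
    # and the null mask then overwrites that missing slot with
    # np.iinfo(np.uint64).max.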

    def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
        """Return an ndarray of uint64 hashes, one per element of `vals`."""
        if hash_key is None:
            hash_key = _default_hash_key

        # For categoricals, we hash the categories, then remap the codes to
        # the hash values. (This check is above the complex check so that we
        # don't ask numpy if categorical is a subdtype of complex, as it will
        # choke.)
        if is_categorical_dtype(vals.dtype):
            return _hash_categorical(vals, encoding, hash_key)

        # we'll be working with everything as 64-bit values, so handle this
        # 128-bit value early
        if np.issubdtype(vals.dtype, np.complex128):
            return hash_array(vals.real) + 23 * hash_array(vals.imag)

        # First, turn whatever array this is into unsigned 64-bit ints, if we
        # can manage it.
        if is_bool_array(vals):
            vals = vals.astype('u8')
        elif ((is_datetime64_dtype(vals) or
               is_timedelta64_dtype(vals) or
               is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
            vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
        else:
            # With repeated values, it's MUCH faster to categorize object
            # dtypes, then hash and rename categories. We allow skipping the
            # categorization when the values are known/likely to be unique.
            if categorize:
                codes, categories = pd.factorize(vals, sort=False)
                cat = pd.Categorical(codes, pd.Index(categories),
                                     ordered=False, fastpath=True)
                return _hash_categorical(cat, encoding, hash_key)
            vals = hash_object_array(vals, hash_key, encoding)

        # Then, redistribute these 64-bit ints within the space of 64-bit
        # ints (this is the finalization mix of splitmix64).
        vals ^= vals >> 30
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> 27
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> 31
        return vals
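
# Hedged demo (runnable as `python -m dask.dataframe.hashing`, given the
# relative import above; the `df`/`h` names are local to this sketch):
if __name__ == '__main__':
    df = pd.DataFrame({'x': [1, 2, 2], 'y': ['a', 'b', 'b']})
    h = hash_pandas_object(df)
    # One uint64 hash per row, aligned with the frame's index.
    assert isinstance(h, pd.Series)
    assert h.dtype == np.uint64
    assert (h.index == df.index).all()
    # Rows 1 and 2 hold identical values, so they collide once the index is
    # excluded from the hash.
    h_noindex = hash_pandas_object(df, index=False)
    assert h_noindex[1] == h_noindex[2]
    print(h)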