You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
144 lines
6.3 KiB
144 lines
6.3 KiB
from __future__ import absolute_import, division, print_function
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from .utils import PANDAS_VERSION
|
|
|
|
# In Pandas 0.19.2, a function to hash pandas objects was introduced. Object
|
|
# arrays are assumed to be strings, and are hashed with a cython implementation
|
|
# of siphash. However, the version in 0.19.2 hashes categoricals based on their
|
|
# integer codes, instead of taking into account the values they represent. This
|
|
# is fixed in pandas > 0.19.2. To support versions 0.19.0 and up, we do the
|
|
# following:
|
|
#
|
|
# - For versions > 0.19.2, we use the provided `hash_pandas_object` function.
|
|
# - For 0.19.0 through 0.19.2, we copy the definition of `hash_pandas_object`
|
|
# from pandas master (will be released as 0.20.0).
|
|
# - For 0.19.0 and 0.19.1, we use python's `hash` builtin to hash strings.
|
|
# - For 0.19.2, we use the `hash_object_array` method provided in pandas
|
|
# (an implementation of siphash)
|
|
#
|
|
# When dask drops support for pandas <= 0.19.2, all this can be removed.
|
|
|
|
# XXX: Pandas uses release branches > 0.19.0, which doesn't play well with
|
|
# versioneer, since the release tags aren't ancestors of master. As such, we
|
|
# need to use this hacky awfulness to check if we're > 0.19.2.
|
|
if PANDAS_VERSION >= '0.20.0':
|
|
from pandas.util import hash_pandas_object
|
|
elif PANDAS_VERSION not in ('0.19.1', '0.19.2') and PANDAS_VERSION > '0.19.0+460':
|
|
from pandas.tools.hashing import hash_pandas_object
|
|
else:
|
|
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
|
|
is_datetime64_dtype, is_timedelta64_dtype)
|
|
from pandas.lib import is_bool_array
|
|
|
|
if PANDAS_VERSION == '0.19.2':
|
|
from pandas._hash import hash_object_array
|
|
else: # 0.19.0 and 0.19.1
|
|
def hash_object_array(x, hash_key, encoding):
    """Hash every element of ``x`` with Python's builtin ``hash``.

    Fallback for pandas 0.19.0 and 0.19.1, used in place of the
    ``hash_object_array`` imported from ``pandas._hash`` on 0.19.2.

    Parameters
    ----------
    x : iterable of hashable objects
        The values to hash.
    hash_key, encoding
        Unused here; accepted so the call signature matches the pandas
        implementation this function stands in for.

    Returns
    -------
    numpy.ndarray
        One ``uint64`` hash per element of ``x``.
    """
    # ``hash`` returns a *signed* integer. Reduce it modulo 2**64 so that
    # negative hashes wrap to their two's-complement value instead of
    # overflowing the unsigned dtype (an OverflowError on recent numpy).
    mask = 0xFFFFFFFFFFFFFFFF
    return np.array([hash(i) & mask for i in x], dtype=np.uint64)
|
|
|
|
# 16 byte long hashing key
|
|
_default_hash_key = '0123456789123456'
|
|
|
|
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                       categorize=True):
    """Return a ``uint64`` Series holding one hash per row of ``obj``.

    Parameters
    ----------
    obj : pd.Index, pd.Series or pd.DataFrame
        The object to hash.
    index : bool, default True
        For a Series or DataFrame, also fold the hash of the index into
        each row's hash. Not consulted when ``obj`` is an Index.
    encoding : str, default 'utf8'
        Forwarded to ``hash_array``.
    hash_key : str, optional
        Forwarded to ``hash_array``; ``_default_hash_key`` when None.
    categorize : bool, default True
        Forwarded to ``hash_array``.

    Returns
    -------
    pd.Series
        ``uint64`` hashes, indexed by ``obj`` itself for an Index and by
        ``obj.index`` otherwise.

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series or DataFrame.

    Notes
    -----
    A DataFrame without any columns is not special-cased: ``next`` on the
    exhausted column iterator raises ``StopIteration``.
    """
    if hash_key is None:
        hash_key = _default_hash_key

    def _fold(acc, extra):
        # In place on ``acc``: acc = acc * 3 + extra.
        acc = np.multiply(acc, np.uint(3), acc)
        return np.add(acc, extra, acc)

    def _hash_values(values):
        return hash_array(values, encoding, hash_key, categorize)

    def _index_hashes():
        hashed_index = hash_pandas_object(obj.index,
                                          index=False,
                                          encoding=encoding,
                                          hash_key=hash_key,
                                          categorize=categorize)
        return hashed_index.values

    if isinstance(obj, pd.Index):
        labels = obj
        hashed = _hash_values(obj.values).astype('uint64')
    elif isinstance(obj, pd.Series):
        labels = obj.index
        hashed = _hash_values(obj.values).astype('uint64')
        if index:
            hashed = _fold(hashed, _index_hashes())
    elif isinstance(obj, pd.DataFrame):
        labels = obj.index
        columns = obj.iteritems()
        # The first column seeds the accumulator; the rest are folded in.
        _, first_column = next(columns)
        hashed = _hash_values(first_column.values).astype('uint64')
        for _, column in columns:
            hashed = _fold(hashed, _hash_values(column.values))
        if index:
            hashed = _fold(hashed, _index_hashes())
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))

    return pd.Series(hashed, index=labels, dtype='uint64')
|
|
|
|
def _hash_categorical(c, encoding, hash_key):
    """Hash a categorical by hashing its categories and remapping the codes.

    Parameters
    ----------
    c : pd.Categorical
        The categorical to hash.
    encoding, hash_key
        Forwarded to ``hash_array`` when hashing the category values.

    Returns
    -------
    numpy.ndarray
        One hash per element of ``c``; null entries are set to the
        largest ``uint64`` value.
    """
    # Hash each distinct category once (never re-categorizing them).
    category_hashes = hash_array(c.categories.values, encoding, hash_key,
                                 categorize=False)
    missing = c.isnull()

    # With no categories there is nothing to take from, so start from zeros.
    out = (category_hashes.take(c.codes) if len(category_hashes)
           else np.zeros(len(missing), dtype='uint64'))

    # Null entries get a fixed sentinel hash.
    if missing.any():
        out[missing] = np.iinfo(np.uint64).max

    return out
|
|
|
|
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """Hash an array element-wise into 64-bit unsigned integers.

    Parameters
    ----------
    vals : array-like with a ``dtype``
        The values to hash.
    encoding : str, default 'utf8'
        Passed to ``hash_object_array`` / ``_hash_categorical``.
    hash_key : str, optional
        Passed to ``hash_object_array`` / ``_hash_categorical``;
        ``_default_hash_key`` when None.
    categorize : bool, default True
        For values that are not handled as bool/datetime/timedelta/numeric,
        factorize first and hash only the distinct values. Much faster with
        repeated values; may be skipped when values are known/likely unique.

    Returns
    -------
    numpy.ndarray
        The element-wise hashes.
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals: hash the categories and remap the codes. This test must
    # precede the complex test so numpy is never asked whether a categorical
    # dtype is a subdtype of complex (it would choke).
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # Everything below works on 64-bit values, so split the 128-bit complex
    # case into its two halves up front. The recursive calls use the default
    # arguments.
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # Step 1: get the data into unsigned 64-bit integers where possible.
    if is_bool_array(vals):
        mixed = vals.astype('u8')
    else:
        fixed_width = (is_datetime64_dtype(vals) or
                       is_timedelta64_dtype(vals) or
                       is_numeric_dtype(vals))
        if fixed_width and vals.dtype.itemsize <= 8:
            # Reinterpret the raw bytes as unsigned ints of the same width,
            # then widen to 64 bits.
            same_width_uint = 'u{}'.format(vals.dtype.itemsize)
            mixed = vals.view(same_width_uint).astype('u8')
        elif categorize:
            codes, categories = pd.factorize(vals, sort=False)
            cat = pd.Categorical(codes, pd.Index(categories),
                                 ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)
        else:
            mixed = hash_object_array(vals, hash_key, encoding)

    # Step 2: redistribute these 64-bit ints within the 64-bit space.
    mixed ^= mixed >> 30
    mixed *= np.uint64(0xbf58476d1ce4e5b9)
    mixed ^= mixed >> 27
    mixed *= np.uint64(0x94d049bb133111eb)
    mixed ^= mixed >> 31
    return mixed
|