You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ORPA-pyOpenRPA/WPy32-3720/python-3.7.2/Lib/site-packages/dask/dataframe/hyperloglog.py

85 lines
2.5 KiB

# -*- coding: utf-8 -*-
u"""Implementation of HyperLogLog
This implements the HyperLogLog algorithm for cardinality estimation, found
in
Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier.
"HyperLogLog: the analysis of a near-optimal cardinality estimation
algorithm". 2007 Conference on Analysis of Algorithms. Nice, France
(2007)
"""
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd
from .hashing import hash_pandas_object
def compute_first_bit(a):
"Compute the position of the first nonzero bit for each int in an array."
# TODO: consider making this less memory-hungry
bits = np.bitwise_and.outer(a, 1 << np.arange(32))
bits = bits.cumsum(axis=1).astype(np.bool)
return 33 - bits.sum(axis=1)
def compute_hll_array(obj, b):
# b is the number of bits
if not 8 <= b <= 16:
raise ValueError('b should be between 8 and 16')
num_bits_discarded = 32 - b
m = 1 << b
# Get an array of the hashes
hashes = hash_pandas_object(obj, index=False)
if isinstance(hashes, pd.Series):
hashes = hashes._values
hashes = hashes.astype(np.uint32)
# Of the first b bits, which is the first nonzero?
j = hashes >> num_bits_discarded
first_bit = compute_first_bit(hashes)
# Pandas can do the max aggregation
df = pd.DataFrame({'j': j, 'first_bit': first_bit})
series = df.groupby('j').max()['first_bit']
# Return a dense array so we can concat them and get a result
# that is easy to deal with
return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8)
def reduce_state(Ms, b):
m = 1 << b
# We concatenated all of the states, now we need to get the max
# value for each j in both
Ms = Ms.reshape((len(Ms) // m), m)
return Ms.max(axis=0)
def estimate_count(Ms, b):
m = 1 << b
# Combine one last time
M = reduce_state(Ms, b)
# Estimate cardinality, no adjustments
alpha = 0.7213 / (1 + 1.079 / m)
E = alpha * m / (2.0 ** -M.astype('f8')).sum() * m
# ^^^^ starts as unsigned, need a signed type for
# negation operator to do something useful
# Apply adjustments for small / big cardinalities, if applicable
if E < 2.5 * m:
V = (M == 0).sum()
if V:
return m * np.log(m / V)
if E > 2**32 / 30.0:
return -2**32 * np.log1p(-E / 2**32)
return E