You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ORPA-pyOpenRPA/WPy32-3720/python-3.7.2/Lib/site-packages/dask/dataframe/indexing.py

328 lines
11 KiB

from __future__ import absolute_import, division, print_function
from datetime import datetime
from collections import defaultdict
import bisect
import numpy as np
import pandas as pd
from .core import new_dd_object, Series
from .utils import is_index_like
from . import methods
from ..base import tokenize
from ..highlevelgraph import HighLevelGraph
class _IndexerBase(object):
def __init__(self, obj):
self.obj = obj
@property
def _name(self):
return self.obj._name
@property
def _meta_indexer(self):
raise NotImplementedError
def _make_meta(self, iindexer, cindexer):
"""
get metadata
"""
if cindexer is None:
return self.obj
else:
return self._meta_indexer[:, cindexer]
class _iLocIndexer(_IndexerBase):
    """Helper class for the ``.iloc`` accessor.

    Only column selection of the form ``df.iloc[:, column_indexer]`` is
    supported; positional row selection is not implemented.
    """

    @property
    def _meta_indexer(self):
        return self.obj._meta.iloc

    def __getitem__(self, key):
        msg = ("'DataFrame.iloc' only supports selecting columns. "
               "It must be used like 'df.iloc[:, column_indexer]'.")
        # Reject anything that is not a 2-d tuple selection up front.
        if not isinstance(key, tuple):
            raise NotImplementedError(msg)
        if len(key) > 2:
            raise ValueError("Too many indexers")
        row_key, col_key = key
        # The row part must be the full slice ``:``.
        if row_key != slice(None):
            raise NotImplementedError(msg)
        return self._iloc(row_key, col_key)

    def _iloc(self, iindexer, cindexer):
        # Row indexer is guaranteed to be ``:`` by __getitem__.
        assert iindexer == slice(None)
        meta = self._make_meta(iindexer, cindexer)
        return self.obj.map_partitions(methods.iloc, cindexer, meta=meta)
class _LocIndexer(_IndexerBase):
    """ Helper class for the .loc accessor """

    @property
    def _meta_indexer(self):
        return self.obj._meta.loc

    def __getitem__(self, key):
        # Split a 2-d selection ``df.loc[rows, cols]`` into its parts;
        # a plain key selects rows only.
        if isinstance(key, tuple):
            # multi-dimensional selection
            if len(key) > self.obj.ndim:
                # raise from pandas
                msg = 'Too many indexers'
                raise pd.core.indexing.IndexingError(msg)
            iindexer = key[0]
            cindexer = key[1]
        else:
            # if self.obj is Series, cindexer is always None
            iindexer = key
            cindexer = None
        return self._loc(iindexer, cindexer)

    def _loc(self, iindexer, cindexer):
        """ Helper function for the .loc accessor """
        # Dispatch on the type of the row indexer. Series indexers are
        # handled per-partition; everything else depends on whether the
        # divisions (partition boundaries) are known.
        if isinstance(iindexer, Series):
            return self._loc_series(iindexer, cindexer)

        if self.obj.known_divisions:
            iindexer = self._maybe_partial_time_string(iindexer)
            if isinstance(iindexer, slice):
                return self._loc_slice(iindexer, cindexer)
            elif isinstance(iindexer, (list, np.ndarray)):
                return self._loc_list(iindexer, cindexer)
            else:
                # element should raise KeyError
                return self._loc_element(iindexer, cindexer)
        else:
            if isinstance(iindexer, (list, np.ndarray)):
                # applying map_partitions to each partition
                # results in duplicated NaN rows
                msg = 'Cannot index with list against unknown division'
                raise KeyError(msg)
            elif not isinstance(iindexer, slice):
                # normalize a single label to a one-label slice so every
                # partition can apply it safely
                iindexer = slice(iindexer, iindexer)
            meta = self._make_meta(iindexer, cindexer)
            return self.obj.map_partitions(methods.try_loc, iindexer, cindexer,
                                           meta=meta)

    def _maybe_partial_time_string(self, iindexer):
        """
        Convert index-indexer for partial time string slicing
        if obj.index is DatetimeIndex / PeriodIndex
        """
        iindexer = _maybe_partial_time_string(self.obj._meta_nonempty.index,
                                              iindexer, kind='loc')
        return iindexer

    def _loc_series(self, iindexer, cindexer):
        # Row selection driven by a (typically boolean) Series, applied
        # partition-by-partition.
        meta = self._make_meta(iindexer, cindexer)
        return self.obj.map_partitions(methods.loc, iindexer, cindexer,
                                       token='loc-series', meta=meta)

    def _loc_list(self, iindexer, cindexer):
        # Selection by an explicit list of labels: group the labels by the
        # partition they fall into, then emit one loc task per partition.
        name = 'loc-%s' % tokenize(iindexer, self.obj)
        parts = self._get_partitions(iindexer)
        meta = self._make_meta(iindexer, cindexer)

        if len(iindexer):
            dsk = {}
            divisions = []
            items = sorted(parts.items())
            for i, (div, indexer) in enumerate(items):
                dsk[name, i] = (methods.loc, (self._name, div),
                                indexer, cindexer)
                # append minimum value as division
                divisions.append(sorted(indexer)[0])
            # append maximum value of the last division
            divisions.append(sorted(items[-1][1])[-1])
        else:
            # empty selection: a single empty partition with unknown divisions
            divisions = [None, None]
            dsk = {(name, 0): meta.head(0)}
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self.obj])
        return new_dd_object(graph, name, meta=meta, divisions=divisions)

    def _loc_element(self, iindexer, cindexer):
        # Selection of a single label: locate its partition and emit one task.
        name = 'loc-%s' % tokenize(iindexer, self.obj)
        part = self._get_partitions(iindexer)

        if iindexer < self.obj.divisions[0] or iindexer > self.obj.divisions[-1]:
            raise KeyError('the label [%s] is not in the index' % str(iindexer))

        dsk = {(name, 0): (methods.loc, (self._name, part),
                           slice(iindexer, iindexer), cindexer)}
        meta = self._make_meta(iindexer, cindexer)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self.obj])
        return new_dd_object(graph, name, meta=meta, divisions=[iindexer, iindexer])

    def _get_partitions(self, keys):
        # Map label(s) to partition number(s) using the known divisions.
        if isinstance(keys, (list, np.ndarray)):
            return _partitions_of_index_values(self.obj.divisions, keys)
        else:
            # element
            return _partition_of_index_value(self.obj.divisions, keys)

    def _coerce_loc_index(self, key):
        return _coerce_loc_index(self.obj.divisions, key)

    def _loc_slice(self, iindexer, cindexer):
        # Selection by a label slice: only contiguous, step-1 slices are
        # supported. Builds one task per touched partition, trimming the
        # first and last partitions to the slice bounds.
        name = 'loc-%s' % tokenize(iindexer, cindexer, self)
        assert isinstance(iindexer, slice)
        assert iindexer.step in (None, 1)

        # First/last partition numbers touched by the slice.
        if iindexer.start is not None:
            start = self._get_partitions(iindexer.start)
        else:
            start = 0
        if iindexer.stop is not None:
            stop = self._get_partitions(iindexer.stop)
        else:
            stop = self.obj.npartitions - 1

        # Effective index values of the slice boundaries.
        if iindexer.start is None and self.obj.known_divisions:
            istart = self.obj.divisions[0]
        else:
            istart = self._coerce_loc_index(iindexer.start)
        if iindexer.stop is None and self.obj.known_divisions:
            istop = self.obj.divisions[-1]
        else:
            istop = self._coerce_loc_index(iindexer.stop)

        if stop == start:
            # Slice is contained in a single partition.
            dsk = {(name, 0): (methods.loc, (self._name, start),
                               slice(iindexer.start, iindexer.stop), cindexer)}
            divisions = [istart, istop]
        else:
            # First partition: trim on the left only.
            dsk = {(name, 0): (methods.loc, (self._name, start),
                               slice(iindexer.start, None), cindexer)}
            # Middle partitions pass through whole (aliased when no column
            # selection is needed).
            for i in range(1, stop - start):
                if cindexer is None:
                    dsk[name, i] = (self._name, start + i)
                else:
                    dsk[name, i] = (methods.loc, (self._name, start + i),
                                    slice(None, None), cindexer)
            # Last partition: trim on the right only.
            dsk[name, stop - start] = (methods.loc, (self._name, stop),
                                       slice(None, iindexer.stop), cindexer)

            if iindexer.start is None:
                div_start = self.obj.divisions[0]
            else:
                div_start = max(istart, self.obj.divisions[start])
            if iindexer.stop is None:
                div_stop = self.obj.divisions[-1]
            else:
                div_stop = min(istop, self.obj.divisions[stop + 1])
            divisions = ((div_start, ) +
                         self.obj.divisions[start + 1:stop + 1] +
                         (div_stop, ))

        # One more division boundary than tasks/partitions, by construction.
        assert len(divisions) == len(dsk) + 1

        meta = self._make_meta(iindexer, cindexer)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[self.obj])
        return new_dd_object(graph, name, meta=meta, divisions=divisions)
def _partition_of_index_value(divisions, val):
    """ In which partition does this value lie?
    >>> _partition_of_index_value([0, 5, 10], 3)
    0
    >>> _partition_of_index_value([0, 5, 10], 8)
    1
    >>> _partition_of_index_value([0, 5, 10], 100)
    1
    >>> _partition_of_index_value([0, 5, 10], 5)  # left-inclusive divisions
    1
    """
    if divisions[0] is None:
        msg = "Can not use loc on DataFrame without known divisions"
        raise ValueError(msg)
    coerced = _coerce_loc_index(divisions, val)
    pos = bisect.bisect_right(divisions, coerced) - 1
    # Clamp out-of-range values into the valid partition range
    # [0, npartitions - 1].
    return min(len(divisions) - 2, max(0, pos))
def _partitions_of_index_values(divisions, values):
""" Return defaultdict of division and values pairs
Each key corresponds to the division which values are index values belong
to the division.
>>> sorted(_partitions_of_index_values([0, 5, 10], [3]).items())
[(0, [3])]
>>> sorted(_partitions_of_index_values([0, 5, 10], [3, 8, 5]).items())
[(0, [3]), (1, [8, 5])]
"""
if divisions[0] is None:
msg = "Can not use loc on DataFrame without known divisions"
raise ValueError(msg)
results = defaultdict(list)
values = pd.Index(values, dtype=object)
for val in values:
i = bisect.bisect_right(divisions, val)
div = min(len(divisions) - 2, max(0, i - 1))
results[div].append(val)
return results
def _coerce_loc_index(divisions, o):
""" Transform values to be comparable against divisions
This is particularly valuable to use with pandas datetimes
"""
if divisions and isinstance(divisions[0], datetime):
return pd.Timestamp(o)
if divisions and isinstance(divisions[0], np.datetime64):
return np.datetime64(o).astype(divisions[0].dtype)
return o
def _maybe_partial_time_string(index, indexer, kind):
    """
    Convert indexer for partial string selection
    if data has DatetimeIndex/PeriodIndex

    Parameters
    ----------
    index : pd.Index
        Concrete (non-dask) index from the metadata; must be index-like.
    indexer : slice, str, or other label
        The raw ``.loc`` row indexer supplied by the user.
    kind : str
        Forwarded to ``Index._maybe_cast_slice_bound`` (e.g. ``'loc'``).

    Returns
    -------
    The indexer with any partial datetime strings (e.g. ``'2011'``)
    cast to concrete slice bounds; returned unchanged when the index
    is not datetime-like.
    """
    # do not pass dd.Index
    assert is_index_like(index)
    if not isinstance(index, (pd.DatetimeIndex, pd.PeriodIndex)):
        return indexer

    # ``str`` replaces ``pd.compat.string_types``, which was removed in
    # pandas 1.0; on Python 3 the two are equivalent.
    if isinstance(indexer, slice):
        # Cast each string bound separately; non-string bounds pass through.
        if isinstance(indexer.start, str):
            start = index._maybe_cast_slice_bound(indexer.start, 'left', kind)
        else:
            start = indexer.start
        if isinstance(indexer.stop, str):
            stop = index._maybe_cast_slice_bound(indexer.stop, 'right', kind)
        else:
            stop = indexer.stop
        return slice(start, stop)
    elif isinstance(indexer, str):
        # A single partial string selects the whole matching range.
        start = index._maybe_cast_slice_bound(indexer, 'left', 'loc')
        stop = index._maybe_cast_slice_bound(indexer, 'right', 'loc')
        return slice(min(start, stop), max(start, stop))
    return indexer