ORPA-pyOpenRPA/Resources/WPy64-3720/python-3.7.2.amd64/Lib/site-packages/dask/dataframe/io/hdf.py

from __future__ import absolute_import, division, print_function
from fnmatch import fnmatch
from glob import glob
import os
import uuid
from warnings import warn
import pandas as pd
from toolz import merge
from .io import _link
from ...base import get_scheduler
from ..core import DataFrame, new_dd_object
from ... import config, multiprocessing
from ...base import tokenize, compute_as_if_collection
from ...bytes.utils import build_name_function
from ...compatibility import PY3
from ...delayed import Delayed, delayed
from ...utils import get_scheduler_lock
def _pd_to_hdf(pd_to_hdf, lock, args, kwargs=None):
    """ A wrapper function around pd_to_hdf that enables locking"""
    if lock:
        lock.acquire()
    try:
        pd_to_hdf(*args, **kwargs)
    finally:
        if lock:
            lock.release()

    return None
def to_hdf(df, path, key, mode='a', append=False, scheduler=None,
           name_function=None, compute=True, lock=None, dask_kwargs={},
           **kwargs):
    """ Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name. Please
    see the Pandas docstring for more detailed information about shared keyword
    arguments.

    This function differs from the Pandas version by saving the many partitions
    of a Dask DataFrame in parallel, either to many files, or to many datasets
    within the same file. You may specify this parallelism with an asterisk
    ``*`` within the filename or datapath, and an optional ``name_function``.
    The asterisk will be replaced with an increasing sequence of integers
    starting from ``0`` or with the result of calling ``name_function`` on each
    of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string
        Path to a target filename. May contain a ``*`` to denote many filenames
    key : string
        Datapath within the files. May contain a ``*`` to denote many locations
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return a
        string. (see examples below)
    compute : bool
        Whether or not to execute immediately. If False then this returns a
        ``dask.Delayed`` value.
    lock : Lock, optional
        Lock to use to prevent concurrency issues. By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save data to a single file

    >>> df.to_hdf('output.hdf', '/data')  # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')  # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')  # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes')  # doctest: +SKIP

    Specify a custom naming scheme. This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc.:

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return str(base + timedelta(days=i))
    >>> df.to_hdf('*.hdf', '/data', name_function=name_function)  # doctest: +SKIP
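
    Defer the write by passing ``compute=False`` and compute the returned
    delayed value later (a sketch; the filename pattern is illustrative):

    >>> out = df.to_hdf('output-*.hdf', '/data', compute=False)  # doctest: +SKIP
    >>> import dask
    >>> dask.compute(out)  # doctest: +SKIP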

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each partition
        is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute ``to_hdf``
        when computed.

    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = 'to-hdf-' + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, 'to_hdf')

    single_file = True
    single_node = True

    # if path is string, format using i_name
    if isinstance(path, str):
        if path.count('*') + key.count('*') > 1:
            raise ValueError("A maximum of one asterisk is accepted in file "
                             "path and dataset key")

        fmt_obj = lambda path, i_name: path.replace('*', i_name)

        if '*' in path:
            single_file = False
    else:
        if key.count('*') > 1:
            raise ValueError("A maximum of one asterisk is accepted in "
                             "dataset key")

        fmt_obj = lambda path, _: path

    if '*' in key:
        single_node = False

    if 'format' in kwargs and kwargs['format'] not in ['t', 'table']:
        raise ValueError("Dask only supports the 'table' format in hdf files.")

    if mode not in ('a', 'w', 'r+'):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it's saved and read
    # so we enforce name_function to maintain the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn("To preserve order between partitions name_function "
                 "must preserve the order of its input")

    # If the user did not specify a scheduler and the write is sequential,
    # default to the single-threaded scheduler; otherwise let get_scheduler
    # choose the scheduler
    if (scheduler is None and
            not config.get('scheduler', None) and
            single_node and single_file):
        scheduler = 'single-threaded'

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not multiprocessing.get:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    if lock:
        lock = get_scheduler_lock(df, scheduler=scheduler)

    kwargs.update({'format': 'table', 'mode': mode, 'append': append})
    dsk = dict()

    i_name = name_function(0)
    dsk[(name, 0)] = (_pd_to_hdf, pd_to_hdf, lock,
                      [(df._name, 0), fmt_obj(path, i_name),
                       key.replace('*', i_name)], kwargs)

    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2['mode'] = 'a'
    if single_node:
        kwargs2['append'] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (_pd_to_hdf, pd_to_hdf, lock,
                [(df._name, i), fmt_obj(path, i_name),
                 key.replace('*', i_name)], kwargs2)
        if single_file:
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(DataFrame, dsk, keys,
                                 scheduler=scheduler, **dask_kwargs)
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])
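
# Rough shape of the graph built by to_hdf above for a two-partition write to
# multiple files (an illustrative sketch only; the real name embeds a uuid
# token, and _link chains tasks when writing into a single file):
#
#   {('to-hdf-<token>', 0): (_pd_to_hdf, pd_to_hdf, lock,
#                            [(df._name, 0), 'output-0.hdf', '/data'], kwargs),
#    ('to-hdf-<token>', 1): (_pd_to_hdf, pd_to_hdf, lock,
#                            [(df._name, 1), 'output-1.hdf', '/data'], kwargs2)}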
dont_use_fixed_error_message = """
This HDFStore is not partitionable and can only be used monolithically with
pandas. In the future when creating HDFStores use the ``format='table'``
option to ensure that your dataset can be parallelized"""

read_hdf_error_msg = """
The start and stop keywords are not supported when reading from more than
one file/dataset.

The combination is ambiguous because it could be interpreted as the starting
and stopping index per file, or starting and stopping index of the global
dataset."""
def _read_single_hdf(path, key, start=0, stop=None, columns=None,
                     chunksize=int(1e6), sorted_index=False, lock=None,
                     mode='a'):
    """
    Read a single hdf file into a dask.dataframe. Used for each file in
    read_hdf.
    """
    def get_keys_stops_divisions(path, key, stop, sorted_index, chunksize):
        """
        Get the "keys" or group identifiers which match the given key, which
        can contain wildcards. This uses the hdf file identified by the
        given path. Also get the index of the last row of data for each matched
        key.
        """
        with pd.HDFStore(path, mode=mode) as hdf:
            keys = [k for k in hdf.keys() if fnmatch(k, key)]
            stops = []
            divisions = []
            for k in keys:
                storer = hdf.get_storer(k)
                if storer.format_type != 'table':
                    raise TypeError(dont_use_fixed_error_message)
                if stop is None:
                    stops.append(storer.nrows)
                elif stop > storer.nrows:
                    raise ValueError("Stop keyword exceeds dataset number "
                                     "of rows ({})".format(storer.nrows))
                else:
                    stops.append(stop)
                if sorted_index:
                    division = [storer.read_column('index', start=start, stop=start + 1)[0]
                                for start in range(0, storer.nrows, chunksize)]
                    division_end = storer.read_column('index',
                                                      start=storer.nrows - 1,
                                                      stop=storer.nrows)[0]
                    division.append(division_end)
                    divisions.append(division)
                else:
                    divisions.append(None)

        return keys, stops, divisions
    def one_path_one_key(path, key, start, stop, columns, chunksize, division, lock):
        """
        Get the data frame corresponding to one path and one key (which should
        not contain any wildcards).
        """
        empty = pd.read_hdf(path, key, mode=mode, stop=0)
        if columns is not None:
            empty = empty[columns]

        token = tokenize((path, os.path.getmtime(path), key, start,
                          stop, empty, chunksize, division))
        name = 'read-hdf-' + token

        if empty.ndim == 1:
            base = {'name': empty.name, 'mode': mode}
        else:
            base = {'columns': empty.columns, 'mode': mode}

        if start >= stop:
            raise ValueError("Start row number ({}) is above or equal to stop "
                             "row number ({})".format(start, stop))

        def update(s):
            new = base.copy()
            new.update({'start': s, 'stop': s + chunksize})
            return new

        dsk = dict(((name, i), (_pd_read_hdf, path, key, lock,
                                update(s)))
                   for i, s in enumerate(range(start, stop, chunksize)))

        if division:
            divisions = division
        else:
            divisions = [None] * (len(dsk) + 1)

        return new_dd_object(dsk, name, empty, divisions)

    keys, stops, divisions = get_keys_stops_divisions(path, key, stop, sorted_index, chunksize)
    if (start != 0 or stop is not None) and len(keys) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    from ..multi import concat
    return concat([one_path_one_key(path, k, start, s, columns, chunksize, d, lock)
                   for k, s, d in zip(keys, stops, divisions)])
def _pd_read_hdf(path, key, lock, kwargs):
    """ Read from hdf5 file with a lock """
    if lock:
        lock.acquire()
    try:
        result = pd.read_hdf(path, key, **kwargs)
    finally:
        if lock:
            lock.release()
    return result
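
# Each read task built by one_path_one_key above calls _pd_read_hdf with a
# kwargs dict of this shape (illustrative values only; 'columns' is the empty
# frame's column Index and 'mode' is whatever mode was requested):
#   {'columns': empty.columns, 'mode': 'a', 'start': 0, 'stop': 1000000}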
def read_hdf(pattern, key, start=0, stop=None, columns=None,
             chunksize=1000000, sorted_index=False, lock=True, mode='a'):
    """
    Read HDF files into a Dask DataFrame

    Read hdf files into a dask dataframe. This function is like
    ``pandas.read_hdf``, except it can read from a single large file, or from
    multiple files, or from multiple keys from the same file.

    Parameters
    ----------
    pattern : string, list
        File pattern (string), buffer to read from, or list of file
        paths. Can contain wildcards.
    key : group identifier in the store. Can contain wildcards
    start : optional, integer (defaults to 0), row number to start at
    stop : optional, integer (defaults to None, the last row), row number to
        stop at
    columns : list of columns, optional
        A list of columns that if not None, will limit the return
        columns (default is None)
    chunksize : positive integer, optional
        Maximal number of rows per partition (default is 1000000).
    sorted_index : boolean, optional
        Option to specify whether or not the input hdf files have a sorted
        index (default is False).
    lock : boolean, optional
        Option to use a lock to prevent concurrency issues (default is True).
    mode : {'a', 'r', 'r+'}, default 'a'. Mode to use when opening file(s).
        'r'
            Read-only; no data can be modified.
        'a'
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        'r+'
            It is similar to 'a', but the file must already exist.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_hdf('myfile.1.hdf5', '/x')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_hdf('myfile.*.hdf5', '/x')  # doctest: +SKIP

    >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x')  # doctest: +SKIP

    Load multiple datasets

    >>> dd.read_hdf('myfile.1.hdf5', '/*')  # doctest: +SKIP
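
    Load a single dataset in 100,000-row partitions and treat the on-disk
    index as already sorted, using it for divisions (a sketch; assumes the
    stored index is monotonically increasing):

    >>> dd.read_hdf('myfile.1.hdf5', '/x', chunksize=100000,
    ...             sorted_index=True)  # doctest: +SKIP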
"""
    if lock is True:
        lock = get_scheduler_lock()

    key = key if key.startswith('/') else '/' + key

    if isinstance(pattern, str):
        paths = sorted(glob(pattern))
    else:
        paths = pattern

    if (start != 0 or stop is not None) and len(paths) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    if chunksize <= 0:
        raise ValueError("Chunksize must be a positive integer")
    if (start != 0 or stop is not None) and sorted_index:
        raise ValueError("When assuming pre-partitioned data, data must be "
                         "read in its entirety using the same chunksizes")

    from ..multi import concat
    return concat([_read_single_hdf(path, key, start=start, stop=stop,
                                    columns=columns, chunksize=chunksize,
                                    sorted_index=sorted_index,
                                    lock=lock, mode=mode)
                   for path in paths])
if PY3:
    from ..core import _Frame
    _Frame.to_hdf.__doc__ = to_hdf.__doc__