# ORPA-pyOpenRPA/Resources/WPy64-3720/python-3.7.2.amd64/Lib/site-packages/dask/dataframe/io/parquet.py


from __future__ import absolute_import, division, print_function
import re
from collections import OrderedDict
import copy
import json
import os
from distutils.version import LooseVersion
import numpy as np
import pandas as pd
from ..core import DataFrame, Series
from ..utils import (clear_known_categories, strip_unknown_categories,
UNKNOWN_CATEGORIES)
from ...bytes.compression import compress
from ...base import tokenize
from ...compatibility import PY3, string_types
from ...delayed import delayed
from ...bytes.core import get_fs_token_paths
from ...bytes.utils import infer_storage_options
from ...utils import import_required, natural_sort_key
from .utils import _get_pyarrow_dtypes, _meta_from_dtypes
__all__ = ('read_parquet', 'to_parquet')
def _parse_pandas_metadata(pandas_metadata):
"""Get the set of names from the pandas metadata section
Parameters
----------
pandas_metadata : dict
Should conform to the pandas parquet metadata spec
Returns
-------
index_names : list
List of strings indicating the actual index names
column_names : list
List of strings indicating the actual column names
storage_name_mapping : dict
Pairs of storage names (e.g. the field names for
PyArrow) and actual names. The storage and field names will
differ for index names for certain writers (pyarrow > 0.8).
column_indexes_names : list
The names for ``df.columns.name`` or ``df.columns.names`` for
a MultiIndex in the columns
Notes
-----
This should support metadata written by at least
* fastparquet>=0.1.3
* pyarrow>=0.7.0
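    Examples
    --------
    A minimal, illustrative metadata dict (pyarrow >= 0.8 style, with one
    data column ``'A'`` and an unnamed index):

    >>> md = {'index_columns': ['__index_level_0__'],
    ...       'columns': [{'name': 'A', 'field_name': 'A'},
    ...                   {'name': None, 'field_name': '__index_level_0__'}],
    ...       'column_indexes': [{'name': None}]}
    >>> _parse_pandas_metadata(md)  # doctest: +SKIP
    ([None], ['A'], {'A': 'A', '__index_level_0__': None}, [None])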
"""
index_storage_names = pandas_metadata['index_columns']
index_name_xpr = re.compile(r'__index_level_\d+__')
# older metadatas will not have a 'field_name' field so we fall back
# to the 'name' field
pairs = [(x.get('field_name', x['name']), x['name'])
for x in pandas_metadata['columns']]
# Need to reconcile storage and real names. These will differ for
    # pyarrow, which uses __index_level_\d+__ for the storage name of indexes.
# The real name may be None (e.g. `df.index.name` is None).
pairs2 = []
for storage_name, real_name in pairs:
if real_name and index_name_xpr.match(real_name):
real_name = None
pairs2.append((storage_name, real_name))
index_names = [name for (storage_name, name) in pairs2
if name != storage_name]
# column_indexes represents df.columns.name
    # It was added to the spec after pandas 0.21.0, and implemented
    # in PyArrow 0.8. It's not currently implemented in fastparquet.
column_index_names = pandas_metadata.get("column_indexes", [{'name': None}])
column_index_names = [x['name'] for x in column_index_names]
# Now we need to disambiguate between columns and index names. PyArrow
# 0.8.0+ allows for duplicates between df.index.names and df.columns
if not index_names:
        # For PyArrow < 0.8, or any fastparquet. This relies on the facts that
# 1. Those versions used the real index name as the index storage name
# 2. Those versions did not allow for duplicate index / column names
# So we know that if a name is in index_storage_names, it must be an
# index name
index_names = list(index_storage_names) # make a copy
index_storage_names2 = set(index_storage_names)
column_names = [name for (storage_name, name)
in pairs if name not in index_storage_names2]
else:
# For newer PyArrows the storage names differ from the index names
# iff it's an index level. Though this is a fragile assumption for
# other systems...
column_names = [name for (storage_name, name) in pairs2
if name == storage_name]
storage_name_mapping = dict(pairs2) # TODO: handle duplicates gracefully
return index_names, column_names, storage_name_mapping, column_index_names
def _normalize_index_columns(user_columns, data_columns, user_index, data_index):
"""Normalize user and file-provided column and index names
Parameters
----------
user_columns : None, str or list of str
data_columns : list of str
user_index : None, str, or list of str
data_index : list of str
Returns
-------
column_names : list of str
index_names : list of str
    out_type : {Series, DataFrame}
        The dask collection class to construct for the output.
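    Examples
    --------
    Illustrative call that selects a single column (returned as a Series)
    and keeps the index found in the file metadata; all names here are
    placeholders:

    >>> _normalize_index_columns('x', ['x', 'y'], None, ['idx'])  # doctest: +SKIP
    (['x'], ['idx'], <class 'dask.dataframe.core.Series'>)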
"""
specified_columns = user_columns is not None
specified_index = user_index is not None
out_type = DataFrame
if user_columns is None:
user_columns = list(data_columns)
elif isinstance(user_columns, string_types):
user_columns = [user_columns]
out_type = Series
else:
user_columns = list(user_columns)
if user_index is None:
user_index = data_index
elif user_index is False:
# When index is False, use no index and all fields should be treated as
# columns (unless `columns` provided).
user_index = []
data_columns = data_index + data_columns
elif isinstance(user_index, string_types):
user_index = [user_index]
else:
user_index = list(user_index)
if specified_index and not specified_columns:
# Only `index` provided. Use specified index, and all column fields
# that weren't specified as indices
index_names = user_index
column_names = [x for x in data_columns if x not in index_names]
elif specified_columns and not specified_index:
# Only `columns` provided. Use specified columns, and all index fields
# that weren't specified as columns
column_names = user_columns
index_names = [x for x in data_index if x not in column_names]
elif specified_index and specified_columns:
# Both `index` and `columns` provided. Use as specified, but error if
# they intersect.
column_names = user_columns
index_names = user_index
if set(column_names).intersection(index_names):
raise ValueError("Specified index and column names must not "
"intersect")
else:
# Use default columns and index from the metadata
column_names = data_columns
index_names = data_index
return column_names, index_names, out_type
# ----------------------------------------------------------------------
# Fastparquet interface
def _read_fastparquet(fs, fs_token, paths, columns=None, filters=None,
categories=None, index=None, infer_divisions=None):
import fastparquet
if isinstance(paths, fastparquet.api.ParquetFile):
pf = paths
elif len(paths) > 1:
if infer_divisions is not False:
# this scans all the files, allowing index/divisions and filtering
pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
else:
return _read_fp_multifile(fs, fs_token, paths, columns=columns,
categories=categories, index=index)
else:
try:
pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
open_with=fs.open,
sep=fs.sep)
except Exception:
pf = fastparquet.ParquetFile(paths[0], open_with=fs.open, sep=fs.sep)
# Validate infer_divisions
if os.path.split(pf.fn)[-1] != '_metadata' and infer_divisions is True:
raise NotImplementedError("infer_divisions=True is not supported by the "
"fastparquet engine for datasets that "
"do not contain a global '_metadata' file")
(meta, filters, index_name, out_type, all_columns, index_names,
storage_name_mapping) = _pf_validation(
pf, columns, index, categories, filters)
rgs = [rg for rg in pf.row_groups if
not (fastparquet.api.filter_out_stats(rg, filters, pf.schema)) and
not (fastparquet.api.filter_out_cats(rg, filters))]
name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
categories)
dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
index_names, all_columns, rg, out_type == Series,
categories, pf.schema, pf.cats, pf.dtypes,
pf.file_scheme, storage_name_mapping)
for i, rg in enumerate(rgs)}
if not dsk:
# empty dataframe
dsk = {(name, 0): meta}
divisions = (None, None)
return out_type(dsk, name, meta, divisions)
if index_names and infer_divisions is not False:
index_name = meta.index.name
try:
# is https://github.com/dask/fastparquet/pull/371 available in
# current fastparquet installation?
minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
except TypeError:
minmax = fastparquet.api.sorted_partitioned_columns(pf)
if index_name in minmax:
divisions = minmax[index_name]
divisions = divisions['min'] + [divisions['max'][-1]]
else:
if infer_divisions is True:
raise ValueError(
("Unable to infer divisions for index of '{index_name}'"
" because it is not known to be "
"sorted across partitions").format(index_name=index_name))
divisions = (None,) * (len(rgs) + 1)
else:
if infer_divisions is True:
raise ValueError(
                'Unable to infer divisions because no index column'
' was discovered')
divisions = (None,) * (len(rgs) + 1)
if isinstance(divisions[0], np.datetime64):
divisions = [pd.Timestamp(d) for d in divisions]
return out_type(dsk, name, meta, divisions)
def _pf_validation(pf, columns, index, categories, filters):
"""Validate user options against metadata in dataset
columns, index and categories must be in the list of columns available
(both data columns and path-based partitioning - subject to possible
renaming, if pandas metadata is present). The output index will
be inferred from any available pandas metadata, if not given.
"""
from fastparquet.util import check_column_names
check_column_names(pf.columns, categories)
check_column_names(pf.columns + list(pf.cats or []), columns)
if isinstance(columns, tuple):
# ensure they tokenize the same
columns = list(columns)
if pf.fmd.key_value_metadata:
pandas_md = [x.value for x in pf.fmd.key_value_metadata
if x.key == 'pandas']
else:
pandas_md = []
if len(pandas_md) == 0:
# Fall back to the storage information
index_names = pf._get_index()
if not isinstance(index_names, list):
index_names = [index_names]
column_names = pf.columns + list(pf.cats)
storage_name_mapping = {k: k for k in column_names}
elif len(pandas_md) == 1:
index_names, column_names, storage_name_mapping, column_index_names = (
_parse_pandas_metadata(json.loads(pandas_md[0]))
)
column_names.extend(pf.cats)
else:
raise ValueError("File has multiple entries for 'pandas' metadata")
# Normalize user inputs
if filters is None:
filters = []
column_names, index_names, out_type = _normalize_index_columns(
columns, column_names, index, index_names)
if categories is None:
categories = pf.categories
elif isinstance(categories, string_types):
categories = [categories]
else:
categories = list(categories)
# TODO: write partition_on to pandas metadata...
all_columns = list(column_names)
all_columns.extend(x for x in index_names if x not in column_names)
dtypes = pf._dtypes(categories)
dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}
meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])
# fastparquet doesn't handle multiindex
if len(index_names) > 1:
raise ValueError("Cannot read DataFrame with MultiIndex.")
elif len(index_names) == 0:
index_names = None
for cat in categories:
if cat in meta:
meta[cat] = pd.Series(pd.Categorical([],
categories=[UNKNOWN_CATEGORIES]),
index=meta.index)
for catcol in pf.cats:
if catcol in meta.columns:
meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
elif meta.index.name == catcol:
meta.index = meta.index.set_categories(pf.cats[catcol])
if out_type == Series:
assert len(meta.columns) == 1
meta = meta[meta.columns[0]]
return (meta, filters, index_names, out_type, all_columns, index_names,
storage_name_mapping)
def _read_fp_multifile(fs, fs_token, paths, columns=None,
categories=None, index=None):
"""Read dataset with fastparquet by assuming metadata from first file"""
from fastparquet import ParquetFile
from fastparquet.util import analyse_paths, get_file_scheme, join_path
base, fns = analyse_paths(paths)
parsed_paths = [join_path(p) for p in paths]
scheme = get_file_scheme(fns)
pf = ParquetFile(paths[0], open_with=fs.open)
pf.file_scheme = scheme
pf.cats = _paths_to_cats(fns, scheme)
(meta, _, index_name, out_type, all_columns, index_names,
storage_name_mapping) = _pf_validation(
pf, columns, index, categories, [])
name = 'read-parquet-' + tokenize(fs_token, paths, all_columns,
categories)
dsk = {(name, i): (_read_pf_simple, fs, path, base,
index_names, all_columns, out_type == Series,
categories, pf.cats,
pf.file_scheme, storage_name_mapping)
for i, path in enumerate(parsed_paths)}
divisions = (None, ) * (len(paths) + 1)
return out_type(dsk, name, meta, divisions)
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
categories, cats, scheme, storage_name_mapping):
"""Read dataset with fastparquet using ParquetFile machinery"""
from fastparquet import ParquetFile
pf = ParquetFile(path, open_with=fs.open)
relpath = path.replace(base, '').lstrip('/')
for rg in pf.row_groups:
for ch in rg.columns:
ch.file_path = relpath
pf.file_scheme = scheme
pf.cats = cats
pf.fn = base
df = pf.to_pandas(all_columns, categories, index=index_names)
if df.index.nlevels == 1:
if index_names:
df.index.name = storage_name_mapping.get(index_names[0],
index_names[0])
else:
if index_names:
df.index.names = [storage_name_mapping.get(name, name)
for name in index_names]
df.columns = [storage_name_mapping.get(col, col)
for col in all_columns
if col not in (index_names or [])]
if is_series:
return df[df.columns[0]]
else:
return df
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
from fastparquet.util import ex_from_sep, val_to_num, groupby_types
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep('/')
if scheme == 'hive':
partitions = s.findall(path)
for key, val in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split('/')[:-1]):
key = 'dir%i' % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val),
set()).add(raw_val)
conflicts = [c for k in conflicts_by_value.values()
if len(k) > 1 for c in k]
raise ValueError("Partition names map to the same value: %s"
% conflicts)
vals_by_type = groupby_types(v)
# Check that all partition names map to the same type after
# transformation by val_to_num
if len(vals_by_type) > 1:
import warnings
examples = [x[0] for x in vals_by_type.values()]
warnings.warn("Partition names coerce to values of different"
" types, e.g. %s" % examples)
return {k: list(v) for k, v in cats.items()}
def _read_parquet_file(fs, base, fn, index, columns, series, categories,
cs, dt, scheme, storage_name_mapping, *args):
"""Read a single file with fastparquet, to be used in a task"""
from fastparquet.api import ParquetFile
from collections import OrderedDict
name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
if not isinstance(columns, (tuple, list)):
columns = [columns,]
series = True
if index:
index, = index
if index not in columns:
columns = columns + [index]
columns = [name_storage_mapping.get(col, col) for col in columns]
index = name_storage_mapping.get(index, index)
cs = OrderedDict([(k, v) for k, v in cs.items() if k in columns])
pf = ParquetFile(fn, open_with=fs.open)
pf.file_scheme = scheme
for rg in pf.row_groups:
for ch in rg.columns:
ch.file_path = fn.replace(base, "").lstrip('/')
pf.fn = base
df = pf.to_pandas(columns=columns, index=index, categories=categories)
if df.index.nlevels == 1:
if index:
df.index.name = storage_name_mapping.get(index, index)
else:
if index:
df.index.names = [storage_name_mapping.get(name, name)
for name in index]
df.columns = [storage_name_mapping.get(col, col)
for col in columns
if col != index]
if series:
return df[df.columns[0]]
else:
return df
def _read_parquet_row_group(fs, fn, index, columns, rg, series, categories,
schema, cs, dt, scheme, storage_name_mapping, *args):
from fastparquet.api import _pre_allocate
from fastparquet.core import read_row_group_file
from collections import OrderedDict
name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
if not isinstance(columns, (tuple, list)):
columns = [columns,]
series = True
if index:
index, = index
if index not in columns:
columns = columns + [index]
columns = [name_storage_mapping.get(col, col) for col in columns]
index = name_storage_mapping.get(index, index)
cs = OrderedDict([(k, v) for k, v in cs.items() if k in columns])
df, views = _pre_allocate(rg.num_rows, columns, categories, index, cs, dt)
read_row_group_file(fn, rg, columns, categories, schema, cs,
open=fs.open, assign=views, scheme=scheme)
if df.index.nlevels == 1:
if index:
df.index.name = storage_name_mapping.get(index, index)
else:
if index:
df.index.names = [storage_name_mapping.get(name, name)
for name in index]
df.columns = [storage_name_mapping.get(col, col)
for col in columns
if col != index]
if series:
return df[df.columns[0]]
else:
return df
def _write_partition_fastparquet(df, fs, path, filename, fmd, compression,
partition_on):
from fastparquet.writer import partition_on_columns, make_part_file
import fastparquet
# Fastparquet mutates this in a non-threadsafe manner. For now we just copy
# it before forwarding to fastparquet.
fmd = copy.copy(fmd)
if not len(df):
# Write nothing for empty partitions
rgs = None
elif partition_on:
if LooseVersion(fastparquet.__version__) >= '0.1.4':
rgs = partition_on_columns(df, partition_on, path, filename, fmd,
compression, fs.open, fs.mkdirs)
else:
rgs = partition_on_columns(df, partition_on, path, filename, fmd,
fs.sep, compression, fs.open, fs.mkdirs)
else:
        # Fastparquet currently doesn't properly set `num_rows` in the output
# metadata. Set it here to fix that.
fmd.num_rows = len(df)
with fs.open(fs.sep.join([path, filename]), 'wb') as fil:
rgs = make_part_file(fil, df, fmd.schema, compression=compression,
fmd=fmd)
return rgs
def _write_fastparquet(df, fs, fs_token, path, write_index=None, append=False,
ignore_divisions=False, partition_on=None,
compression=None, **kwargs):
import fastparquet
fs.mkdirs(path)
sep = fs.sep
object_encoding = kwargs.pop('object_encoding', 'utf8')
if object_encoding == 'infer' or (isinstance(object_encoding, dict) and 'infer' in object_encoding.values()):
raise ValueError('"infer" not allowed as object encoding, '
                         'because this requires the data to be in memory.')
divisions = df.divisions
if write_index is True or write_index is None and df.known_divisions:
df = df.reset_index()
index_cols = [df.columns[0]]
else:
ignore_divisions = True
index_cols = []
if append:
try:
pf = fastparquet.api.ParquetFile(path, open_with=fs.open, sep=sep)
except (IOError, ValueError):
            # No existing dataset to append to, so fall back to creating one
append = False
if append:
if pf.file_scheme not in ['hive', 'empty', 'flat']:
raise ValueError('Requested file scheme is hive, '
'but existing file scheme is not.')
elif ((set(pf.columns) != set(df.columns) - set(partition_on)) or (set(partition_on) != set(pf.cats))):
raise ValueError('Appended columns not the same.\n'
'Previous: {} | New: {}'
.format(pf.columns, list(df.columns)))
elif (pd.Series(pf.dtypes).loc[pf.columns] != df[pf.columns].dtypes).any():
raise ValueError('Appended dtypes differ.\n{}'
.format(set(pf.dtypes.items()) ^
set(df.dtypes.iteritems())))
else:
df = df[pf.columns + partition_on]
fmd = pf.fmd
i_offset = fastparquet.writer.find_max_part(fmd.row_groups)
if not ignore_divisions:
minmax = fastparquet.api.sorted_partitioned_columns(pf)
old_end = minmax[index_cols[0]]['max'][-1]
if divisions[0] < old_end:
raise ValueError(
'Appended divisions overlapping with the previous ones.\n'
'Previous: {} | New: {}'.format(old_end, divisions[0]))
else:
fmd = fastparquet.writer.make_metadata(df._meta,
object_encoding=object_encoding,
index_cols=index_cols,
ignore_columns=partition_on,
**kwargs)
i_offset = 0
filenames = ['part.%i.parquet' % (i + i_offset)
for i in range(df.npartitions)]
write = delayed(_write_partition_fastparquet, pure=False)
writes = [write(part, fs, path, filename, fmd, compression, partition_on)
for filename, part in zip(filenames, df.to_delayed())]
return delayed(_write_metadata)(writes, filenames, fmd, path, fs, sep)
def _write_metadata(writes, filenames, fmd, path, fs, sep):
""" Write Parquet metadata after writing all row groups
See Also
--------
to_parquet
"""
import fastparquet
fmd = copy.copy(fmd)
for fn, rg in zip(filenames, writes):
if rg is not None:
if isinstance(rg, list):
for r in rg:
fmd.row_groups.append(r)
else:
for chunk in rg.columns:
chunk.file_path = fn
fmd.row_groups.append(rg)
fn = sep.join([path, '_metadata'])
fastparquet.writer.write_common_metadata(fn, fmd, open_with=fs.open,
no_row_groups=False)
fn = sep.join([path, '_common_metadata'])
fastparquet.writer.write_common_metadata(fn, fmd, open_with=fs.open)
# ----------------------------------------------------------------------
# PyArrow interface
def _read_pyarrow(fs, fs_token, paths, columns=None, filters=None,
categories=None, index=None, infer_divisions=None):
from ...bytes.core import get_pyarrow_filesystem
import pyarrow.parquet as pq
# In pyarrow, the physical storage field names may differ from
# the actual dataframe names. This is true for Index names when
# PyArrow >= 0.8.
# We would like to resolve these to the correct dataframe names
# as soon as possible.
if isinstance(categories, string_types):
categories = [categories]
elif categories is None:
categories = []
else:
categories = list(categories)
if isinstance(columns, tuple):
columns = list(columns)
dataset = pq.ParquetDataset(paths, filesystem=get_pyarrow_filesystem(fs),
filters=filters)
if dataset.partitions is not None:
partitions = [n for n in dataset.partitions.partition_names
if n is not None]
else:
partitions = []
schema = dataset.schema.to_arrow_schema()
has_pandas_metadata = schema.metadata is not None and b'pandas' in schema.metadata
if has_pandas_metadata:
pandas_metadata = json.loads(schema.metadata[b'pandas'].decode('utf8'))
index_names, column_names, storage_name_mapping, column_index_names = (
_parse_pandas_metadata(pandas_metadata)
)
else:
index_names = []
column_names = schema.names
storage_name_mapping = {k: k for k in column_names}
column_index_names = [None]
column_names += [p for p in partitions if p not in column_names]
column_names, index_names, out_type = _normalize_index_columns(
columns, column_names, index, index_names)
all_columns = index_names + column_names
# Find non-empty pieces
non_empty_pieces = []
# Determine valid pieces
_open = lambda fn: pq.ParquetFile(fs.open(fn, mode='rb'))
for piece in dataset.pieces:
pf = piece.get_metadata(_open)
if pf.num_row_groups > 0:
non_empty_pieces.append(piece)
# Sort pieces naturally
# If a single input path resulted in multiple dataset pieces, then sort
# the pieces naturally. If multiple paths were supplied then we leave
# the order of the resulting pieces unmodified
if len(paths) == 1 and len(dataset.pieces) > 1:
non_empty_pieces = sorted(
non_empty_pieces, key=lambda piece: natural_sort_key(piece.path))
# Determine divisions
if len(index_names) == 1:
# Look up storage name of the single index column
divisions_names = [storage_name for storage_name, name
in storage_name_mapping.items()
if index_names[0] == name]
if divisions_names:
divisions_name = divisions_names[0]
else:
divisions_name = None
else:
divisions_name = None
divisions = _get_pyarrow_divisions(non_empty_pieces, divisions_name,
schema, infer_divisions)
# Build task
dtypes = _get_pyarrow_dtypes(schema, categories)
dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}
meta = _meta_from_dtypes(all_columns, dtypes, index_names,
column_index_names)
meta = clear_known_categories(meta, cols=categories)
if out_type == Series:
assert len(meta.columns) == 1
meta = meta[meta.columns[0]]
task_name = 'read-parquet-' + tokenize(fs_token, paths, all_columns)
if non_empty_pieces:
task_plan = {
(task_name, i): (_read_pyarrow_parquet_piece,
fs,
piece,
column_names,
index_names,
out_type == Series,
dataset.partitions,
categories)
for i, piece in enumerate(non_empty_pieces)
}
else:
meta = strip_unknown_categories(meta)
task_plan = {(task_name, 0): meta}
return out_type(task_plan, task_name, meta, divisions)
def _to_ns(val, unit):
"""
Convert an input time in the specified units to nanoseconds
Parameters
----------
val: int
Input time value
unit : str
Time units of `val`.
One of 's', 'ms', 'us', 'ns'
Returns
-------
int
Time val in nanoseconds
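    Examples
    --------
    Converting 5 milliseconds to nanoseconds:

    >>> _to_ns(5, 'ms')
    5000000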
"""
factors = {'s': int(1e9), 'ms': int(1e6), 'us': int(1e3), 'ns': 1}
try:
        factor = factors[unit]
except KeyError:
raise ValueError("Unsupported time unit '{unit}'".format(unit=unit))
return val * factor
def _get_pyarrow_divisions(pa_pieces, divisions_name, pa_schema, infer_divisions):
"""
Compute DataFrame divisions from a list of pyarrow dataset pieces
Parameters
----------
pa_pieces : list[pyarrow.parquet.ParquetDatasetPiece]
List of dataset pieces. Each piece corresponds to a single partition in the eventual dask DataFrame
divisions_name : str|None
The name of the column to compute divisions for
pa_schema : pyarrow.lib.Schema
The pyarrow schema for the dataset
infer_divisions : bool or None
If True divisions must be inferred (otherwise an exception is raised). If False or None divisions are not
inferred
Returns
-------
list
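    Examples
    --------
    Illustrative only; ``dataset`` and ``schema`` stand for objects obtained
    from a ``pyarrow.parquet.ParquetDataset`` whose pieces are sorted on an
    integer index column ``'i'``:

    >>> _get_pyarrow_divisions(dataset.pieces, 'i', schema, True)  # doctest: +SKIP
    [0, 100, 200, 299]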
"""
# Local imports
import pyarrow as pa
import pyarrow.parquet as pq
if infer_divisions is True and pa.__version__ < LooseVersion('0.9.0'):
raise NotImplementedError('infer_divisions=True requires pyarrow >=0.9.0')
# Check whether divisions_name is in the schema
# Note: get_field_index returns -1 if not found, but it does not accept None
if infer_divisions is True:
divisions_name_in_schema = divisions_name is not None and pa_schema.get_field_index(divisions_name) >= 0
if divisions_name_in_schema is False and infer_divisions is True:
raise ValueError(
                'Unable to infer divisions because no index column was discovered')
else:
divisions_name_in_schema = None
if pa_pieces and divisions_name_in_schema:
# We have pieces and a valid division column.
# Compute min/max for column in each row group
min_maxs = []
last_max = None
# Initialize index of divisions column within the row groups.
        # To be computed while processing the first piece below
for piece in pa_pieces:
pf = piece.get_metadata(pq.ParquetFile)
rg = pf.row_group(0)
# Compute division column index
rg_paths = [rg.column(i).path_in_schema for i in range(rg.num_columns)]
try:
divisions_col_index = rg_paths.index(divisions_name)
except ValueError:
# Divisions not valid
min_maxs = None
break
col_meta = rg.column(divisions_col_index)
stats = col_meta.statistics
if stats.has_min_max and (last_max is None or last_max < stats.min):
min_maxs.append((stats.min, stats.max))
last_max = stats.max
else:
# Divisions not valid
min_maxs = None
break
if min_maxs:
# We have min/max pairs
divisions = [mn for mn, mx in min_maxs] + [min_maxs[-1][1]]
# Handle conversion to pandas timestamp divisions
index_field = pa_schema.field_by_name(divisions_name)
if pa.types.is_timestamp(index_field.type):
time_unit = index_field.type.unit
divisions_ns = [_to_ns(d, time_unit) for d in
divisions]
divisions = [pd.Timestamp(ns) for ns in divisions_ns]
# Handle encoding of bytes string
if index_field.type == pa.string():
# Parquet strings are always encoded as utf-8
encoding = 'utf-8'
divisions = [d.decode(encoding).strip() for d in divisions]
else:
if infer_divisions is True:
raise ValueError(
("Unable to infer divisions for index of '{index_name}' because it is not known to be "
"sorted across partitions").format(index_name=divisions_name_in_schema))
divisions = (None,) * (len(pa_pieces) + 1)
elif pa_pieces:
divisions = (None,) * (len(pa_pieces) + 1)
else:
divisions = (None, None)
return divisions
def _read_pyarrow_parquet_piece(fs, piece, columns, index_cols, is_series,
partitions, categories):
import pyarrow as pa
with fs.open(piece.path, mode='rb') as f:
table = piece.read(columns=index_cols + columns,
partitions=partitions,
use_pandas_metadata=True,
file=f)
if pa.__version__ < LooseVersion('0.9.0'):
df = table.to_pandas()
for cat in categories:
df[cat] = df[cat].astype('category')
elif pa.__version__ < LooseVersion('0.11.0'):
df = table.to_pandas(categories=categories)
else:
df = table.to_pandas(categories=categories, date_as_object=False)
has_index = not isinstance(df.index, pd.RangeIndex)
if not has_index and index_cols:
# Index should be set, but it isn't
df = df.set_index(index_cols)
elif has_index and df.index.names != index_cols:
# Index is set, but isn't correct
# This can happen when reading in not every column in a multi-index
df = df.reset_index(drop=False)
if index_cols:
df = df.set_index(index_cols)
drop = list(set(df.columns).difference(columns))
if drop:
df = df.drop(drop, axis=1)
# Ensure proper ordering
df = df.reindex(columns=columns, copy=False)
if is_series:
return df[df.columns[0]]
else:
return df[columns]
_pyarrow_write_table_kwargs = {'row_group_size', 'version', 'use_dictionary',
'compression', 'use_deprecated_int96_timestamps',
'coerce_timestamps', 'flavor', 'chunk_size'}
_pyarrow_write_metadata_kwargs = {'version', 'use_deprecated_int96_timestamps',
'coerce_timestamps'}
def _write_pyarrow(df, fs, fs_token, path, write_index=None, append=False,
ignore_divisions=False, partition_on=None, **kwargs):
if append:
raise NotImplementedError("`append` not implemented for "
"`engine='pyarrow'`")
if ignore_divisions:
raise NotImplementedError("`ignore_divisions` not implemented for "
"`engine='pyarrow'`")
# We can check only write_table kwargs, as it is a superset of kwargs for write functions
if set(kwargs).difference(_pyarrow_write_table_kwargs):
msg = ("Unexpected keyword arguments: " +
"%r" % list(set(kwargs).difference(_pyarrow_write_table_kwargs)))
raise TypeError(msg)
if write_index is None and df.known_divisions:
write_index = True
fs.mkdirs(path)
template = fs.sep.join([path, 'part.%i.parquet'])
write = delayed(_write_partition_pyarrow, pure=False)
first_kwargs = kwargs.copy()
first_kwargs['metadata_path'] = fs.sep.join([path, '_common_metadata'])
writes = [write(part, path, fs, template % i, write_index, partition_on,
**(kwargs if i else first_kwargs))
for i, part in enumerate(df.to_delayed())]
return delayed(writes)
def _write_partition_pyarrow(df, path, fs, filename, write_index,
partition_on, metadata_path=None, **kwargs):
import pyarrow as pa
from pyarrow import parquet
t = pa.Table.from_pandas(df, preserve_index=write_index)
if partition_on:
parquet.write_to_dataset(t, path, partition_cols=partition_on,
preserve_index=write_index,
filesystem=fs, **kwargs)
else:
with fs.open(filename, 'wb') as fil:
parquet.write_table(t, fil, **kwargs)
if metadata_path is not None:
with fs.open(metadata_path, 'wb') as fil:
# Get only arguments specified in the function
kwargs_meta = {k: v for k, v in kwargs.items()
if k in _pyarrow_write_metadata_kwargs}
parquet.write_metadata(t.schema, fil, **kwargs_meta)
# ----------------------------------------------------------------------
# User API
_ENGINES = {}
def get_engine(engine):
"""Get the parquet engine backend implementation.
Parameters
----------
engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
Parquet reader library to use. Default is first installed in this list.
Returns
-------
A dict containing a ``'read'`` and ``'write'`` function.
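    Examples
    --------
    The exact functions returned depend on which libraries are installed:

    >>> get_engine('fastparquet')  # doctest: +SKIP
    {'read': <function _read_fastparquet at ...>,
     'write': <function _write_fastparquet at ...>}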
"""
if engine in _ENGINES:
return _ENGINES[engine]
if engine == 'auto':
for eng in ['fastparquet', 'pyarrow']:
try:
return get_engine(eng)
except RuntimeError:
pass
else:
raise RuntimeError("Please install either fastparquet or pyarrow")
elif engine == 'fastparquet':
import_required('fastparquet', "`fastparquet` not installed")
_ENGINES['fastparquet'] = eng = {'read': _read_fastparquet,
'write': _write_fastparquet}
return eng
elif engine == 'pyarrow':
pa = import_required('pyarrow', "`pyarrow` not installed")
if LooseVersion(pa.__version__) < '0.8.0':
raise RuntimeError("PyArrow version >= 0.8.0 required")
_ENGINES['pyarrow'] = eng = {'read': _read_pyarrow,
'write': _write_pyarrow}
return eng
else:
raise ValueError('Unsupported engine: "{0}".'.format(engine) +
' Valid choices include "pyarrow" and "fastparquet".')
def read_parquet(path, columns=None, filters=None, categories=None, index=None,
storage_options=None, engine='auto', infer_divisions=None):
"""
Read ParquetFile into a Dask DataFrame
This reads a directory of Parquet data into a Dask.dataframe, one file per
partition. It selects the index among the sorted columns if any exist.
Parameters
----------
path : string, list or fastparquet.ParquetFile
Source directory for data, or path(s) to individual parquet files.
Prefix with a protocol like ``s3://`` to read from alternative
filesystems. To read from multiple files you can pass a globstring or a
list of paths, with the caveat that they must all have the same
protocol.
Alternatively, also accepts a previously opened
fastparquet.ParquetFile()
columns : string, list or None (default)
Field name(s) to read in as columns in the output. By default all
non-index fields will be read (as determined by the pandas parquet
metadata, if present). Provide a single field name instead of a list to
read in the data as a Series.
filters : list
List of filters to apply, like ``[('x', '>', 0), ...]``. This implements
        row-group (partition)-level filtering only, i.e., to prevent the
loading of some chunks of the data, and only if relevant statistics
have been included in the metadata.
index : string, list, False or None (default)
Field name(s) to use as the output frame index. By default will be
inferred from the pandas parquet file metadata (if present). Use False
to read all fields as columns.
categories : list, dict or None
For any fields listed here, if the parquet encoding is Dictionary,
the column will be created with dtype category. Use only if it is
guaranteed that the column is encoded as dictionary in all row-groups.
If a list, assumes up to 2**16-1 labels; if a dict, specify the number
of labels expected; if None, will load categories automatically for
data written by dask/fastparquet, not otherwise.
storage_options : dict
Key/value pairs to be passed on to the file-system backend, if any.
engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
Parquet reader library to use. If only one library is installed, it
will use that one; if both, it will use 'fastparquet'
infer_divisions : bool or None (default).
By default, divisions are inferred if the read `engine` supports
doing so efficiently and the `index` of the underlying dataset is
sorted across the individual parquet files. Set to ``True`` to
force divisions to be inferred in all cases. Note that this may
require reading metadata from each file in the dataset, which may
be expensive. Set to ``False`` to never infer divisions.
Examples
--------
>>> df = dd.read_parquet('s3://bucket/my-parquet-data') # doctest: +SKIP
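    Selecting a subset of columns and a specific index field (the column
    names below are placeholders, not part of an actual dataset):

    >>> df = dd.read_parquet('s3://bucket/my-parquet-data',
    ...                      columns=['value'],
    ...                      index='timestamp')  # doctest: +SKIP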
See Also
--------
to_parquet
"""
is_ParquetFile = False
try:
import fastparquet
if isinstance(path, fastparquet.api.ParquetFile):
if path.open != fastparquet.util.default_open:
assert (re.match('.*://', path.fn)), \
("ParquetFile: Path must contain protocol" +
" (e.g., s3://...) when using other than the default" +
" LocalFileSystem. Path given: " + path.fn)
assert (engine in ['auto', 'fastparquet']), \
("'engine' should be set to 'auto' or 'fastparquet' " +
'when reading from fastparquet.ParquetFile')
is_ParquetFile = True
except ImportError:
pass
if is_ParquetFile:
read = get_engine('fastparquet')['read']
if path.fn.endswith('_metadata'):
# remove '_metadata' from path
urlpath = path.fn[:-len('_metadata')]
else:
urlpath = path.fn
fs, fs_token, paths = get_fs_token_paths(
urlpath,
mode='rb',
storage_options=storage_options
)
paths = path
else:
read = get_engine(engine)['read']
fs, fs_token, paths = get_fs_token_paths(
path, mode='rb',
storage_options=storage_options
)
if isinstance(path, string_types) and len(paths) > 1:
# Sort paths naturally if multiple paths resulted from a single
# specification (by '*' globbing)
paths = sorted(paths, key=natural_sort_key)
return read(fs, fs_token, paths, columns=columns, filters=filters,
categories=categories, index=index, infer_divisions=infer_divisions)
def to_parquet(df, path, engine='auto', compression='default', write_index=None,
append=False, ignore_divisions=False, partition_on=None,
storage_options=None, compute=True, **kwargs):
"""Store Dask.dataframe to Parquet files
Notes
-----
Each partition will be written to a separate file.
Parameters
----------
df : dask.dataframe.DataFrame
path : string
Destination directory for data. Prepend with protocol like ``s3://``
or ``hdfs://`` for remote data.
engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
Parquet library to use. If only one library is installed, it will use
that one; if both, it will use 'fastparquet'.
compression : string or dict, optional
Either a string like ``"snappy"`` or a dictionary mapping column names
to compressors like ``{"name": "gzip", "values": "snappy"}``. The
default is ``"default"``, which uses the default compression for
whichever engine is selected.
write_index : boolean, optional
Whether or not to write the index. Defaults to True *if* divisions are
known.
append : bool, optional
If False (default), construct data-set from scratch. If True, add new
row-group(s) to an existing data-set. In the latter case, the data-set
must exist, and the schema must match the input data.
ignore_divisions : bool, optional
If False (default) raises error when previous divisions overlap with
the new appended divisions. Ignored if append=False.
partition_on : list, optional
Construct directory-based partitioning by splitting on these fields'
values. Each dask partition will result in one or more datafiles,
there will be no global groupby.
storage_options : dict, optional
Key/value pairs to be passed on to the file-system backend, if any.
compute : bool, optional
If True (default) then the result is computed immediately. If False
then a ``dask.delayed`` object is returned for future computation.
**kwargs
Extra options to be passed on to the specific backend.
Examples
--------
>>> df = dd.read_csv(...) # doctest: +SKIP
    >>> dd.to_parquet(df, '/path/to/output/', compression='snappy')  # doctest: +SKIP
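    Directory-based partitioning can be requested in the same call;
    ``'key'`` below is a placeholder column name:

    >>> dd.to_parquet(df, '/path/to/output/', engine='pyarrow',
    ...               partition_on=['key'])  # doctest: +SKIP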
See Also
--------
read_parquet: Read parquet data to dask.dataframe
"""
partition_on = partition_on or []
if set(partition_on) - set(df.columns):
raise ValueError('Partitioning on non-existent column')
if compression != 'default':
kwargs['compression'] = compression
elif 'snappy' in compress:
kwargs['compression'] = 'snappy'
write = get_engine(engine)['write']
fs, fs_token, _ = get_fs_token_paths(path, mode='wb',
storage_options=storage_options)
# Trim any protocol information from the path before forwarding
path = infer_storage_options(path)['path']
out = write(df, fs, fs_token, path, write_index=write_index, append=append,
ignore_divisions=ignore_divisions, partition_on=partition_on,
**kwargs)
if compute:
out.compute()
return None
return out
if PY3:
DataFrame.to_parquet.__doc__ = to_parquet.__doc__