from __future__ import print_function, division, absolute_import import copy import io import os from distutils.version import LooseVersion from warnings import warn from toolz import merge from .compression import seekable_files, files as compress_files from .utils import (SeekableFile, read_block, infer_compression, infer_storage_options, build_name_function, update_storage_options) from .. import config from ..compatibility import unicode from ..base import tokenize from ..delayed import delayed from ..utils import import_required, is_integer, parse_bytes def read_bytes(urlpath, delimiter=None, not_zero=False, blocksize="128 MiB", sample=True, compression=None, include_path=False, **kwargs): """Given a path or paths, return delayed objects that read from those paths. The path may be a filename like ``'2015-01-01.csv'`` or a globstring like ``'2015-*-*.csv'``. The path may be preceded by a protocol, like ``s3://`` or ``hdfs://`` if those libraries are installed. This cleanly breaks data by a delimiter if given, so that block boundaries start directly after a delimiter and end on the delimiter. Parameters ---------- urlpath : string or list Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` to read from alternative filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. delimiter : bytes An optional delimiter, like ``b'\\n'`` on which to split blocks of bytes. not_zero : bool Force seek of start-of-file delimiter, discarding header. blocksize : int, str Chunk size in bytes, defaults to "128 MiB" compression : string or None String like 'gzip' or 'xz'. Must support efficient random access. sample : bool or int Whether or not to return a header sample. If an integer is given it is used as sample size, otherwise the default sample size is 10kB. include_path : bool Whether or not to include the path with the bytes representing a particular file. Default is False. **kwargs : dict Extra options that make sense to a particular storage connection, e.g. host, port, username, password, etc. Examples -------- >>> sample, blocks = read_bytes('2015-*-*.csv', delimiter=b'\\n') # doctest: +SKIP >>> sample, blocks = read_bytes('s3://bucket/2015-*-*.csv', delimiter=b'\\n') # doctest: +SKIP >>> sample, paths, blocks = read_bytes('2015-*-*.csv', include_path=True) # doctest: +SKIP Returns ------- sample : bytes The sample header blocks : list of lists of ``dask.Delayed`` Each list corresponds to a file, and each delayed object computes to a block of bytes from that file. paths : list of strings, only included if include_path is True List of same length as blocks, where each item is the path to the file represented in the corresponding block. """ fs, fs_token, paths = get_fs_token_paths(urlpath, mode='rb', storage_options=kwargs) if len(paths) == 0: raise IOError("%s resolved to no files" % urlpath) if blocksize is not None: if isinstance(blocksize, (str, unicode)): blocksize = parse_bytes(blocksize) if not is_integer(blocksize): raise TypeError("blocksize must be an integer") blocksize = int(blocksize) if blocksize is None: offsets = [[0]] * len(paths) lengths = [[None]] * len(paths) else: offsets = [] lengths = [] for path in paths: try: size = logical_size(fs, path, compression) except ValueError: raise ValueError("Cannot do chunked reads on files compressed " "with compression=%r. To read, set " "blocksize=None" % get_compression(path, compression)) off = list(range(0, size, blocksize)) length = [blocksize] * len(off) if not_zero: off[0] = 1 length[0] -= 1 offsets.append(off) lengths.append(length) delayed_read = delayed(read_block_from_file) out = [] for path, offset, length in zip(paths, offsets, lengths): token = tokenize(fs_token, delimiter, path, fs.ukey(path), compression, offset) keys = ['read-block-%s-%s' % (o, token) for o in offset] values = [delayed_read(OpenFile(fs, path, compression=compression), o, l, delimiter, dask_key_name=key) for o, key, l in zip(offset, keys, length)] out.append(values) if sample: with OpenFile(fs, paths[0], compression=compression) as f: nbytes = 10000 if sample is True else sample sample = read_block(f, 0, nbytes, delimiter) if include_path: return sample, out, paths return sample, out def read_block_from_file(lazy_file, off, bs, delimiter): with copy.copy(lazy_file) as f: return read_block(f, off, bs, delimiter) class OpenFile(object): """ File-like object to be used in a context These instances are safe to serialize, as the low-level file object is not created until invoked using `with`. Parameters ---------- fs : FileSystem The file system to use for opening the file. Should match the interface of ``dask.bytes.local.LocalFileSystem``. path : str Location to open mode : str like 'rb', optional Mode of the opened file compression : str or None, optional Compression to apply encoding : str or None, optional The encoding to use if opened in text mode. errors : str or None, optional How to handle encoding errors if opened in text mode. """ def __init__(self, fs, path, mode='rb', compression=None, encoding=None, errors=None): self.fs = fs self.path = path self.mode = mode self.compression = get_compression(path, compression) self.encoding = encoding self.errors = errors self.fobjects = [] def __reduce__(self): return (OpenFile, (self.fs, self.path, self.mode, self.compression, self.encoding, self.errors)) def __enter__(self): mode = self.mode.replace('t', '').replace('b', '') + 'b' f = SeekableFile(self.fs.open(self.path, mode=mode)) fobjects = [f] if self.compression is not None: compress = merge(seekable_files, compress_files)[self.compression] f = compress(f, mode=mode) fobjects.append(f) if 't' in self.mode: f = io.TextIOWrapper(f, encoding=self.encoding, errors=self.errors) fobjects.append(f) self.fobjects = fobjects return f def __exit__(self, *args): self.close() def close(self): """Close all encapsulated file objects""" for f in reversed(self.fobjects): f.close() self.fobjects = [] def open_files(urlpath, mode='rb', compression=None, encoding='utf8', errors=None, name_function=None, num=1, **kwargs): """ Given a path or paths, return a list of ``OpenFile`` objects. Parameters ---------- urlpath : string or list Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` to read from alternative filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. mode : 'rb', 'wt', etc. compression : string Compression to use. See ``dask.bytes.compression.files`` for options. encoding : str For text mode only errors : None or str Passed to TextIOWrapper in text mode name_function : function or None if opening a set of files for writing, those files do not yet exist, so we need to generate their names by formatting the urlpath for each sequence number num : int [1] if writing mode, number of files we expect to create (passed to name+function) **kwargs : dict Extra options that make sense to a particular storage connection, e.g. host, port, username, password, etc. Examples -------- >>> files = open_files('2015-*-*.csv') # doctest: +SKIP >>> files = open_files('s3://bucket/2015-*-*.csv.gz', compression='gzip') # doctest: +SKIP Returns ------- List of ``OpenFile`` objects. """ fs, fs_token, paths = get_fs_token_paths(urlpath, mode, num=num, name_function=name_function, storage_options=kwargs) return [OpenFile(fs, path, mode=mode, compression=compression, encoding=encoding, errors=errors) for path in paths] def get_compression(urlpath, compression): if compression == 'infer': compression = infer_compression(urlpath) if compression is not None and compression not in compress_files: raise ValueError("Compression type %s not supported" % compression) return compression def infer_options(urlpath): if hasattr(urlpath, 'name'): # deal with pathlib.Path objects - must be local urlpath = str(urlpath) ispath = True else: ispath = False options = infer_storage_options(urlpath) protocol = options.pop('protocol') urlpath = options.pop('path') if ispath and protocol != 'file': raise ValueError("Only use pathlib.Path with local files.") return urlpath, protocol, options def expand_paths_if_needed(paths, mode, num, fs, name_function): """Expand paths if they have a ``*`` in them. :param paths: list of paths mode : str Mode in which to open files. num : int If opening in writing mode, number of files we expect to create. fs : filesystem object name_function : callable If opening in writing mode, this callable is used to generate path names. Names are generated for each partition by ``urlpath.replace('*', name_function(partition_index))``. :return: list of paths """ expanded_paths = [] paths = list(paths) if 'w' in mode and sum([1 for p in paths if '*' in p]) > 1: raise ValueError("When writing data, only one filename mask can be specified.") for curr_path in paths: if '*' in curr_path: glob = True if 'w' in mode: # expand using name_function expanded_paths.extend(_expand_paths(curr_path, name_function, num)) else: # expand using glob expanded_paths.extend(fs.glob(curr_path)) else: glob = False expanded_paths.append(curr_path) # if we generated more paths that asked for, trim the list if 'w' in mode and len(expanded_paths) > num and glob: expanded_paths = expanded_paths[:num] return expanded_paths def get_fs_token_paths(urlpath, mode='rb', num=1, name_function=None, storage_options=None): """Filesystem, deterministic token, and paths from a urlpath and options. Parameters ---------- urlpath : string Absolute or relative filepath, URL (may include protocols like ``s3://``), or globstring pointing to data. mode : str, optional Mode in which to open files. num : int, optional If opening in writing mode, number of files we expect to create. name_function : callable, optional If opening in writing mode, this callable is used to generate path names. Names are generated for each partition by ``urlpath.replace('*', name_function(partition_index))``. storage_options : dict, optional Additional keywords to pass to the filesystem class. """ if isinstance(urlpath, (list, tuple)): if not urlpath: raise ValueError("empty urlpath sequence") paths, protocols, options_list = zip(*map(infer_options, urlpath)) protocol = protocols[0] options = options_list[0] if not (all(p == protocol for p in protocols) and all(o == options for o in options_list)): raise ValueError("When specifying a list of paths, all paths must " "share the same protocol and options") update_storage_options(options, storage_options) fs, fs_token = get_fs(protocol, options) paths = expand_paths_if_needed(paths, mode, num, fs, name_function) elif isinstance(urlpath, (str, unicode)) or hasattr(urlpath, 'name'): urlpath, protocol, options = infer_options(urlpath) update_storage_options(options, storage_options) fs, fs_token = get_fs(protocol, options) if 'w' in mode: paths = _expand_paths(urlpath, name_function, num) elif "*" in urlpath: paths = sorted(fs.glob(urlpath)) else: paths = [urlpath] else: raise TypeError('url type not understood: %s' % urlpath) fs.protocol = protocol return fs, fs_token, paths def get_mapper(fs, path): # This is not the right way to do this. # At the very least, we should have the correct failed import messages if fs.protocol == 'file': from zarr.storage import DirectoryStore return DirectoryStore(path) elif fs.protocol == 's3': from s3fs.mapping import S3Map return S3Map(path, fs) elif fs.protocol in ['gcs', 'gs']: from gcsfs.mapping import GCSMap return GCSMap(path, fs) elif fs.protocol == 'hdfs': from hdfs3.mapping import HDFSMap return HDFSMap(fs, path) else: raise ValueError('No mapper for protocol "%s"' % fs.protocol) def _expand_paths(path, name_function, num): if isinstance(path, (str, unicode)): if path.count('*') > 1: raise ValueError("Output path spec must contain at most one '*'.") elif '*' not in path: path = os.path.join(path, '*.part') if name_function is None: name_function = build_name_function(num - 1) paths = [path.replace('*', name_function(i)) for i in range(num)] if paths != sorted(paths): warn("In order to preserve order between partitions paths created " "with ``name_function`` should sort to partition order") elif isinstance(path, (tuple, list)): assert len(path) == num paths = list(path) else: raise ValueError("Path should be either\n" "1. A list of paths: ['foo.json', 'bar.json', ...]\n" "2. A directory: 'foo/\n" "3. A path with a '*' in it: 'foo.*.json'") return paths def get_hdfs_driver(driver="auto"): """Get the hdfs driver implementation. Parameters ---------- driver : {'auto', 'hdfs3', 'pyarrow'}, default 'auto' HDFS library to use. Default is first installed in this list. Returns ------- A filesystem class """ if driver == 'auto': for d in ['pyarrow', 'hdfs3']: try: return get_hdfs_driver(d) except RuntimeError: pass else: raise RuntimeError("Please install either `pyarrow` (preferred) or `hdfs3`") elif driver == 'hdfs3': import_required('hdfs3', "`hdfs3` not installed") from dask.bytes.hdfs3 import HDFS3HadoopFileSystem as cls return cls elif driver == 'pyarrow': pa = import_required('pyarrow', "`pyarrow` not installed") from dask.bytes.pyarrow import (_MIN_PYARROW_VERSION_SUPPORTED, PyArrowHadoopFileSystem as cls) if LooseVersion(pa.__version__) < _MIN_PYARROW_VERSION_SUPPORTED: raise RuntimeError("pyarrow version >= %r required for hdfs driver " "support" % _MIN_PYARROW_VERSION_SUPPORTED) return cls else: raise ValueError('Unsupported hdfs driver: {0}'.format(driver)) def get_fs(protocol, storage_options=None): """Create a filesystem object from a protocol and options. Parameters ---------- protocol : str The specific protocol to use. storage_options : dict, optional Keywords to pass to the filesystem class. """ if protocol in _filesystems: cls = _filesystems[protocol] elif protocol == 's3': import_required('s3fs', "Need to install `s3fs` library for s3 support\n" " conda install s3fs -c conda-forge\n" " or\n" " pip install s3fs") import dask.bytes.s3 # noqa, register the s3 backend cls = _filesystems[protocol] elif protocol in ('gs', 'gcs'): import_required('gcsfs', "Need to install `gcsfs` library for Google Cloud Storage support\n" " conda install gcsfs -c conda-forge\n" " or\n" " pip install gcsfs") cls = _filesystems[protocol] elif protocol in ['adl', 'adlfs']: import_required('dask_adlfs', "Need to install `dask_adlfs` for Azure Datalake " "Storage support.\n" "First install azure-storage via pip or conda:\n" " conda install -c conda-forge azure-storage\n" " or\n" " pip install azure-storage\n" "and then install `dask_adlfs` via pip:\n" " pip install dask-adlfs") cls = _filesystems[protocol] elif protocol == 'hdfs': cls = get_hdfs_driver(config.get("hdfs_driver", "auto")) elif protocol in ['http', 'https']: import_required('requests', "Need to install `requests` for HTTP support\n" " conda install requests\n" " or\n" " pip install requests") import dask.bytes.http # noqa, registers HTTP backend cls = _filesystems[protocol] else: raise ValueError("Unknown protocol %s" % protocol) if storage_options is None: storage_options = {} fs = cls(**storage_options) fs_token = tokenize(cls, protocol, storage_options) return fs, fs_token _filesystems = dict() def logical_size(fs, path, compression='infer'): if compression == 'infer': compression = infer_compression(path) if compression is None: return fs.size(path) elif compression in seekable_files: with OpenFile(fs, path, compression=compression) as f: f.seek(0, 2) return f.tell() else: raise ValueError("Cannot infer logical size from file compressed with " "compression=%r" % compression) def get_pyarrow_filesystem(fs): """Get an equivalent pyarrow filesystem. Not for public use, will be removed once a consistent filesystem api is defined.""" try: return fs._get_pyarrow_filesystem() except AttributeError: raise NotImplementedError("Using pyarrow with a %r " "filesystem object" % type(fs).__name__)