from __future__ import print_function, division, absolute_import

import io
import os

from toolz import concat

from ..compatibility import unicode
from ..utils import system_encoding, parse_bytes
from ..delayed import delayed
from ..bytes import open_files, read_bytes
from .core import from_delayed

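# ``pure=True`` gives deterministic task keys, so identical delayed calls are
# deduplicated within the task graph.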
delayed = delayed(pure=True)


def read_text(urlpath, blocksize=None, compression='infer',
              encoding=system_encoding, errors='strict',
              linedelimiter=os.linesep, collection=True,
              storage_options=None, files_per_partition=None):
    """ Read lines from text files

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    blocksize: None, int, or str
        Size (in bytes) to cut up larger files. Streams by default.
        Can be ``None`` for streaming, an integer number of bytes, or a string
        like "128MiB".
    compression: string
        Compression format like 'gzip' or 'xz'. Defaults to 'infer'.
    encoding: string
        Text encoding to use when reading; defaults to the system encoding.
    errors: string
        How text decoding errors are handled, e.g. 'strict', 'ignore',
        'replace'. Defaults to 'strict'.
    linedelimiter: string
        Delimiter on which block boundaries are aligned when ``blocksize``
        is used. Defaults to ``os.linesep``.
    collection: bool, optional
        Return a dask.bag.Bag if True, or a list of delayed values if False.
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.
    files_per_partition: None or int
        If set, group input files into partitions of the requested size,
        instead of one partition per file. Mutually exclusive with blocksize.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize='10MB')  # doctest: +SKIP
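
    Alternatively, group several whole files into each partition (the value
    ``10`` below is only an illustrative choice).

    >>> b = read_text('myfiles.*.txt', files_per_partition=10)  # doctest: +SKIP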

    Returns
    -------
    dask.bag.Bag if collection is True or list of Delayed lists otherwise

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if blocksize is not None and files_per_partition is not None:
        raise ValueError('Only one of blocksize or '
                         'files_per_partition can be set')
    if isinstance(blocksize, (str, unicode)):
        blocksize = parse_bytes(blocksize)

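    # Create lazy file objects; decompression and text decoding are applied
    # only when each file is actually opened.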
    files = open_files(urlpath, mode='rt', encoding=encoding,
                       errors=errors, compression=compression,
                       **(storage_options or {}))
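
    # Three ways to form partitions:
    #   1. Default: one partition per input file, with lines read lazily.
    #   2. files_per_partition: concatenate the lines of several whole files
    #      into each partition.
    #   3. blocksize: cut files into byte blocks aligned on the line
    #      delimiter, then decode each block into lines.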
    if blocksize is None:
        if files_per_partition is None:
            blocks = [delayed(list)(delayed(file_to_blocks)(fil))
                      for fil in files]
        else:
            blocks = []
            for start in range(0, len(files), files_per_partition):
                block_files = files[start:(start + files_per_partition)]
                block_lines = delayed(concat)(delayed(map)(
                    file_to_blocks, block_files))
                blocks.append(block_lines)
    else:
        _, blocks = read_bytes(urlpath, delimiter=linedelimiter.encode(),
                               blocksize=blocksize, sample=False,
                               compression=compression,
                               **(storage_options or {}))
        blocks = [delayed(decode)(b, encoding, errors)
                  for b in concat(blocks)]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if not collection:
        return blocks
    else:
        return from_delayed(blocks)


def file_to_blocks(lazy_file):
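    """Yield the lines of a single lazily opened file.

    The file is only opened when the returned generator is first iterated,
    which happens inside the task that consumes it.
    """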
    with lazy_file as f:
        for line in f:
            yield line


def decode(block, encoding, errors):
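    """Decode a block of bytes and split it into a list of text lines."""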
    text = block.decode(encoding, errors)
    lines = io.StringIO(text)
    return list(lines)