from __future__ import print_function, division, absolute_import

import io
import os

from toolz import concat

from ..compatibility import unicode
from ..utils import system_encoding, parse_bytes
from ..delayed import delayed
from ..bytes import open_files, read_bytes
from .core import from_delayed

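# ``pure=True`` gives deterministic task keys, so identical delayed calls are
# deduplicated within the task graph.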
delayed = delayed(pure=True)


def read_text(urlpath, blocksize=None, compression='infer',
              encoding=system_encoding, errors='strict',
              linedelimiter=os.linesep, collection=True,
              storage_options=None, files_per_partition=None):
    """ Read lines from text files

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    blocksize: None, int, or str
        Size (in bytes) to cut up larger files. Streams by default.
        Can be ``None`` for streaming, an integer number of bytes, or a string
        like "128MiB".
    compression: string
        Compression format like 'gzip' or 'xz'. Defaults to 'infer'.
    encoding: string
        Text encoding to use when reading; defaults to the system encoding.
    errors: string
        How text decoding errors are handled, e.g. 'strict', 'ignore',
        'replace'. Defaults to 'strict'.
    linedelimiter: string
        Delimiter on which block boundaries are aligned when ``blocksize``
        is used. Defaults to ``os.linesep``.
    collection: bool, optional
        Return a dask.bag.Bag if True, or a list of delayed values if False.
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.
    files_per_partition: None or int
        If set, group input files into partitions of the requested size,
        instead of one partition per file. Mutually exclusive with blocksize.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize='10MB')  # doctest: +SKIP
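
    Alternatively, group several whole files into each partition (the value
    ``10`` below is only an illustrative choice).

    >>> b = read_text('myfiles.*.txt', files_per_partition=10)  # doctest: +SKIP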

    Returns
    -------
    dask.bag.Bag if collection is True or list of Delayed lists otherwise

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if blocksize is not None and files_per_partition is not None:
        raise ValueError('Only one of blocksize or '
                         'files_per_partition can be set')
    if isinstance(blocksize, (str, unicode)):
        blocksize = parse_bytes(blocksize)

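    # Create lazy file objects; decompression and text decoding are applied
    # only when each file is actually opened.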
    files = open_files(urlpath, mode='rt', encoding=encoding,
                       errors=errors, compression=compression,
                       **(storage_options or {}))
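
    # Three ways to form partitions:
    #   1. Default: one partition per input file, with lines read lazily.
    #   2. files_per_partition: concatenate the lines of several whole files
    #      into each partition.
    #   3. blocksize: cut files into byte blocks aligned on the line
    #      delimiter, then decode each block into lines.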
    if blocksize is None:
        if files_per_partition is None:
            blocks = [delayed(list)(delayed(file_to_blocks)(fil))
                      for fil in files]
        else:
            blocks = []
            for start in range(0, len(files), files_per_partition):
                block_files = files[start:(start + files_per_partition)]
                block_lines = delayed(concat)(delayed(map)(
                    file_to_blocks, block_files))
                blocks.append(block_lines)
    else:
        _, blocks = read_bytes(urlpath, delimiter=linedelimiter.encode(),
                               blocksize=blocksize, sample=False,
                               compression=compression,
                               **(storage_options or {}))
        blocks = [delayed(decode)(b, encoding, errors)
                  for b in concat(blocks)]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if not collection:
        return blocks
    else:
        return from_delayed(blocks)


def file_to_blocks(lazy_file):
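    """Yield the lines of a single lazily opened file.

    The file is only opened when the returned generator is first iterated,
    which happens inside the task that consumes it.
    """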
    with lazy_file as f:
        for line in f:
            yield line


def decode(block, encoding, errors):
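    """Decode a block of bytes and split it into a list of text lines."""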
    text = block.decode(encoding, errors)
    lines = io.StringIO(text)
    return list(lines)