You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

179 lines
6.5 KiB

from __future__ import absolute_import
import io
import pandas as pd
from dask.bytes import open_files, read_bytes
import dask
def to_json(df, url_path, orient='records', lines=None, storage_options=None,
            compute=True, encoding='utf-8', errors='strict',
            compression=None, **kwargs):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    compression : string or None
        String like 'gzip' or 'xz'.

    Returns
    -------
    list
        When ``compute`` is true, the ``path`` of every output file;
        otherwise the list of delayed write tasks, one per partition.

    Raises
    ------
    ValueError
        If ``lines`` is true while ``orient`` is not ``'records'``.
    """
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    # ``name_function`` is for the file-opening layer only; remove it before
    # the remaining kwargs are handed on to pandas.
    name_function = kwargs.pop('name_function', None)
    kwargs['orient'] = orient
    kwargs['lines'] = lines and orient == 'records'
    outfiles = open_files(
        url_path, 'wt', encoding=encoding,
        errors=errors,
        name_function=name_function,
        # one output file per partition, so that a glob path expands fully
        num=df.npartitions,
        compression=compression,
        **(storage_options or {})
    )
    parts = [dask.delayed(write_json_partition)(d, outfile, kwargs)
             for outfile, d in zip(outfiles, df.to_delayed())]
    if compute:
        # Actually perform the writes before reporting the paths.
        dask.compute(parts)
        return [f.path for f in outfiles]
    return parts
def write_json_partition(df, openfile, kwargs):
    """Write a single partition to its output file as JSON.

    ``openfile`` is entered as a context manager, and the object it yields
    is handed to ``df.to_json`` together with the keyword options held in
    the ``kwargs`` dict.
    """
    with openfile as handle:
        df.to_json(handle, **kwargs)
def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8', errors='strict',
              compression='infer', **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON
    output that is most common in big-data scenarios, and which can be chunked
    when reading (see ``read_json()``). All other options require
    blocksize=None, i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the
        nearest newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant if using blocksize.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``bytes.decode()``).
    compression : string or None
        String like 'gzip' or 'xz'.

    Returns
    -------
    dask.DataFrame

    Raises
    ------
    ValueError
        If ``lines`` is true while ``orient`` is not ``'records'``, or if
        ``blocksize`` is given for anything other than JSON-lines input.

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines "
                         "input (orient='records', lines=True).")

    # Imported inside the function rather than at module level, and only
    # after the arguments have been validated.
    import dask.dataframe as dd

    storage_options = storage_options or {}
    if blocksize:
        # Split the raw bytes on newlines so every block holds whole records.
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, compression=compression,
                                   **storage_options)
        chunks = list(dask.core.flatten(chunks))
        # Parse the sample eagerly; its zero-row slice is the stand-in
        # result for any block that parses to an empty frame.
        first = read_json_chunk(first, encoding, errors, kwargs)
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]
    else:
        # One partition per input file.
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           compression=compression, **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)
def read_json_chunk(chunk, encoding, errors, kwargs, meta=None):
    """Parse one block of line-delimited JSON into a pandas dataframe.

    ``chunk`` is decoded with ``encoding``/``errors`` and read by
    ``pandas.read_json`` with ``orient='records', lines=True`` plus the
    options in ``kwargs``. If the parsed frame is empty and ``meta`` was
    supplied, ``meta`` is returned in its place.
    """
    text = chunk.decode(encoding, errors)
    frame = pd.read_json(io.StringIO(text), orient='records', lines=True,
                         **kwargs)
    if meta is None or not frame.empty:
        return frame
    return meta
def read_json_file(f, orient, lines, kwargs):
    """Read one whole JSON file into a pandas dataframe.

    ``f`` is entered as a context manager, and the object it yields is
    passed to ``pandas.read_json`` with the given ``orient`` and ``lines``
    and the extra options in ``kwargs``.
    """
    with f as handle:
        frame = pd.read_json(handle, orient=orient, lines=lines, **kwargs)
    return frame