import io
import uuid

from ..highlevelgraph import HighLevelGraph

MAGIC = b'Obj\x01'
SYNC_SIZE = 16


def read_long(fo):
    """Read a variable-length, zig-zag encoded long from a file-like object."""
    c = fo.read(1)
    b = ord(c)
    n = b & 0x7F
    shift = 7
    while (b & 0x80) != 0:
        b = ord(fo.read(1))
        n |= (b & 0x7F) << shift
        shift += 7
    return (n >> 1) ^ -(n & 1)


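# Illustrative sketch: zig-zag encoding maps 0, -1, 1, -2, 2, ... onto
# 0, 1, 2, 3, 4, ..., so the single byte 0x04 decodes to 2 and 0x01 to -1.
# The helper below is hypothetical and exists only to demonstrate read_long.
def _example_read_long():
    assert read_long(io.BytesIO(b'\x04')) == 2
    assert read_long(io.BytesIO(b'\x01')) == -1
    # multi-byte varint: 0x96 0x01 carries the raw value 150, zig-zag 75
    assert read_long(io.BytesIO(b'\x96\x01')) == 75

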
def read_bytes(fo):
    """Read a length prefix (long), followed by that many bytes of data."""
    size = read_long(fo)
    return fo.read(size)


def read_header(fo):
    """Extract an avro file's header

    Parameters
    ----------
    fo: file-like
        This should be in bytes mode, e.g., io.BytesIO

    Returns
    -------
    dict representing the header
    """
    assert fo.read(len(MAGIC)) == MAGIC, 'Magic avro bytes missing'
    meta = {}
    out = {'meta': meta}
    while True:
        n_keys = read_long(fo)
        if n_keys == 0:
            break
        for i in range(n_keys):
            # ignore dtype mapping for bag version
            read_bytes(fo)  # schema keys
            read_bytes(fo)  # schema values
    out['sync'] = fo.read(SYNC_SIZE)
    out['header_size'] = fo.tell()
    fo.seek(0)
    out['head_bytes'] = fo.read(out['header_size'])
    return out


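# Illustrative sketch (assumes fastavro is installed): write a tiny avro file
# into an in-memory buffer with fastavro.writer, then extract its header.
# The schema, records and helper name here are hypothetical.
def _example_read_header():
    import fastavro
    schema = {'name': 'Example', 'type': 'record',
              'fields': [{'name': 'a', 'type': 'int'}]}
    buf = io.BytesIO()
    fastavro.writer(buf, schema, [{'a': 1}, {'a': 2}])
    buf.seek(0)
    header = read_header(buf)
    # 'sync' is the 16-byte block delimiter; 'head_bytes' is the raw header
    # that read_chunk prepends to blocks which do not start with MAGIC
    assert len(header['sync']) == SYNC_SIZE
    assert header['head_bytes'].startswith(MAGIC)
    return header

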
def open_head(fs, path, compression):
    """Open a file just to read its head and size"""
    from dask.bytes.core import OpenFile, logical_size
    with OpenFile(fs, path, compression=compression) as f:
        head = read_header(f)
    size = logical_size(fs, path, compression)
    return head, size


def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        Passed to the backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)


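# Illustrative sketch: this function is exposed as dask.bag.read_avro.  The
# paths below are hypothetical; with blocksize=None each file becomes exactly
# one partition, which is required when compression is used.
#
#     import dask.bag as db
#     bag = db.read_avro('data/part.*.avro', blocksize=None)
#     bag.take(2)

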
def read_chunk(fobj, off, l, head):
    """Get rows from raw bytes block"""
    import fastavro
    from dask.bytes.core import read_block
    with fobj as f:
        chunk = read_block(f, off, l, head['sync'])
    head_bytes = head['head_bytes']
    if not chunk.startswith(MAGIC):
        chunk = head_bytes + chunk
    i = io.BytesIO(chunk)
    return list(fastavro.iter_avro(i))


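# Note on read_chunk: read_block splits the file on the 16-byte sync marker
# stored in head['sync'], so each chunk holds whole avro blocks.  Prepending
# head['head_bytes'] to chunks that do not already start with MAGIC turns a
# mid-file chunk into a self-contained avro stream that fastavro can parse.

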
def read_file(fo):
    """Get rows from file-like"""
    import fastavro
    with fo as f:
        return list(fastavro.iter_avro(f))


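# Illustrative sketch: read_file expects something usable as a context manager
# yielding a binary file, such as the OpenFile objects produced by
# dask.bytes.core.open_files.  A plain built-in open() works the same way for
# a local, uncompressed file (the path below is hypothetical):
#
#     rows = read_file(open('data/part.0.avro', 'rb'))

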
def to_avro(b, filename, schema, name_function=None, storage_options=None,
            codec='null', sync_interval=16000, metadata=None, compute=True,
            **kwargs):
    """Write bag to set of avro files

    The schema is a complex dictionary describing the data, see
    https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
    and https://fastavro.readthedocs.io/en/latest/writer.html .
    Its structure is as follows::

        {'name': 'Test',
         'namespace': 'Test',
         'doc': 'Descriptive text',
         'type': 'record',
         'fields': [
             {'name': 'a', 'type': 'int'},
         ]}

    where the "name" field is required, but "namespace" and "doc" are optional
    descriptors; "type" must always be "record". The list of fields should
    have an entry for every key of the input records, and the types are
    like the primitive, complex or logical types of the Avro spec
    ( https://avro.apache.org/docs/1.8.2/spec.html ).

    Results in one avro file per input partition.

    Parameters
    ----------
    b: dask.bag.Bag
    filename: list of str or str
        Filenames to write to. If a list, number must match the number of
        partitions. If a string, must include a glob character "*", which will
        be expanded using name_function
    schema: dict
        Avro schema dictionary, see above
    name_function: None or callable
        Expands integers into strings, see
        ``dask.bytes.utils.build_name_function``
    storage_options: None or dict
        Extra key/value options to pass to the backend file-system
    codec: 'null', 'deflate', or 'snappy'
        Compression algorithm
    sync_interval: int
        Number of records to include in each block within a file
    metadata: None or dict
        Included in the file header
    compute: bool
        If True, files are written immediately, and function blocks. If False,
        returns delayed objects, which can be computed by the user where
        convenient.
    kwargs: passed to compute(), if compute=True

    Examples
    --------
    >>> import dask.bag as db
    >>> b = db.from_sequence([{'name': 'Alice', 'value': 100},
    ...                       {'name': 'Bob', 'value': 200}])
    >>> schema = {'name': 'People', 'doc': "Set of people's scores",
    ...           'type': 'record',
    ...           'fields': [
    ...               {'name': 'name', 'type': 'string'},
    ...               {'name': 'value', 'type': 'int'}]}
    >>> b.to_avro('my-data.*.avro', schema)  # doctest: +SKIP
    ['my-data.0.avro', 'my-data.1.avro']
    """
    # TODO infer schema from first partition of data
    from dask.utils import import_required
    from dask.bytes.core import open_files
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.to_avro().")
    _verify_schema(schema)

    storage_options = storage_options or {}
    files = open_files(filename, 'wb', name_function=name_function,
                       num=b.npartitions, **storage_options)
    name = 'to-avro-' + uuid.uuid4().hex
    dsk = {(name, i): (_write_avro_part, (b.name, i), f, schema, codec,
                       sync_interval, metadata)
           for i, f in enumerate(files)}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)
    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()


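# Illustrative sketch: with compute=False the writes are returned as delayed
# objects and can be executed later alongside other work.  The filename and
# schema below are hypothetical.
#
#     import dask
#     import dask.bag as db
#     b = db.from_sequence([{'a': 1}, {'a': 2}], npartitions=2)
#     schema = {'name': 'Example', 'type': 'record',
#               'fields': [{'name': 'a', 'type': 'int'}]}
#     writes = b.to_avro('out.*.avro', schema, compute=False)
#     dask.compute(*writes)

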
def _verify_schema(s):
    assert isinstance(s, dict), 'Schema must be dictionary'
    for field in ['name', 'type', 'fields']:
        assert field in s, "Schema missing '%s' field" % field
    assert s['type'] == 'record', "Schema must be of type 'record'"
    assert isinstance(s['fields'], list), 'Fields entry must be a list'
    for f in s['fields']:
        assert 'name' in f and 'type' in f, "Field spec incomplete: %s" % f


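# Illustrative sketch: the minimal record schema from the to_avro docstring
# passes verification, while a schema missing 'fields' raises AssertionError.
# The helper name is hypothetical.
def _example_verify_schema():
    _verify_schema({'name': 'Test', 'type': 'record',
                    'fields': [{'name': 'a', 'type': 'int'}]})
    try:
        _verify_schema({'name': 'Test', 'type': 'record'})
    except AssertionError:
        pass

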
def _write_avro_part(part, f, schema, codec, sync_interval, metadata):
    """Create single avro file from list of dictionaries"""
    import fastavro
    with f as f:
        fastavro.writer(f, schema, part, codec, sync_interval, metadata)