# ORPA-pyOpenRPA/WPy32-3720/python-3.7.2/Lib/site-packages/dask/bag/avro.py

import io
import uuid

from ..highlevelgraph import HighLevelGraph

MAGIC = b'Obj\x01'
SYNC_SIZE = 16


def read_long(fo):
    """variable-length, zig-zag encoding."""
    c = fo.read(1)
    b = ord(c)
    n = b & 0x7F
    shift = 7
    while (b & 0x80) != 0:
        b = ord(fo.read(1))
        n |= (b & 0x7F) << shift
        shift += 7
    return (n >> 1) ^ -(n & 1)
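

# A worked example of the zig-zag decoding above (values are illustrative and
# checked by hand, not taken from this module): Avro's zig-zag scheme maps
# 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ..., and bit 0x80 marks a
# continuation byte in the varint.
#
#     read_long(io.BytesIO(b'\x04'))      # -> 2    (single byte, 4 >> 1)
#     read_long(io.BytesIO(b'\x03'))      # -> -2   (low bit set, so negative)
#     read_long(io.BytesIO(b'\x80\x01'))  # -> 64   (0x80 continues into a
#                                         #          second byte)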


def read_bytes(fo):
    """a long followed by that many bytes of data."""
    size = read_long(fo)
    return fo.read(size)


def read_header(fo):
    """Extract an avro file's header

    Parameters
    ----------
    fo: file-like
        This should be in bytes mode, e.g., io.BytesIO

    Returns dict representing the header
    """
    assert fo.read(len(MAGIC)) == MAGIC, 'Magic avro bytes missing'
    meta = {}
    out = {'meta': meta}
    while True:
        n_keys = read_long(fo)
        if n_keys == 0:
            break
        for i in range(n_keys):
            # ignore dtype mapping for bag version
            read_bytes(fo)  # schema keys
            read_bytes(fo)  # schema values
    out['sync'] = fo.read(SYNC_SIZE)
    out['header_size'] = fo.tell()
    fo.seek(0)
    out['head_bytes'] = fo.read(out['header_size'])
    return out
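

# Orientation only (a hand-built, degenerate header; real Avro files also carry
# 'avro.schema' metadata): the header is the 4 magic bytes, a metadata map
# (here empty, encoded as a single zero count), then a 16-byte sync marker.
#
#     hdr = read_header(io.BytesIO(MAGIC + b'\x00' + b'\xab' * SYNC_SIZE))
#     hdr['sync']         # -> b'\xab' * 16
#     hdr['header_size']  # -> 21  (4 magic + 1 count byte + 16 sync)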


def open_head(fs, path, compression):
    """Open a file just to read its head and size"""
    from dask.bytes.core import OpenFile, logical_size
    with OpenFile(fs, path, compression=compression) as f:
        head = read_header(f)
    size = logical_size(fs, path, compression)
    return head, size


def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
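

# Hedged usage sketch (the bucket and file names below are made up): this
# function is normally reached as dask.bag.read_avro.
#
#     import dask.bag as db
#     bag = db.read_avro('s3://some-bucket/events.*.avro',
#                        blocksize=None,            # one partition per file
#                        storage_options={'anon': True})
#     bag.take(2)
#
# With the default blocksize, each file is instead split into ~100 MB chunks
# delimited on the sync marker discovered by open_head.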


def read_chunk(fobj, off, l, head):
    """Get rows from raw bytes block"""
    import fastavro
    from dask.bytes.core import read_block
    with fobj as f:
        chunk = read_block(f, off, l, head['sync'])
    head_bytes = head['head_bytes']
    if not chunk.startswith(MAGIC):
        # a mid-file block has no header of its own, so prepend the file's
        # header bytes to give fastavro the schema and sync marker
        chunk = head_bytes + chunk
    i = io.BytesIO(chunk)
    return list(fastavro.iter_avro(i))


def read_file(fo):
    """Get rows from file-like"""
    import fastavro
    with fo as f:
        return list(fastavro.iter_avro(f))


def to_avro(b, filename, schema, name_function=None, storage_options=None,
            codec='null', sync_interval=16000, metadata=None, compute=True,
            **kwargs):
    """Write bag to set of avro files

    The schema is a complex dictionary describing the data, see
    https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
    and https://fastavro.readthedocs.io/en/latest/writer.html .
    Its structure is as follows::

        {'name': 'Test',
         'namespace': 'Test',
         'doc': 'Descriptive text',
         'type': 'record',
         'fields': [
             {'name': 'a', 'type': 'int'},
         ]}

    where the "name" field is required, but "namespace" and "doc" are optional
    descriptors; "type" must always be "record". The list of fields should
    have an entry for every key of the input records, and the types are
    like the primitive, complex or logical types of the Avro spec
    ( https://avro.apache.org/docs/1.8.2/spec.html ).

    Results in one avro file per input partition.

    Parameters
    ----------
    b: dask.bag.Bag
    filename: list of str or str
        Filenames to write to. If a list, number must match the number of
        partitions. If a string, must include a glob character "*", which will
        be expanded using name_function
    schema: dict
        Avro schema dictionary, see above
    name_function: None or callable
        Expands integers into strings, see
        ``dask.bytes.utils.build_name_function``
    storage_options: None or dict
        Extra key/value options to pass to the backend file-system
    codec: 'null', 'deflate', or 'snappy'
        Compression algorithm
    sync_interval: int
        Number of records to include in each block within a file
    metadata: None or dict
        Included in the file header
    compute: bool
        If True, files are written immediately, and function blocks. If False,
        returns delayed objects, which can be computed by the user where
        convenient.
    kwargs: passed to compute(), if compute=True

    Examples
    --------
    >>> import dask.bag as db
    >>> b = db.from_sequence([{'name': 'Alice', 'value': 100},
    ...                       {'name': 'Bob', 'value': 200}])
    >>> schema = {'name': 'People', 'doc': "Set of people's scores",
    ...           'type': 'record',
    ...           'fields': [
    ...               {'name': 'name', 'type': 'string'},
    ...               {'name': 'value', 'type': 'int'}]}
    >>> b.to_avro('my-data.*.avro', schema)  # doctest: +SKIP
    ['my-data.0.avro', 'my-data.1.avro']
    """
    # TODO infer schema from first partition of data
    from dask.utils import import_required
    from dask.bytes.core import open_files
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.to_avro().")
    _verify_schema(schema)

    storage_options = storage_options or {}
    files = open_files(filename, 'wb', name_function=name_function,
                       num=b.npartitions, **storage_options)
    name = 'to-avro-' + uuid.uuid4().hex
    dsk = {(name, i): (_write_avro_part, (b.name, i), f, schema, codec,
                       sync_interval, metadata)
           for i, f in enumerate(files)}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)
    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()


def _verify_schema(s):
    assert isinstance(s, dict), 'Schema must be dictionary'
    for field in ['name', 'type', 'fields']:
        assert field in s, "Schema missing '%s' field" % field
    assert s['type'] == 'record', "Schema must be of type 'record'"
    assert isinstance(s['fields'], list), 'Fields entry must be a list'
    for f in s['fields']:
        assert 'name' in f and 'type' in f, "Field spec incomplete: %s" % f
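

# For example (illustrative schema, not part of this module): the record below
# satisfies _verify_schema, while dropping 'fields' or using a non-'record'
# type would raise AssertionError.
#
#     _verify_schema({'name': 'Point', 'type': 'record',
#                     'fields': [{'name': 'x', 'type': 'int'},
#                                {'name': 'y', 'type': 'int'}]})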


def _write_avro_part(part, f, schema, codec, sync_interval, metadata):
    """Create single avro file from list of dictionaries"""
    import fastavro
    with f as f:
        fastavro.writer(f, schema, part, codec, sync_interval, metadata)
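

# End-to-end sketch (file names are illustrative): writing with Bag.to_avro and
# reading back with read_avro should round-trip the records, assuming fastavro
# is installed.
#
#     import dask.bag as db
#     b = db.from_sequence([{'name': 'Alice', 'value': 100},
#                           {'name': 'Bob', 'value': 200}], npartitions=1)
#     schema = {'name': 'People', 'type': 'record',
#               'fields': [{'name': 'name', 'type': 'string'},
#                          {'name': 'value', 'type': 'int'}]}
#     b.to_avro('people.*.avro', schema)
#     db.read_avro('people.*.avro').compute()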