You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123 lines
4.3 KiB

6 years ago
import io
import pytest
from dask.bytes.utils import read_block, seek_delimiter, infer_storage_options
def test_read_block():
    """Exercise ``read_block`` on a small three-record, newline-separated buffer.

    Covers a plain offset/length read, a series of delimiter-aware reads, and
    a check that consecutive blocks covering the whole buffer reassemble into
    the original bytes.
    """
    newline = b'\n'
    data = newline.join([b'123', b'456', b'789'])
    f = io.BytesIO(data)

    # No delimiter given: offset 1, length 2 yields exactly those two bytes.
    assert read_block(f, 1, 2) == b'23'

    # (offset, length, expected bytes) for reads that pass a delimiter.
    delimited_cases = [
        (0, 1, b'123\n'),
        (0, 2, b'123\n'),
        (0, 3, b'123\n'),
        (0, 5, b'123\n456\n'),
        (0, 8, b'123\n456\n789'),
        (0, 100, b'123\n456\n789'),
        (1, 1, b''),
        (1, 5, b'456\n'),
        (1, 8, b'456\n789'),
    ]
    for offset, length, expected in delimited_cases:
        assert read_block(f, offset, length, delimiter=b'\n') == expected

    # Two different ways of splitting the buffer into consecutive
    # (offset, length) blocks; the non-empty pieces must join back to ``data``.
    tilings = [
        [(0, 3), (3, 3), (6, 3), (9, 2)],
        [(0, 4), (4, 4), (8, 4)],
    ]
    for tiling in tilings:
        pieces = [read_block(f, offset, length, b'\n')
                  for offset, length in tiling]
        assert b"".join(piece for piece in pieces if piece) == data
def test_seek_delimiter_endline():
    """Check where ``seek_delimiter`` leaves the file position.

    Each scenario positions the file explicitly before calling
    ``seek_delimiter`` and then asserts on ``f.tell()``:

    * starting at offset 0, the position is left at 0;
    * starting inside the first record, the position ends just past the
      first delimiter, for a range of block sizes;
    * the same holds for a multi-byte delimiter, including block sizes
      shorter than the delimiter;
    * when no delimiter remains, the position ends at the end of the data.
    """
    f = io.BytesIO(b'123\n456\n789')

    # if at zero, stay at zero
    seek_delimiter(f, b'\n', 5)
    assert f.tell() == 0

    # choose the first block
    for bs in [1, 5, 100]:
        # Reposition inside the first record on every iteration: a handle
        # left at 0 would stay at 0 (asserted above), and a handle left at 4
        # by the previous iteration would not start from the same place.
        f.seek(1)
        seek_delimiter(f, b'\n', blocksize=bs)
        assert f.tell() == 4

    # handle long delimiters well, even with short blocksizes
    f = io.BytesIO(b'123abc456abc789')
    for bs in [1, 2, 3, 4, 5, 6, 10]:
        # Same reasoning as above: start from a fixed non-zero offset.
        f.seek(1)
        seek_delimiter(f, b'abc', blocksize=bs)
        assert f.tell() == 6

    # End at the end
    f = io.BytesIO(b'123\n456')
    # Start past the only delimiter so nothing is left to find.
    f.seek(5)
    seek_delimiter(f, b'\n', 5)
    assert f.tell() == 7
def test_infer_storage_options():
    """Check the option dict ``infer_storage_options`` builds for various paths.

    Scenarios, in order: plain POSIX and relative paths, Windows-style paths,
    a fully populated ``hdfs://`` URL combined with inherited options, a
    mixed-case user/host pair, an ``http://`` URL, bucket-style protocols,
    and inherited options that collide with inferred keys.
    """
    # Local absolute path: only 'protocol' and 'path' are produced.
    so = infer_storage_options('/mnt/datasets/test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert not so

    assert infer_storage_options('./test.csv')['path'] == './test.csv'
    assert infer_storage_options('../test.csv')['path'] == '../test.csv'

    # Windows drive-letter and backslash paths are returned unchanged.
    so = infer_storage_options('C:\\test.csv')
    assert so.pop('protocol') == 'file'
    assert so.pop('path') == 'C:\\test.csv'
    assert not so

    assert infer_storage_options('d:\\test.csv')['path'] == 'd:\\test.csv'
    assert infer_storage_options('\\test.csv')['path'] == '\\test.csv'
    assert infer_storage_options('.\\test.csv')['path'] == '.\\test.csv'
    assert infer_storage_options('test.csv')['path'] == 'test.csv'

    # URL carrying every component asserted below, plus one inherited option
    # that must be passed through alongside the inferred ones.
    so = infer_storage_options(
        'hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm',
        inherit_storage_options={'extra': 'value'})
    assert so.pop('protocol') == 'hdfs'
    assert so.pop('username') == 'username'
    assert so.pop('password') == 'pwd'
    assert so.pop('host') == 'Node'
    assert so.pop('port') == 123
    assert so.pop('path') == '/mnt/datasets/test.csv'
    assert so.pop('url_query') == 'q=1'
    assert so.pop('url_fragment') == 'fragm'
    assert so.pop('extra') == 'value'
    assert not so

    # Mixed-case, hyphenated user and host names must come back verbatim.
    so = infer_storage_options(
        'hdfs://User-name@Node-name.com/mnt/datasets/test.csv')
    assert so.pop('username') == 'User-name'
    assert so.pop('host') == 'Node-name.com'

    # For http the whole URL is expected back as the path.
    u = 'http://127.0.0.1:8080/test.csv'
    assert infer_storage_options(u) == {'protocol': 'http', 'path': u}

    # For s3 and gcs the netloc is actually the bucket name, so we want to
    # include it in the path. Test that:
    # - Parsing doesn't lowercase the bucket
    # - The bucket is included in path
    for protocol in ['s3', 'gcs', 'gs']:
        options = infer_storage_options(
            '%s://Bucket-name.com/test.csv' % protocol)
        assert options['path'] == 'Bucket-name.com/test.csv'

    # Inherited options that clash with an inferred key must raise KeyError.
    with pytest.raises(KeyError):
        infer_storage_options('file:///bucket/file.csv', {'path': 'collide'})
    with pytest.raises(KeyError):
        infer_storage_options('hdfs:///bucket/file.csv', {'protocol': 'collide'})
@pytest.mark.parametrize('urlpath, expected_path', (
    (r'c:\foo\bar', r'c:\foo\bar'),
    (r'C:\\foo\bar', r'C:\\foo\bar'),
    (r'c:/foo/bar', r'c:/foo/bar'),
    (r'file:///c|\foo\bar', r'c:\foo\bar'),
    (r'file:///C|/foo/bar', r'C:/foo/bar'),
    (r'file:///C:/foo/bar', r'C:/foo/bar'),
))
def test_infer_storage_options_c(urlpath, expected_path):
    """Check drive-letter paths and ``file:///`` URLs map to the 'file' protocol.

    Parameters
    ----------
    urlpath : str
        Input handed to ``infer_storage_options``: a bare drive-letter path
        or a ``file:///`` URL using ``C:`` or ``C|`` for the drive.
    expected_path : str
        Value expected under the ``'path'`` key of the result.
    """
    so = infer_storage_options(urlpath)
    assert so['protocol'] == 'file'
    assert so['path'] == expected_path