ORPA-pyOpenRPA/Resources/WPy64-3720/python-3.7.2.amd64/Lib/site-packages/dask/bytes/tests/test_hdfs.py

from __future__ import print_function, division, absolute_import
import os
import posixpath
from distutils.version import LooseVersion
import pytest
from toolz import concat
import dask
from dask.bytes.core import read_bytes, open_files, get_fs
from dask.compatibility import unicode, PY2

try:
    import distributed
    from distributed import Client
    from distributed.utils_test import cluster, loop  # noqa: F401
except ImportError:
    distributed = None

try:
    import hdfs3
except ImportError:
    hdfs3 = None

try:
    import pyarrow
    from dask.bytes.pyarrow import _MIN_PYARROW_VERSION_SUPPORTED
    PYARROW_DRIVER = (LooseVersion(pyarrow.__version__) >=
                      _MIN_PYARROW_VERSION_SUPPORTED)
except ImportError:
    PYARROW_DRIVER = False
    pyarrow = None


if not os.environ.get('DASK_RUN_HDFS_TESTS', ''):
    pytestmark = pytest.mark.skip(reason="HDFS tests not configured to run")

basedir = '/tmp/test-dask'


# This fixture checks for a minimum pyarrow version
@pytest.fixture(params=[
    pytest.param('hdfs3', marks=pytest.mark.skipif(not hdfs3,
                                                   reason='hdfs3 not found')),
    pytest.param('pyarrow', marks=pytest.mark.skipif(not PYARROW_DRIVER,
                                                     reason='required pyarrow version not found'))])
def hdfs(request):
    if request.param == 'hdfs3':
        hdfs = hdfs3.HDFileSystem(host='localhost', port=8020)
    else:
        hdfs = pyarrow.hdfs.connect(host='localhost', port=8020)

    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)
    hdfs.mkdir(basedir)

    with dask.config.set(hdfs_driver=request.param):
        yield hdfs

    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)

# This mark doesn't check the minimum pyarrow version.
require_pyarrow = pytest.mark.skipif(not pyarrow, reason="pyarrow not installed")
require_hdfs3 = pytest.mark.skipif(not hdfs3, reason="hdfs3 not installed")
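

# The default get_fs('hdfs') should return the pyarrow-backed filesystem;
# setting hdfs_driver='hdfs3' in dask.config switches backends, and an
# unknown driver name raises ValueError.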
@require_pyarrow
@require_hdfs3
def test_fs_driver_backends():
    from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
    from dask.bytes.pyarrow import PyArrowHadoopFileSystem

    fs1, token1 = get_fs('hdfs')
    assert isinstance(fs1, PyArrowHadoopFileSystem)

    with dask.config.set(hdfs_driver='hdfs3'):
        fs2, token2 = get_fs('hdfs')
    assert isinstance(fs2, HDFS3HadoopFileSystem)

    assert token1 != token2

    with pytest.raises(ValueError):
        with dask.config.set(hdfs_driver='not-a-valid-driver'):
            get_fs('hdfs')
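

# Write several small files, then read them all back through a single
# read_bytes glob and check the concatenated blocks match what was written.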
def test_read_bytes(hdfs):
    nfiles = 10

    data = b'a' * int(1e3)
    for fn in ['%s/file.%d' % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

    sample, values = read_bytes('hdfs://%s/file.*' % basedir)
    (results,) = dask.compute(values)
    assert [b''.join(r) for r in results] == nfiles * [data]
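

# Same round-trip as above, but the URL spells out the NameNode host and port.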
def test_read_bytes_URL(hdfs):
    nfiles = 10

    data = b'a' * int(1e3)
    for fn in ['%s/file.%d' % (basedir, i) for i in range(nfiles)]:
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

    path = 'hdfs://localhost:8020%s/file.*' % basedir
    sample, values = read_bytes(path)
    (results,) = dask.compute(values)
    assert [b''.join(r) for r in results] == nfiles * [data]
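

# A single ~100 MB file read with an explicit blocksize should yield one
# delayed block per write and reassemble to the original contents.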
def test_read_bytes_big_file(hdfs):
    fn = '%s/file' % basedir

    # Write 100 MB file
    nblocks = int(1e3)
    blocksize = int(1e5)
    data = b'a' * blocksize
    with hdfs.open(fn, 'wb', replication=1) as f:
        for i in range(nblocks):
            f.write(data)

    sample, values = read_bytes('hdfs://' + fn, blocksize=blocksize)

    assert sample[:5] == b'aaaaa'
    assert len(values[0]) == nblocks

    (results,) = dask.compute(values[0])
    assert sum(map(len, results)) == nblocks * blocksize
    for r in results:
        assert set(r.decode('utf-8')) == {'a'}
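

# Delayed keys should be deterministic: identical read_bytes calls share
# keys, while changing the delimiter produces different ones.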
def test_deterministic_key_names(hdfs):
    data = b'abc\n' * int(1e3)
    fn = '%s/file' % basedir
    with hdfs.open(fn, 'wb', replication=1) as fil:
        fil.write(data)

    _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c', sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
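

# Files written through open_files(mode='wb') land as numbered *.part files
# that read_bytes can read back in order.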
def test_open_files_write(hdfs):
    path = 'hdfs://%s/' % basedir
    data = [b'test data %i' % i for i in range(5)]

    files = open_files(path, num=len(data), mode='wb')
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes('hdfs://%s/*.part' % basedir)

    (results,) = dask.compute(list(concat(vals)))
    assert data == results
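

# dask.dataframe.read_csv should accept an hdfs:// glob spanning both files.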
def test_read_csv(hdfs):
    dd = pytest.importorskip('dask.dataframe')

    with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
        f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

    with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
        f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

    df = dd.read_csv('hdfs://%s/*.csv' % basedir)

    assert isinstance(df, dd.DataFrame)
    assert df.id.sum().compute() == 1 + 2 + 3 + 4
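

# read_text is computed on a multiprocessing 'spawn' pool because pyarrow's
# HDFS client is not fork-safe, which is also why the test is skipped on
# Python 2.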
@pytest.mark.skipif(PY2, reason=("pyarrow's hdfs isn't fork-safe, requires "
                                 "multiprocessing `spawn` start method"))
def test_read_text(hdfs):
    db = pytest.importorskip('dask.bag')
    import multiprocessing as mp
    pool = mp.get_context('spawn').Pool(2)

    with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
        f.write('Alice 100\nBob 200\nCharlie 300'.encode())

    with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
        f.write('Dan 400\nEdith 500\nFrank 600'.encode())

    with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
        f.write('a b\nc d'.encode())

    b = db.read_text('hdfs://%s/text.*.txt' % basedir)
    with dask.config.set(pool=pool):
        result = b.str.strip().str.split().map(len).compute()

    assert result == [2, 2, 2, 2, 2, 2]

    b = db.read_text('hdfs://%s/other.txt' % basedir)
    with dask.config.set(pool=pool):
        result = b.str.split().flatten().compute()

    assert result == ['a', 'b', 'c', 'd']
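

# Non-ASCII bytes written to HDFS should come back as properly decoded
# unicode lines.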
def test_read_text_unicode(hdfs):
    db = pytest.importorskip('dask.bag')
    data = b'abcd\xc3\xa9'
    fn = '%s/data.txt' % basedir
    with hdfs.open(fn, 'wb') as f:
        f.write(b'\n'.join([data, data]))

    f = db.read_text('hdfs://' + fn, collection=False)

    result = f[0].compute()
    assert len(result) == 2
    assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
    assert len(result[0].strip()) == 5
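

# The hdfs3-backed wrapper must expose a filesystem object that pyarrow
# recognizes as one of its own.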
@require_pyarrow
@require_hdfs3
def test_pyarrow_compat():
    from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
    dhdfs = HDFS3HadoopFileSystem()
    pa_hdfs = dhdfs._get_pyarrow_filesystem()
    assert isinstance(pa_hdfs, pyarrow.filesystem.FileSystem)
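

# Round-trip a dask dataframe through Parquet on HDFS using the pyarrow
# engine.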
@require_pyarrow
def test_parquet_pyarrow(hdfs):
    dd = pytest.importorskip('dask.dataframe')
    import pandas as pd
    import numpy as np

    fn = '%s/test.parquet' % basedir
    hdfs_fn = 'hdfs://%s' % fn
    df = pd.DataFrame(np.random.normal(size=(1000, 4)),
                      columns=list('abcd'))
    ddf = dd.from_pandas(df, npartitions=4)

    ddf.to_parquet(hdfs_fn, engine='pyarrow')

    assert len(hdfs.ls(fn))  # Files are written

    ddf2 = dd.read_parquet(hdfs_fn, engine='pyarrow')
    assert len(ddf2) == 1000  # smoke test on read
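

# Exercise glob() on a small directory tree for whichever wrapper filesystem
# is active: wildcards over files and directories, exact paths, and paths
# that do not exist.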
def test_glob(hdfs):
    if type(hdfs).__module__.startswith('hdfs3'):
        from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
        hdfs = HDFS3HadoopFileSystem.from_hdfs3(hdfs)
    else:
        from dask.bytes.pyarrow import PyArrowHadoopFileSystem
        hdfs = PyArrowHadoopFileSystem.from_pyarrow(hdfs)

    tree = {basedir: (['c', 'c2'], ['a', 'a1', 'a2', 'a3', 'b1']),
            basedir + '/c': (['d'], ['x1', 'x2']),
            basedir + '/c2': (['d'], ['x1', 'x2']),
            basedir + '/c/d': ([], ['x3'])}

    hdfs.mkdirs(basedir + '/c/d/')
    hdfs.mkdirs(basedir + '/c2/d/')
    for fn in (posixpath.join(dirname, f)
               for (dirname, (_, fils)) in tree.items()
               for f in fils):
        with hdfs.open(fn, mode='wb') as f2:
            f2.write(b'000')

    assert (set(hdfs.glob(basedir + '/a*')) ==
            {basedir + p for p in ['/a', '/a1', '/a2', '/a3']})

    assert (set(hdfs.glob(basedir + '/c/*')) ==
            {basedir + p for p in ['/c/x1', '/c/x2', '/c/d']})

    assert (set(hdfs.glob(basedir + '/*/x*')) ==
            {basedir + p for p in ['/c/x1', '/c/x2', '/c2/x1', '/c2/x2']})
    assert (set(hdfs.glob(basedir + '/*/x1')) ==
            {basedir + p for p in ['/c/x1', '/c2/x1']})

    assert hdfs.glob(basedir + '/c') == [basedir + '/c']
    assert hdfs.glob(basedir + '/c/') == [basedir + '/c/']
    assert hdfs.glob(basedir + '/a') == [basedir + '/a']

    assert hdfs.glob('/this-path-doesnt-exist') == []
    assert hdfs.glob(basedir + '/missing/') == []
    assert hdfs.glob(basedir + '/missing/x1') == []
    assert hdfs.glob(basedir + '/missing/*') == []
    assert hdfs.glob(basedir + '/*/missing') == []

    assert (set(hdfs.glob(basedir + '/*')) ==
            {basedir + p for p in ['/a', '/a1', '/a2', '/a3', '/b1',
                                   '/c', '/c2']})
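

# The same CSV read should also work when computed through a distributed
# scheduler and client.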
@pytest.mark.skipif(not distributed,  # noqa: F811
                    reason="Skipped as distributed is not installed.")
def test_distributed(hdfs, loop):  # noqa: F811
    dd = pytest.importorskip('dask.dataframe')

    with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
        f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

    with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
        f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):  # noqa: F811
            df = dd.read_csv('hdfs://%s/*.csv' % basedir)
            assert df.id.sum().compute() == 1 + 2 + 3 + 4