You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
324 lines
11 KiB
324 lines
11 KiB
from __future__ import print_function, division, absolute_import
|
|
|
|
import requests
|
|
import uuid
|
|
|
|
from . import core
|
|
|
|
# Read-ahead block size, in bytes, used when no block size is configured
# explicitly (5 MiB).
DEFAULT_BLOCK_SIZE = 5 * 1024 * 1024
|
|
|
|
|
|
class HTTPFileSystem(object):
    """
    Simple File-System for fetching data via HTTP(S)

    Unlike other file-systems, HTTP is limited in that it does not provide glob
    or write capability.
    """
    sep = '/'

    def __init__(self, **storage_options):
        """
        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        storage_options: key-value
            May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
            other parameters passed on to requests
        """
        self.block_size = storage_options.pop('block_size', DEFAULT_BLOCK_SIZE)
        # Everything left over is forwarded verbatim to the requests calls.
        self.kwargs = storage_options
        # One session is shared by every file opened through this instance.
        self.session = requests.Session()

    def glob(self, url):
        """For a template path, return matching files (not supported)"""
        raise NotImplementedError

    def mkdirs(self, url):
        """Make any intermediate directories to make path writable
        (not supported)"""
        raise NotImplementedError

    def open(self, url, mode='rb', block_size=None, **kwargs):
        """Make a file-like object

        Parameters
        ----------
        url: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Read-ahead size in bytes for this file. If None, the value
            configured on the file-system is used; if 0, the raw streaming
            object of the response is returned instead of an HTTPFile.
        kwargs: key-value
            Any other parameters, passed to requests calls. These take
            precedence over the options given when the file-system was
            created.

        Raises
        ------
        NotImplementedError
            If ``mode`` is anything other than "rb".
        """
        if mode != 'rb':
            raise NotImplementedError
        block_size = block_size if block_size is not None else self.block_size
        # Merge into a copy so per-call options never leak back into the
        # instance-level defaults.
        kw = self.kwargs.copy()
        kw.update(kwargs)
        if block_size:
            return HTTPFile(url, self.session, block_size, **kw)
        # block_size == 0: hand back the undecoded streaming body directly.
        kw['stream'] = True
        r = self.session.get(url, **kw)
        r.raise_for_status()
        # Ask the raw stream to undo any transfer/content encoding on read.
        r.raw.decode_content = True
        return r.raw

    def ukey(self, url):
        """Unique identifier, implied file might have changed every time"""
        return uuid.uuid1().hex

    def size(self, url):
        """Size in bytes of the file at path"""
        return file_size(url, session=self.session, **self.kwargs)
|
|
|
|
|
|
# Register this backend for both the plain and the TLS URL schemes.
core._filesystems['http'] = core._filesystems['https'] = HTTPFileSystem
|
|
|
|
|
|
class HTTPFile(object):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(self, url, session=None, block_size=None, **kwargs):
        self.url = url
        self.kwargs = kwargs
        self.loc = 0
        self.session = session if session is not None else requests.Session()
        self.blocksize = (block_size if block_size is not None
                          else DEFAULT_BLOCK_SIZE)
        try:
            self.size = file_size(url, self.session, allow_redirects=True,
                                  **self.kwargs)
        except ValueError:
            # No size information - only allow read() and no seek()
            self.size = None
        # The cache holds the bytes of the half-open range [start, end).
        self.cache = None
        self.closed = False
        self.start = None
        self.end = None

    def seek(self, where, whence=0):
        """Set file position

        Parameters
        ----------
        where: int
            Location to set
        whence: int (default 0)
            If zero, set from start of file (value should be positive); if 1,
            set relative to current position; if 2, set relative to end of file
            (value should be negative)

        Returns the position.

        Raises
        ------
        ValueError
            If the file size is unknown, ``whence`` is not 0, 1 or 2, or the
            resulting position would be before the start of the file.
        """
        if self.size is None:
            raise ValueError('Cannot seek since size of file is not known')
        if whence == 0:
            nloc = where
        elif whence == 1:
            # Relative to the current position, not to an unset local.
            nloc = self.loc + where
        elif whence == 2:
            nloc = self.size + where
        else:
            raise ValueError('Whence must be in [0, 1, 2], but got %s' % whence)
        if nloc < 0:
            raise ValueError('Seek before start of file')
        self.loc = nloc
        return nloc

    def tell(self):
        """Get current file byte position"""
        return self.loc

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if length == 0:
            # asked for no data, so supply no data and shortcut doing work
            return b''
        if self.size is None:
            if length >= 0:
                # asked for specific amount of data, but we don't know how
                # much is available
                raise ValueError('File size is unknown, must read all data')
            else:
                # asked for whole file
                return self._fetch_all()
        if length < 0 and self.loc == 0:
            # size was provided, but asked for whole file, so shortcut
            return self._fetch_all()
        if length < 0 or self.loc + length > self.size:
            end = self.size
        else:
            end = self.loc + length
        if self.loc >= self.size:
            # EOF (python files don't error, just return no data)
            return b''
        self._fetch(self.loc, end)
        data = self.cache[self.loc - self.start:end - self.start]
        self.loc = end
        return data

    def _fetch(self, start, end):
        """Set new bounds for data cache and fetch data, if required"""
        if self.start is None and self.end is None:
            # First read
            self.start = start
            self.end = end + self.blocksize
            self.cache = self._fetch_range(start, self.end)
        elif start < self.start:
            if end > self.end or self.end - end > self.blocksize:
                # Either the request also runs past the cached data (only
                # prepending would leave the tail missing), or the cache
                # lies far beyond the request: start over from the request.
                self.start = start
                self.end = end + self.blocksize
                self.cache = self._fetch_range(self.start, self.end)
            else:
                # Request overlaps the front of the cache: prepend the gap.
                new = self._fetch_range(start, self.start)
                self.start = start
                self.cache = new + self.cache
        elif end > self.end:
            if self.end > self.size:
                # Cache already extends past EOF; nothing more to download.
                return
            if end - self.end > self.blocksize:
                # Far beyond the cache: discard it and start over.
                self.start = start
                self.end = end + self.blocksize
                self.cache = self._fetch_range(self.start, self.end)
            else:
                # Just past the cache: extend it, with read-ahead.
                new = self._fetch_range(self.end, end + self.blocksize)
                self.end = end + self.blocksize
                self.cache = self.cache + new

    def _fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when size is None or position is still at zero,
        and read() is called without a byte-count.
        """
        r = self.session.get(self.url, **self.kwargs)
        r.raise_for_status()
        out = r.content
        nbytes = len(out)
        if nbytes < self.blocksize:
            # actually all data fits in one block, so cache
            self.start = 0
            self.end = nbytes
            self.cache = out
            self.size = nbytes
        # set position to end of data; actually expect file might close shortly
        self.loc = nbytes
        return out

    def _fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.

        Parameters
        ----------
        start, end: int
            Half-open byte range [start, end) to download.

        Raises
        ------
        ValueError
            If the server sends more bytes than were requested.
        """
        kwargs = self.kwargs.copy()
        # kwargs.copy() is shallow: copy the headers too, so the Range header
        # is not written into the dict shared with every other request.
        headers = dict(kwargs.pop('headers', None) or {})
        # HTTP byte ranges are inclusive at both ends, hence ``end - 1``.
        headers['Range'] = 'bytes=%i-%i' % (start, end - 1)
        kwargs.update(headers=headers, stream=True)
        r = self.session.get(self.url, **kwargs)
        r.raise_for_status()
        if r.status_code == 206:
            # partial content, as expected
            return r.content
        if 'Content-Length' in r.headers:
            cl = int(r.headers['Content-Length'])
            if cl <= end - start:
                # data size OK
                return r.content
            else:
                raise ValueError('Got more bytes (%i) than requested (%i)' % (
                    cl, end - start))
        cl = 0
        out = []
        for chunk in r.iter_content(chunk_size=2 ** 20):
            # data size unknown, let's see if it goes too big
            if chunk:
                out.append(chunk)
                cl += len(chunk)
                if cl > end - start:
                    raise ValueError(
                        'Got more bytes so far (>%i) than requested (%i)' % (
                            cl, end - start))
            else:
                break
        return b''.join(out)

    def __enter__(self):
        # Entering a ``with`` block rewinds to the start of the file.
        self.loc = 0
        return self

    def __exit__(self, *args):
        self.close()

    def __iter__(self):
        # no text lines here, use TextIOWrapper
        raise NotImplementedError

    def write(self, *args, **kwargs):
        """Writing is not supported; always raises NotImplementedError"""
        raise NotImplementedError

    def flush(self):
        """No-op: there is nothing to flush"""
        pass

    def close(self):
        """Mark the file as closed"""
        self.closed = True

    def seekable(self):
        return True

    def writable(self):
        return False

    def readable(self):
        return True
|
|
|
|
|
|
def file_size(url, session, **kwargs):
    """Call HEAD on the server to get file size

    Default operation is to explicitly allow redirects and use encoding
    'identity' (no compression) to get the true size of the target.

    Parameters
    ----------
    url: str
        Address of the resource to query
    session: requests.Session
        Session on which the HEAD request is issued
    kwargs: key-value
        Passed on to ``session.head``. ``allow_redirects`` defaults to True;
        ``headers`` gains ``Accept-Encoding: identity`` unless the caller
        already set an ``Accept-Encoding`` entry.

    Returns
    -------
    int: the value of the Content-Length header of the response

    Raises
    ------
    ValueError
        If the response carries no Content-Length header.
    """
    kwargs = kwargs.copy()
    ar = kwargs.pop('allow_redirects', True)
    # Work on a private copy: the header must actually be sent with the
    # request, and the caller's dict must not be modified.
    head = dict(kwargs.pop('headers', None) or {})
    if 'Accept-Encoding' not in head:
        head['Accept-Encoding'] = 'identity'
    r = session.head(url, allow_redirects=ar, headers=head, **kwargs)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        return int(r.headers['Content-Length'])
    else:
        raise ValueError("Server did not supply size of %s" % url)
|