# ORPA-pyOpenRPA/WPy32-3720/python-3.7.2/Lib/site-packages/dask/bytes/http.py

from __future__ import print_function, division, absolute_import

import requests
import uuid

from . import core

# Default read-ahead block size in bytes: 5 * 2**20 = 5 MiB. It is used by
# HTTPFileSystem and HTTPFile whenever no explicit ``block_size`` is supplied.
DEFAULT_BLOCK_SIZE = 5 * 2 ** 20


class HTTPFileSystem(object):
    """
    Simple File-System for fetching data via HTTP(S)

    Unlike other file-systems, HTTP is limited in that it does not provide glob
    or write capability.
    """
    sep = '/'

    def __init__(self, **storage_options):
        """
        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        storage_options: key-value
            May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
            other parameters passed on to requests
        """
        self.block_size = storage_options.pop('block_size', DEFAULT_BLOCK_SIZE)
        self.kwargs = storage_options
        # one session shared by every file opened through this instance
        self.session = requests.Session()

    def glob(self, url):
        """For a template path, return matching files (not supported)"""
        raise NotImplementedError

    def mkdirs(self, url):
        """Make any intermediate directories to make path writable
        (not supported)"""
        raise NotImplementedError

    def open(self, url, mode='rb', block_size=None, **kwargs):
        """Make a file-like object

        Parameters
        ----------
        url: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Read-ahead size in bytes for this file. If None, the value given
            to the file-system is used; if 0, the raw streaming response
            object from requests is returned instead of an HTTPFile.
        kwargs: key-value
            Any other parameters, passed to requests calls. They take
            precedence over the options given when the file-system was
            created.

        Raises
        ------
        NotImplementedError
            If ``mode`` is anything other than "rb".
        """
        if mode != 'rb':
            raise NotImplementedError
        if block_size is None:
            block_size = self.block_size
        # Work on a copy so per-call options never leak into the options
        # stored on the file-system.
        options = self.kwargs.copy()
        options.update(kwargs)
        if block_size:
            return HTTPFile(url, self.session, block_size, **options)
        options['stream'] = True
        response = self.session.get(url, **options)
        response.raise_for_status()
        # have the raw stream undo any content-encoding applied by the server
        response.raw.decode_content = True
        return response.raw

    def ukey(self, url):
        """Unique identifier, implied file might have changed every time"""
        return uuid.uuid1().hex

    def size(self, url):
        """Size in bytes of the file at path"""
        return file_size(url, session=self.session, **self.kwargs)


# Register HTTPFileSystem under both URL schemes in ``core._filesystems``
# (presumably the table used to look up a file-system class by protocol name;
# see ``core`` for how it is consumed).
core._filesystems['http'] = HTTPFileSystem
core._filesystems['https'] = HTTPFileSystem


class HTTPFile(object):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(self, url, session=None, block_size=None, **kwargs):
        self.url = url
        self.kwargs = kwargs
        self.loc = 0
        self.session = session if session is not None else requests.Session()
        self.blocksize = (block_size if block_size is not None
                          else DEFAULT_BLOCK_SIZE)
        # Follow redirects by default when probing the size, but let an
        # explicit ``allow_redirects`` in kwargs win instead of colliding
        # with it as a duplicate keyword argument.
        size_kwargs = dict(self.kwargs)
        size_kwargs.setdefault('allow_redirects', True)
        try:
            self.size = file_size(url, self.session, **size_kwargs)
        except ValueError:
            # No size information - only allow read() and no seek()
            self.size = None
        # cache holds bytes [start, end) of the remote file once populated
        self.cache = None
        self.closed = False
        self.start = None
        self.end = None

    def seek(self, where, whence=0):
        """Set file position

        Parameters
        ----------
        where: int
            Location to set
        whence: int (default 0)
            If zero, set from start of file (value should be positive); if 1,
            set relative to current position; if 2, set relative to end of file
            (value should be negative)

        Returns the position.

        Raises ValueError if the file size is unknown, if ``whence`` is not
        0, 1 or 2, or if the resulting position is before the start of file.
        """
        if self.size is None:
            raise ValueError('Cannot seek since size of file is not known')
        if whence == 0:
            nloc = where
        elif whence == 1:
            nloc = self.loc + where
        elif whence == 2:
            nloc = self.size + where
        else:
            raise ValueError('Whence must be in [0, 1, 2], but got %s' % whence)
        if nloc < 0:
            raise ValueError('Seek before start of file')
        self.loc = nloc
        return nloc

    def tell(self):
        """Get current file byte position"""
        return self.loc

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if length == 0:
            # asked for no data, so supply no data and shortcut doing work
            return b''
        if self.size is None:
            if length >= 0:
                # asked for specific amount of data, but we don't know how
                # much is available
                raise ValueError('File size is unknown, must read all data')
            else:
                # asked for whole file
                return self._fetch_all()
        if length < 0 and self.loc == 0:
            # size was provided, but asked for whole file, so shortcut
            return self._fetch_all()
        if length < 0 or self.loc + length > self.size:
            end = self.size
        else:
            end = self.loc + length
        if self.loc >= self.size:
            # EOF (python files don't error, just return no data)
            return b''
        self._fetch(self.loc, end)
        data = self.cache[self.loc - self.start:end - self.start]
        self.loc = end
        return data

    def _fetch(self, start, end):
        """Set new bounds for data cache and fetch data, if required"""
        if self.start is None and self.end is None:
            # First read
            self.start = start
            self.end = end + self.blocksize
            self.cache = self._fetch_range(start, self.end)
        elif start < self.start:
            if end > self.end or self.end - end > self.blocksize:
                # Either the request also runs past the cached end (so
                # prepending alone would leave the tail missing), or it ends
                # far before the cached end: start over with a fresh window.
                self.start = start
                self.end = end + self.blocksize
                self.cache = self._fetch_range(self.start, self.end)
            else:
                # extend the cached window backwards
                new = self._fetch_range(start, self.start)
                self.start = start
                self.cache = new + self.cache
        elif end > self.end:
            if self.end > self.size:
                # cache already runs to the end of the file
                return
            if end - self.end > self.blocksize:
                # far beyond the cached window: start over
                self.start = start
                self.end = end + self.blocksize
                self.cache = self._fetch_range(self.start, self.end)
            else:
                # extend the cached window forwards, with read-ahead
                new = self._fetch_range(self.end, end + self.blocksize)
                self.end = end + self.blocksize
                self.cache = self.cache + new

    def _fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when size is None or position is still at zero,
        and read() is called without a byte-count.
        """
        r = self.session.get(self.url, **self.kwargs)
        r.raise_for_status()
        out = r.content
        nbytes = len(out)
        if nbytes < self.blocksize:
            # actually all data fits in one block, so cache
            self.start = 0
            self.end = nbytes
            self.cache = out
            self.size = nbytes
        # set position to end of data; actually expect file might close shortly
        self.loc = nbytes
        return out

    def _fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        kwargs = self.kwargs.copy()
        # kwargs is only a shallow copy: duplicate the headers too, so the
        # Range header never ends up in the caller's dict (and from there in
        # later whole-file requests made with self.kwargs).
        headers = dict(kwargs.pop('headers', None) or {})
        headers['Range'] = 'bytes=%i-%i' % (start, end - 1)
        r = self.session.get(self.url, headers=headers, stream=True, **kwargs)
        r.raise_for_status()
        if r.status_code == 206:
            # partial content, as expected
            return r.content
        if 'Content-Length' in r.headers:
            cl = int(r.headers['Content-Length'])
            if cl <= end - start:
                # data size OK
                return r.content
            else:
                raise ValueError('Got more bytes (%i) than requested (%i)' % (
                    cl, end - start))
        cl = 0
        out = []
        for chunk in r.iter_content(chunk_size=2 ** 20):
            # data size unknown, let's see if it goes too big
            if chunk:
                out.append(chunk)
                cl += len(chunk)
                if cl > end - start:
                    raise ValueError(
                        'Got more bytes so far (>%i) than requested (%i)' % (
                            cl, end - start))
            else:
                break
        return b''.join(out)

    def __enter__(self):
        self.loc = 0
        return self

    def __exit__(self, *args):
        self.close()

    def __iter__(self):
        # no text lines here, use TextIOWrapper
        raise NotImplementedError

    def write(self):
        raise NotImplementedError

    def flush(self):
        pass

    def close(self):
        self.closed = True

    def seekable(self):
        return True

    def writable(self):
        return False

    def readable(self):
        return True


def file_size(url, session, **kwargs):
    """Call HEAD on the server to get file size

    Default operation is to explicitly allow redirects and use encoding
    'identity' (no compression) to get the true size of the target.

    Parameters
    ----------
    url: str
        Full URL of the remote resource
    session: object with a requests-style ``head`` method
        Used to issue the HEAD request
    kwargs: key-value
        Passed on to ``session.head``. ``allow_redirects`` defaults to True;
        an 'Accept-Encoding' entry already present in ``headers`` is kept.

    Returns the value of the Content-Length response header as an int.

    Raises ValueError if the response carries no Content-Length header.
    """
    kwargs = kwargs.copy()
    ar = kwargs.pop('allow_redirects', True)
    # Build a private headers dict: the caller's dict is left untouched, and
    # the result is stored back into kwargs so that the Accept-Encoding header
    # is really sent even when the caller passed no headers at all.
    head = dict(kwargs.get('headers') or {})
    head.setdefault('Accept-Encoding', 'identity')
    kwargs['headers'] = head
    r = session.head(url, allow_redirects=ar, **kwargs)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        return int(r.headers['Content-Length'])
    raise ValueError("Server did not supply size of %s" % url)