You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
179 lines
7.4 KiB
179 lines
7.4 KiB
import numpy as np
|
|
import pandas as pd
|
|
|
|
from ... import delayed
|
|
from ...compatibility import string_types
|
|
from .io import from_delayed, from_pandas
|
|
|
|
|
|
def read_sql_table(table, uri, index_col, divisions=None, npartitions=None,
|
|
limits=None, columns=None, bytes_per_chunk=256 * 2**20,
|
|
head_rows=5, schema=None, meta=None, engine_kwargs=None, **kwargs):
|
|
"""
|
|
Create dataframe from an SQL table.
|
|
|
|
If neither divisions or npartitions is given, the memory footprint of the
|
|
first few rows will be determined, and partitions of size ~256MB will
|
|
be used.
|
|
|
|
Parameters
|
|
----------
|
|
table : string or sqlalchemy expression
|
|
Select columns from here.
|
|
uri : string
|
|
Full sqlalchemy URI for the database connection
|
|
index_col : string
|
|
Column which becomes the index, and defines the partitioning. Should
|
|
be a indexed column in the SQL server, and any orderable type. If the
|
|
type is number or time, then partition boundaries can be inferred from
|
|
npartitions or bytes_per_chunk; otherwide must supply explicit
|
|
``divisions=``.
|
|
``index_col`` could be a function to return a value, e.g.,
|
|
``sql.func.abs(sql.column('value')).label('abs(value)')``.
|
|
Labeling columns created by functions or arithmetic operations is
|
|
required.
|
|
divisions: sequence
|
|
Values of the index column to split the table by. If given, this will
|
|
override npartitions and bytes_per_chunk. The divisions are the value
|
|
boundaries of the index column used to define the partitions. For
|
|
example, ``divisions=list('acegikmoqsuwz')`` could be used to partition
|
|
a string column lexographically into 12 partitions, with the implicit
|
|
assumption that each partition contains similar numbers of records.
|
|
npartitions : int
|
|
Number of partitions, if divisions is not given. Will split the values
|
|
of the index column linearly between limits, if given, or the column
|
|
max/min. The index column must be numeric or time for this to work
|
|
limits: 2-tuple or None
|
|
Manually give upper and lower range of values for use with npartitions;
|
|
if None, first fetches max/min from the DB. Upper limit, if
|
|
given, is inclusive.
|
|
columns : list of strings or None
|
|
Which columns to select; if None, gets all; can include sqlalchemy
|
|
functions, e.g.,
|
|
``sql.func.abs(sql.column('value')).label('abs(value)')``.
|
|
Labeling columns created by functions or arithmetic operations is
|
|
recommended.
|
|
bytes_per_chunk : int
|
|
If both divisions and npartitions is None, this is the target size of
|
|
each partition, in bytes
|
|
head_rows : int
|
|
How many rows to load for inferring the data-types, unless passing meta
|
|
meta : empty DataFrame or None
|
|
If provided, do not attempt to infer dtypes, but use these, coercing
|
|
all chunks on load
|
|
schema : str or None
|
|
If using a table name, pass this to sqlalchemy to select which DB
|
|
schema to use within the URI connection
|
|
engine_kwargs : dict or None
|
|
Specific db engine parameters for sqlalchemy
|
|
kwargs : dict
|
|
Additional parameters to pass to `pd.read_sql()`
|
|
|
|
Returns
|
|
-------
|
|
dask.dataframe
|
|
|
|
Examples
|
|
--------
|
|
>>> df = dd.read_sql_table('accounts', 'sqlite:///path/to/bank.db',
|
|
... npartitions=10, index_col='id') # doctest: +SKIP
|
|
"""
|
|
import sqlalchemy as sa
|
|
from sqlalchemy import sql
|
|
from sqlalchemy.sql import elements
|
|
if index_col is None:
|
|
raise ValueError("Must specify index column to partition on")
|
|
engine_kwargs = {} if engine_kwargs is None else engine_kwargs
|
|
engine = sa.create_engine(uri, **engine_kwargs)
|
|
m = sa.MetaData()
|
|
if isinstance(table, string_types):
|
|
table = sa.Table(table, m, autoload=True, autoload_with=engine,
|
|
schema=schema)
|
|
|
|
index = (table.columns[index_col] if isinstance(index_col, string_types)
|
|
else index_col)
|
|
if not isinstance(index_col, string_types + (elements.Label,)):
|
|
raise ValueError('Use label when passing an SQLAlchemy instance'
|
|
' as the index (%s)' % index)
|
|
if divisions and npartitions:
|
|
raise TypeError('Must supply either divisions or npartitions, not both')
|
|
|
|
columns = ([(table.columns[c] if isinstance(c, string_types) else c)
|
|
for c in columns]
|
|
if columns else list(table.columns))
|
|
if index_col not in columns:
|
|
columns.append(table.columns[index_col]
|
|
if isinstance(index_col, string_types)
|
|
else index_col)
|
|
|
|
if isinstance(index_col, string_types):
|
|
kwargs['index_col'] = index_col
|
|
else:
|
|
# function names get pandas auto-named
|
|
kwargs['index_col'] = index_col.name
|
|
|
|
if meta is None:
|
|
# derrive metadata from first few rows
|
|
q = sql.select(columns).limit(head_rows).select_from(table)
|
|
head = pd.read_sql(q, engine, **kwargs)
|
|
|
|
if head.empty:
|
|
# no results at all
|
|
name = table.name
|
|
schema = table.schema
|
|
head = pd.read_sql_table(name, uri, schema=schema, index_col=index_col)
|
|
return from_pandas(head, npartitions=1)
|
|
|
|
bytes_per_row = (head.memory_usage(deep=True, index=True)).sum() / 5
|
|
meta = head[:0]
|
|
else:
|
|
if divisions is None and npartitions is None:
|
|
raise ValueError('Must provide divisions or npartitions when'
|
|
'using explicit meta.')
|
|
|
|
if divisions is None:
|
|
if limits is None:
|
|
# calculate max and min for given index
|
|
q = sql.select([sql.func.max(index), sql.func.min(index)]
|
|
).select_from(table)
|
|
minmax = pd.read_sql(q, engine)
|
|
maxi, mini = minmax.iloc[0]
|
|
dtype = minmax.dtypes['max_1']
|
|
else:
|
|
mini, maxi = limits
|
|
dtype = pd.Series(limits).dtype
|
|
|
|
if npartitions is None:
|
|
q = sql.select([sql.func.count(index)]).select_from(table)
|
|
count = pd.read_sql(q, engine)['count_1'][0]
|
|
npartitions = round(count * bytes_per_row / bytes_per_chunk) or 1
|
|
if dtype.kind == "M":
|
|
divisions = pd.date_range(
|
|
start=mini, end=maxi, freq='%iS' % (
|
|
(maxi - mini).total_seconds() / npartitions)).tolist()
|
|
divisions[0] = mini
|
|
divisions[-1] = maxi
|
|
elif dtype.kind in ['i', 'u', 'f']:
|
|
divisions = np.linspace(mini, maxi, npartitions + 1).tolist()
|
|
else:
|
|
raise TypeError('Provided index column is of type "{}". If divisions is not provided the '
|
|
'index column type must be numeric or datetime.'.format(dtype))
|
|
|
|
parts = []
|
|
lowers, uppers = divisions[:-1], divisions[1:]
|
|
for i, (lower, upper) in enumerate(zip(lowers, uppers)):
|
|
cond = index <= upper if i == len(lowers) - 1 else index < upper
|
|
q = sql.select(columns).where(sql.and_(index >= lower, cond)
|
|
).select_from(table)
|
|
parts.append(delayed(_read_sql_chunk)(q, uri, meta, **kwargs))
|
|
|
|
return from_delayed(parts, meta, divisions=divisions)
|
|
|
|
|
|
def _read_sql_chunk(q, uri, meta, **kwargs):
|
|
df = pd.read_sql(q, uri, **kwargs)
|
|
if df.empty:
|
|
return meta
|
|
else:
|
|
return df.astype(meta.dtypes.to_dict(), copy=False)
|