You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ORPA-pyOpenRPA/WPy32-3720/python-3.7.2/Lib/site-packages/dask/diagnostics/profile.py

370 lines
11 KiB

6 years ago
from __future__ import absolute_import, division, print_function
from collections import namedtuple
from itertools import starmap
from timeit import default_timer
from time import sleep
from multiprocessing import Process, Pipe, current_process
from ..callbacks import Callback
from ..utils import import_required
# Stores execution data for each task
TaskData = namedtuple('TaskData', ('key', 'task', 'start_time',
'end_time', 'worker_id'))
class Profiler(Callback):
"""A profiler for dask execution at the task level.
Records the following information for each task:
1. Key
2. Task
3. Start time in seconds since the epoch
4. Finish time in seconds since the epoch
5. Worker id
Examples
--------
>>> from operator import add, mul
>>> from dask.threaded import get
>>> dsk = {'x': 1, 'y': (add, 'x', 10), 'z': (mul, 'y', 2)}
>>> with Profiler() as prof:
... get(dsk, 'z')
22
>>> prof.results # doctest: +SKIP
[('y', (add, 'x', 10), 1435352238.48039, 1435352238.480655, 140285575100160),
('z', (mul, 'y', 2), 1435352238.480657, 1435352238.480803, 140285566707456)]
These results can be visualized in a bokeh plot using the ``visualize``
method. Note that this requires bokeh to be installed.
>>> prof.visualize() # doctest: +SKIP
You can activate the profiler globally
>>> prof.register() # doctest: +SKIP
If you use the profiler globally you will need to clear out old results
manually.
>>> prof.clear()
"""
def __init__(self):
self._results = {}
self.results = []
self._dsk = {}
def __enter__(self):
self.clear()
return super(Profiler, self).__enter__()
def _start(self, dsk):
self._dsk.update(dsk)
def _pretask(self, key, dsk, state):
start = default_timer()
self._results[key] = (key, dsk[key], start)
def _posttask(self, key, value, dsk, state, id):
end = default_timer()
self._results[key] += (end, id)
def _finish(self, dsk, state, failed):
results = dict((k, v) for k, v in self._results.items() if len(v) == 5)
self.results += list(starmap(TaskData, results.values()))
self._results.clear()
def _plot(self, **kwargs):
from .profile_visualize import plot_tasks
return plot_tasks(self.results, self._dsk, **kwargs)
def visualize(self, **kwargs):
"""Visualize the profiling run in a bokeh plot.
See also
--------
dask.diagnostics.profile_visualize.visualize
"""
from .profile_visualize import visualize
return visualize(self, **kwargs)
def clear(self):
"""Clear out old results from profiler"""
self._results.clear()
del self.results[:]
self._dsk = {}
ResourceData = namedtuple('ResourceData', ('time', 'mem', 'cpu'))
class ResourceProfiler(Callback):
"""A profiler for resource use.
Records the following each timestep
1. Time in seconds since the epoch
2. Memory usage in MB
3. % CPU usage
Examples
--------
>>> from operator import add, mul
>>> from dask.threaded import get
>>> dsk = {'x': 1, 'y': (add, 'x', 10), 'z': (mul, 'y', 2)}
>>> with ResourceProfiler() as prof: # doctest: +SKIP
... get(dsk, 'z')
22
These results can be visualized in a bokeh plot using the ``visualize``
method. Note that this requires bokeh to be installed.
>>> prof.visualize() # doctest: +SKIP
You can activate the profiler globally
>>> prof.register() # doctest: +SKIP
If you use the profiler globally you will need to clear out old results
manually.
>>> prof.clear() # doctest: +SKIP
Note that when used as a context manager data will be collected throughout
the duration of the enclosed block. In contrast, when registered globally
data will only be collected while a dask scheduler is active.
"""
def __init__(self, dt=1):
self._dt = dt
self._entered = False
self._tracker = None
self.results = []
def _is_running(self):
return self._tracker is not None and self._tracker.is_alive()
def _start_collect(self):
if not self._is_running():
self._tracker = _Tracker(self._dt)
self._tracker.start()
self._tracker.parent_conn.send('collect')
def _stop_collect(self):
if self._is_running():
self._tracker.parent_conn.send('send_data')
self.results.extend(starmap(ResourceData, self._tracker.parent_conn.recv()))
def __enter__(self):
self._entered = True
self.clear()
self._start_collect()
return super(ResourceProfiler, self).__enter__()
def __exit__(self, *args):
self._entered = False
self._stop_collect()
self.close()
super(ResourceProfiler, self).__exit__(*args)
def _start(self, dsk):
self._start_collect()
def _finish(self, dsk, state, failed):
if not self._entered:
self._stop_collect()
def close(self):
"""Shutdown the resource tracker process"""
if self._is_running():
self._tracker.shutdown()
self._tracker = None
__del__ = close
def clear(self):
self.results = []
def _plot(self, **kwargs):
from .profile_visualize import plot_resources
return plot_resources(self.results, **kwargs)
def visualize(self, **kwargs):
"""Visualize the profiling run in a bokeh plot.
See also
--------
dask.diagnostics.profile_visualize.visualize
"""
from .profile_visualize import visualize
return visualize(self, **kwargs)
class _Tracker(Process):
"""Background process for tracking resource usage"""
def __init__(self, dt=1):
Process.__init__(self)
self.daemon = True
self.dt = dt
self.parent_pid = current_process().pid
self.parent_conn, self.child_conn = Pipe()
def shutdown(self):
if not self.parent_conn.closed:
self.parent_conn.send('shutdown')
self.parent_conn.close()
self.join()
def _update_pids(self, pid):
return [self.parent] + [p for p in self.parent.children()
if p.pid != pid and p.status() != 'zombie']
def run(self):
psutil = import_required("psutil", "Tracking resource usage requires "
"`psutil` to be installed")
self.parent = psutil.Process(self.parent_pid)
pid = current_process()
data = []
while True:
try:
msg = self.child_conn.recv()
except KeyboardInterrupt:
continue
if msg == 'shutdown':
break
elif msg == 'collect':
ps = self._update_pids(pid)
while not data or not self.child_conn.poll():
tic = default_timer()
mem = cpu = 0
for p in ps:
try:
mem2 = p.memory_info().rss
cpu2 = p.cpu_percent()
except Exception: # could be a few different exceptions
pass
else:
# Only increment if both were successful
mem += mem2
cpu += cpu2
data.append((tic, mem / 1e6, cpu))
sleep(self.dt)
elif msg == 'send_data':
self.child_conn.send(data)
data = []
self.child_conn.close()
CacheData = namedtuple('CacheData', ('key', 'task', 'metric', 'cache_time',
'free_time'))
class CacheProfiler(Callback):
"""A profiler for dask execution at the scheduler cache level.
Records the following information for each task:
1. Key
2. Task
3. Size metric
4. Cache entry time in seconds since the epoch
5. Cache exit time in seconds since the epoch
Examples
--------
>>> from operator import add, mul
>>> from dask.threaded import get
>>> dsk = {'x': 1, 'y': (add, 'x', 10), 'z': (mul, 'y', 2)}
>>> with CacheProfiler() as prof:
... get(dsk, 'z')
22
>>> prof.results # doctest: +SKIP
[CacheData('y', (add, 'x', 10), 1, 1435352238.48039, 1435352238.480655),
CacheData('z', (mul, 'y', 2), 1, 1435352238.480657, 1435352238.480803)]
The default is to count each task (``metric`` is 1 for all tasks). Other
functions may used as a metric instead through the ``metric`` keyword. For
example, the ``nbytes`` function found in ``cachey`` can be used to measure
the number of bytes in the cache.
>>> from cachey import nbytes # doctest: +SKIP
>>> with CacheProfiler(metric=nbytes) as prof: # doctest: +SKIP
... get(dsk, 'z')
The profiling results can be visualized in a bokeh plot using the
``visualize`` method. Note that this requires bokeh to be installed.
>>> prof.visualize() # doctest: +SKIP
You can activate the profiler globally
>>> prof.register() # doctest: +SKIP
If you use the profiler globally you will need to clear out old results
manually.
>>> prof.clear()
"""
def __init__(self, metric=None, metric_name=None):
self.clear()
self._metric = metric if metric else lambda value: 1
if metric_name:
self._metric_name = metric_name
elif metric:
self._metric_name = metric.__name__
else:
self._metric_name = 'count'
def __enter__(self):
self.clear()
return super(CacheProfiler, self).__enter__()
def _start(self, dsk):
self._dsk.update(dsk)
if not self._start_time:
self._start_time = default_timer()
def _posttask(self, key, value, dsk, state, id):
t = default_timer()
self._cache[key] = (self._metric(value), t)
for k in state['released'].intersection(self._cache):
metric, start = self._cache.pop(k)
self.results.append(CacheData(k, dsk[k], metric, start, t))
def _finish(self, dsk, state, failed):
t = default_timer()
for k, (metric, start) in self._cache.items():
self.results.append(CacheData(k, dsk[k], metric, start, t))
self._cache.clear()
def _plot(self, **kwargs):
from .profile_visualize import plot_cache
return plot_cache(self.results, self._dsk, self._start_time,
self._metric_name, **kwargs)
def visualize(self, **kwargs):
"""Visualize the profiling run in a bokeh plot.
See also
--------
dask.diagnostics.profile_visualize.visualize
"""
from .profile_visualize import visualize
return visualize(self, **kwargs)
def clear(self):
"""Clear out old results from profiler"""
self.results = []
self._cache = {}
self._dsk = {}
self._start_time = None