Commit 7f0b01e

Merge pull request #157 from tharittk/benchmark_instrument
Adding Benchmark instrument
2 parents 62927cd + 81e62b9 commit 7f0b01e

File tree: 9 files changed, +360 -4 lines changed


Makefile

Lines changed: 2 additions & 1 deletion

@@ -53,10 +53,11 @@ tests:
 tests_nccl:
 	mpiexec -n $(NUM_PROCESSES) pytest tests_nccl/ --with-mpi
 
+# sphinx-build does not work well with NCCL
 doc:
 	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
 	rm -rf source/tutorials && rm -rf build &&\
-	cd .. && sphinx-build -b html docs/source docs/build
+	cd .. && NCCL_PYLOPS_MPI=0 sphinx-build -b html docs/source docs/build
 
 doc_cupy:
 	cp tutorials_cupy/* tutorials/
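A note on the new NCCL_PYLOPS_MPI=0 switch: the docs build now disables the NCCL code path through this environment variable, which is presumably read once at import time by pylops_mpi.utils.deps. A hypothetical sketch of that kind of import-time gate, mirroring the ENABLE_BENCHMARK pattern introduced in pylops_mpi/utils/benchmark.py below (the gate and fallback here are illustrative, not the library's actual code):

import os

# Illustrative gate: read the switch once, defaulting to "enabled" (1),
# so NCCL_PYLOPS_MPI=0 skips the GPU-only import during the sphinx-build
NCCL_ENABLED = int(os.getenv("NCCL_PYLOPS_MPI", 1)) == 1

nccl = None
if NCCL_ENABLED:
    import cupy.cuda.nccl as nccl  # noqa: F401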

docs/source/benchmarking.rst

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
.. _benchmarkutility:

Benchmark Utility in PyLops-MPI
===============================
PyLops-MPI users can conveniently benchmark the performance of their code with a simple decorator.
:py:func:`pylops_mpi.utils.benchmark` and :py:func:`pylops_mpi.utils.mark` support the various
function calling patterns that may arise when benchmarking distributed code.

- :py:func:`pylops_mpi.utils.benchmark` is a **decorator** used to time the execution of entire functions.
- :py:func:`pylops_mpi.utils.mark` is a **function** used inside decorated functions to insert fine-grained time measurements.

.. note::
    This benchmark utility is enabled by default, i.e., if the user decorates a function with ``@benchmark``, the function will go through
    the time measurements, adding overhead. Users can turn off the benchmark while leaving the decorator in place with

    .. code-block:: bash

        >> export BENCH_PYLOPS_MPI=0

The usage can be as simple as:

.. code-block:: python

    @benchmark
    def function_to_time():
        ...  # your computation

The result is printed to standard output.
For fine-grained time measurements, :py:func:`pylops_mpi.utils.mark` can be inserted in the code region of benchmarked functions:

.. code-block:: python

    @benchmark
    def function_to_time():
        # computation that you may want to exclude from the benchmark
        mark("Begin Region")
        # computation to be timed
        mark("Finish Region")

You can also nest benchmarked functions to track execution times across layers of function calls, with the output correctly formatted.
Additionally, the result can be exported to a text file. For complete and runnable examples, visit :ref:`sphx_glr_tutorials_benchmarking.py`.
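Judging from the format strings in pylops_mpi/utils/benchmark.py further down this diff, the printout for the second snippet above would look roughly as follows (the timings are made up for illustration; nested entries are indented with tabs):

[decorator]function_to_time: total runtime: 0.012345 s
	Begin Region-->Finish Region: 0.010987 s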

pylops_mpi/DistributedArray.py

Lines changed: 1 addition & 1 deletion

@@ -452,7 +452,7 @@ def _check_local_shapes(self, local_shapes):
         elif self.partition is Partition.SCATTER:
             local_shape = local_shapes[self.rank]
             # Check if local shape sum up to global shape and other dimensions align with global shape
-            if self._allreduce(local_shape[self.axis]) != self.global_shape[self.axis] or \
+            if self.base_comm.allreduce(local_shape[self.axis]) != self.global_shape[self.axis] or \
                     not np.array_equal(np.delete(local_shape, self.axis), np.delete(self.global_shape, self.axis)):
                 raise ValueError(f"Local shapes don't align with the global shape;"
                                  f"{local_shapes} != {self.global_shape}")

pylops_mpi/utils/_nccl.py

Lines changed: 8 additions & 1 deletion

@@ -1,6 +1,7 @@
 __all__ = [
     "_prepare_nccl_allgather_inputs",
     "_unroll_nccl_allgather_recv",
+    "_nccl_sync",
     "initialize_nccl_comm",
     "nccl_split",
     "nccl_allgather",
@@ -19,7 +20,6 @@
 import cupy as cp
 import cupy.cuda.nccl as nccl
 
-
 cupy_to_nccl_dtype = {
     "float32": nccl.NCCL_FLOAT32,
     "float64": nccl.NCCL_FLOAT64,
@@ -63,6 +63,13 @@ def _nccl_buf_size(buf, count=None):
     return count if count else buf.size
 
 
+def _nccl_sync():
+    """A thin wrapper of CuPy's synchronization for protected import"""
+    if cp.cuda.runtime.getDeviceCount() == 0:
+        return
+    cp.cuda.runtime.deviceSynchronize()
+
+
 def _prepare_nccl_allgather_inputs(send_buf, send_buf_shapes) -> Tuple[cp.ndarray, cp.ndarray]:
     r""" Prepare send_buf and recv_buf for NCCL allgather (nccl_allgather)
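The new _nccl_sync lets the benchmark module flush outstanding GPU work before reading the clock, since CuPy kernel launches return asynchronously. A small, illustrative sketch of why that matters when timing GPU code; this is not part of the PR and assumes a CUDA device is available:

import time
import cupy as cp

x = cp.random.rand(4096, 4096)

# without a device sync the timer mostly captures the asynchronous kernel launch
t0 = time.perf_counter()
y = x @ x
t_nosync = time.perf_counter() - t0

# syncing before and after, as benchmark._sync() does, times the actual computation
cp.cuda.runtime.deviceSynchronize()
t0 = time.perf_counter()
y = x @ x
cp.cuda.runtime.deviceSynchronize()
t_sync = time.perf_counter() - t0

print(f"no sync: {t_nosync:.6f} s, with sync: {t_sync:.6f} s")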

pylops_mpi/utils/benchmark.py

Lines changed: 167 additions & 0 deletions

@@ -0,0 +1,167 @@
import functools
import logging
import os
import time
from typing import Callable, Optional, List
from mpi4py import MPI

from pylops.utils import deps as pylops_deps  # avoid namespace crashes with pylops_mpi.utils
from pylops_mpi.utils import deps

cupy_message = pylops_deps.cupy_import("the benchmark module")
nccl_message = deps.nccl_import("the benchmark module")

if nccl_message is None and cupy_message is None:
    from pylops_mpi.utils._nccl import _nccl_sync
else:
    def _nccl_sync():
        pass

# Benchmark is enabled by default
ENABLE_BENCHMARK = int(os.getenv("BENCH_PYLOPS_MPI", 1)) == 1

# Stack of active mark functions for nested support
_mark_func_stack = []
_markers = []


def _parse_output_tree(markers: List[str]):
    """Parse the list of markers gathered during the benchmark call and output them
    as one properly formatted string. The format of the output string follows the
    hierarchy of function calls, i.e., nested function calls are indented.

    Parameters
    ----------
    markers: :obj:`list`
        A list of markers/labels generated from the benchmark call
    """
    global _markers
    output = []
    stack = []
    i = 0
    while i < len(markers):
        label, time, level = markers[i]
        if label.startswith("[decorator]"):
            indent = "\t" * (level - 1)
            output.append(f"{indent}{label}: total runtime: {time:6f} s\n")
        else:
            if stack:
                prev_label, prev_time, prev_level = stack[-1]
                if prev_level == level:
                    indent = "\t" * level
                    output.append(f"{indent}{prev_label}-->{label}: {time - prev_time:6f} s\n")
                    stack.pop()

            # Push to the stack only if it is going deeper or still at the same level
            if i + 1 <= len(markers) - 1:
                _, _, next_level = markers[i + 1]
                if next_level >= level:
                    stack.append(markers[i])
        i += 1
    # reset markers, allowing other benchmarked functions to start fresh
    _markers = []
    return output


def _sync():
    """Synchronize all MPI processes or CUDA devices"""
    _nccl_sync()
    MPI.COMM_WORLD.Barrier()


def mark(label: str):
    """Measure the time at arbitrary lines of a benchmarked function

    Parameters
    ----------
    label: :obj:`str`
        A label for the mark. This signifies both 1) the end of the
        previous mark and 2) the beginning of the new mark
    """
    if not ENABLE_BENCHMARK:
        return
    if not _mark_func_stack:
        raise RuntimeError("mark() called outside of a benchmarked region")
    _mark_func_stack[-1](label)


def benchmark(func: Optional[Callable] = None,
              description: Optional[str] = "",
              logger: Optional[logging.Logger] = None,
              ):
    """A wrapper for code injection for time measurement.

    This wrapper measures the start-to-end time of the wrapped function when
    decorated without any argument.

    It also allows users to put a call to mark() anywhere inside the wrapped function
    for fine-grained time benchmarking. This wrapper defines local_mark() and pushes it
    to the _mark_func_stack for isolation in case of nested calls.
    The user-facing mark() will always call the function at the top of the _mark_func_stack.

    Parameters
    ----------
    func : :obj:`callable`, optional
        Function to be decorated. Defaults to ``None``.
    description : :obj:`str`, optional
        Description for the output text. Defaults to ``''``.
    logger: :obj:`logging.Logger`, optional
        A `logging.Logger` object for logging the benchmark text output. This logger must be set up before
        being passed to this function, either to write output to a file or to log to stdout. If `logger`
        is not provided, the output is printed to stdout.
    """

    def noop_decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            return func(*args, **kwargs)
        return wrapped

    @functools.wraps(func)
    def decorator(func):
        def wrapper(*args, **kwargs):
            rank = MPI.COMM_WORLD.Get_rank()

            level = len(_mark_func_stack) + 1
            # The header is needed for later tree parsing. Here it is allocating its spot;
            # the tuple at this index will be replaced after the elapsed time is calculated.
            _markers.append((f"[decorator]{description or func.__name__}", None, level))
            header_index = len(_markers) - 1

            def local_mark(label):
                _markers.append((label, time.perf_counter(), level))

            _mark_func_stack.append(local_mark)

            _sync()
            start_time = time.perf_counter()
            # mark() called in the wrapped function will now call local_mark
            result = func(*args, **kwargs)
            _sync()
            end_time = time.perf_counter()

            elapsed = end_time - start_time
            _markers[header_index] = (f"[decorator]{description or func.__name__}", elapsed, level)

            # In case of nesting, the wrapped callee must pop its closure from the stack so that,
            # when the callee returns, the wrapped caller operates on its closure (and its level label),
            # which now becomes the top of the stack.
            _mark_func_stack.pop()

            # all the calls have finished
            if not _mark_func_stack:
                if rank == 0:
                    output = _parse_output_tree(_markers)
                    if logger:
                        logger.info("".join(output))
                    else:
                        print("".join(output))
            return result
        return wrapper

    # The code still has to return decorator so that the in-place decorator with arguments
    # like @benchmark(logger=logger) does not throw an error and can be kept untouched.
    if not ENABLE_BENCHMARK:
        return noop_decorator if func is None else noop_decorator(func)

    return decorator if func is None else decorator(func)
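Since ENABLE_BENCHMARK is evaluated once at module import, the BENCH_PYLOPS_MPI=0 switch has to be in the environment before pylops_mpi.utils.benchmark is first imported. A short sketch of disabling the instrumentation while keeping the decorators in place (setting the variable from Python is just for illustration; exporting it in the shell, as the docs above suggest, works the same way):

import os

# must be set before the first import of pylops_mpi.utils.benchmark
os.environ["BENCH_PYLOPS_MPI"] = "0"

from pylops_mpi.utils.benchmark import benchmark, mark


@benchmark  # now returns a no-op wrapper: no synchronization, no timing, no printout
def solve():
    mark("start")  # returns immediately when benchmarking is disabled
    return sum(range(1000))


solve()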

tutorials/benchmarking.py

Lines changed: 134 additions & 0 deletions

@@ -0,0 +1,134 @@
r"""
Benchmark Utility in PyLops-MPI
===============================
This tutorial demonstrates how to use the :py:func:`pylops_mpi.utils.benchmark` and
:py:func:`pylops_mpi.utils.mark` utility methods in PyLops-MPI. It covers various
function calling patterns that may come up when benchmarking distributed code.

:py:func:`pylops_mpi.utils.benchmark` is a decorator used to decorate any
function to measure its execution time from start to finish.
:py:func:`pylops_mpi.utils.mark` is a function used inside the benchmark-decorated
function to provide fine-grained time measurements.
"""

import sys
import logging
import numpy as np
from mpi4py import MPI
from pylops_mpi import DistributedArray, Partition

np.random.seed(42)
rank = MPI.COMM_WORLD.Get_rank()

par = {'global_shape': (500, 501),
       'partition': Partition.SCATTER, 'dtype': np.float64,
       'axis': 1}

###############################################################################
# Let's start by importing the utility and a simple example
from pylops_mpi.utils.benchmark import benchmark, mark


@benchmark
def inner_func(par):
    dist_arr = DistributedArray(global_shape=par['global_shape'],
                                partition=par['partition'],
                                dtype=par['dtype'], axis=par['axis'])
    # may perform computation here
    dist_arr.dot(dist_arr)


###############################################################################
# When we call :py:func:`inner_func`, we will see the result
# of the benchmark printed to standard output. If we want to customize the
# function name in the printout, we can pass the parameter `description`
# to :py:func:`benchmark`,
# i.e., :py:func:`@benchmark(description="printout_name")`

inner_func(par)

###############################################################################
# We may want to get fine-grained time measurements by timing the execution
# of arbitrary lines of code. :py:func:`pylops_mpi.utils.mark` provides such a utility.


@benchmark
def inner_func_with_mark(par):
    mark("Begin array constructor")
    dist_arr = DistributedArray(global_shape=par['global_shape'],
                                partition=par['partition'],
                                dtype=par['dtype'], axis=par['axis'])
    mark("Begin dot")
    dist_arr.dot(dist_arr)
    mark("Finish dot")


###############################################################################
# Now when we run, we get the detailed time measurements. Note that there is a tag
# [decorator] next to the function name to distinguish between the start-to-end time
# measurement of the top-level function and those that come from :py:func:`pylops_mpi.utils.mark`
inner_func_with_mark(par)

###############################################################################
# These benchmarking routines can also be nested. Let's define
# an outer function that internally calls the decorated :py:func:`inner_func_with_mark`


@benchmark
def outer_func_with_mark(par):
    mark("Outer func start")
    inner_func_with_mark(par)
    dist_arr = DistributedArray(global_shape=par['global_shape'],
                                partition=par['partition'],
                                dtype=par['dtype'], axis=par['axis'])
    dist_arr + dist_arr
    mark("Outer func ends")


###############################################################################
# If we run :py:func:`outer_func_with_mark`, we get the time measurements nicely
# printed out with nested indentation to indicate the nested calls.
outer_func_with_mark(par)


###############################################################################
# In some cases, we may want to write benchmark output to a text file.
# :py:func:`pylops_mpi.utils.benchmark` also accepts a :py:class:`logging.Logger`
# as an argument.
# Here we define a simple :py:func:`make_logger`. We set ``logger.propagate = False``
# to isolate the logging of our benchmark from that of the rest of the code

save_file = True
file_path = "benchmark.log"


def make_logger(save_file=False, file_path=''):
    logger = logging.getLogger(__name__)
    logging.basicConfig(filename=file_path if save_file else None, filemode='w', level=logging.INFO, force=True)
    logger.propagate = False
    if save_file:
        handler = logging.FileHandler(file_path, mode='w')
    else:
        handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(handler)
    return logger


logger = make_logger(save_file, file_path)


###############################################################################
# Then we can pass the logger to :py:func:`pylops_mpi.utils.benchmark`

@benchmark(logger=logger)
def inner_func_with_logger(par):
    dist_arr = DistributedArray(global_shape=par['global_shape'],
                                partition=par['partition'],
                                dtype=par['dtype'], axis=par['axis'])
    # may perform computation here
    dist_arr.dot(dist_arr)


###############################################################################
# Run this function and observe that the file `benchmark.log` is written.
inner_func_with_logger(par)
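As a final, illustrative variation (not part of the tutorial file), the description and logger keyword arguments shown above can be combined; combined_example below is a hypothetical function reusing par, logger, benchmark, and mark from the tutorial:

@benchmark(description="dot-product benchmark", logger=logger)
def combined_example(par):
    mark("Begin array constructor")
    dist_arr = DistributedArray(global_shape=par['global_shape'],
                                partition=par['partition'],
                                dtype=par['dtype'], axis=par['axis'])
    mark("Begin dot")
    dist_arr.dot(dist_arr)
    mark("Finish dot")


combined_example(par)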
