X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=src%2Fpyutils%2Fparallelize%2Fparallelize.py;h=41d9093735d0c2e5b8d69195e19fa2c31c154b49;hb=993b0992473c12294ed659e52b532e1c8cf9cd1e;hp=6d31174424c2b415866a0b539a6cb4d64579a7fc;hpb=b38920f24d1ac948958480c540bc4b8436186765;p=pyutils.git
diff --git a/src/pyutils/parallelize/parallelize.py b/src/pyutils/parallelize/parallelize.py
index 6d31174..41d9093 100644
--- a/src/pyutils/parallelize/parallelize.py
+++ b/src/pyutils/parallelize/parallelize.py
@@ -2,21 +2,80 @@
 # © Copyright 2021-2022, Scott Gasch
 
-"""A decorator to help with dead simple parallelization. See usage
-below.
-
-This will just work with `Method.THREAD` and `Method.PROCESS` but to
-use `Method.REMOTE` you need to do some setup work. You need to
-configure a pool of workers. Each worker should run the same version
-of Python, ideally in identically configured virtual environments.
-And you need to be able to ssh into each machine using key-based
-authentication (i.e. non-iteractively) and run python. List machines
-in the location set by `--remote_worker_records_file` (see
-:file:executors.h for flag and an example JSON file under examples).
+"""A decorator to help with dead simple parallelization. When decorated
+functions are invoked, they execute on a background thread, process, or
+remote machine depending on the style of decoration::
+
+    @parallelize  # defaults to thread-mode
+    def my_function(a, b, c) -> int:
+        ...do some slow / expensive work, e.g., an http request
+
+    @parallelize(method=Method.PROCESS)
+    def my_other_function(d, e, f) -> str:
+        ...do more really expensive work, e.g., a network read
+
+    @parallelize(method=Method.REMOTE)
+    def my_other_other_function(g, h) -> int:
+        ...this work will be distributed to a remote machine pool
+
+This will just work out of the box with `Method.THREAD` (the default)
+and `Method.PROCESS`, but in order to use `Method.REMOTE` you need to
+do some setup work:
+
+    1. You need to hook into :mod:`pyutils.config` so that this
+       code can see commandline arguments.
+
+    2. You need to create and configure a pool of worker machines.
+       All of these machines should run the same version of Python,
+       ideally in a virtual environment (venv) with the same
+       Python dependencies installed. Different versions of code
+       or of the interpreter itself can cause issues with running
+       cloudpickled code.
+
+    3. You need an account that can ssh into any / all of these
+       machines non-interactively and run Python in the aforementioned
+       virtual environment. This likely means setting up ssh with
+       key-based authentication.
+
+    4. You need to tell this parallelization framework about the pool
+       of machines where it can dispatch work by creating a JSON-based
+       configuration file. The location of this file defaults to
+       :file:`.remote_worker_records` in your home directory but can
+       be overridden via the `--remote_worker_records_file`
+       commandline argument. An example JSON configuration `can be
+       found under examples
+       `_. A rough sketch also appears below.
+
+    5. Finally, you will also need to tell the
+       :class:`executors.RemoteExecutor` how to invoke
+       :file:`remote_worker.py` on remote machines by passing the
+       path where it is installed on those machines via the
+       `--remote_worker_helper_path` commandline flag.
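+
+As a purely illustrative sketch (the top-level key and the field names
+below are a guess, not an authoritative schema; the example
+configuration shipped under examples is the real reference), the JSON
+file described in step 4 might look roughly like this::
+
+    {
+        "remote_worker_records": [
+            {
+                "username": "worker_account",
+                "machine": "machine1.example.com",
+                "weight": 2,
+                "count": 4
+            },
+            {
+                "username": "worker_account",
+                "machine": "machine2.example.com",
+                "weight": 1,
+                "count": 2
+            }
+        ]
+    }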
""" - import atexit import functools import typing @@ -35,7 +72,7 @@ def parallelize( _funct: typing.Optional[typing.Callable] = None, *, method: Method = Method.THREAD ) -> typing.Callable: """This is a decorator that was created to make multi-threading, - multi-processing and remote machine parallelism simple in python. + multi-processing and remote machine parallelism simple in Python. Sample usage:: @@ -59,10 +96,15 @@ def parallelize( The wrapped function returns immediately with a value that is wrapped in a :class:`SmartFuture`. This value will block if it is - either read directly (via a call to :meth:`_resolve`) or indirectly - (by using the result in an expression, printing it, hashing it, - passing it a function argument, etc...). See comments on - :class:`SmartFuture` for details. + either read directly (via a call to :meth:`_resolve`) or + indirectly (by using the result in an expression, printing it, + hashing it, passing it a function argument, etc...). See comments + on :class:`SmartFuture` for details. The value can be safely + stored (without hashing) or passed as an argument without causing + it to block waiting on a result. There are some convenience + methods for dealing with collections of :class:`SmartFuture` + objects defined in :file:`smart_future.py`, namely + :meth:`smart_future.wait_any` and :meth:`smart_future.wait_all`. .. warning:: You may stack @parallelized methods and it will "work". @@ -73,11 +115,12 @@ def parallelize( beyond the control mechanisms built into one instance of the pool. Be careful. - .. note:: + .. warning:: There is non-trivial overhead of pickling code and copying it over the network when you use :code:`Method.REMOTE`. There's a smaller but still considerable cost of creating a new process and passing code to/from it when you use :code:`Method.PROCESS`. + """ def wrapper(funct: typing.Callable):