X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=src%2Fpyutils%2Fparallelize%2Fparallelize.py;h=515d431dea4e4fa8c7c4101a0573d4308990c0e5;hb=278d163705facc2276cd464414fb490ef6af50ab;hp=9824e8ac6d242405e2b1f0ea801266569d69ebae;hpb=69566c003b4f1c3a4905f37d3735d7921502d14a;p=pyutils.git diff --git a/src/pyutils/parallelize/parallelize.py b/src/pyutils/parallelize/parallelize.py index 9824e8a..515d431 100644 --- a/src/pyutils/parallelize/parallelize.py +++ b/src/pyutils/parallelize/parallelize.py @@ -2,8 +2,77 @@ # © Copyright 2021-2022, Scott Gasch -"""A decorator to help with dead simple parallelization.""" - +"""A decorator to help with simple parallelization. When decorated +functions are invoked they execute on a background thread, process or +remote machine depending on the style of decoration:: + + from pyutils.parallelize import parallelize as p + + @p.parallelize # defaults to thread-mode + def my_function(a, b, c) -> int: + ...do some slow / expensive work, e.g., an http request + + @p.parallelize(method=Method.PROCESS) + def my_other_function(d, e, f) -> str: + ...do more really expensive work, e.g., a network read + + @p.parallelize(method=Method.REMOTE) + def my_other_other_function(g, h) -> int: + ...this work will be distributed to a remote machine pool + +This will just work out of the box with `Method.THREAD` (the default) +and `Method.PROCESS` but in order to use `Method.REMOTE` you need to +do some setup work: + + 1. To use `@parallelize(method=Method.REMOTE)` with your code you + need to hook your code into :mod:`pyutils.config` to enable + commandline flags from `pyutil` files. You can do this by + either wrapping your main entry point with the + :meth:`pyutils.bootstrap.initialize` decorator or just calling + `config.parse()` early in your program. See instructions in + :mod:`pyutils.bootstrap` and :mod:`pyutils.config` for + more information. + + 2. You need to create and configure a pool of worker machines. + All of these machines should run the same version of Python, + ideally in a virtual environment (venv) with the same + Python dependencies installed. See: https://docs.python.org/3/library/venv.html + + .. warning:: + + Different versions of code, libraries, or of the interpreter + itself can cause issues with running cloudpicked code. + + 3. You need an account that can ssh into any / all of these pool + machines non-interactively to perform tasks such as copying + code to the worker machine and running Python in the + aforementioned virtual environment. This likely means setting + up `ssh` / `scp` with key-based authentication. + See: https://www.digitalocean.com/community/tutorials/how-to-set-up-ssh-keys-2 + + 4. You need to tell this parallelization framework about the pool + of machines you created by editing a JSON-based configuration + file. The location of this file defaults to + :file:`.remote_worker_records` in your home directory but can + be overridden via the `--remote_worker_records_file` + commandline argument. An example JSON configuration `can be + found under examples + `_. + + 5. Finally, you will also need tell the + :class:`pyutils.parallelize.executors.RemoteExecutor` how to + invoke the :file:`remote_worker.py` on remote machines by + passing its path on remote worker machines in your setup via + the `--remote_worker_helper_path` commandline flag (or, + honestly, if you made it this far, just update the default in + this code -- find `executors.py` under `site-packages` in your + virtual environment and update the default value of the + `--remote_worker_helper_path` flag) + + If you're trying to set this up and struggling, email me at + scott.gasch@gmail.com. I'm happy to help. + +""" import atexit import functools @@ -23,49 +92,60 @@ def parallelize( _funct: typing.Optional[typing.Callable] = None, *, method: Method = Method.THREAD ) -> typing.Callable: """This is a decorator that was created to make multi-threading, - multi-processing and remote machine parallelism simple in python. + multi-processing and remote machine parallelism simple in Python. Sample usage:: - @parallelize # defaults to thread-mode + from pyutils.parallelize import parallelize as p + + @p.parallelize # defaults to thread-mode def my_function(a, b, c) -> int: ...do some slow / expensive work, e.g., an http request - @parallelize(method=Method.PROCESS) + @p.parallelize(method=Method.PROCESS) def my_other_function(d, e, f) -> str: ...do more really expensive work, e.g., a network read - @parallelize(method=Method.REMOTE) + @p.parallelize(method=Method.REMOTE) def my_other_other_function(g, h) -> int: ...this work will be distributed to a remote machine pool - This decorator will invoke the wrapped function on:: + This decorator will invoke the wrapped function on: - Method.THREAD (default): a background thread - Method.PROCESS: a background process - Method.REMOTE: a process on a remote host + - `Method.THREAD` (default): a background thread + - `Method.PROCESS`: a background process + - `Method.REMOTE`: a process on a remote host The wrapped function returns immediately with a value that is - wrapped in a :class:`SmartFuture`. This value will block if it is - either read directly (via a call to :meth:`_resolve`) or indirectly - (by using the result in an expression, printing it, hashing it, - passing it a function argument, etc...). See comments on - :class:`SmartFuture` for details. + wrapped in a + :class:`pyutils.parallelize.smart_future.SmartFuture`. This value + will block if it is either read directly (via a call to + :meth:`_resolve`) or indirectly (by using the result in an + expression, printing it, hashing it, passing it a function + argument, etc...). See comments on :class:`SmartFuture` for + details. The value can be safely stored (without hashing) or + passed as an argument without causing it to block waiting on a + result. There are some convenience methods for dealing with + collections of :class:`SmartFuture` objects defined in + :file:`smart_future.py`, namely + :meth:`pyutils.parallelize.smart_future.wait_any` and + :meth:`pyutils.parallelize.smart_future.wait_all`. .. warning:: You may stack @parallelized methods and it will "work". That said, having multiple layers of :code:`Method.PROCESS` or - :code:`Method.REMOTE` will prove to be problematic because each process in - the stack will use its own independent pool which may overload - your machine with processes or your network with remote processes - beyond the control mechanisms built into one instance of the pool. - Be careful. + :code:`Method.REMOTE` will prove to be problematic because each + process in the stack will use its own independent pool which will + likely overload your machine with processes or your network with + remote processes beyond the control mechanisms built into one + instance of the pool. Be careful. - .. note:: + .. warning:: There is non-trivial overhead of pickling code and copying it over the network when you use :code:`Method.REMOTE`. There's a smaller but still considerable cost of creating a new process and passing code to/from it when you use :code:`Method.PROCESS`. + """ def wrapper(funct: typing.Callable):