More cleanup, yey!
[python_utils.git] / lockfile.py
index 34279ba8392c0e538ede3bfbc09b7af882657c16..6993cb84d5e88f8dd6fc1a9d28a849f0cfd28713 100644 (file)
@@ -1,47 +1,71 @@
 #!/usr/bin/env python3
 
-from dataclasses import dataclass
+"""File-based locking helper."""
+
 import datetime
 import json
 import logging
 import os
 import signal
 import sys
+import warnings
+from dataclasses import dataclass
 from typing import Optional
 
+import config
+import datetime_utils
 import decorator_utils
 
-
+cfg = config.add_commandline_args(f'Lockfile ({__file__})', 'Args related to lockfiles')
+cfg.add_argument(
+    '--lockfile_held_duration_warning_threshold_sec',
+    type=float,
+    default=60.0,
+    metavar='SECONDS',
+    help='If a lock is held for longer than this threshold we log a warning',
+)
 logger = logging.getLogger(__name__)
 
 
 class LockFileException(Exception):
+    """An exception related to lock files."""
+
     pass
 
 
 @dataclass
 class LockFileContents:
+    """The contents we'll write to each lock file."""
+
     pid: int
     commandline: str
-    expiration_timestamp: float
+    expiration_timestamp: Optional[float]
 
 
 class LockFile(object):
     """A file locking mechanism that has context-manager support so you
-    can use it in a with statement.
+    can use it in a with statement.  e.g.
+
+    with LockFile('./foo.lock'):
+        # do a bunch of stuff... if the process dies we have a signal
+        # handler to do cleanup.  Other code (in this process or another)
+        # that tries to take the same lockfile will block.  There is also
+        # some logic for detecting stale locks.
+
     """
 
     def __init__(
-            self,
-            lockfile_path: str,
-            *,
-            do_signal_cleanup: bool = True,
-            expiration_timestamp: Optional[float] = None,
-            override_command: Optional[str] = None,
+        self,
+        lockfile_path: str,
+        *,
+        do_signal_cleanup: bool = True,
+        expiration_timestamp: Optional[float] = None,
+        override_command: Optional[str] = None,
     ) -> None:
-        self.is_locked = False
-        self.lockfile = lockfile_path
-        self.override_command = override_command
+        self.is_locked: bool = False
+        self.lockfile: str = lockfile_path
+        self.locktime: Optional[int] = None
+        self.override_command: Optional[str] = override_command
         if do_signal_cleanup:
             signal.signal(signal.SIGINT, self._signal)
             signal.signal(signal.SIGTERM, self._signal)
@@ -54,7 +78,7 @@ class LockFile(object):
         return not os.path.exists(self.lockfile)
 
     def try_acquire_lock_once(self) -> bool:
-        logger.debug(f"Trying to acquire {self.lockfile}.")
+        logger.debug("Trying to acquire %s.", self.lockfile)
         try:
             # Attempt to create the lockfile.  These flags cause
             # os.open to raise an OSError if the file already
@@ -64,25 +88,24 @@ class LockFile(object):
                 contents = self._get_lockfile_contents()
                 logger.debug(contents)
                 f.write(contents)
-            logger.debug(f'Success; I own {self.lockfile}.')
+            logger.debug('Success; I own %s.', self.lockfile)
             self.is_locked = True
             return True
         except OSError:
             pass
-        logger.debug(f'Failed; I could not acquire {self.lockfile}.')
+        logger.warning('Couldn\'t acquire %s.', self.lockfile)
         return False
 
     def acquire_with_retries(
-            self,
-            *,
-            initial_delay: float = 1.0,
-            backoff_factor: float = 2.0,
-            max_attempts = 5
+        self,
+        *,
+        initial_delay: float = 1.0,
+        backoff_factor: float = 2.0,
+        max_attempts=5,
     ) -> bool:
-
-        @decorator_utils.retry_if_false(tries = max_attempts,
-                                        delay_sec = initial_delay,
-                                        backoff = backoff_factor)
+        @decorator_utils.retry_if_false(
+            tries=max_attempts, delay_sec=initial_delay, backoff=backoff_factor
+        )
         def _try_acquire_lock_with_retries() -> bool:
             success = self.try_acquire_lock_once()
             if not success and os.path.exists(self.lockfile):
@@ -102,12 +125,21 @@ class LockFile(object):
 
     def __enter__(self):
         if self.acquire_with_retries():
+            self.locktime = datetime.datetime.now().timestamp()
             return self
         msg = f"Couldn't acquire {self.lockfile}; giving up."
         logger.warning(msg)
         raise LockFileException(msg)
 
-    def __exit__(self, type, value, traceback):
+    def __exit__(self, _, value, traceback):
+        if self.locktime:
+            ts = datetime.datetime.now().timestamp()
+            duration = ts - self.locktime
+            if duration >= config.config['lockfile_held_duration_warning_threshold_sec']:
+                str_duration = datetime_utils.describe_duration_briefly(duration)
+                msg = f'Held {self.lockfile} for {str_duration}'
+                logger.warning(msg)
+                warnings.warn(msg, stacklevel=2)
         self.release()
 
     def __del__(self):
@@ -123,11 +155,10 @@ class LockFile(object):
             cmd = self.override_command
         else:
             cmd = ' '.join(sys.argv)
-        print(cmd)
         contents = LockFileContents(
-            pid = os.getpid(),
-            commandline = cmd,
-            expiration_timestamp = self.expiration_timestamp,
+            pid=os.getpid(),
+            commandline=cmd,
+            expiration_timestamp=self.expiration_timestamp,
         )
         return json.dumps(contents.__dict__)
 
@@ -139,21 +170,22 @@ class LockFile(object):
                     line = lines[0]
                     line_dict = json.loads(line)
                     contents = LockFileContents(**line_dict)
-                    logger.debug(f'Blocking lock contents="{contents}"')
+                    logger.debug('Blocking lock contents="%s"', contents)
 
                     # Does the PID exist still?
                     try:
                         os.kill(contents.pid, 0)
                     except OSError:
-                        logger.debug('The pid seems stale; killing the lock.')
+                        msg = f'Lockfile {self.lockfile}\'s pid ({contents.pid}) is stale; force acquiring'
+                        logger.warning(msg)
                         self.release()
 
                     # Has the lock expiration expired?
                     if contents.expiration_timestamp is not None:
                         now = datetime.datetime.now().timestamp()
-                        if now > contents.expiration_datetime:
-                            logger.debug('The expiration time has passed; ' +
-                                         'killing the lock')
+                        if now > contents.expiration_timestamp:
+                            msg = f'Lockfile {self.lockfile} expiration time has passed; force acquiring'
+                            logger.warning(msg)
                             self.release()
         except Exception:
             pass