#!/usr/bin/env python3
+# © Copyright 2021-2022, Scott Gasch
+
+"""Two predicates that can help avoid unnecessary disk I/O by
+detecting if a particular file is identical to the contents about to
+be written or if a particular directory already contains a file that
+is identical to the one about to be written. See examples below.
+"""
+
import hashlib
import logging
import os
"""
def __init__(self, directory: str):
+ """Constructor; finishes by calling :meth:`_update` so that the
+ files currently present get checksummed.
+
+ Args:
+ directory: the directory we're filtering accesses to
+ """
super().__init__()
import file_utils
self._update()
def _update(self):
+ """
+ Internal method. For each entry of the directory that is a file
+ (symlinks are followed), read its mtime and refresh its MD5
+ checksum via :meth:`_update_file`.
+ """
for direntry in os.scandir(self.directory):
if direntry.is_file(follow_symlinks=True):
mtime = direntry.stat(follow_symlinks=True).st_mtime
self._update_file(path, mtime)
def _update_file(self, filename: str, mtime: Optional[float] = None):
+ """
+ Internal method. Given a file and mtime, compute its MD5 checksum
+ and persist it in an internal map. The checksum is only
+ recomputed when mtime differs from the mtime cached for filename.
+
+ Args:
+ filename: the file to checksum; must satisfy
+ file_utils.does_file_exist
+ mtime: the file's modification timestamp
+ """
import file_utils
assert file_utils.does_file_exist(filename)
assert mtime is not None
if self.mtime_by_filename.get(filename, 0) != mtime:
md5 = file_utils.get_file_md5(filename)
- logger.debug(f'Computed/stored {filename}\'s MD5 at ts={mtime} ({md5})')
+ logger.debug('Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5)
self.mtime_by_filename[filename] = mtime
self.md5_by_filename[filename] = md5
- def apply(self, item: Any, filename: str) -> bool:
+ def apply(self, proposed_contents: Any, filename: str) -> bool:
+ """Call this with the proposed new contents of filename in
+ memory and we'll compute the checksum of those contents and
+ return a value that indicates whether they are identical to
+ the disk contents already (so you can skip the write safely).
+
+ Args:
+ proposed_contents: the contents about to be written to
+ filename; must be a bytes-like object because it is
+ fed to hashlib.md5
+ filename: the file about to be populated with
+ proposed_contents
+
+ Returns:
+ False if the disk contents of the file are identical to
+ proposed_contents already (the write can be skipped) and
+ True otherwise.
+ """
self._update_file(filename)
file_md5 = self.md5_by_filename.get(filename, 0)
- logger.debug(f'{filename}\'s checksum is {file_md5}')
+ logger.debug('%s\'s checksum is %s', filename, file_md5)
mem_hash = hashlib.md5()
- mem_hash.update(item)
+ mem_hash.update(proposed_contents)
md5 = mem_hash.hexdigest()
- logger.debug(f'Item\'s checksum is {md5}')
+ logger.debug('Item\'s checksum is %s', md5)
return md5 != file_md5
class DirectoryAllFilesFilter(DirectoryFileFilter):
"""A predicate that will return False if a file to-be-written to a
particular directory is identical to any other file in that same
- directory.
+ directory (regardless of its name).
- i.e. this is the same as the above except that its apply() method
- will return true not only if the contents to be written are
- identical to the contents of filename on the disk but also it
- returns true if there exists some other file sitting in the same
- directory which already contains those identical contents.
+ i.e. this is the same as :class:`DirectoryFileFilter` except that
+ our apply() method does not look at one particular filename: it
+ returns False whenever *any* file sitting in the directory
+ already contains contents identical to those about to be
+ written, and True only when no such file exists.
>>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'
True
>>> os.remove(testfile)
+
"""
def __init__(self, directory: str):
+ """Constructor. The set of known checksums is created before
+ delegating to the parent constructor because this class's
+ :meth:`_update_file` adds to that set.
+
+ Args:
+ directory: the directory we're watching
+ """
self.all_md5s: Set[str] = set()
super().__init__(directory)
def _update_file(self, filename: str, mtime: Optional[float] = None):
+ """Internal method. Given a file and its mtime, update internal
+ state: the file's MD5 is stored under its filename and also
+ added to the set of all checksums seen in the directory.
+ """
import file_utils
assert file_utils.does_file_exist(filename)
self.md5_by_filename[filename] = md5
self.all_md5s.add(md5)
- def apply(self, item: Any, ignored_filename: str = None) -> bool:
+ def apply(self, proposed_contents: Any, ignored_filename: str = None) -> bool:
+ """Call this before writing a new file to directory with the
+ proposed_contents to be written and it will return a value that
+ indicates whether the identical contents is already sitting in
+ *any* file in that directory. Useful, e.g., for caching. The
+ directory is rescanned on every call before the comparison.
+
+ Args:
+ proposed_contents: the contents about to be persisted to
+ directory; must be a bytes-like object because it is
+ fed to hashlib.md5
+ ignored_filename: unused for now, must be None
+
+ Returns:
+ True if proposed contents does not yet exist in any file in
+ directory or False if it does exist in some file already.
+ """
assert ignored_filename is None
self._update()
mem_hash = hashlib.md5()
- mem_hash.update(item)
+ mem_hash.update(proposed_contents)
md5 = mem_hash.hexdigest()
return md5 not in self.all_md5s