X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=directory_filter.py;h=5d3585e5686603a9e6112a4def4fbd256597c3f1;hb=e46158e49121b8a955bb07b73f5bcf9928b79c90;hp=b057f85a1c8c728a497171a37c4a216db231ba30;hpb=7ff2af6fe7bffea90dc4a31c93140c189917c659;p=python_utils.git diff --git a/directory_filter.py b/directory_filter.py index b057f85..5d3585e 100644 --- a/directory_filter.py +++ b/directory_filter.py @@ -1,5 +1,13 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + +"""Two predicates that can help avoid unnecessary disk I/O by +detecting if a particular file is identical to the contents about to +be written or if a particular directory already contains a file that +is identical to the one about to be written. See examples below. +""" + import hashlib import logging import os @@ -32,6 +40,11 @@ class DirectoryFileFilter(object): """ def __init__(self, directory: str): + """C'tor. + + Args: + directory: the directory we're filtering accesses to + """ super().__init__() import file_utils @@ -43,6 +56,10 @@ class DirectoryFileFilter(object): self._update() def _update(self): + """ + Internal method. Foreach file in the directory, compute its + MD5 checksum via :meth:`_update_file`. + """ for direntry in os.scandir(self.directory): if direntry.is_file(follow_symlinks=True): mtime = direntry.stat(follow_symlinks=True).st_mtime @@ -50,6 +67,10 @@ class DirectoryFileFilter(object): self._update_file(path, mtime) def _update_file(self, filename: str, mtime: Optional[float] = None): + """ + Internal method. Given a file and mtime, compute its MD5 checksum + and persist it in an internal map. 
+ """ import file_utils assert file_utils.does_file_exist(filename) @@ -58,31 +79,47 @@ class DirectoryFileFilter(object): assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) - logger.debug(f'Computed/stored {filename}\'s MD5 at ts={mtime} ({md5})') + logger.debug('Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 - def apply(self, item: Any, filename: str) -> bool: + def apply(self, proposed_contents: Any, filename: str) -> bool: + """Call this with the proposed new contents of filename in + memory and we'll compute the checksum of those contents and + return a value that indicates whether they are identical to + the disk contents already (so you can skip the write safely). + + Args: + proposed_contents: the contents about to be written to + filename + filename: the file about to be populated with + proposed_contents + + Returns: + False if the disk contents of the file are identical to + proposed_contents already and True otherwise. + """ self._update_file(filename) file_md5 = self.md5_by_filename.get(filename, 0) - logger.debug(f'{filename}\'s checksum is {file_md5}') + logger.debug('%s\'s checksum is %s', filename, file_md5) mem_hash = hashlib.md5() - mem_hash.update(item) + mem_hash.update(proposed_contents) md5 = mem_hash.hexdigest() - logger.debug(f'Item\'s checksum is {md5}') + logger.debug('Item\'s checksum is %s', md5) return md5 != file_md5 class DirectoryAllFilesFilter(DirectoryFileFilter): """A predicate that will return False if a file to-be-written to a particular directory is identical to any other file in that same - directory. + directory (regardless of its name). - i.e. 
this is the same as the above except that its apply() method - will return true not only if the contents to be written are - identical to the contents of filename on the disk but also it - returns true if there exists some other file sitting in the same - directory which already contains those identical contents. + i.e. this is the same as :class:`DirectoryFileFilter` except that + our apply() method will return False not only if the contents to be + written are identical to the contents of filename on the disk but + also it returns False if there exists some other file sitting in + the same directory which already contains those identical + contents. >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt' @@ -100,13 +137,22 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): True >>> os.remove(testfile) + """ def __init__(self, directory: str): + """C'tor. + + Args: + directory: the directory we're watching + """ self.all_md5s: Set[str] = set() super().__init__(directory) def _update_file(self, filename: str, mtime: Optional[float] = None): + """Internal method. Given a file and its mtime, update internal + state. + """ import file_utils assert file_utils.does_file_exist(filename) @@ -119,11 +165,25 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): self.md5_by_filename[filename] = md5 self.all_md5s.add(md5) - def apply(self, item: Any, ignored_filename: str = None) -> bool: + def apply(self, proposed_contents: Any, ignored_filename: str = None) -> bool: + """Call this before writing a new file to directory with the + proposed_contents to be written and it will return a value that + indicates whether the identical contents are already sitting in + *any* file in that directory. Useful, e.g., for caching. 
+ + Args: + proposed_contents: the contents about to be persisted to + directory + ignored_filename: unused for now, must be None + + Returns: + True if proposed contents does not yet exist in any file in + directory or False if it does exist in some file already. + """ assert ignored_filename is None self._update() mem_hash = hashlib.md5() - mem_hash.update(item) + mem_hash.update(proposed_contents) md5 = mem_hash.hexdigest() return md5 not in self.all_md5s