X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=directory_filter.py;h=5d3585e5686603a9e6112a4def4fbd256597c3f1;hb=e46158e49121b8a955bb07b73f5bcf9928b79c90;hp=9fa13c2c1169c9895af5de7c4494627cd972d735;hpb=64a9a97fdff29f4bb9eef4e80faaeaa520d59506;p=python_utils.git diff --git a/directory_filter.py b/directory_filter.py index 9fa13c2..5d3585e 100644 --- a/directory_filter.py +++ b/directory_filter.py @@ -1,29 +1,65 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + +"""Two predicates that can help avoid unnecessary disk I/O by +detecting if a particular file is identical to the contents about to +be written or if a particular directory already contains a file that +is identical to the one about to be written. See examples below. +""" + import hashlib +import logging import os -from typing import Any, Optional +from typing import Any, Dict, Optional, Set + +logger = logging.getLogger(__name__) + + +class DirectoryFileFilter(object): + """A predicate that will return False if / when a proposed file's + content to-be-written is identical to the contents of the file on + disk allowing calling code to safely skip the write. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt' + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... wf.write(contents) + 14 + + >>> d = DirectoryFileFilter('/tmp') -import predicate -import file_utils + >>> d.apply(contents, testfile) # False if testfile already contains contents + False + >>> d.apply(b'That was a test', testfile) # True otherwise + True + + >>> os.remove(testfile) -class DirectoryFileFilter(predicate.Predicate): - """A predicate that will return False if when a proposed file's - content to-be-written is identical to the contents of the file; - skip the write. """ def __init__(self, directory: str): + """C'tor. + + Args: + directory: the directory we're filtering accesses to + """ super().__init__() + import file_utils + if not file_utils.does_directory_exist(directory): raise ValueError(directory) self.directory = directory - self.md5_by_filename = {} - self.mtime_by_filename = {} + self.md5_by_filename: Dict[str, str] = {} + self.mtime_by_filename: Dict[str, float] = {} self._update() def _update(self): + """ + Internal method. Foreach file in the directory, compute its + MD5 checksum via :meth:`_update_file`. + """ for direntry in os.scandir(self.directory): if direntry.is_file(follow_symlinks=True): mtime = direntry.stat(follow_symlinks=True).st_mtime @@ -31,48 +67,128 @@ class DirectoryFileFilter(predicate.Predicate): self._update_file(path, mtime) def _update_file(self, filename: str, mtime: Optional[float] = None): + """ + Internal method. Given a file and mtime, compute its MD5 checksum + and persist it in an internal map. + """ + import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) + assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) + logger.debug('Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 - def apply(self, item: Any, filename: str) -> bool: + def apply(self, proposed_contents: Any, filename: str) -> bool: + """Call this with the proposed new contents of filename in + memory and we'll compute the checksum of those contents and + return a value that indicates whether they are identical to + the disk contents already (so you can skip the write safely). + + Args: + proposed_contents: the contents about to be written to + filename + filename: the file about to be populated with + proposed_contents + + Returns: + True if the disk contents of the file are identical to + proposed_contents already and False otherwise. + """ self._update_file(filename) file_md5 = self.md5_by_filename.get(filename, 0) + logger.debug('%s\'s checksum is %s', filename, file_md5) mem_hash = hashlib.md5() - mem_hash.update(item) + mem_hash.update(proposed_contents) md5 = mem_hash.hexdigest() + logger.debug('Item\'s checksum is %s', md5) return md5 != file_md5 class DirectoryAllFilesFilter(DirectoryFileFilter): """A predicate that will return False if a file to-be-written to a particular directory is identical to any other file in that same - directory. + directory (regardless of its name). + + i.e. this is the same as :class:`DirectoryFileFilter` except that + our apply() method will return true not only if the contents to be + written are identical to the contents of filename on the disk but + also it returns true if there exists some other file sitting in + the same directory which already contains those identical + contents. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt' + + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... wf.write(contents) + 14 + + >>> d = DirectoryAllFilesFilter('/tmp') + + >>> d.apply(contents) # False is _any_ file in /tmp contains contents + False + + >>> d.apply(b'That was a test') # True otherwise + True + + >>> os.remove(testfile) + """ def __init__(self, directory: str): - self.all_md5s = set() + """C'tor. + + Args: + directory: the directory we're watching + """ + self.all_md5s: Set[str] = set() super().__init__(directory) - print(self.all_md5s) def _update_file(self, filename: str, mtime: Optional[float] = None): + """Internal method. Given a file and its mtime, update internal + state. + """ + import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) + assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 self.all_md5s.add(md5) - def apply(self, item: Any) -> bool: + def apply(self, proposed_contents: Any, ignored_filename: str = None) -> bool: + """Call this before writing a new file to directory with the + proposed_contents to be written and it will return a value that + indicates whether the identical contents is already sitting in + *any* file in that directory. Useful, e.g., for caching. + + Args: + proposed_contents: the contents about to be persisted to + directory + ignored_filename: unused for now, must be None + + Returns: + True if proposed contents does not yet exist in any file in + directory or False if it does exist in some file already. + """ + assert ignored_filename is None self._update() mem_hash = hashlib.md5() - mem_hash.update(item) + mem_hash.update(proposed_contents) md5 = mem_hash.hexdigest() return md5 not in self.all_md5s + +if __name__ == '__main__': + import doctest + + doctest.testmod()