X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=directory_filter.py;h=69e5547255e20ac66cfea81658867c8697501126;hb=532df2c5b57c7517dfb3dddd8c1358fbadf8baf3;hp=d14dce7c6eec912e2b2c5b07ebe9b6ef4dfcf716;hpb=7e6972bc7c8e891dc669645fa5969ed76fe38314;p=python_utils.git diff --git a/directory_filter.py b/directory_filter.py index d14dce7..69e5547 100644 --- a/directory_filter.py +++ b/directory_filter.py @@ -1,23 +1,55 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + +"""Two predicates that can help avoid unnecessary disk I/O by +detecting if a particular file is identical to the contents about to +be written or if a particular directory already contains a file that +is identical to the one to be written. See class docs below for +examples. + +""" + import hashlib +import logging import os -from typing import Any, Optional +from typing import Any, Dict, Optional, Set + +logger = logging.getLogger(__name__) class DirectoryFileFilter(object): """A predicate that will return False if / when a proposed file's - content to-be-written is identical to the contents of the file; - skip the write. + content to-be-written is identical to the contents of the file on + disk allowing calling code to safely skip the write. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt' + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... 
wf.write(contents) + 14 + + >>> d = DirectoryFileFilter('/tmp') + + >>> d.apply(contents, testfile) # False if testfile already contains contents + False + + >>> d.apply(b'That was a test', testfile) # True otherwise + True + + >>> os.remove(testfile) + """ + def __init__(self, directory: str): super().__init__() import file_utils + if not file_utils.does_directory_exist(directory): raise ValueError(directory) self.directory = directory - self.md5_by_filename = {} - self.mtime_by_filename = {} + self.md5_by_filename: Dict[str, str] = {} + self.mtime_by_filename: Dict[str, float] = {} self._update() def _update(self): @@ -29,20 +61,25 @@ class DirectoryFileFilter(object): def _update_file(self, filename: str, mtime: Optional[float] = None): import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) + assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) + logger.debug('Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 def apply(self, item: Any, filename: str) -> bool: self._update_file(filename) file_md5 = self.md5_by_filename.get(filename, 0) + logger.debug('%s\'s checksum is %s', filename, file_md5) mem_hash = hashlib.md5() mem_hash.update(item) md5 = mem_hash.hexdigest() + logger.debug('Item\'s checksum is %s', md5) return md5 != file_md5 @@ -50,27 +87,58 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): """A predicate that will return False if a file to-be-written to a particular directory is identical to any other file in that same directory. + + i.e. 
this is the same as the above except that its apply() method + will return False not only if the contents to be written are + identical to the contents of filename on the disk but also it + returns False if there exists some other file sitting in the same + directory which already contains those identical contents. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt' + + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... wf.write(contents) + 14 + + >>> d = DirectoryAllFilesFilter('/tmp') + + >>> d.apply(contents) # False if _any_ file in /tmp contains contents + False + + >>> d.apply(b'That was a test') # True otherwise + True + + >>> os.remove(testfile) """ def __init__(self, directory: str): - self.all_md5s = set() + self.all_md5s: Set[str] = set() super().__init__(directory) - print(self.all_md5s) def _update_file(self, filename: str, mtime: Optional[float] = None): import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) + assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 self.all_md5s.add(md5) - def apply(self, item: Any) -> bool: + def apply(self, item: Any, ignored_filename: str = None) -> bool: + assert ignored_filename is None self._update() mem_hash = hashlib.md5() mem_hash.update(item) md5 = mem_hash.hexdigest() return md5 not in self.all_md5s + + +if __name__ == '__main__': + import doctest + + doctest.testmod()