X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=directory_filter.py;h=8d03ff603d425bddc24267061a902f67ac7bb3b1;hb=36fea7f15ed17150691b5b3ead75450e575229ef;hp=d275cf24d4e23a6be64fe4073ca4b3860c83ea3c;hpb=11eeb8574b7b4620ac6fd440cb251f8aa2458f5b;p=python_utils.git diff --git a/directory_filter.py b/directory_filter.py index d275cf2..8d03ff6 100644 --- a/directory_filter.py +++ b/directory_filter.py @@ -1,19 +1,40 @@ #!/usr/bin/env python3 import hashlib +import logging import os from typing import Any, Optional +logger = logging.getLogger(__name__) + class DirectoryFileFilter(object): - """A predicate that will return False if when a proposed file's - content to-be-written is identical to the contents of the file; - skip the write. + """A predicate that will return False if / when a proposed file's + content to-be-written is identical to the contents of the file on + disk allowing calling code to safely skip the write. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt' + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... 
wf.write(contents) + 14 + + >>> d = DirectoryFileFilter('/tmp') + + >>> d.apply(contents, testfile) # False if testfile already contains contents + False + + >>> d.apply(b'That was a test', testfile) # True otherwise + True + + >>> os.remove(testfile) + """ def __init__(self, directory: str): - import file_utils super().__init__() + import file_utils + if not file_utils.does_directory_exist(directory): raise ValueError(directory) self.directory = directory @@ -30,20 +51,26 @@ class DirectoryFileFilter(object): def _update_file(self, filename: str, mtime: Optional[float] = None): import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) + logger.debug( + f'Computed/stored {filename}\'s MD5 at ts={mtime} ({md5})' + ) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 def apply(self, item: Any, filename: str) -> bool: self._update_file(filename) file_md5 = self.md5_by_filename.get(filename, 0) + logger.debug(f'{filename}\'s checksum is {file_md5}') mem_hash = hashlib.md5() mem_hash.update(item) md5 = mem_hash.hexdigest() + logger.debug(f'Item\'s checksum is {md5}') return md5 != file_md5 @@ -51,15 +78,38 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): """A predicate that will return False if a file to-be-written to a particular directory is identical to any other file in that same directory. + + i.e. this is the same as the above except that its apply() method + will return False not only if the contents to be written are + identical to the contents of filename on the disk but also it + returns False if there exists some other file sitting in the same + directory which already contains those identical contents.
+ + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt' + + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... wf.write(contents) + 14 + + >>> d = DirectoryAllFilesFilter('/tmp') + + >>> d.apply(contents) # False if _any_ file in /tmp contains contents + False + + >>> d.apply(b'That was a test') # True otherwise + True + + >>> os.remove(testfile) """ def __init__(self, directory: str): self.all_md5s = set() super().__init__(directory) - print(self.all_md5s) def _update_file(self, filename: str, mtime: Optional[float] = None): import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) @@ -75,3 +125,9 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): mem_hash.update(item) md5 = mem_hash.hexdigest() return md5 not in self.all_md5s + + +if __name__ == '__main__': + import doctest + + doctest.testmod()