X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=directory_filter.py;h=8d03ff603d425bddc24267061a902f67ac7bb3b1;hb=36fea7f15ed17150691b5b3ead75450e575229ef;hp=9fa13c2c1169c9895af5de7c4494627cd972d735;hpb=64a9a97fdff29f4bb9eef4e80faaeaa520d59506;p=python_utils.git diff --git a/directory_filter.py b/directory_filter.py index 9fa13c2..8d03ff6 100644 --- a/directory_filter.py +++ b/directory_filter.py @@ -1,21 +1,40 @@ #!/usr/bin/env python3 import hashlib +import logging import os from typing import Any, Optional -import predicate -import file_utils +logger = logging.getLogger(__name__) -class DirectoryFileFilter(predicate.Predicate): - """A predicate that will return False if when a proposed file's - content to-be-written is identical to the contents of the file; - skip the write. +class DirectoryFileFilter(object): + """A predicate that will return False if / when a proposed file's + content to-be-written is identical to the contents of the file on + disk allowing calling code to safely skip the write. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt' + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... wf.write(contents) + 14 + + >>> d = DirectoryFileFilter('/tmp') + + >>> d.apply(contents, testfile) # False if testfile already contains contents + False + + >>> d.apply(b'That was a test', testfile) # True otherwise + True + + >>> os.remove(testfile) + """ def __init__(self, directory: str): super().__init__() + import file_utils + if not file_utils.does_directory_exist(directory): raise ValueError(directory) self.directory = directory @@ -31,20 +50,27 @@ class DirectoryFileFilter(predicate.Predicate): self._update_file(path, mtime) def _update_file(self, filename: str, mtime: Optional[float] = None): + import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) + logger.debug( + f'Computed/stored {filename}\'s MD5 at ts={mtime} ({md5})' + ) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 def apply(self, item: Any, filename: str) -> bool: self._update_file(filename) file_md5 = self.md5_by_filename.get(filename, 0) + logger.debug(f'{filename}\'s checksum is {file_md5}') mem_hash = hashlib.md5() mem_hash.update(item) md5 = mem_hash.hexdigest() + logger.debug(f'Item\'s checksum is {md5}') return md5 != file_md5 @@ -52,14 +78,38 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): """A predicate that will return False if a file to-be-written to a particular directory is identical to any other file in that same directory. + + i.e. this is the same as the above except that its apply() method + will return true not only if the contents to be written are + identical to the contents of filename on the disk but also it + returns true if there exists some other file sitting in the same + directory which already contains those identical contents. + + >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt' + + >>> contents = b'This is a test' + >>> with open(testfile, 'wb') as wf: + ... wf.write(contents) + 14 + + >>> d = DirectoryAllFilesFilter('/tmp') + + >>> d.apply(contents) # False is _any_ file in /tmp contains contents + False + + >>> d.apply(b'That was a test') # True otherwise + True + + >>> os.remove(testfile) """ def __init__(self, directory: str): self.all_md5s = set() super().__init__(directory) - print(self.all_md5s) def _update_file(self, filename: str, mtime: Optional[float] = None): + import file_utils + assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) @@ -76,3 +126,8 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): md5 = mem_hash.hexdigest() return md5 not in self.all_md5s + +if __name__ == '__main__': + import doctest + + doctest.testmod()