#!/usr/bin/env python3

"""Predicates that tell calling code whether a file write would be redundant.

Both classes cache per-file modification times and MD5 checksums so that a
file is only re-hashed when its mtime changes.
"""

import hashlib
import logging
import os
from typing import Any, Dict, Optional, Set

logger = logging.getLogger(__name__)


class DirectoryFileFilter:
    """A predicate that will return False if / when a proposed file's
    content to-be-written is identical to the contents of the file on
    disk, allowing calling code to safely skip the write.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryFileFilter('/tmp')

    >>> d.apply(contents, testfile)     # False if testfile already contains contents
    False

    >>> d.apply(b'That was a test', testfile)    # True otherwise
    True

    >>> os.remove(testfile)
    """

    def __init__(self, directory: str):
        """Checksum every regular file currently in ``directory``.

        Args:
            directory: path of the directory whose files are tracked.

        Raises:
            ValueError: if ``file_utils.does_directory_exist`` rejects
                ``directory``.
        """
        super().__init__()
        # Imported locally, as in the rest of this module.
        import file_utils

        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # path -> checksum from file_utils.get_file_md5; it is compared
        # against hashlib's hexdigest() in apply().
        self.md5_by_filename: Dict[str, str] = {}
        # path -> mtime at which the cached checksum was computed.
        self.mtime_by_filename: Dict[str, float] = {}
        self._update()

    def _update(self) -> None:
        """Rescan the directory and refresh the cached checksums.

        Cached paths that did not show up in the scan are re-validated so
        that files deleted since the last scan are dropped from the cache.
        """
        seen = set()
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    path = f'{self.directory}/{direntry.name}'
                    seen.add(path)
                    self._update_file(path, mtime)
        # Copy the key set first: _update_file may delete cache entries.
        for path in set(self.mtime_by_filename) - seen:
            self._update_file(path)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """Refresh the cached checksum of one file if its mtime changed.

        A file that does not exist is removed from the cache instead of
        raising, so apply() reports that the write is needed.

        Args:
            filename: path of the file to (re)checksum.
            mtime: the file's modification time if the caller already has
                it; looked up via file_utils when omitted.
        """
        import file_utils

        if not file_utils.does_file_exist(filename):
            self.mtime_by_filename.pop(filename, None)
            self.md5_by_filename.pop(filename, None)
            return
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        # Default of None (not 0) so that a real mtime of 0 still counts
        # as "not cached yet".
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            logger.debug(
                "Computed/stored %s's MD5 at ts=%s (%s)", filename, mtime, md5
            )
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, item: Any, filename: str) -> bool:
        """Decide whether ``item`` needs to be written to ``filename``.

        Args:
            item: the proposed file contents; must be bytes-like because
                it is fed to hashlib.md5.
            filename: path of the file that would be written.

        Returns:
            False if the file's checksum equals the checksum of ``item``;
            True otherwise, including when the file does not exist.
        """
        self._update_file(filename)
        # 0 never equals a hex digest, so an untracked file yields True.
        file_md5 = self.md5_by_filename.get(filename, 0)
        logger.debug("%s's checksum is %s", filename, file_md5)
        md5 = hashlib.md5(item).hexdigest()
        logger.debug("Item's checksum is %s", md5)
        return md5 != file_md5


class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory.

    i.e. this is the same as DirectoryFileFilter except that its apply()
    method does not look at one particular filename: it returns False
    whenever any file sitting in the directory already contains the
    proposed contents, and True otherwise.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryAllFilesFilter('/tmp')

    >>> d.apply(contents)    # False if _any_ file in /tmp contains contents
    False

    >>> d.apply(b'That was a test')    # True otherwise
    True

    >>> os.remove(testfile)
    """

    def __init__(self, directory: str):
        """Checksum every regular file currently in ``directory``.

        Args:
            directory: path of the directory whose files are tracked.

        Raises:
            ValueError: if ``file_utils.does_directory_exist`` rejects
                ``directory``.
        """
        # Must exist before super().__init__(), which calls _update() and
        # therefore this class's _update_file().
        self.all_md5s: Set[str] = set()
        super().__init__(directory)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """Refresh one file's checksum and keep ``all_md5s`` in sync.

        Args:
            filename: path of the file to (re)checksum.
            mtime: the file's modification time if the caller already has
                it; looked up via file_utils when omitted.
        """
        previous = self.md5_by_filename.get(filename)
        super()._update_file(filename, mtime)
        current = self.md5_by_filename.get(filename)
        if current is not None:
            self.all_md5s.add(current)
        # Drop the old checksum when the file changed or disappeared,
        # unless another tracked file still has the same contents.
        if (
            previous is not None
            and previous != current
            and previous not in self.md5_by_filename.values()
        ):
            self.all_md5s.discard(previous)

    def apply(self, item: Any) -> bool:
        """Decide whether ``item`` is new to the directory.

        Args:
            item: the proposed file contents; must be bytes-like because
                it is fed to hashlib.md5.

        Returns:
            False if some tracked file in the directory has the same MD5
            as ``item``; True otherwise.
        """
        self._update()
        md5 = hashlib.md5(item).hexdigest()
        return md5 not in self.all_md5s


if __name__ == '__main__':
    import doctest

    doctest.testmod()