3 # © Copyright 2021-2022, Scott Gasch
5 """Two predicates that can help avoid unnecessary disk I/O by
6 detecting if a particular file is identical to the contents about to
7 be written or if a particular directory already contains a file that
8 is identical to the one to be written. See class docs below for
16 from typing import Any, Dict, Optional, Set
18 logger = logging.getLogger(__name__)
class DirectoryFileFilter:
    """A predicate that will return False if / when a proposed file's
    content to-be-written is identical to the contents of the file on
    disk, allowing calling code to safely skip the write.

    Checksums are cached per filename and only recomputed when the
    file's mtime differs from the one seen when the checksum was taken.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryFileFilter('/tmp')

    >>> d.apply(contents, testfile)  # False if testfile already contains contents
    False

    >>> d.apply(b'That was a test', testfile)  # True otherwise
    True

    >>> os.remove(testfile)
    """

    def __init__(self, directory: str):
        """Index every regular file currently in the given directory.

        Args:
            directory: path of the directory whose files are checksummed.

        Raises:
            ValueError: if directory does not exist.
        """
        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # Cached checksum of each known file, keyed by its path.
        self.md5_by_filename: Dict[str, str] = {}
        # The mtime each file had when its cached checksum was computed.
        self.mtime_by_filename: Dict[str, float] = {}
        self._update()

    def _update(self) -> None:
        """Scan self.directory and (re)checksum every regular file in it."""
        # The context manager closes the scandir iterator even if
        # _update_file raises part way through the scan.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    path = f'{self.directory}/{direntry.name}'
                    self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Refresh the cached checksum of filename if the file has changed.

        Args:
            filename: path of the file to (re)checksum; it must exist.
            mtime: the file's modification time, if the caller already
                knows it.  When None it is read via file_utils.
        """
        assert file_utils.does_file_exist(filename)
        # Only stat the file when the caller did not supply an mtime.
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # A missing cache entry yields None, which never equals a real
        # mtime, so a file is always hashed the first time it is seen
        # (even one whose mtime is exactly 0).
        if self.mtime_by_filename.get(filename) != mtime:
            # Presumably a hex digest string, since apply() compares it
            # against hashlib's hexdigest() -- confirm in file_utils.
            md5 = file_utils.get_file_md5(filename)
            logger.debug("Computed/stored %s's MD5 at ts=%.2f (%s)", filename, mtime, md5)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, item: Any, filename: str) -> bool:
        """Decide whether item needs to be written to filename.

        Args:
            item: the proposed file contents; must be a bytes-like
                object that hashlib can consume.
            filename: path of the (existing) file that would be
                overwritten.

        Returns:
            False if filename's checksum equals the checksum of item
            (the write can be skipped), True otherwise.
        """
        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        logger.debug("%s's checksum is %s", filename, file_md5)
        mem_hash = hashlib.md5()
        # Hash the proposed contents; without this the digest would be
        # that of empty input regardless of item.
        mem_hash.update(item)
        md5 = mem_hash.hexdigest()
        logger.debug("Item's checksum is %s", md5)
        return md5 != file_md5
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory.

    i.e. this is the same as DirectoryFileFilter except that its
    apply() method takes no target filename: it returns False if any
    file that was in the directory when it was indexed already holds
    the contents to be written, and True otherwise.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'

    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryAllFilesFilter('/tmp')

    >>> d.apply(contents)  # False if _any_ file in /tmp contains contents
    False

    >>> d.apply(b'That was a test')  # True otherwise
    True

    >>> os.remove(testfile)
    """

    def __init__(self, directory: str):
        """Index every regular file currently in the given directory.

        Args:
            directory: path of the directory whose files are checksummed.
        """
        # Must be created before super().__init__ runs: this class's
        # _update_file adds to the set and may be invoked during base
        # class initialization.
        self.all_md5s: Set[str] = set()
        super().__init__(directory)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Refresh the cached checksum of filename and record it in the
        set of all checksums seen in the directory.

        Args:
            filename: path of the file to (re)checksum; it must exist.
            mtime: the file's modification time, if the caller already
                knows it.  When None it is read via file_utils.
        """
        assert file_utils.does_file_exist(filename)
        # Only stat the file when the caller did not supply an mtime.
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # A missing cache entry yields None, which never equals a real
        # mtime, so a file is always hashed the first time it is seen
        # (even one whose mtime is exactly 0).
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5
            # NOTE: a checksum previously recorded for this file is not
            # removed from the set when the file's contents change.
            self.all_md5s.add(md5)

    def apply(self, item: Any, ignored_filename: Optional[str] = None) -> bool:
        """Decide whether item duplicates a file already indexed.

        Args:
            item: the proposed file contents; must be a bytes-like
                object that hashlib can consume.
            ignored_filename: present only for signature compatibility
                with the base class; must be left as None.

        Returns:
            False if some indexed file has the same checksum as item,
            True otherwise.
        """
        assert ignored_filename is None
        mem_hash = hashlib.md5()
        mem_hash.update(item)
        md5 = mem_hash.hexdigest()
        return md5 not in self.all_md5s
141 if __name__ == '__main__':