3 # © Copyright 2021-2022, Scott Gasch
5 """Two predicates that can help avoid unnecessary disk I/O by
6 detecting if a particular file is identical to the contents about to
7 be written or if a particular directory already contains a file that
8 is identical to the one about to be written. See examples below.
14 from typing import Any, Dict, Optional, Set
# Module-level logger; the filters below use it for debug traces of the
# checksums they compute and compare.
logger = logging.getLogger(__name__)
class DirectoryFileFilter(object):
    """A predicate that will return False if / when a proposed file's
    content to-be-written is identical to the contents of the file on
    disk allowing calling code to safely skip the write.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryFileFilter('/tmp')

    >>> d.apply(contents, testfile)  # False if testfile already contains contents
    False

    >>> d.apply(b'That was a test', testfile)  # True otherwise
    True

    >>> os.remove(testfile)

    >>> d.apply(contents, testfile)  # True: nothing on disk, so the write is needed
    True

    """

    def __init__(self, directory: str):
        """
        Args:
            directory: the directory we're filtering accesses to

        Raises:
            ValueError: if directory does not exist.
        """
        super().__init__()
        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # Per-file caches keyed by path: the last checksum computed and
        # the mtime the file had when that checksum was computed.
        self.md5_by_filename: Dict[str, str] = {}
        self.mtime_by_filename: Dict[str, float] = {}
        self._update()

    def _update(self) -> None:
        """
        Internal method.  Foreach file in the directory, compute its
        MD5 checksum via :meth:`_update_file`.
        """
        for direntry in os.scandir(self.directory):
            if direntry.is_file(follow_symlinks=True):
                mtime = direntry.stat(follow_symlinks=True).st_mtime
                path = f'{self.directory}/{direntry.name}'
                self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """
        Internal method.  Given a file and mtime, compute its MD5 checksum
        and persist it in an internal map.

        Args:
            filename: path of an existing file to (re)checksum.
            mtime: the file's modification time if the caller already
                has it; when None it is looked up here.
        """
        assert file_utils.does_file_exist(filename)
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # Only re-read the file when its mtime differs from the one we
        # recorded alongside the cached checksum.
        if self.mtime_by_filename.get(filename, 0) != mtime:
            md5 = file_utils.get_file_md5(filename)
            logger.debug('Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, proposed_contents: Any, filename: str) -> bool:
        """Call this with the proposed new contents of filename in
        memory and we'll compute the checksum of those contents and
        return a value that indicates whether they are identical to
        the disk contents already (so you can skip the write safely).

        Args:
            proposed_contents: the contents about to be written to
                filename; must be acceptable to :func:`hashlib.md5`
                (i.e. bytes-like).
            filename: the file about to be populated with
                proposed_contents

        Returns:
            False if the disk contents of the file are identical to
            proposed_contents already (the write may be skipped) and
            True otherwise, including when filename does not exist
            on disk yet.
        """
        # A file that is not on disk cannot already hold the proposed
        # contents, so the write is needed.  Checking first also avoids
        # tripping the existence assertion in _update_file and ignores
        # any cached checksum left over from a since-deleted file.
        if not os.path.isfile(filename):
            return True
        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        logger.debug('%s\'s checksum is %s', filename, file_md5)
        # NOTE(review): assumes file_utils.get_file_md5 yields a hex
        # digest string, since it is compared against hexdigest() here.
        md5 = hashlib.md5(proposed_contents).hexdigest()
        logger.debug('Item\'s checksum is %s', md5)
        return md5 != file_md5
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory (regardless of its name).

    i.e. this is the same as :class:`DirectoryFileFilter` except that
    our apply() method will return False not only if the contents to be
    written are identical to the contents of filename on the disk but
    also if there exists some other file sitting in the same directory
    which already contains those identical contents.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'

    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryAllFilesFilter('/tmp')

    >>> d.apply(contents)  # False if _any_ file in /tmp contains contents
    False

    >>> d.apply(b'That was a test')  # True otherwise
    True

    >>> os.remove(testfile)

    """

    def __init__(self, directory: str):
        """
        Args:
            directory: the directory we're watching
        """
        # Must exist before the base constructor runs: the checksums
        # are recorded through our _update_file override, which writes
        # into this set.
        self.all_md5s: Set[str] = set()
        super().__init__(directory)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Internal method.  Given a file and its mtime, update internal
        state: the per-file caches kept by the base class plus the set
        of checksums present in the directory.

        Args:
            filename: path of an existing file to (re)checksum.
            mtime: the file's modification time if already known.
        """
        previous_md5 = self.md5_by_filename.get(filename)
        super()._update_file(filename, mtime)
        current_md5 = self.md5_by_filename.get(filename)
        if current_md5 is not None:
            self.all_md5s.add(current_md5)
        # When a file's contents changed, its old checksum must leave
        # the set unless some other tracked file still has it; otherwise
        # apply() would keep reporting the old contents as present.
        if (
            previous_md5 is not None
            and previous_md5 != current_md5
            and previous_md5 not in self.md5_by_filename.values()
        ):
            self.all_md5s.discard(previous_md5)

    def apply(self, proposed_contents: Any, ignored_filename: Optional[str] = None) -> bool:
        """Call this before writing a new file to directory with the
        proposed_contents to be written and it will return a value that
        indicates whether the identical contents is already sitting in
        *any* file in that directory.  Useful, e.g., for caching.

        Args:
            proposed_contents: the contents about to be persisted to
                directory; must be acceptable to :func:`hashlib.md5`
                (i.e. bytes-like).
            ignored_filename: unused for now, must be None

        Returns:
            True if proposed contents does not yet exist in any file in
            directory or False if it does exist in some file already.
        """
        assert ignored_filename is None
        # Re-scan so files created or modified since the last call are
        # taken into account.
        # NOTE(review): checksums of files deleted from the directory
        # are not pruned here -- confirm whether that matters to callers.
        self._update()
        md5 = hashlib.md5(proposed_contents).hexdigest()
        return md5 not in self.all_md5s
191 if __name__ == '__main__':