# © Copyright 2021-2022, Scott Gasch

"""Two predicates that can help avoid unnecessary disk I/O by
detecting if a particular file is identical to the contents about to
be written or if a particular directory already contains a file that
is identical to the one about to be written.  See examples below.
"""
import hashlib
import logging
import os
from typing import Any, Dict, Optional, Set
# Module-level logger, named after this module, used for debug tracing of
# the checksums computed by the filters in this file.
logger = logging.getLogger(__name__)
class DirectoryFileFilter(object):
    """A predicate that will return False if / when a proposed file's
    content to-be-written is identical to the contents of the file on
    disk allowing calling code to safely skip the write.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryFileFilter('/tmp')

    >>> d.apply(contents, testfile)    # False if testfile already contains contents
    False

    >>> d.apply(b'That was a test', testfile)    # True otherwise
    True

    >>> os.remove(testfile)

    """

    def __init__(self, directory: str) -> None:
        """
        Args:
            directory: the directory we're filtering accesses to

        Raises:
            ValueError: if file_utils.does_directory_exist() rejects
                directory.
        """
        super().__init__()
        # Function-scope import, as in the rest of this file (presumably to
        # avoid an import cycle with file_utils -- TODO confirm).
        from pyutils.files import file_utils

        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # Per-file caches, both keyed by the path string handed to
        # _update_file().
        self.md5_by_filename: Dict[str, str] = {}
        self.mtime_by_filename: Dict[str, float] = {}
        # NOTE(review): the initial scan call and the header of the scan
        # method below were reconstructed; confirm the name `_update`.
        self._update()

    def _update(self) -> None:
        """
        Internal method.  Foreach file in the directory, compute its
        MD5 checksum via :meth:`_update_file`.
        """
        # Use scandir as a context manager so the directory handle is
        # released promptly even if _update_file raises mid-scan.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    # Keep this exact key format; it is the cache key.
                    path = f'{self.directory}/{direntry.name}'
                    self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """
        Internal method.  Given a file and mtime, compute its MD5 checksum
        and persist it in an internal map.  The checksum is only recomputed
        when the mtime differs from the one cached for filename.

        Args:
            filename: path of the file to (re)checksum
            mtime: the file's modification time if the caller already
                has it; when None it is fetched via file_utils.
        """
        from pyutils.files import file_utils

        assert file_utils.does_file_exist(filename)
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # Use None (not 0) as the "never seen" marker: a first-seen file
        # whose mtime is exactly 0 must still be checksummed.
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            logger.debug(
                'Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5
            )
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, proposed_contents: Any, filename: str) -> bool:
        """Call this with the proposed new contents of filename in
        memory and we'll compute the checksum of those contents and
        return a value that indicates whether they are identical to
        the disk contents already (so you can skip the write safely).

        Args:
            proposed_contents: the contents about to be written to
                filename; handed to hashlib.md5, so it must be bytes-like
            filename: the file about to be populated with
                proposed_contents

        Returns:
            False if the disk contents of the file are identical to
            proposed_contents already (the write may be skipped) and
            True otherwise (including when filename does not exist yet).
        """
        if not os.path.isfile(filename):
            # Nothing on disk to compare against, so the write is needed.
            # Answering True is always safe: at worst it costs a redundant
            # write.  It also stops a stale cache entry for a since-deleted
            # file from being consulted.
            return True
        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        logger.debug('%s\'s checksum is %s', filename, file_md5)
        md5 = hashlib.md5(proposed_contents).hexdigest()
        logger.debug('Item\'s checksum is %s', md5)
        return md5 != file_md5
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory (regardless of its name).

    i.e. this is the same as :class:`DirectoryFileFilter` except that
    our apply() method will return False not only if the contents to be
    written are identical to the contents of filename on the disk but
    also if there exists some other file sitting in the same directory
    which already contains those identical contents.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'

    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryAllFilesFilter('/tmp')

    >>> d.apply(contents)    # False if _any_ file in /tmp contains contents
    False

    >>> d.apply(b'That was a test')    # True otherwise
    True

    >>> os.remove(testfile)

    """

    def __init__(self, directory: str) -> None:
        """
        Args:
            directory: the directory we're watching
        """
        # Must exist before the parent constructor runs, because this
        # class's _update_file() adds to it.
        self.all_md5s: Set[str] = set()
        super().__init__(directory)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """Internal method.  Given a file and its mtime, update internal
        state: the per-file mtime and MD5 maps and the set of all MD5s
        seen in the directory.

        Args:
            filename: path of the file to (re)checksum
            mtime: the file's modification time if the caller already
                has it; when None it is fetched via file_utils.
        """
        from pyutils.files import file_utils

        assert file_utils.does_file_exist(filename)
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # Use None (not 0) as the "never seen" marker: a first-seen file
        # whose mtime is exactly 0 must still be checksummed.
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            stale_md5 = self.md5_by_filename.get(filename)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5
            # The file changed: forget its previous digest unless another
            # file still has it, otherwise apply() would keep reporting
            # the old contents as present.
            if (
                stale_md5 is not None
                and stale_md5 != md5
                and stale_md5 not in self.md5_by_filename.values()
            ):
                self.all_md5s.discard(stale_md5)
            self.all_md5s.add(md5)

    def apply(
        self, proposed_contents: Any, ignored_filename: Optional[str] = None
    ) -> bool:
        """Call this before writing a new file to directory with the
        proposed_contents to be written and it will return a value that
        indicates whether the identical contents is already sitting in
        *any* file in that directory.  Useful, e.g., for caching.

        Args:
            proposed_contents: the contents about to be persisted to
                directory; handed to hashlib.md5, so it must be bytes-like
            ignored_filename: unused for now, must be None

        Returns:
            True if proposed contents does not yet exist in any file in
            directory or False if it does exist in some file already.
        """
        assert ignored_filename is None
        # Re-scan so files created or modified since construction are seen.
        # NOTE(review): this call was not visible in the damaged source and
        # is inferred; confirm against the pristine file.
        # NOTE(review): digests of files deleted from the directory are not
        # pruned here.
        self._update()
        md5 = hashlib.md5(proposed_contents).hexdigest()
        return md5 not in self.all_md5s
193 if __name__ == '__main__':