3 # © Copyright 2021-2023, Scott Gasch
5 """This module contains two classes meant to help reduce unnecessary disk
I/O operations:

The first, :class:`DirectoryFileFilter`, determines when the contents
of a file held in memory are identical to the file copy already on
disk.

The second, :class:`DirectoryAllFilesFilter`, is basically the same
except that the caller need not indicate the name of the disk file
because it will check the memory file's signature against *all file
signatures* in a particular directory on disk.
"""
23 from typing import Any, Dict, Optional, Set
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class DirectoryFileFilter(object):
    """A predicate that will return False if / when a proposed file's
    content to-be-written is identical to the contents of the file on
    disk allowing calling code to safely skip the write.

    Checksums are cached per filename and only recomputed when the
    file's mtime differs from the one recorded with the cached checksum.

    Raises:
        ValueError: directory doesn't exist

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryFileFilter('/tmp')

    >>> d.apply(contents, testfile)     # False if testfile already contains contents
    False

    >>> d.apply(b'That was a test', testfile)   # True otherwise
    True

    >>> os.remove(testfile)

    >>> d.apply(contents, testfile)     # True when testfile is not on disk at all
    True

    """

    def __init__(self, directory: str):
        """
        Args:
            directory: the directory we're filtering accesses to

        Raises:
            ValueError: directory doesn't exist
        """
        # NOTE(review): function-scope import, presumably to avoid an import
        # cycle with pyutils.files -- confirm before hoisting to module scope.
        from pyutils.files import file_utils

        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # filename -> hex MD5 of the file's contents when last examined.
        self.md5_by_filename: Dict[str, str] = {}
        # filename -> mtime that the cached MD5 above corresponds to.
        self.mtime_by_filename: Dict[str, float] = {}
        self._update()

    def _update(self) -> None:
        """
        Internal method.  Foreach file in the directory, compute its
        MD5 checksum via :meth:`_update_file`.
        """
        # Use scandir as a context manager so the directory handle is
        # released promptly even if hashing one of the files raises.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    self._update_file(direntry.path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """
        Internal method.  Given a file and mtime, compute its MD5 checksum
        and persist it in an internal map.

        Args:
            filename: path of an existing file to (re)examine
            mtime: the file's modification time, if the caller already
                has it; looked up from disk when omitted
        """
        from pyutils.files import file_utils

        assert file_utils.does_file_exist(filename)
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # Rehash when we have never seen this file or its mtime moved.  The
        # explicit membership test matters: comparing against a numeric
        # default would skip a never-seen file whose mtime equals the default.
        if (
            filename not in self.md5_by_filename
            or self.mtime_by_filename.get(filename) != mtime
        ):
            md5 = file_utils.get_file_md5(filename)
            logger.debug(
                "Computed/stored %s's MD5 at ts=%.2f (%s)", filename, mtime, md5
            )
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, proposed_contents: Any, filename: str) -> bool:
        """Call this with the proposed new contents of filename in
        memory and we'll compute the checksum of those contents and
        return a value that indicates whether they are identical to
        the disk contents already (so you can skip the write safely).

        Args:
            proposed_contents: the contents about to be written to
                filename; must be a bytes-like object since it is fed
                directly to the MD5 hasher
            filename: the file about to be populated with
                proposed_contents

        Returns:
            False if the disk contents of the file are identical to
            proposed_contents already (the write may be skipped) and
            True otherwise, including when filename does not exist on
            disk yet.
        """
        # A file that isn't on disk can't already hold these contents, so
        # the caller must write it.  Also forget anything cached for it so a
        # later re-creation is hashed from scratch.
        if not os.path.isfile(filename):
            self.md5_by_filename.pop(filename, None)
            self.mtime_by_filename.pop(filename, None)
            return True

        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        logger.debug("%s's checksum is %s", filename, file_md5)
        md5 = hashlib.md5(proposed_contents).hexdigest()
        logger.debug("Item's checksum is %s", md5)
        return md5 != file_md5
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory (regardless of its name).

    i.e. this is the same as :class:`DirectoryFileFilter` except that
    our :meth:`apply` method will return False not only if the contents
    to be written are identical to the contents of filename on the
    disk but also it returns False if there exists some other file
    sitting in the same directory which already contains those
    identical contents.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'

    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryAllFilesFilter('/tmp')

    >>> d.apply(contents)    # False if _any_ file in /tmp contains contents
    False

    >>> d.apply(b'That was a test')    # True otherwise
    True

    >>> os.remove(testfile)

    """

    def __init__(self, directory: str):
        """
        Args:
            directory: the directory we're watching
        """
        # Create the set before delegating to the base constructor.
        # NOTE(review): presumably the base constructor scans the directory
        # and so calls back into our _update_file, which needs this set.
        self.all_md5s: Set[str] = set()
        super().__init__(directory)

    def _update(self) -> None:
        """Internal method.  Rescan the directory via the base class, then
        drop bookkeeping for files that are gone and rebuild
        :attr:`all_md5s` from the surviving per-file checksums so it never
        keeps the checksum of a file that was modified or deleted.
        """
        super()._update()
        vanished = [f for f in self.md5_by_filename if not os.path.isfile(f)]
        for filename in vanished:
            del self.md5_by_filename[filename]
            self.mtime_by_filename.pop(filename, None)
        # Mutate in place so anyone holding a reference to the set sees
        # the refreshed contents.
        self.all_md5s.clear()
        self.all_md5s.update(self.md5_by_filename.values())

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """Internal method.  Given a file and its mtime, update internal
        state: the per-file maps (handled by the base class) and the set
        of all checksums seen in the directory.
        """
        super()._update_file(filename, mtime)
        md5 = self.md5_by_filename.get(filename)
        if md5 is not None:
            self.all_md5s.add(md5)

    def apply(
        self, proposed_contents: Any, ignored_filename: Optional[str] = None
    ) -> bool:
        """Call this before writing a new file to directory with the
        proposed_contents to be written and it will return a value that
        indicates whether the identical contents is already sitting in
        *any* file in that directory.  Useful, e.g., for caching.

        Args:
            proposed_contents: the contents about to be persisted to
                directory; must be a bytes-like object since it is fed
                directly to the MD5 hasher
            ignored_filename: unused for now, must be None

        Returns:
            True if proposed contents does not yet exist in any file in
            directory or False if it does exist in some file already.
        """
        assert ignored_filename is None
        # Refresh our view of the directory so the answer reflects what is
        # on disk right now.
        self._update()
        md5 = hashlib.md5(proposed_contents).hexdigest()
        return md5 not in self.all_md5s
204 if __name__ == "__main__":