3 # © Copyright 2021-2022, Scott Gasch
5 """This module contains two classes meant to help reduce unnecessary disk
8 The first, :class:`DirectoryFileFilter`, determines when the contents
9 of a file held in memory are identical to the file copy already on
12 The second, :class:`DirectoryAllFilesFilter`, is basically the same
13 except that the caller need not indicate the name of the disk file
14 because it will check the memory file's signature against *all file
15 signatures* in a particular directory on disk.
23 from typing import Any, Dict, Optional, Set
25 logger = logging.getLogger(__name__)
class DirectoryFileFilter:
    """A predicate that will return False if / when a proposed file's
    content to-be-written is identical to the contents of the file on
    disk allowing calling code to safely skip the write.

    Checksums of files on disk are cached per path and only recomputed
    when the file's mtime differs from the cached one.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryFileFilter('/tmp')

    >>> d.apply(contents, testfile)    # False if testfile already contains contents
    False

    >>> d.apply(b'That was a test', testfile)    # True otherwise
    True

    >>> os.remove(testfile)

    """

    def __init__(self, directory: str):
        """
        Args:
            directory: the directory we're filtering accesses to

        Raises:
            ValueError: if directory does not exist.
        """
        super().__init__()
        from pyutils.files import file_utils

        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # Per-path caches: the MD5 hex digest of each file's contents and
        # the mtime the file had when that digest was computed.
        self.md5_by_filename: Dict[str, str] = {}
        self.mtime_by_filename: Dict[str, float] = {}
        self._update()

    def _update(self) -> None:
        """
        Internal method.  Foreach file in the directory, compute its
        MD5 checksum via :meth:`_update_file`.
        """
        # Use scandir as a context manager so the directory handle is
        # released promptly even if hashing one of the files raises.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    path = f'{self.directory}/{direntry.name}'
                    self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """
        Internal method.  Given a file and mtime, compute its MD5 checksum
        and persist it in an internal map.

        Args:
            filename: path of an existing file to (re)checksum.
            mtime: the file's modification time if the caller already
                has it; when None it is looked up here.
        """
        from pyutils.files import file_utils

        assert file_utils.does_file_exist(filename)
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        assert mtime is not None
        # Test for "never hashed" explicitly rather than with a 0 default:
        # a file whose mtime is exactly 0 would otherwise look cached and
        # never get a checksum.
        if (
            filename not in self.md5_by_filename
            or self.mtime_by_filename.get(filename) != mtime
        ):
            md5 = file_utils.get_file_md5(filename)
            logger.debug(
                'Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5
            )
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, proposed_contents: Any, filename: str) -> bool:
        """Call this with the proposed new contents of filename in
        memory and we'll compute the checksum of those contents and
        return a value that indicates whether they differ from the
        disk contents (so you can skip the write safely when they
        do not).

        Args:
            proposed_contents: the contents about to be written to
                filename; must be a bytes-like object since it is fed
                to :func:`hashlib.md5`.
            filename: the file about to be populated with
                proposed_contents.

        Returns:
            False if the disk contents of the file are identical to
            proposed_contents already (the write may be skipped) and
            True otherwise, including when filename does not exist
            on disk yet.
        """
        from pyutils.files import file_utils

        if not file_utils.does_file_exist(filename):
            # Nothing on disk to compare against, so the write is needed.
            # Forget any cached state for a file that has since vanished.
            self.md5_by_filename.pop(filename, None)
            self.mtime_by_filename.pop(filename, None)
            return True

        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        logger.debug('%s\'s checksum is %s', filename, file_md5)
        md5 = hashlib.md5(proposed_contents).hexdigest()
        logger.debug('Item\'s checksum is %s', md5)
        return md5 != file_md5
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory (regardless of its name).

    i.e. this is the same as :class:`DirectoryFileFilter` except that
    our :meth:`apply` method will return False not only if the contents
    to be written are identical to the contents of filename on the
    disk but also it returns False if there exists some other file
    sitting in the same directory which already contains those
    identical contents.

    >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'

    >>> contents = b'This is a test'
    >>> with open(testfile, 'wb') as wf:
    ...     wf.write(contents)
    14

    >>> d = DirectoryAllFilesFilter('/tmp')

    >>> d.apply(contents)    # False if _any_ file in /tmp contains contents
    False

    >>> d.apply(b'That was a test')    # True otherwise
    True

    >>> os.remove(testfile)

    """

    def __init__(self, directory: str):
        """
        Args:
            directory: the directory we're watching
        """
        # Created before delegating to the base initializer so that any
        # _update_file call made during base initialization finds the
        # set already in place.
        self.all_md5s: Set[str] = set()
        super().__init__(directory)

    def _update_file(self, filename: str, mtime: Optional[float] = None) -> None:
        """Internal method.  Given a file and its mtime, update internal
        state: refresh the per-file checksum via the base class and
        record that checksum in the set of all known checksums.

        Args:
            filename: path of an existing file to (re)checksum.
            mtime: the file's modification time if already known.
        """
        super()._update_file(filename, mtime)
        md5 = self.md5_by_filename.get(filename)
        if md5 is not None:
            self.all_md5s.add(md5)

    def apply(
        self, proposed_contents: Any, ignored_filename: Optional[str] = None
    ) -> bool:
        """Call this before writing a new file to directory with the
        proposed_contents to be written and it will return a value that
        indicates whether the identical contents is already sitting in
        *any* file in that directory.  Useful, e.g., for caching.

        Args:
            proposed_contents: the contents about to be persisted to
                directory; must be a bytes-like object since it is fed
                to :func:`hashlib.md5`.
            ignored_filename: unused for now, must be None

        Returns:
            True if proposed contents does not yet exist in any file in
            directory or False if it does exist in some file already.
        """
        from pyutils.files import file_utils

        assert ignored_filename is None

        # Refresh checksums of files currently in the directory.
        self._update()

        # Drop cache entries for files that have disappeared, then rebuild
        # the checksum set from what is left.  A set that is only ever
        # added to keeps the checksums of modified or deleted files and
        # wrongly reports their old contents as still present.
        for path in list(self.md5_by_filename):
            if not file_utils.does_file_exist(path):
                del self.md5_by_filename[path]
                self.mtime_by_filename.pop(path, None)
        self.all_md5s = set(self.md5_by_filename.values())

        md5 = hashlib.md5(proposed_contents).hexdigest()
        return md5 not in self.all_md5s
199 if __name__ == '__main__':