#!/usr/bin/env python3
+"""Two predicates that can help avoid unnecessary disk I/O by
+detecting if a particular file is identical to the contents about to
+be written or if a particular directory already contains a file that
+is identical to the one to be written. See class docs below for
+examples."""
+
import hashlib
+import logging
import os
-from typing import Any, Optional
+from typing import Any, Dict, Optional, Set
+
+logger = logging.getLogger(__name__)
class DirectoryFileFilter(object):
- """A predicate that will return False if when a proposed file's
- content to-be-written is identical to the contents of the file;
- skip the write.
+ """A predicate that will return False if / when a proposed file's
+ content to-be-written is identical to the contents of the file on
+ disk, allowing calling code to safely skip the write.
+
+ >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c2.txt'
+ >>> contents = b'This is a test'
+ >>> with open(testfile, 'wb') as wf:
+ ... wf.write(contents)
+ 14
+
+ >>> d = DirectoryFileFilter('/tmp')
+
+ >>> d.apply(contents, testfile) # False if testfile already contains contents
+ False
+
+ >>> d.apply(b'That was a test', testfile) # True otherwise
+ True
+
+ >>> os.remove(testfile)
+
"""
+
def __init__(self, directory: str):
    """Bind the filter to *directory* and prime its checksum caches.

    Args:
        directory: path of the directory this filter is attached to.

    Raises:
        ValueError: when file_utils.does_directory_exist(directory)
            is falsy; the offending path is the exception argument.
    """
    import file_utils

    super().__init__()
    directory_is_usable = file_utils.does_directory_exist(directory)
    if not directory_is_usable:
        raise ValueError(directory)

    self.directory = directory
    # Two caches keyed by filename: the mtime observed when a file was
    # last hashed, and the MD5 recorded for it at that time.
    self.mtime_by_filename: Dict[str, float] = {}
    self.md5_by_filename: Dict[str, str] = {}
    # Initial population of the caches.  NOTE(review): _update's body is
    # not visible here -- presumably it walks self.directory; confirm.
    self._update()
def _update(self):
def _update_file(self, filename: str, mtime: Optional[float] = None):
    """Bring the cached mtime / MD5 for *filename* up to date.

    The MD5 is recomputed (via file_utils.get_file_md5) only when the
    file's mtime differs from the one cached for it, so files that have
    not changed are not hashed again.

    A file that does not exist, or whose mtime cannot be determined, has
    its cache entries dropped and is otherwise skipped.  Callers such as
    apply() ask about files that may not have been written yet, and an
    ``assert`` is not a reliable guard because it is removed under -O.

    Args:
        filename: path of the file to (re)examine.
        mtime: the file's modification time if the caller already knows
            it; when None it is fetched with
            file_utils.get_file_raw_mtime.
    """
    import file_utils

    exists = file_utils.does_file_exist(filename)
    if exists and mtime is None:
        mtime = file_utils.get_file_raw_mtime(filename)

    if not exists or mtime is None:
        # NOTE(review): a None mtime for an existing file presumably
        # means it vanished or could not be stat'ed -- confirm against
        # file_utils.  Either way nothing cached for it can be trusted.
        self.mtime_by_filename.pop(filename, None)
        self.md5_by_filename.pop(filename, None)
        return

    # A missing cache entry yields None, which never equals a real mtime;
    # the previous default of 0 made a file whose mtime is exactly 0 look
    # as if it were already cached.
    if self.mtime_by_filename.get(filename) != mtime:
        md5 = file_utils.get_file_md5(filename)
        logger.debug("Computed/stored %s's MD5 at ts=%.2f (%s)", filename, mtime, md5)
        self.mtime_by_filename[filename] = mtime
        self.md5_by_filename[filename] = md5
def apply(self, item: Any, filename: str) -> bool:
    """Report whether *item* differs from what *filename* holds.

    Args:
        item: the content about to be written; it is fed straight to
            hashlib.md5, so it must be bytes-like.
        filename: the file the content is destined for.

    Returns:
        False when the MD5 of item equals the MD5 cached for filename
        (the write can be skipped), True otherwise.
    """
    # Refresh the cache entry first so the comparison reflects the
    # file's current state.
    self._update_file(filename)
    on_disk_digest = self.md5_by_filename.get(filename, 0)
    logger.debug("%s's checksum is %s", filename, on_disk_digest)

    in_memory_digest = hashlib.md5(item).hexdigest()
    logger.debug("Item's checksum is %s", in_memory_digest)
    return in_memory_digest != on_disk_digest
"""A predicate that will return False if a file to-be-written to a
particular directory is identical to any other file in that same
directory.
+
+ i.e. this is the same as the above except that its apply() method
+ will return False not only if the contents to be written are
+ identical to the contents of filename on the disk but also if
+ there exists some other file sitting in the same directory which
+ already contains those identical contents.
+
+ >>> testfile = '/tmp/directory_filter_text_f39e5b58-c260-40da-9448-ad1c3b2a69c3.txt'
+
+ >>> contents = b'This is a test'
+ >>> with open(testfile, 'wb') as wf:
+ ... wf.write(contents)
+ 14
+
+ >>> d = DirectoryAllFilesFilter('/tmp')
+
+ >>> d.apply(contents) # False if _any_ file in /tmp contains contents
+ False
+
+ >>> d.apply(b'That was a test') # True otherwise
+ True
+
+ >>> os.remove(testfile)
"""
def __init__(self, directory: str):
    """Create the digest set, then run the parent initialisation.

    Args:
        directory: the directory whose files are tracked; handed
            unchanged to the parent constructor.
    """
    # Ordering matters: the set must exist before the parent constructor
    # runs, because this class's _update_file adds to it.
    # NOTE(review): this assumes the parent constructor ends up calling
    # _update_file during its initial scan -- confirm against the base
    # class.
    self.all_md5s: Set[str] = set()
    super().__init__(directory)
def _update_file(self, filename: str, mtime: Optional[float] = None):
    """Refresh the cached MD5 for *filename* and keep all_md5s in step.

    The MD5 is recomputed (via file_utils.get_file_md5) only when the
    file's mtime differs from the cached one.  When a file's digest
    changes, or the file can no longer be found, its old digest is
    removed from self.all_md5s unless another tracked file still has the
    same digest.  Without that, the set would keep reporting content
    that no file in the directory holds any more.

    A file that does not exist, or whose mtime cannot be determined, is
    dropped from the caches instead of tripping an ``assert`` (asserts
    are removed under -O and so are not a reliable guard).

    Args:
        filename: path of the file to (re)examine.
        mtime: the file's modification time if the caller already knows
            it; when None it is fetched with
            file_utils.get_file_raw_mtime.
    """
    import file_utils

    exists = file_utils.does_file_exist(filename)
    if exists and mtime is None:
        mtime = file_utils.get_file_raw_mtime(filename)

    previous_md5 = self.md5_by_filename.get(filename)
    if not exists or mtime is None:
        # NOTE(review): a None mtime for an existing file presumably
        # means it vanished or could not be stat'ed -- confirm against
        # file_utils.
        self.mtime_by_filename.pop(filename, None)
        self.md5_by_filename.pop(filename, None)
    elif self.mtime_by_filename.get(filename) != mtime:
        # .get() without a default: an uncached file yields None, which
        # never equals a real mtime (a default of 0 would hide a file
        # whose mtime is exactly 0).
        md5 = file_utils.get_file_md5(filename)
        self.mtime_by_filename[filename] = mtime
        self.md5_by_filename[filename] = md5
        self.all_md5s.add(md5)
    else:
        # Cached entry is current; nothing to do.
        return

    # Retire the superseded digest only if no tracked file still has it.
    # This scans the cached digests, but only runs when a file actually
    # changed or disappeared.
    if previous_md5 is not None and previous_md5 not in self.md5_by_filename.values():
        self.all_md5s.discard(previous_md5)
def apply(self, item: Any, ignored_filename: Optional[str] = None) -> bool:
    """Report whether *item* is absent from every tracked file.

    Args:
        item: the content about to be written; it is fed straight to
            hashlib.md5, so it must be bytes-like.
        ignored_filename: accepted only so the signature lines up with
            the two-argument apply(item, filename) form; it must be
            left as None.

    Returns:
        False when the MD5 of item is already present in self.all_md5s
        (the write can be skipped), True otherwise.

    Raises:
        AssertionError: if ignored_filename is not None.  This is raised
            explicitly rather than with ``assert`` so the check also
            holds when Python runs with -O.
    """
    if ignored_filename is not None:
        raise AssertionError(
            'this filter compares against every tracked file; '
            'ignored_filename must be None'
        )
    # Re-scan before answering.  NOTE(review): _update's body is not
    # visible here -- presumably it refreshes self.all_md5s; confirm.
    self._update()
    return hashlib.md5(item).hexdigest() not in self.all_md5s
+
+
if __name__ == '__main__':
    # Script entry point: run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()