Avoid directory writes when files are already there.
[python_utils.git] / directory_filter.py
1 #!/usr/bin/env python3
2
3 import hashlib
4 import os
5 from typing import Any, Optional
6
7 import file_utils
8
9
class DirectoryFileFilter(object):
    """A predicate that tells a caller whether a file needs writing.

    apply() returns False when the content about to be written to a
    file has the same MD5 digest as the file's current contents, so the
    caller can skip the write; it returns True otherwise.

    Digests of on-disk files are cached per path and are only
    recomputed when the file's modification time differs from the one
    recorded alongside the cached digest.
    """

    def __init__(self, directory: str):
        """Cache a digest for every regular file found in directory.

        Args:
            directory: path of the directory to scan.

        Raises:
            ValueError: if file_utils.does_directory_exist() reports
                that directory does not exist.
        """
        super().__init__()
        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # Parallel caches keyed by file path: the digest of the file
        # and the mtime the file had when that digest was computed.
        self.md5_by_filename = {}
        self.mtime_by_filename = {}
        self._update()

    def _update(self) -> None:
        """Refresh the cache for every file directly inside the directory.

        Symlinks are followed, both when deciding whether an entry is a
        file and when reading its modification time.
        """
        for direntry in os.scandir(self.directory):
            if direntry.is_file(follow_symlinks=True):
                mtime = direntry.stat(follow_symlinks=True).st_mtime
                path = f'{self.directory}/{direntry.name}'
                self._update_file(path, mtime)

    def _update_file(
        self, filename: str, mtime: Optional[float] = None
    ) -> None:
        """Re-hash filename if its mtime differs from the cached one.

        Args:
            filename: path of a file that must already exist.
            mtime: the file's modification time if the caller already
                has it; when omitted it is fetched with
                file_utils.get_file_raw_mtime().
        """
        assert file_utils.does_file_exist(filename)
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        # A missing cache entry compares as None (never equal to a real
        # mtime), so a never-seen file is hashed even when its mtime
        # happens to be exactly 0.
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, item: Any, filename: str) -> bool:
        """Return True if item should be written to filename.

        Args:
            item: the content that would be written.  It is passed to
                hashlib.md5(), so it must be a bytes-like object.
            filename: path of the file that would be written.

        Returns:
            False when filename exists and its cached digest equals
            the MD5 hex digest of item; True otherwise, including when
            filename does not exist yet.
            NOTE(review): this assumes file_utils.get_file_md5()
            returns a hex digest string -- confirm.
        """
        if not file_utils.does_file_exist(filename):
            # Nothing on disk to compare against, so the write is
            # needed.  Also drop any entry left over from a file that
            # has since been removed.
            self.mtime_by_filename.pop(filename, None)
            self.md5_by_filename.pop(filename, None)
            return True
        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        return hashlib.md5(item).hexdigest() != file_md5
48
49
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that returns False if content to-be-written to a
    particular directory is identical (by MD5 digest) to any file
    currently in that same directory.
    """

    def __init__(self, directory: str):
        """Scan directory and collect the digests of all its files.

        Args:
            directory: path of the directory to scan.
        """
        # Both attributes must exist before the base constructor runs:
        # it calls _update(), which this class overrides.
        self.all_md5s = set()
        self._scanned_filenames = set()
        super().__init__(directory)

    def _update(self) -> None:
        """Rescan the directory and resynchronise all_md5s with it.

        Cache entries for paths the scan no longer reports are dropped,
        and all_md5s is rebuilt from the surviving per-file digests, so
        digests of deleted or since-modified files do not linger.
        """
        self._scanned_filenames = set()
        super()._update()
        # keys() - set builds a new set, so the dicts can be mutated
        # safely while iterating over it.
        for stale in self.md5_by_filename.keys() - self._scanned_filenames:
            del self.md5_by_filename[stale]
            self.mtime_by_filename.pop(stale, None)
        # Update in place so existing references to the set stay valid.
        self.all_md5s.clear()
        self.all_md5s.update(self.md5_by_filename.values())

    def _update_file(
        self, filename: str, mtime: Optional[float] = None
    ) -> None:
        """Refresh the cached digest of filename and record it.

        Args:
            filename: path of a file that must already exist.
            mtime: the file's modification time, if already known to
                the caller.
        """
        super()._update_file(filename, mtime)
        self._scanned_filenames.add(filename)
        md5 = self.md5_by_filename.get(filename)
        if md5 is not None:
            self.all_md5s.add(md5)

    def apply(self, item: Any, filename: Optional[str] = None) -> bool:
        """Return True if no file in the directory matches item.

        Args:
            item: the content that would be written.  It is passed to
                hashlib.md5(), so it must be a bytes-like object.
            filename: ignored; accepted only so this filter can be
                called the same way as DirectoryFileFilter.apply().

        Returns:
            False when the MD5 hex digest of item equals the digest of
            some file found by the rescan; True otherwise.
        """
        self._update()
        return hashlib.md5(item).hexdigest() not in self.all_md5s