Various sundry changes.
[python_utils.git] / directory_filter.py
1 #!/usr/bin/env python3
2
3 import hashlib
4 import os
5 from typing import Any, Optional
6
7 import predicate
8 import file_utils
9
10
class DirectoryFileFilter(predicate.Predicate):
    """A predicate that tells the caller whether a proposed write is needed.

    The filter keeps a cache of checksums for the regular files directly
    inside one directory, keyed by path and refreshed whenever a file's
    mtime changes.  apply() returns False when the content about to be
    written is identical to what the target file already holds (so the
    write can be skipped) and True otherwise.
    """

    def __init__(self, directory: str):
        """Build the checksum cache for every file in directory.

        Args:
            directory: path of the directory to track.

        Raises:
            ValueError: if file_utils reports that directory does not exist.
        """
        super().__init__()
        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # Both caches are keyed by the path string passed to _update_file.
        self.md5_by_filename = {}
        self.mtime_by_filename = {}
        self._update()

    def _update(self):
        """Refresh the cache entry of each regular file in the directory."""
        # The context manager closes the scandir iterator even if hashing
        # one of the entries raises part-way through the scan.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    path = f'{self.directory}/{direntry.name}'
                    self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Refresh the cached mtime and checksum of one file.

        Args:
            filename: path of the file to (re)examine.
            mtime: the file's modification time if the caller already has
                it; looked up via file_utils when omitted.
        """
        if not file_utils.does_file_exist(filename):
            # apply() is asked about files that are about to be written, so
            # a missing file is an expected case rather than a programming
            # error.  Forget anything cached for it so that a file deleted
            # since the last look is not treated as still holding its old
            # content.
            self.mtime_by_filename.pop(filename, None)
            self.md5_by_filename.pop(filename, None)
            return
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        # Compare against None (not 0) for "never seen" so that a file whose
        # mtime is exactly 0 still gets hashed the first time around.
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, item: Any, filename: str) -> bool:
        """Decide whether item differs from the current content of filename.

        Args:
            item: the content proposed for writing.  It is fed directly to
                hashlib.md5, so it must be a bytes-like object.
            filename: path of the file the content would be written to.

        Returns:
            False if the md5 hex digest of item equals the cached checksum
            of filename; True otherwise, including when filename does not
            exist.  NOTE(review): this assumes file_utils.get_file_md5
            returns a hex digest string -- confirm against file_utils.
        """
        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        md5 = hashlib.md5(item).hexdigest()
        return md5 != file_md5
49
50
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that will return False if a file to-be-written to a
    particular directory is identical to any other file in that same
    directory.
    """

    def __init__(self, directory: str):
        """Build the set of checksums of every file in directory.

        Args:
            directory: path of the directory to track.

        Raises:
            ValueError: propagated from the parent constructor when the
                directory does not exist.
        """
        # all_md5s has to exist before the parent constructor runs because
        # that constructor scans the directory, which calls back into this
        # class's _update_file.
        self.all_md5s = set()
        super().__init__(directory)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Refresh one file's cache entry and keep all_md5s in step with it.

        Args:
            filename: path of the file to (re)examine.
            mtime: the file's modification time if already known.
        """
        previous_md5 = self.md5_by_filename.get(filename)
        # The per-file mtime/md5 bookkeeping is shared with the parent class.
        super()._update_file(filename, mtime)
        current_md5 = self.md5_by_filename.get(filename)
        if current_md5 == previous_md5:
            return
        # The file's content changed.  Retire its old checksum unless some
        # other tracked file still has it; otherwise content that is no
        # longer present in the directory would keep being rejected.
        if (
            previous_md5 is not None
            and previous_md5 not in self.md5_by_filename.values()
        ):
            self.all_md5s.discard(previous_md5)
        if current_md5 is not None:
            self.all_md5s.add(current_md5)

    def apply(self, item: Any) -> bool:
        """Decide whether item duplicates any file already in the directory.

        Args:
            item: the content proposed for writing.  It is fed directly to
                hashlib.md5, so it must be a bytes-like object.

        Returns:
            False if the md5 hex digest of item matches the checksum of a
            tracked file, True otherwise.
        """
        # Rescan so files added or modified since the last call are seen.
        # NOTE(review): the rescan only visits files that currently exist,
        # so checksums of files deleted from the directory may linger in
        # all_md5s -- confirm whether that matters to callers.
        self._update()
        md5 = hashlib.md5(item).hexdigest()
        return md5 not in self.all_md5s
78