Reduce import scopes, remove cycles.
[python_utils.git] / directory_filter.py
1 #!/usr/bin/env python3
2
3 import hashlib
4 import os
5 from typing import Any, Optional
6
7
class DirectoryFileFilter(object):
    """A predicate that tells a caller whether writing some content to a
    file is worthwhile.

    :meth:`apply` returns False when the content to be written is
    identical to what the file already holds (so the write can be
    skipped) and True otherwise.  File checksums are cached per filename
    and only recomputed when the file's mtime changes.

    Attributes:
        directory: the directory that was scanned at construction time.
        md5_by_filename: cached checksum for each file seen so far.
        mtime_by_filename: the mtime at which each checksum was taken.
    """

    def __init__(self, directory: str):
        """Scan *directory* and cache a checksum for every file in it.

        Raises:
            ValueError: if file_utils reports that *directory* does not
                exist.
        """
        # file_utils is imported at function scope rather than at module
        # scope throughout this class; keep it that way.
        import file_utils
        super().__init__()
        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        self.md5_by_filename = {}
        self.mtime_by_filename = {}
        self._update()

    def _update(self):
        """Refresh the cache for every regular file in the directory
        (symlinks are followed; subdirectories are not descended into).
        """
        # The context manager releases the directory handle promptly
        # even if hashing one of the files raises part-way through.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    path = f'{self.directory}/{direntry.name}'
                    self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Bring the cached checksum of *filename* up to date.

        Args:
            filename: path of the file to (re)examine.
            mtime: the file's modification time if the caller already
                has it; otherwise it is looked up via file_utils.
        """
        import file_utils
        if not file_utils.does_file_exist(filename):
            # A file that has not been written yet (or has since been
            # removed) has no contents to compare against.  Forget any
            # stale cache entry so apply() reports "differs".
            self.mtime_by_filename.pop(filename, None)
            self.md5_by_filename.pop(filename, None)
            return
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        # Test for a missing checksum explicitly instead of using 0 as
        # the "never seen" mtime: a file whose mtime really is the epoch
        # must still be hashed the first time it is seen.
        if (
            filename not in self.md5_by_filename
            or self.mtime_by_filename.get(filename) != mtime
        ):
            md5 = file_utils.get_file_md5(filename)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    def apply(self, item: Any, filename: str) -> bool:
        """Decide whether *item* should be written to *filename*.

        Args:
            item: the content to be written; it is fed to hashlib.md5,
                so it must be a bytes-like object.
            filename: the file the content is destined for.

        Returns:
            False if the file's cached checksum equals the MD5 hex
            digest of *item*; True otherwise, including when the file
            does not exist.
        """
        self._update_file(filename)
        # NOTE(review): this comparison presumes file_utils.get_file_md5
        # returns a hex digest string -- confirm against file_utils.
        file_md5 = self.md5_by_filename.get(filename, 0)
        md5 = hashlib.md5(item).hexdigest()
        return md5 != file_md5
48
49
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that returns False when content about to be written
    into a directory is identical to the content of any file currently
    in that directory, and True otherwise.

    Attributes:
        all_md5s: the set of checksums of the files known to be in the
            directory; refreshed on every call to :meth:`apply`.
    """

    def __init__(self, directory: str):
        """Scan *directory* and collect the checksum of every file in it.

        Raises:
            ValueError: propagated from the base class when *directory*
                does not exist.
        """
        # all_md5s must exist before super().__init__() runs: the base
        # constructor scans the directory, which calls back into the
        # _update() / _update_file() overrides below.
        self.all_md5s = set()
        super().__init__(directory)

    def _update(self):
        """Rescan the directory, then rebuild all_md5s from the files
        that still exist so that checksums belonging to modified or
        deleted files no longer count as duplicates.
        """
        import file_utils
        super()._update()
        # Copy the keys: entries are deleted while walking them.
        for filename in list(self.md5_by_filename):
            if not file_utils.does_file_exist(filename):
                del self.md5_by_filename[filename]
                self.mtime_by_filename.pop(filename, None)
        # Rebuild in place so existing references to the set stay valid.
        self.all_md5s.clear()
        self.all_md5s.update(self.md5_by_filename.values())

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Refresh the cached checksum of *filename* via the base class
        and record that checksum in all_md5s.
        """
        super()._update_file(filename, mtime)
        md5 = self.md5_by_filename.get(filename)
        if md5 is not None:
            self.all_md5s.add(md5)

    def apply(self, item: Any, filename: Optional[str] = None) -> bool:
        """Decide whether *item* is new to the directory.

        Args:
            item: the content to be written; it is fed to hashlib.md5,
                so it must be a bytes-like object.
            filename: ignored.  Accepted only so this method can be
                called with the same arguments as the base class's
                apply().

        Returns:
            False if some file in the directory already has the same
            MD5 checksum as *item*; True otherwise.
        """
        self._update()
        md5 = hashlib.md5(item).hexdigest()
        return md5 not in self.all_md5s