Various
[python_utils.git] / directory_filter.py
1 #!/usr/bin/env python3
2
3 import hashlib
4 import os
5 from typing import Any, Optional
6
7
class DirectoryFileFilter(object):
    """A predicate that tells whether a file actually needs to be written.

    :meth:`apply` returns False when the content about to be written is
    identical to what the target file already holds, so the caller can
    skip the write.  It returns True otherwise, including when the target
    file does not exist yet.

    Digests of on-disk files are cached per filename and are only
    recomputed when the file's mtime differs from the cached one.
    """

    def __init__(self, directory: str):
        """Record a digest for every regular file currently in *directory*.

        Args:
            directory: path of the directory whose files are tracked.

        Raises:
            ValueError: if file_utils.does_directory_exist() rejects
                *directory*.
        """
        super().__init__()
        import file_utils
        if not file_utils.does_directory_exist(directory):
            raise ValueError(directory)
        self.directory = directory
        # filename -> digest as returned by file_utils.get_file_md5()
        self.md5_by_filename = {}
        # filename -> mtime observed when that digest was computed
        self.mtime_by_filename = {}
        self._update()

    def _update(self):
        """Refresh the cached digest of each regular file in the directory."""
        # The context manager closes the scandir iterator promptly instead
        # of leaving it open until garbage collection.
        with os.scandir(self.directory) as entries:
            for direntry in entries:
                if direntry.is_file(follow_symlinks=True):
                    mtime = direntry.stat(follow_symlinks=True).st_mtime
                    path = f'{self.directory}/{direntry.name}'
                    self._update_file(path, mtime)

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Bring the cached digest of *filename* up to date.

        Args:
            filename: path of the file to (re)hash.
            mtime: the file's current mtime if the caller already has it;
                when None it is fetched via file_utils.get_file_raw_mtime().
        """
        import file_utils
        if not file_utils.does_file_exist(filename):
            # Nothing on disk (e.g. the file is about to be created for the
            # first time): forget any cached state so apply() reports that
            # a write is needed rather than failing.
            self.mtime_by_filename.pop(filename, None)
            self.md5_by_filename.pop(filename, None)
            return
        if mtime is None:
            mtime = file_utils.get_file_raw_mtime(filename)
        # None (not 0) marks "never seen", so a file whose mtime is exactly
        # 0 still gets hashed the first time around.
        if self.mtime_by_filename.get(filename) != mtime:
            md5 = file_utils.get_file_md5(filename)
            self.mtime_by_filename[filename] = mtime
            self.md5_by_filename[filename] = md5

    @staticmethod
    def _md5_hexdigest(item: Any) -> str:
        """Return the MD5 hex digest of *item* (bytes-like, or str as UTF-8)."""
        if isinstance(item, str):
            item = item.encode('utf-8')
        return hashlib.md5(item).hexdigest()

    def apply(self, item: Any, filename: str) -> bool:
        """Decide whether *item* should be written to *filename*.

        Args:
            item: the content to be written; bytes-like, or a str which is
                hashed as its UTF-8 encoding.
            filename: path of the file the content is destined for.

        Returns:
            False if the MD5 hex digest of *item* equals the cached digest
            of *filename* (the write can be skipped); True otherwise,
            including when *filename* does not exist.
        """
        # NOTE(review): the comparison assumes file_utils.get_file_md5()
        # returns a hex digest string -- confirm against file_utils.
        self._update_file(filename)
        file_md5 = self.md5_by_filename.get(filename)
        return self._md5_hexdigest(item) != file_md5
47
48
class DirectoryAllFilesFilter(DirectoryFileFilter):
    """A predicate that returns False if the content to-be-written to a
    particular directory is identical to any file currently in that same
    directory.
    """

    def __init__(self, directory: str):
        """Record the digests of every regular file in *directory*.

        Args:
            directory: path of the directory whose files are tracked.

        Raises:
            ValueError: propagated from the base class when the directory
                is rejected.
        """
        # all_md5s must exist before the base initializer runs, because
        # that initializer calls _update(), which lands in _update_file().
        self.all_md5s = set()
        super().__init__(directory)

    def _update(self):
        """Rescan the directory and rebuild the set of known digests."""
        import file_utils
        super()._update()
        # Forget files that have disappeared since the previous scan;
        # otherwise their digests would keep suppressing writes forever.
        for filename in list(self.md5_by_filename):
            if not file_utils.does_file_exist(filename):
                self.md5_by_filename.pop(filename, None)
                self.mtime_by_filename.pop(filename, None)
        # Rebuild in place so digests of modified or removed files do not
        # linger, while keeping the identity of the all_md5s set object.
        self.all_md5s.clear()
        self.all_md5s.update(self.md5_by_filename.values())

    def _update_file(self, filename: str, mtime: Optional[float] = None):
        """Refresh the cached digest of *filename* and record it in all_md5s.

        Args:
            filename: path of the file to (re)hash.
            mtime: the file's current mtime if already known, else None.
        """
        super()._update_file(filename, mtime)
        md5 = self.md5_by_filename.get(filename)
        if md5 is not None:
            self.all_md5s.add(md5)

    def apply(self, item: Any, filename: Optional[str] = None) -> bool:
        """Decide whether *item* duplicates a file already in the directory.

        Args:
            item: the content to be written; bytes-like, or a str which is
                hashed as its UTF-8 encoding.
            filename: ignored; accepted only so this filter can be called
                with the same arguments as DirectoryFileFilter.apply().

        Returns:
            False if the MD5 hex digest of *item* matches the digest of
            some file in the directory; True otherwise.
        """
        # NOTE(review): membership assumes file_utils.get_file_md5()
        # returns a hex digest string -- confirm against file_utils.
        self._update()
        if isinstance(item, str):
            item = item.encode('utf-8')
        return hashlib.md5(item).hexdigest() not in self.all_md5s