X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=directory_filter.py;h=69e5547255e20ac66cfea81658867c8697501126;hb=532df2c5b57c7517dfb3dddd8c1358fbadf8baf3;hp=508baf3bc888cf49e832ca72feddeda1797890a0;hpb=e6f32fdd9b373dfcd100c7accb41f57d83c2f0a1;p=python_utils.git diff --git a/directory_filter.py b/directory_filter.py index 508baf3..69e5547 100644 --- a/directory_filter.py +++ b/directory_filter.py @@ -1,9 +1,19 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + +"""Two predicates that can help avoid unnecessary disk I/O by +detecting if a particular file is identical to the contents about to +be written or if a particular directory already contains a file that +is identical to the one to be written. See class docs below for +examples. + +""" + import hashlib import logging import os -from typing import Any, Optional +from typing import Any, Dict, Optional, Set logger = logging.getLogger(__name__) @@ -38,8 +48,8 @@ class DirectoryFileFilter(object): if not file_utils.does_directory_exist(directory): raise ValueError(directory) self.directory = directory - self.md5_by_filename = {} - self.mtime_by_filename = {} + self.md5_by_filename: Dict[str, str] = {} + self.mtime_by_filename: Dict[str, float] = {} self._update() def _update(self): @@ -55,20 +65,21 @@ class DirectoryFileFilter(object): assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) + assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) - logger.debug(f'Computed/stored {filename}\'s MD5 at ts={mtime} ({md5})') + logger.debug('Computed/stored %s\'s MD5 at ts=%.2f (%s)', filename, mtime, md5) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 def apply(self, item: Any, filename: str) -> bool: self._update_file(filename) file_md5 = self.md5_by_filename.get(filename, 0) - logger.debug(f'{filename}\'s checksum is {file_md5}') + logger.debug('%s\'s checksum is %s', filename, file_md5) mem_hash = hashlib.md5() mem_hash.update(item) md5 = mem_hash.hexdigest() - logger.debug(f'Item\'s checksum is {md5}') + logger.debug('Item\'s checksum is %s', md5) return md5 != file_md5 @@ -102,7 +113,7 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): """ def __init__(self, directory: str): - self.all_md5s = set() + self.all_md5s: Set[str] = set() super().__init__(directory) def _update_file(self, filename: str, mtime: Optional[float] = None): @@ -111,13 +122,15 @@ class DirectoryAllFilesFilter(DirectoryFileFilter): assert file_utils.does_file_exist(filename) if mtime is None: mtime = file_utils.get_file_raw_mtime(filename) + assert mtime is not None if self.mtime_by_filename.get(filename, 0) != mtime: md5 = file_utils.get_file_md5(filename) self.mtime_by_filename[filename] = mtime self.md5_by_filename[filename] = md5 self.all_md5s.add(md5) - def apply(self, item: Any) -> bool: + def apply(self, item: Any, ignored_filename: str = None) -> bool: + assert ignored_filename is None self._update() mem_hash = hashlib.md5() mem_hash.update(item)