3 """Find duplicate files (based on hash of contents) in a directory (or
4 tree) and deduplicate them by either deleting duplicates or (with -l)
5 symlinking duplicates to a canonical original.
10 from collections import defaultdict
12 from pyutils import bootstrap, config, string_utils
13 from pyutils.files import file_utils
15 logger = logging.getLogger(__name__)
16 parser = config.add_commandline_args(
17 f'Dedup Files ({__file__})',
18 'Deduplicate files based on their contents below (one or more) directory(ies) [and, optionally, recursively]',
24 help='Filespec (glob) of root directory at which to operate, may include several',
30 help='Do nothing, just say what you\'d do',
36 help='Traverse recursively below root directory(ies)',
42 help='Instead of deleting identified duplicates, create symbolic link back to first source',
49 sigs = defaultdict(list)
50 sizes = defaultdict(list)
53 for spec in config.config['root_dir']:
54 if config.config['recursive']:
55 filez = file_utils.get_files_recursive(spec)
57 filez = file_utils.get_files(spec)
59 for filename in filez:
60 if not file_utils.is_symlink(filename) and file_utils.is_normal_file(
63 size = file_utils.get_file_size(filename)
64 sizes[size].append(filename)
65 logging.debug('%d => %s', size, sizes[size])
70 logging.debug('%s (size=%d) need checksums', files, size)
71 for filename in files:
72 md5 = file_utils.get_file_md5(filename)
73 sigs[md5].append(filename)
78 logging.debug('%s are all dupes', files)
81 for dupe in files[1:]:
82 if len(dupe) > len(filename):
89 assert not file_utils.is_symlink(dupe)
90 if config.config['dry_run']:
91 print(f'{filename} == {dupe} (WOULD DELETE {dupe})')
92 if config.config['link']:
94 f'{filename} <- {dupe} (WOULD SYMLINK {dupe} to {filename})'
96 dry_size += file_utils.get_file_size(dupe)
98 assert len(filename) >= len(dupe)
101 print(f'{killed} == {saved} (DELETED {killed})')
102 logger.info('Deleting %s', killed)
104 if config.config['link']:
105 print(f'{saved} <- {killed} (SYMLINK)')
106 logger.info('Creating symlink from %s -> %s', saved, killed)
107 os.symlink(saved, killed)
112 f'Running w/o -n would have deleted {string_utils.add_thousands_separator(dry_size)} bytes from disk.'
117 if __name__ == '__main__':