3 """Find duplicate files (based on hash of contents) in a directory (or
4 tree) and deduplicate them by either deleting duplicates or (with -l)
5 symlinking duplicates to a canonical original.
10 from collections import defaultdict
12 from pyutils import bootstrap, config, string_utils
13 from pyutils.files import file_utils
# Module-level logger and command-line argument declarations.
# NOTE(review): this is an elided/garbled extraction — the original file's
# line numbers are fused onto each line, and the parser.add_argument(...)
# calls that own the help= fragments below are missing from this view.
15 logger = logging.getLogger(__name__)
16 parser = config.add_commandline_args(
17 f'Dedup Files ({__file__})',
18 'Deduplicate files based on content in a directory or recursively',
# help text for the positional start-directory glob argument
24 help='Filespec (glob) of starting directory',
# help text for the dry-run flag (referred to as -n in the final summary)
30 help='Do nothing, just say what you\'d do',
# help text for the recursive-traversal flag
36 help='Traverse recursively',
# help text for the -l flag: symlink dupes to a canonical copy instead of deleting
42 help='Instead of deleting duplicates, create symbolic links',
# Body of the (elided) main(): scan each start dir, bucket files by size,
# md5-checksum only same-size candidates, then delete — or, with -l,
# symlink — every duplicate of a chosen canonical copy.
# NOTE(review): elided extraction — the enclosing def, else: branches,
# several loop headers, and the saved/killed assignments are not visible
# here; comments below are hedged accordingly.
49 sigs = defaultdict(list)
50 sizes = defaultdict(list)
# Pass 1: group every regular, non-symlink file by its byte size.
53 for spec in config.config['start_dirs']:
54 if config.config['recursive']:
55 filez = file_utils.get_files_recursive(spec)
# (else: branch at elided original line 56 presumably selects the
# non-recursive variant below — confirm against the full file)
57 filez = file_utils.get_files(spec)
59 for filename in filez:
60 if not file_utils.is_symlink(filename) and file_utils.is_normal_file(
63 size = file_utils.get_file_size(filename)
64 sizes[size].append(filename)
# NOTE(review): uses the root logger via logging.debug here (and twice
# below) but the module-level `logger` elsewhere — should be logger.debug
# for consistency.
65 logging.debug('%d => %s', size, sizes[size])
# Pass 2: only size-buckets with >1 entry can contain dupes; checksum those.
70 logging.debug('%s (size=%d) need checksums', files, size)
71 for filename in files:
72 md5 = file_utils.get_file_md5(filename)
73 sigs[md5].append(filename)
# Pass 3: every md5 bucket with >1 entry is a set of byte-identical files.
78 logging.debug('%s are all dupes', files)
81 for dupe in files[1:]:
# Canonical-copy choice compares pathname *string* lengths, not file
# sizes — the swap itself is on an elided line (cf. the assert below);
# presumably the longest pathname wins. TODO confirm.
82 if len(dupe) > len(filename):
89 assert not file_utils.is_symlink(dupe)
90 if config.config['dry_run']:
# NOTE(review): '(unknown)' looks like a garbled placeholder — this
# presumably should print the canonical filename; verify against the
# full file before trusting this output format.
91 print(f'(unknown) == {dupe}.')
92 dry_size += file_utils.get_file_size(dupe)
# Non-dry-run path (else: at an elided line): `saved` is the kept
# canonical copy, `killed` the duplicate being removed — both assigned
# on elided lines; the actual delete call is also not visible here.
94 assert len(filename) >= len(dupe)
97 print(f'{killed} == {saved} -- DELETED')
98 logger.info('Deleting %s', killed)
100 if config.config['link']:
101 logger.info('Creating symlink from %s -> %s', saved, killed)
# os.symlink(src, dst): recreates `killed` as a symlink pointing at the
# surviving canonical copy `saved` — argument order is correct.
102 os.symlink(saved, killed)
# Dry-run summary: total bytes that a real run would have reclaimed.
107 f'Running w/o -n would have deleted {string_utils.add_thousands_separator(dry_size)} bytes from disk.'
# Script entry point; the guarded body (presumably a bootstrap-wrapped call
# into main()) is on elided lines and not visible in this view.
112 if __name__ == '__main__':