Add some more examples and a convenience method in config.py for
[pyutils.git] / examples / dedup_files / dedup_files.py
1 #!/usr/bin/env python3
2
3 """Find duplicate files (based on hash of contents) in a directory (or
4 tree) and deduplicate them by either deleting duplicates or (with -l)
5 symlinking duplicates to a canonical original.
6 """
7
8 import logging
9 import os
10 from collections import defaultdict
11
12 from pyutils import bootstrap, config, string_utils
13 from pyutils.files import file_utils
14
logger = logging.getLogger(__name__)

# Register this script's command line flags via the pyutils config module.
# The parsed values are read back later through config.config[<dest>].
parser = config.add_commandline_args(
    f'Dedup Files ({__file__})',
    'Deduplicate files based on content in a directory or recursively',
)

# Positional: zero or more places to start scanning.
parser.add_argument(
    'start_dirs',
    nargs='*',
    type=str,
    help='Filespec (glob) of starting directory',
)

# -n: report what would be done without touching the disk.
parser.add_argument(
    '-n', '--dry_run',
    help="Do nothing, just say what you'd do",
    action='store_true',
)

# -R: descend into subdirectories of each start dir.
parser.add_argument(
    '-R', '--recursive',
    help='Traverse recursively',
    action='store_true',
)

# -l: leave a symlink to the surviving copy where each duplicate used to be.
parser.add_argument(
    '-l', '--link',
    help='Instead of deleting duplicates, create symbolic links',
    action='store_true',
)
44
45
@bootstrap.initialize
def main() -> int:
    """Find duplicate files under the start dirs and delete or symlink them.

    Works in three passes:

    1. Walk every entry of ``start_dirs`` (recursively with ``--recursive``)
       and bucket each regular, non-symlink file by its size.
    2. Checksum only the files that share a size with at least one other
       file and bucket those by checksum.
    3. In every checksum bucket with more than one file, keep the file with
       the longest path (the first such path on ties) and, for each other
       file, either report it (``--dry_run``), delete it, or delete it and
       replace it with a symlink to the kept file (``--link``).

    All start dirs are scanned before anything is removed, so duplicates
    that live in different start dirs are compared against each other and
    no file is checksummed after it has already been deleted.

    Returns:
        Always 0.

    Raises:
        OSError: propagated from ``os.remove`` / ``os.symlink`` if a
            duplicate cannot be removed or linked.
    """
    by_size = defaultdict(list)
    seen_real_paths = set()

    # Pass 1: collect candidates from *every* start dir before acting.
    for spec in config.config['start_dirs']:
        if config.config['recursive']:
            filez = file_utils.get_files_recursive(spec)
        else:
            filez = file_utils.get_files(spec)

        for filename in filez:
            if file_utils.is_symlink(filename) or not file_utils.is_normal_file(
                filename
            ):
                continue

            # The same physical file can be reached more than once (e.g.
            # overlapping start dirs or a symlinked directory).  Such a file
            # must never be treated as a duplicate of itself, or the only
            # copy would be deleted.
            real_path = os.path.realpath(filename)
            if real_path in seen_real_paths:
                logger.debug('%s was already seen as %s', filename, real_path)
                continue
            seen_real_paths.add(real_path)

            size = file_utils.get_file_size(filename)
            by_size[size].append(filename)
            logger.debug('%d => %s', size, by_size[size])

    # Pass 2: only files that share a size can be identical, so only those
    # are worth the cost of a checksum.
    by_checksum = defaultdict(list)
    for size, files in by_size.items():
        if len(files) > 1:
            logger.debug('%s (size=%d) need checksums', files, size)
            for filename in files:
                by_checksum[file_utils.get_file_md5(filename)].append(filename)

    # Pass 3: act on each group of identical files.
    dry_size = 0
    for files in by_checksum.values():
        if len(files) < 2:
            continue
        logger.debug('%s are all dupes', files)

        # Keep the longest path; max() returns the first one on ties.
        keeper = max(files, key=len)

        for dupe in files:
            if dupe == keeper:
                continue

            # Explicit check instead of an assert (asserts vanish under -O).
            # A path that turned into a symlink since it was scanned is left
            # alone rather than deleted.
            if file_utils.is_symlink(dupe):
                logger.warning('%s is now a symlink; skipping it', dupe)
                continue

            if config.config['dry_run']:
                print(f'{keeper} == {dupe}.')
                dry_size += file_utils.get_file_size(dupe)
                continue

            print(f'{dupe} == {keeper} -- DELETED')
            logger.info('Deleting %s', dupe)
            os.remove(dupe)
            if config.config['link']:
                # A relative symlink target is resolved relative to the
                # directory holding the link, not the current directory, so
                # link to the absolute path of the kept file.
                target = os.path.abspath(keeper)
                logger.info('Creating symlink from %s -> %s', target, dupe)
                os.symlink(target, dupe)

    if dry_size > 0:
        print(
            f'Running w/o -n would have deleted {string_utils.add_thousands_separator(dry_size)} bytes from disk.'
        )
    return 0
110
111
# Run the deduplication only when executed as a script, not when imported.
# NOTE(review): main()'s int return value is not passed to sys.exit() here;
# presumably the @bootstrap.initialize wrapper handles the exit status --
# confirm against pyutils.bootstrap.
if __name__ == '__main__':
    main()