"""Report duplicate files found under the current working directory.

Files are grouped by a quick hash (file size plus the first block of
content); every group with more than one member is reported as a set of
duplicates.
"""

import hashlib
import os
from typing import Dict, List, Optional, Tuple

# Work list of paths that still have to be crawled.  It is seeded with the
# entries of the current directory when the module is loaded.
directories: List[str] = os.listdir('.')
# Paths of the files discovered so far by direcCrawler().
files: List[str] = []


def quickhash(filefullpath: str, block_size: int = 1024 * 100) -> Optional[bytes]:
    """Return a quick MD5 digest for a file, or None if it cannot be read.

    Only the first ``block_size`` bytes are read so that very large files
    stay cheap to hash.  The file size is mixed into the digest so that two
    files of different length never share a digest just because they start
    with the same bytes.

    NOTE: files of equal size that only differ after the first block still
    produce the same digest; the result is a candidate match, not proof of
    identical content.

    Args:
        filefullpath: Path of the file to hash.
        block_size: Number of leading bytes to read.

    Returns:
        The 16-byte MD5 digest, or None when the file could not be opened
        or read (an error line is printed in that case).
    """
    try:
        with open(filefullpath, "rb") as f:
            size = os.fstat(f.fileno()).st_size
            data = f.read(block_size)
    except OSError:
        print(f"Error hashing {filefullpath}")
        return None

    md5 = hashlib.md5()
    # Fixed-width size prefix keeps (size, data) unambiguous in the digest.
    md5.update(size.to_bytes(8, "big"))
    md5.update(data)
    return md5.digest()


def direcCrawler() -> bool:
    """Drain the global ``directories`` work list into the global ``files``.

    Every pending path is taken off ``directories``; directories are
    expanded into their children (joined with "/"), files are appended to
    ``files``.  Both lists are modified in place.

    Symbolic links to directories are not followed, so a link that points
    back into the tree cannot cause endless or repeated crawling.  Symbolic
    links to files are announced and still recorded.

    Returns:
        Always True.
    """
    while directories:
        # Popping instead of iterating avoids mutating the list that a
        # for-loop is walking over.
        item = directories.pop()

        if os.path.islink(item):
            if os.path.isfile(item):
                print(f'FoundSymLink: {item}')
                files.append(item)
            # Linked directories (and dangling links) are deliberately skipped.
            continue

        if os.path.isdir(item):
            try:
                children = os.listdir(item)
            except OSError:
                # Unreadable directory: report it and carry on crawling.
                print(f"Error listing {item}")
                continue
            directories.extend(f'{item}/{child}' for child in children)
        elif os.path.isfile(item):
            files.append(item)
    return True


def calchashes() -> Dict[bytes, List[str]]:
    """Group the paths in the global ``files`` list by their quick hash.

    Files that cannot be hashed (quickhash() returned None) are left out.
    The mapping is also stored in the module-level name ``filehashes``.

    Returns:
        Dict mapping each digest to the list of paths that produced it.
    """
    global filehashes
    filehashes = {}
    for path in files:
        digest = quickhash(path)
        if digest is None:
            continue
        filehashes.setdefault(digest, []).append(path)
    return filehashes


def returnduplicates(filehashes: dict) -> List[List[str]]:
    """Return the path groups from ``filehashes`` that have more than one entry."""
    return [paths for paths in filehashes.values() if len(paths) > 1]


def deleteDuplicates(duplicates) -> Tuple[List[str], List[List[str]]]:
    """Decide, for each duplicate group, which copy would be removed.

    NOTE: nothing is removed from disk here; the decisions are only printed
    and returned.

    Rules, applied per group:
      * groups with fewer than two paths are skipped;
      * groups with more than two paths are ignored;
      * for a pair, the shorter path is kept and the longer one is marked
        for deletion, provided the shorter path contains "/";
      * any other pair (equal lengths, or a shorter path without "/") is
        ignored.

    Args:
        duplicates: Iterable of path lists, as produced by returnduplicates().

    Returns:
        ``(deletion, ignored)``: the paths marked for deletion and the
        groups for which no decision was made.
    """
    ignored = []
    deletion = []
    for dupe in duplicates:
        if len(dupe) < 2:
            # Not actually a duplicate group; nothing to decide.
            continue
        if len(dupe) > 2:
            print("IGNORED\n")
            ignored.append(dupe)
            continue

        first, second = dupe
        if len(first) < len(second) and "/" in first:
            keep, drop = first, second
        elif len(second) < len(first) and "/" in second:
            keep, drop = second, first
        else:
            # No rule applies; record the pair instead of dropping it silently.
            print("IGNORED\n")
            ignored.append(dupe)
            continue

        print(f"Deleting {drop} and keeping {keep}\n")
        deletion.append(drop)

    print(f"To delete: {len(deletion)} Ignored: {len(ignored)}")
    return deletion, ignored


def main() -> None:
    """Crawl the current directory, hash the files and report duplicates."""
    # Crawl the directories (TODO: add a timeout here later)
    print("Collecting files & paths")
    while directories:
        direcCrawler()
    print(f'found {len(files)} files')

    # Calc hashes
    print("Calculating file hashes")
    filehashes = calchashes()
    duplicates = returnduplicates(filehashes)
    print(f"Found {len(duplicates)} duplicates\n\n")

    input('Press enter to continue to DELETION of duplicates..')
    deleteDuplicates(duplicates)


if __name__ == "__main__":
    main()
    # Manual check on a large file:
    # quickhash('./ubuntu-22.04.2-desktop-amd64.iso')