import os
import hashlib

directories = os.listdir('.')  ## Queue of paths to crawl, seeded with the current directory
files = []


def quickhash(filefullpath):
    ## Hash only the first 100 KiB as a fast fingerprint. Files that differ
    ## only after their first 100 KiB share a fingerprint, so matches are
    ## duplicate candidates rather than guaranteed duplicates.
    try:
        with open(filefullpath, "rb") as f:
            md5 = hashlib.md5()
            block_size = 1024 * 100
            md5.update(f.read(block_size))
            return md5.digest()
    except OSError as er:
        print(f"Error hashing {filefullpath}: {er}")
        return None


def direcCrawler():
    ## Drain the current queue: record files, and queue the contents of
    ## subdirectories for the next pass. Swapping in a fresh list avoids
    ## mutating `directories` while iterating over it, which skipped entries.
    global directories, files
    pending = directories
    directories = []
    for item in pending:
        if os.path.isdir(item):
            try:
                for nwitem in os.listdir(item):
                    directories.append(os.path.join(item, nwitem))
            except OSError as er:
                print(f"Error listing {item}: {er}")
        elif os.path.isfile(item):
            files.append(item)
    return True


def calchashes():
    filehashes = {}
    for file in files:
        digest = quickhash(file)
        if digest is None:
            continue
        ## setdefault avoids catching an exception on every non-duplicate
        ## (most of the files)
        filehashes.setdefault(digest, []).append(file)
    return filehashes


def returnduplicates(filehashes: dict):
    ## Any fingerprint shared by more than one file is a duplicate group
    return [group for group in filehashes.values() if len(group) > 1]


def main():
    # Crawl the directories (TODO: add a timeout here later)
    print("Collecting files & paths")
    while len(directories) > 0:
        direcCrawler()
    print(f"found {len(files)} files")
    # Calc hashes
    print("Calculating file hashes")
    filehashes = calchashes()
    duplicates = returnduplicates(filehashes)
    print(f"Found {len(duplicates)} duplicate groups")
    for dupe in duplicates:
        print(dupe)


main()
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')
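

## Because quickhash fingerprints only the first 100 KiB, two distinct files
## that begin with the same 100 KiB would be reported as duplicates. Below is
## a minimal sketch of an optional confirmation pass: `fullhash` and
## `confirmduplicates` are hypothetical helpers, not part of the script above.
## They re-hash each candidate group over the files' complete contents and
## keep only groups that still match.
def fullhash(filefullpath):
    ## Stream the whole file through MD5 in 100 KiB chunks
    try:
        md5 = hashlib.md5()
        with open(filefullpath, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 100), b""):
                md5.update(chunk)
        return md5.digest()
    except OSError as er:
        print(f"Error hashing {filefullpath}: {er}")
        return None


def confirmduplicates(candidate_groups):
    ## Re-group each candidate set by full-content hash; only groups that
    ## still contain more than one file are genuine duplicates.
    confirmed = []
    for group in candidate_groups:
        full = {}
        for file in group:
            digest = fullhash(file)
            if digest is not None:
                full.setdefault(digest, []).append(file)
        confirmed.extend(g for g in full.values() if len(g) > 1)
    return confirmed

## Possible usage inside main(), after returnduplicates:
##     duplicates = confirmduplicates(duplicates)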