diff --git a/duplicateCheck.py b/duplicateCheck.py
new file mode 100644
index 0000000..27d0aa7
--- /dev/null
+++ b/duplicateCheck.py
@@ -0,0 +1,66 @@
+import os
+import hashlib
+
+# Paths still waiting to be crawled, seeded with the current directory's entries
+directories = os.listdir('.')
+
+files = []
+
+
+def quickhash(filefullpath):
+    """Hash only the first 100 KiB of a file for a fast, approximate fingerprint."""
+    block_size = 1024 * 100
+    md5 = hashlib.md5()
+    with open(filefullpath, "rb") as f:
+        md5.update(f.read(block_size))
+    return md5.digest()
+
+
+def direcCrawler():
+    """Do one crawl pass: queue the contents of pending directories, collect files."""
+    global directories, files
+    # Work on a snapshot so the pending list is not modified while iterating over it
+    pending = directories
+    directories = []
+    for item in pending:
+        if os.path.isdir(item):
+            for nwitem in os.listdir(item):
+                directories.append(f'{item}/{nwitem}')
+        elif os.path.isfile(item):
+            files.append(item)
+    return True
+
+
+def calchashes():
+    """Group file paths by the quick hash of their contents."""
+    filehashes = {}
+    for filepath in files:
+        digest = quickhash(filepath)
+        # setdefault avoids raising an exception for every non-duplicate file
+        filehashes.setdefault(digest, []).append(filepath)
+    return filehashes
+
+
+def returnduplicates(filehashes: dict):
+    """Return the groups of paths whose hashes collide, i.e. the likely duplicates."""
+    duplicates = []
+    for paths in filehashes.values():
+        if len(paths) > 1:
+            duplicates.append(paths)
+    return duplicates
+
+
+def main():
+    # Crawl until no pending paths remain (TODO: add a timeout here later)
+    while len(directories) > 0:
+        direcCrawler()
+
+    # Hash every file found and report the groups that collide
+    filehashes = calchashes()
+    print(returnduplicates(filehashes))
+
+
+if __name__ == "__main__":
+    main()
+
+# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')
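
A minimal smoke-test sketch one might run against this script, assuming it is saved as duplicateCheck.py and importable from the test's working directory; the scratch directory layout and file names below are illustrative only, not part of the diff.

import os
import tempfile

import duplicateCheck  # the module added in this diff; the __main__ guard keeps main() from running on import

original_cwd = os.getcwd()
with tempfile.TemporaryDirectory() as scratch:
    # Two files with identical bytes, one of them inside a subdirectory
    os.makedirs(os.path.join(scratch, "sub"))
    for relpath in ("a.txt", os.path.join("sub", "b.txt")):
        with open(os.path.join(scratch, relpath), "wb") as f:
            f.write(b"same bytes in both files")

    os.chdir(scratch)
    try:
        # Re-seed the module-level state for the scratch directory, then crawl it
        duplicateCheck.directories = os.listdir('.')
        duplicateCheck.files = []
        while duplicateCheck.directories:
            duplicateCheck.direcCrawler()

        groups = duplicateCheck.returnduplicates(duplicateCheck.calchashes())
        print(groups)  # expected: one group containing 'a.txt' and 'sub/b.txt'
    finally:
        os.chdir(original_cwd)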