# DuplicateMovieFinder/duplicateCheck.py

import os, hashlib
directories = os.listdir('.') ## All entries under the cwd; dirs get crawled, files get collected
files = []

def quickhash(filefullpath):
    ## Hash only the first 100 KiB of the file: fast, but files sharing a prefix collide
    try:
        with open(filefullpath, "rb") as f:
            md5 = hashlib.md5()
            block_size = 1024 * 100
            data = f.read(block_size)
            md5.update(data)
            return md5.digest()
    except OSError as er:
        print(f"Error hashing {filefullpath}: {er}")
        return None
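
## Because quickhash only reads the first 100 KiB, two different files that
## merely share a prefix would be reported as duplicates. A minimal full-file
## hash sketch (not called by main) that could confirm candidates first:
def fullhash(filefullpath):
    md5 = hashlib.md5()
    with open(filefullpath, "rb") as f:
        # Read in 100 KiB chunks until EOF so large files never load whole
        for chunk in iter(lambda: f.read(1024 * 100), b""):
            md5.update(chunk)
    return md5.digest()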

def direcCrawler():
    global directories, files
    ## Iterate over a snapshot: mutating a list while looping over it skips entries
    for item in list(directories):
        if os.path.isdir(item):
            for nwitem in os.listdir(item):
                directories.append(f'{item}/{nwitem}')
        elif os.path.isfile(item):
            files.append(item)
        directories.remove(item)  # It's been crawled
    return True
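
## The crawl above re-scans a shared global list until main drains it. A
## simpler sketch (not called by main) that collects the same file paths in
## one pass using the standard-library walker:
def walk_files(root='.'):
    found = []
    for dirpath, dirnames, filenames in os.walk(root):
        for name in filenames:
            found.append(os.path.join(dirpath, name))
    return found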

def calchashes():
    global files
    filehashes = {}
    for file in files:
        digest = quickhash(file)
        if digest is None: continue
        ## setdefault avoids catching an exception on every non-duplicate (most of the files)
        filehashes.setdefault(digest, []).append(file)
    return filehashes
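
## A further speed-up sketch (not called by main): files with unique sizes
## cannot be duplicates, so grouping by size first would skip hashing most files.
def sizegroups(paths):
    groups = {}
    for path in paths:
        try:
            groups.setdefault(os.path.getsize(path), []).append(path)
        except OSError:
            continue
    # Keep only sizes shared by more than one file; only these need hashing
    return {size: grp for size, grp in groups.items() if len(grp) > 1}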

def returnduplicates(filehashes: dict):
    duplicates = []
    for paths in filehashes.values():
        if len(paths) > 1:
            duplicates.append(paths)
    return duplicates

def main():
    global files, directories
    # Crawl the directories (TODO: add a timeout here later)
    print("Collecting files & paths")
    while len(directories) > 0:
        direcCrawler()
    print(f'found {len(files)} files')
    # Calc hashes
    print("Calculating file hashes")
    filehashes = calchashes()
    duplicates = returnduplicates(filehashes)
    print(f"Found {len(duplicates)} sets of duplicate files")

if __name__ == "__main__":
    main()
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')