DuplicateMovieFinder/duplicateCheck.py
2023-04-17 00:48:39 -04:00

87 lines
2.4 KiB
Python

import os, hashlib
# Work list of paths that still have to be crawled. It starts out as the
# entries (files *and* sub-directories) of the current working directory;
# direcCrawler() removes entries from it and adds newly discovered ones.
directories = os.listdir('.') ## Entries of the current directory
# Paths of the regular files found so far; direcCrawler() appends to it.
files = []
def quickhash(filefullpath, block_size=1024 * 100):
    """Return an MD5 digest of the first ``block_size`` bytes of a file.

    Only the leading block is read, so this is a fast fingerprint and not
    a full-content hash: two files that start with the same bytes get the
    same digest even if they differ further on.

    Args:
        filefullpath: Path of the file to fingerprint.
        block_size: Number of leading bytes to hash. Defaults to 100 KiB.
            A negative value makes ``read`` consume the whole file, which
            turns this into a full-content hash.

    Returns:
        The raw 16-byte MD5 digest, or ``None`` when the file could not be
        opened or read (an error line is printed in that case).
    """
    try:
        # Keep the try body down to the I/O that can actually fail.
        with open(filefullpath, "rb") as f:
            data = f.read(block_size)
    except OSError:
        print(f"Error hashing {filefullpath}")
        return None
    return hashlib.md5(data).digest()
def direcCrawler():
    """Crawl one level of the pending entries in the global work list.

    Every path currently in ``directories`` is taken off the list and
    classified:

    * a directory has its children queued (as ``"<dir>/<child>"``) for the
      next call;
    * a regular file is appended to ``files`` (a notice is printed first
      when it is a symbolic link);
    * anything else is dropped.

    Callers are expected to invoke this repeatedly until ``directories``
    is empty.

    Returns:
        Always ``True``.
    """
    global directories, files
    # Work on a snapshot and empty the shared list in place: appending to
    # or removing from a list while iterating over it skips entries.
    pending = list(directories)
    directories.clear()
    for item in pending:
        if os.path.isdir(item):
            try:
                children = os.listdir(item)
            except OSError:
                # e.g. permission denied or the directory vanished; keep
                # crawling the rest instead of aborting the whole run.
                print(f"Error listing {item}")
                continue
            # NOTE(review): os.path.isdir() follows symlinks, so linked
            # directories are crawled too and their files can show up
            # twice -- confirm whether that is wanted.
            directories.extend(f'{item}/{child}' for child in children)
        elif os.path.isfile(item):
            if os.path.islink(item):
                print(f'FoundSymLink: {item}')
            files.append(item)
    return True
def calchashes():
    """Group every path in the global ``files`` list by its quick hash.

    The result is also stored in the module-level ``filehashes`` name.

    Returns:
        A dict mapping each digest returned by ``quickhash`` to the list
        of paths that produced it, in the order they appear in ``files``.
        Paths for which ``quickhash`` returned ``None`` are left out.
    """
    global files, filehashes
    filehashes = {}
    for path in files:
        digest = quickhash(path)
        if digest is None:
            # The file could not be read; quickhash already reported it.
            continue
        # setdefault creates the bucket on first sight of a digest, so no
        # exception handling is needed for the common non-duplicate case.
        filehashes.setdefault(digest, []).append(path)
    return filehashes
def returnduplicates(filehashes: dict):
    """Return the groups of paths that share a hash.

    Args:
        filehashes: Mapping of digest -> list of paths with that digest.

    Returns:
        A list holding every value of ``filehashes`` that has more than
        one entry, in the mapping's iteration order. The inner lists are
        the same objects as in ``filehashes``, not copies.
    """
    return [paths for paths in filehashes.values() if len(paths) > 1]
def deleteDuplicates(duplicates):
    """Walk the duplicate groups and prompt about which copy to keep.

    Each group is printed. Groups with more than two paths are only
    reported. For a pair, the shorter path is proposed as the one to keep,
    but only when that shorter path contains a "/"; otherwise nothing is
    asked for that pair.

    Despite the name, no file is removed here: the answer typed at each
    prompt is discarded.

    Args:
        duplicates: Iterable of lists of paths, as built by
            ``returnduplicates``.
    """
    for group in duplicates:
        print(f'{group}')
        if len(group) > 2:
            print(f"Deal with this on ur own: {group}")
            continue
        first, second = group[0], group[1]
        # Try both orderings; the first candidate that is strictly shorter
        # and lives in a sub-directory is the one proposed for keeping.
        for keep, drop in ((first, second), (second, first)):
            if len(keep) < len(drop) and "/" in keep:
                input(f"Delete {drop} and keep {keep}?")
                break
def main():
    """Run the whole pipeline: crawl, hash, report, then prompt.

    Drains the global ``directories`` work list with ``direcCrawler``,
    hashes the collected ``files``, prints how many duplicate groups were
    found and, after the user presses enter, hands those groups to
    ``deleteDuplicates``.
    """
    # Crawl the directories (TD add a timeout here later)
    print("Collecting files & paths")
    while directories:
        direcCrawler()
    print(f'found {len(files)} files')

    # Calc hashes
    print("Calculating file hashes")
    duplicates = returnduplicates(calchashes())
    print(f"Found {len(duplicates)} duplicates\n\n")

    input('Press enter to continue to DELETION of duplicates..')
    deleteDuplicates(duplicates)
# Only start crawling when executed as a script, so that importing this
# module does not walk the current directory and prompt the user.
if __name__ == "__main__":
    main()
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')