Compare commits


13 Commits

Author  SHA1        Message                Date
Sean C  a1a40673fe  append deletion        2023-04-17 00:56:04 -04:00
Sean C  f5ebce9c94  typo                   2023-04-17 00:54:47 -04:00
Sean C  702581687c  Add tally              2023-04-17 00:54:12 -04:00
Sean C  07fa4d8bb6  Algo testing           2023-04-17 00:51:29 -04:00
Sean C  17d0cfcf8e  Dry run deletion algo  2023-04-17 00:48:39 -04:00
Sean C  2c7e8b6350  forgor                 2023-04-17 00:40:31 -04:00
Sean C  a79b503f91  Look for symlinks      2023-04-17 00:40:11 -04:00
Sean C  caccc2f1fb  list dupes             2023-04-17 00:37:17 -04:00
Sean C  4e265a68f3  typo                   2023-04-17 00:35:14 -04:00
Sean C  76d8c4f01a  Handle errors          2023-04-17 00:34:32 -04:00
Sean C  d282e8d5d6  Handle errors          2023-04-17 00:33:35 -04:00
Sean C  da49b29471  Added logs             2023-04-17 00:31:46 -04:00
Sean C  387153cb32  Added init file        2023-04-17 00:24:19 -04:00

duplicateCheck.py (new normal file, 91 lines added)

@@ -0,0 +1,91 @@
import os, hashlib

directories = os.listdir('.')  # Seed the crawl with every entry in the current directory
files = []
def quickhash(filefullpath):
    # Hash only the first 100 KB of the file: fast, but identical prefixes can collide
    try:
        with open(filefullpath, "rb") as f:
            md5 = hashlib.md5()
            block_size = 1024 * 100
            data = f.read(block_size)
            md5.update(data)
            return md5.digest()
    except OSError:
        print(f"Error hashing {filefullpath}")
        return None
def direcCrawler():
    global directories, files
    for item in list(directories):  # Iterate over a snapshot; the list is mutated below
        if os.path.isdir(item):
            for nwitem in os.listdir(item):
                directories.append(f'{item}/{nwitem}')
        elif os.path.isfile(item):
            if os.path.islink(item):
                print(f'FoundSymLink: {item}')
            files.append(item)
        directories.remove(item)  # It's been crawled
    return True
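# Caveat on the crawler above: os.path.isdir() follows symlinks, so a directory
# symlink pointing back up the tree would grow `directories` forever; the
# islink check only fires for symlinks that resolve to regular files.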
def calchashes():
    global files, filehashes
    filehashes = {}
    for file in files:
        digest = quickhash(file)
        if digest is None:
            continue
        # setdefault avoids the try/except on every non-duplicate that the old
        # version flagged as inefficient
        filehashes.setdefault(digest, []).append(file)
    return filehashes
def returnduplicates(filehashes: dict):
    duplicates = []
    for paths in filehashes.values():
        if len(paths) > 1:  # Two or more paths share the same quick hash
            duplicates.append(paths)
    return duplicates
def deleteDuplicates(duplicates):
    # Dry run: builds the deletion list but never touches the disk
    ignored = []
    deletion = []
    for dupe in duplicates:
        if len(dupe) > 2:
            print("IGNORED\n")  # Groups of three or more need a human decision
            ignored.append(dupe)
        elif len(dupe[0]) < len(dupe[1]) and "/" in dupe[0]:
            print(f"Deleting {dupe[1]} and keeping {dupe[0]}\n")
            deletion.append(dupe[1])
        elif len(dupe[1]) < len(dupe[0]) and "/" in dupe[1]:
            print(f"Deleting {dupe[0]} and keeping {dupe[1]}\n")
            deletion.append(dupe[0])
        # Pairs with equal-length paths, or whose shorter path has no "/", fall through
    print(f"To delete: {len(deletion)} Ignored: {len(ignored)}")
def main():
    global files, directories
    # Crawl the directories (TODO: add a timeout here later)
    print("Collecting files & paths")
    while len(directories) > 0:
        direcCrawler()
    print(f'found {len(files)} files')
    # Calculate hashes and group duplicates
    print("Calculating file hashes")
    filehashes = calchashes()
    duplicates = returnduplicates(filehashes)
    print(f"Found {len(duplicates)} duplicate groups\n\n")
    input('Press enter to continue to DELETION of duplicates..')
    deleteDuplicates(duplicates)

if __name__ == "__main__":
    main()
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')
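The script above stops at a dry run. A minimal sketch of the confirm-and-delete step, assuming the pair groups produced by returnduplicates and the same keep-the-shorter-path rule; fullhash and confirmeddelete are hypothetical helpers, not part of this commit history:

def fullhash(path):
    # Hash the entire file in 100 KB chunks (quickhash reads only the first chunk)
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 100), b""):
            md5.update(chunk)
    return md5.digest()

def confirmeddelete(pair):
    # Hypothetical: keep the shorter path, remove the longer one, but only after
    # a full-content hash confirms the quickhash match was not a prefix collision
    keep, doom = sorted(pair, key=len)
    if fullhash(keep) == fullhash(doom):
        os.remove(doom)
        print(f"Deleted {doom}, kept {keep}")
    else:
        print(f"Prefix collision, not deleting {doom}")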