Compare commits
No commits in common. "Testing-matching" and "main" have entirely different histories.
Testing-ma
...
main
@ -1,91 +0,0 @@
|
|||||||
import os, hashlib
|
|
||||||
|
|
||||||
# Work queue of paths that still have to be crawled, seeded with the
# entries of the current working directory.
directories = os.listdir(os.curdir)

# Paths collected by direcCrawler for every entry that is a file.
files = list()
|
|
||||||
|
|
||||||
def quickhash(filefullpath, block_size=1024 * 100):
    """Return an MD5 digest of the beginning of a file.

    By default only the first ``block_size`` bytes (100 KiB) are hashed,
    which keeps hashing fast for very large files.

    NOTE: with a finite ``block_size`` two files that share the same
    leading bytes but differ later produce the same digest. Pass
    ``block_size=None`` to hash the complete file instead.

    Args:
        filefullpath: Path of the file to hash.
        block_size: Number of leading bytes to hash, or ``None`` to hash
            the whole file (read in 1 MiB chunks so memory stays bounded).

    Returns:
        The raw 16-byte MD5 digest, or ``None`` if the file could not be
        opened or read (an error line is printed in that case).
    """
    try:
        with open(filefullpath, "rb") as f:
            md5 = hashlib.md5()
            if block_size is None:
                # Whole-file mode: stream the content chunk by chunk.
                for chunk in iter(lambda: f.read(1024 * 1024), b""):
                    md5.update(chunk)
            else:
                md5.update(f.read(block_size))
            return md5.digest()
    except OSError:
        print(f"Error hashing {filefullpath}")
        return None
|
|
||||||
|
|
||||||
def direcCrawler():
    """Crawl one level of the pending paths in the global ``directories``.

    Every path currently queued in ``directories`` is examined once:

    * a directory is replaced by its children (``"<dir>/<child>"``),
    * a file is appended to the global ``files`` list,
    * anything else (e.g. a dangling symlink) is dropped.

    After the call ``directories`` holds only the children discovered in
    this pass, so calling the function repeatedly until ``directories``
    is empty walks the whole tree.

    Returns:
        Always ``True``.
    """
    global directories, files

    # Collect the next level separately instead of appending to / removing
    # from ``directories`` while iterating it, which skips entries.
    pending = []
    for item in directories:
        # NOTE(review): os.path.isdir follows symlinks, so a symlinked
        # directory is descended into - confirm this is wanted.
        if os.path.isdir(item):
            try:
                children = os.listdir(item)
            except OSError:
                # Unreadable directory: report it and keep crawling.
                print(f"Error listing {item}")
                continue
            pending.extend(f'{item}/{child}' for child in children)
        elif os.path.isfile(item):
            if os.path.islink(item):
                print(f'FoundSymLink: {item}')
            files.append(item)

    # Everything queued before this pass has been crawled; replace the
    # queue contents in place so existing references to the list stay valid.
    directories[:] = pending
    return True
|
|
||||||
|
|
||||||
def calchashes():
    """Group the paths in the global ``files`` list by their quick hash.

    Each path is hashed with ``quickhash``; paths whose hash is ``None``
    (hashing failed) are skipped. The result is also stored in the global
    ``filehashes``.

    Returns:
        A dict mapping each digest to the list of paths that produced it,
        in the order the paths appear in ``files``.
    """
    global files, filehashes
    filehashes = {}
    for path in files:
        digest = quickhash(path)
        if digest is None:
            continue
        # setdefault creates the bucket on first sight of a digest, so no
        # exception handling is needed for the common non-duplicate case.
        filehashes.setdefault(digest, []).append(path)
    return filehashes
|
|
||||||
|
|
||||||
def returnduplicates(filehashes: dict) -> list:
    """Return the groups of paths that share a hash.

    Args:
        filehashes: Mapping of digest -> list of paths with that digest.

    Returns:
        A list containing every path list from ``filehashes`` that holds
        more than one entry, in the mapping's iteration order.
    """
    return [paths for paths in filehashes.values() if len(paths) > 1]
|
|
||||||
|
|
||||||
def deleteDuplicates(duplicates):
    """Decide, for each duplicate group, which path would be deleted.

    This function only reports its decisions; it does not remove any file.

    Rules applied to each group:

    * Groups that are not exactly a pair are ignored.
    * In a pair, the shorter path is kept and the longer one is marked for
      deletion, but only when the shorter path contains a ``/``.
    * Pairs that match neither rule (equal length, or the shorter path has
      no ``/``) are ignored and counted as such, so the printed totals
      account for every group.

    Args:
        duplicates: Iterable of path groups, each a sequence of paths that
            hashed identically.
    """
    ignored = []
    deletion = []
    for dupe in duplicates:
        if len(dupe) != 2:
            print("IGNORED\n")
            ignored.append(dupe)
            continue

        first, second = dupe
        if len(first) < len(second) and "/" in first:
            print(f"Deleting {second} and keeping {first}\n")
            deletion.append(second)
        elif len(second) < len(first) and "/" in second:
            print(f"Deleting {first} and keeping {second}\n")
            deletion.append(first)
        else:
            # No rule picked a winner; report it instead of dropping the
            # pair silently.
            print("IGNORED\n")
            ignored.append(dupe)
    print(f"To delete: {len(deletion)} Ignored: {len(ignored)}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the interactive duplicate scan from start to finish.

    Drains the global ``directories`` queue with ``direcCrawler``, hashes
    the collected ``files``, reports how many duplicate groups were found,
    waits for the user to press enter and then hands the groups to
    ``deleteDuplicates``.
    """
    global files, directories

    # Crawl the directories (TD add a timeout here later)
    print("Collecting files & paths")
    while directories:
        direcCrawler()
    print(f'found {len(files)} files')

    # Calc hashes
    print("Calculating file hashes")
    hashes_by_digest = calchashes()

    dupe_groups = returnduplicates(hashes_by_digest)
    print(f"Found {len(dupe_groups)} duplicates\n\n")

    input('Press enter to continue to DELETION of duplicates..')
    deleteDuplicates(dupe_groups)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Run the scan only when this file is executed as a script, so importing
# the module does not start the interactive run or block on input().
if __name__ == "__main__":
    main()
|
|
||||||
|
|
||||||
|
|
||||||
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')
|
|
||||||
Loading…
Reference in New Issue
Block a user