Added init file
This commit is contained in:
parent
c8364c2576
commit
387153cb32
64
duplicateCheck.py
Normal file
64
duplicateCheck.py
Normal file
@ -0,0 +1,64 @@
|
||||
import os, hashlib, sys
|
||||
|
||||
# Paths of regular files found so far; filled in by direcCrawler().
files = []

# Work queue of paths still to be crawled, seeded with the entries of the
# current working directory.
directories = os.listdir('.')
|
||||
|
||||
def quickhash(filefullpath, block_size=1024 * 100):
    """Return the MD5 digest of the first ``block_size`` bytes of a file.

    Only the leading block is read, so fingerprinting stays cheap even for
    very large files.  Two files whose first ``block_size`` bytes are equal
    get the same digest even if they differ further on, so equal digests
    mark *candidate* duplicates rather than proven ones.

    Args:
        filefullpath: Path of the file to fingerprint.
        block_size: Number of leading bytes to hash.  Defaults to 100 KiB.

    Returns:
        The raw MD5 digest as ``bytes``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filefullpath, "rb") as f:
        data = f.read(block_size)
    return hashlib.md5(data).digest()
|
||||
|
||||
def direcCrawler():
    """Crawl one level of the module-level ``directories`` work queue.

    Every path currently queued in ``directories`` is examined once:

    * a directory is expanded, and each of its entries is queued (as
      ``"<dir>/<entry>"``) for a later call;
    * a regular file is appended to the module-level ``files`` list;
    * anything else is dropped.

    When the call returns, ``directories`` holds only the newly discovered
    entries, so calling this function until ``directories`` is empty walks
    the whole tree.

    Returns:
        Always ``True``.

    Raises:
        OSError: If a queued directory cannot be listed.
    """
    global directories, files

    discovered = []
    # Iterate over a snapshot: the queue must not be mutated while it is
    # being looped over, otherwise entries get skipped.
    for item in list(directories):
        if os.path.isdir(item):
            for child in os.listdir(item):
                discovered.append(f'{item}/{child}')
        elif os.path.isfile(item):
            files.append(item)

    # Replace the contents in place so every reference to the queue sees
    # the remaining work; everything that was queued has now been crawled.
    directories[:] = discovered
    return True
|
||||
|
||||
def calchashes():
    """Group the paths in the module-level ``files`` list by quick hash.

    The result is also stored in the module-level ``filehashes`` variable,
    replacing any previous value.

    Returns:
        A dict mapping each digest returned by ``quickhash`` to the list of
        file paths that produced it, in the order they appear in ``files``.
    """
    global files, filehashes

    filehashes = {}
    for path in files:
        digest = quickhash(path)
        # setdefault creates the bucket on first sight of a digest, so no
        # exception handling is needed for the common non-duplicate case.
        filehashes.setdefault(digest, []).append(path)
    return filehashes
|
||||
|
||||
def returnduplicates(filehashes:dict):
    """Return the groups in ``filehashes`` that contain more than one entry.

    Args:
        filehashes: Mapping from a hash to the list of file paths sharing it.

    Returns:
        A list of the path lists whose length is greater than one, in the
        mapping's iteration order.  The lists are the mapping's own values,
        not copies.
    """
    return [group for group in filehashes.values() if len(group) > 1]
|
||||
|
||||
|
||||
|
||||
def main():
    """Crawl the queued paths, hash the files found, and print duplicates.

    Prints the list of file groups that share a quick hash.
    """
    # Keep crawling until the module-level work queue has been drained.
    # TODO: add a timeout here later.
    while directories:
        direcCrawler()

    # Hash every collected file and report the groups with more than one path.
    print(returnduplicates(calchashes()))
|
||||
|
||||
|
||||
|
||||
# Run the duplicate check only when executed as a script, so that importing
# this module does not trigger a crawl.
if __name__ == "__main__":
    main()
|
||||
|
||||
|
||||
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')
|
||||
Loading…
Reference in New Issue
Block a user