Compare commits


13 Commits

Author  SHA1        Message                Date
Sean C  a1a40673fe  append deletion        2023-04-17 00:56:04 -04:00
Sean C  f5ebce9c94  typo                   2023-04-17 00:54:47 -04:00
Sean C  702581687c  Add tally              2023-04-17 00:54:12 -04:00
Sean C  07fa4d8bb6  Algo testing           2023-04-17 00:51:29 -04:00
Sean C  17d0cfcf8e  Dry run deletion algo  2023-04-17 00:48:39 -04:00
Sean C  2c7e8b6350  forgor                 2023-04-17 00:40:31 -04:00
Sean C  a79b503f91  Look for symlinks      2023-04-17 00:40:11 -04:00
Sean C  caccc2f1fb  list dupes             2023-04-17 00:37:17 -04:00
Sean C  4e265a68f3  typo                   2023-04-17 00:35:14 -04:00
Sean C  76d8c4f01a  Handle errors          2023-04-17 00:34:32 -04:00
Sean C  d282e8d5d6  Handle errors          2023-04-17 00:33:35 -04:00
Sean C  da49b29471  Added logs             2023-04-17 00:31:46 -04:00
Sean C  387153cb32  Added init file        2023-04-17 00:24:19 -04:00

duplicateCheck.py (new normal file, 91 lines added)

@@ -0,0 +1,91 @@
import os, hashlib

directories = os.listdir('.')  # Seed the crawl with every entry in the current directory
files = []
def quickhash(filefullpath):
    # Hash only the first 100 KB of the file: fast, but identical prefixes can collide
    try:
        with open(filefullpath, "rb") as f:
            md5 = hashlib.md5()
            block_size = 1024 * 100
            data = f.read(block_size)
            md5.update(data)
            return md5.digest()
    except OSError:
        print(f"Error hashing {filefullpath}")
        return None
def direcCrawler():
    global directories, files
    for item in list(directories):  # Iterate over a snapshot; the list is mutated below
        if os.path.isdir(item):
            for nwitem in os.listdir(item):
                directories.append(f'{item}/{nwitem}')
        elif os.path.isfile(item):
            if os.path.islink(item):
                print(f'FoundSymLink: {item}')
            files.append(item)
        directories.remove(item)  # It's been crawled
    return True
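# Caveat on the crawler above: os.path.isdir() follows symlinks, so a directory
# symlink pointing back up the tree would grow `directories` forever; the
# islink check only fires for symlinks that resolve to regular files.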
def calchashes():
    global files, filehashes
    filehashes = {}
    for file in files:
        digest = quickhash(file)
        if digest is None:
            continue
        # setdefault avoids the try/except on every non-duplicate that the old
        # version flagged as inefficient
        filehashes.setdefault(digest, []).append(file)
    return filehashes
def returnduplicates(filehashes: dict):
    duplicates = []
    for paths in filehashes.values():
        if len(paths) > 1:  # Two or more paths share the same quick hash
            duplicates.append(paths)
    return duplicates
def deleteDuplicates(duplicates):
    # Dry run: builds the deletion list but never touches the disk
    ignored = []
    deletion = []
    for dupe in duplicates:
        if len(dupe) > 2:
            print("IGNORED\n")  # Groups of three or more need a human decision
            ignored.append(dupe)
        elif len(dupe[0]) < len(dupe[1]) and "/" in dupe[0]:
            print(f"Deleting {dupe[1]} and keeping {dupe[0]}\n")
            deletion.append(dupe[1])
        elif len(dupe[1]) < len(dupe[0]) and "/" in dupe[1]:
            print(f"Deleting {dupe[0]} and keeping {dupe[1]}\n")
            deletion.append(dupe[0])
        # Pairs with equal-length paths, or whose shorter path has no "/", fall through
    print(f"To delete: {len(deletion)} Ignored: {len(ignored)}")
def main():
    global files, directories
    # Crawl the directories (TODO: add a timeout here later)
    print("Collecting files & paths")
    while len(directories) > 0:
        direcCrawler()
    print(f'found {len(files)} files')
    # Calculate hashes and group duplicates
    print("Calculating file hashes")
    filehashes = calchashes()
    duplicates = returnduplicates(filehashes)
    print(f"Found {len(duplicates)} duplicate groups\n\n")
    input('Press enter to continue to DELETION of duplicates..')
    deleteDuplicates(duplicates)

if __name__ == "__main__":
    main()
# quickhash('./ubuntu-22.04.2-desktop-amd64.iso')
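The script above stops at a dry run. A minimal sketch of the confirm-and-delete step, assuming the pair groups produced by returnduplicates and the same keep-the-shorter-path rule; fullhash and confirmeddelete are hypothetical helpers, not part of this commit history:

def fullhash(path):
    # Hash the entire file in 100 KB chunks (quickhash reads only the first chunk)
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 100), b""):
            md5.update(chunk)
    return md5.digest()

def confirmeddelete(pair):
    # Hypothetical: keep the shorter path, remove the longer one, but only after
    # a full-content hash confirms the quickhash match was not a prefix collision
    keep, doom = sorted(pair, key=len)
    if fullhash(keep) == fullhash(doom):
        os.remove(doom)
        print(f"Deleted {doom}, kept {keep}")
    else:
        print(f"Prefix collision, not deleting {doom}")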