Python搜寻重覆档案 hash

搭配 os.walk 与 hashlib 计算档案的 hash 值,搜寻指定资料夹及其子目录下内容重覆的档案(以图档为例)。(Source Code download)
依档案类型搜寻(for 回圈筛选副档名)的做法,请参考上一篇。

import os, hashlib
#--- Find duplicate files
def _file_md5(path, chunk_size=1 << 20):
    """Return the raw MD5 digest of the file at *path*.

    The file is read in ``chunk_size``-byte pieces so large files are not
    loaded into memory at once, and it is closed as soon as hashing ends.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.digest()


def findOverlap( nPath, fTypes ):
    """Walk *nPath* recursively and report files with identical content.

    Args:
        nPath: Directory at which the search starts; sub-directories are
            visited via ``os.walk``.
        fTypes: Container of file extensions (without the leading dot,
            e.g. ``['jpg', 'png']``).  A file is considered when the text
            after its last ``.`` is in this container (case-sensitive).

    Returns:
        A tuple ``(allimage, overlapA, overlapB)``:

        * ``allimage`` -- path of every matching file, built from the
          walked directory name and the file name.
        * ``overlapA`` -- absolute paths of files whose MD5 digest had
          already been seen earlier in the walk.
        * ``overlapB`` -- for each entry of ``overlapA`` (same index), the
          absolute path of the first file seen with that digest.
    """
    allimage = []
    allhsh = dict()    # key: MD5 digest / value: absolute path of first file seen
    overlapA = []      # duplicate file, location A (the later one)
    overlapB = []      # duplicate file, location B (the first one seen)
    f_tree = os.walk(nPath)
    # os.walk returns a generator
    print(f'return a generator: {type(f_tree)}')

    for dirname, subdir, files in f_tree:
        # Descend one directory level at a time.
        print(f'file count of this folder: {len(files)}')
        for file in files:
            ext = file.split('.')[-1]
            # Filter on the caller-supplied extensions, not on a module
            # global, so the function works for any fTypes argument.
            if ext not in fTypes:
                continue
            img = os.path.join(dirname, file)
            allimage.append(img)

            imghsh = _file_md5(img)
            fname = os.path.abspath(img)
            if imghsh in allhsh:
                # Digest already known: record both locations of the pair.
                overlapA.append(fname)
                overlapB.append(allhsh[imghsh])
            else:
                # First time this content is seen: remember where.
                allhsh[imghsh] = fname

    return allimage, overlapA, overlapB
#--- Main flow -----
# Directory to search; falls back to the current working directory when the
# user enters nothing.
pathHere = os.getcwd()  # current working directory
path = input('从哪个资料夹 开始搜寻 ? ') or pathHere
print(f'搜寻资料夹: {path} (含子目录)图档')

# File extensions (without the dot) to include in the search.
filetypes = ['jpg', 'png', 'bmp', 'jpeg']
iFile, overA, overB = findOverlap( path, filetypes )

print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}')

if overA:
    print("找到下列重覆的档案:")
    for dupA, dupB in zip(overA, overB):
        print(f'位置A: {dupA}\n位置B: {dupB}')

# Save the report as overlap.txt in the current working directory.
# os.path.join avoids the invalid '\o' escape and produces a correct path on
# every platform; the with-block closes the file even if a write fails.
with open(os.path.join(pathHere, 'overlap.txt'), 'w', encoding='utf-8') as f:
    print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}', file=f)
    print("找到下列重覆的档案:", file=f)
    for dupA, dupB in zip(overA, overB):
        print(f'位置A: {dupA}\n位置B: {dupB}\n', file=f)

<<:  【清新温泉饭店 - 新采自助百汇 Freshfields in Taichung】#新北市宣布9/9到9/15禁止内用

>>:  裸机Hyperviser之间比较

【DAY 1】Microsoft 365 ,365天天都用的到的生产力工具

什麽是Microsoft 365? (+Microsoft 365 开发人员计画 (Microsof...

[Day 24] Edge Impulse + BLE Sense实现手势动作辨识(上)

有了先前[Day 20][Day 21][Day 22]「Edge Impulse + BLE Se...

EP 09 - [TDD] Message 加密及解密 (1/2)

Youtube 频道:https://www.youtube.com/c/kaochenlong ...

大共享时代系列_026_第三方物流(Third-Party logistics,3PL)

仓储+物流,术有专攻,让专业的来~ 降低电商营运时要租赁仓储、开发库存系统等的门槛~ 通通外包给第三...

D22 - 用 Swift 和公开资讯,打造投资理财的 Apps { 台股成交量实作.2 }

上一篇在 TwMarketTradingInfoManager 完成了拿取大盘成交量的 API,接下...