#!/usr/bin/python
'''
Created on 1 Jun 2010

@author: kreczko

Email: kreczko@cern.ch
'''

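# Overview (inferred from the code below): the script groups the files in a
# given directory by the job number encoded in their names and, for every job
# number that occurs more than once, reports all but one file so that the
# duplicates can be removed.
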
from optparse import OptionParser
import os
import copy
import glob

# job numbers that occur more than once among the input files
duplicates = []
# maps a duplicated job number to the list of files produced for that job
duplicateFiles = {}

# return all .root files directly below the given directory (not used in __main__)
def getROOTFiles(path):
    path += "/*.root"
    files = glob.glob(path)
    return files

def getUniqueFiles(files):
    if listContainsDuplicates(files):
        findDuplicates(files)
    else:
        return files
    uniqueFiles = copy.copy(files)
    for values in duplicateFiles.itervalues():
        # drop every file of this duplicated job number, then add back
        # only the last file after sorting
        for value in values:
            uniqueFiles.remove(value)
        values.sort()
        uniqueFiles.append(values[-1])
    return uniqueFiles

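# Illustration of the duplicate handling above (file names are hypothetical):
# if job 123 produced both 'ntuple_123_1_aaa.root' and 'ntuple_123_2_bbb.root',
# both are removed from uniqueFiles and only 'ntuple_123_2_bbb.root', the last
# entry after sorting, is added back.
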
# record every job number that is seen more than once in the global
# 'duplicates' list and report whether any were found
def listContainsDuplicates(files):
    seen = []
    for item in files:
        jobnumber = extractJobnumber(item)
        if jobnumber in seen:
            duplicates.append(jobnumber)
        else:
            seen.append(jobnumber)
    return len(duplicates) > 0

def findDuplicates(files):
    for file in files:
        for job in duplicates:
            if job == extractJobnumber(file):
                addDuplicate(job, file)

def extractJobnumber(file):
    jobnumber = file.split('_')[-3]
    return int(jobnumber)

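# extractJobnumber assumes grid-job output names in which the job number is
# the third-to-last '_'-separated token; for a hypothetical name such as
# 'ntuple_123_2_abc.root' it would return 123. The example name is only an
# illustration, not taken from this script.
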
def addDuplicate(jobnumber, file):
    if not duplicateFiles.has_key(jobnumber):
        duplicateFiles[jobnumber] = []
    duplicateFiles[jobnumber].append(file)

def getDuplicateFiles(allFiles, uniqueFiles):
    filesToRemove = [file for file in allFiles if file not in uniqueFiles]
    return filesToRemove

70 |
|
|
if __name__ == "__main__":
|
71 |
|
|
parser = OptionParser()
|
72 |
|
|
(options, args) = parser.parse_args()
|
73 |
|
|
if len(args) >0:
|
74 |
|
|
path = args[0]
|
75 |
|
|
files = os.listdir(path)
|
76 |
|
|
files.sort()
|
77 |
|
|
uniqueFiles = getUniqueFiles(files)
|
78 |
|
|
uniqueFiles.sort()
|
79 |
|
|
duplicateFiles = getDuplicateFiles(files, uniqueFiles)
|
80 |
|
|
print 'Number of file in path:', len(files)
|
81 |
|
|
print 'Number of unique files', len(uniqueFiles)
|
82 |
|
|
print 'Number of duplicate files:', len(duplicateFiles)
|
83 |
|
|
if len(duplicateFiles) > 0:
|
84 |
|
|
print 'Files to remove:'
|
85 |
|
|
for file in duplicateFiles:
|
86 |
|
|
print path + file
|
87 |
|
|
else:
|
88 |
|
|
print 'File path was not specified. Use script "./remove_duplicates path"' |