#!/usr/bin/python
'''
Created on 1 Jun 2010

@author: kreczko

Email: kreczko@cern.ch
'''
9 |
|
10 |
from optparse import OptionParser
|
11 |
import os
|
12 |
import copy
|
13 |
import glob
|
14 |
|
15 |
# Job numbers that were seen more than once; appended to by
# listContainsDuplicates() and read by findDuplicates().
duplicates = []
|
16 |
# Maps a job number to the list of files recorded for it; filled through
# addDuplicate() and read by getUniqueFiles().
duplicateFiles = {}
|
17 |
|
18 |
def getROOTFiles(path):
    """Return the paths matching ``<path>/*.root``.

    path -- directory to look in; "/*.root" is appended to it to build the
            glob pattern.

    The result is the list produced by glob.glob for that pattern (empty
    when nothing matches).
    """
    pattern = path + "/*.root"
    return glob.glob(pattern)
|
22 |
|
23 |
def getUniqueFiles(files):
    """Return ``files`` with a single entry kept per duplicated job number.

    files -- list of file names that extractJobnumber() can parse.

    When no job number occurs more than once the input list itself is
    returned. Otherwise a shallow copy is returned in which, for every
    duplicated job number, all files of that job are removed and the last
    one in sorted order is appended again.

    Side effects: the module-level ``duplicates`` and ``duplicateFiles`` are
    reset and then refilled for ``files``.
    """
    # Forget whatever an earlier call recorded; stale entries would otherwise
    # be removed from (or appended to) a list they do not belong to.
    del duplicates[:]
    duplicateFiles.clear()

    if not listContainsDuplicates(files):
        return files
    findDuplicates(files)

    uniqueFiles = copy.copy(files)
    # .values() works on Python 2 and 3 (itervalues() is Python 2 only).
    for values in duplicateFiles.values():
        for value in values:
            # A file can be recorded more than once when a job has three or
            # more files; removing it a second time would raise ValueError.
            if value in uniqueFiles:
                uniqueFiles.remove(value)
        values.sort()
        # Keep the entry that sorts last for this job number.
        uniqueFiles.append(values[-1])
    return uniqueFiles
|
35 |
|
36 |
def listContainsDuplicates(list):
    """Tell whether any job number occurs more than once in ``list``.

    list -- iterable of file names that extractJobnumber() can parse.

    Returns True when at least one job number is shared by two or more
    entries, False otherwise.

    Side effect: the module-level ``duplicates`` list is emptied and then
    filled with each duplicated job number exactly once, so the answer only
    depends on the list passed in and not on earlier calls.
    """
    del duplicates[:]
    seen = set()
    for item in list:
        jobnumber = extractJobnumber(item)
        if jobnumber not in seen:
            seen.add(jobnumber)
        elif jobnumber not in duplicates:
            # Record a job number once, however many extra files it has.
            duplicates.append(jobnumber)
    return len(duplicates) > 0
|
45 |
|
46 |
def findDuplicates(files):
    """Register every file whose job number is listed in ``duplicates``.

    files -- iterable of file names that extractJobnumber() can parse.

    Each matching file is passed to addDuplicate() once, together with its
    job number. Nothing is parsed when ``duplicates`` is empty.
    """
    if not duplicates:
        return
    # A set avoids registering a file several times when the same job number
    # appears more than once in ``duplicates``.
    wanted = set(duplicates)
    for name in files:
        # Parse the job number once per file instead of once per comparison.
        jobnumber = extractJobnumber(name)
        if jobnumber in wanted:
            addDuplicate(jobnumber, name)
|
51 |
|
52 |
def extractJobnumber(file):
    """Return the job number encoded in a file name as an int.

    file -- name split on '_'; the third field counted from the end is
            taken as the job number.

    Raises IndexError when the name has fewer than three '_'-separated
    fields and ValueError when the selected field is not an integer.
    """
    fields = file.rsplit('_', 3)
    return int(fields[-3])
|
55 |
|
56 |
def addDuplicate(jobnumber, file):
    """Append ``file`` to the list stored for ``jobnumber`` in ``duplicateFiles``.

    jobnumber -- key into the module-level ``duplicateFiles`` mapping.
    file      -- value appended to that key's list; the list is created on
                 first use.
    """
    # The ``in`` operator replaces dict.has_key(), which no longer exists in
    # Python 3.
    if jobnumber not in duplicateFiles:
        duplicateFiles[jobnumber] = []
    duplicateFiles[jobnumber].append(file)
|
60 |
|
61 |
def getDuplicateFiles(allFiles, uniqueFiles):
    """Return the entries of ``allFiles`` that are missing from ``uniqueFiles``.

    allFiles    -- iterable of candidate entries.
    uniqueFiles -- container of the entries to keep.

    The result is a new list that preserves the order (and repetitions) of
    ``allFiles``.
    """
    filesToRemove = []
    for candidate in allFiles:
        if candidate in uniqueFiles:
            continue
        filesToRemove.append(candidate)
    return filesToRemove
|
69 |
|
70 |
if __name__ == "__main__":
    # Command-line entry point: the first positional argument is the
    # directory to inspect. No options are defined on the parser.
    parser = OptionParser()
    (options, args) = parser.parse_args()
    if args:
        path = args[0]
        # NOTE(review): every directory entry is handed to extractJobnumber(),
        # so a name that does not fit its pattern raises there.
        files = sorted(os.listdir(path))
        uniqueFiles = sorted(getUniqueFiles(files))
        # Use a separate name so the module-level ``duplicateFiles`` dict used
        # by the helper functions is not rebound to a list.
        filesToRemove = getDuplicateFiles(files, uniqueFiles)
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print('Number of file in path: %d' % len(files))
        print('Number of unique files %d' % len(uniqueFiles))
        print('Number of duplicate files: %d' % len(filesToRemove))
        if filesToRemove:
            print('Files to remove:')
            for name in filesToRemove:
                # os.path.join inserts the separator when ``path`` has no
                # trailing slash.
                print(os.path.join(path, name))
    else:
        print('File path was not specified. Use script "./remove_duplicates path"')