1 |
dhidas |
1.1 |
#!/usr/bin/python
|
2 |
|
|
'''
Created on 1 Jun 2010

@author: kreczko

Email: kreczko@cern.ch
'''
|
9 |
|
|
|
10 |
|
|
from optparse import OptionParser
|
11 |
|
|
import os
|
12 |
|
|
import copy
|
13 |
|
|
|
14 |
|
|
# Job numbers that were found more than once by listContainsDuplicates().
duplicates = list()
# Maps a duplicated job number to the list of file names registered for it
# by addDuplicate().
duplicateFiles = dict()
|
16 |
|
|
|
17 |
|
|
def getUniqueFiles(files):
    '''
    Return a list of file names with exactly one file kept per job number.

    If no job number occurs more than once, ``files`` itself is returned
    unchanged. Otherwise a shallow copy is returned in which every file of a
    duplicated job number has been removed and the one whose name sorts last
    (plain string comparison) has been appended again at the end.

    The module-level ``duplicates`` and ``duplicateFiles`` containers are
    cleared first, so that results of an earlier call cannot leak into this
    one, and are then refilled by the helper functions.
    '''
    # Start from a clean slate: the helpers only ever add to these containers.
    del duplicates[:]
    duplicateFiles.clear()

    if not listContainsDuplicates(files):
        return files

    findDuplicates(files)
    uniqueFiles = copy.copy(files)
    for values in duplicateFiles.values():
        # A file may have been registered more than once for the same job
        # number; remove each distinct name only once to avoid a ValueError.
        for value in set(values):
            uniqueFiles.remove(value)
        values.sort()
        # NOTE(review): names are compared as strings, so e.g. '_10_' sorts
        # before '_2_' -- confirm this is the intended "file to keep" rule.
        uniqueFiles.append(values[-1])
    return uniqueFiles
|
29 |
|
|
|
30 |
|
|
def listContainsDuplicates(list):
    '''
    Tell whether any job number occurs more than once in ``list``.

    Every repeated job number is recorded exactly once in the module-level
    ``duplicates`` list, which is emptied at the start so that the result
    only reflects the file names passed in.

    list -- iterable of file names understood by extractJobnumber()
            (the parameter name shadows the builtin; it is kept so that
            existing keyword callers continue to work)

    Returns True if at least one job number is repeated, False otherwise.
    '''
    # Forget duplicates recorded by earlier calls.
    del duplicates[:]
    seen = set()
    for item in list:
        jobnumber = extractJobnumber(item)
        if jobnumber not in seen:
            seen.add(jobnumber)
        elif jobnumber not in duplicates:
            # Record a repeated job number once, however often it repeats.
            duplicates.append(jobnumber)
    return len(duplicates) > 0
|
39 |
|
|
|
40 |
|
|
def findDuplicates(files):
    '''
    Register every file whose job number is listed in ``duplicates``.

    Each matching file is passed to addDuplicate() exactly once, even if its
    job number appears several times in ``duplicates``.
    '''
    duplicateJobs = set(duplicates)
    if not duplicateJobs:
        # Nothing to look for; do not even parse the file names.
        return
    for file in files:
        job = extractJobnumber(file)
        if job in duplicateJobs:
            addDuplicate(job, file)
|
45 |
|
|
|
46 |
|
|
def extractJobnumber(file):
    '''
    Return the job number encoded in a file name as an int.

    The name is split on underscores and the third field counted from the
    end is converted with int(). An IndexError is raised when the name has
    fewer than three underscore-separated fields, a ValueError when that
    field is not an integer.
    '''
    # Only the last three separators matter, so split from the right.
    fields = file.rsplit('_', 3)
    return int(fields[-3])
|
49 |
|
|
|
50 |
|
|
def addDuplicate(jobnumber, file):
    '''
    Record ``file`` under ``jobnumber`` in the module-level ``duplicateFiles``.

    A new list is created the first time a job number is seen; afterwards the
    file name is appended to the existing list.
    '''
    # setdefault replaces the has_key() check, which no longer exists in
    # Python 3, and does the lookup only once.
    duplicateFiles.setdefault(jobnumber, []).append(file)
|
54 |
|
|
|
55 |
|
|
def removeDuplicates(path, files):
    '''
    Delete all but one file per job number from a directory.

    path  -- directory containing the files; a trailing separator is optional
    files -- list of file names inside ``path``; it is sorted in place

    Prints the number of files, unique files and duplicates, then removes
    every file that getUniqueFiles() did not keep.
    '''
    print('Number of file in path: %d' % len(files))
    files.sort()
    uniqueFiles = getUniqueFiles(files)
    uniqueFiles.sort()
    print('Number of unique files %d' % len(uniqueFiles))

    # A set keeps the membership test cheap for large directories.
    filesToKeep = set(uniqueFiles)
    filesToRemove = [file for file in files if file not in filesToKeep]
    print('Number of duplicate files: %d' % len(filesToRemove))

    for file in filesToRemove:
        # os.path.join inserts the separator that plain concatenation lost
        # whenever ``path`` was given without a trailing slash.
        remove(os.path.join(path, file))
|
64 |
|
|
|
65 |
|
|
def remove(file):
    '''
    Announce and delete a single file.

    file -- path of the file to delete

    Raises OSError (from os.remove) if the file cannot be deleted.
    '''
    # Single-argument call form prints the same text on Python 2 and 3.
    print('removing %s' % file)
    os.remove(file)
|
68 |
|
|
|
69 |
|
|
if __name__ == "__main__":
    # Command-line entry point: the first positional argument is the
    # directory whose duplicate files should be removed.
    parser = OptionParser(usage='%prog path')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        path = args[0]
        files = os.listdir(path)
        removeDuplicates(path, files)
    else:
        print('File path was not specified. Use script "./remove_duplicates path"')