1 |
#!/usr/bin/python
|
2 |
'''
|
3 |
Created on 1 Jun 2010
|
4 |
|
5 |
@author: kreczko
|
6 |
|
7 |
Email: kreczko@cern.ch
|
8 |
'''
|
9 |
|
10 |
from optparse import OptionParser
|
11 |
import os
|
12 |
import copy
|
13 |
|
14 |
# Job numbers found more than once; appended to by listContainsDuplicates().
duplicates = []
|
15 |
# Maps a job number to the list of file names registered for it by addDuplicate().
duplicateFiles = {}
|
16 |
|
17 |
def getUniqueFiles(files):
    """Return the file names with every duplicated job reduced to one file.

    Files are grouped by the job number encoded in their name (see
    ``extractJobnumber``). For each job number registered as duplicated, all
    of its files are dropped from the result except the one that sorts last
    by name, which is re-appended at the end of the list.

    Relies on the module-level ``duplicates`` / ``duplicateFiles`` registries
    that ``listContainsDuplicates`` and ``findDuplicates`` fill in.

    :param files: list of file names; the list itself is not modified.
    :return: ``files`` itself when no duplicate is registered, otherwise a
        new list.
    """
    if not listContainsDuplicates(files):
        return files
    findDuplicates(files)

    uniqueFiles = copy.copy(files)
    # dict.values() works on Python 2 and 3, unlike itervalues().
    for values in duplicateFiles.values():
        # The registry is module-global: it may list one file several times
        # or still hold names from an earlier call. Consider each name once
        # and only if it is part of this listing; a blind list.remove()
        # would raise ValueError in those cases.
        candidates = sorted(set(values).intersection(files))
        if not candidates:
            continue
        uniqueFiles = [name for name in uniqueFiles if name not in candidates]
        # Keep the file whose name sorts last.
        uniqueFiles.append(candidates[-1])
    return uniqueFiles
|
29 |
|
30 |
def listContainsDuplicates(list):
    """Record repeated job numbers and report whether any are registered.

    Each job number that occurs more than once in ``list`` is appended to the
    module-level ``duplicates`` list, at most once per job number.

    :param list: iterable of file names understood by ``extractJobnumber``.
        (The name shadows the builtin; it is kept for interface
        compatibility.)
    :return: True if ``duplicates`` is non-empty after the scan. The registry
        is not cleared here, so entries from earlier calls count as well.
    """
    seen = set()  # set gives O(1) membership tests
    for item in list:
        jobnumber = extractJobnumber(item)
        if jobnumber not in seen:
            seen.add(jobnumber)
        elif jobnumber not in duplicates:
            # Register a job only once, even if three or more files share
            # it; repeated entries would make findDuplicates register the
            # same file several times.
            duplicates.append(jobnumber)
    return len(duplicates) > 0
|
39 |
|
40 |
def findDuplicates(files):
    """Register every file whose job number is listed in ``duplicates``.

    Matching files are handed to ``addDuplicate`` exactly once each.

    :param files: iterable of file names understood by ``extractJobnumber``.
    """
    duplicatedJobs = set(duplicates)
    if not duplicatedJobs:
        # Nothing registered: no file name needs to be parsed.
        return
    for name in files:
        # Parse the job number once per file instead of once per entry in
        # ``duplicates``.
        job = extractJobnumber(name)
        if job in duplicatedJobs:
            addDuplicate(job, name)
|
45 |
|
46 |
def extractJobnumber(file):
    """Return the job number encoded in a file name as an int.

    The name is split on underscores and the third field from the end is
    converted with ``int``.
    """
    return int(file.split('_')[-3])
|
49 |
|
50 |
def addDuplicate(jobnumber, file):
    """Register ``file`` under ``jobnumber`` in the ``duplicateFiles`` map.

    :param jobnumber: key to register the file under.
    :param file: file name to add; a name already registered for this job
        number is not added a second time.
    """
    # setdefault replaces dict.has_key(), which no longer exists in Python 3.
    registered = duplicateFiles.setdefault(jobnumber, [])
    if file not in registered:
        registered.append(file)
|
54 |
|
55 |
def removeDuplicates(path, files):
    """Delete from ``path`` every file that ``getUniqueFiles`` does not keep.

    Prints the number of files, unique files and duplicates, then removes
    each duplicate via ``remove``.

    :param path: directory containing the files, with or without a trailing
        path separator.
    :param files: list of file names inside ``path``; sorted in place.
    """
    # Single-argument print() produces the same output on Python 2 and 3.
    print('Number of file in path: %d' % len(files))
    files.sort()
    uniqueFiles = getUniqueFiles(files)
    uniqueFiles.sort()
    print('Number of unique files %d' % len(uniqueFiles))
    keep = set(uniqueFiles)  # O(1) membership instead of scanning a list
    filesToRemove = [name for name in files if name not in keep]
    print('Number of duplicate files: %d' % len(filesToRemove))
    for name in filesToRemove:
        # os.path.join inserts the separator when ``path`` lacks a trailing
        # one; plain concatenation would build "dirname" + "file".
        remove(os.path.join(path, name))
|
64 |
|
65 |
def remove(file):
    """Announce the file on stdout, then delete it with ``os.remove``."""
    message = 'removing %s' % (file,)
    print(message)
    os.remove(file)
|
68 |
|
69 |
if __name__ == "__main__":
    # Script entry point: the first positional argument is the directory
    # whose duplicate files are removed.
    parser = OptionParser()
    options, args = parser.parse_args()
    if not args:
        print('File path was not specified. Use script "./remove_duplicates path"')
    else:
        path = args[0]
        removeDuplicates(path, os.listdir(path))