#!/bin/bash
#
# Lists duplicate root files of a CRAB dataset on the storage element (SE)
# by comparing the SE directory listing with the files reported by the
# job reports of the crab project directory.
#
# NOTE(review): the listing this script was recovered from carried the
# annotation "fanzago, 1.1" on its first line -- presumably the author and
# revision of the last change; confirm against version control.

# Abort on any use of an unset variable.
set -o nounset

# Script name without its directory part, used in the help text.
PROGNAME=$(basename -- "$0")

#######################################
# Print the help text and terminate the script.
# Globals:   PROGNAME (read) - script name shown in the synopsis
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; always exits with status 1
#######################################
usage() {
  cat <<EOF
Find a list of duplicate root files for a dataset at the SE that
should be removed.

Usage: $PROGNAME -c <crab_dir> [-v | --verbose] [-q | --quiet] [-h | --help]
where options are:
  -c              Mandatory argument, crab project directory
  -v|--verbose    Turn on debug statements (D=false)
  -q|--quiet      Suppress progress messages (D=false)
  -h|--help       This message

  example: $PROGNAME -c <crab_dir> -v

This script creates two files in the present directory:

  allfiles.list  - all the root files for the dataset present at the SE
  goodfiles.list - root files for successful jobs as found in the crab_fjr_n.xml files

and finds the duplicate files from the difference. Note, that at times jobs may finish
and root files transferred to the SE successfully, but crab may not immediately know about job
completion. Those 'most recent' root files will be tagged as duplicate, but they
are not.
EOF

  exit 1
}

# ---------------------------------------------------------------------------
# Command-line parsing.
# Sets: crab_dir (crab project directory, mandatory),
#       verbose  (1 when -v/--verbose was given),
#       quiet    (1 when -q/--quiet was given).
# Any malformed invocation ends in usage(), which exits the script.
# ---------------------------------------------------------------------------

# Running without any argument just shows the help text.
[ $# -gt 0 ] || usage

crab_dir=""
verbose=0 # NOTE(review): set here but not read anywhere in the visible script
quiet=0
while [ $# -gt 0 ]; do
  case "$1" in
    -c)
      # -c needs a value; without this guard "$1" would be unbound after
      # the shift and nounset would abort with a raw shell error.
      [ $# -ge 2 ] || usage
      shift
      crab_dir=$1
      ;;
    -v | --verbose)
      verbose=1
      ;;
    -q | --quiet)
      quiet=1
      ;;
    -h | --help)
      usage
      ;;
    *)
      usage
      ;;
  esac
  shift
done

# The project directory must have been given and must exist.
if [[ -z "$crab_dir" || ! -e "$crab_dir" ]]; then
  usage
fi

# Output files, created in the current working directory.
gflist=goodfiles.list
aflist=allfiles.list

# First of all get the list of good files by reading the fjr files.
# Earlier Perl-based variant, disabled in favour of the Python helper below:
#export PERL5LIB=/afs/cern.ch/user/s/sarkar/public/perl/lib/perl5/site_perl/5.8.8:$PERL5LIB
#perl -w /afs/cern.ch/user/s/sarkar/public/ListGoodOutputFiles_new.pl $project/res > $gflist

[ "$quiet" -gt 0 ] || echo ">>> Find list of good files from fjr files..."
python /afs/cern.ch/user/s/sarkar/public/to_stage/find_goodfiles.py -c "$crab_dir" -q > "$gflist"

# Now find the remote directory name from the first good file.
first_good=$(head -n 1 -- "$gflist")
if [ -z "$first_good" ]; then
  # Without a good file there is no remote directory to list; previously
  # this fell through to a failing 'dirname' and a silent, empty result.
  echo "$PROGNAME: no good files found for $crab_dir, cannot determine the remote directory" >&2
  exit 1
fi
rdir=$(dirname -- "$first_good")
# Everything in front of the first '=' of the remote directory string;
# it is prepended again when printing the duplicates below.
srmp=${rdir%%=*}

# Get list of all files for the project
[ "$quiet" -gt 0 ] || echo ">>> Find list of all root files at $rdir ..."
# srmls diagnostics are discarded on purpose; only two-field listing lines
# that end in ".root" are kept, and their last field is the file path.
srmls "$rdir" 2> /dev/null \
  | grep '\.root$' \
  | awk '{if (NF==2) print $NF}' > "$aflist"

# Now compare
[ "$quiet" -gt 0 ] || echo ">>> Following is the list of duplicate files at $rdir ..."
while IFS= read -r file; do
  [ -n "$file" ] || continue

  # A file is good when its base name occurs anywhere in the good list.
  # This also covers a match on the full path, since the base name is a
  # substring of it. -F compares literally, so the dots in file names no
  # longer act as regex wildcards.
  bname=${file##*/}
  grep -qF -- "$bname" "$gflist" && continue

  echo "$srmp=$file"
done < "$aflist"

exit 0