ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/JOBROBOT/TaskQuery
Revision: 1.5
Committed: Fri Jul 11 09:27:02 2008 UTC (16 years, 9 months ago) by asciaba
Branch: MAIN
Changes since 1.4: +2 -3 lines
Log Message:
Fixed bug with crab exit code being ignored

File Contents

# User Rev Content
1 gutsche 1.1 #!/usr/bin/env perl
2    
3     ##H This drop box agent collects the output from CRAB tasks after the
4     ##H task jobs have run to completion. Tasks with jobs in them
5     ##H should not be passed into this agent until all jobs have completed.
6     ##H
7     ##H Usage:
8     ##H TaskQuery
9     ##H -state DIRECTORY [-next NEXT] [-wait SECS]
10     ##H
11     ##H -state agent state directory, including inbox
12     ##H -next next agent to pass the drops to; can be given several times
13     ##H -wait time to wait in seconds between work scans
14 gutsche 1.2 ##H -mode mode: EGEE or OSG, prevents kill of aborted jobs for EGEE
15 gutsche 1.1
# Compile-time setup: work out the script name and the toolkit library
# directory (relative to the directory holding this script) and put the
# latter at the front of @INC so the "use Utils*" lines below resolve.
BEGIN {
    use strict; use warnings; $^W=1;

    # Script name with any leading directory stripped.
    our $me = $0; $me =~ s|.*/||;

    # Directory containing the script.  Only strip the last path component
    # when there is one: for a bare "TaskQuery" the directory is ".", and
    # for "/TaskQuery" it is the root (empty prefix before "/..").
    our $home = $0;
    if ($home =~ m|/|) {
        $home =~ s|/[^/]+$||;
    } else {
        $home = ".";
    }
    $home .= "/../PHEDEX/Toolkit/Common";
    unshift(@INC, $home);
}
22    
23     ######################################################################
24     use UtilsHelp;
# Parse the command line into the constructor arguments for the agent.
#   -state DIR   -> DROPDIR
#   -next  DIR   -> appended to the NEXTDIR list (may be repeated)
#   -wait  SECS  -> WAITTIME
#   -mode  MODE  -> MODE
#   -h           -> calls usage() (presumably prints the ##H help and exits;
#                   it is not defined in this file -- confirm)
# Parsing stops at the first argument that is not a recognised option, or
# at an option that is missing its value.
my %args;
while (scalar @ARGV) {
    if ($ARGV[0] eq '-state' && scalar @ARGV > 1) {
        shift (@ARGV); $args{DROPDIR} = shift(@ARGV);
    } elsif ($ARGV[0] eq '-next' && scalar @ARGV > 1) {
        shift (@ARGV); push (@{$args{NEXTDIR}}, shift(@ARGV));
    } elsif ($ARGV[0] eq '-wait' && scalar @ARGV > 1) {
        shift (@ARGV); $args{WAITTIME} = shift(@ARGV);
    } elsif ($ARGV[0] eq '-mode' && scalar @ARGV > 1) {
        shift (@ARGV); $args{MODE} = shift(@ARGV);
    } elsif ($ARGV[0] eq '-h') {
        &usage();
    } else {
        last;
    }
}

# Anything left over is an unrecognised argument; the state directory is
# the only mandatory option.
if (@ARGV || !$args{DROPDIR}) {
    die "Insufficient parameters, use -h for help.\n";
}

# Explicit method-call syntax instead of the indirect-object form
# "new TaskCollect (...)", whose parse depends on which names are known.
TaskCollect->new(%args)->process();
46    
47     ######################################################################
48     # Routines specific to this agent.
49     package TaskCollect; use strict; use warnings; use base 'UtilsAgent';
50     use UtilsCommand;
51     use UtilsLogging;
52     use UtilsTiming;
53    
# Return the time elapsed since the start of the project, in hours.
#
# The start time is read from "$taskdir/TASK_INIT.txt"; the last non-blank
# line of that file is taken as the start timestamp (compared against
# time(), so presumably epoch seconds -- confirm against the writer of
# that file).  If the file holds no non-blank line the start time is 0.
#
# Arguments: $self (unused), $taskdir (task directory path).
# Dies if the file cannot be opened.
sub getTimeSinceStart
{
    my ($self, $taskdir) = @_;
    my $now = time();

    # Lexical handle and three-argument open: no global FILE handle, and no
    # mode injection through the directory name.
    my $initfile = "$taskdir/TASK_INIT.txt";
    open(my $fh, '<', $initfile) or die "Can't open $initfile : $!";

    my $starttime = 0.0;
    # Read into a lexical so the caller's $_ is left untouched.
    while (my $line = <$fh>) {
        chomp $line;
        # A trailing blank line must not overwrite a valid timestamp:
        # an empty start time would make the task look decades old.
        next unless $line =~ /\S/;
        $starttime = $line;
    }
    close $fh;

    return ($now - $starttime) / 3600.;
}
73    
# Constructor.  Builds the object through the parent class constructor and
# then fills in this agent's own settings: for every key in the defaults
# table the caller's value is used when it is true, otherwise the default.
# The only such setting is MODE, which defaults to 'EGEE'.
sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = $class->SUPER::new(@_);

    my %defaults = (MODE => 'EGEE');
    my %given = (@_);
    foreach my $key (keys %defaults) {
        $self->{$key} = $given{$key} || $defaults{$key};
    }

    bless $self, $class;
    return $self;
}
87    
# Quote a string so that /bin/sh treats it as one literal word.
sub _shQuote
{
    my ($text) = @_;
    $text = '' if ! defined $text;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Run shell commands from inside the task directory.  The commands execute
# in a subshell after a "START <timestamp>" line and with "set -x"; the
# subshell's stdout and stderr go to $redirect (e.g. ">> SOME_LOG.txt").
# Returns the raw system() status, which is 0 on success.
sub _runInTaskDir
{
    my ($taskdir, $commands, $redirect) = @_;
    my $dir = _shQuote($taskdir);
    my $cmd = qq{
    cd $dir || exit \$?;
    TIMEFORMAT="Timing information in seconds: \%R real, \%U user, \%S system, \%P \%\%";
    (date -u +"START \%Y\%m\%dZ\%H\%M\%S == \%s"; set -x;
     $commands) $redirect 2>&1};
    return system($cmd);
}

# Ask CRAB to kill one job, logging to JOB_KILL_LOG.txt, then run
# CrabStatusAndUpdateOldCrabDb if that command is on the PATH.
# The outcome is deliberately not checked by the callers.
sub _killJob
{
    my ($taskdir, $crabid) = @_;
    my $id = _shQuote($crabid);
    return _runInTaskDir($taskdir,
        "time crab -continue -kill $id; "
        . "which CrabStatusAndUpdateOldCrabDb >/dev/null 2>&1 && CrabStatusAndUpdateOldCrabDb",
        ">> JOB_KILL_LOG.txt");
}

# Save "crab -postMortem" output for one job into $outfile inside the task
# directory, unless that file already exists.
sub _savePostMortem
{
    my ($taskdir, $crabid, $outfile) = @_;
    return if -f "$taskdir/$outfile";
    _runInTaskDir($taskdir,
        "time crab -continue -postMortem " . _shQuote($crabid),
        "> " . _shQuote($outfile));
    return;
}

# Actually process the drop.
#
# Refreshes the CRAB job status of the task referenced by the drop, then
# walks the job database and decides per job whether to wait for it,
# (re)submit it, fetch post-mortem information, or kill it.  The decision
# depends on the age of the task:
#   - 36 hours or more: nothing is waited for; every job that is not
#     cleared (Y), killed (K) or done (D) is killed;
#   - 24 hours or more: running (R) jobs are waited for, the rest as above;
#   - younger: S and R jobs are waited for; other unfinished jobs are
#     killed only when MODE is "OSG".
# When no job is being waited for, the drop is marked done and relayed.
sub processDrop {
    my ($self, $drop) = @_;

    # Sanity checking
    return if (! $self->inspectDrop ($drop));
    # waiting time for too many status checks
    return if (($$self{NEXT_CHECK}{$drop} || 0) > time());
    # The added delay is currently zero, so the check above never defers.
    $$self{NEXT_CHECK}{$drop} = time() + 0;
    delete $self->{BAD}{$drop};
    &timeStart($self->{STARTTIME});

    # Read CRAB configuration
    my $taskdir = &input ("$self->{WORKDIR}/$drop/task");
    if (! $taskdir || ! -d $taskdir) {
        &alert ("missing task directory in $drop");
        $self->markBad ($drop);
        return;
    }

    # Time to check job status on the task again.  Let CRAB do the hard
    # work, then check what it found.  The subshell exits with crab's own
    # exit code, not that of the optional database update.
    my $rc = _runInTaskDir($taskdir,
        'time crab -continue -status; a=$?; '
        . 'which UpdateOldCrabDb >/dev/null 2>&1 && UpdateOldCrabDb; exit $a',
        '>> JOB_STATUS_LOG.txt');
    if ($rc) {
        &alert ("$drop $taskdir: failed to check job status: exit code @{[&runerror($rc)]}, move jobdb out of way");
        # move crab jobs db out of the way to have the jobs not be counted
        # in statistics for project creation (TaskSource)
        if (_runInTaskDir($taskdir,
                'mv -v crab_*/share/db/jobs crab.db.jobs.old',
                '>> JOB_REMOVE_CRABDIR_FAILED_STATUS_CHECK_LOG.txt')) {
            &alert ("$drop $taskdir: failed to move crab jobs to crab.db.jobs.old");
        }
        # mark task as finished
        &logmsg("stats: $drop $taskdir @{[&formatElapsedTime($self->{STARTTIME})]} failed checking status");
        &output ("$taskdir/TASK_FINISHED.txt", &mytimeofday () . "\n");
        $self->markBad ($drop);
        return;
    }

    # Parse the job database: one job per line, fields separated by "|".
    # Field 1 is the status letter, field 3 the grid id, field 4 the crab id.
    my $jobdb = (<$taskdir/crab_*/share/db/jobs>)[0];
    # No database file found means no jobs; do not hand undef to input().
    my $content = defined $jobdb ? &input($jobdb) : undef;
    my @jobs = ();
    foreach my $row (split(/\n/, $content || '')) {
        my @fields = split(/\|/, $row);
        # Short or malformed rows yield empty strings rather than undef, so
        # they reach the "unexpected status" branch without undef warnings.
        push(@jobs, {
            STATUS => defined $fields[1] ? $fields[1] : '',
            GRIDID => defined $fields[3] ? $fields[3] : '',
            CRABID => defined $fields[4] ? $fields[4] : '',
        });
    }

    my $hold = 0;

    # check time between start of project and now (hours)
    my $lifetime = $self->getTimeSinceStart("$taskdir");
    # set time after which all jobs (running, pending and others) are killed
    my $killtime = 36.;
    # set time after which pending and other jobs are killed, running jobs are left running
    my $stoptime = 24.;

    # The policy depends only on the task age, so choose it once:
    # $holdStates lists the status letters that keep the task waiting,
    # $killRest says whether remaining unfinished jobs get killed.
    my ($holdStates, $killRest);
    if ($lifetime >= $killtime) {
        ($holdStates, $killRest) = ('', 1);
    } elsif ($lifetime >= $stoptime) {
        ($holdStates, $killRest) = ('R', 1);
    } else {
        ($holdStates, $killRest) = ('SR', $self->{MODE} eq "OSG");
    }

    my $crabi = 0;
    foreach my $job (@jobs) {
        # CRAB index of this job: its 1-based position in the database
        ++$crabi;
        my $status = $job->{STATUS};
        my $crabid = $job->{CRABID};

        # Determine job status.
        if ($status !~ /^[YHSCARDKZ]$/) {
            &alert ("$drop $taskdir: unexpected job status `$status' for `$crabi'");
            # kill job
            _killJob($taskdir, $crabid);
            next;
        }

        # if job is created but not submitted, submit again, check
        # afterwards, otherwise kill and continue
        if ($status eq 'C') {
            &alert ("$drop $taskdir: job $crabid not submitted yet, submit now ");
            my $src = _runInTaskDir($taskdir,
                "time crab -continue -submit " . _shQuote($crabid) . "; a=\$?; "
                . "which CrabStatusAndUpdateOldCrabDb >/dev/null 2>&1 && CrabStatusAndUpdateOldCrabDb; exit \$a",
                ">> JOB_SUBMIT_LOG.txt");
            if ($src) {
                &alert ("$drop $taskdir: failed to submit job $crabid with exit code @{[&runerror($src)]}, kill job");
                # kill job
                _killJob($taskdir, $crabid);
            }
            next;
        }

        # Jobs still worth waiting for keep the whole task on hold.
        if (index($holdStates, $status) >= 0) {
            $hold = 1;
            next;
        }

        # That's all for cleared and killed jobs.
        next if ($status eq 'Y' || $status eq 'K');

        # Retrieve job logging information if abort
        if ($status eq 'A') {
            # Nice label for the file name: the grid id with every character
            # outside [-A-Za-z0-9_:.] replaced by %xx.
            my $joblabel = $job->{GRIDID};
            $joblabel =~ s/([^-A-Za-z0-9_:.])/sprintf("%%%02x", ord($1))/ge;
            _savePostMortem($taskdir, $crabid,
                            "JOB_HISTORY.$crabi.$status.$joblabel.txt");
        }

        # Nothing more to do for done jobs.
        next if ($status eq 'D');

        # kill the rest (for young tasks only if mode OSG)
        _killJob($taskdir, $crabid) if $killRest;
    }

    # Check if we are supposed to hold this task.
    return if $hold;

    &touch ("$self->{WORKDIR}/$drop/done");
    delete $$self{NEXT_CHECK}{$drop};
    $self->relayDrop ($drop);
    &logmsg("stats: $drop $taskdir @{[&formatElapsedTime($self->{STARTTIME})]} success");
}