ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/JOBROBOT/TaskQuery
Revision: 1.6
Committed: Mon Jul 28 15:58:37 2008 UTC (16 years, 9 months ago) by belforte
Branch: MAIN
Changes since 1.5: +23 -2 lines
Log Message:
cleanup task/project when kill did not succeed in 1h?

File Contents

# User Rev Content
1 gutsche 1.1 #!/usr/bin/env perl
2    
3     ##H This drop box agent collects the output from CRAB tasks after
4     ##H the task jobs have run to completion. Tasks with jobs in them
5     ##H should not be passed into this agent until all jobs have completed.
6     ##H
7     ##H Usage:
8     ##H TaskQuery
9     ##H -state DIRECTORY [-next NEXT] [-wait SECS] [-mode MODE]
10     ##H
11     ##H -state agent state directory, including inbox
12     ##H -next next agent to pass the drops to; can be given several times
13     ##H -wait time to wait in seconds between work scans
14 gutsche 1.2 ##H -mode mode: EGEE or OSG, prevents kill of aborted jobs for EGEE
15 gutsche 1.1
# Strictures at file scope: when placed inside BEGIN {} they are lexically
# confined to that block and the main script below runs unchecked.
use strict;
use warnings;

BEGIN {
    # Global warnings switch, set at compile time so that it is already on
    # while the modules below are being loaded.
    $^W = 1;

    # Script base name.
    our $me = $0;
    $me =~ s|.*/||;

    # Directory of the script; "." when $0 carries no directory part
    # (previously the script name itself was kept as the directory).
    our $home = $0;
    $home = "." unless $home =~ s|/[^/]+$||;
    $home ||= ".";
    $home .= "/../PHEDEX/Toolkit/Common";
    unshift(@INC, $home);
}

######################################################################
use UtilsHelp;

# Options collected from the command line and handed to the agent constructor:
#   DROPDIR  (-state)  NEXTDIR (-next, repeatable)  WAITTIME (-wait)  MODE (-mode)
my %args;

while (scalar @ARGV) {
    my $opt = $ARGV[0];
    my $has_value = scalar @ARGV > 1;

    if ($opt eq '-state' && $has_value) {
        shift(@ARGV);
        $args{DROPDIR} = shift(@ARGV);
    } elsif ($opt eq '-next' && $has_value) {
        shift(@ARGV);
        push(@{$args{NEXTDIR}}, shift(@ARGV));
    } elsif ($opt eq '-wait' && $has_value) {
        shift(@ARGV);
        $args{WAITTIME} = shift(@ARGV);
    } elsif ($opt eq '-mode' && $has_value) {
        shift(@ARGV);
        $args{MODE} = shift(@ARGV);
    } elsif ($opt eq '-h') {
        # NOTE(review): usage() comes from UtilsHelp and is assumed not to
        # return; '-h' is not shifted off, so a returning usage() would loop.
        &usage();
    } else {
        # Unknown option, or an option missing its value: stop parsing and
        # let the leftover-argument check below report it.
        last;
    }
}

if (@ARGV || !$args{DROPDIR}) {
    die "Insufficient parameters, use -h for help.\n";
}

TaskCollect->new(%args)->process();
46    
47     ######################################################################
48     # Routines specific to this agent.
49     package TaskCollect; use strict; use warnings; use base 'UtilsAgent';
50     use UtilsCommand;
51     use UtilsLogging;
52     use UtilsTiming;
53    
# Return the age of a task in hours.
#
# Arguments:
#   $self    - the agent object (not used here)
#   $taskdir - task directory containing TASK_INIT.txt
#
# Returns (now - start) / 3600 as a floating point number, where "start" is
# the last non-blank line of TASK_INIT.txt taken as a number of seconds
# (0 if the file holds no such line).
#
# Dies if TASK_INIT.txt cannot be opened.
sub getTimeSinceStart
{
    my ($self, $taskdir) = @_;
    my $now = time();

    # Three-argument open with a lexical handle: the path is taken literally
    # and the handle cannot clash with another FILE handle in the process.
    my $initfile = "$taskdir/TASK_INIT.txt";
    open(my $fh, '<', $initfile) or die "Can't open $initfile : $!";

    my $starttime = 0.0;
    while (my $line = <$fh>) {
        $line =~ s/^\s+|\s+$//g;
        # A trailing blank line must not wipe out the timestamp read before
        # it; that would make the task look decades old.
        next if $line eq '';
        $starttime = $line;
    }
    close($fh);

    return ($now - $starttime) / 3600.;
}
73    
# Constructor.
#
# Builds the object through the parent class constructor (same argument
# list), then sets every agent-specific parameter listed in %defaults on the
# object: the caller's value when it is given and true, the default otherwise.
# Currently the only such parameter is MODE (default 'EGEE').
sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = $class->SUPER::new(@_);

    my %defaults = (MODE => 'EGEE');
    my %given = (@_);
    foreach my $key (keys %defaults) {
        $self->{$key} = $given{$key} || $defaults{$key};
    }

    bless $self, $class;
    return $self;
}
87    
# Run one shell step inside a task directory.
#
# Every step changes into $taskdir, sets the bash TIMEFORMAT, prints a START
# time stamp, switches on command tracing and then runs $script; stdout and
# stderr of the whole group go to $redirect (e.g. ">> JOB_KILL_LOG.txt").
# Returns the raw status from system().
sub _crabStep
{
    my ($taskdir, $script, $redirect) = @_;
    my $cmd = qq{
    cd $taskdir || exit \$?;
    TIMEFORMAT="Timing information in seconds: \%R real, \%U user, \%S system, \%P \%\%";
    (date -u +"START \%Y\%m\%dZ\%H\%M\%S == \%s"; set -x;
    $script) $redirect 2>&1};
    return system($cmd);
}

# Kill one CRAB job, then refresh the old-style CRAB DB if the helper script
# is on the PATH. Output is appended to JOB_KILL_LOG.txt.
sub _killJob
{
    my ($taskdir, $crabid) = @_;
    return _crabStep($taskdir,
                     "time crab -continue -kill '${crabid}'; "
                     . "which CrabStatusAndUpdateOldCrabDb >/dev/null 2>&1 && CrabStatusAndUpdateOldCrabDb",
                     ">> JOB_KILL_LOG.txt");
}

# Save the CRAB post-mortem of one job into $taskdir/$outfile, unless that
# file already exists.
sub _fetchPostMortem
{
    my ($taskdir, $crabid, $outfile) = @_;
    return if -f "$taskdir/$outfile";
    _crabStep($taskdir, "time crab -continue -postMortem '${crabid}'", "> $outfile");
    return;
}

# Move the CRAB job DB aside so that its jobs are no longer counted in the
# statistics for project creation (TaskSource). Output is appended to
# $logfile. Returns the raw status from system().
sub _retireJobDb
{
    my ($taskdir, $logfile) = @_;
    return _crabStep($taskdir,
                     "mv -v crab_*/share/db/jobs crab.db.jobs.old",
                     ">> $logfile");
}

# Process one drop: refresh the CRAB status of its task and act on every job.
#
# Arguments:
#   $self - the agent object
#   $drop - name of the drop below $self->{WORKDIR}
#
# Steps:
#   1. Run "crab -continue -status". On failure the job DB is moved aside,
#      TASK_FINISHED.txt is written and the drop is marked bad.
#   2. Read the job DB (fields separated by '|': field 1 is the status,
#      field 3 the grid id, field 4 the CRAB id) and handle each job:
#        - status outside [YHSCARDKZ]: kill the job;
#        - C (created, not submitted): submit it, kill it if that fails;
#        - task at or past the cleanup time: move the job DB aside, write
#          TASK_FINISHED.txt, mark the drop bad and stop;
#        - otherwise by age: Y (cleared), K (killed) and D (done) need
#          nothing, A (aborted) gets a post-mortem file, and what is left is
#          kept or killed depending on the age of the task (see the loop).
#   3. Unless a job asked to hold the task, mark the drop done and relay it.
sub processDrop {
    my ($self, $drop) = @_;

    # Sanity checking
    return if (! $self->inspectDrop ($drop));
    # Not yet time for the next status check of this drop.
    return if (($$self{NEXT_CHECK}{$drop} || 0) > time());
    $$self{NEXT_CHECK}{$drop} = time() + 0;
    delete $self->{BAD}{$drop};
    &timeStart($self->{STARTTIME});

    # Read CRAB configuration
    my $taskdir = &input ("$self->{WORKDIR}/$drop/task");
    if (! $taskdir || ! -d $taskdir) {
        &alert ("missing task directory in $drop");
        $self->markBad ($drop);
        return;
    }

    # Time to check job status on the task again. Let CRAB do the hard
    # work, then check what it found. The exit code of crab itself is kept
    # in $a so that the DB update that follows cannot mask it.
    my $rc = _crabStep($taskdir,
                       q{time crab -continue -status; a=$?; which UpdateOldCrabDb >/dev/null 2>&1 && UpdateOldCrabDb; exit $a},
                       ">> JOB_STATUS_LOG.txt");
    if ($rc) {
        &alert ("$drop $taskdir: failed to check job status: exit code @{[&runerror($rc)]}, move jobdb out of way");
        if (_retireJobDb($taskdir, "JOB_REMOVE_CRABDIR_FAILED_STATUS_CHECK_LOG.txt")) {
            &alert ("$drop $taskdir: failed to move crab jobs to crab.db.jobs.old");
        }
        # mark task as finished
        &logmsg("stats: $drop $taskdir @{[&formatElapsedTime($self->{STARTTIME})]} failed checking status");
        &output ("$taskdir/TASK_FINISHED.txt", &mytimeofday () . "\n");
        $self->markBad ($drop);
        return;
    }

    # parse jobdb
    my $f = (<$taskdir/crab_*/share/db/jobs>)[0];

    my @gridids = ();
    my @crabids = ();
    my @jobstat = ();
    foreach my $entry (split(/\n/, &input($f) || '')) {
        my @fields = split(/\|/, $entry);
        push(@crabids, $fields[4]);
        push(@gridids, $fields[3]);
        push(@jobstat, $fields[1]);
    }

    my $hold = 0;

    # Age of the task in hours, and the thresholds (hours) it is compared to.
    my $lifetime = $self->getTimeSinceStart("$taskdir");
    # set time after which all jobs are cleaned and task marked bad
    my $cleanuptime = 31.;
    # set time after which all jobs (running, pending and others) are killed
    my $killtime = 30.;
    # set time after which pending and other jobs are killed, running jobs are left running
    my $stoptime = 24.;

    my $pastKill = ($lifetime >= $killtime);
    my $pastStop = ($lifetime >= $stoptime);

    for my $i (0 .. $#jobstat) {
        # CRAB index and nice label for this job. A malformed DB line leaves
        # fields undefined; treat those as empty strings instead of tripping
        # "uninitialized value" warnings further down.
        my $crabi = $i + 1;
        my $crabid = defined $crabids[$i] ? $crabids[$i] : '';
        my $status = defined $jobstat[$i] ? $jobstat[$i] : '';
        my $joblabel = defined $gridids[$i] ? $gridids[$i] : '';
        $joblabel =~ s/([^-A-Za-z0-9_:.])/sprintf("%%%02x", ord($1))/ge;

        # Unknown status: kill the job.
        if ($status !~ /^[YHSCARDKZ]$/) {
            &alert ("$drop $taskdir: unexpected job status `$status' for `$crabi'");
            _killJob($taskdir, $crabid);
            next;
        }

        # if job is created but not submitted, submit again, check afterwards, otherwise kill and continue
        if ($status eq 'C') {
            &alert ("$drop $taskdir: job $crabid not submitted yet, submit now ");
            my $submitrc = _crabStep($taskdir,
                                     "time crab -continue -submit '${crabid}'; a=\$?; "
                                     . "which CrabStatusAndUpdateOldCrabDb >/dev/null 2>&1 && CrabStatusAndUpdateOldCrabDb; exit \$a",
                                     ">> JOB_SUBMIT_LOG.txt");
            if ($submitrc) {
                &alert ("$drop $taskdir: failed to submit job $crabid with exit code @{[&runerror($submitrc)]}, kill job");
                _killJob($taskdir, $crabid);
            }
            next;
        }

        # Past the cleanup time the kill issued earlier evidently did not
        # finish the task: give up on the whole task.
        if ($lifetime >= $cleanuptime) {
            &alert ("$drop $taskdir: task kill did not work in time");
            if (_retireJobDb($taskdir, "JOB_REMOVE_CRABDIR_KILL_FAILED.txt")) {
                &alert ("$drop $taskdir: failed to move crab jobs to crab.db.jobs.old");
            }
            # mark task as finished
            &output ("$taskdir/TASK_FINISHED.txt", &mytimeofday () . "\n");
            $self->markBad ($drop);
            return;
        }

        # Jobs that are left alone and keep the task open:
        #   before the stop time:            S and R
        #   between stop time and kill time: R only
        #   from the kill time on:           none
        if (! $pastKill) {
            my $keep = $pastStop ? qr/^[R]$/ : qr/^[SR]$/;
            if ($status =~ $keep) {
                $hold = 1;
                next;
            }
        }

        # That's all for cleared and for killed jobs.
        next if ($status eq 'Y' || $status eq 'K');

        # Retrieve job logging information if abort
        if ($status eq 'A') {
            _fetchPostMortem($taskdir, $crabid, "JOB_HISTORY.$crabi.$status.$joblabel.txt");
        }

        # Nothing to do here for done jobs.
        next if ($status eq 'D');

        # Kill the rest.
        if ($pastKill) {
            _killJob($taskdir, $crabid);
            # come back to verify it was killed
            $hold = 1;
        } elsif ($pastStop || $self->{MODE} eq "OSG") {
            # Before the stop time the rest is only killed in OSG mode.
            _killJob($taskdir, $crabid);
        }
    }

    # Check if we are supposed to hold this task.
    return if $hold;

    &touch ("$self->{WORKDIR}/$drop/done");
    delete $$self{NEXT_CHECK}{$drop};
    $self->relayDrop ($drop);
    &logmsg("stats: $drop $taskdir @{[&formatElapsedTime($self->{STARTTIME})]} success");
}