1 |
lat |
1.1 |
#!/usr/bin/env perl
|
2 |
|
|
|
3 |
|
|
##H This drop box agent reports status of submitted jobs by maintaining
|
4 |
|
|
##H a simple CSV file. It maintains the counts by site, job type, the
|
5 |
|
|
##H owner/dataset, job submission status, and job exit code.
|
6 |
|
|
##H
|
7 |
|
|
##H Usage:
|
8 |
|
|
##H TaskProfile
|
9 |
|
|
##H -state DIRECTORY [-next NEXT] [-wait SECS]
|
10 |
|
|
##H -taskrepo DIRECTORY
|
11 |
|
|
##H
|
12 |
|
|
##H -state agent state directory, including inbox
|
13 |
|
|
##H -next next agent to pass the drops to; can be given several times
|
14 |
|
|
##H -wait time to wait in seconds between work scans
|
15 |
|
|
##H -taskrepo directory with all the tasks in it
|
16 |
|
|
|
17 |
|
|
# Compile-time bootstrap: work out the script's name and location, then put
# the shared toolkit directory (located relative to the script) at the front
# of @INC so the Utils* modules loaded further down can be found.
BEGIN {
    use strict; use warnings; $^W = 1;
    use File::Basename qw(dirname);

    # Script name with any leading directory components removed.
    our $me = $0; $me =~ s|.*/||;

    # Library directory relative to the directory holding the script.
    # dirname() yields "." when $0 has no directory part; stripping the last
    # path component by hand left the bare script name in place in that case
    # and produced a path like "TaskProfile/../../Toolkit/Common".
    our $home = dirname($0) . "/../../Toolkit/Common";
    unshift(@INC, $home);
}
|
23 |
|
|
|
24 |
|
|
######################################################################
|
25 |
|
|
use UtilsHelp;
|
26 |
|
|
# Main program: parse the command line, check that the mandatory options
# were given, then create the agent and run it.
#
# The "use strict" inside the BEGIN block is lexically scoped to that block,
# so strictures are switched on here for the main program as well and the
# option hash is declared explicitly.
use strict; use warnings;
my %args;

while (@ARGV)
{
    my $opt = $ARGV[0];
    my $has_value = (@ARGV > 1);

    if ($opt eq '-state' && $has_value)
    { shift(@ARGV); $args{DROPDIR} = shift(@ARGV); }
    elsif ($opt eq '-next' && $has_value)
    { shift(@ARGV); push(@{$args{NEXTDIR}}, shift(@ARGV)); }     # repeatable
    elsif ($opt eq '-wait' && $has_value)
    { shift(@ARGV); $args{WAITTIME} = shift(@ARGV); }
    elsif ($opt eq '-taskrepo' && $has_value)
    { shift(@ARGV); push(@{$args{TASKREPO}}, shift(@ARGV)); }    # repeatable
    elsif ($opt eq '-h')
    {
        # NOTE(review): '-h' is not shifted off @ARGV, so this relies on
        # usage() terminating the program -- confirm against UtilsHelp.
        usage();
    }
    else
    {
        # Unknown option, or an option missing its value: stop parsing and
        # let the leftover-argument check below reject the command line.
        last;
    }
}

# Anything left over means the command line was not understood; -state and
# -taskrepo are mandatory.
if (@ARGV || !$args{DROPDIR} || !$args{TASKREPO})
{
    die "Insufficient parameters, use -h for help.\n";
}

TaskProfile->new(%args)->process();
|
48 |
|
|
|
49 |
|
|
######################################################################
|
50 |
|
|
# Routines specific to this agent.
|
51 |
|
|
package TaskProfile; use strict; use warnings; use base 'UtilsAgent';
|
52 |
|
|
use UtilsCommand;
|
53 |
|
|
use UtilsLogging;
|
54 |
|
|
use UtilsTiming;
|
55 |
|
|
|
56 |
|
|
# Constructor.  The object is built by the parent class constructor from the
# same named arguments; the options specific to this agent are then copied
# in, falling back to the default when an option is missing or false.
sub new
{
    my ($proto, @ctor_args) = @_;
    my $class = ref($proto) || $proto;
    my $self = $class->SUPER::new(@ctor_args);

    # Options specific to this agent, with their defaults.
    my %defaults = (TASKREPO => undef);   # task base directory
    my %given = (@ctor_args);
    foreach my $key (keys %defaults)
    {
        $self->{$key} = $given{$key} || $defaults{$key};
    }

    return bless($self, $class);
}
|
67 |
|
|
|
68 |
|
|
# Count the jobs found in the task repositories.
#
# Every entry directly under a TASKREPO directory is treated as a site, and
# every directory two levels below a site as a task.  For each task the
# per-job state codes are read from the CRAB work area (crab_*) and, for
# jobs that have been submitted, combined with the last status block in the
# task's JOB_STATUS_LOG.txt.
#
# Returns a hash reference of counts indexed as
#   {site}{application}{owner}{dataset}{status name}{exit code}
# where the exit code is the empty string when none is known.
sub getSiteStatus
{
    my ($self) = @_;
    my $result = {};

    # Long names for the one-letter state codes found in scripts.list.
    my %statusname = ('X' => 'Initial', 'C' => 'Created', 'N' => 'No Input',
                      'S' => 'Submitted', 'P' => 'Pre-retrieve', 'R' => 'Retrieved',
                      'A' => 'Aborted', 'K' => 'Cancelled');

    foreach my $site (map { (<$_/*>) } @{$self->{TASKREPO}})
    {
        my ($sitename) = ($site =~ m|.*/(.*)|);
        foreach my $taskdir (<$site/*/*>)
        {
            # Match components from name SC3.FNAL.ExSimHitStatistics.jm03b_qcd_20_30.jm_Hit245_2_g133.1
            my ($apptype, $owner, $dataset) = ($taskdir =~ m!.*/SC3\.[^.]+\.([^.]+)\.(\S+)\.([^.]+)\.\d+!);

            # A directory that does not follow the naming convention is still
            # counted, under empty application/owner/dataset fields; spell
            # that out instead of using undefined values as hash keys.
            ($apptype, $owner, $dataset) = ('', '', '') if ! defined $apptype;

            # Per-job scheduler ids and state lines from the CRAB work area,
            # if the task has one.  Both lists are indexed by job position.
            my $crabdir = (<$taskdir/crab_*>)[0];
            my (@jobids, @jobstat);
            if ($crabdir)
            {
                @jobids = split(/\n/, input("$crabdir/log/scheduler_id.log") || '');
                @jobstat = split(/\n/, input("$crabdir/share/scripts.list") || '');
            }

            # Keep only what follows the last START marker in the status log
            # and index the reported status text by the id on each JOB line.
            my $crabinfo = input("$taskdir/JOB_STATUS_LOG.txt") || '';
            $crabinfo =~ s/.*\nSTART \d//s;
            my %crabstat = map { /^ JOB\s+\d+:\s+(\S+)\s+STATUS:\s+(.*)/ ? ($1 => $2) : () }
                           split(/\n/, $crabinfo);

            foreach my $i (0 .. $#jobstat)
            {
                # Determine job status according to CRAB and from log.
                my $status = (split(/ /, $jobstat[$i]))[1];

                # A line without a second field has no state to classify;
                # it was never counted, so just skip it quietly.
                next if ! defined $status;

                if ($status =~ /^[XCN]$/)
                {
                    # Not submitted yet: there is no exit code to report.
                    $result->{$sitename}{$apptype}{$owner}{$dataset}{$statusname{$status}}{""}++;
                }
                elsif ($status =~ /^[RPAKS]$/)
                {
                    # Figure out what last job status said
                    my $reported = $crabstat{$jobids[$i] || ''} || '';
                    my ($stat, $exit) = ($reported =~ /(\S+)(?:\s+EXIT_CODE:\s+(\d+))?/);
                    $exit = '' if ! defined $exit;

                    # If CRAB says it was aborted, trust it
                    $status = 'A' if (defined $stat && $stat eq 'Aborted');

                    # Jobs still marked as submitted are reported under the
                    # status word from the log when there is one.
                    my $label = $statusname{$status};
                    $label = $stat if (defined $stat && $status eq 'S');

                    # Now record.
                    $result->{$sitename}{$apptype}{$owner}{$dataset}{$label}{$exit}++;
                }
            }
        }
    }

    return $result;
}
|
129 |
|
|
|
130 |
|
|
# Rewrite the job status summary and then nap.
#
# Takes one snapshot of the job counts from getSiteStatus(), writes it as
# CSV (one row per site / application / owner / dataset / status / exit code
# combination, all stamped with the same time) to jobstatus.csv in the
# agent's state directory, and finally naps for WAITTIME.  The @pending
# arguments are accepted but not used.
sub idle
{
    my ($self, @pending) = @_;

    # One snapshot of the counts, and one timestamp shared by every row.
    my $stats = $self->getSiteStatus ();
    my $now = time();

    # Keys are visited in sorted order so that identical data always
    # produces an identical file rather than rows in random hash order.
    my $output = "Time,Site,App,Owner,Dataset,Status,Exitcode,Count\n";
    foreach my $site (sort keys %$stats) {
        my $by_app = $stats->{$site};
        foreach my $app (sort keys %$by_app) {
            my $by_owner = $by_app->{$app};
            foreach my $owner (sort keys %$by_owner) {
                my $by_dataset = $by_owner->{$owner};
                foreach my $ds (sort keys %$by_dataset) {
                    my $by_status = $by_dataset->{$ds};
                    foreach my $status (sort keys %$by_status) {
                        my $by_exit = $by_status->{$status};
                        foreach my $exit (sort keys %$by_exit) {
                            my $val = $by_exit->{$exit};
                            $output .= "$now,$site,$app,$owner,$ds,$status,$exit,$val\n";
                        }
                    }
                }
            }
        }
    }

    output("$self->{DROPDIR}/jobstatus.csv", $output);

    $self->nap ($self->{WAITTIME});
}
|