#!/usr/bin/env perl

##H This drop box agent reports status of submitted jobs by maintaining
##H a simple CSV file. It maintains the counts by site, job type, the
##H owner/dataset, job submission status, and job exit code.
##H
##H Usage:
##H TaskProfile
##H -state DIRECTORY [-next NEXT] [-wait SECS]
##H -taskrepo DIRECTORY
##H
##H -state agent state directory, including inbox
##H -next next agent to pass the drops to; can be given several times
##H -wait time to wait in seconds between work scans
##H -taskrepo directory with all the tasks in it
|
16 |
|
17 |
BEGIN {
    # These pragmas are lexically scoped to this BEGIN block only; $^W
    # additionally switches warnings on globally.
    use strict; use warnings; $^W=1;

    # Core module, loaded without importing anything into this namespace.
    use File::Basename ();

    # Program name with any leading directory removed.
    our $me = File::Basename::basename($0);

    # Module directory, located relative to the directory this script was
    # started from, and put at the front of the module search path.
    # dirname() yields "." for a bare script name and "/" for a script in
    # the root directory; stripping the last path component with a regex
    # left the script name itself in place whenever $0 contained no slash.
    # NOTE(review): presumably this is where the Utils* modules used below
    # live -- confirm against the installation layout.
    our $home = File::Basename::dirname($0) . "/../PHEDEX/Toolkit/Common";
    unshift(@INC, $home);
}
|
23 |
|
24 |
######################################################################
|
25 |
use UtilsHelp;
|
26 |
# Command line handling and agent start-up.
#
# The strictures enabled inside the BEGIN block above do not reach this
# code, so they are enabled here and the option hash is declared lexically.
use strict; use warnings;

# Collected options:
#   DROPDIR   value of -state
#   NEXTDIR   array of all -next values
#   WAITTIME  value of -wait
#   TASKREPO  array of all -taskrepo values
my %args;
while (scalar @ARGV)
{
    if ($ARGV[0] eq '-state' && scalar @ARGV > 1)
    { shift (@ARGV); $args{DROPDIR} = shift(@ARGV); }
    elsif ($ARGV[0] eq '-next' && scalar @ARGV > 1)
    { shift (@ARGV); push (@{$args{NEXTDIR}}, shift(@ARGV)); }
    elsif ($ARGV[0] eq '-wait' && scalar @ARGV > 1)
    { shift (@ARGV); $args{WAITTIME} = shift(@ARGV); }
    elsif ($ARGV[0] eq '-taskrepo' && scalar @ARGV > 1)
    { shift (@ARGV); push(@{$args{TASKREPO}}, shift(@ARGV)); }
    elsif ($ARGV[0] eq '-h')
    {
        # NOTE(review): usage() is presumably provided by UtilsHelp and is
        # expected to terminate the program.  "-h" is not shifted off @ARGV,
        # so exit explicitly rather than loop forever should it ever return.
        usage();
        exit(1);
    }
    else
    { last; }
}

# Anything left over is an unrecognised argument (or an option missing its
# value); -state and -taskrepo are mandatory.
if (@ARGV || !$args{DROPDIR} || !$args{TASKREPO})
{
    die "Insufficient parameters, use -h for help.\n";
}

TaskProfile->new(%args)->process();
|
48 |
|
49 |
######################################################################
|
50 |
# Routines specific to this agent.
|
51 |
package TaskProfile; use strict; use warnings; use base 'UtilsAgent';
|
52 |
use UtilsCommand;
|
53 |
use UtilsLogging;
|
54 |
use UtilsTiming;
|
55 |
|
56 |
# Constructor.  The object is first built by the parent class constructor
# from the same argument list; this agent's own parameters are then copied
# from the name/value arguments, falling back to the default listed below
# when an argument is absent or false.
sub new
{
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my $self = $class->SUPER::new(@_);

    # Parameters owned by this agent, with their default values.
    my %defaults = (TASKREPO => undef); # task base directory
    my %given = (@_);
    foreach my $name (keys %defaults)
    {
        $self->{$name} = $given{$name} || $defaults{$name};
    }

    return bless($self, $class);
}
|
67 |
|
68 |
# Tally the jobs found in the task repositories.
#
# Every path matching <TASKREPO entry>/*/* is treated as a site, named after
# its last path component, and every path matching <site>/*/* beneath it as
# a task directory.  For each task the per-job state letters are read from
# the "share/scripts.list" file of the first crab_* subdirectory, the job
# ids from its "log/scheduler_id.log", and the most recently logged job
# status from the task's JOB_STATUS_LOG.txt.
#
# Returns a hash reference holding job counts, keyed as
#   {site}{application}{owner}{dataset}{status name}{exit code}
# where the exit code is '' whenever none is known.
sub getSiteStatus
{
    my ($self) = @_;
    my $result = {};
    my %statusname = ('X' => 'Initial', 'C' => 'Created', 'N' => 'No Input',
                      'S' => 'Submitted', 'P' => 'Pre-retrieve', 'R' => 'Retrieved',
                      'A' => 'Aborted', 'K' => 'Cancelled');

    foreach my $site (map { glob("$_/*/*") } @{$self->{TASKREPO}})
    {
        my ($sitename) = ($site =~ m|.*/(.*)|);
        foreach my $taskdir (glob("$site/*/*"))
        {
            # Match components from a name of the form
            # SC3.<name>.<application>.<owner>.<dataset>.<number>
            # NOTE(review): the example previously quoted here,
            # "FNAL.ExSimHitStatistics.jm03b_qcd_20_30.jm_Hit245_2_g133.1",
            # has no "SC3." prefix, and its 3rd/4th components look like
            # dataset/owner rather than owner/dataset -- confirm the order.
            my ($apptype, $owner, $dataset) = ($taskdir =~ m!.*/SC3\.[^.]+\.([^.]+)\.(\S+)\.([^.]+)\.\d+!);

            # A directory that does not follow the naming scheme is still
            # counted, under empty keys, exactly as undefined keys would be;
            # making that explicit avoids "uninitialized value" warnings on
            # every hash access below.
            $apptype = '' if ! defined $apptype;
            $owner   = '' if ! defined $owner;
            $dataset = '' if ! defined $dataset;

            # Without a crab_* directory there is no job list, hence
            # nothing to count for this task.
            my $crabdir = (glob("$taskdir/crab_*"))[0];
            next if ! $crabdir;

            # NOTE(review): the two files are assumed to list the jobs in
            # the same order, as line $i of one is paired with line $i of
            # the other -- confirm.
            my @jobids  = split(/\n/, input("$crabdir/log/scheduler_id.log") || '');
            my @jobstat = split(/\n/, input("$crabdir/share/scripts.list") || '');
            next if ! @jobstat;

            # Keep only what follows the last "START <digit>" marker in the
            # status log, then map each job id to its status text.
            my $crabinfo = input("$taskdir/JOB_STATUS_LOG.txt") || '';
            $crabinfo =~ s/.*\nSTART \d//s;
            my %crabstat = map { /^ JOB\s+\d+:\s+(\S+)\s+STATUS:\s+(.*)/ ? ($1 => $2) : () }
                           split(/\n/, $crabinfo);

            foreach my $i (0 .. $#jobstat)
            {
                # The state letter is the second space-separated field;
                # ignore lines that do not have one.
                my $status = (split(/ /, $jobstat[$i]))[1];
                next if ! defined $status;

                my ($name, $exit);
                if ($status =~ /^[XCN]$/)
                {
                    # Not submitted yet: the job list alone decides.
                    $name = $statusname{$status};
                    $exit = '';
                }
                elsif ($status =~ /^[RPAKS]$/)
                {
                    # Figure out what the last logged job status said.
                    my $lastinfo = $crabstat{$jobids[$i] || ''} || '';
                    my ($stat, $code) = ($lastinfo =~ /(\S+)(?:\s+EXIT_CODE:\s+(\d+))?/);
                    $exit = defined $code ? $code : '';

                    # If CRAB says it was aborted, trust it.
                    $status = 'A' if (defined $stat && $stat eq 'Aborted');
                    $name = $statusname{$status};

                    # For submitted jobs report the logged status word
                    # itself rather than the generic "Submitted".
                    $name = $stat if (defined $stat && $status eq 'S');
                }
                else
                {
                    # Unknown state letters are not counted.
                    next;
                }

                # Now record.
                $result->{$sitename}{$apptype}{$owner}{$dataset}{$name}{$exit}++;
            }
        }
    }

    return $result;
}
|
129 |
|
130 |
# Write the current job statistics to "<DROPDIR>/jobstatus.csv" and then
# nap for WAITTIME.
#
# The file has a header line followed by one line per distinct combination
# of site, application, owner, dataset, status and exit code, each stamped
# with the time of this scan.  Rows are emitted in sorted key order so that
# successive scans produce stably ordered files instead of following hash
# order.
#
# NOTE(review): presumably called by the UtilsAgent processing loop with the
# list of pending drops; @pending is not used here.
sub idle
{
    my ($self, @pending) = @_;

    # Collect all counts in one pass so that every row of the file comes
    # from the same scan and carries the same time stamp.
    my $stats = $self->getSiteStatus ();
    my $now = time();

    my $output = "Time,Site,App,Owner,Dataset,Status,Exitcode,Count\n";
    foreach my $site (sort keys %$stats)
    {
        my $apps = $stats->{$site};
        foreach my $app (sort keys %$apps)
        {
            my $owners = $apps->{$app};
            foreach my $owner (sort keys %$owners)
            {
                my $datasets = $owners->{$owner};
                foreach my $ds (sort keys %$datasets)
                {
                    my $statuses = $datasets->{$ds};
                    foreach my $status (sort keys %$statuses)
                    {
                        my $exits = $statuses->{$status};
                        foreach my $exit (sort keys %$exits)
                        {
                            my $val = $exits->{$exit};
                            $output .= "$now,$site,$app,$owner,$ds,$status,$exit,$val\n";
                        }
                    }
                }
            }
        }
    }

    # NOTE(review): the result of output() is not checked -- confirm whether
    # it reports write failures and whether they should be logged.
    output ("$self->{DROPDIR}/jobstatus.csv", $output);

    $self->nap ($self->{WAITTIME});
}
|