ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/JOBROBOT/TaskProfile
Revision: 1.1
Committed: Wed Oct 12 16:35:59 2005 UTC (19 years, 6 months ago) by lat
Branch: MAIN
Log Message:
Add simple agent to monitor job status

File Contents

# User Rev Content
1 lat 1.1 #!/usr/bin/env perl
2    
3     ##H This drop box agent reports status of submitted jobs by maintaining
4     ##H a simple CSV file. It maintains the counts by site, job type, the
5     ##H owner/dataset, job submission status, and job exit code.
6     ##H
7     ##H Usage:
8     ##H TaskProfile
9     ##H -state DIRECTORY [-next NEXT] [-wait SECS]
10     ##H -taskrepo DIRECTORY
11     ##H
12     ##H -state agent state directory, including inbox
13     ##H -next next agent to pass the drops to; can be given several times
14     ##H -wait time to wait in seconds between work scans
15     ##H -taskrepo directory with all the tasks in it
16    
# Strictures are enabled at file scope so that they cover the whole script.
# Inside the BEGIN block they would be lexically confined to that block and
# the argument parsing below would run unchecked.
use strict;
use warnings;
use File::Basename qw(basename dirname);

BEGIN {
    # Also switch on the global warnings flag.
    $^W = 1;

    # $me is the base name of this script; $home is the shared toolkit
    # directory, located relative to the directory the script lives in.
    # dirname() yields "." when $0 carries no directory part, so the
    # toolkit is still found when the script is started as "perl TaskProfile".
    our $me = basename($0);
    our $home = dirname($0) . "/../../Toolkit/Common";
    unshift(@INC, $home);
}

######################################################################
use UtilsHelp;

# Collect the command line options into %args. Each option that takes a
# value is only accepted when a value actually follows it; anything that
# is not recognised stops the scan and is reported as an error below.
my %args;
while (scalar @ARGV)
{
    if ($ARGV[0] eq '-state' && scalar @ARGV > 1)
    { shift (@ARGV); $args{DROPDIR} = shift(@ARGV); }
    elsif ($ARGV[0] eq '-next' && scalar @ARGV > 1)
    { shift (@ARGV); push (@{$args{NEXTDIR}}, shift(@ARGV)); }
    elsif ($ARGV[0] eq '-wait' && scalar @ARGV > 1)
    { shift (@ARGV); $args{WAITTIME} = shift(@ARGV); }
    elsif ($ARGV[0] eq '-taskrepo' && scalar @ARGV > 1)
    { shift (@ARGV); $args{TASKREPO} = shift(@ARGV); }
    elsif ($ARGV[0] eq '-h')
    # usage() comes from UtilsHelp (not shown here); presumably it prints
    # the ##H help text and exits -- TODO confirm.
    { &usage(); }
    else
    { last; }
}

# Left-over arguments, or a missing state directory or task repository,
# are fatal.
if (@ARGV || !$args{DROPDIR} || !$args{TASKREPO})
{
    die "Insufficient parameters, use -h for help.\n";
}

# Create the agent and hand control to its processing loop.
TaskProfile->new(%args)->process();
48    
49     ######################################################################
50     # Routines specific to this agent.
51     package TaskProfile; use strict; use warnings; use base 'UtilsAgent';
52     use UtilsCommand;
53     use UtilsLogging;
54     use UtilsTiming;
55    
# Constructor. The object is built by the base class constructor from the
# same arguments; the parameters specific to this agent are then copied in,
# falling back to the default when an argument is absent or false.
#   TASKREPO   task base directory (default: undef)
sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = $class->SUPER::new(@_);

    my %args = (@_);
    my %defaults = (TASKREPO => undef); # task base directory
    foreach my $key (keys %defaults)
    {
        $self->{$key} = $args{$key} || $defaults{$key};
    }

    return bless $self, $class;
}
67    
# Tally the jobs of every task found in the task repository.
#
# Scans $self->{TASKREPO}/<site>/*/<task>. For each task directory whose
# name looks like SC3.<x>.<app>.<owner>.<dataset>.<n> it reads:
#   <task>/crab_*/share/scripts.list    one line per job; the second
#                                       space-separated field is the status
#   <task>/crab_*/log/scheduler_id.log  one job id per line, paired with
#                                       scripts.list by line position
#   <task>/JOB_STATUS_LOG.txt           only the part after the last
#                                       "START <digit>" is used; lines of the
#                                       form " JOB <n>: <id> STATUS: <text>"
#
# Jobs with status X, C or N are counted with an empty exit code. Jobs with
# status R, P, A, K or S are looked up in the status log to obtain the exit
# code, and are counted as 'A' when the log says "Aborted". Any other status
# is not counted. Directories that do not match the task name pattern, and
# job lines without a status field, are skipped.
#
# Returns a hash reference of job counts:
#   $result->{SITE}{APP}{OWNER}{DATASET}{STATUS}{EXITCODE} = number of jobs
sub getSiteStatus
{
    my ($self) = @_;
    my $result = {};
    my $taskrepo = $self->{TASKREPO};
    foreach my $site (<$taskrepo/*>)
    {
        my ($sitename) = ($site =~ m|.*/(.*)|);
        foreach my $taskdir (<$site/*/*>)
        {
            # Match components from name SC3.FNAL.ExSimHitStatistics.jm03b_qcd_20_30.jm_Hit245_2_g133.1
            my ($apptype, $owner, $dataset)
                = ($taskdir =~ m!.*/SC3\.[^.]+\.([^.]+)\.(\S+)\.([^.]+)\.\d+!);

            # A failed match yields an empty list; without this guard such
            # directories would be counted under undefined keys.
            next if ! defined $dataset;

            # Without a CRAB working directory there is no job list to count.
            my $crabdir = (<$taskdir/crab_*>)[0];
            next if ! $crabdir;

            my @jobids = split(/\n/, &input("$crabdir/log/scheduler_id.log") || '');
            my @jobstat = split(/\n/, &input("$crabdir/share/scripts.list") || '');

            # Keep only the most recent status report and index it by job id.
            my $crabinfo = &input("$taskdir/JOB_STATUS_LOG.txt") || '';
            $crabinfo =~ s/.*\nSTART \d//s;
            my %crabstat = map { /^ JOB\s+\d+:\s+(\S+)\s+STATUS:\s+(.*)/ ? ($1 => $2) : () }
                           split(/\n/, $crabinfo);

            foreach my $i (0 .. $#jobstat)
            {
                # Determine job status according to CRAB and from log.
                my $status = (split(/ /, $jobstat[$i]))[1];
                next if ! defined $status;

                my $exit = '';
                if ($status =~ /^[RPAKS]$/)
                {
                    # Figure out what last job status said. The id list may
                    # be shorter than the job list, so the id can be missing.
                    my $jobid = $jobids[$i];
                    my $report = defined $jobid ? ($crabstat{$jobid} || '') : '';
                    my ($stat, $code) = ($report =~ /(\S+)(?:\s+EXIT_CODE:\s+(\d+))?/);
                    $exit = $code if defined $code;

                    # If CRAB says it was aborted, trust it
                    $status = 'A' if (defined $stat && $stat eq 'Aborted');
                }
                elsif ($status !~ /^[XCN]$/)
                {
                    # Unknown status letters are not counted.
                    next;
                }

                # Now record.
                $result->{$sitename}{$apptype}{$owner}{$dataset}{$status}{$exit}++;
            }
        }
    }

    return $result;
}
123    
# Work cycle of the agent: tally the jobs of all tasks and rewrite the
# CSV report, then nap.
#
# Arguments: the object, followed by a list (@pending) that is accepted
# but not used here.
#
# The report is handed to output() for "$self->{DROPDIR}/jobstatus.csv".
# It has one header line and one row per combination of site, application,
# owner, dataset, status and exit code; the last field of each row is the
# number of jobs in that combination. Rows are emitted in sorted key order
# so that successive reports are directly comparable.
sub idle
{
    my ($self, @pending) = @_;

    # Take one snapshot of all job counts and stamp every row with the
    # same time.
    my $stats = $self->getSiteStatus ();
    my $now = time();

    # One column name per row field, including the trailing job count.
    my $output = "Time,Site,App,Owner,Dataset,Status,Exitcode,Jobs\n";
    foreach my $site (sort keys %$stats) {
        my $apps = $stats->{$site};
        foreach my $app (sort keys %$apps) {
            my $owners = $apps->{$app};
            foreach my $owner (sort keys %$owners) {
                my $datasets = $owners->{$owner};
                foreach my $ds (sort keys %$datasets) {
                    my $statuses = $datasets->{$ds};
                    foreach my $status (sort keys %$statuses) {
                        my $exits = $statuses->{$status};
                        foreach my $exit (sort keys %$exits) {
                            my $val = $exits->{$exit};
                            $output .= "$now,$site,$app,$owner,$ds,$status,$exit,$val\n";
                        }
                    }
                }
            }
        }
    }

    # NOTE(review): the result of output() is not checked; its failure
    # behaviour is defined in UtilsCommand, not shown here -- confirm.
    &output ("$self->{DROPDIR}/jobstatus.csv", $output);

    $self->nap ($self->{WAITTIME});
}