[ViewVC] Annotation of: cvsroot/COMP/JOBROBOT/TaskSource

#!/usr/bin/env perl

##H This drop box agent initiates tasks on all published datasets.
##H A task is an application to run on a dataset at a particular site.
##H It keeps a record of tasks created and avoids submitting a task too
##H frequently.
##H
##H Usage:
##H   TaskSource
##H      -state DIRECTORY [-next NEXT] [-wait SECS] [-url URL-PUBLISHED]
##H      [-ignore-sites REGEXP] [-accept-sites REGEXP]
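##H      [-secs-per-event N] [-max-site-queue N]
##H      [-dataset REGEXP] [-owner REGEXP] [-events N] [-mode MODE]
##H      [-scheduler NAME] [-jobtype NAME]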
##H
##H -state     agent state directory, including inbox
##H -next      next agent to pass the drops to; can be given several times
##H -wait      time to wait in seconds between work scans
##H -url       contact string for published datasets
##H -ignore-sites
##H            regular expression for sites to ignore; ignore applies before
##H             accept, and by default nothing is ignored and everything is
##H             accepted.  applies to pubdb site names, not the host names of
##H             the site.
##H -accept-sites
##H            regular expression for sites to accept; ignore applies before
##H             accept, and by default nothing is ignored and everything is
##H             accepted.  applies to pubdb site names, not the host names of
##H             the site.
##H -secs-per-event
##H            minimum time to allocate for a task, given a number of events
##H             in the dataset to process.  tasks are not created more often
##H             than this.
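##H -max-site-queue
##H            the high water mark of currently submitted jobs to a site,
##H             above which new tasks will not be created.
##H -dataset   regular expression for dataset names; only matching datasets
##H             are processed.  by default all datasets are processed.
##H -owner     regular expression for dataset owners; only matching owners
##H             are processed.  by default all owners are processed.
##H -events    number of events per job; the default is 500.
##H -mode      data discovery mode: 1 for PubDB/RefDB (the default), 2 for
##H             DBS/DLS.
##H -scheduler scheduler name passed on to CrabJobs; the default is "edg".
##H -jobtype   job type passed on to CrabJobs; the default is "orca".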

# Work out the script's own name and the directory holding the shared
# PHEDEX utility modules (relative to the script location), and put that
# directory first on the module search path before any "use" below runs.
BEGIN {
  # NOTE: strict/warnings declared here are lexically scoped to this block;
  # $^W switches warnings on globally.
  use strict; use warnings; $^W=1;
  our $me = $0; $me =~ s|.*/||;
  # Strip the script name to get its directory.  When $0 carries no
  # directory part at all (e.g. "perl TaskSource") nothing is stripped,
  # so fall back to the current directory instead of treating the script
  # name itself as a directory.
  our $home = $0;
  unless ($home =~ s|/[^/]+$||) { $home = "."; }
  $home ||= "."; $home .= "/../PHEDEX/Toolkit/Common";
  unshift(@INC, $home);
}

######################################################################
use UtilsHelp;
# Built-in defaults; anything given on the command line overrides them.
my %args = (WAITTIME => 600, SECS_PER_EVENT => 1., MAX_SITE_QUEUE => 10,
            URL => "http://cmsdoc.cern.ch/cms/production/www/PubDB/GetPublishedCollectionInfoFromRefDB.php");

# Options taking exactly one value, and the agent argument each one sets.
my %value_option = ('-state'          => 'DROPDIR',
                    '-wait'           => 'WAITTIME',
                    '-ignore-sites'   => 'IGNORE_REGEXP',
                    '-accept-sites'   => 'ACCEPT_REGEXP',
                    '-secs-per-event' => 'SECS_PER_EVENT',
                    '-max-site-queue' => 'MAX_SITE_QUEUE',
                    '-url'            => 'URL',
                    '-dataset'        => 'DATASET',
                    '-owner'          => 'OWNER',
                    '-events'         => 'NEVENT',
                    '-mode'           => 'MODE',
                    '-scheduler'      => 'SCHEDULER',
                    '-jobtype'        => 'JOBTYPE');

# Consume options from the front of @ARGV.  Parsing stops at the first
# word that is not a known option, or at an option missing its value;
# whatever is left over is reported as an error below.
while (@ARGV)
{
    my $opt = $ARGV[0];
    if ($opt eq '-h')
    {
        &usage();
    }
    elsif (@ARGV < 2)
    {
        last;
    }
    elsif ($opt eq '-next')
    {
        # May be given several times; the values accumulate.
        shift (@ARGV);
        push (@{$args{NEXTDIR}}, shift (@ARGV));
    }
    elsif (exists $value_option{$opt})
    {
        shift (@ARGV);
        $args{$value_option{$opt}} = shift (@ARGV);
    }
    else
    {
        last;
    }
}

# Unparsed words, a missing state directory or a missing URL are fatal.
die "Insufficient parameters, use -h for help.\n"
    if (@ARGV || !$args{DROPDIR} || !$args{URL});

# Build the agent and hand control over to it.
TaskSource->new (%args)->process();

######################################################################
# Routines specific to this agent.
package TaskSource; use strict; use warnings; use base 'UtilsAgent';
use File::Path;
use UtilsCommand;
use UtilsLogging;
use UtilsTiming;
use UtilsNet;
use POSIX;

# Constructor.  Lets the base class build the object from the given
# arguments, then fills in the settings specific to this agent: each one
# is taken from the arguments when defined there, and from the built-in
# default otherwise.
sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self = $class->SUPER::new(@_);
    my %given = (@_);

    my %defaults = (SECS_PER_EVENT => undef,   # secs/event to delay per dataset
                    MAX_SITE_QUEUE => undef,   # max number of jobs per site
                    IGNORE_REGEXP  => undef,   # regexp of sites to ignore
                    ACCEPT_REGEXP  => undef,   # regexp of sites to accept
                    DATASET        => undef,   # specific dataset
                    OWNER          => undef,   # specific owner
                    NEVENT         => 500,     # number of events per job
                    MODE           => 1,       # data discovery mode: (1) PubDB/RefDB, (2) DBS/DLS
                    JOBTYPE        => "orca",  # standard jobtype
                    SCHEDULER      => "edg",   # standard scheduler
                    URL            => undef);  # published dataset url

    foreach my $key (keys %defaults)
    {
        if (defined $given{$key})
        {
            $self->{$key} = $given{$key};
        }
        else
        {
            $self->{$key} = $defaults{$key};
        }
    }

    return bless ($self, $class);
}

# Prepare the agent for work:
#  - record the task repository location ("tasks" under the drop box
#    directory) in TASKREPO and create the directory if it is missing;
#  - probe whether the local "links" program knows -dump-width and, if
#    so, remember to pass "-dump-width 300" (kept in LINKS_OPTS);
#  - set up the hard-coded site whitelist (WHITELIST).
# Dies if the task repository does not exist and cannot be created.
sub init
{
    my ($self) = @_;
    my $repo = $self->{TASKREPO} = "$self->{DROPDIR}/tasks";

    # mkdir is a list operator: without the parentheses a trailing
    # "|| die" would bind to its argument and the failure would go
    # unnoticed.
    -d $repo
        or mkdir ($repo)
        or die  "$repo: cannot create directory: $!\n";

    # Determine if links supports -dump-width option.  The probe is best
    # effort: if it cannot be run at all, no extra options are used.
    $self->{LINKS_OPTS} = [];
    if (open (my $help, '-|', "links -help 2>/dev/null"))
    {
        if (grep (/-dump-width/, <$help>)) {
            push(@{$self->{LINKS_OPTS}}, qw(-dump-width 300));
        }
        close ($help);
    }

    # Precode whitelist.  Should really read this from somewhere...
    # Maps a site name to the whitelist value handed to CrabJobs.
    $self->{WHITELIST} = { ASCC => "sinica.edu.tw",
                           NCU  => "ncu.edu.tw",
                           FNAL => "fnal.gov",
                           CNAF => "webserver.infn.it",
                           BA   => "ba.infn.it",
                           IN2P3=> "in2p3.fr",
                           PIC  => "pic.es",
                           T2_SP=> "ciemat.es",
                           RAL  => "ral.ac.uk",
                           CERN => "cern.ch",
                           FZK  => "fzk.de",
                           DESY => "desy.de",
                           NEBR => "unl.edu",
                           WISC => "wisc.edu",
                           UFL => "ufl.edu",
                           PURDUE => "purdue.edu",
                           UCSD => "ucsd.edu",
                           CALT => "caltech.edu"
                         };
  }

# Find out how many jobs are pending for each site.  This is
# insensitive to the job type, and we only check once in the
# beginning to avoid favouring one dataset over another --
# once we decide to proceed for a site, we submit jobs for
# all datasets.
#
# Walks the task repository, laid out as <date>/<site>/<app>/<task>,
# and returns a hash: site name => { status => count }.  A task
# directory without JOB_CREATE_LOG.txt is counted once under status
# "C".  Otherwise every line of the task's crab_*/share/db/jobs file
# contributes one count to the status found in its second
# ";"-separated field.
sub getSiteStatus
{
    my ($self) = @_;
    my %result = ();
    my $taskrepo = $self->{TASKREPO};
    foreach my $site (<$taskrepo/*/*>)
    {
        my ($sitename) = ($site =~ m|.*/(.*)|);
        foreach my $d (<$site/*/*>)
        {
            if (! -f "$d/JOB_CREATE_LOG.txt")
            {
                $result{$sitename}{C}++;
                next;
            }

            # Only the first matching jobs file is consulted.
            my $f = (<$d/crab_*/share/db/jobs>)[0];
            next if ! defined $f;

            foreach my $record (split(/\n/, &input($f) || ''))
            {
                # Blank or malformed lines carry no status field; counting
                # them would create a meaningless empty status.
                my $state = (split(/;/, $record))[1];
                next if ! defined $state || $state eq '';
                $result{$sitename}{$state}++;
            }
        }
    }

    return %result;
}

# One work cycle of the agent: log the current per-site load, obtain the
# list of published datasets and offer each one to createTask, then nap
# for WAITTIME seconds.  Any arguments after the object are not used.
#
# The dataset list comes from the DBSlistDataset.py helper when MODE is 2
# (one "owner/dataset/site/events" record per line), and otherwise from a
# "links -dump" of the configured URL.  Errors raised while scanning are
# caught and reported through alert; they end the current scan only.
sub idle
  {
    my ($self) = @_;
    eval {
      # Get status of how busy the sites are.  We obtain this only once
      # in order to not favour datasets "early on" in the list.
      my %sitestats = $self->getSiteStatus ();
      if (keys %sitestats)
        {
          my @load;
          foreach my $site (sort keys %sitestats)
            {
              push (@load, "$site" . join("", map { " $_=$sitestats{$site}{$_}" }
                                          sort keys %{$sitestats{$site}}));
            }
          &logmsg ("current load: ", join ("; ", @load));
        }

      # NOTE: createTask must remain the last statement of each loop body
      # below, so that skipping a record inside it simply moves on to the
      # next record.
      if ( $self->{MODE} == 2 ) {
        &logmsg ("DBS/DLS mode");
        my $cmd = "python2.2 /data/CrabV1/COMP/JOBROBOT/DBSlistDataset.py";
        my $out = `$cmd`;
        # Backticks yield undef when the command could not be started.
        die "cannot run `$cmd': $!\n" if ! defined $out;
        foreach my $line (split(/\n/, $out))
          {
            &timeStart($self->{STARTTIME});
            # Find out what was published and what we would like to do with it
            my ($owner, $dataset, $site, $events) = split(/\//, $line);
            $self->createTask($dataset, $owner, $events, $site, 2, %sitestats);
          }
      } else {
        # Invoke links to fetch a formatted web page of published datasets.
        &logmsg ("RefDB/PubDB mode");
        my $cmd = "links @{$self->{LINKS_OPTS}} -dump '$self->{URL}'";
        # A lexical handle is closed automatically when the scan is
        # abandoned because createTask died, instead of staying open.
        open (my $published, '-|', "$cmd 2>/dev/null")
          or die "cannot run `$cmd': $!\n";
        while (my $line = <$published>)
          {
            &timeStart($self->{STARTTIME});
            # Only lines containing an underscore are treated as dataset
            # rows; table bars become blanks before splitting into columns.
            chomp ($line); next if $line !~ /_/;
            $line =~ s/\|/ /g; $line =~ s/^\s+//; $line =~ s/\s+$//;
            # Find out what was published and what we would like to do with it
            my ($dataset, $owner, $events, $site) = split(/\s+/, $line);
            $self->createTask($dataset, $owner, $events, $site, 1, %sitestats);
          }
        close ($published);
      }
    };
    do { chomp ($@); &alert ($@); } if $@;

    $self->nap ($self->{WAITTIME});
  }

# Create a new task for one published dataset at one site, unless the
# record is filtered out, the site is busy, or the previous task for the
# same dataset is still too recent.
#
# Arguments:
#   $dataset, $owner  dataset name and owner as published
#   $events           number of events in the dataset; together with
#                     SECS_PER_EVENT it sets the minimum age of the
#                     previous task before a new one is created
#   $site             site name, as used in the whitelist and site filters
#   $mode             data discovery mode, passed on to CrabJobs
#   %sitestats        per-site status counts as returned by getSiteStatus
#
# Returns nothing.  Dies if CrabJobs fails or the drop directory cannot
# be created.
sub createTask
  {
    my ($self, $dataset, $owner, $events, $site, $mode, %sitestats) = @_;

    # Incomplete records cannot be turned into a task.
    return if grep { ! defined $_ } ($dataset, $owner, $events, $site);

    # Optional restrictions to particular datasets, owners and sites.
    return if ($self->{DATASET} && $dataset !~ /$self->{DATASET}/);
    return if ($self->{OWNER} && $owner !~ /$self->{OWNER}/);
    my ($app, $tiers, $rc, $output);
    my $whitelist = $self->{WHITELIST}->{$site} || '.';
    return if ($self->{IGNORE_REGEXP} && $site =~ /$self->{IGNORE_REGEXP}/);
    return if ($self->{ACCEPT_REGEXP} && $site !~ /$self->{ACCEPT_REGEXP}/);

    # Pick the application to run from the kind of data.
    if ($dataset =~ /MBforPU/) {
      return;
    } elsif ($owner =~ /Hit/) {
      $rc = "orcarc.read.simhits";
      $app = "ExSimHitStatistics";
      $tiers = "Hit";
      $output = "simhits.aida";
    } elsif ($owner =~ /DST/) {
      # DST owners are currently skipped altogether.  The settings that
      # used to follow here were orcarc.read.dst / ExDSTStatistics /
      # dststatistics.aida with tiers "DST,Digi,Hit".
      return;
    } else {
      # rand(1) is always below 1, so the ROOT analysis variant is never
      # chosen at present and the statistics application always runs.
      if (rand(1) > 1.75) {
        $rc = "orcarc.root.digis";
        $app = "ExRootAnalysisDigi";
        $output = "test.root";
      } else {
        $rc = "orcarc.read.digis";
        $app = "ExDigiStatistics";
        $output = "digistatistics.aida";
      }
      $tiers = "Digi,Hit";
    }

    # Find out what is already pending for this task.  First find all
    # existing tasks in the repository, the latest generation.
    my $datestamp = strftime ("%y%m%d", gmtime(time()));
    my $taskdir = "$self->{TASKREPO}/$datestamp/$site/$app";
# OLI: shorten path for condor_g (restriction to 256 characters)
#    my $taskbase = "$datestamp.$site.$app.$dataset.$owner";
    my $taskbase = "$dataset.$owner";
    my @existing = sort { $a <=> $b } map { /.*\.(\d+)$/ } <$taskdir/$taskbase.*>;
    my $curgen = pop(@existing) || 0;
    my $nextgen = $curgen + 1;

    # Do nothing if the site is too busy already: submitted (S) jobs plus
    # tasks still in creation (C) exceed the high water mark.
    my $pending = ($sitestats{$site}{S} || 0);
    $pending += ($sitestats{$site}{C} || 0);
    return if $pending > $self->{MAX_SITE_QUEUE};

    # OK to create the task if enough time has passed from previous
    # task creation, or there is no previous task.
    if (! -f "$taskdir/$taskbase.$curgen/crab.cfg"
        || (((stat("$taskdir/$taskbase.$curgen/crab.cfg"))[9]
             < time() - $events * $self->{SECS_PER_EVENT})))
      {
        # Directory of this script; CrabJobs and the orcarc files live
        # next to it.  Without a directory part in $0 that is the
        # current directory.
        my $mydir = $0;
        unless ($mydir =~ s|/[^/]+$||) { $mydir = "."; }
        my $drop = sprintf("%s.%03d", $taskbase, $nextgen);
        my $ret = &runcmd ("$mydir/CrabJobs", "-app", $app,
                           "-jobevents", $self->{NEVENT},
                           "-orcarc", "$mydir/$rc",
                           "-owner", $owner,
                           "-dataset", $dataset,
                           "-tiers", $tiers,
                           "-whitelist", $whitelist,
                           "-name", "$taskdir/$drop",
                           "-output", $output,
                           "-jobtype", $self->{JOBTYPE},
                           "-scheduler", $self->{SCHEDULER},
                           "-mode", $mode);
        die "$drop: failed to create task: @{[&runerror($ret)]}\n" if $ret;

        &output ("$taskdir/$drop/TASK_INIT.txt",
                 &mytimeofday () . "\n");

        # Create the drop that announces the task to the next agent.
        # mkdir needs its parentheses: as a list operator it would
        # otherwise swallow a trailing "|| die" into its argument.
        my $dropdir = "$self->{WORKDIR}/$drop";
        -d $dropdir
          or mkdir ($dropdir)
          or die "$dropdir: cannot create: $!\n";
        if (&output ("$dropdir/task", "$taskdir/$drop"))
          {
            &touch ("$dropdir/done");
            $self->relayDrop ($drop);
            &logmsg("stats: $drop @{[&formatElapsedTime($self->{STARTTIME})]} success");
          }
        else
          {
            &alert ("$drop: failed to create drop");
            &rmtree ([ $dropdir ]);
          }
      }
  }
Revision:	1.1
Committed:	Mon Apr 10 16:38:01 2006 UTC (19 years ago) by gutsche
Branch:	MAIN
CVS Tags:	JOBROBOT_1_0001
Log Message:	inital commit of CRAB Version 1 JobRobot
#	User	Rev	Content
1	gutsche	1.1	#!/usr/bin/env perl
2
3			##H This drop box agent initiates tasks on all published datasets.
4			##H A task is an application to run on a dataset at a particular site.
5			##H It keeps a record of tasks created and avoids submitting a task too
6			##H frequently.
7			##H
8			##H Usage:
9			##H TaskSource
10			##H -state DIRECTORY [-next NEXT] [-wait SECS] [-url URL-PUBLISHED]
11			##H [-ignore-sites REGEXP] [-accept-sites REGEXP]
12			##H [-secs-per-event N] [-max-site-queue N]
13			##H
14			##H -state agent state directory, including inbox
15			##H -next next agent to pass the drops to; can be given several times
16			##H -wait time to wait in seconds between work scans
17			##H -url contact string for published datasets
18			##H -ignore-sites
19			##H regular expression for sites to ignore; ignore applies before
20			##H accept, and by default nothing is ignored and everything is
21			##H accepted. applies to pubdb site names, not the host names of
22			##H the site.
23			##H -accept-sites
24			##H regular expression for sites to accept; ignore applies before
25			##H accept, and by default nothing is ignored and everything is
26			##H accepted. applies to pubdb site names, not the host names of
27			##H the site.
28			##H -secs-per-event
29			##H minimum time to allocate for a task, given a number of events
30			##H in the dataset to process. tasks are not created more often
31			##H than this.
32			##H -max-site-queue
33			##H the high water mark of currently submitted jobs to a site,
34			##H above which new tasks will not be created.
35
36			BEGIN {
37			use strict; use warnings; $^W=1;
38			our $me = $0; $me =~ s\|.*/\|\|;
39			our $home = $0; $home =~ s\|/[^/]+$\|\|; $home \|\|= "."; $home .= "/../PHEDEX/Toolkit/Common";
40			unshift(@INC, $home);
41			}
42
43			######################################################################
44			use UtilsHelp;
45			my %args = (WAITTIME => 600, SECS_PER_EVENT => 1., MAX_SITE_QUEUE => 10,
46			URL => "http://cmsdoc.cern.ch/cms/production/www/PubDB/GetPublishedCollectionInfoFromRefDB.php");
47			while (scalar @ARGV)
48			{
49			if ($ARGV[0] eq '-state' && scalar @ARGV > 1)
50			{ shift (@ARGV); $args{DROPDIR}= shift(@ARGV); }
51			elsif ($ARGV[0] eq '-next' && scalar @ARGV > 1)
52			{ shift (@ARGV); push (@{$args{NEXTDIR}}, shift(@ARGV)); }
53			elsif ($ARGV[0] eq '-wait' && scalar @ARGV > 1)
54			{ shift (@ARGV); $args{WAITTIME} = shift(@ARGV); }
55			elsif ($ARGV[0] eq '-ignore-sites' && scalar @ARGV > 1)
56			{ shift (@ARGV); $args{IGNORE_REGEXP} = shift(@ARGV); }
57			elsif ($ARGV[0] eq '-accept-sites' && scalar @ARGV > 1)
58			{ shift (@ARGV); $args{ACCEPT_REGEXP} = shift(@ARGV); }
59			elsif ($ARGV[0] eq '-secs-per-event' && scalar @ARGV > 1)
60			{ shift (@ARGV); $args{SECS_PER_EVENT} = shift(@ARGV); }
61			elsif ($ARGV[0] eq '-max-site-queue' && scalar @ARGV > 1)
62			{ shift (@ARGV); $args{MAX_SITE_QUEUE} = shift(@ARGV); }
63			elsif ($ARGV[0] eq '-url' && scalar @ARGV > 1)
64			{ shift (@ARGV); $args{URL} = shift(@ARGV); }
65			# Marco
66			elsif ($ARGV[0] eq '-dataset' && scalar @ARGV > 1)
67			{ shift (@ARGV); $args{DATASET} = shift(@ARGV); }
68			elsif ($ARGV[0] eq '-owner' && scalar @ARGV > 1)
69			{ shift (@ARGV); $args{OWNER} = shift(@ARGV); }
70			elsif ($ARGV[0] eq '-events' && scalar @ARGV > 1)
71			{ shift (@ARGV); $args{NEVENT} = shift(@ARGV); }
72			elsif ($ARGV[0] eq '-mode' && scalar @ARGV > 1)
73			{ shift (@ARGV); $args{MODE} = shift(@ARGV); }
74			elsif ($ARGV[0] eq '-scheduler' && scalar @ARGV > 1)
75			{ shift (@ARGV); $args{SCHEDULER} = shift(@ARGV); }
76			elsif ($ARGV[0] eq '-jobtype' && scalar @ARGV > 1)
77			{ shift (@ARGV); $args{JOBTYPE} = shift(@ARGV); }
78			# Marco
79			elsif ($ARGV[0] eq '-h')
80			{ &usage(); }
81			else
82			{ last; }
83			}
84
85			if (@ARGV \|\| !$args{DROPDIR} \|\| !$args{URL})
86			{
87			die "Insufficient parameters, use -h for help.\n";
88			}
89
90			(new TaskSource (%args))->process();
91
92			######################################################################
93			# Routines specific to this agent.
94			package TaskSource; use strict; use warnings; use base 'UtilsAgent';
95			use File::Path;
96			use UtilsCommand;
97			use UtilsLogging;
98			use UtilsTiming;
99			use UtilsNet;
100			use POSIX;
101
102			sub new
103			{
104			my $proto = shift;
105			my $class = ref($proto) \|\| $proto;
106			my $self = $class->SUPER::new(@_);
107			my %params = (SECS_PER_EVENT => undef, # secs/event to delay per dataset
108			MAX_SITE_QUEUE => undef, # max number of jobs per site
109			IGNORE_REGEXP => undef, # regexp of sites to ignore
110			ACCEPT_REGEXP => undef, # regexp of sites to accept
111			DATASET => undef, # specific dataset
112			OWNER => undef, # specific owner
113			NEVENT => 500, # number of events per job
114			MODE => 1, # data discovery mode: (1) PudDB/RefDB, (2) DBS/DLS
115			JOBTYPE => "orca", # standard jobtype
116			SCHEDULER => "edg", # standard scheduler
117			URL => undef); # published dataset url
118			my %args = (@_);
119			map { $self->{$_} = defined $args{$_} ? $args{$_} : $params{$_} } keys %params;
120			bless $self, $class;
121			return $self;
122			}
123
124			sub init
125			{
126			my ($self) = @_;
127			$self->{TASKREPO} = "$self->{DROPDIR}/tasks";
128			-d "$self->{TASKREPO}"
129			\|\| mkdir "$self->{TASKREPO}"
130			\|\| die "$self->{TASKREPO}: cannot create directory: $!\n";
131
132			# Determine if links supports -dump-width option
133			$self->{LINKS_OPTS} = [];
134			open (LINKS_HELP, "links -help 2>/dev/null \|");
135			if (grep (/-dump-width/, <LINKS_HELP>)) {
136			push(@{$self->{LINKS_OPTS}}, qw(-dump-width 300));
137			}
138			close (LINKS_HELP);
139
140			# Precode whitelist. Should really read this from somewhere...
141
142			$self->{WHITELIST} = { ASCC => "sinica.edu.tw",
143			NCU => "ncu.edu.tw",
144			FNAL => "fnal.gov",
145			CNAF => "webserver.infn.it",
146			BA => "ba.infn.it",
147			IN2P3=> "in2p3.fr",
148			PIC => "pic.es",
149			T2_SP=> "ciemat.es",
150			RAL => "ral.ac.uk",
151			CERN => "cern.ch",
152			FZK => "fzk.de",
153			DESY => "desy.de",
154			NEBR => "unl.edu",
155			WISC => "wisc.edu",
156			UFL => "ufl.edu",
157			PURDUE => "purdue.edu",
158			UCSD => "ucsd.edu",
159			CALT => "caltech.edu"
160			};
161			}
162
163			# Find out how many jobs are pending for each site. This is
164			# insensitive to the job type, and we only check once in the
165			# beginning to avoid favouring one dataset over another --
166			# once we decide to proceed for a site, we submit jobs for
167			# all datasets.
168			sub getSiteStatus
169			{
170			my ($self) = @_;
171			my %result = ();
172			my $taskrepo = $self->{TASKREPO};
173			foreach my $site (<$taskrepo//>)
174			{
175			my ($sitename) = ($site =~ m\|./(.)\|);
176			foreach my $d (<$site//>)
177			{
178			if (! -f "$d/JOB_CREATE_LOG.txt")
179			{
180			$result{$sitename}{C} \|\|= 0;
181			$result{$sitename}{C}++;
182			next;
183			}
184
185			my $f = (<$d/crab_*/share/db/jobs>)[0];
186			next if ! defined $f;
187
188			foreach my $status (split(/\n/, &input($f) \|\| ''))
189			{
190			my @statusarray = split("\;", $status);
191			$result{$sitename}{$statusarray[1]} \|\|= 0;
192			$result{$sitename}{$statusarray[1]}++;
193			}
194			}
195			}
196
197			return %result;
198			}
199
200			sub idle
201			{
202			my ($self, @pending) = @_;
203			eval {
204			# Get status of how busy the sites are. We obtain this only once
205			# in order to not favour datasets "early on" in the list.
206			my %sitestats = $self->getSiteStatus ();
207			if (keys %sitestats)
208			{
209			my @load;
210			foreach my $site (sort keys %sitestats)
211			{
212			push (@load, "$site" . join("", map { " $_=$sitestats{$site}{$_}" }
213			sort keys %{$sitestats{$site}}));
214			}
215			&logmsg ("current load: ", join ("; ", @load));
216			}
217
218			# Invoke links to fetch a formatted web page of published datasets.
219			if ( $self->{MODE} == 2 ) {
220			&logmsg ("DBS/DLS mode");
221			my $cmd = "python2.2 /data/CrabV1/COMP/JOBROBOT/DBSlistDataset.py";
222			my $out = `$cmd`;
223			foreach my $line (split(/\n/, $out))
224			{
225			&timeStart($self->{STARTTIME});
226			# Find out what was published and what we would like to do with it
227			my ($owner, $dataset, $site, $events) = split(/\//, $line);
228			$self->createTask($dataset, $owner, $events, $site, 2, %sitestats);
229			}
230			} else {
231			&logmsg ("RefDB/PubDB mode");
232			my $cmd = "links @{$self->{LINKS_OPTS}} -dump '$self->{URL}'";
233			open (PUBLISHED, "$cmd 2>/dev/null \|") or die "cannot run `$cmd': $!\n";
234			while (<PUBLISHED>)
235			{
236			&timeStart($self->{STARTTIME});
237			chomp; next if ! /_/; s/\\|/ /g; s/^\s+//; s/\s+$//;
238			# Find out what was published and what we would like to do with it
239			my ($dataset, $owner, $events, $site, $proto) = split(/\s+/, $_);
240			$self->createTask($dataset, $owner, $events, $site, 1, %sitestats);
241			}
242			close (PUBLISHED);
243			}
244			};
245			do { chomp ($@); &alert ($@); } if $@;
246
247			$self->nap ($self->{WAITTIME});
248			}
249
250			sub createTask()
251			{
252
253			my ($self, $dataset, $owner, $events, $site, $mode, %sitestats) = @_;
254
255			# Marco
256			next if ($self->{DATASET} && $dataset !~ /$self->{DATASET}/);
257			next if ($self->{OWNER} && $owner !~ /$self->{OWNER}/);
258			# Marco
259			my ($app, $tiers, $rc, $nevents, $output);
260			my $whitelist = $self->{WHITELIST}->{$site} \|\| '.';
261			next if ($self->{IGNORE_REGEXP} && $site =~ /$self->{IGNORE_REGEXP}/);
262			next if ($self->{ACCEPT_REGEXP} && $site !~ /$self->{ACCEPT_REGEXP}/);
263
264			if ($dataset =~ /MBforPU/) {
265			next;
266			} elsif ($owner =~ /Hit/) {
267			$rc = "orcarc.read.simhits";
268			$app = "ExSimHitStatistics";
269			$tiers = "Hit";
270			$output = "simhits.aida";
271			} elsif ($owner =~ /DST/) {
272			next;
273			if (rand(1) > 1.75) {
274			$rc = "orcarc.root.dst";
275			$app = "ExRootAnalysisDST";
276			$output = "test.root";
277			} else {
278			$rc = "orcarc.read.dst";
279			$app = "ExDSTStatistics";
280			$output = "dststatistics.aida";
281			}
282			$tiers = "DST,Digi,Hit";
283			} else {
284			if (rand(1) > 1.75) {
285			$rc = "orcarc.root.digis";
286			$app = "ExRootAnalysisDigi";
287			$output = "test.root";
288			} else {
289			$rc = "orcarc.read.digis";
290			$app = "ExDigiStatistics";
291			$output = "digistatistics.aida";
292			}
293			$tiers = "Digi,Hit";
294			}
295
296			# Find out what is already pending for this task. First find all
297			# existing tasks in the repository, the latest generation.
298			my $datestamp = strftime ("%y%m%d", gmtime(time()));
299			my $taskdir = "$self->{TASKREPO}/$datestamp/$site/$app";
300			# OLI: shorten path for condor_g (restriction to 256 characters)
301			# my $taskbase = "$datestamp.$site.$app.$dataset.$owner";
302			my $taskbase = "$dataset.$owner";
303			my @existing = sort { $a <=> $b } map { /.\.(\d+)$/ } <$taskdir/$taskbase.>;
304			my $curgen = pop(@existing) \|\| 0;
305			my $nextgen = $curgen + 1;
306
307			# If the site isn't too busy already, ignore.
308			my $pending = ($sitestats{$site}{S} \|\| 0);
309			$pending += ($sitestats{$site}{C} \|\| 0);
310			next if $pending > $self->{MAX_SITE_QUEUE};
311
312			# OK to create the task if enough time has passed from previous
313			# task creation, or there is no previous task.
314			if (! -f "$taskdir/$taskbase.$curgen/crab.cfg"
315			\|\| (((stat("$taskdir/$taskbase.$curgen/crab.cfg"))[9]
316			< time() - $events * $self->{SECS_PER_EVENT})))
317			{
318			my $mydir = $0; $mydir =~ s\|/[^/]+$\|\|;
319			my $drop = sprintf("%s.%03d", $taskbase, $nextgen);
320			my $ret = &runcmd ("$mydir/CrabJobs", "-app", $app,
321			"-jobevents", $self->{NEVENT},
322			"-orcarc", "$mydir/$rc",
323			"-owner", $owner,
324			"-dataset", $dataset,
325			"-tiers", $tiers,
326			"-whitelist", $whitelist,
327			"-name", "$taskdir/$drop",
328			"-output", $output,
329			"-jobtype", $self->{JOBTYPE},
330			"-scheduler", $self->{SCHEDULER},
331			"-mode", $mode);
332			die "$drop: failed to create task: @{[&runerror($ret)]}\n" if $ret;
333
334			&output ("$taskdir/$drop/TASK_INIT.txt",
335			&mytimeofday () . "\n");
336
337			my $dropdir = "$self->{WORKDIR}/$drop";
338			mkdir "$dropdir" \|\| die "$dropdir: cannot create: $!\n";
339			if (&output ("$dropdir/task", "$taskdir/$drop"))
340			{
341			&touch ("$dropdir/done");
342			$self->relayDrop ($drop);
343			&logmsg("stats: $drop @{[&formatElapsedTime($self->{STARTTIME})]} success");
344			}
345			else
346			{
347			&alert ("$drop: failed to create drop");
348			&rmtree ([ "$self->{WORKDIR}/$drop" ]);
349			}
350			}
351			}