#!/usr/bin/env python
# @(#)root/tmva $Id: TMVAnalysis.py,v 1.8 2007/06/20 09:41:24 brun Exp $
# ------------------------------------------------------------------------------ #
# Project      : TMVA - a Root-integrated toolkit for multivariate data analysis #
# Package      : TMVA                                                            #
# Python script: TMVAnalysis.py                                                  #
#                                                                                #
# This python script provides examples for the training and testing of all the  #
# TMVA classifiers through PyROOT. Note that the use of PyROOT requires that     #
# you have a Python version > 2.2 installed on your computer.                    #
#                                                                                #
# The input data is a toy-MC sample consisting of four Gaussian-distributed     #
# and linearly correlated input variables.                                       #
#                                                                                #
# The methods to be used can be switched on and off via the command line, for    #
# example:                                                                       #
#                                                                                #
#    python TMVAnalysis.py --methods Fisher,Likelihood                           #
#                                                                                #
# The output file "TMVA.root" can be analysed with the use of dedicated          #
# macros (simply say: root -l <../macros/macro.C>), which can be conveniently    #
# invoked through a GUI that will appear at the end of the run of this macro.    #
#                                                                                #
# For help, type "python TMVAnalysis.py --help"                                  #
# ------------------------------------------------------------------------------ #

# --------------------------------------------
# standard python import
import sys    # exit
import time   # time accounting
import getopt # command line parser

# --------------------------------------------

# default settings for command line arguments
DEFAULT_OUTFNAME = "TMVA.root"
DEFAULT_INFNAME  = "../examples/data/toy_sigbkg.root"
DEFAULT_TREESIG  = "TreeS"
DEFAULT_TREEBKG  = "TreeB"
# comma-separated list, matching the format expected by the --methods option
DEFAULT_METHODS  = "CutsGA,Likelihood,LikelihoodPCA,PDERS,KNN,HMatrix,Fisher,FDA,MLP,SVM_Gauss,BDT,RuleFitTMVA"

# print help
def usage():
    print " "
    print "Usage: python %s [options]" % sys.argv[0]
    print "  -m | --methods    : gives methods to be run (default: all methods)"
    print "  -i | --inputfile  : name of input ROOT file (default: '%s')" % DEFAULT_INFNAME
    print "  -o | --outputfile : name of output ROOT file containing results (default: '%s')" % DEFAULT_OUTFNAME
    print "  -t | --inputtrees : input ROOT Trees for signal and background (default: '%s %s')" \
          % (DEFAULT_TREESIG, DEFAULT_TREEBKG)
    print "  -v | --verbose"
    print "  -? | --usage      : print this help message"
    print "  -h | --help       : print this help message"
    print " "

# main routine
def main():

    try:
        # retrieve command line options
        shortopts  = "m:i:t:o:vh?"
        longopts   = ["methods=", "inputfile=", "inputtrees=", "outputfile=", "verbose", "help", "usage"]
        opts, args = getopt.getopt( sys.argv[1:], shortopts, longopts )

    except getopt.GetoptError:
        # print help information and exit:
        print "ERROR: unknown options in argument %s" % sys.argv[1:]
        usage()
        sys.exit(1)

    infname     = DEFAULT_INFNAME
    treeNameSig = DEFAULT_TREESIG
    treeNameBkg = DEFAULT_TREEBKG
    outfname    = DEFAULT_OUTFNAME
    methods     = DEFAULT_METHODS
    verbose     = False
    for o, a in opts:
        if o in ("-?", "-h", "--help", "--usage"):
            usage()
            sys.exit(0)
        elif o in ("-m", "--methods"):
            methods = a
        elif o in ("-i", "--inputfile"):
            infname = a
        elif o in ("-o", "--outputfile"):
            outfname = a
        elif o in ("-t", "--inputtrees"):
            # expect two space-separated tree names (signal and background)
            trees = a.strip().rsplit( ' ' )
            trees.sort()
            trees.reverse()
            if len(trees) - trees.count('') != 2:
                print "ERROR: need to give two trees (one for signal and one for background)"
                print trees
                sys.exit(1)
            treeNameSig = trees[0]
            treeNameBkg = trees[1]
        elif o in ("-v", "--verbose"):
            verbose = True

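    # For example (using only the defaults documented above), the call
    #    python TMVAnalysis.py -i ../examples/data/toy_sigbkg.root -t "TreeS TreeB" -m Fisher,BDT
    # reads the signal tree "TreeS" and the background tree "TreeB" from that
    # file and books only the Fisher and BDT classifiers below.
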
    # print methods
    mlist = methods.split(',')
    print "=== TMVAnalysis: use methods..."
    for m in mlist:
        if m != '':
            print "=== <%s>" % m

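    # Note: only method names that appear in mlist are booked further below;
    # names without a matching BookMethod block are silently ignored.
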
    # import ROOT classes
    from ROOT import gSystem, gROOT, gApplication, TFile, TTree, TCut

    # the logon macro is not automatically loaded through PyROOT (it loads the
    # TMVA library); also load the GUI macro
    gROOT.Macro    ( '../macros/TMVAlogon.C' )
    gROOT.LoadMacro( '../macros/TMVAGui.C' )

    # import TMVA classes from ROOT
    from ROOT import TMVA

    # output file
    outputFile = TFile( outfname, 'RECREATE' )

    # create an instance of the factory
    factory = TMVA.Factory( "TMVAnalysis", outputFile, "Color" )

    # set verbosity
    factory.SetVerbose( verbose )

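    # The factory writes its training, testing and evaluation results into
    # outputFile ("TMVA.root" by default), which the GUI macros invoked at the
    # end of this script read back.
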
    # read input data
    if not gSystem.AccessPathName( infname ):
        input = TFile( infname )
    else:
        print "ERROR: could not access data file %s\n" % infname
        sys.exit(1)

    signal     = input.Get( treeNameSig )
    background = input.Get( treeNameBkg )

    # global event weights (see below for setting event-wise weights)
    signalWeight     = 1.0
    backgroundWeight = 1.0

    if not factory.SetInputTrees( signal, background, signalWeight, backgroundWeight ):
        print "ERROR: could not set input trees\n"
        sys.exit(1)

    # Define the input variables that shall be used for the classifier training.
    # Note that you may also use variable expressions, such as "3*var1/var2*abs(var3)"
    # [all types of expressions that can also be parsed by TTree::Draw( "expression" )].
    factory.AddVariable("var1+var2", 'F')
    factory.AddVariable("var1-var2", 'F')
    factory.AddVariable("var3",      'F')
    factory.AddVariable("var4",      'F')

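    # For illustration only: a derived expression like the one quoted above could
    # be added as a further input variable in exactly the same way (assuming the
    # leaves var1..var3 exist in the input trees):
    #    factory.AddVariable("3*var1/var2*abs(var3)", 'F')
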
    # This would set individual event weights (the variables defined in the
    # expression need to exist in the original TTree):
    #    factory.SetWeightExpression("weight1*weight2")
    #
    # Apply additional cuts on the signal and background samples.
    # Assumptions on the size of the training and testing samples:
    #    a) an equal number of signal and background events is used for training
    #    b) any numbers of signal and background events can be used for testing
    #    c) an explicit syntax can violate a)
    # See the Factory class documentation for more details.
    # Example cut: mycut = TCut( "abs(var1)<0.5 && abs(var2-0.5)<1" )
    mycut = TCut( "" )

    # Here the relevant variables are copied over into new, slim trees that are
    # used for TMVA training and testing.
    # "SplitMode=Random" means that the input events are randomly shuffled before
    # splitting them into training and test samples.
    factory.PrepareTrainingAndTestTree( mycut, "NSigTrain=3000:NBkgTrain=3000:SplitMode=Random:NormMode=NumEvents:!V" )

    # An alternative call that uses different numbers of signal and background training/test events is:
    #    factory.PrepareTrainingAndTestTree( mycut, "NSigTrain=3000:NBkgTrain=3000:NSigTest=3000:NBkgTest=3000:SplitMode=Random:!V" )

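    # The option string is a ':'-separated list: NSigTrain/NBkgTrain (and, in the
    # alternative call above, NSigTest/NBkgTest) give the numbers of signal and
    # background events used for training (and testing), and SplitMode=Random is
    # explained above; see the Factory documentation for the remaining options.
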
    # Cut optimisation
    if "Cuts" in mlist:
        factory.BookMethod( TMVA.Types.kCuts, "Cuts",
                            "!H:!V:FitMethod=MC:EffSel:SampleSize=200000:VarProp=FSmart" )

    # Cut optimisation using decorrelated input variables
    if "CutsD" in mlist:
        factory.BookMethod( TMVA.Types.kCuts, "CutsD",
                            "!H:!V:FitMethod=MC:EffSel:SampleSize=200000:VarProp=FSmart:VarTransform=Decorrelate" )

    # Cut optimisation with a Genetic Algorithm
    if "CutsGA" in mlist:
        factory.BookMethod( TMVA.Types.kCuts, "CutsGA",
                            "!H:!V:FitMethod=GA:EffSel:Steps=30:Cycles=3:PopSize=100:SC_steps=10:SC_rate=5:SC_factor=0.95:VarProp=FSmart" )

    # Likelihood
    if "Likelihood" in mlist:
        factory.BookMethod( TMVA.Types.kLikelihood, "Likelihood",
                            "!H:!V:!TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=100:NSmoothBkg[0]=10:NSmoothBkg[1]=100:NSmooth=10:NAvEvtPerBin=50" )

    # test the decorrelated likelihood
    if "LikelihoodD" in mlist:
        factory.BookMethod( TMVA.Types.kLikelihood, "LikelihoodD",
                            "!H:!V:!TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=100:NSmoothBkg[0]=10:NSmooth=5:NAvEvtPerBin=50:VarTransform=Decorrelate" )

    # likelihood with principal component analysis of the input variables
    if "LikelihoodPCA" in mlist:
        factory.BookMethod( TMVA.Types.kLikelihood, "LikelihoodPCA",
                            "!H:!V:!TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=100:NSmoothBkg[0]=10:NSmooth=5:NAvEvtPerBin=50:VarTransform=PCA" )

    # likelihood method with unbinned kernel estimator
    if "LikelihoodKDE" in mlist:
        factory.BookMethod( TMVA.Types.kLikelihood, "LikelihoodKDE",
                            "!H:!V:!TransformOutput:PDFInterpol=KDE:KDEtype=Gauss:KDEiter=Nonadaptive:KDEborder=None:NAvEvtPerBin=50" )

    # PDE-RS method
    if "PDERS" in mlist:
        factory.BookMethod( TMVA.Types.kPDERS, "PDERS",
                            "!H:!V:VolumeRangeMode=Adaptive:KernelEstimator=Gauss:GaussSigma=0.3:NEventsMin=400:NEventsMax=600:InitialScale=0.99" )

    # PDE-RS with decorrelated input variables
    if "PDERSD" in mlist:
        factory.BookMethod( TMVA.Types.kPDERS, "PDERSD",
                            "!H:!V:VolumeRangeMode=Adaptive:KernelEstimator=Gauss:GaussSigma=0.3:NEventsMin=400:NEventsMax=600:InitialScale=0.99:VarTransform=Decorrelate" )

    # PDE-RS with principal component analysis of the input variables
    if "PDERSPCA" in mlist:
        factory.BookMethod( TMVA.Types.kPDERS, "PDERSPCA",
                            "!H:!V:VolumeRangeMode=Adaptive:KernelEstimator=Gauss:GaussSigma=0.3:NEventsMin=400:NEventsMax=600:InitialScale=0.99:VarTransform=PCA" )

    # HMatrix (chi-squared) method
    if "HMatrix" in mlist:
        factory.BookMethod( TMVA.Types.kHMatrix, "HMatrix", "!H:!V" )

    # Fisher discriminant - also creates a PDF for the MVA output (shown here as
    # an example; this can be used for any other classifier)
    if "Fisher" in mlist:
        factory.BookMethod( TMVA.Types.kFisher, "Fisher",
                            "H:!V:!Normalise:CreateMVAPdfs:Fisher:NbinsMVAPdf=50:NsmoothMVAPdf=1" )

    # Function discriminant analysis
    if "FDA" in mlist:
        factory.BookMethod( TMVA.Types.kFDA, "FDA_MT",
                            "H:!V:Formula=(0)+(1)*x0+(2)*x1+(3)*x2+(4)*x3:ParRanges=(-1,1);(-10,10);(-10,10);(-10,10);(-10,10):FitMethod=MINUIT:ErrorLevel=1:PrintLevel=-1:FitStrategy=2:UseImprove:UseMinos:SetBatch" )

    # the new TMVA ANN: MLP (recommended ANN)
    if "MLP" in mlist:
        factory.BookMethod( TMVA.Types.kMLP, "MLP", "Normalise:H:!V:NCycles=200:HiddenLayers=N+1,N:TestRate=5" )

    # CFMlpANN (Clermont-Ferrand neural network)
    if "CFMlpANN" in mlist:
        factory.BookMethod( TMVA.Types.kCFMlpANN, "CFMlpANN", "!H:!V:NCycles=500:HiddenLayers=N+1,N" )

    # TMlpANN (ROOT's TMultiLayerPerceptron)
    if "TMlpANN" in mlist:
        factory.BookMethod( TMVA.Types.kTMlpANN, "TMlpANN", "!H:!V:NCycles=200:HiddenLayers=N+1,N" )

    # Support Vector Machine with varying kernel functions
    if "SVM_Gauss" in mlist:
        factory.BookMethod( TMVA.Types.kSVM, "SVM_Gauss", "Sigma=2:C=1:Tol=0.001:Kernel=Gauss" )

    if "SVM_Poly" in mlist:
        factory.BookMethod( TMVA.Types.kSVM, "SVM_Poly", "Order=4:Theta=1:C=0.1:Tol=0.001:Kernel=Polynomial" )

    if "SVM_Lin" in mlist:
        factory.BookMethod( TMVA.Types.kSVM, "SVM_Lin", "!H:!V:Kernel=Linear:C=1:Tol=0.001" )

    # Boosted Decision Trees
    if "BDT" in mlist:
        factory.BookMethod( TMVA.Types.kBDT, "BDT",
                            "!V:NTrees=400:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=20:PruneMethod=CostComplexity:PruneStrength=4.5" )

    # Decorrelated Boosted Decision Trees
    if "BDTD" in mlist:
        factory.BookMethod( TMVA.Types.kBDT, "BDTD",
                            "!H:!V:NTrees=400:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=20:PruneMethod=CostComplexity:PruneStrength=4.5:VarTransform=Decorrelate" )

    # Friedman's RuleFit method
    if "RuleFitTMVA" in mlist:
        factory.BookMethod( TMVA.Types.kRuleFit, "RuleFitTMVA",
                            "H:!V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02" )

    if "RuleFitJF" in mlist:
        factory.BookMethod( TMVA.Types.kRuleFit, "RuleFitJF",
                            "!V:RuleFitModule=RFFriedman:Model=ModRuleLinear:GDStep=0.01:GDNSteps=10000:GDErrScale=1.1:RFNendnodes=4" )

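    # Each BookMethod call above registers one classifier under the given name,
    # configured by a ':'-separated option string; the strings shown are the
    # example settings of this script and can be adjusted as needed.
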
    # ---- Now you can tell the factory to train, test, and evaluate the MVAs.

    # Train MVAs
    factory.TrainAllMethods()

    # Test MVAs
    factory.TestAllMethods()

    # Evaluate MVAs
    factory.EvaluateAllMethods()

    # Save the output.
    outputFile.Close()

    # clean up
    factory.IsA().Destructor( factory )

    print "=== wrote root file %s\n" % outfname
    print "=== TMVAnalysis is done!\n"

    # open the GUI for the result macros
    gROOT.ProcessLine( 'TMVAGui("%s")' % outfname )

    # keep the ROOT thread running
    gApplication.Run()

# ----------------------------------------------------------

if __name__ == "__main__":
    main()