Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

# 

# Licensed to the Apache Software Foundation (ASF) under one or more 

# contributor license agreements. See the NOTICE file distributed with 

# this work for additional information regarding copyright ownership. 

# The ASF licenses this file to You under the Apache License, Version 2.0 

# (the "License"); you may not use this file except in compliance with 

# the License. You may obtain a copy of the License at 

# 

# http://www.apache.org/licenses/LICENSE-2.0 

# 

# Unless required by applicable law or agreed to in writing, software 

# distributed under the License is distributed on an "AS IS" BASIS, 

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

# See the License for the specific language governing permissions and 

# limitations under the License. 

# 

 

from pyspark import since 

from pyspark.ml.param import Params 

from pyspark.ml.param.shared import HasCheckpointInterval, HasSeed, HasWeightCol, Param, \ 

TypeConverters, HasMaxIter, HasStepSize, HasValidationIndicatorCol 

from pyspark.ml.wrapper import JavaPredictionModel 

from pyspark.ml.common import inherit_doc 

 

 

@inherit_doc
class _DecisionTreeModel(JavaPredictionModel):
    """
    Base abstraction shared by Decision Tree models.

    Each accessor below forwards to the identically named method of the
    wrapped Java model through ``_call_java``.

    .. versionadded:: 1.5.0
    """

    @property
    @since("1.5.0")
    def numNodes(self):
        """Number of nodes in the decision tree."""
        return self._call_java("numNodes")

    @property
    @since("1.5.0")
    def depth(self):
        """Depth of the decision tree."""
        return self._call_java("depth")

    @property
    @since("2.0.0")
    def toDebugString(self):
        """Full textual description of the model."""
        return self._call_java("toDebugString")

    @since("3.0.0")
    def predictLeaf(self, value):
        """
        Predict the indices of the leaves corresponding to the feature vector.

        ``value`` is passed unchanged to the Java ``predictLeaf`` method.
        """
        return self._call_java("predictLeaf", value)

 

 

class _DecisionTreeParams(HasCheckpointInterval, HasSeed, HasWeightCol):
    """
    Mixin declaring the parameters common to Decision Tree algorithms,
    together with their getters.
    """

    # Output column for the predicted leaf index of each instance.
    leafCol = Param(
        Params._dummy(), "leafCol",
        "Leaf indices column name. Predicted leaf "
        "index of each instance in each tree by preorder.",
        typeConverter=TypeConverters.toString)

    # Upper bound on tree depth.
    maxDepth = Param(
        Params._dummy(), "maxDepth",
        "Maximum depth of the tree. (>= 0) E.g., "
        "depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. "
        "Must be in range [0, 30].",
        typeConverter=TypeConverters.toInt)

    # Bin count used when discretizing continuous features.
    maxBins = Param(
        Params._dummy(), "maxBins",
        "Max number of bins for discretizing continuous "
        "features. Must be >=2 and >= number of categories for any categorical "
        "feature.",
        typeConverter=TypeConverters.toInt)

    # Minimum instance count required in each child of a split.
    minInstancesPerNode = Param(
        Params._dummy(), "minInstancesPerNode",
        "Minimum number of "
        "instances each child must have after split. If a split causes "
        "the left or right child to have fewer than "
        "minInstancesPerNode, the split will be discarded as invalid. "
        "Should be >= 1.",
        typeConverter=TypeConverters.toInt)

    # Minimum weighted fraction required in each child of a split.
    minWeightFractionPerNode = Param(
        Params._dummy(), "minWeightFractionPerNode",
        "Minimum " +
        "fraction of the weighted sample count that each child " +
        "must have after split. If a split causes the fraction " +
        "of the total weight in the left or right child to be " +
        "less than minWeightFractionPerNode, the split will be " +
        "discarded as invalid. Should be in interval [0.0, 0.5).",
        typeConverter=TypeConverters.toFloat)

    # Information-gain threshold for accepting a split.
    minInfoGain = Param(
        Params._dummy(), "minInfoGain",
        "Minimum information gain for a split "
        "to be considered at a tree node.",
        typeConverter=TypeConverters.toFloat)

    # Memory budget for histogram aggregation.
    maxMemoryInMB = Param(
        Params._dummy(), "maxMemoryInMB",
        "Maximum memory in MB allocated to "
        "histogram aggregation. If too small, then 1 node will be split per "
        "iteration, and its aggregates may exceed this size.",
        typeConverter=TypeConverters.toInt)

    # Whether node IDs are cached per instance during training.
    cacheNodeIds = Param(
        Params._dummy(), "cacheNodeIds",
        "If false, the algorithm will pass "
        "trees to executors to match instances with nodes. If true, the "
        "algorithm will cache node IDs for each instance. Caching can speed "
        "up training of deeper trees. Users can set how often should the cache "
        "be checkpointed or disable it by setting checkpointInterval.",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(_DecisionTreeParams, self).__init__()

    def setLeafCol(self, value):
        """
        Sets the value of :py:attr:`leafCol` and returns the result of ``_set``.
        """
        return self._set(leafCol=value)

    def getLeafCol(self):
        """
        Returns the value of leafCol, or its default when unset.
        """
        return self.getOrDefault(self.leafCol)

    def getMaxDepth(self):
        """
        Returns the value of maxDepth, or its default when unset.
        """
        return self.getOrDefault(self.maxDepth)

    def getMaxBins(self):
        """
        Returns the value of maxBins, or its default when unset.
        """
        return self.getOrDefault(self.maxBins)

    def getMinInstancesPerNode(self):
        """
        Returns the value of minInstancesPerNode, or its default when unset.
        """
        return self.getOrDefault(self.minInstancesPerNode)

    def getMinWeightFractionPerNode(self):
        """
        Returns the value of minWeightFractionPerNode, or its default when unset.
        """
        return self.getOrDefault(self.minWeightFractionPerNode)

    def getMinInfoGain(self):
        """
        Returns the value of minInfoGain, or its default when unset.
        """
        return self.getOrDefault(self.minInfoGain)

    def getMaxMemoryInMB(self):
        """
        Returns the value of maxMemoryInMB, or its default when unset.
        """
        return self.getOrDefault(self.maxMemoryInMB)

    def getCacheNodeIds(self):
        """
        Returns the value of cacheNodeIds, or its default when unset.
        """
        return self.getOrDefault(self.cacheNodeIds)

 

 

@inherit_doc
class _TreeEnsembleModel(JavaPredictionModel):
    """
    (private abstraction)

    Model made of an ensemble of trees. Accessors forward to the wrapped
    Java model through ``_call_java``.
    """

    @property
    @since("2.0.0")
    def trees(self):
        """Trees in this ensemble. Warning: These have null parent Estimators."""
        java_trees = list(self._call_java("trees"))
        # Wrap each Java-side tree in the Python decision tree model class.
        return [_DecisionTreeModel(java_tree) for java_tree in java_trees]

    @property
    @since("2.0.0")
    def getNumTrees(self):
        """Number of trees in the ensemble."""
        return self._call_java("getNumTrees")

    @property
    @since("1.5.0")
    def treeWeights(self):
        """Weights for each tree, returned as a list."""
        weights = self._call_java("javaTreeWeights")
        return list(weights)

    @property
    @since("2.0.0")
    def totalNumNodes(self):
        """Total number of nodes, summed over all trees in the ensemble."""
        return self._call_java("totalNumNodes")

    @property
    @since("2.0.0")
    def toDebugString(self):
        """Full textual description of the model."""
        return self._call_java("toDebugString")

    @since("3.0.0")
    def predictLeaf(self, value):
        """
        Predict the indices of the leaves corresponding to the feature vector.

        ``value`` is passed unchanged to the Java ``predictLeaf`` method.
        """
        return self._call_java("predictLeaf", value)

 

 

class _TreeEnsembleParams(_DecisionTreeParams):
    """
    Mixin declaring the parameters shared by Decision Tree-based ensemble
    algorithms, on top of those of :py:class:`_DecisionTreeParams`.
    """

    # Fraction of the training data used to learn each tree.
    subsamplingRate = Param(
        Params._dummy(), "subsamplingRate",
        "Fraction of the training data "
        "used for learning each decision tree, in range (0, 1].",
        typeConverter=TypeConverters.toFloat)

    # Named strategies listed for featureSubsetStrategy.
    supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"]

    # How many features are considered for splits at each tree node.
    featureSubsetStrategy = Param(
        Params._dummy(), "featureSubsetStrategy",
        "The number of features to consider for splits at each tree node. Supported "
        "options: 'auto' (choose automatically for task: If numTrees == 1, set to "
        "'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to "
        "'onethird' for regression), 'all' (use all features), 'onethird' (use "
        "1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use "
        "log2(number of features)), 'n' (when n is in the range (0, 1.0], use "
        "n * number of features. When n is in the range (1, number of features), use"
        " n features). default = 'auto'",
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(_TreeEnsembleParams, self).__init__()

    @since("1.4.0")
    def getSubsamplingRate(self):
        """
        Returns the value of subsamplingRate, or its default when unset.
        """
        return self.getOrDefault(self.subsamplingRate)

    @since("1.4.0")
    def getFeatureSubsetStrategy(self):
        """
        Returns the value of featureSubsetStrategy, or its default when unset.
        """
        return self.getOrDefault(self.featureSubsetStrategy)

 

 

class _RandomForestParams(_TreeEnsembleParams):
    """
    Private class declaring the parameters supported by random forests.
    """

    # Number of trees to train.
    numTrees = Param(
        Params._dummy(), "numTrees",
        "Number of trees to train (>= 1).",
        typeConverter=TypeConverters.toInt)

    # Whether bootstrap samples are used when building trees.
    bootstrap = Param(
        Params._dummy(), "bootstrap",
        "Whether bootstrap samples are used " +
        "when building trees.",
        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(_RandomForestParams, self).__init__()

    @since("1.4.0")
    def getNumTrees(self):
        """
        Returns the value of numTrees, or its default when unset.
        """
        return self.getOrDefault(self.numTrees)

    @since("3.0.0")
    def getBootstrap(self):
        """
        Returns the value of bootstrap, or its default when unset.
        """
        return self.getOrDefault(self.bootstrap)

 

 

class _GBTParams(_TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndicatorCol):
    """
    Private class declaring the parameters supported by GBT.
    """

    # Declared here with a GBT-specific description of the step size.
    stepSize = Param(
        Params._dummy(), "stepSize",
        "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking "
        "the contribution of each estimator.",
        typeConverter=TypeConverters.toFloat)

    # Early-stopping threshold used when fitting with validation.
    validationTol = Param(
        Params._dummy(), "validationTol",
        "Threshold for stopping early when fit with validation is used. "
        "If the error rate on the validation input changes by less than the "
        "validationTol, then learning will stop early (before `maxIter`). "
        "This parameter is ignored when fit without validation is used.",
        typeConverter=TypeConverters.toFloat)

    @since("3.0.0")
    def getValidationTol(self):
        """
        Returns the value of validationTol, or its default when unset.
        """
        return self.getOrDefault(self.validationTol)

 

 

class _HasVarianceImpurity(Params):
    """
    Private class declaring the ``impurity`` parameter with "variance" as
    the only listed option.
    """

    supportedImpurities = ["variance"]

    # The option list in the description is built from supportedImpurities.
    impurity = Param(
        Params._dummy(), "impurity",
        "Criterion used for information gain calculation (case-insensitive). "
        "Supported options: " + ", ".join(supportedImpurities),
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(_HasVarianceImpurity, self).__init__()

    @since("1.4.0")
    def getImpurity(self):
        """
        Returns the value of impurity, or its default when unset.
        """
        return self.getOrDefault(self.impurity)

 

 

class _TreeClassifierParams(Params):
    """
    Private class declaring the ``impurity`` parameter with "entropy" and
    "gini" as the listed options.

    .. versionadded:: 1.4.0
    """

    supportedImpurities = ["entropy", "gini"]

    # The option list in the description is built from supportedImpurities.
    impurity = Param(
        Params._dummy(), "impurity",
        "Criterion used for information gain calculation (case-insensitive). "
        "Supported options: " + ", ".join(supportedImpurities),
        typeConverter=TypeConverters.toString)

    def __init__(self):
        super(_TreeClassifierParams, self).__init__()

    @since("1.6.0")
    def getImpurity(self):
        """
        Returns the value of impurity, or its default when unset.
        """
        return self.getOrDefault(self.impurity)

 

 

class _TreeRegressorParams(_HasVarianceImpurity):
    """
    Private class to track supported impurity measures.

    Adds nothing of its own; everything is inherited from
    :py:class:`_HasVarianceImpurity`.
    """