Coverage for pyspark/ml/classification.py: 87%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

# Licensed to the Apache Software Foundation (ASF) under one or more

# contributor license agreements. See the NOTICE file distributed with

# this work for additional information regarding copyright ownership.

# The ASF licenses this file to You under the Apache License, Version 2.0

# (the "License"); you may not use this file except in compliance with

# the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

import os

import operator

import sys

import uuid

import warnings

from abc import ABCMeta, abstractmethod, abstractproperty

from multiprocessing.pool import ThreadPool

from pyspark import keyword_only, since, SparkContext, inheritable_thread_target

from pyspark.ml import Estimator, Predictor, PredictionModel, Model

from pyspark.ml.param.shared import HasRawPredictionCol, HasProbabilityCol, HasThresholds, \

HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, \

HasAggregationDepth, HasThreshold, HasBlockSize, HasMaxBlockSizeInMB, Param, Params, \

TypeConverters, HasElasticNetParam, HasSeed, HasStepSize, HasSolver, HasParallelism

from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \

_TreeEnsembleModel, _RandomForestParams, _GBTParams, \

_HasVarianceImpurity, _TreeClassifierParams

from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel

from pyspark.ml.base import _PredictorParams

from pyspark.ml.util import DefaultParamsReader, DefaultParamsWriter, \

JavaMLReadable, JavaMLReader, JavaMLWritable, JavaMLWriter, \

MLReader, MLReadable, MLWriter, MLWritable, HasTrainingSummary

from pyspark.ml.wrapper import JavaParams, \

JavaPredictor, JavaPredictionModel, JavaWrapper

from pyspark.ml.common import inherit_doc

from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.sql import DataFrame

from pyspark.sql.functions import udf, when

from pyspark.sql.types import ArrayType, DoubleType

from pyspark.storagelevel import StorageLevel

# Public names exported by this module, grouped by algorithm family.
__all__ = [
    # Linear support vector machine
    'LinearSVC',
    'LinearSVCModel',
    'LinearSVCSummary',
    'LinearSVCTrainingSummary',
    # Logistic regression
    'LogisticRegression',
    'LogisticRegressionModel',
    'LogisticRegressionSummary',
    'LogisticRegressionTrainingSummary',
    'BinaryLogisticRegressionSummary',
    'BinaryLogisticRegressionTrainingSummary',
    # Tree-based classifiers
    'DecisionTreeClassifier',
    'DecisionTreeClassificationModel',
    'GBTClassifier',
    'GBTClassificationModel',
    'RandomForestClassifier',
    'RandomForestClassificationModel',
    'RandomForestClassificationSummary',
    'RandomForestClassificationTrainingSummary',
    'BinaryRandomForestClassificationSummary',
    'BinaryRandomForestClassificationTrainingSummary',
    # Naive Bayes
    'NaiveBayes',
    'NaiveBayesModel',
    # Multilayer perceptron
    'MultilayerPerceptronClassifier',
    'MultilayerPerceptronClassificationModel',
    'MultilayerPerceptronClassificationSummary',
    'MultilayerPerceptronClassificationTrainingSummary',
    # One-vs-rest reduction
    'OneVsRest',
    'OneVsRestModel',
    # Factorization machines
    'FMClassifier',
    'FMClassificationModel',
    'FMClassificationSummary',
    'FMClassificationTrainingSummary',
]

class _ClassifierParams(HasRawPredictionCol, _PredictorParams):
    """
    Params shared by classifiers and the models they produce.

    .. versionadded:: 3.0.0
    """

@inherit_doc
class Classifier(Predictor, _ClassifierParams, metaclass=ABCMeta):
    """
    Abstract estimator for classification tasks.

    Class labels are indexed {0, 1, ..., numClasses - 1}.
    """

    @since("3.0.0")
    def setRawPredictionCol(self, value):
        """
        Assigns `value` to the :py:attr:`rawPredictionCol` param.
        """
        return self._set(rawPredictionCol=value)

@inherit_doc
class ClassificationModel(PredictionModel, _ClassifierParams, metaclass=ABCMeta):
    """
    Model produced by a ``Classifier``.

    Classes are indexed {0, 1, ..., numClasses - 1}. Concrete subclasses must
    provide :py:attr:`numClasses` and :py:meth:`predictRaw`.
    """

    @since("3.0.0")
    def setRawPredictionCol(self, value):
        """
        Sets the value of :py:attr:`rawPredictionCol`.
        """
        return self._set(rawPredictionCol=value)

    # ``abc.abstractproperty`` is deprecated; ``property`` stacked on top of
    # ``abstractmethod`` is the supported way to declare an abstract property.
    @property
    @abstractmethod
    @since("2.1.0")
    def numClasses(self):
        """
        Number of classes (values which the label can take).
        """
        raise NotImplementedError()

    @abstractmethod
    @since("3.0.0")
    def predictRaw(self, value):
        """
        Raw prediction for each possible label.
        """
        raise NotImplementedError()

class _ProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, _ClassifierParams):
    """
    Params shared by :py:class:`ProbabilisticClassifier` and
    :py:class:`ProbabilisticClassificationModel`.

    .. versionadded:: 3.0.0
    """

@inherit_doc
class ProbabilisticClassifier(Classifier, _ProbabilisticClassifierParams, metaclass=ABCMeta):
    """
    Abstract classifier that also exposes probability-related params.
    """

    @since("3.0.0")
    def setProbabilityCol(self, value):
        """
        Assigns `value` to the :py:attr:`probabilityCol` param.
        """
        return self._set(probabilityCol=value)

    @since("3.0.0")
    def setThresholds(self, value):
        """
        Assigns `value` to the :py:attr:`thresholds` param.
        """
        return self._set(thresholds=value)

@inherit_doc
class ProbabilisticClassificationModel(
        ClassificationModel, _ProbabilisticClassifierParams, metaclass=ABCMeta):
    """
    Abstract model produced by a ``ProbabilisticClassifier``.
    """

    @since("3.0.0")
    def setProbabilityCol(self, value):
        """
        Assigns `value` to the :py:attr:`probabilityCol` param.
        """
        return self._set(probabilityCol=value)

    @since("3.0.0")
    def setThresholds(self, value):
        """
        Assigns `value` to the :py:attr:`thresholds` param.
        """
        return self._set(thresholds=value)

    @abstractmethod
    @since("3.0.0")
    def predictProbability(self, value):
        """
        Predict the probability of each class given the features.

        Abstract: concrete models must override this.
        """
        raise NotImplementedError()

@inherit_doc
class _JavaClassifier(Classifier, JavaPredictor, metaclass=ABCMeta):
    """
    Java-backed classifier for classification tasks.

    Class labels are indexed {0, 1, ..., numClasses - 1}.
    """

    @since("3.0.0")
    def setRawPredictionCol(self, value):
        """
        Assigns `value` to the :py:attr:`rawPredictionCol` param.
        """
        return self._set(rawPredictionCol=value)

@inherit_doc
class _JavaClassificationModel(ClassificationModel, JavaPredictionModel):
    """
    Java-backed model produced by a ``Classifier``.

    Class labels are indexed {0, 1, ..., numClasses - 1}.
    To be mixed in with :class:`pyspark.ml.JavaModel`
    """

    @property
    @since("2.1.0")
    def numClasses(self):
        """
        Number of classes (values which the label can take), obtained from
        the wrapped Java object.
        """
        return self._call_java("numClasses")

    @since("3.0.0")
    def predictRaw(self, value):
        """
        Raw prediction for each possible label, computed by forwarding
        `value` to the wrapped Java object.
        """
        return self._call_java("predictRaw", value)

@inherit_doc
class _JavaProbabilisticClassifier(ProbabilisticClassifier, _JavaClassifier, metaclass=ABCMeta):
    """
    Java-backed probabilistic classifier for classification tasks.
    """

@inherit_doc
class _JavaProbabilisticClassificationModel(
        ProbabilisticClassificationModel, _JavaClassificationModel):
    """
    Java-backed model produced by a ``ProbabilisticClassifier``.
    """

    @since("3.0.0")
    def predictProbability(self, value):
        """
        Predict the probability of each class given the features, by
        forwarding `value` to the wrapped Java object.
        """
        return self._call_java("predictProbability", value)

@inherit_doc
class _ClassificationSummary(JavaWrapper):
    """
    Multiclass classification results for a given model.

    Every accessor forwards to the same-named member of the wrapped Java
    summary object.

    .. versionadded:: 3.1.0
    """

    @property
    @since("3.1.0")
    def predictions(self):
        """
        DataFrame produced by the model's `transform` method.
        """
        return self._call_java("predictions")

    @property
    @since("3.1.0")
    def predictionCol(self):
        """
        Name of the field in "predictions" holding the prediction of each class.
        """
        return self._call_java("predictionCol")

    @property
    @since("3.1.0")
    def labelCol(self):
        """
        Name of the field in "predictions" holding the true label of each
        instance.
        """
        return self._call_java("labelCol")

    @property
    @since("3.1.0")
    def weightCol(self):
        """
        Name of the field in "predictions" holding the weight of each instance.
        """
        return self._call_java("weightCol")

    @property
    def labels(self):
        """
        Sequence of labels in ascending order. Metrics reported as arrays over
        labels (e.g., truePositiveRateByLabel) follow this same order.

        .. versionadded:: 3.1.0

        Notes
        -----
        In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the
        training set is missing a label, then all of the arrays over labels
        (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the
        expected numClasses.
        """
        return self._call_java("labels")

    @property
    @since("3.1.0")
    def truePositiveRateByLabel(self):
        """
        True positive rate for each label (category).
        """
        return self._call_java("truePositiveRateByLabel")

    @property
    @since("3.1.0")
    def falsePositiveRateByLabel(self):
        """
        False positive rate for each label (category).
        """
        return self._call_java("falsePositiveRateByLabel")

    @property
    @since("3.1.0")
    def precisionByLabel(self):
        """
        Precision for each label (category).
        """
        return self._call_java("precisionByLabel")

    @property
    @since("3.1.0")
    def recallByLabel(self):
        """
        Recall for each label (category).
        """
        return self._call_java("recallByLabel")

    @since("3.1.0")
    def fMeasureByLabel(self, beta=1.0):
        """
        F-measure for each label (category), computed with the given `beta`.
        """
        return self._call_java("fMeasureByLabel", beta)

    @property
    @since("3.1.0")
    def accuracy(self):
        """
        Accuracy
        (the total number of correctly classified instances
        out of the total number of instances).
        """
        return self._call_java("accuracy")

    @property
    @since("3.1.0")
    def weightedTruePositiveRate(self):
        """
        Weighted true positive rate.
        (equals to precision, recall and f-measure)
        """
        return self._call_java("weightedTruePositiveRate")

    @property
    @since("3.1.0")
    def weightedFalsePositiveRate(self):
        """
        Weighted false positive rate.
        """
        return self._call_java("weightedFalsePositiveRate")

    @property
    @since("3.1.0")
    def weightedRecall(self):
        """
        Weighted averaged recall.
        (equals to precision, recall and f-measure)
        """
        return self._call_java("weightedRecall")

    @property
    @since("3.1.0")
    def weightedPrecision(self):
        """
        Weighted averaged precision.
        """
        return self._call_java("weightedPrecision")

    @since("3.1.0")
    def weightedFMeasure(self, beta=1.0):
        """
        Weighted averaged f-measure, computed with the given `beta`.
        """
        return self._call_java("weightedFMeasure", beta)

@inherit_doc
class _TrainingSummary(JavaWrapper):
    """
    Training results exposed by the wrapped Java summary object.

    .. versionadded:: 3.1.0
    """

    @property
    @since("3.1.0")
    def objectiveHistory(self):
        """
        Objective function (scaled loss + regularization) at each
        iteration. It holds one more element than the number of
        iterations: the initial state.
        """
        return self._call_java("objectiveHistory")

    @property
    @since("3.1.0")
    def totalIterations(self):
        """
        Number of training iterations run before termination.
        """
        return self._call_java("totalIterations")

@inherit_doc
class _BinaryClassificationSummary(_ClassificationSummary):
    """
    Binary classification results for a given model.

    Every accessor forwards to the same-named member of the wrapped Java
    summary object.

    .. versionadded:: 3.1.0
    """

    @property
    @since("3.1.0")
    def scoreCol(self):
        """
        Name of the field in "predictions" holding the probability or raw
        prediction of each class as a vector.
        """
        return self._call_java("scoreCol")

    @property
    def roc(self):
        """
        Receiver operating characteristic (ROC) curve, as a Dataframe
        with two fields (FPR, TPR), with (0.0, 0.0) prepended and
        (1.0, 1.0) appended to it.

        .. versionadded:: 3.1.0

        Notes
        -----
        `Wikipedia reference <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
        """
        return self._call_java("roc")

    @property
    @since("3.1.0")
    def areaUnderROC(self):
        """
        Area under the receiver operating characteristic (ROC) curve.
        """
        return self._call_java("areaUnderROC")

    @property
    @since("3.1.0")
    def pr(self):
        """
        Precision-recall curve, as a Dataframe containing two fields
        recall, precision with (0.0, 1.0) prepended to it.
        """
        return self._call_java("pr")

    @property
    @since("3.1.0")
    def fMeasureByThreshold(self):
        """
        Dataframe with two fields (threshold, F-Measure) describing the
        curve for beta = 1.0.
        """
        return self._call_java("fMeasureByThreshold")

    @property
    @since("3.1.0")
    def precisionByThreshold(self):
        """
        Dataframe with two fields (threshold, precision) describing the curve.
        Every possible probability obtained in transforming the dataset
        is used as a threshold when calculating the precision.
        """
        return self._call_java("precisionByThreshold")

    @property
    @since("3.1.0")
    def recallByThreshold(self):
        """
        Dataframe with two fields (threshold, recall) describing the curve.
        Every possible probability obtained in transforming the dataset
        is used as a threshold when calculating the recall.
        """
        return self._call_java("recallByThreshold")

class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitIntercept, HasTol,
                       HasStandardization, HasWeightCol, HasAggregationDepth, HasThreshold,
                       HasMaxBlockSizeInMB):
    """
    Params shared by :py:class:`LinearSVC` and :py:class:`LinearSVCModel`.

    .. versionadded:: 3.0.0
    """

    # Redeclared here with an SVC-specific description.
    threshold = Param(
        Params._dummy(),
        "threshold",
        "The threshold in binary classification applied to the linear model prediction."
        " This threshold can be any real number, where Inf will make all predictions 0.0"
        " and -Inf will make all predictions 1.0.",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self, *args):
        super().__init__(*args)
        # Default values registered for every instance of the params mixin.
        defaults = dict(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True,
                        standardization=True, threshold=0.0, aggregationDepth=2,
                        maxBlockSizeInMB=0.0)
        self._setDefault(**defaults)

@inherit_doc
class LinearSVC(_JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadable):
    """
    This binary classifier optimizes the Hinge Loss using the OWLQN optimizer.
    Only supports L2 regularization currently.

    .. versionadded:: 2.2.0

    Notes
    -----
    `Linear SVM Classifier <https://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM>`_

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> from pyspark.ml.linalg import Vectors
    >>> df = sc.parallelize([
    ...     Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    ...     Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()
    >>> svm = LinearSVC()
    >>> svm.getMaxIter()
    100
    >>> svm.setMaxIter(5)
    LinearSVC...
    >>> svm.getMaxIter()
    5
    >>> svm.getRegParam()
    0.0
    >>> svm.setRegParam(0.01)
    LinearSVC...
    >>> svm.getRegParam()
    0.01
    >>> model = svm.fit(df)
    >>> model.setPredictionCol("newPrediction")
    LinearSVCModel...
    >>> model.getPredictionCol()
    'newPrediction'
    >>> model.setThreshold(0.5)
    LinearSVCModel...
    >>> model.getThreshold()
    0.5
    >>> model.getMaxBlockSizeInMB()
    0.0
    >>> model.coefficients
    DenseVector([0.0, -1.0319, -0.5159])
    >>> model.intercept
    2.579645978780695
    >>> model.numClasses
    2
    >>> model.numFeatures
    3
    >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, -1.0, -1.0))]).toDF()
    >>> model.predict(test0.head().features)
    1.0
    >>> model.predictRaw(test0.head().features)
    DenseVector([-4.1274, 4.1274])
    >>> result = model.transform(test0).head()
    >>> result.newPrediction
    1.0
    >>> result.rawPrediction
    DenseVector([-4.1274, 4.1274])
    >>> svm_path = temp_path + "/svm"
    >>> svm.save(svm_path)
    >>> svm2 = LinearSVC.load(svm_path)
    >>> svm2.getMaxIter()
    5
    >>> model_path = temp_path + "/svm_model"
    >>> model.save(model_path)
    >>> model2 = LinearSVCModel.load(model_path)
    >>> model.coefficients[0] == model2.coefficients[0]
    True
    >>> model.intercept == model2.intercept
    True
    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
    True
    """

    @keyword_only
    def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction",
                 fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
                 aggregationDepth=2, maxBlockSizeInMB=0.0):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \
                 fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
                 aggregationDepth=2, maxBlockSizeInMB=0.0):
        """
        super().__init__()
        # The Java peer is created with this instance's uid, then configured
        # from the keyword arguments captured by @keyword_only.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.LinearSVC", self.uid)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    @since("2.2.0")
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction",
                  fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
                  aggregationDepth=2, maxBlockSizeInMB=0.0):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \
                  fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
                  aggregationDepth=2, maxBlockSizeInMB=0.0):
        Sets params for Linear SVM Classifier.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _create_model(self, java_model):
        """Wrap the fitted Java model in a :py:class:`LinearSVCModel`."""
        return LinearSVCModel(java_model)

    @since("2.2.0")
    def setMaxIter(self, value):
        """
        Sets the value of :py:attr:`maxIter`.
        """
        return self._set(maxIter=value)

    @since("2.2.0")
    def setRegParam(self, value):
        """
        Sets the value of :py:attr:`regParam`.
        """
        return self._set(regParam=value)

    @since("2.2.0")
    def setTol(self, value):
        """
        Sets the value of :py:attr:`tol`.
        """
        return self._set(tol=value)

    @since("2.2.0")
    def setFitIntercept(self, value):
        """
        Sets the value of :py:attr:`fitIntercept`.
        """
        return self._set(fitIntercept=value)

    @since("2.2.0")
    def setStandardization(self, value):
        """
        Sets the value of :py:attr:`standardization`.
        """
        return self._set(standardization=value)

    @since("2.2.0")
    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        """
        return self._set(threshold=value)

    @since("2.2.0")
    def setWeightCol(self, value):
        """
        Sets the value of :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)

    @since("2.2.0")
    def setAggregationDepth(self, value):
        """
        Sets the value of :py:attr:`aggregationDepth`.
        """
        return self._set(aggregationDepth=value)

    @since("3.1.0")
    def setMaxBlockSizeInMB(self, value):
        """
        Sets the value of :py:attr:`maxBlockSizeInMB`.
        """
        return self._set(maxBlockSizeInMB=value)

class LinearSVCModel(_JavaClassificationModel, _LinearSVCParams, JavaMLWritable, JavaMLReadable,
                     HasTrainingSummary):
    """
    Model fitted by LinearSVC.

    .. versionadded:: 2.2.0
    """

    @since("3.0.0")
    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        """
        return self._set(threshold=value)

    @property
    @since("2.2.0")
    def coefficients(self):
        """
        Model coefficients of Linear SVM Classifier.
        """
        return self._call_java("coefficients")

    @property
    @since("2.2.0")
    def intercept(self):
        """
        Model intercept of Linear SVM Classifier.
        """
        return self._call_java("intercept")

    @since("3.1.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set. An exception is thrown if `trainingSummary is None`.

        Returns a :py:class:`LinearSVCTrainingSummary` wrapping the parent
        class's ``summary``; raises ``RuntimeError`` when ``hasSummary`` is false.
        """
        if self.hasSummary:
            return LinearSVCTrainingSummary(super(LinearSVCModel, self).summary)
        else:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.

        Returns
        -------
        :py:class:`LinearSVCSummary`
            Wrapper around the summary returned by the Java ``evaluate`` call.

        Raises
        ------
        TypeError
            If `dataset` is not a :py:class:`pyspark.sql.DataFrame`.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_lsvc_summary = self._call_java("evaluate", dataset)
        return LinearSVCSummary(java_lsvc_summary)

class LinearSVCSummary(_BinaryClassificationSummary):
    """
    LinearSVC results for a given model.

    .. versionadded:: 3.1.0
    """

@inherit_doc
class LinearSVCTrainingSummary(LinearSVCSummary, _TrainingSummary):
    """
    LinearSVC training results.

    .. versionadded:: 3.1.0
    """

class _LogisticRegressionParams(_ProbabilisticClassifierParams, HasRegParam,
                                HasElasticNetParam, HasMaxIter, HasFitIntercept, HasTol,
                                HasStandardization, HasWeightCol, HasAggregationDepth,
                                HasThreshold, HasMaxBlockSizeInMB):
    """
    Params for :py:class:`LogisticRegression` and :py:class:`LogisticRegressionModel`.

    .. versionadded:: 3.0.0
    """

    threshold = Param(Params._dummy(), "threshold",
                      "Threshold in binary classification prediction, in range [0, 1]." +
                      " If threshold and thresholds are both set, they must match." +
                      " e.g. if threshold is p, then thresholds must be equal to [1-p, p].",
                      typeConverter=TypeConverters.toFloat)

    family = Param(Params._dummy(), "family",
                   "The name of family which is a description of the label distribution to " +
                   "be used in the model. Supported options: auto, binomial, multinomial",
                   typeConverter=TypeConverters.toString)

    lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients",
                                      "The lower bounds on coefficients if fitting under bound "
                                      "constrained optimization. The bound matrix must be "
                                      "compatible with the shape "
                                      "(1, number of features) for binomial regression, or "
                                      "(number of classes, number of features) "
                                      "for multinomial regression.",
                                      typeConverter=TypeConverters.toMatrix)

    upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients",
                                      "The upper bounds on coefficients if fitting under bound "
                                      "constrained optimization. The bound matrix must be "
                                      "compatible with the shape "
                                      "(1, number of features) for binomial regression, or "
                                      "(number of classes, number of features) "
                                      "for multinomial regression.",
                                      typeConverter=TypeConverters.toMatrix)

    # The original fragments lacked trailing spaces and misspelled "classes",
    # producing "must beequal" and "oflasses" in the rendered description.
    lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts",
                                    "The lower bounds on intercepts if fitting under bound "
                                    "constrained optimization. The bounds vector size must be "
                                    "equal with 1 for binomial regression, or the number of "
                                    "classes for multinomial regression.",
                                    typeConverter=TypeConverters.toVector)

    upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts",
                                    "The upper bounds on intercepts if fitting under bound "
                                    "constrained optimization. The bound vector size must be "
                                    "equal with 1 for binomial regression, or the number of "
                                    "classes for multinomial regression.",
                                    typeConverter=TypeConverters.toVector)

    def __init__(self, *args):
        super().__init__(*args)
        self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto",
                         maxBlockSizeInMB=0.0)

    @since("1.4.0")
    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        Clears value of :py:attr:`thresholds` if it has been set.
        """
        self._set(threshold=value)
        self.clear(self.thresholds)
        return self

    @since("1.4.0")
    def getThreshold(self):
        """
        Get threshold for binary classification.

        If :py:attr:`thresholds` is set with length 2 (i.e., binary classification),
        this returns the equivalent threshold:
        :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`.
        Otherwise, returns :py:attr:`threshold` if set or its default value if unset.

        Raises ``ValueError`` if :py:attr:`thresholds` is set with a length
        other than 2, or if it is inconsistent with :py:attr:`threshold`.
        """
        self._checkThresholdConsistency()
        if self.isSet(self.thresholds):
            ts = self.getOrDefault(self.thresholds)
            if len(ts) != 2:
                # thresholds holds numbers, so each element is converted to
                # str before joining; joining them directly raises TypeError.
                raise ValueError("Logistic Regression getThreshold only applies to" +
                                 " binary classification, but thresholds has length != 2." +
                                 " thresholds: " + ",".join(map(str, ts)))
            return 1.0/(1.0 + ts[0]/ts[1])
        else:
            return self.getOrDefault(self.threshold)

    @since("1.5.0")
    def setThresholds(self, value):
        """
        Sets the value of :py:attr:`thresholds`.
        Clears value of :py:attr:`threshold` if it has been set.
        """
        self._set(thresholds=value)
        self.clear(self.threshold)
        return self

    @since("1.5.0")
    def getThresholds(self):
        """
        If :py:attr:`thresholds` is set, return its value.
        Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary
        classification: (1-threshold, threshold).
        If neither are set, throw an error.
        """
        self._checkThresholdConsistency()
        if not self.isSet(self.thresholds) and self.isSet(self.threshold):
            t = self.getOrDefault(self.threshold)
            return [1.0-t, t]
        else:
            return self.getOrDefault(self.thresholds)

    def _checkThresholdConsistency(self):
        """
        Verify that :py:attr:`threshold` and :py:attr:`thresholds` agree when both are set.

        Raises ``ValueError`` if thresholds does not have length 2, or if the
        threshold derived from it differs from :py:attr:`threshold` by 1E-5 or more.
        Does nothing when at most one of the two params is set.
        """
        if self.isSet(self.threshold) and self.isSet(self.thresholds):
            ts = self.getOrDefault(self.thresholds)
            if len(ts) != 2:
                raise ValueError("Logistic Regression getThreshold only applies to" +
                                 " binary classification, but thresholds has length != 2." +
                                 " thresholds: {0}".format(str(ts)))
            # Binary threshold equivalent to the two-element thresholds array.
            t = 1.0/(1.0 + ts[0]/ts[1])
            t2 = self.getOrDefault(self.threshold)
            if abs(t2 - t) >= 1E-5:
                raise ValueError("Logistic Regression getThreshold found inconsistent values for" +
                                 " threshold (%g) and thresholds (equivalent to %g)" % (t2, t))

    @since("2.1.0")
    def getFamily(self):
        """
        Gets the value of :py:attr:`family` or its default value.
        """
        return self.getOrDefault(self.family)

    @since("2.3.0")
    def getLowerBoundsOnCoefficients(self):
        """
        Gets the value of :py:attr:`lowerBoundsOnCoefficients`
        """
        return self.getOrDefault(self.lowerBoundsOnCoefficients)

    @since("2.3.0")
    def getUpperBoundsOnCoefficients(self):
        """
        Gets the value of :py:attr:`upperBoundsOnCoefficients`
        """
        return self.getOrDefault(self.upperBoundsOnCoefficients)

    @since("2.3.0")
    def getLowerBoundsOnIntercepts(self):
        """
        Gets the value of :py:attr:`lowerBoundsOnIntercepts`
        """
        return self.getOrDefault(self.lowerBoundsOnIntercepts)

    @since("2.3.0")
    def getUpperBoundsOnIntercepts(self):
        """
        Gets the value of :py:attr:`upperBoundsOnIntercepts`
        """
        return self.getOrDefault(self.upperBoundsOnIntercepts)

@inherit_doc

class LogisticRegression(_JavaProbabilisticClassifier, _LogisticRegressionParams, JavaMLWritable,

JavaMLReadable):

"""

Logistic regression.

This class supports multinomial logistic (softmax) and binomial logistic regression.

.. versionadded:: 1.3.0

Examples

--------

>>> from pyspark.sql import Row

>>> from pyspark.ml.linalg import Vectors

>>> bdf = sc.parallelize([

... Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),

... Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),

... Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),

... Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF()

>>> blor = LogisticRegression(weightCol="weight")

>>> blor.getRegParam()

0.0

>>> blor.setRegParam(0.01)

LogisticRegression...

>>> blor.getRegParam()

0.01

>>> blor.setMaxIter(10)

LogisticRegression...

>>> blor.getMaxIter()

>>> blor.clear(blor.maxIter)

>>> blorModel = blor.fit(bdf)

>>> blorModel.setFeaturesCol("features")

LogisticRegressionModel...

>>> blorModel.setProbabilityCol("newProbability")

LogisticRegressionModel...

>>> blorModel.getProbabilityCol()

'newProbability'

>>> blorModel.getMaxBlockSizeInMB()

0.0

>>> blorModel.setThreshold(0.1)

LogisticRegressionModel...

>>> blorModel.getThreshold()

0.1

>>> blorModel.coefficients

DenseVector([-1.080..., -0.646...])

>>> blorModel.intercept

3.112...

>>> blorModel.evaluate(bdf).accuracy == blorModel.summary.accuracy

True

>>> data_path = "data/mllib/sample_multiclass_classification_data.txt"

>>> mdf = spark.read.format("libsvm").load(data_path)

>>> mlor = LogisticRegression(regParam=0.1, elasticNetParam=1.0, family="multinomial")

>>> mlorModel = mlor.fit(mdf)

>>> mlorModel.coefficientMatrix

SparseMatrix(3, 4, [0, 1, 2, 3], [3, 2, 1], [1.87..., -2.75..., -0.50...], 1)

>>> mlorModel.interceptVector

DenseVector([0.04..., -0.42..., 0.37...])

>>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 1.0))]).toDF()

>>> blorModel.predict(test0.head().features)

1.0

>>> blorModel.predictRaw(test0.head().features)

DenseVector([-3.54..., 3.54...])

>>> blorModel.predictProbability(test0.head().features)

DenseVector([0.028, 0.972])

>>> result = blorModel.transform(test0).head()

>>> result.prediction

1.0

>>> result.newProbability

DenseVector([0.02..., 0.97...])

>>> result.rawPrediction

DenseVector([-3.54..., 3.54...])

>>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()

>>> blorModel.transform(test1).head().prediction

1.0

>>> blor.setParams("vector")

Traceback (most recent call last):

...

TypeError: Method setParams forces keyword arguments.

>>> lr_path = temp_path + "/lr"

>>> blor.save(lr_path)

>>> lr2 = LogisticRegression.load(lr_path)

>>> lr2.getRegParam()

0.01

>>> model_path = temp_path + "/lr_model"

>>> blorModel.save(model_path)

>>> model2 = LogisticRegressionModel.load(model_path)

>>> blorModel.coefficients[0] == model2.coefficients[0]

True

>>> blorModel.intercept == model2.intercept

True

>>> model2

LogisticRegressionModel: uid=..., numClasses=2, numFeatures=2

>>> blorModel.transform(test0).take(1) == model2.transform(test0).take(1)

True

"""

@keyword_only
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
             maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
             threshold=0.5, thresholds=None, probabilityCol="probability",
             rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
             aggregationDepth=2, family="auto",
             lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
             lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None,
             maxBlockSizeInMB=0.0):
    """
    __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
             maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
             threshold=0.5, thresholds=None, probabilityCol="probability", \
             rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
             aggregationDepth=2, family="auto", \
             lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
             lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, \
             maxBlockSizeInMB=0.0):
    When both the threshold and thresholds Params are set, they must be equivalent.
    """
    super(LogisticRegression, self).__init__()
    # Create the Java peer under this instance's uid before applying params.
    self._java_obj = self._new_java_obj(
        "org.apache.spark.ml.classification.LogisticRegression", self.uid)
    self.setParams(**self._input_kwargs)
    self._checkThresholdConsistency()

@keyword_only
@since("1.3.0")
def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
              maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
              threshold=0.5, thresholds=None, probabilityCol="probability",
              rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
              aggregationDepth=2, family="auto",
              lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
              lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None,
              maxBlockSizeInMB=0.0):
    """
    setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
              maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
              threshold=0.5, thresholds=None, probabilityCol="probability", \
              rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
              aggregationDepth=2, family="auto", \
              lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
              lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, \
              maxBlockSizeInMB=0.0):
    Sets params for logistic regression.
    When both the threshold and thresholds Params are set, they must be equivalent.
    """
    # Apply the captured keyword arguments, then validate the two threshold params.
    self._set(**self._input_kwargs)
    self._checkThresholdConsistency()
    return self

def _create_model(self, java_model):
    """Wrap ``java_model`` in a :py:class:`LogisticRegressionModel` and return it."""
    fitted = LogisticRegressionModel(java_model)
    return fitted

@since("2.1.0")
def setFamily(self, value):
    """
    Assign ``value`` to :py:attr:`family` via ``_set`` and return its result.
    """
    updated = self._set(family=value)
    return updated

@since("2.3.0")
def setLowerBoundsOnCoefficients(self, value):
    """
    Assign ``value`` to :py:attr:`lowerBoundsOnCoefficients` via ``_set`` and return its result.
    """
    updated = self._set(lowerBoundsOnCoefficients=value)
    return updated

@since("2.3.0")
def setUpperBoundsOnCoefficients(self, value):
    """
    Assign ``value`` to :py:attr:`upperBoundsOnCoefficients` via ``_set`` and return its result.
    """
    updated = self._set(upperBoundsOnCoefficients=value)
    return updated

@since("2.3.0")
def setLowerBoundsOnIntercepts(self, value):
    """
    Assign ``value`` to :py:attr:`lowerBoundsOnIntercepts` via ``_set`` and return its result.
    """
    updated = self._set(lowerBoundsOnIntercepts=value)
    return updated

@since("2.3.0")
def setUpperBoundsOnIntercepts(self, value):
    """
    Assign ``value`` to :py:attr:`upperBoundsOnIntercepts` via ``_set`` and return its result.
    """
    updated = self._set(upperBoundsOnIntercepts=value)
    return updated

def setMaxIter(self, value):
    """
    Assign ``value`` to :py:attr:`maxIter` via ``_set`` and return its result.
    """
    updated = self._set(maxIter=value)
    return updated

def setRegParam(self, value):
    """
    Assign ``value`` to :py:attr:`regParam` via ``_set`` and return its result.
    """
    updated = self._set(regParam=value)
    return updated

def setTol(self, value):
    """
    Assign ``value`` to :py:attr:`tol` via ``_set`` and return its result.
    """
    updated = self._set(tol=value)
    return updated

def setElasticNetParam(self, value):
    """
    Assign ``value`` to :py:attr:`elasticNetParam` via ``_set`` and return its result.
    """
    updated = self._set(elasticNetParam=value)
    return updated

def setFitIntercept(self, value):
    """
    Assign ``value`` to :py:attr:`fitIntercept` via ``_set`` and return its result.
    """
    updated = self._set(fitIntercept=value)
    return updated

def setStandardization(self, value):
    """
    Assign ``value`` to :py:attr:`standardization` via ``_set`` and return its result.
    """
    updated = self._set(standardization=value)
    return updated

def setWeightCol(self, value):
    """
    Assign ``value`` to :py:attr:`weightCol` via ``_set`` and return its result.
    """
    updated = self._set(weightCol=value)
    return updated

def setAggregationDepth(self, value):
    """
    Assign ``value`` to :py:attr:`aggregationDepth` via ``_set`` and return its result.
    """
    updated = self._set(aggregationDepth=value)
    return updated

@since("3.1.0")
def setMaxBlockSizeInMB(self, value):
    """
    Assign ``value`` to :py:attr:`maxBlockSizeInMB` via ``_set`` and return its result.
    """
    updated = self._set(maxBlockSizeInMB=value)
    return updated

class LogisticRegressionModel(_JavaProbabilisticClassificationModel, _LogisticRegressionParams,
                              JavaMLWritable, JavaMLReadable, HasTrainingSummary):
    """
    Model fitted by LogisticRegression.

    .. versionadded:: 1.3.0
    """

    @property
    @since("2.0.0")
    def coefficients(self):
        """
        Model coefficients of binomial logistic regression.
        An exception is thrown in the case of multinomial logistic regression.
        """
        return self._call_java("coefficients")

    @property
    @since("1.4.0")
    def intercept(self):
        """
        Model intercept of binomial logistic regression.
        An exception is thrown in the case of multinomial logistic regression.
        """
        return self._call_java("intercept")

    @property
    @since("2.1.0")
    def coefficientMatrix(self):
        """
        Model coefficients.
        """
        return self._call_java("coefficientMatrix")

    @property
    @since("2.1.0")
    def interceptVector(self):
        """
        Model intercept.
        """
        return self._call_java("interceptVector")

    @property
    @since("2.0.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set.

        A :py:class:`BinaryLogisticRegressionTrainingSummary` is returned when
        ``numClasses <= 2``, otherwise a :py:class:`LogisticRegressionTrainingSummary`.
        A ``RuntimeError`` is raised when ``hasSummary`` is false.
        """
        if not self.hasSummary:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)
        # The wrapper class is chosen before the parent summary is fetched, matching the
        # evaluation order of the original branches.
        if self.numClasses <= 2:
            return BinaryLogisticRegressionTrainingSummary(super(LogisticRegressionModel,
                                                                 self).summary)
        return LogisticRegressionTrainingSummary(super(LogisticRegressionModel,
                                                       self).summary)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.

        Returns
        -------
        :py:class:`BinaryLogisticRegressionSummary` when ``numClasses <= 2``, otherwise
        :py:class:`LogisticRegressionSummary`.

        Raises
        ------
        TypeError
            If ``dataset`` is not a ``DataFrame``.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_blr_summary = self._call_java("evaluate", dataset)
        if self.numClasses <= 2:
            return BinaryLogisticRegressionSummary(java_blr_summary)
        return LogisticRegressionSummary(java_blr_summary)

class LogisticRegressionSummary(_ClassificationSummary):
    """
    Summary of logistic regression results for a given model.

    .. versionadded:: 2.0.0
    """

    @property
    @since("2.0.0")
    def probabilityCol(self):
        """
        Name of the field in "predictions" holding the per-class probabilities
        as a vector.
        """
        column = self._call_java("probabilityCol")
        return column

    @property
    @since("2.0.0")
    def featuresCol(self):
        """
        Name of the field in "predictions" holding each instance's features
        as a vector.
        """
        column = self._call_java("featuresCol")
        return column

@inherit_doc
class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary):
    """
    Training results for multinomial logistic regression.

    Defines no members of its own; everything comes from the two base classes.

    .. versionadded:: 2.0.0
    """

@inherit_doc
class BinaryLogisticRegressionSummary(_BinaryClassificationSummary,
                                      LogisticRegressionSummary):
    """
    Results of binary logistic regression for a given model.

    Defines no members of its own; everything comes from the two base classes.

    .. versionadded:: 2.0.0
    """

@inherit_doc
class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary,
                                              LogisticRegressionTrainingSummary):
    """
    Training results of binary logistic regression for a given model.

    Defines no members of its own; everything comes from the two base classes.

    .. versionadded:: 2.0.0
    """

@inherit_doc
class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams):
    """
    Shared params of :py:class:`DecisionTreeClassifier` and
    :py:class:`DecisionTreeClassificationModel`.
    """

    def __init__(self, *args):
        super(_DecisionTreeClassifierParams, self).__init__(*args)
        # Default values registered for the decision-tree classifier params.
        defaults = dict(
            maxDepth=5,
            maxBins=32,
            minInstancesPerNode=1,
            minInfoGain=0.0,
            maxMemoryInMB=256,
            cacheNodeIds=False,
            checkpointInterval=10,
            impurity="gini",
            leafCol="",
            minWeightFractionPerNode=0.0,
        )
        self._setDefault(**defaults)

@inherit_doc
class DecisionTreeClassifier(_JavaProbabilisticClassifier, _DecisionTreeClassifierParams,
                             JavaMLWritable, JavaMLReadable):
    """
    `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
    learning algorithm for classification.
    It supports both binary and multiclass labels, as well as both continuous and categorical
    features.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.feature import StringIndexer
    >>> df = spark.createDataFrame([
    ...     (1.0, Vectors.dense(1.0)),
    ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    >>> si_model = stringIndexer.fit(df)
    >>> td = si_model.transform(df)
    >>> dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed", leafCol="leafId")
    >>> model = dt.fit(td)
    >>> model.getLabelCol()
    'indexed'
    >>> model.setFeaturesCol("features")
    DecisionTreeClassificationModel...
    >>> model.numNodes
    3
    >>> model.depth
    1
    >>> model.featureImportances
    SparseVector(1, {0: 1.0})
    >>> model.numFeatures
    1
    >>> model.numClasses
    2
    >>> print(model.toDebugString)
    DecisionTreeClassificationModel...depth=1, numNodes=3...
    >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    >>> model.predict(test0.head().features)
    0.0
    >>> model.predictRaw(test0.head().features)
    DenseVector([1.0, 0.0])
    >>> model.predictProbability(test0.head().features)
    DenseVector([1.0, 0.0])
    >>> result = model.transform(test0).head()
    >>> result.prediction
    0.0
    >>> result.probability
    DenseVector([1.0, 0.0])
    >>> result.rawPrediction
    DenseVector([1.0, 0.0])
    >>> result.leafId
    0.0
    >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    >>> model.transform(test1).head().prediction
    1.0
    >>> dtc_path = temp_path + "/dtc"
    >>> dt.save(dtc_path)
    >>> dt2 = DecisionTreeClassifier.load(dtc_path)
    >>> dt2.getMaxDepth()
    2
    >>> model_path = temp_path + "/dtc_model"
    >>> model.save(model_path)
    >>> model2 = DecisionTreeClassificationModel.load(model_path)
    >>> model.featureImportances == model2.featureImportances
    True
    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
    True
    >>> df3 = spark.createDataFrame([
    ...     (1.0, 0.2, Vectors.dense(1.0)),
    ...     (1.0, 0.8, Vectors.dense(1.0)),
    ...     (0.0, 1.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])
    >>> si3 = StringIndexer(inputCol="label", outputCol="indexed")
    >>> si_model3 = si3.fit(df3)
    >>> td3 = si_model3.transform(df3)
    >>> dt3 = DecisionTreeClassifier(maxDepth=2, weightCol="weight", labelCol="indexed")
    >>> model3 = dt3.fit(td3)
    >>> print(model3.toDebugString)
    DecisionTreeClassificationModel...depth=1, numNodes=3...
    """

    @keyword_only
    def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
                 seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 probabilityCol="probability", rawPredictionCol="rawPrediction", \
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
                 seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0)
        """
        super(DecisionTreeClassifier, self).__init__()
        # Create the backing JVM estimator, keyed by this instance's uid.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  probabilityCol="probability", rawPredictionCol="rawPrediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                  impurity="gini", seed=None, weightCol=None, leafCol="",
                  minWeightFractionPerNode=0.0):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  probabilityCol="probability", rawPredictionCol="rawPrediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
                  seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0)
        Sets params for the DecisionTreeClassifier.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _create_model(self, java_model):
        return DecisionTreeClassificationModel(java_model)

    def setMaxDepth(self, value):
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        return self._set(maxDepth=value)

    def setMaxBins(self, value):
        """
        Sets the value of :py:attr:`maxBins`.
        """
        return self._set(maxBins=value)

    def setMinInstancesPerNode(self, value):
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        return self._set(minInstancesPerNode=value)

    @since("3.0.0")
    def setMinWeightFractionPerNode(self, value):
        """
        Sets the value of :py:attr:`minWeightFractionPerNode`.
        """
        return self._set(minWeightFractionPerNode=value)

    def setMinInfoGain(self, value):
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        return self._set(minInfoGain=value)

    def setMaxMemoryInMB(self, value):
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        return self._set(maxMemoryInMB=value)

    def setCacheNodeIds(self, value):
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        return self._set(cacheNodeIds=value)

    @since("1.4.0")
    def setImpurity(self, value):
        """
        Sets the value of :py:attr:`impurity`.
        """
        return self._set(impurity=value)

    @since("1.4.0")
    def setCheckpointInterval(self, value):
        """
        Sets the value of :py:attr:`checkpointInterval`.
        """
        return self._set(checkpointInterval=value)

    def setSeed(self, value):
        """
        Sets the value of :py:attr:`seed`.
        """
        return self._set(seed=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Sets the value of :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)

@inherit_doc
class DecisionTreeClassificationModel(_DecisionTreeModel, _JavaProbabilisticClassificationModel,
                                      _DecisionTreeClassifierParams, JavaMLWritable,
                                      JavaMLReadable):
    """
    Fitted model produced by DecisionTreeClassifier.

    .. versionadded:: 1.4.0
    """

    @property
    def featureImportances(self):
        """
        Estimated importance of each feature.

        The idea of "Gini" importance is generalized to other losses, following the
        explanation of Gini importance in the "Random Forests" documentation by
        Leo Breiman and Adele Cutler and the implementation in scikit-learn.

        The importance is computed as follows:
          - importance(feature j) = sum (over nodes which split on feature j) of the gain,
            where gain is scaled by the number of instances passing through node
          - The importances of the tree are then normalized to sum to 1.

        .. versionadded:: 2.0.0

        Notes
        -----
        For a single decision tree, feature importance can have high variance due to
        correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
        to determine feature importance instead.
        """
        importances = self._call_java("featureImportances")
        return importances

@inherit_doc
class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams):
    """
    Shared params of :py:class:`RandomForestClassifier` and
    :py:class:`RandomForestClassificationModel`.
    """

    def __init__(self, *args):
        super(_RandomForestClassifierParams, self).__init__(*args)
        # Default values registered for the random-forest classifier params.
        defaults = dict(
            maxDepth=5,
            maxBins=32,
            minInstancesPerNode=1,
            minInfoGain=0.0,
            maxMemoryInMB=256,
            cacheNodeIds=False,
            checkpointInterval=10,
            impurity="gini",
            numTrees=20,
            featureSubsetStrategy="auto",
            subsamplingRate=1.0,
            leafCol="",
            minWeightFractionPerNode=0.0,
            bootstrap=True,
        )
        self._setDefault(**defaults)

@inherit_doc
class RandomForestClassifier(_JavaProbabilisticClassifier, _RandomForestClassifierParams,
                             JavaMLWritable, JavaMLReadable):
    """
    `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
    learning algorithm for classification.
    It supports both binary and multiclass labels, as well as both continuous and categorical
    features.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> import numpy
    >>> from numpy import allclose
    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.feature import StringIndexer
    >>> df = spark.createDataFrame([
    ...     (1.0, Vectors.dense(1.0)),
    ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    >>> si_model = stringIndexer.fit(df)
    >>> td = si_model.transform(df)
    >>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42,
    ...     leafCol="leafId")
    >>> rf.getMinWeightFractionPerNode()
    0.0
    >>> model = rf.fit(td)
    >>> model.getLabelCol()
    'indexed'
    >>> model.setFeaturesCol("features")
    RandomForestClassificationModel...
    >>> model.setRawPredictionCol("newRawPrediction")
    RandomForestClassificationModel...
    >>> model.getBootstrap()
    True
    >>> model.getRawPredictionCol()
    'newRawPrediction'
    >>> model.featureImportances
    SparseVector(1, {0: 1.0})
    >>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
    True
    >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    >>> model.predict(test0.head().features)
    0.0
    >>> model.predictRaw(test0.head().features)
    DenseVector([2.0, 0.0])
    >>> model.predictProbability(test0.head().features)
    DenseVector([1.0, 0.0])
    >>> result = model.transform(test0).head()
    >>> result.prediction
    0.0
    >>> numpy.argmax(result.probability)
    0
    >>> numpy.argmax(result.newRawPrediction)
    0
    >>> result.leafId
    DenseVector([0.0, 0.0, 0.0])
    >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    >>> model.transform(test1).head().prediction
    1.0
    >>> model.trees
    [DecisionTreeClassificationModel...depth=..., DecisionTreeClassificationModel...]
    >>> rfc_path = temp_path + "/rfc"
    >>> rf.save(rfc_path)
    >>> rf2 = RandomForestClassifier.load(rfc_path)
    >>> rf2.getNumTrees()
    3
    >>> model_path = temp_path + "/rfc_model"
    >>> model.save(model_path)
    >>> model2 = RandomForestClassificationModel.load(model_path)
    >>> model.featureImportances == model2.featureImportances
    True
    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
    True
    """

    @keyword_only
    def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
                 numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0,
                 leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 probabilityCol="probability", rawPredictionCol="rawPrediction", \
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
                 numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, \
                 leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True)
        """
        super(RandomForestClassifier, self).__init__()
        # Create the backing JVM estimator, keyed by this instance's uid.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  probabilityCol="probability", rawPredictionCol="rawPrediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
                  impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0,
                  leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 probabilityCol="probability", rawPredictionCol="rawPrediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
                  impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0, \
                  leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True)
        Sets params for the RandomForestClassifier.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _create_model(self, java_model):
        return RandomForestClassificationModel(java_model)

    def setMaxDepth(self, value):
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        return self._set(maxDepth=value)

    def setMaxBins(self, value):
        """
        Sets the value of :py:attr:`maxBins`.
        """
        return self._set(maxBins=value)

    def setMinInstancesPerNode(self, value):
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        return self._set(minInstancesPerNode=value)

    def setMinInfoGain(self, value):
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        return self._set(minInfoGain=value)

    def setMaxMemoryInMB(self, value):
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        return self._set(maxMemoryInMB=value)

    def setCacheNodeIds(self, value):
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        return self._set(cacheNodeIds=value)

    @since("1.4.0")
    def setImpurity(self, value):
        """
        Sets the value of :py:attr:`impurity`.
        """
        return self._set(impurity=value)

    @since("1.4.0")
    def setNumTrees(self, value):
        """
        Sets the value of :py:attr:`numTrees`.
        """
        return self._set(numTrees=value)

    @since("3.0.0")
    def setBootstrap(self, value):
        """
        Sets the value of :py:attr:`bootstrap`.
        """
        return self._set(bootstrap=value)

    @since("1.4.0")
    def setSubsamplingRate(self, value):
        """
        Sets the value of :py:attr:`subsamplingRate`.
        """
        return self._set(subsamplingRate=value)

    @since("2.4.0")
    def setFeatureSubsetStrategy(self, value):
        """
        Sets the value of :py:attr:`featureSubsetStrategy`.
        """
        return self._set(featureSubsetStrategy=value)

    def setSeed(self, value):
        """
        Sets the value of :py:attr:`seed`.
        """
        return self._set(seed=value)

    def setCheckpointInterval(self, value):
        """
        Sets the value of :py:attr:`checkpointInterval`.
        """
        return self._set(checkpointInterval=value)

    @since("3.0.0")
    def setWeightCol(self, value):
        """
        Sets the value of :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)

    @since("3.0.0")
    def setMinWeightFractionPerNode(self, value):
        """
        Sets the value of :py:attr:`minWeightFractionPerNode`.
        """
        return self._set(minWeightFractionPerNode=value)

class RandomForestClassificationModel(_TreeEnsembleModel, _JavaProbabilisticClassificationModel,

_RandomForestClassifierParams, JavaMLWritable,

JavaMLReadable, HasTrainingSummary):

"""

Model fitted by RandomForestClassifier.

.. versionadded:: 1.4.0

"""

@property

def featureImportances(self):

"""

Estimate of the importance of each feature.

Each feature's importance is the average of its importance across all trees in the ensemble

The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.

(Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)

and follows the implementation from scikit-learn.

.. versionadded:: 2.0.0

Coverage for pyspark/ml/classification.py : 87%

984 statements 855 run 129 missing 0 excluded 22 partial