Coverage for pyspark/ml/feature.py: 90%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

# Licensed to the Apache Software Foundation (ASF) under one or more

# contributor license agreements. See the NOTICE file distributed with

# this work for additional information regarding copyright ownership.

# The ASF licenses this file to You under the Apache License, Version 2.0

# (the "License"); you may not use this file except in compliance with

# the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

from pyspark import since, keyword_only, SparkContext

from pyspark.ml.linalg import _convert_to_vector

from pyspark.ml.param.shared import HasThreshold, HasThresholds, HasInputCol, HasOutputCol, \

HasInputCols, HasOutputCols, HasHandleInvalid, HasRelativeError, HasFeaturesCol, HasLabelCol, \

HasSeed, HasNumFeatures, HasStepSize, HasMaxIter, TypeConverters, Param, Params

from pyspark.ml.util import JavaMLReadable, JavaMLWritable

from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, _jvm

from pyspark.ml.common import inherit_doc

# Public API of this module: the names exported by ``from pyspark.ml.feature import *``.
# Estimators are listed next to the model they produce.
__all__ = [
    "Binarizer",
    "BucketedRandomProjectionLSH", "BucketedRandomProjectionLSHModel",
    "Bucketizer",
    "ChiSqSelector", "ChiSqSelectorModel",
    "CountVectorizer", "CountVectorizerModel",
    "DCT",
    "ElementwiseProduct",
    "FeatureHasher",
    "HashingTF",
    "IDF", "IDFModel",
    "Imputer", "ImputerModel",
    "IndexToString",
    "Interaction",
    "MaxAbsScaler", "MaxAbsScalerModel",
    "MinHashLSH", "MinHashLSHModel",
    "MinMaxScaler", "MinMaxScalerModel",
    "NGram",
    "Normalizer",
    "OneHotEncoder", "OneHotEncoderModel",
    "PCA", "PCAModel",
    "PolynomialExpansion",
    "QuantileDiscretizer",
    "RobustScaler", "RobustScalerModel",
    "RegexTokenizer",
    "RFormula", "RFormulaModel",
    "SQLTransformer",
    "StandardScaler", "StandardScalerModel",
    "StopWordsRemover",
    "StringIndexer", "StringIndexerModel",
    "Tokenizer",
    "UnivariateFeatureSelector", "UnivariateFeatureSelectorModel",
    "VarianceThresholdSelector", "VarianceThresholdSelectorModel",
    "VectorAssembler",
    "VectorIndexer", "VectorIndexerModel",
    "VectorSizeHint",
    "VectorSlicer",
    "Word2Vec", "Word2VecModel",
]

@inherit_doc
class Binarizer(JavaTransformer, HasThreshold, HasThresholds, HasInputCol, HasOutputCol,
                HasInputCols, HasOutputCols, JavaMLReadable, JavaMLWritable):
    """
    Binarize a column of continuous features given a threshold. Since 3.0.0,
    :py:class:`Binarizer` can map multiple columns at once by setting the :py:attr:`inputCols`
    parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters
    are set, an Exception will be thrown. The :py:attr:`threshold` parameter is used for
    single column usage, and :py:attr:`thresholds` is for multiple columns.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> df = spark.createDataFrame([(0.5,)], ["values"])
    >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
    >>> binarizer.setThreshold(1.0)
    Binarizer...
    >>> binarizer.setInputCol("values")
    Binarizer...
    >>> binarizer.setOutputCol("features")
    Binarizer...
    >>> binarizer.transform(df).head().features
    0.0
    >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs
    0.0
    >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"}
    >>> binarizer.transform(df, params).head().vector
    1.0
    >>> binarizerPath = temp_path + "/binarizer"
    >>> binarizer.save(binarizerPath)
    >>> loadedBinarizer = Binarizer.load(binarizerPath)
    >>> loadedBinarizer.getThreshold() == binarizer.getThreshold()
    True
    >>> loadedBinarizer.transform(df).take(1) == binarizer.transform(df).take(1)
    True
    >>> df2 = spark.createDataFrame([(0.5, 0.3)], ["values1", "values2"])
    >>> binarizer2 = Binarizer(thresholds=[0.0, 1.0])
    >>> binarizer2.setInputCols(["values1", "values2"]).setOutputCols(["output1", "output2"])
    Binarizer...
    >>> binarizer2.transform(df2).show()
    +-------+-------+-------+-------+
    |values1|values2|output1|output2|
    +-------+-------+-------+-------+
    |    0.5|    0.3|    1.0|    0.0|
    +-------+-------+-------+-------+
    ...
    """

    # Single-column cut-off: values strictly greater map to 1.0, everything else to 0.0.
    threshold = Param(Params._dummy(), "threshold",
                      "Param for threshold used to binarize continuous features. " +
                      "The features greater than the threshold will be binarized to 1.0. " +
                      "The features equal to or less than the threshold will be binarized to 0.0",
                      typeConverter=TypeConverters.toFloat)

    # Per-column cut-offs used together with inputCols/outputCols.
    thresholds = Param(Params._dummy(), "thresholds",
                       "Param for array of threshold used to binarize continuous features. " +
                       "This is for multiple columns input. If transforming multiple columns " +
                       "and thresholds is not set, but threshold is set, then threshold will " +
                       "be applied across all columns.",
                       typeConverter=TypeConverters.toListFloat)

    @keyword_only
    def __init__(self, *, threshold=0.0, inputCol=None, outputCol=None, thresholds=None,
                 inputCols=None, outputCols=None):
        """
        __init__(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \
                 inputCols=None, outputCols=None)
        """
        super(Binarizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid)
        self._setDefault(threshold=0.0)
        # Forward only the keyword arguments the caller passed explicitly.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, *, threshold=0.0, inputCol=None, outputCol=None, thresholds=None,
                  inputCols=None, outputCols=None):
        """
        setParams(self, \\*, threshold=0.0, inputCol=None, outputCol=None, thresholds=None, \
                  inputCols=None, outputCols=None)
        Sets params for this Binarizer.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    @since("1.4.0")
    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        """
        return self._set(threshold=value)

    @since("3.0.0")
    def setThresholds(self, value):
        """
        Sets the value of :py:attr:`thresholds`.
        """
        return self._set(thresholds=value)

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setInputCols(self, value):
        """
        Sets the value of :py:attr:`inputCols`.
        """
        return self._set(inputCols=value)

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    @since("3.0.0")
    def setOutputCols(self, value):
        """
        Sets the value of :py:attr:`outputCols`.
        """
        return self._set(outputCols=value)

class _LSHParams(HasInputCol, HasOutputCol):
    """
    Shared parameters for the Locality Sensitive Hashing (LSH) estimators and models.
    """

    numHashTables = Param(
        Params._dummy(),
        "numHashTables",
        "number of hash tables, where "
        "increasing number of hash tables lowers the false negative rate, "
        "and decreasing it improves the running performance.",
        typeConverter=TypeConverters.toInt,
    )

    def __init__(self, *args):
        super().__init__(*args)
        # One hash table unless the caller asks for more.
        self._setDefault(numHashTables=1)

    def getNumHashTables(self):
        """
        Returns :py:attr:`numHashTables`, falling back to its default when unset.
        """
        return self.getOrDefault(self.numHashTables)

class _LSH(JavaEstimator, _LSHParams, JavaMLReadable, JavaMLWritable):
    """
    Base estimator mixin for Locality Sensitive Hashing (LSH).
    """

    def setNumHashTables(self, value):
        """
        Assigns :py:attr:`numHashTables` and returns the result of ``_set``.
        """
        return self._set(numHashTables=value)

    def setInputCol(self, value):
        """
        Assigns :py:attr:`inputCol` and returns the result of ``_set``.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Assigns :py:attr:`outputCol` and returns the result of ``_set``.
        """
        return self._set(outputCol=value)

class _LSHModel(JavaModel, _LSHParams):
    """
    Base model mixin for Locality Sensitive Hashing (LSH).
    """

    def setInputCol(self, value):
        """
        Assigns :py:attr:`inputCol` and returns the result of ``_set``.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Assigns :py:attr:`outputCol` and returns the result of ``_set``.
        """
        return self._set(outputCol=value)

    def approxNearestNeighbors(self, dataset, key, numNearestNeighbors, distCol="distCol"):
        """
        Approximately find, in a large dataset, at most k items with the smallest distance
        to a given item. When :py:attr:`outputCol` is absent from the dataset it is computed
        by transforming the data; when it is present it is reused, which lets callers cache
        the transformed data.

        Notes
        -----
        This method is experimental and will likely change behavior in the next release.

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            The dataset to search for nearest neighbors of the key.
        key : :py:class:`pyspark.ml.linalg.Vector`
            Feature vector representing the item to search for.
        numNearestNeighbors : int
            The maximum number of nearest neighbors.
        distCol : str, optional
            Output column holding the distance between each result row and the key.
            Defaults to "distCol".

        Returns
        -------
        :py:class:`pyspark.sql.DataFrame`
            A dataset containing at most k items closest to the key, with an extra
            distance column (named by `distCol`).
        """
        # Delegates to the JVM model; arguments are passed through unchanged.
        java_args = (dataset, key, numNearestNeighbors, distCol)
        return self._call_java("approxNearestNeighbors", *java_args)

    def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol"):
        """
        Join two datasets to approximately find every pair of rows whose distance is below
        the threshold. When :py:attr:`outputCol` is absent it is computed by transforming
        the data; when it is present it is reused, which lets callers cache the transformed
        data.

        Parameters
        ----------
        datasetA : :py:class:`pyspark.sql.DataFrame`
            One of the datasets to join.
        datasetB : :py:class:`pyspark.sql.DataFrame`
            Another dataset to join.
        threshold : float
            The threshold for the distance of row pairs.
        distCol : str, optional
            Output column holding the distance between each pair of rows.
            Defaults to "distCol".

        Returns
        -------
        :py:class:`pyspark.sql.DataFrame`
            A joined dataset containing pairs of rows. The original rows are in columns
            "datasetA" and "datasetB", plus a distance column (named by `distCol`).
        """
        # Integers such as 3 are accepted; the converter turns them into a float first.
        return self._call_java(
            "approxSimilarityJoin", datasetA, datasetB, TypeConverters.toFloat(threshold),
            distCol)

class _BucketedRandomProjectionLSHParams:
    """
    Parameters shared by :py:class:`BucketedRandomProjectionLSH` and
    :py:class:`BucketedRandomProjectionLSHModel`.

    .. versionadded:: 3.0.0
    """

    bucketLength = Param(
        Params._dummy(),
        "bucketLength",
        "the length of each hash bucket, "
        "a larger bucket lowers the false negative rate.",
        typeConverter=TypeConverters.toFloat,
    )

    @since("2.2.0")
    def getBucketLength(self):
        """
        Returns :py:attr:`bucketLength`, falling back to its default when unset.
        """
        return self.getOrDefault(self.bucketLength)

@inherit_doc
class BucketedRandomProjectionLSH(_LSH, _BucketedRandomProjectionLSHParams,
                                  HasSeed, JavaMLReadable, JavaMLWritable):
    """
    Locality Sensitive Hashing estimator for the Euclidean distance metric.
    Inputs are dense or sparse vectors, each a point in Euclidean space. Outputs are
    vectors of configurable dimension; hash values in the same dimension are produced by
    the same hash function.

    .. versionadded:: 2.2.0

    Notes
    -----
    - `Stable Distributions in Wikipedia article on Locality-sensitive hashing \
      <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
    - `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.sql.functions import col
    >>> data = [(0, Vectors.dense([-1.0, -1.0 ]),),
    ...         (1, Vectors.dense([-1.0, 1.0 ]),),
    ...         (2, Vectors.dense([1.0, -1.0 ]),),
    ...         (3, Vectors.dense([1.0, 1.0]),)]
    >>> df = spark.createDataFrame(data, ["id", "features"])
    >>> brp = BucketedRandomProjectionLSH()
    >>> brp.setInputCol("features")
    BucketedRandomProjectionLSH...
    >>> brp.setOutputCol("hashes")
    BucketedRandomProjectionLSH...
    >>> brp.setSeed(12345)
    BucketedRandomProjectionLSH...
    >>> brp.setBucketLength(1.0)
    BucketedRandomProjectionLSH...
    >>> model = brp.fit(df)
    >>> model.getBucketLength()
    1.0
    >>> model.setOutputCol("hashes")
    BucketedRandomProjectionLSHModel...
    >>> model.transform(df).head()
    Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])])
    >>> data2 = [(4, Vectors.dense([2.0, 2.0 ]),),
    ...          (5, Vectors.dense([2.0, 3.0 ]),),
    ...          (6, Vectors.dense([3.0, 2.0 ]),),
    ...          (7, Vectors.dense([3.0, 3.0]),)]
    >>> df2 = spark.createDataFrame(data2, ["id", "features"])
    >>> model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect()
    [Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)]
    >>> model.approxSimilarityJoin(df, df2, 3.0, distCol="EuclideanDistance").select(
    ...     col("datasetA.id").alias("idA"),
    ...     col("datasetB.id").alias("idB"),
    ...     col("EuclideanDistance")).show()
    +---+---+-----------------+
    |idA|idB|EuclideanDistance|
    +---+---+-----------------+
    |  3|  6| 2.23606797749979|
    +---+---+-----------------+
    ...
    >>> model.approxSimilarityJoin(df, df2, 3, distCol="EuclideanDistance").select(
    ...     col("datasetA.id").alias("idA"),
    ...     col("datasetB.id").alias("idB"),
    ...     col("EuclideanDistance")).show()
    +---+---+-----------------+
    |idA|idB|EuclideanDistance|
    +---+---+-----------------+
    |  3|  6| 2.23606797749979|
    +---+---+-----------------+
    ...
    >>> brpPath = temp_path + "/brp"
    >>> brp.save(brpPath)
    >>> brp2 = BucketedRandomProjectionLSH.load(brpPath)
    >>> brp2.getBucketLength() == brp.getBucketLength()
    True
    >>> modelPath = temp_path + "/brp-model"
    >>> model.save(modelPath)
    >>> model2 = BucketedRandomProjectionLSHModel.load(modelPath)
    >>> model.transform(df).head().hashes == model2.transform(df).head().hashes
    True
    """

    @keyword_only
    def __init__(self, *, inputCol=None, outputCol=None, seed=None, numHashTables=1,
                 bucketLength=None):
        """
        __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \
                 bucketLength=None)
        """
        super().__init__()
        java_class_name = "org.apache.spark.ml.feature.BucketedRandomProjectionLSH"
        self._java_obj = self._new_java_obj(java_class_name, self.uid)
        # Apply only the keyword arguments captured by @keyword_only.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("2.2.0")
    def setParams(self, *, inputCol=None, outputCol=None, seed=None, numHashTables=1,
                  bucketLength=None):
        """
        setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1, \
                  bucketLength=None)
        Sets params for this BucketedRandomProjectionLSH.
        """
        return self._set(**self._input_kwargs)

    @since("2.2.0")
    def setBucketLength(self, value):
        """
        Assigns :py:attr:`bucketLength` and returns the result of ``_set``.
        """
        return self._set(bucketLength=value)

    def setSeed(self, value):
        """
        Assigns :py:attr:`seed` and returns the result of ``_set``.
        """
        return self._set(seed=value)

    def _create_model(self, java_model):
        # Wrap the fitted JVM model in its Python counterpart.
        return BucketedRandomProjectionLSHModel(java_model)

class BucketedRandomProjectionLSHModel(_LSHModel, _BucketedRandomProjectionLSHParams,
                                       JavaMLReadable, JavaMLWritable):
    r"""
    Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
    stored. The vectors are normalized to be unit vectors and each vector is used in a hash
    function: :math:`h_i(x) = floor(r_i \cdot x / bucketLength)` where :math:`r_i` is the
    i-th random unit vector. The number of buckets will be `(max L2 norm of input vectors) /
    bucketLength`.

    The class adds no members of its own; everything it offers comes from its base classes.

    .. versionadded:: 2.2.0
    """

@inherit_doc
class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols,
                 HasHandleInvalid, JavaMLReadable, JavaMLWritable):
    """
    Maps a column of continuous features to a column of feature buckets. Since 3.0.0,
    :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`
    parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters
    are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single
    column usage, and :py:attr:`splitsArray` is for multiple columns.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),
    ...     (float("nan"), 1.0), (float("nan"), 0.0)]
    >>> df = spark.createDataFrame(values, ["values1", "values2"])
    >>> bucketizer = Bucketizer()
    >>> bucketizer.setSplits([-float("inf"), 0.5, 1.4, float("inf")])
    Bucketizer...
    >>> bucketizer.setInputCol("values1")
    Bucketizer...
    >>> bucketizer.setOutputCol("buckets")
    Bucketizer...
    >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect()
    >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1"))
    >>> bucketed.show(truncate=False)
    +-------+-------+
    |values1|buckets|
    +-------+-------+
    |0.1    |0.0    |
    |0.4    |0.0    |
    |1.2    |1.0    |
    |1.5    |2.0    |
    |NaN    |3.0    |
    |NaN    |3.0    |
    +-------+-------+
    ...
    >>> bucketizer.setParams(outputCol="b").transform(df).head().b
    0.0
    >>> bucketizerPath = temp_path + "/bucketizer"
    >>> bucketizer.save(bucketizerPath)
    >>> loadedBucketizer = Bucketizer.load(bucketizerPath)
    >>> loadedBucketizer.getSplits() == bucketizer.getSplits()
    True
    >>> loadedBucketizer.transform(df).take(1) == bucketizer.transform(df).take(1)
    True
    >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect()
    >>> len(bucketed)
    4
    >>> bucketizer2 = Bucketizer(splitsArray=
    ...     [[-float("inf"), 0.5, 1.4, float("inf")], [-float("inf"), 0.5, float("inf")]],
    ...     inputCols=["values1", "values2"], outputCols=["buckets1", "buckets2"])
    >>> bucketed2 = bucketizer2.setHandleInvalid("keep").transform(df)
    >>> bucketed2.show(truncate=False)
    +-------+-------+--------+--------+
    |values1|values2|buckets1|buckets2|
    +-------+-------+--------+--------+
    |0.1    |0.0    |0.0     |0.0     |
    |0.4    |1.0    |0.0     |1.0     |
    |1.2    |1.3    |1.0     |1.0     |
    |1.5    |NaN    |2.0     |2.0     |
    |NaN    |1.0    |3.0     |1.0     |
    |NaN    |0.0    |3.0     |0.0     |
    +-------+-------+--------+--------+
    ...
    """

    # Split points for the single-column (inputCol/outputCol) mode.
    splits = \
        Param(Params._dummy(), "splits",
              "Split points for mapping continuous features into buckets. With n+1 splits, " +
              "there are n buckets. A bucket defined by splits x,y holds values in the " +
              "range [x,y) except the last bucket, which also includes y. The splits " +
              "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
              "explicitly provided to cover all Double values; otherwise, values outside the " +
              "splits specified will be treated as errors.",
              typeConverter=TypeConverters.toListFloat)

    # Overrides the generic HasHandleInvalid description with Bucketizer-specific wording.
    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries " +
                          "containing NaN values. Values outside the splits will always be " +
                          "treated as errors. Options are 'skip' (filter out rows with " +
                          "invalid values), " +
                          "'error' (throw an error), or 'keep' (keep invalid values in a " +
                          "special additional bucket). Note that in the multiple column " +
                          "case, the invalid handling is applied to all columns. That said " +
                          "for 'error' it will throw an error if any invalids are found in " +
                          "any column, for 'skip' it will skip rows with any invalids in " +
                          "any columns, etc.",
                          typeConverter=TypeConverters.toString)

    # One list of split points per input column for the multi-column mode.
    splitsArray = Param(Params._dummy(), "splitsArray", "The array of split points for mapping " +
                        "continuous features into buckets for multiple columns. For each input " +
                        "column, with n+1 splits, there are n buckets. A bucket defined by " +
                        "splits x,y holds values in the range [x,y) except the last bucket, " +
                        "which also includes y. The splits should be of length >= 3 and " +
                        "strictly increasing. Values at -inf, inf must be explicitly provided " +
                        "to cover all Double values; otherwise, values outside the splits " +
                        "specified will be treated as errors.",
                        typeConverter=TypeConverters.toListListFloat)

    @keyword_only
    def __init__(self, *, splits=None, inputCol=None, outputCol=None, handleInvalid="error",
                 splitsArray=None, inputCols=None, outputCols=None):
        """
        __init__(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \
                 splitsArray=None, inputCols=None, outputCols=None)
        """
        super(Bucketizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
        self._setDefault(handleInvalid="error")
        # Forward only the keyword arguments the caller passed explicitly.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, *, splits=None, inputCol=None, outputCol=None, handleInvalid="error",
                  splitsArray=None, inputCols=None, outputCols=None):
        """
        setParams(self, \\*, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \
                  splitsArray=None, inputCols=None, outputCols=None)
        Sets params for this Bucketizer.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    @since("1.4.0")
    def setSplits(self, value):
        """
        Sets the value of :py:attr:`splits`.
        """
        return self._set(splits=value)

    @since("1.4.0")
    def getSplits(self):
        """
        Gets the value of splits or its default value.
        """
        return self.getOrDefault(self.splits)

    @since("3.0.0")
    def setSplitsArray(self, value):
        """
        Sets the value of :py:attr:`splitsArray`.
        """
        return self._set(splitsArray=value)

    @since("3.0.0")
    def getSplitsArray(self):
        """
        Gets the array of split points or its default value.
        """
        return self.getOrDefault(self.splitsArray)

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setInputCols(self, value):
        """
        Sets the value of :py:attr:`inputCols`.
        """
        return self._set(inputCols=value)

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    @since("3.0.0")
    def setOutputCols(self, value):
        """
        Sets the value of :py:attr:`outputCols`.
        """
        return self._set(outputCols=value)

    def setHandleInvalid(self, value):
        """
        Sets the value of :py:attr:`handleInvalid`.
        """
        return self._set(handleInvalid=value)

class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
    """
    Parameters shared by :py:class:`CountVectorizer` and :py:class:`CountVectorizerModel`.
    """

    minTF = Param(
        Params._dummy(),
        "minTF",
        "Filter to ignore rare words in"
        " a document. For each document, terms with frequency/count less than the given"
        " threshold are ignored. If this is an integer >= 1, then this specifies a count (of"
        " times the term must appear in the document); if this is a double in [0,1), then this "
        "specifies a fraction (out of the document's token count). Note that the parameter is "
        "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0",
        typeConverter=TypeConverters.toFloat,
    )

    minDF = Param(
        Params._dummy(),
        "minDF",
        "Specifies the minimum number of"
        " different documents a term must appear in to be included in the vocabulary."
        " If this is an integer >= 1, this specifies the number of documents the term must"
        " appear in; if this is a double in [0,1), then this specifies the fraction of documents."
        " Default 1.0",
        typeConverter=TypeConverters.toFloat,
    )

    maxDF = Param(
        Params._dummy(),
        "maxDF",
        "Specifies the maximum number of"
        " different documents a term could appear in to be included in the vocabulary."
        " A term that appears more than the threshold will be ignored. If this is an"
        " integer >= 1, this specifies the maximum number of documents the term could appear in;"
        " if this is a double in [0,1), then this specifies the maximum"
        " fraction of documents the term could appear in."
        " Default (2^63) - 1",
        typeConverter=TypeConverters.toFloat,
    )

    vocabSize = Param(
        Params._dummy(),
        "vocabSize",
        "max size of the vocabulary. Default 1 << 18.",
        typeConverter=TypeConverters.toInt,
    )

    binary = Param(
        Params._dummy(),
        "binary",
        "Binary toggle to control the output vector values."
        " If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful"
        " for discrete probabilistic models that model binary events rather than integer counts."
        " Default False",
        typeConverter=TypeConverters.toBoolean,
    )

    def __init__(self, *args):
        super().__init__(*args)
        # Defaults match the values quoted in the parameter descriptions above.
        self._setDefault(
            minTF=1.0,
            minDF=1.0,
            maxDF=2 ** 63 - 1,
            vocabSize=1 << 18,
            binary=False,
        )

    @since("1.6.0")
    def getMinTF(self):
        """
        Returns :py:attr:`minTF`, falling back to its default when unset.
        """
        return self.getOrDefault(self.minTF)

    @since("1.6.0")
    def getMinDF(self):
        """
        Returns :py:attr:`minDF`, falling back to its default when unset.
        """
        return self.getOrDefault(self.minDF)

    @since("2.4.0")
    def getMaxDF(self):
        """
        Returns :py:attr:`maxDF`, falling back to its default when unset.
        """
        return self.getOrDefault(self.maxDF)

    @since("1.6.0")
    def getVocabSize(self):
        """
        Returns :py:attr:`vocabSize`, falling back to its default when unset.
        """
        return self.getOrDefault(self.vocabSize)

    @since("2.0.0")
    def getBinary(self):
        """
        Returns :py:attr:`binary`, falling back to its default when unset.
        """
        return self.getOrDefault(self.binary)

@inherit_doc
class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, JavaMLWritable):
    """
    Learns a vocabulary from a collection of documents and produces a
    :py:class:`CountVectorizerModel`.

    .. versionadded:: 1.6.0

    Examples
    --------
    >>> df = spark.createDataFrame(
    ...    [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
    ...    ["label", "raw"])
    >>> cv = CountVectorizer()
    >>> cv.setInputCol("raw")
    CountVectorizer...
    >>> cv.setOutputCol("vectors")
    CountVectorizer...
    >>> model = cv.fit(df)
    >>> model.setInputCol("raw")
    CountVectorizerModel...
    >>> model.transform(df).show(truncate=False)
    +-----+---------------+-------------------------+
    |label|raw            |vectors                  |
    +-----+---------------+-------------------------+
    |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
    |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
    +-----+---------------+-------------------------+
    ...
    >>> sorted(model.vocabulary) == ['a', 'b', 'c']
    True
    >>> countVectorizerPath = temp_path + "/count-vectorizer"
    >>> cv.save(countVectorizerPath)
    >>> loadedCv = CountVectorizer.load(countVectorizerPath)
    >>> loadedCv.getMinDF() == cv.getMinDF()
    True
    >>> loadedCv.getMinTF() == cv.getMinTF()
    True
    >>> loadedCv.getVocabSize() == cv.getVocabSize()
    True
    >>> modelPath = temp_path + "/count-vectorizer-model"
    >>> model.save(modelPath)
    >>> loadedModel = CountVectorizerModel.load(modelPath)
    >>> loadedModel.vocabulary == model.vocabulary
    True
    >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)
    True
    >>> fromVocabModel = CountVectorizerModel.from_vocabulary(["a", "b", "c"],
    ...     inputCol="raw", outputCol="vectors")
    >>> fromVocabModel.transform(df).show(truncate=False)
    +-----+---------------+-------------------------+
    |label|raw            |vectors                  |
    +-----+---------------+-------------------------+
    |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
    |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
    +-----+---------------+-------------------------+
    ...
    """

    @keyword_only
    def __init__(self, *, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18,
                 binary=False, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18,\
                 binary=False, inputCol=None,outputCol=None)
        """
        super().__init__()
        java_class_name = "org.apache.spark.ml.feature.CountVectorizer"
        self._java_obj = self._new_java_obj(java_class_name, self.uid)
        # Apply only the keyword arguments captured by @keyword_only.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.6.0")
    def setParams(self, *, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18,
                  binary=False, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18,\
                  binary=False, inputCol=None, outputCol=None)
        Set the params for the CountVectorizer
        """
        return self._set(**self._input_kwargs)

    @since("1.6.0")
    def setMinTF(self, value):
        """
        Assigns :py:attr:`minTF` and returns the result of ``_set``.
        """
        return self._set(minTF=value)

    @since("1.6.0")
    def setMinDF(self, value):
        """
        Assigns :py:attr:`minDF` and returns the result of ``_set``.
        """
        return self._set(minDF=value)

    @since("2.4.0")
    def setMaxDF(self, value):
        """
        Assigns :py:attr:`maxDF` and returns the result of ``_set``.
        """
        return self._set(maxDF=value)

    @since("1.6.0")
    def setVocabSize(self, value):
        """
        Assigns :py:attr:`vocabSize` and returns the result of ``_set``.
        """
        return self._set(vocabSize=value)

    @since("2.0.0")
    def setBinary(self, value):
        """
        Assigns :py:attr:`binary` and returns the result of ``_set``.
        """
        return self._set(binary=value)

    def setInputCol(self, value):
        """
        Assigns :py:attr:`inputCol` and returns the result of ``_set``.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Assigns :py:attr:`outputCol` and returns the result of ``_set``.
        """
        return self._set(outputCol=value)

    def _create_model(self, java_model):
        # Wrap the fitted JVM model in its Python counterpart.
        return CountVectorizerModel(java_model)

@inherit_doc
class CountVectorizerModel(JavaModel, _CountVectorizerParams, JavaMLReadable, JavaMLWritable):
    """
    Model fitted by :py:class:`CountVectorizer`.

    .. versionadded:: 1.6.0
    """

    @since("3.0.0")
    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    @classmethod
    @since("2.4.0")
    def from_vocabulary(cls, vocabulary, inputCol, outputCol=None, minTF=None, binary=None):
        """
        Construct the model directly from a vocabulary list of strings,
        requires an active SparkContext.

        `inputCol` is always set on the new model. `outputCol`, `minTF` and `binary` are
        set only when they are not None. `vocabSize` is set to ``len(vocabulary)``.
        """
        sc = SparkContext._active_spark_context
        # The JVM constructor takes a String[]; build it from the Python list.
        java_class = sc._gateway.jvm.java.lang.String
        jvocab = CountVectorizerModel._new_java_array(vocabulary, java_class)
        model = CountVectorizerModel._create_from_java_class(
            "org.apache.spark.ml.feature.CountVectorizerModel", jvocab)
        model.setInputCol(inputCol)
        if outputCol is not None:
            model.setOutputCol(outputCol)
        if minTF is not None:
            model.setMinTF(minTF)
        if binary is not None:
            model.setBinary(binary)
        model._set(vocabSize=len(vocabulary))
        return model

    @property
    @since("1.6.0")
    def vocabulary(self):
        """
        An array of terms in the vocabulary.
        """
        return self._call_java("vocabulary")

    @since("2.4.0")
    def setMinTF(self, value):
        """
        Sets the value of :py:attr:`minTF`.
        """
        return self._set(minTF=value)

    @since("2.4.0")
    def setBinary(self, value):
        """
        Sets the value of :py:attr:`binary`.
        """
        return self._set(binary=value)

@inherit_doc
class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """
    Feature transformer that applies the 1D discrete cosine transform to a real vector.
    The input vector is not zero padded, and the result is a real vector of the same
    length. The output is scaled so that the transform matrix is unitary
    (aka scaled DCT-II).

    .. versionadded:: 1.6.0

    Notes
    -----
    `More information on Wikipedia \
    <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
    >>> dct = DCT( )
    >>> dct.setInverse(False)
    DCT...
    >>> dct.setInputCol("vec")
    DCT...
    >>> dct.setOutputCol("resultVec")
    DCT...
    >>> df2 = dct.transform(df1)
    >>> df2.head().resultVec
    DenseVector([10.969..., -0.707..., -2.041...])
    >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2)
    >>> df3.head().origVec
    DenseVector([5.0, 8.0, 6.0])
    >>> dctPath = temp_path + "/dct"
    >>> dct.save(dctPath)
    >>> loadedDtc = DCT.load(dctPath)
    >>> loadedDtc.transform(df1).take(1) == dct.transform(df1).take(1)
    True
    >>> loadedDtc.getInverse()
    False
    """

    inverse = Param(
        Params._dummy(),
        "inverse",
        "Set transformer to perform inverse DCT, "
        "default False.",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self, *, inverse=False, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, inverse=False, inputCol=None, outputCol=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
        # Forward transform unless the caller asks for the inverse.
        self._setDefault(inverse=False)
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.6.0")
    def setParams(self, *, inverse=False, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, inverse=False, inputCol=None, outputCol=None)
        Sets params for this DCT.
        """
        return self._set(**self._input_kwargs)

    @since("1.6.0")
    def setInverse(self, value):
        """
        Assigns :py:attr:`inverse` and returns the result of ``_set``.
        """
        return self._set(inverse=value)

    @since("1.6.0")
    def getInverse(self):
        """
        Returns :py:attr:`inverse`, falling back to its default when unset.
        """
        return self.getOrDefault(self.inverse)

    def setInputCol(self, value):
        """
        Assigns :py:attr:`inputCol` and returns the result of ``_set``.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Assigns :py:attr:`outputCol` and returns the result of ``_set``.
        """
        return self._set(outputCol=value)

@inherit_doc
class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
                         JavaMLWritable):
    """
    Multiplies each input vector element-wise (the Hadamard product) by a
    provided "weight" vector. Put differently, every column of the dataset is
    scaled by its own scalar multiplier.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
    >>> ep = ElementwiseProduct()
    >>> ep.setScalingVec(Vectors.dense([1.0, 2.0, 3.0]))
    ElementwiseProduct...
    >>> ep.setInputCol("values")
    ElementwiseProduct...
    >>> ep.setOutputCol("eprod")
    ElementwiseProduct...
    >>> ep.transform(df).head().eprod
    DenseVector([2.0, 2.0, 9.0])
    >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod
    DenseVector([4.0, 3.0, 15.0])
    >>> elementwiseProductPath = temp_path + "/elementwise-product"
    >>> ep.save(elementwiseProductPath)
    >>> loadedEp = ElementwiseProduct.load(elementwiseProductPath)
    >>> loadedEp.getScalingVec() == ep.getScalingVec()
    True
    >>> loadedEp.transform(df).take(1) == ep.transform(df).take(1)
    True
    """

    scalingVec = Param(
        Params._dummy(),
        "scalingVec",
        "Vector for hadamard product.",
        typeConverter=TypeConverters.toVector,
    )

    @keyword_only
    def __init__(self, *, scalingVec=None, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, scalingVec=None, inputCol=None, outputCol=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.feature.ElementwiseProduct", self.uid)
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.5.0")
    def setParams(self, *, scalingVec=None, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, scalingVec=None, inputCol=None, outputCol=None)
        Sets params for this ElementwiseProduct.
        """
        return self._set(**self._input_kwargs)

    @since("2.0.0")
    def setScalingVec(self, value):
        """
        Assigns `value` to the :py:attr:`scalingVec` param.
        """
        return self._set(scalingVec=value)

    @since("2.0.0")
    def getScalingVec(self):
        """
        Returns the value of :py:attr:`scalingVec`, falling back to its default.
        """
        return self.getOrDefault(self.scalingVec)

    def setInputCol(self, value):
        """
        Assigns `value` to the :py:attr:`inputCol` param.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

@inherit_doc
class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
                    JavaMLWritable):
    """
    Feature hashing projects a set of categorical or numerical features into a feature vector of
    specified dimension (typically substantially smaller than that of the original feature
    space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
    to map features to indices in the feature vector.

    The FeatureHasher transformer operates on multiple columns. Each column may contain either
    numeric or categorical features. Behavior and handling of column data types is as follows:

    * Numeric columns:
        For numeric features, the hash value of the column name is used to map the
        feature value to its index in the feature vector. By default, numeric features
        are not treated as categorical (even when they are integers). To treat them
        as categorical, specify the relevant columns in `categoricalCols`.

    * String columns:
        For categorical features, the hash value of the string "column_name=value"
        is used to map to the vector index, with an indicator value of `1.0`.
        Thus, categorical features are "one-hot" encoded
        (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`).

    * Boolean columns:
        Boolean values are treated in the same way as string columns. That is,
        boolean features are represented as "column_name=true" or "column_name=false",
        with an indicator value of `1.0`.

    Null (missing) values are ignored (implicitly zero in the resulting feature vector).

    Since a simple modulo is used to transform the hash function to a vector index,
    it is advisable to use a power of two as the `numFeatures` parameter;
    otherwise the features will not be mapped evenly to the vector indices.

    .. versionadded:: 2.3.0

    Examples
    --------
    >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
    >>> cols = ["real", "bool", "stringNum", "string"]
    >>> df = spark.createDataFrame(data, cols)
    >>> hasher = FeatureHasher()
    >>> hasher.setInputCols(cols)
    FeatureHasher...
    >>> hasher.setOutputCol("features")
    FeatureHasher...
    >>> hasher.transform(df).head().features
    SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})
    >>> hasher.setCategoricalCols(["real"]).transform(df).head().features
    SparseVector(262144, {171257: 1.0, 247670: 1.0, 257907: 1.0, 262126: 1.0})
    >>> hasherPath = temp_path + "/hasher"
    >>> hasher.save(hasherPath)
    >>> loadedHasher = FeatureHasher.load(hasherPath)
    >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
    True
    >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
    True
    """

    categoricalCols = Param(
        Params._dummy(),
        "categoricalCols",
        "numeric columns to treat as categorical",
        typeConverter=TypeConverters.toListString,
    )

    @keyword_only
    def __init__(self, *, numFeatures=1 << 18, inputCols=None, outputCol=None,
                 categoricalCols=None):
        """
        __init__(self, \\*, numFeatures=1 << 18, inputCols=None, outputCol=None, \
                 categoricalCols=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
        self._setDefault(numFeatures=1 << 18)
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("2.3.0")
    def setParams(self, *, numFeatures=1 << 18, inputCols=None, outputCol=None,
                  categoricalCols=None):
        """
        setParams(self, \\*, numFeatures=1 << 18, inputCols=None, outputCol=None, \
                  categoricalCols=None)
        Sets params for this FeatureHasher.
        """
        return self._set(**self._input_kwargs)

    @since("2.3.0")
    def setCategoricalCols(self, value):
        """
        Sets the value of :py:attr:`categoricalCols`.
        """
        return self._set(categoricalCols=value)

    @since("2.3.0")
    def getCategoricalCols(self):
        """
        Gets the value of categoricalCols or its default value.
        """
        # The previous docstring named `binary`, which is not a param of this class.
        return self.getOrDefault(self.categoricalCols)

    def setInputCols(self, value):
        """
        Sets the value of :py:attr:`inputCols`.
        """
        return self._set(inputCols=value)

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    def setNumFeatures(self, value):
        """
        Sets the value of :py:attr:`numFeatures`.
        """
        return self._set(numFeatures=value)

@inherit_doc
class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
                JavaMLWritable):
    """
    Maps a sequence of terms to their term frequencies using the hashing trick.
    Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
    to calculate the hash code value for the term object.
    Since a simple modulo is used to transform the hash function to a column index,
    it is advisable to use a power of two as the numFeatures parameter;
    otherwise the features will not be mapped evenly to the columns.

    .. versionadded:: 1.3.0

    Examples
    --------
    >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])
    >>> hashingTF = HashingTF(inputCol="words", outputCol="features")
    >>> hashingTF.setNumFeatures(10)
    HashingTF...
    >>> hashingTF.transform(df).head().features
    SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})
    >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs
    SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})
    >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"}
    >>> hashingTF.transform(df, params).head().vector
    SparseVector(5, {0: 1.0, 2: 1.0, 3: 1.0})
    >>> hashingTFPath = temp_path + "/hashing-tf"
    >>> hashingTF.save(hashingTFPath)
    >>> loadedHashingTF = HashingTF.load(hashingTFPath)
    >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures()
    True
    >>> loadedHashingTF.transform(df).take(1) == hashingTF.transform(df).take(1)
    True
    >>> hashingTF.indexOf("b")
    5
    """

    binary = Param(
        Params._dummy(),
        "binary",
        "If True, all non zero counts are set to 1. "
        "This is useful for discrete probabilistic models that model binary events "
        "rather than integer counts. Default False.",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self, *, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
        self._setDefault(numFeatures=1 << 18, binary=False)
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.3.0")
    def setParams(self, *, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)
        Sets params for this HashingTF.
        """
        return self._set(**self._input_kwargs)

    @since("2.0.0")
    def setBinary(self, value):
        """
        Sets the value of :py:attr:`binary`.
        """
        return self._set(binary=value)

    @since("2.0.0")
    def getBinary(self):
        """
        Gets the value of binary or its default value.
        """
        return self.getOrDefault(self.binary)

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    def setNumFeatures(self, value):
        """
        Sets the value of :py:attr:`numFeatures`.
        """
        return self._set(numFeatures=value)

    @since("3.0.0")
    def indexOf(self, term):
        """
        Returns the index of the input term.
        """
        # Push the Python-side params to the Java object first so the lookup
        # uses the current settings.
        self._transfer_params_to_java()
        return self._java_obj.indexOf(term)

class _IDFParams(HasInputCol, HasOutputCol):
    """
    Params shared by :py:class:`IDF` and :py:class:`IDFModel`.

    .. versionadded:: 3.0.0
    """

    minDocFreq = Param(
        Params._dummy(),
        "minDocFreq",
        "minimum number of documents in which a term should appear for filtering",
        typeConverter=TypeConverters.toInt,
    )

    def __init__(self, *args):
        super().__init__(*args)
        self._setDefault(minDocFreq=0)

    @since("1.4.0")
    def getMinDocFreq(self):
        """
        Returns the value of :py:attr:`minDocFreq`, falling back to its default.
        """
        return self.getOrDefault(self.minDocFreq)

@inherit_doc
class IDF(JavaEstimator, _IDFParams, JavaMLReadable, JavaMLWritable):
    """
    Compute the Inverse Document Frequency (IDF) given a collection of documents.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> from pyspark.ml.linalg import DenseVector
    >>> df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),
    ...     (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
    >>> idf = IDF(minDocFreq=3)
    >>> idf.setInputCol("tf")
    IDF...
    >>> idf.setOutputCol("idf")
    IDF...
    >>> model = idf.fit(df)
    >>> model.setOutputCol("idf")
    IDFModel...
    >>> model.getMinDocFreq()
    3
    >>> model.idf
    DenseVector([0.0, 0.0])
    >>> model.docFreq
    [0, 3]
    >>> model.numDocs == df.count()
    True
    >>> model.transform(df).head().idf
    DenseVector([0.0, 0.0])
    >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs
    DenseVector([0.0, 0.0])
    >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"}
    >>> idf.fit(df, params).transform(df).head().vector
    DenseVector([0.2877, 0.0])
    >>> idfPath = temp_path + "/idf"
    >>> idf.save(idfPath)
    >>> loadedIdf = IDF.load(idfPath)
    >>> loadedIdf.getMinDocFreq() == idf.getMinDocFreq()
    True
    >>> modelPath = temp_path + "/idf-model"
    >>> model.save(modelPath)
    >>> loadedModel = IDFModel.load(modelPath)
    >>> loadedModel.transform(df).head().idf == model.transform(df).head().idf
    True
    """

    @keyword_only
    def __init__(self, *, minDocFreq=0, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, *, minDocFreq=0, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, minDocFreq=0, inputCol=None, outputCol=None)
        Sets params for this IDF.
        """
        return self._set(**self._input_kwargs)

    @since("1.4.0")
    def setMinDocFreq(self, value):
        """
        Sets the value of :py:attr:`minDocFreq`.
        """
        return self._set(minDocFreq=value)

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    def _create_model(self, java_model):
        # Wrap the fitted Java model in its Python counterpart.
        return IDFModel(java_model)

class IDFModel(JavaModel, _IDFParams, JavaMLReadable, JavaMLWritable):
    """
    The model produced by fitting :py:class:`IDF`.

    .. versionadded:: 1.4.0
    """

    @since("3.0.0")
    def setInputCol(self, value):
        """
        Assigns `value` to the :py:attr:`inputCol` param.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

    @property
    @since("2.0.0")
    def idf(self):
        """
        The IDF vector.
        """
        idf_vector = self._call_java("idf")
        return idf_vector

    @property
    @since("3.0.0")
    def docFreq(self):
        """
        The document frequency.
        """
        doc_freq = self._call_java("docFreq")
        return doc_freq

    @property
    @since("3.0.0")
    def numDocs(self):
        """
        The number of documents evaluated to compute idf.
        """
        num_docs = self._call_java("numDocs")
        return num_docs

class _ImputerParams(HasInputCol, HasInputCols, HasOutputCol, HasOutputCols, HasRelativeError):
    """
    Params shared by :py:class:`Imputer` and :py:class:`ImputerModel`.

    .. versionadded:: 3.0.0
    """

    strategy = Param(
        Params._dummy(),
        "strategy",
        "strategy for imputation. If mean, then replace missing values using the mean "
        "value of the feature. If median, then replace missing values using the "
        "median value of the feature. If mode, then replace missing using the most "
        "frequent value of the feature.",
        typeConverter=TypeConverters.toString,
    )

    missingValue = Param(
        Params._dummy(),
        "missingValue",
        "The placeholder for the missing values. All occurrences of missingValue "
        "will be imputed.",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self, *args):
        super().__init__(*args)
        self._setDefault(strategy="mean", missingValue=float("nan"), relativeError=0.001)

    @since("2.2.0")
    def getStrategy(self):
        """
        Returns the value of :py:attr:`strategy`, falling back to its default.
        """
        return self.getOrDefault(self.strategy)

    @since("2.2.0")
    def getMissingValue(self):
        """
        Returns the value of :py:attr:`missingValue`, falling back to its default.
        """
        return self.getOrDefault(self.missingValue)

@inherit_doc
class Imputer(JavaEstimator, _ImputerParams, JavaMLReadable, JavaMLWritable):
    """
    Estimator that fills in missing values with the mean, median or mode of the
    column in which each missing value is located. The input columns should be of
    numeric type. Categorical features are currently not supported, and the
    Imputer possibly creates incorrect values for a categorical feature.

    Note that the mean/median/mode value is computed after filtering out missing values.
    All Null values in the input columns are treated as missing, and so are also imputed. For
    computing median, :py:meth:`pyspark.sql.DataFrame.approxQuantile` is used with a
    relative error of `0.001`.

    .. versionadded:: 2.2.0

    Examples
    --------
    >>> df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")), (float("nan"), 3.0),
    ...                             (4.0, 4.0), (5.0, 5.0)], ["a", "b"])
    >>> imputer = Imputer()
    >>> imputer.setInputCols(["a", "b"])
    Imputer...
    >>> imputer.setOutputCols(["out_a", "out_b"])
    Imputer...
    >>> imputer.getRelativeError()
    0.001
    >>> model = imputer.fit(df)
    >>> model.setInputCols(["a", "b"])
    ImputerModel...
    >>> model.getStrategy()
    'mean'
    >>> model.surrogateDF.show()
    +---+---+
    |  a|  b|
    +---+---+
    |3.0|4.0|
    +---+---+
    ...
    >>> model.transform(df).show()
    +---+---+-----+-----+
    |  a|  b|out_a|out_b|
    +---+---+-----+-----+
    |1.0|NaN|  1.0|  4.0|
    |2.0|NaN|  2.0|  4.0|
    |NaN|3.0|  3.0|  3.0|
    ...
    >>> imputer.setStrategy("median").setMissingValue(1.0).fit(df).transform(df).show()
    +---+---+-----+-----+
    |  a|  b|out_a|out_b|
    +---+---+-----+-----+
    |1.0|NaN|  4.0|  NaN|
    ...
    >>> df1 = spark.createDataFrame([(1.0,), (2.0,), (float("nan"),), (4.0,), (5.0,)], ["a"])
    >>> imputer1 = Imputer(inputCol="a", outputCol="out_a")
    >>> model1 = imputer1.fit(df1)
    >>> model1.surrogateDF.show()
    +---+
    |  a|
    +---+
    |3.0|
    +---+
    ...
    >>> model1.transform(df1).show()
    +---+-----+
    |  a|out_a|
    +---+-----+
    |1.0|  1.0|
    |2.0|  2.0|
    |NaN|  3.0|
    ...
    >>> imputer1.setStrategy("median").setMissingValue(1.0).fit(df1).transform(df1).show()
    +---+-----+
    |  a|out_a|
    +---+-----+
    |1.0|  4.0|
    ...
    >>> df2 = spark.createDataFrame([(float("nan"),), (float("nan"),), (3.0,), (4.0,), (5.0,)],
    ...                             ["b"])
    >>> imputer2 = Imputer(inputCol="b", outputCol="out_b")
    >>> model2 = imputer2.fit(df2)
    >>> model2.surrogateDF.show()
    +---+
    |  b|
    +---+
    |4.0|
    +---+
    ...
    >>> model2.transform(df2).show()
    +---+-----+
    |  b|out_b|
    +---+-----+
    |NaN|  4.0|
    |3.0|  3.0|
    ...
    >>> imputer2.setStrategy("median").setMissingValue(1.0).fit(df2).transform(df2).show()
    +---+-----+
    |  b|out_b|
    +---+-----+
    |NaN|  NaN|
    ...
    >>> imputerPath = temp_path + "/imputer"
    >>> imputer.save(imputerPath)
    >>> loadedImputer = Imputer.load(imputerPath)
    >>> loadedImputer.getStrategy() == imputer.getStrategy()
    True
    >>> loadedImputer.getMissingValue()
    1.0
    >>> modelPath = temp_path + "/imputer-model"
    >>> model.save(modelPath)
    >>> loadedModel = ImputerModel.load(modelPath)
    >>> loadedModel.transform(df).head().out_a == model.transform(df).head().out_a
    True
    """

    @keyword_only
    def __init__(self, *, strategy="mean", missingValue=float("nan"), inputCols=None,
                 outputCols=None, inputCol=None, outputCol=None, relativeError=0.001):
        """
        __init__(self, \\*, strategy="mean", missingValue=float("nan"), inputCols=None, \
                 outputCols=None, inputCol=None, outputCol=None, relativeError=0.001)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Imputer", self.uid)
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("2.2.0")
    def setParams(self, *, strategy="mean", missingValue=float("nan"), inputCols=None,
                  outputCols=None, inputCol=None, outputCol=None, relativeError=0.001):
        """
        setParams(self, \\*, strategy="mean", missingValue=float("nan"), inputCols=None, \
                  outputCols=None, inputCol=None, outputCol=None, relativeError=0.001)
        Sets params for this Imputer.
        """
        return self._set(**self._input_kwargs)

    @since("2.2.0")
    def setStrategy(self, value):
        """
        Assigns `value` to the :py:attr:`strategy` param.
        """
        return self._set(strategy=value)

    @since("2.2.0")
    def setMissingValue(self, value):
        """
        Assigns `value` to the :py:attr:`missingValue` param.
        """
        return self._set(missingValue=value)

    @since("2.2.0")
    def setInputCols(self, value):
        """
        Assigns `value` to the :py:attr:`inputCols` param.
        """
        return self._set(inputCols=value)

    @since("2.2.0")
    def setOutputCols(self, value):
        """
        Assigns `value` to the :py:attr:`outputCols` param.
        """
        return self._set(outputCols=value)

    @since("3.0.0")
    def setInputCol(self, value):
        """
        Assigns `value` to the :py:attr:`inputCol` param.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

    @since("3.0.0")
    def setRelativeError(self, value):
        """
        Assigns `value` to the :py:attr:`relativeError` param.
        """
        return self._set(relativeError=value)

    def _create_model(self, java_model):
        # Wrap the fitted Java model in its Python counterpart.
        return ImputerModel(java_model)

class ImputerModel(JavaModel, _ImputerParams, JavaMLReadable, JavaMLWritable):
    """
    The model produced by fitting :py:class:`Imputer`.

    .. versionadded:: 2.2.0
    """

    @since("3.0.0")
    def setInputCols(self, value):
        """
        Assigns `value` to the :py:attr:`inputCols` param.
        """
        return self._set(inputCols=value)

    @since("3.0.0")
    def setOutputCols(self, value):
        """
        Assigns `value` to the :py:attr:`outputCols` param.
        """
        return self._set(outputCols=value)

    @since("3.0.0")
    def setInputCol(self, value):
        """
        Assigns `value` to the :py:attr:`inputCol` param.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

    @property
    @since("2.2.0")
    def surrogateDF(self):
        """
        A DataFrame holding the inputCols together with their surrogates, i.e.
        the values used to replace the missing values in the input DataFrame.
        """
        surrogates = self._call_java("surrogateDF")
        return surrogates

@inherit_doc
class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """
    Implements the feature interaction transform. The transformer accepts Double and Vector
    type columns and emits a flattened vector of their feature interactions. Nominal features
    are one-hot encoded first; a vector of the feature cross-products is then produced.

    For example, given the input feature values `Double(2)` and `Vector(3, 4)`, the output would be
    `Vector(6, 8)` if all input features were numeric. If the first feature was instead nominal
    with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 0, 0)`.

    .. versionadded:: 3.0.0

    Examples
    --------
    >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
    >>> interaction = Interaction()
    >>> interaction.setInputCols(["a", "b"])
    Interaction...
    >>> interaction.setOutputCol("ab")
    Interaction...
    >>> interaction.transform(df).show()
    +---+---+-----+
    |  a|  b|   ab|
    +---+---+-----+
    |0.0|1.0|[0.0]|
    |2.0|3.0|[6.0]|
    +---+---+-----+
    ...
    >>> interactionPath = temp_path + "/interaction"
    >>> interaction.save(interactionPath)
    >>> loadedInteraction = Interaction.load(interactionPath)
    >>> loadedInteraction.transform(df).head().ab == interaction.transform(df).head().ab
    True
    """

    @keyword_only
    def __init__(self, *, inputCols=None, outputCol=None):
        """
        __init__(self, \\*, inputCols=None, outputCol=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Interaction", self.uid)
        self._setDefault()
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("3.0.0")
    def setParams(self, *, inputCols=None, outputCol=None):
        """
        setParams(self, \\*, inputCols=None, outputCol=None)
        Sets params for this Interaction.
        """
        return self._set(**self._input_kwargs)

    @since("3.0.0")
    def setInputCols(self, value):
        """
        Assigns `value` to the :py:attr:`inputCols` param.
        """
        return self._set(inputCols=value)

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

class _MaxAbsScalerParams(HasInputCol, HasOutputCol):
    """
    Params shared by :py:class:`MaxAbsScaler` and :py:class:`MaxAbsScalerModel`.

    .. versionadded:: 3.0.0
    """

@inherit_doc
class MaxAbsScaler(JavaEstimator, _MaxAbsScalerParams, JavaMLReadable, JavaMLWritable):
    """
    Rescales every feature independently into the range [-1, 1] by dividing it by
    the largest maximum absolute value of that feature. The data is neither shifted
    nor centered, so any sparsity is preserved.

    .. versionadded:: 2.0.0

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
    >>> maScaler = MaxAbsScaler(outputCol="scaled")
    >>> maScaler.setInputCol("a")
    MaxAbsScaler...
    >>> model = maScaler.fit(df)
    >>> model.setOutputCol("scaledOutput")
    MaxAbsScalerModel...
    >>> model.transform(df).show()
    +-----+------------+
    |    a|scaledOutput|
    +-----+------------+
    |[1.0]|       [0.5]|
    |[2.0]|       [1.0]|
    +-----+------------+
    ...
    >>> scalerPath = temp_path + "/max-abs-scaler"
    >>> maScaler.save(scalerPath)
    >>> loadedMAScaler = MaxAbsScaler.load(scalerPath)
    >>> loadedMAScaler.getInputCol() == maScaler.getInputCol()
    True
    >>> loadedMAScaler.getOutputCol() == maScaler.getOutputCol()
    True
    >>> modelPath = temp_path + "/max-abs-scaler-model"
    >>> model.save(modelPath)
    >>> loadedModel = MaxAbsScalerModel.load(modelPath)
    >>> loadedModel.maxAbs == model.maxAbs
    True
    >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)
    True
    """

    @keyword_only
    def __init__(self, *, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, inputCol=None, outputCol=None)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MaxAbsScaler", self.uid)
        self._setDefault()
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("2.0.0")
    def setParams(self, *, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, inputCol=None, outputCol=None)
        Sets params for this MaxAbsScaler.
        """
        return self._set(**self._input_kwargs)

    def setInputCol(self, value):
        """
        Assigns `value` to the :py:attr:`inputCol` param.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

    def _create_model(self, java_model):
        # Wrap the fitted Java model in its Python counterpart.
        return MaxAbsScalerModel(java_model)

class MaxAbsScalerModel(JavaModel, _MaxAbsScalerParams, JavaMLReadable, JavaMLWritable):
    """
    The model produced by fitting :py:class:`MaxAbsScaler`.

    .. versionadded:: 2.0.0
    """

    @since("3.0.0")
    def setInputCol(self, value):
        """
        Assigns `value` to the :py:attr:`inputCol` param.
        """
        return self._set(inputCol=value)

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Assigns `value` to the :py:attr:`outputCol` param.
        """
        return self._set(outputCol=value)

    @property
    @since("2.0.0")
    def maxAbs(self):
        """
        The Max Abs vector.
        """
        max_abs = self._call_java("maxAbs")
        return max_abs

@inherit_doc
class MinHashLSH(_LSH, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, JavaMLWritable):
    """
    LSH class for Jaccard distance.
    Both dense and sparse vectors are accepted as input, but sparse input is more efficient.
    For example, `Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])` means there are 10 elements
    in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at
    least 1 non-zero index, and all non-zero values are treated as binary "1" values.

    .. versionadded:: 2.2.0

    Notes
    -----
    See `Wikipedia on MinHash <https://en.wikipedia.org/wiki/MinHash>`_

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.sql.functions import col
    >>> data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
    ...         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
    ...         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
    >>> df = spark.createDataFrame(data, ["id", "features"])
    >>> mh = MinHashLSH()
    >>> mh.setInputCol("features")
    MinHashLSH...
    >>> mh.setOutputCol("hashes")
    MinHashLSH...
    >>> mh.setSeed(12345)
    MinHashLSH...
    >>> model = mh.fit(df)
    >>> model.setInputCol("features")
    MinHashLSHModel...
    >>> model.transform(df).head()
    Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668...
    >>> data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
    ...          (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
    ...          (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
    >>> df2 = spark.createDataFrame(data2, ["id", "features"])
    >>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0])
    >>> model.approxNearestNeighbors(df2, key, 1).collect()
    [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668...
    >>> model.approxSimilarityJoin(df, df2, 0.6, distCol="JaccardDistance").select(
    ...     col("datasetA.id").alias("idA"),
    ...     col("datasetB.id").alias("idB"),
    ...     col("JaccardDistance")).show()
    +---+---+---------------+
    |idA|idB|JaccardDistance|
    +---+---+---------------+
    |  0|  5|            0.5|
    |  1|  4|            0.5|
    +---+---+---------------+
    ...
    >>> mhPath = temp_path + "/mh"
    >>> mh.save(mhPath)
    >>> mh2 = MinHashLSH.load(mhPath)
    >>> mh2.getOutputCol() == mh.getOutputCol()
    True
    >>> modelPath = temp_path + "/mh-model"
    >>> model.save(modelPath)
    >>> model2 = MinHashLSHModel.load(modelPath)
    >>> model.transform(df).head().hashes == model2.transform(df).head().hashes
    True
    """

    @keyword_only
    def __init__(self, *, inputCol=None, outputCol=None, seed=None, numHashTables=1):
        """
        __init__(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)
        """
        super().__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinHashLSH", self.uid)
        # Only the keyword arguments the caller actually supplied are forwarded.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("2.2.0")
    def setParams(self, *, inputCol=None, outputCol=None, seed=None, numHashTables=1):
        """
        setParams(self, \\*, inputCol=None, outputCol=None, seed=None, numHashTables=1)
        Sets params for this MinHashLSH.
        """
        return self._set(**self._input_kwargs)

    def setSeed(self, value):
        """
        Assigns `value` to the :py:attr:`seed` param.
        """
        return self._set(seed=value)

    def _create_model(self, java_model):
        # Wrap the fitted Java model in its Python counterpart.
        return MinHashLSHModel(java_model)

class MinHashLSHModel(_LSHModel, JavaMLReadable, JavaMLWritable):
    r"""
    Model produced by :py:class:`MinHashLSH`, in which multiple hash functions are stored. Each
    hash function is picked from the following family of hash functions, where :math:`a_i` and
    :math:`b_i` are randomly chosen integers less than prime:
    :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
    independent according to the reference.

    .. versionadded:: 2.2.0

    Notes
    -----
    See Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear permutations."
    Electronic Journal of Combinatorics 7 (2000): R26.
    """

class _MinMaxScalerParams(HasInputCol, HasOutputCol):
    """
    Params shared by :py:class:`MinMaxScaler` and :py:class:`MinMaxScalerModel`.

    .. versionadded:: 3.0.0
    """

    min = Param(
        Params._dummy(),
        "min",
        "Lower bound of the output feature range",
        typeConverter=TypeConverters.toFloat,
    )
    max = Param(
        Params._dummy(),
        "max",
        "Upper bound of the output feature range",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self, *args):
        super().__init__(*args)
        self._setDefault(min=0.0, max=1.0)

    @since("1.6.0")
    def getMin(self):
        """
        Returns the value of :py:attr:`min`, falling back to its default.
        """
        return self.getOrDefault(self.min)

    @since("1.6.0")
    def getMax(self):
        """
        Returns the value of :py:attr:`max`, falling back to its default.
        """
        return self.getOrDefault(self.max)

@inherit_doc
class MinMaxScaler(JavaEstimator, _MinMaxScalerParams, JavaMLReadable, JavaMLWritable):
    """
    Rescale each feature individually to a common range [min, max] linearly using column summary
    statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
    feature E is calculated as,

    Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min

    For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)

    .. versionadded:: 1.6.0

    Notes
    -----
    Since zero values will probably be transformed to non-zero values, output of the
    transformer will be DenseVector even for sparse input.

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
    >>> mmScaler = MinMaxScaler(outputCol="scaled")
    >>> mmScaler.setInputCol("a")
    MinMaxScaler...
    >>> model = mmScaler.fit(df)
    >>> model.setOutputCol("scaledOutput")
    MinMaxScalerModel...
    >>> model.originalMin
    DenseVector([0.0])
    >>> model.originalMax
    DenseVector([2.0])
    >>> model.transform(df).show()
    +-----+------------+
    |    a|scaledOutput|
    +-----+------------+
    |[0.0]|       [0.0]|
    |[2.0]|       [1.0]|
    +-----+------------+
    ...
    >>> minMaxScalerPath = temp_path + "/min-max-scaler"
    >>> mmScaler.save(minMaxScalerPath)
    >>> loadedMMScaler = MinMaxScaler.load(minMaxScalerPath)
    >>> loadedMMScaler.getMin() == mmScaler.getMin()
    True
    >>> loadedMMScaler.getMax() == mmScaler.getMax()
    True
    >>> modelPath = temp_path + "/min-max-scaler-model"
    >>> model.save(modelPath)
    >>> loadedModel = MinMaxScalerModel.load(modelPath)
    >>> loadedModel.originalMin == model.originalMin
    True
    >>> loadedModel.originalMax == model.originalMax
    True
    >>> loadedModel.transform(df).take(1) == model.transform(df).take(1)
    True
    """

    @keyword_only
    def __init__(self, *, min=0.0, max=1.0, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)
        """
        super().__init__()
        java_class = "org.apache.spark.ml.feature.MinMaxScaler"
        self._java_obj = self._new_java_obj(java_class, self.uid)
        # Hand the keyword arguments stored on ``_input_kwargs`` over to setParams.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.6.0")
    def setParams(self, *, min=0.0, max=1.0, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, min=0.0, max=1.0, inputCol=None, outputCol=None)
        Sets params for this MinMaxScaler.
        """
        return self._set(**self._input_kwargs)

    @since("1.6.0")
    def setMin(self, value):
        """
        Sets the value of :py:attr:`min`.
        """
        updated = self._set(min=value)
        return updated

    @since("1.6.0")
    def setMax(self, value):
        """
        Sets the value of :py:attr:`max`.
        """
        updated = self._set(max=value)
        return updated

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        updated = self._set(inputCol=value)
        return updated

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        updated = self._set(outputCol=value)
        return updated

    def _create_model(self, java_model):
        """
        Wraps ``java_model`` in a :py:class:`MinMaxScalerModel` and returns it.
        """
        model = MinMaxScalerModel(java_model)
        return model

class MinMaxScalerModel(JavaModel, _MinMaxScalerParams, JavaMLReadable, JavaMLWritable):
    """
    Model fitted by :py:class:`MinMaxScaler`.

    .. versionadded:: 1.6.0
    """

    @since("3.0.0")
    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        updated = self._set(inputCol=value)
        return updated

    @since("3.0.0")
    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        updated = self._set(outputCol=value)
        return updated

    @since("3.0.0")
    def setMin(self, value):
        """
        Sets the value of :py:attr:`min`.
        """
        updated = self._set(min=value)
        return updated

    @since("3.0.0")
    def setMax(self, value):
        """
        Sets the value of :py:attr:`max`.
        """
        updated = self._set(max=value)
        return updated

    @property
    @since("2.0.0")
    def originalMin(self):
        """
        Min value for each original column during fitting.
        """
        java_method = "originalMin"
        return self._call_java(java_method)

    @property
    @since("2.0.0")
    def originalMax(self):
        """
        Max value for each original column during fitting.
        """
        java_method = "originalMax"
        return self._call_java(java_method)

@inherit_doc
class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """
    A feature transformer that converts the input array of strings into an array of n-grams. Null
    values in the input array are ignored.
    It returns an array of n-grams where each n-gram is represented by a space-separated string of
    words.
    When the input is empty, an empty array is returned.
    When the input array length is less than n (number of elements per n-gram), no n-grams are
    returned.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])
    >>> ngram = NGram(n=2)
    >>> ngram.setInputCol("inputTokens")
    NGram...
    >>> ngram.setOutputCol("nGrams")
    NGram...
    >>> ngram.transform(df).head()
    Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b', 'b c', 'c d', 'd e'])
    >>> # Change n-gram length
    >>> ngram.setParams(n=4).transform(df).head()
    Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])
    >>> # Temporarily modify output column.
    >>> ngram.transform(df, {ngram.outputCol: "output"}).head()
    Row(inputTokens=['a', 'b', 'c', 'd', 'e'], output=['a b c d', 'b c d e'])
    >>> ngram.transform(df).head()
    Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])
    >>> # Must use keyword arguments to specify params.
    >>> ngram.setParams("text")
    Traceback (most recent call last):
        ...
    TypeError: Method setParams forces keyword arguments.
    >>> ngramPath = temp_path + "/ngram"
    >>> ngram.save(ngramPath)
    >>> loadedNGram = NGram.load(ngramPath)
    >>> loadedNGram.getN() == ngram.getN()
    True
    >>> loadedNGram.transform(df).take(1) == ngram.transform(df).take(1)
    True
    """

    n = Param(
        Params._dummy(),
        "n",
        "number of elements per n-gram (>=1)",
        typeConverter=TypeConverters.toInt,
    )

    @keyword_only
    def __init__(self, *, n=2, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, n=2, inputCol=None, outputCol=None)
        """
        super().__init__()
        java_class = "org.apache.spark.ml.feature.NGram"
        self._java_obj = self._new_java_obj(java_class, self.uid)
        self._setDefault(n=2)
        # Hand the keyword arguments stored on ``_input_kwargs`` over to setParams.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.5.0")
    def setParams(self, *, n=2, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, n=2, inputCol=None, outputCol=None)
        Sets params for this NGram.
        """
        return self._set(**self._input_kwargs)

    @since("1.5.0")
    def setN(self, value):
        """
        Sets the value of :py:attr:`n`.
        """
        updated = self._set(n=value)
        return updated

    @since("1.5.0")
    def getN(self):
        """
        Gets the value of n or its default value.
        """
        gram_size = self.getOrDefault(self.n)
        return gram_size

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        updated = self._set(inputCol=value)
        return updated

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        updated = self._set(outputCol=value)
        return updated

@inherit_doc
class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """
    Normalize a vector to have unit norm using the given p-norm.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
    >>> df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])
    >>> normalizer = Normalizer(p=2.0)
    >>> normalizer.setInputCol("dense")
    Normalizer...
    >>> normalizer.setOutputCol("features")
    Normalizer...
    >>> normalizer.transform(df).head().features
    DenseVector([0.6, -0.8])
    >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs
    SparseVector(4, {1: 0.8, 3: 0.6})
    >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"}
    >>> normalizer.transform(df, params).head().vector
    DenseVector([0.4286, -0.5714])
    >>> normalizerPath = temp_path + "/normalizer"
    >>> normalizer.save(normalizerPath)
    >>> loadedNormalizer = Normalizer.load(normalizerPath)
    >>> loadedNormalizer.getP() == normalizer.getP()
    True
    >>> loadedNormalizer.transform(df).take(1) == normalizer.transform(df).take(1)
    True
    """

    p = Param(
        Params._dummy(),
        "p",
        "the p norm value.",
        typeConverter=TypeConverters.toFloat,
    )

    @keyword_only
    def __init__(self, *, p=2.0, inputCol=None, outputCol=None):
        """
        __init__(self, \\*, p=2.0, inputCol=None, outputCol=None)
        """
        super().__init__()
        java_class = "org.apache.spark.ml.feature.Normalizer"
        self._java_obj = self._new_java_obj(java_class, self.uid)
        self._setDefault(p=2.0)
        # Hand the keyword arguments stored on ``_input_kwargs`` over to setParams.
        self.setParams(**self._input_kwargs)

    @keyword_only
    @since("1.4.0")
    def setParams(self, *, p=2.0, inputCol=None, outputCol=None):
        """
        setParams(self, \\*, p=2.0, inputCol=None, outputCol=None)
        Sets params for this Normalizer.
        """
        return self._set(**self._input_kwargs)

    @since("1.4.0")
    def setP(self, value):
        """
        Sets the value of :py:attr:`p`.
        """
        updated = self._set(p=value)
        return updated

    @since("1.4.0")
    def getP(self):
        """
        Gets the value of p or its default value.
        """
        norm_order = self.getOrDefault(self.p)
        return norm_order

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        updated = self._set(inputCol=value)
        return updated

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        updated = self._set(outputCol=value)
        return updated

class _OneHotEncoderParams(
    HasInputCol, HasInputCols, HasOutputCol, HasOutputCols, HasHandleInvalid
):
    """
    Params shared by :py:class:`OneHotEncoder` and :py:class:`OneHotEncoderModel`.

    .. versionadded:: 3.0.0
    """

    # Adjacent string literals are joined by the parser; the resulting description
    # is the same text as an explicit ``+`` concatenation of these pieces.
    handleInvalid = Param(
        Params._dummy(),
        "handleInvalid",
        "How to handle invalid data during "
        "transform(). Options are 'keep' (invalid data presented as an extra "
        "categorical feature) or error (throw an error). Note that this Param "
        "is only used during transform; during fitting, invalid data will "
        "result in an error.",
        typeConverter=TypeConverters.toString,
    )
    dropLast = Param(
        Params._dummy(),
        "dropLast",
        "whether to drop the last category",
        typeConverter=TypeConverters.toBoolean,
    )

    def __init__(self, *args):
        super().__init__(*args)
        # Defaults: raise on invalid data and drop the last category.
        self._setDefault(handleInvalid="error", dropLast=True)

    @since("2.3.0")
    def getDropLast(self):
        """
        Gets the value of dropLast or its default value.
        """
        drop_last = self.getOrDefault(self.dropLast)
        return drop_last

@inherit_doc

class OneHotEncoder(JavaEstimator, _OneHotEncoderParams, JavaMLReadable, JavaMLWritable):

"""

A one-hot encoder that maps a column of category indices to a column of binary vectors, with

at most a single one-value per row that indicates the input category index.

For example with 5 categories, an input value of 2.0 would map to an output vector of

`[0.0, 0.0, 1.0, 0.0]`.

The last category is not included by default (configurable via :py:attr:`dropLast`),

because it makes the vector entries sum up to one, and hence linearly dependent.

So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.

When :py:attr:`handleInvalid` is configured to 'keep', an extra "category" indicating invalid

values is added as last category. So when :py:attr:`dropLast` is true, invalid values are

encoded as all-zeros vector.

.. versionadded:: 2.3.0

Notes

-----

This is different from scikit-learn's OneHotEncoder, which keeps all categories.

The output vectors are sparse.

When encoding multi-column by using :py:attr:`inputCols` and

:py:attr:`outputCols` params, input/output cols come in pairs, specified by the order in

the arrays, and each pair is treated independently.

Coverage for pyspark/ml/feature.py : 90%

1422 statements 1279 run 143 missing 0 excluded 6 partial