Coverage for pyspark/ml/stat.py: 88%

364 ↛ 365line 364 didn't jump to line 365, because the condition on line 364 was never true if not isinstance(featuresCol, Column) or not isinstance(weightCol, Column):

raise TypeError("featureCol and weightCol should be a Column")

return featuresCol, weightCol

@staticmethod
def _get_single_metric(col, weightCol, metric):
    """
    Build the column for a single named ``Summarizer`` metric.

    Parameters
    ----------
    col : :py:class:`pyspark.sql.Column`
        column passed through ``Summarizer._check_param`` before use.
    weightCol : :py:class:`pyspark.sql.Column`
        weight column, checked together with ``col``.
    metric : str
        name appended to ``"org.apache.spark.ml.stat.Summarizer."`` to form the name given to
        ``JavaWrapper._new_java_obj``.

    Returns
    -------
    :py:class:`pyspark.sql.Column`
        column wrapping whatever ``JavaWrapper._new_java_obj`` returns.
    """
    # Both arguments go through the shared check first; only its results are used below.
    features, weights = Summarizer._check_param(col, weightCol)
    java_name = "org.apache.spark.ml.stat.Summarizer." + metric
    java_result = JavaWrapper._new_java_obj(java_name, features._jc, weights._jc)
    return Column(java_result)

@staticmethod
def metrics(*metrics):
    """
    Given a list of metrics, provides a builder that in turn computes those metrics from a
    column.

    See the documentation of :py:class:`Summarizer` for an example.

    The following metrics are accepted (case sensitive):
     - mean: a vector that contains the coefficient-wise mean.
     - sum: a vector that contains the coefficient-wise sum.
     - variance: a vector that contains the coefficient-wise variance.
     - std: a vector that contains the coefficient-wise standard deviation.
     - count: the count of all vectors seen.
     - numNonzeros: a vector with the number of non-zeros for each coefficient.
     - max: the maximum for each coefficient.
     - min: the minimum for each coefficient.
     - normL2: the Euclidean norm for each coefficient.
     - normL1: the L1 norm of each coefficient (sum of the absolute values).

    .. versionadded:: 2.4.0

    Parameters
    ----------
    metrics : str
        metrics that can be provided.

    Returns
    -------
    :py:class:`pyspark.ml.stat.SummaryBuilder`

    Notes
    -----
    Currently, the performance of this interface is about 2x~3x slower than using the RDD
    interface.
    """
    # The metric names are converted with ``_to_seq`` using the currently active context.
    active_context = SparkContext._active_spark_context
    metric_seq = _to_seq(active_context, metrics)
    java_builder = JavaWrapper._new_java_obj(
        "org.apache.spark.ml.stat.Summarizer.metrics", metric_seq)
    return SummaryBuilder(java_builder)

class SummaryBuilder(JavaWrapper):
    """
    A builder object that provides summary statistics about a given column.

    Users should not create such builders directly; obtain one through the methods of
    :py:class:`pyspark.ml.stat.Summarizer` instead.

    .. versionadded:: 2.4.0

    """

    def __init__(self, jSummaryBuilder):
        # Pure delegation: the Java-side builder is handed to the parent class initializer.
        super(SummaryBuilder, self).__init__(jSummaryBuilder)

    def summary(self, featuresCol, weightCol=None):
        """
        Returns an aggregate object that contains the summary of the column with the requested
        metrics.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        featuresCol : :py:class:`pyspark.sql.Column`
            a column that contains features Vector object.
        weightCol : :py:class:`pyspark.sql.Column`, optional
            a column that contains weight value. Default weight is 1.0.

        Returns
        -------
        :py:class:`pyspark.sql.Column`
            an aggregate column that contains the statistics. The exact content of this
            structure is determined during the creation of the builder.

        Raises
        ------
        TypeError
            raised by ``Summarizer._check_param`` when the columns it ends up with are not
            :py:class:`pyspark.sql.Column` instances.
        """
        features, weights = Summarizer._check_param(featuresCol, weightCol)
        # The wrapped Java builder does the work; its result is wrapped back into a Column.
        java_column = self._java_obj.summary(features._jc, weights._jc)
        return Column(java_column)

class MultivariateGaussian(object):
    """Represents a (mean, cov) tuple

    Both constructor arguments are stored unchanged as the ``mean`` and ``cov`` attributes;
    no validation or copying is performed.

    .. versionadded:: 3.0.0

    Examples
    --------
    >>> from pyspark.ml.linalg import DenseMatrix, Vectors
    >>> m = MultivariateGaussian(Vectors.dense([11,12]), DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0)))
    >>> (m.mean, m.cov.toArray())
    (DenseVector([11.0, 12.0]), array([[ 1.,  5.],
           [ 3.,  2.]]))
    """

    def __init__(self, mean, cov):
        # Keep references to exactly the objects the caller supplied.
        self.mean, self.cov = mean, cov

if __name__ == "__main__":
    # Script entry point: run this module's doctests against a local Spark session and
    # exit with a non-zero status if any of them fail.
    import doctest
    import numpy
    import pyspark.ml.stat
    from pyspark.sql import SparkSession

    try:
        # Numpy 1.14+ changed its string format.
        numpy.set_printoptions(legacy="1.13")
    except TypeError:
        # presumably a NumPy that does not accept ``legacy``; the default format is kept
        pass

    globs = pyspark.ml.stat.__dict__.copy()

    # NOTE(review): the previous comment here referred to a "small batch size", but no batch
    # size is configured in this block -- only the "local[2]" master and the app name.
    spark = (
        SparkSession.builder
        .master("local[2]")
        .appName("ml.stat tests")
        .getOrCreate()
    )
    sc = spark.sparkContext

    # Make the session and its context available to the doctests under these names.
    globs.update(sc=sc, spark=spark)

    failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)

    # The session is stopped before the exit status is decided.
    spark.stop()
    if failure_count:
        sys.exit(-1)

Coverage for pyspark/ml/stat.py: 88%

109 statements 96 run 13 missing 0 excluded 2 partial