Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #
# This file is ported from spark/util/StatCounter.scala
if values is None: values = list() self.n = 0 # Running count of our values self.mu = 0.0 # Running mean of our values self.m2 = 0.0 # Running variance numerator (sum of (x - mean)^2) self.maxValue = float("-inf") self.minValue = float("inf")
for v in values: self.merge(v)
# Add a value into this StatCounter, updating the internal statistics. delta = value - self.mu self.n += 1 self.mu += delta / self.n self.m2 += delta * (value - self.mu) self.maxValue = maximum(self.maxValue, value) self.minValue = minimum(self.minValue, value)
return self
# Merge another StatCounter into this one, adding up the internal statistics. raise TypeError("Can only merge StatCounter but got %s" % type(other))
self.merge(copy.deepcopy(other)) # Avoid overwriting fields in a weird order else:
self.mu = self.mu + (delta * other.n) / (self.n + other.n) self.mu = other.mu - (delta * self.n) / (self.n + other.n) else:
# Clone this StatCounter return copy.deepcopy(self)
# Return the variance of the values. return float('nan') else:
# # Return the sample variance, which corrects for bias in estimating the variance by dividing # by N-1 instead of N. # return float('nan') else:
# Return the standard deviation of the values.
# # Return the sample standard deviation of the values, which corrects for bias in estimating the # variance by dividing by N-1 instead of N. #
"""Returns the :class:`StatCounter` members as a ``dict``.
Examples -------- >>> sc.parallelize([1., 2., 3., 4.]).stats().asDict() {'count': 4L, 'max': 4.0, 'mean': 2.5, 'min': 1.0, 'stdev': 1.2909944487358056, 'sum': 10.0, 'variance': 1.6666666666666667} """ 'count': self.count(), 'mean': self.mean(), 'sum': self.sum(), 'min': self.min(), 'max': self.max(), 'stdev': self.stdev() if sample else self.sampleStdev(), 'variance': self.variance() if sample else self.sampleVariance() }
return ("(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" % (self.count(), self.mean(), self.stdev(), self.max(), self.min())) |