Coverage for pyspark/rdd.py: 94%

1336 ↛ 1337line 1336 didn't jump to line 1337, because the condition on line 1336 was never true if any(i is None or isinstance(i, float) and isnan(i) for i in buckets):

raise ValueError("can not have None or NaN in buckets")

1339 ↛ 1340line 1339 didn't jump to line 1340, because the condition on line 1339 was never true if sorted(buckets) != list(buckets):

raise ValueError("buckets should be sorted")

1342 ↛ 1343line 1342 didn't jump to line 1343, because the condition on line 1342 was never true if len(set(buckets)) != len(buckets):

raise ValueError("buckets should not contain duplicated values")

minv = buckets[0]

maxv = buckets[-1]

even = False

inc = None

try:

steps = [buckets[i + 1] - buckets[i] for i in range(len(buckets) - 1)]

except TypeError:

pass # objects in buckets do not support '-'

else:

if max(steps) - min(steps) < 1e-10: # handle precision errors

even = True

inc = (maxv - minv) / (len(buckets) - 1)

else:

raise TypeError("buckets should be a list or tuple or number(int or long)")

def histogram(iterator):

counters = [0] * len(buckets)

for i in iterator:

if i is None or (type(i) is float and isnan(i)) or i > maxv or i < minv:

continue

t = (int((i - minv) / inc) if even

else bisect.bisect_right(buckets, i) - 1)

counters[t] += 1

# add last two together

last = counters.pop()

counters[-1] += last

return [counters]

def mergeCounters(a, b):

return [i + j for i, j in zip(a, b)]

return buckets, self.mapPartitions(histogram).reduce(mergeCounters)

def mean(self):
    """
    Return the arithmetic mean of the elements of this RDD.

    The value is read from the summary object returned by ``self.stats()``.

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).mean()
    2.0
    """
    summary = self.stats()
    return summary.mean()

def variance(self):
    """
    Return the variance of the elements of this RDD.

    The value is read from the summary object returned by ``self.stats()``.

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).variance()
    0.666...
    """
    summary = self.stats()
    return summary.variance()

def stdev(self):
    """
    Return the standard deviation of the elements of this RDD.

    The value is read from the summary object returned by ``self.stats()``.

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).stdev()
    0.816...
    """
    summary = self.stats()
    return summary.stdev()

def sampleStdev(self):
    """
    Return the sample standard deviation of the elements of this RDD.

    The sample estimate divides by N-1 rather than N, which corrects for
    bias in estimating the standard deviation.

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).sampleStdev()
    1.0
    """
    summary = self.stats()
    return summary.sampleStdev()

def sampleVariance(self):
    """
    Return the sample variance of the elements of this RDD.

    The sample estimate divides by N-1 rather than N, which corrects for
    bias in estimating the variance.

    Examples
    --------
    >>> sc.parallelize([1, 2, 3]).sampleVariance()
    1.0
    """
    summary = self.stats()
    return summary.sampleVariance()

def countByValue(self):
    """
    Count how often each distinct value occurs in this RDD.

    Returns a dictionary mapping each value to its number of occurrences.

    Examples
    --------
    >>> sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())
    [(1, 2), (2, 3)]
    """
    def tally(elements):
        # One counting dict per partition, emitted as a single item.
        seen = defaultdict(int)
        for element in elements:
            seen[element] = seen[element] + 1
        yield seen

    def absorb(total, extra):
        # Fold `extra` into `total` in place and hand `total` back.
        for value in extra:
            total[value] += extra[value]
        return total

    per_partition = self.mapPartitions(tally)
    return per_partition.reduce(absorb)

def top(self, num, key=None):
    """
    Return the `num` largest elements of this RDD, in descending order.

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
    [12]
    >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
    [6, 5]
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
    [4, 3, 2]
    """
    def largest_in_partition(elements):
        # Each partition contributes a single list of its own best `num`.
        yield heapq.nlargest(num, elements, key=key)

    def combine(left, right):
        candidates = left + right
        return heapq.nlargest(num, candidates, key=key)

    per_partition = self.mapPartitions(largest_in_partition)
    return per_partition.reduce(combine)

def takeOrdered(self, num, key=None):
    """
    Return the first `num` elements of this RDD in ascending order, or in
    the order induced by the optional `key` function.

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
    [1, 2, 3, 4, 5, 6]
    >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)
    [10, 9, 7, 6, 5, 4]
    """
    def smallest_in_partition(elements):
        # Wrap in a list so the partition yields exactly one item.
        return [heapq.nsmallest(num, elements, key)]

    def combine(left, right):
        return heapq.nsmallest(num, left + right, key)

    per_partition = self.mapPartitions(smallest_in_partition)
    return per_partition.reduce(combine)

def take(self, num):
    """
    Return the first `num` elements of the RDD as a list.

    One partition is scanned first; the number of elements found so far
    is then used to estimate how many further partitions must be scanned
    to reach `num`.

    Translated from the Scala implementation in RDD#take().

    Notes
    -----
    This method should only be used if the resulting array is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
    [2, 3]
    >>> sc.parallelize([2, 3, 4, 5, 6]).take(10)
    [2, 3, 4, 5, 6]
    >>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3)
    [91, 92, 93]
    """
    collected = []
    partition_count = self.getNumPartitions()
    scanned = 0

    while len(collected) < num and scanned < partition_count:
        # How many partitions to try this round. The value may run past
        # partition_count; the range built below is capped.
        if scanned == 0:
            batch = 1
        elif len(collected) == 0:
            # Nothing found so far: quadruple the search width.
            batch = scanned * 4
        else:
            # Interpolate from the hit rate so far, overestimating by 50%,
            # then clamp to [1, 4 * scanned].
            estimate = int(1.5 * num * scanned / len(collected)) - scanned
            batch = min(max(estimate, 1), scanned * 4)

        still_needed = num - len(collected)

        def take_prefix(partition):
            # Yield at most `still_needed` elements of one partition.
            source = iter(partition)
            for _ in range(still_needed):
                try:
                    yield next(source)
                except StopIteration:
                    return

        upper = min(scanned + batch, partition_count)
        found = self.context.runJob(self, take_prefix, range(scanned, upper))

        collected += found
        scanned += batch

    return collected[:num]

def first(self):
    """
    Return the first element of this RDD.

    Raises
    ------
    ValueError
        If the RDD has no elements.

    Examples
    --------
    >>> sc.parallelize([2, 3, 4]).first()
    2
    >>> sc.parallelize([]).first()
    Traceback (most recent call last):
        ...
    ValueError: RDD is empty
    """
    head = self.take(1)
    if not head:
        raise ValueError("RDD is empty")
    return head[0]

def isEmpty(self):
    """
    Return True if and only if the RDD contains no elements at all.

    Notes
    -----
    An RDD may be empty even when it has at least 1 partition.

    Examples
    --------
    >>> sc.parallelize([]).isEmpty()
    True
    >>> sc.parallelize([1]).isEmpty()
    False
    """
    # With zero partitions there is nothing to scan, so skip the take().
    if self.getNumPartitions() == 0:
        return True
    return len(self.take(1)) == 0

def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the new Hadoop OutputFormat API (mapreduce
    package). Keys and values are converted for output with the user
    specified converters or, by default,
    "org.apache.spark.api.python.JavaToWritableConverter".

    Parameters
    ----------
    conf : dict
        Hadoop job configuration
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    python_rdd = self.ctx._jvm.PythonRDD
    # NOTE(review): the trailing True presumably selects the new API; the
    # old-API saveAsHadoopDataset passes False in the same position.
    python_rdd.saveAsHadoopDataset(
        pickled._jrdd, True, java_conf, keyConverter, valueConverter, True)

def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
                           keyConverter=None, valueConverter=None, conf=None):
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the new Hadoop OutputFormat API (mapreduce
    package). Key and value types will be inferred if not specified. Keys
    and values are converted for output with the user specified converters
    or "org.apache.spark.api.python.JavaToWritableConverter". The `conf` is
    applied on top of the base Hadoop conf associated with the SparkContext
    of this RDD to create a merged Hadoop MapReduce job configuration for
    saving the data.

    Parameters
    ----------
    path : str
        path to Hadoop file
    outputFormatClass : str
        fully qualified classname of Hadoop OutputFormat
        (e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
    keyClass : str, optional
        fully qualified classname of key Writable class
        (e.g. "org.apache.hadoop.io.IntWritable", None by default)
    valueClass : str, optional
        fully qualified classname of value Writable class
        (e.g. "org.apache.hadoop.io.Text", None by default)
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)
    conf : dict, optional
        Hadoop job configuration (None by default)
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    python_rdd = self.ctx._jvm.PythonRDD
    python_rdd.saveAsNewAPIHadoopFile(
        pickled._jrdd, True, path,
        outputFormatClass,
        keyClass, valueClass,
        keyConverter, valueConverter,
        java_conf)

def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the old Hadoop OutputFormat API (mapred
    package). Keys and values are converted for output with the user
    specified converters or, by default,
    "org.apache.spark.api.python.JavaToWritableConverter".

    Parameters
    ----------
    conf : dict
        Hadoop job configuration
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    python_rdd = self.ctx._jvm.PythonRDD
    # NOTE(review): the trailing False presumably selects the old API; the
    # new-API variant passes True in the same position.
    python_rdd.saveAsHadoopDataset(
        pickled._jrdd, True, java_conf, keyConverter, valueConverter, False)

def saveAsHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,
                     keyConverter=None, valueConverter=None, conf=None,
                     compressionCodecClass=None):
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system through the old Hadoop OutputFormat API (mapred
    package). Key and value types will be inferred if not specified. Keys
    and values are converted for output with the user specified converters
    or "org.apache.spark.api.python.JavaToWritableConverter". The `conf` is
    applied on top of the base Hadoop conf associated with the SparkContext
    of this RDD to create a merged Hadoop MapReduce job configuration for
    saving the data.

    Parameters
    ----------
    path : str
        path to Hadoop file
    outputFormatClass : str
        fully qualified classname of Hadoop OutputFormat
        (e.g. "org.apache.hadoop.mapred.SequenceFileOutputFormat")
    keyClass : str, optional
        fully qualified classname of key Writable class
        (e.g. "org.apache.hadoop.io.IntWritable", None by default)
    valueClass : str, optional
        fully qualified classname of value Writable class
        (e.g. "org.apache.hadoop.io.Text", None by default)
    keyConverter : str, optional
        fully qualified classname of key converter (None by default)
    valueConverter : str, optional
        fully qualified classname of value converter (None by default)
    conf : dict, optional
        Hadoop job configuration (None by default)
    compressionCodecClass : str, optional
        fully qualified classname of the compression codec class
        i.e. "org.apache.hadoop.io.compress.GzipCodec" (None by default)
    """
    java_conf = self.ctx._dictToJavaMap(conf)
    pickled = self._pickled()
    python_rdd = self.ctx._jvm.PythonRDD
    python_rdd.saveAsHadoopFile(
        pickled._jrdd, True, path,
        outputFormatClass,
        keyClass, valueClass,
        keyConverter, valueConverter,
        java_conf, compressionCodecClass)

def saveAsSequenceFile(self, path, compressionCodecClass=None):
    """
    Write a Python RDD of key-value pairs (of form ``RDD[(K, V)]``) to any
    Hadoop file system, using the "org.apache.hadoop.io.Writable" types
    that we convert from the RDD's key and value types. The mechanism is
    as follows:

        1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects.
        2. Keys and values of this Java RDD are converted to Writables and written out.

    Parameters
    ----------
    path : str
        path to sequence file
    compressionCodecClass : str, optional
        fully qualified classname of the compression codec class
        i.e. "org.apache.hadoop.io.compress.GzipCodec" (None by default)
    """
    pickled = self._pickled()
    python_rdd = self.ctx._jvm.PythonRDD
    python_rdd.saveAsSequenceFile(pickled._jrdd, True, path, compressionCodecClass)

def saveAsPickleFile(self, path, batchSize=10):
    """
    Save this RDD as a SequenceFile of serialized objects. The serializer
    used is :class:`pyspark.serializers.PickleSerializer`, default batch size
    is 10.

    Parameters
    ----------
    path : str
        path to the output file
    batchSize : int, optional
        number of objects per serialized batch (10 by default); a value
        of 0 wraps the pickle serializer in ``AutoBatchedSerializer``
        instead of a fixed-size ``BatchedSerializer``

    Examples
    --------
    >>> from tempfile import NamedTemporaryFile
    >>> tmpFile = NamedTemporaryFile(delete=True)
    >>> tmpFile.close()
    >>> sc.parallelize([1, 2, 'spark', 'rdd']).saveAsPickleFile(tmpFile.name, 3)
    >>> sorted(sc.pickleFile(tmpFile.name, 5).map(str).collect())
    ['1', '2', 'rdd', 'spark']
    """
    if batchSize == 0:
        ser = AutoBatchedSerializer(PickleSerializer())
    else:
        ser = BatchedSerializer(PickleSerializer(), batchSize)
    self._reserialize(ser)._jrdd.saveAsObjectFile(path)

def saveAsTextFile(self, path, compressionCodecClass=None):
    """
    Save this RDD as a text file, using string representations of elements.

    Parameters
    ----------
    path : str
        path to text file
    compressionCodecClass : str, optional
        fully qualified classname of the compression codec class
        i.e. "org.apache.hadoop.io.compress.GzipCodec" (None by default)

    Examples
    --------
    >>> from tempfile import NamedTemporaryFile
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name)
    >>> from fileinput import input
    >>> from glob import glob
    >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
    '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'

    Empty lines are tolerated when saving to text files.

    >>> from tempfile import NamedTemporaryFile
    >>> tempFile2 = NamedTemporaryFile(delete=True)
    >>> tempFile2.close()
    >>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name)
    >>> ''.join(sorted(input(glob(tempFile2.name + "/part-0000*"))))
    '\\n\\n\\nbar\\nfoo\\n'

    Using compressionCodecClass

    >>> from tempfile import NamedTemporaryFile
    >>> tempFile3 = NamedTemporaryFile(delete=True)
    >>> tempFile3.close()
    >>> codec = "org.apache.hadoop.io.compress.GzipCodec"
    >>> sc.parallelize(['foo', 'bar']).saveAsTextFile(tempFile3.name, codec)
    >>> from fileinput import input, hook_compressed
    >>> result = sorted(input(glob(tempFile3.name + "/part*.gz"), openhook=hook_compressed))
    >>> b''.join(result).decode('utf-8')
    'bar\\nfoo\\n'
    """
    def to_utf8(split, iterator):
        # bytes pass through untouched; everything else becomes UTF-8 text.
        for item in iterator:
            if isinstance(item, bytes):
                yield item
            elif isinstance(item, str):
                yield item.encode("utf-8")
            else:
                yield str(item).encode("utf-8")

    keyed = self.mapPartitionsWithIndex(to_utf8)
    keyed._bypass_serializer = True

    if not compressionCodecClass:
        keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
        return

    codec = self.ctx._jvm.java.lang.Class.forName(compressionCodecClass)
    keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path, codec)

# Pair functions

def collectAsMap(self):
    """
    Return the key-value pairs in this RDD to the master as a dictionary.

    Notes
    -----
    This method should only be used if the resulting data is expected
    to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
    >>> m[1]
    2
    >>> m[3]
    4
    """
    pairs = self.collect()
    return dict(pairs)

def keys(self):
    """
    Return an RDD holding element 0 (the key) of each tuple.

    Examples
    --------
    >>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
    >>> m.collect()
    [1, 3]
    """
    def key_of(pair):
        return pair[0]

    return self.map(key_of)

def values(self):
    """
    Return an RDD holding element 1 (the value) of each tuple.

    Examples
    --------
    >>> m = sc.parallelize([(1, 2), (3, 4)]).values()
    >>> m.collect()
    [2, 4]
    """
    def value_of(pair):
        return pair[1]

    return self.map(value_of)

def reduceByKey(self, func, numPartitions=None, partitionFunc=portable_hash):
    """
    Merge the values for each key using an associative and commutative reduce function.

    This will also perform the merging locally on each mapper before
    sending results to a reducer, similarly to a "combiner" in MapReduce.

    Output will be partitioned with `numPartitions` partitions, or
    the default parallelism level if `numPartitions` is not specified.
    Default partitioner is hash-partition.

    Examples
    --------
    >>> from operator import add
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.reduceByKey(add).collect())
    [('a', 2), ('b', 1)]
    """
    def identity(value):
        # The first value seen for a key is its own initial combiner.
        return value

    return self.combineByKey(identity, func, func, numPartitions, partitionFunc)

def reduceByKeyLocally(self, func):
    """
    Merge the values for each key using an associative and commutative reduce function, but
    return the results immediately to the master as a dictionary.

    This will also perform the merging locally on each mapper before
    sending results to a reducer, similarly to a "combiner" in MapReduce.

    Examples
    --------
    >>> from operator import add
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.reduceByKeyLocally(add).items())
    [('a', 2), ('b', 1)]
    """
    func = fail_on_stopiteration(func)

    def fold_into(acc, pairs):
        # Merge (key, value) pairs into `acc` in place, reducing on collision.
        for k, v in pairs:
            if k in acc:
                acc[k] = func(acc[k], v)
            else:
                acc[k] = v
        return acc

    def reduce_partition(iterator):
        # One dict per partition, emitted as a single item.
        yield fold_into({}, iterator)

    def merge_maps(m1, m2):
        return fold_into(m1, m2.items())

    return self.mapPartitions(reduce_partition).reduce(merge_maps)

def countByKey(self):
    """
    Count the number of elements for each key, and return the result to the
    master as a dictionary.

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.countByKey().items())
    [('a', 2), ('b', 1)]
    """
    keys_only = self.map(lambda pair: pair[0])
    return keys_only.countByValue()

def join(self, other, numPartitions=None):
    """
    Return an RDD containing all pairs of elements with matching keys in
    `self` and `other`.

    Each pair of elements will be returned as a (k, (v1, v2)) tuple, where
    (k, v1) is in `self` and (k, v2) is in `other`.

    Performs a hash join across the cluster.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4)])
    >>> y = sc.parallelize([("a", 2), ("a", 3)])
    >>> sorted(x.join(y).collect())
    [('a', (1, 2)), ('a', (1, 3))]
    """
    joined = python_join(self, other, numPartitions)
    return joined

def leftOuterJoin(self, other, numPartitions=None):
    """
    Perform a left outer join of `self` and `other`.

    For each element (k, v) in `self`, the resulting RDD will either
    contain all pairs (k, (v, w)) for w in `other`, or the pair
    (k, (v, None)) if no elements in `other` have key k.

    Hash-partitions the resulting RDD into the given number of partitions.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4)])
    >>> y = sc.parallelize([("a", 2)])
    >>> sorted(x.leftOuterJoin(y).collect())
    [('a', (1, 2)), ('b', (4, None))]
    """
    joined = python_left_outer_join(self, other, numPartitions)
    return joined

def rightOuterJoin(self, other, numPartitions=None):
    """
    Perform a right outer join of `self` and `other`.

    For each element (k, w) in `other`, the resulting RDD will either
    contain all pairs (k, (v, w)) for v in this, or the pair (k, (None, w))
    if no elements in `self` have key k.

    Hash-partitions the resulting RDD into the given number of partitions.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4)])
    >>> y = sc.parallelize([("a", 2)])
    >>> sorted(y.rightOuterJoin(x).collect())
    [('a', (2, 1)), ('b', (None, 4))]
    """
    joined = python_right_outer_join(self, other, numPartitions)
    return joined

def fullOuterJoin(self, other, numPartitions=None):
    """
    Perform a full outer join of `self` and `other`.

    For each element (k, v) in `self`, the resulting RDD will either
    contain all pairs (k, (v, w)) for w in `other`, or the pair
    (k, (v, None)) if no elements in `other` have key k.

    Similarly, for each element (k, w) in `other`, the resulting RDD will
    either contain all pairs (k, (v, w)) for v in `self`, or the pair
    (k, (None, w)) if no elements in `self` have key k.

    Hash-partitions the resulting RDD into the given number of partitions.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4)])
    >>> y = sc.parallelize([("a", 2), ("c", 8)])
    >>> sorted(x.fullOuterJoin(y).collect())
    [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
    """
    joined = python_full_outer_join(self, other, numPartitions)
    return joined

# TODO: add option to control map-side combining

# portable_hash is used as the default because the builtin hash of None differs
# across machines.

def partitionBy(self, numPartitions, partitionFunc=portable_hash):
    """
    Return a copy of the RDD partitioned using the specified partitioner.

    Parameters
    ----------
    numPartitions : int or None
        number of output partitions; when None,
        ``self._defaultReducePartitions()`` is used
    partitionFunc : function, optional
        maps a key to an integer; the target partition is that integer
        modulo `numPartitions` (``portable_hash`` by default)

    Returns
    -------
    RDD
        `self` when it already carries an equal partitioner, otherwise a
        new RDD whose ``partitioner`` attribute is set accordingly.

    Examples
    --------
    >>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))
    >>> sets = pairs.partitionBy(2).glom().collect()
    >>> len(set(sets[0]).intersection(set(sets[1])))
    0
    """
    if numPartitions is None:
        numPartitions = self._defaultReducePartitions()
    partitioner = Partitioner(numPartitions, partitionFunc)
    if self.partitioner == partitioner:
        return self

    # Transferring O(n) objects to Java is too expensive.
    # Instead, we'll form the hash buckets in Python,
    # transferring O(numPartitions) objects to Java.
    # Each object is a (splitNumber, [objects]) pair.
    # In order to avoid too huge objects, the objects are
    # grouped into chunks.
    outputSerializer = self.ctx._unbatched_serializer

    limit = (self._memory_limit() / 2)

    def add_shuffle_key(split, iterator):

        buckets = defaultdict(list)
        c, batch = 0, min(10 * numPartitions, 1000)

        for k, v in iterator:
            buckets[partitionFunc(k) % numPartitions].append((k, v))
            c += 1

            # check used memory and avg size of chunk of objects
            if (c % 1000 == 0 and get_used_memory() > limit
                    or c > batch):
                n, size = len(buckets), 0
                # Iterate over a snapshot of the keys because entries are
                # deleted while flushing.
                for bucket_id in list(buckets.keys()):
                    yield pack_long(bucket_id)
                    d = outputSerializer.dumps(buckets[bucket_id])
                    del buckets[bucket_id]
                    yield d
                    size += len(d)

                # Average flushed chunk size, in MiB.
                avg = int(size / n) >> 20
                # let 1M < avg < 10M
                if avg < 1:
                    batch = min(sys.maxsize, batch * 1.5)
                elif avg > 10:
                    batch = max(int(batch / 1.5), 1)
                c = 0

        # Flush whatever is still buffered once the input is exhausted.
        for bucket_id, items in buckets.items():
            yield pack_long(bucket_id)
            yield outputSerializer.dumps(items)

    keyed = self.mapPartitionsWithIndex(add_shuffle_key, preservesPartitioning=True)
    keyed._bypass_serializer = True
    with SCCallSiteSync(self.context):
        pairRDD = self.ctx._jvm.PairwiseRDD(
            keyed._jrdd.rdd()).asJavaPairRDD()
        jpartitioner = self.ctx._jvm.PythonPartitioner(numPartitions,
                                                       id(partitionFunc))
    jrdd = self.ctx._jvm.PythonRDD.valueOfPair(pairRDD.partitionBy(jpartitioner))
    rdd = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer))
    rdd.partitioner = partitioner
    return rdd

# TODO: add control over map-side aggregation

def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
                 numPartitions=None, partitionFunc=portable_hash):
    """
    Generic function to combine the elements for each key using a custom
    set of aggregation functions.

    Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined
    type" C.

    Users provide three functions:

        - `createCombiner`, which turns a V into a C (e.g., creates
          a one-element list)
        - `mergeValue`, to merge a V into a C (e.g., adds it to the end of
          a list)
        - `mergeCombiners`, to combine two C's into a single one (e.g., merges
          the lists)

    To avoid memory allocation, both mergeValue and mergeCombiners are allowed to
    modify and return their first argument instead of creating a new C.

    In addition, users can control the partitioning of the output RDD.

    Notes
    -----
    V and C can be different -- for example, one might group an RDD of type
        (Int, Int) into an RDD of type (Int, List[Int]).

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 2)])
    >>> def to_list(a):
    ...     return [a]
    ...
    >>> def append(a, b):
    ...     a.append(b)
    ...     return a
    ...
    >>> def extend(a, b):
    ...     a.extend(b)
    ...     return a
    ...
    >>> sorted(x.combineByKey(to_list, append, extend).collect())
    [('a', [1, 2]), ('b', [1])]
    """
    if numPartitions is None:
        numPartitions = self._defaultReducePartitions()

    serializer = self.ctx.serializer
    memory = self._memory_limit()
    agg = Aggregator(createCombiner, mergeValue, mergeCombiners)

    def merge_values_locally(partition):
        # Map side: the merger is given 90% of the configured memory limit.
        local_merger = ExternalMerger(agg, memory * 0.9, serializer)
        local_merger.mergeValues(partition)
        return local_merger.items()

    def merge_combiners_after_shuffle(partition):
        # Reduce side: merge the per-key combiners of each shuffled partition.
        final_merger = ExternalMerger(agg, memory, serializer)
        final_merger.mergeCombiners(partition)
        return final_merger.items()

    pre_combined = self.mapPartitions(merge_values_locally, preservesPartitioning=True)
    shuffled = pre_combined.partitionBy(numPartitions, partitionFunc)
    return shuffled.mapPartitions(merge_combiners_after_shuffle, preservesPartitioning=True)

def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None,
                   partitionFunc=portable_hash):
    """
    Aggregate the values of each key, using given combine functions and a neutral
    "zero value". This function can return a different result type, U, than the type
    of the values in this RDD, V. Thus, we need one operation for merging a V into
    a U and one operation for merging two U's. The former operation is used for merging
    values within a partition, and the latter is used for merging values between
    partitions. To avoid memory allocation, both of these functions are
    allowed to modify and return their first argument instead of creating a new U.
    """
    def seed_with_zero(value):
        # Deep-copy the zero value for every new key so that a seqFunc
        # mutating its first argument never touches the shared original.
        fresh_zero = copy.deepcopy(zeroValue)
        return seqFunc(fresh_zero, value)

    return self.combineByKey(
        seed_with_zero, seqFunc, combFunc, numPartitions, partitionFunc)

def foldByKey(self, zeroValue, func, numPartitions=None, partitionFunc=portable_hash):
    """
    Merge the values for each key using an associative function "func"
    and a neutral "zeroValue" which may be added to the result an
    arbitrary number of times, and must not change the result
    (e.g., 0 for addition, or 1 for multiplication.).

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> from operator import add
    >>> sorted(rdd.foldByKey(0, add).collect())
    [('a', 2), ('b', 1)]
    """
    def seed_with_zero(value):
        # Deep-copy the zero value for every new key so that a func
        # mutating its first argument never touches the shared original.
        fresh_zero = copy.deepcopy(zeroValue)
        return func(fresh_zero, value)

    return self.combineByKey(seed_with_zero, func, func, numPartitions,
                             partitionFunc)

def _memory_limit(self):
    # Read "spark.python.worker.memory" from the context's conf, falling
    # back to "512m", and return it converted by _parse_memory.
    configured = self.ctx._conf.get("spark.python.worker.memory", "512m")
    return _parse_memory(configured)

# TODO: support variant with custom partitioner

def groupByKey(self, numPartitions=None, partitionFunc=portable_hash):
    """
    Group the values for each key in the RDD into a single sequence.
    Hash-partitions the resulting RDD with numPartitions partitions.

    Notes
    -----
    If you are grouping in order to perform an aggregation (such as a
    sum or average) over each key, using reduceByKey or aggregateByKey will
    provide much better performance.

    Examples
    --------
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.groupByKey().mapValues(len).collect())
    [('a', 2), ('b', 1)]
    >>> sorted(rdd.groupByKey().mapValues(list).collect())
    [('a', [1, 1]), ('b', [1])]
    """
    def start_group(value):
        return [value]

    def add_to_group(group, value):
        group.append(value)
        return group

    def join_groups(group, more):
        group.extend(more)
        return group

    memory = self._memory_limit()
    serializer = self._jrdd_deserializer
    agg = Aggregator(start_group, add_to_group, join_groups)

    def group_locally(partition):
        # Map side: the merger is given 90% of the configured memory limit.
        local_merger = ExternalMerger(agg, memory * 0.9, serializer)
        local_merger.mergeValues(partition)
        return local_merger.items()

    def group_after_shuffle(partition):
        # Reduce side: merge the partial groups with ExternalGroupBy.
        final_merger = ExternalGroupBy(agg, memory, serializer)
        final_merger.mergeCombiners(partition)
        return final_merger.items()

    pre_grouped = self.mapPartitions(group_locally, preservesPartitioning=True)
    shuffled = pre_grouped.partitionBy(numPartitions, partitionFunc)
    grouped = shuffled.mapPartitions(group_after_shuffle, True)
    return grouped.mapValues(ResultIterable)

def flatMapValues(self, f):
    """
    Pass each value in the key-value pair RDD through a flatMap function
    without changing the keys; this also retains the original RDD's
    partitioning.

    Examples
    --------
    >>> x = sc.parallelize([("a", ["x", "y", "z"]), ("b", ["p", "r"])])
    >>> def f(x): return x
    >>> x.flatMapValues(f).collect()
    [('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]
    """
    def expand(pair):
        # f runs as soon as the pair is seen; pairing with the key is lazy.
        produced = f(pair[1])
        return ((pair[0], item) for item in produced)

    return self.flatMap(expand, preservesPartitioning=True)

def mapValues(self, f):
    """
    Pass each value in the key-value pair RDD through a map function
    without changing the keys; this also retains the original RDD's
    partitioning.

    Examples
    --------
    >>> x = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])])
    >>> def f(x): return len(x)
    >>> x.mapValues(f).collect()
    [('a', 3), ('b', 1)]
    """
    def transform(pair):
        return (pair[0], f(pair[1]))

    return self.map(transform, preservesPartitioning=True)

def groupWith(self, other, *others):
    """
    Alias for cogroup but with support for multiple RDDs.

    Examples
    --------
    >>> w = sc.parallelize([("a", 5), ("b", 6)])
    >>> x = sc.parallelize([("a", 1), ("b", 4)])
    >>> y = sc.parallelize([("a", 2)])
    >>> z = sc.parallelize([("b", 42)])
    >>> [(x, tuple(map(list, y))) for x, y in sorted(list(w.groupWith(x, y, z).collect()))]
    [('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))]
    """
    # self comes first, followed by the other RDDs in argument order.
    rdds = (self, other) + others
    return python_cogroup(rdds, numPartitions=None)

# TODO: add variant with custom partitioner

def cogroup(self, other, numPartitions=None):
    """
    For each key k in `self` or `other`, return a resulting RDD that
    contains a tuple with the list of values for that key in `self` as
    well as `other`.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4)])
    >>> y = sc.parallelize([("a", 2)])
    >>> [(x, tuple(map(list, y))) for x, y in sorted(list(x.cogroup(y).collect()))]
    [('a', ([1], [2])), ('b', ([4], []))]
    """
    rdds = (self, other)
    return python_cogroup(rdds, numPartitions)

def sampleByKey(self, withReplacement, fractions, seed=None):
    """
    Return a subset of this RDD sampled by key (via stratified sampling).
    Create a sample of this RDD using variable sampling rates for
    different keys as specified by fractions, a key to sampling rate map.

    Examples
    --------
    >>> fractions = {"a": 0.2, "b": 0.1}
    >>> rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000)))
    >>> sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect())
    >>> 100 < len(sample["a"]) < 300 and 50 < len(sample["b"]) < 150
    True
    >>> max(sample["a"]) <= 999 and min(sample["a"]) >= 0
    True
    >>> max(sample["b"]) <= 999 and min(sample["b"]) >= 0
    True
    """
    # Every sampling rate must be non-negative.
    for rate in fractions.values():
        assert rate >= 0.0, "Negative fraction value: %s" % rate

    sampler = RDDStratifiedSampler(withReplacement, fractions, seed)
    return self.mapPartitionsWithIndex(sampler.func, True)

def subtractByKey(self, other, numPartitions=None):
    """
    Return each (key, value) pair in `self` that has no pair with matching
    key in `other`.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 2)])
    >>> y = sc.parallelize([("a", 3), ("c", None)])
    >>> sorted(x.subtractByKey(y).collect())
    [('b', 4), ('b', 5)]
    """
    def only_in_self(entry):
        # Keep keys that have values on our side and none on the other.
        _, (mine, theirs) = entry
        return mine and not theirs

    grouped = self.cogroup(other, numPartitions)
    survivors = grouped.filter(only_in_self)
    return survivors.flatMapValues(lambda groups: groups[0])

def subtract(self, other, numPartitions=None):
    """
    Return the values of `self` that do not occur in `other`.

    Examples
    --------
    >>> x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)])
    >>> y = sc.parallelize([("a", 3), ("c", None)])
    >>> sorted(x.subtract(y).collect())
    [('a', 1), ('b', 4), ('b', 5)]
    """
    # Each element becomes a key; the paired True is only a placeholder
    # value so that subtractByKey can be reused.
    theirs = other.map(lambda item: (item, True))
    mine = self.map(lambda item: (item, True))
    return mine.subtractByKey(theirs, numPartitions).keys()

def keyBy(self, f):
    """
    Turn every element of this RDD into a tuple whose first item is the
    result of applying `f` to the element and whose second item is the
    element itself.

    Examples
    --------
    >>> x = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)
    >>> y = sc.parallelize(zip(range(0,5), range(0,5)))
    >>> [(x, list(map(list, y))) for x, y in sorted(x.cogroup(y).collect())]
    [(0, [[0], [0]]), (1, [[1], [1]]), (2, [[], [2]]), (3, [[], [3]]), (4, [[2], [4]])]
    """
    def attach_key(element):
        return f(element), element

    return self.map(attach_key)

def repartition(self, numPartitions):
    """
    Return a new RDD that has exactly numPartitions partitions.

    Can increase or decrease the level of parallelism in this RDD.
    Internally, this uses a shuffle to redistribute data.
    If you are decreasing the number of partitions in this RDD, consider
    using `coalesce`, which can avoid performing a shuffle.

    Examples
    --------
    >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
    >>> sorted(rdd.glom().collect())
    [[1], [2, 3], [4, 5], [6, 7]]
    >>> len(rdd.repartition(2).glom().collect())
    2
    >>> len(rdd.repartition(10).glom().collect())
    10
    """
    # Repartitioning is a coalesce with the shuffle forced on.
    return self.coalesce(numPartitions, shuffle=True)

def coalesce(self, numPartitions, shuffle=False):
    """
    Return a new RDD that is reduced into `numPartitions` partitions.

    When `shuffle` is true the data is first re-serialized with a small
    batch size before being handed to the JVM-side coalesce.

    Examples
    --------
    >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
    [[1], [2, 3], [4, 5]]
    >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
    [[1, 2, 3, 4, 5]]
    """
    if shuffle:
        # Use a smaller batch size so elements spread evenly over the
        # output partitions; with large batches a repartition could end
        # up with highly skewed partitions.
        small_batch = min(10, self.ctx._batchSize or 1024)
        source = self._reserialize(BatchedSerializer(PickleSerializer(), small_batch))
    else:
        source = self

    deserializer = source._jrdd_deserializer
    coalesced = source._jrdd.coalesce(numPartitions, shuffle)
    return RDD(coalesced, self.ctx, deserializer)

def zip(self, other):
    """
    Pair this RDD with another one element by element, producing
    key-value pairs made of the first element of each RDD, the second
    element of each RDD, and so on.

    Both RDDs are assumed to have the same number of partitions and the
    same number of elements in each partition (e.g. one was made through
    a map on the other).

    Raises
    ------
    ValueError
        If the two RDDs do not have the same number of partitions.

    Examples
    --------
    >>> x = sc.parallelize(range(0,5))
    >>> y = sc.parallelize(range(1000, 1005))
    >>> x.zip(y).collect()
    [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]
    """
    def batch_size_of(serializer):
        # A serializer that is not batched counts as a batch size of 1.
        return serializer.batchSize if isinstance(serializer, BatchedSerializer) else 1

    def rebatch(rdd, size):
        return rdd._reserialize(BatchedSerializer(PickleSerializer(), size))

    mine = batch_size_of(self._jrdd_deserializer)
    theirs = batch_size_of(other._jrdd_deserializer)
    if mine != theirs or not mine:
        # Bring both sides to the smaller of the two batch sizes.
        common = min(mine, theirs)
        if common <= 0:
            # auto batched or unlimited
            common = 100
        other = rebatch(other, common)
        self = rebatch(self, common)

    if self.getNumPartitions() != other.getNumPartitions():
        raise ValueError("Can only zip with RDD which has the same number of partitions")

    # The JVM raises an exception if corresponding partitions hold a
    # different number of items.
    zipped = self._jrdd.zip(other._jrdd)
    pair_deserializer = PairDeserializer(self._jrdd_deserializer,
                                         other._jrdd_deserializer)
    return RDD(zipped, self.ctx, pair_deserializer)

def zipWithIndex(self):
    """
    Zips this RDD with its element indices.

    The ordering is first based on the partition index and then the
    ordering of items within each partition. So the first item in
    the first partition gets index 0, and the last item in the last
    partition receives the largest index.

    This method needs to trigger a spark job when this RDD contains
    more than one partition.

    Examples
    --------
    >>> sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()
    [('a', 0), ('b', 1), ('c', 2), ('d', 3)]
    """
    # starts[k] is the index given to the first element of partition k.
    starts = [0]
    if self.getNumPartitions() > 1:
        # Count the elements of every partition (this is the extra job).
        nums = self.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        # The last partition's size is not needed to compute any offset.
        for count in nums[:-1]:
            starts.append(starts[-1] + count)

    def func(k, it):
        for i, v in enumerate(it, starts[k]):
            yield v, i

    return self.mapPartitionsWithIndex(func)

def zipWithUniqueId(self):
    """
    Pair every element of this RDD with a generated unique Long id.

    Items in the kth partition receive the ids k, n+k, 2*n+k, ..., where
    n is the number of partitions. Gaps may therefore exist, but unlike
    :meth:`zipWithIndex` this method won't trigger a spark job.

    Examples
    --------
    >>> sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()
    [('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]
    """
    stride = self.getNumPartitions()

    def tag(partition_index, elements):
        # The position-th element of a partition gets
        # partition_index + position * stride.
        for position, element in enumerate(elements):
            yield element, position * stride + partition_index

    return self.mapPartitionsWithIndex(tag)

def name(self):
    """
    Return the name of this RDD.

    The value is read from ``self._jrdd.name()``; when that value is
    empty or unset, None is returned.
    """
    n = self._jrdd.name()
    return n if n else None

def setName(self, name):
    """
    Give this RDD a name and return the RDD itself, so calls can be
    chained.

    Examples
    --------
    >>> rdd1 = sc.parallelize([1, 2])
    >>> rdd1.setName('RDD1').name()
    'RDD1'
    """
    # The name is stored on the wrapped self._jrdd object.
    self._jrdd.setName(name)
    return self

def toDebugString(self):
    """
    Describe this RDD and its recursive dependencies, for debugging.

    The description is returned UTF-8 encoded; None is returned when no
    description is available.
    """
    description = self._jrdd.toDebugString()
    if not description:
        return None
    return description.encode('utf-8')

def getStorageLevel(self):
    """
    Return the current storage level of this RDD.

    Examples
    --------
    >>> rdd1 = sc.parallelize([1,2])
    >>> rdd1.getStorageLevel()
    StorageLevel(False, False, False, False, 1)
    >>> print(rdd1.getStorageLevel())
    Serialized 1x Replicated
    """
    # Copy each flag of the Java-side level into a Python StorageLevel.
    jlevel = self._jrdd.getStorageLevel()
    return StorageLevel(
        jlevel.useDisk(),
        jlevel.useMemory(),
        jlevel.useOffHeap(),
        jlevel.deserialized(),
        jlevel.replication(),
    )

def _defaultReducePartitions(self):
    """
    Returns the default number of partitions to use during reduce tasks (e.g., groupBy).

    If spark.default.parallelism is set, then we'll use the value from SparkContext
    defaultParallelism, otherwise we'll use the number of partitions in this RDD.

    This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce
    the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will
    be inherent.
    """
    # An explicitly configured parallelism takes precedence over the
    # partition count of this RDD.
    if self.ctx._conf.contains("spark.default.parallelism"):
        return self.ctx.defaultParallelism
    return self.getNumPartitions()

def lookup(self, key):
    """
    Return the list of values stored in the RDD under the key `key`.

    When the RDD has a known partitioner the operation is efficient,
    because only the partition that the key maps to is searched.

    Examples
    --------
    >>> l = range(1000)
    >>> rdd = sc.parallelize(zip(l, l), 10)
    >>> rdd.lookup(42)  # slow
    [42]
    >>> sorted = rdd.sortByKey()
    >>> sorted.lookup(42)  # fast
    [42]
    >>> sorted.lookup(1024)
    []
    >>> rdd2 = sc.parallelize([(('a', 'b'), 'c')]).groupByKey()
    >>> list(rdd2.lookup(('a', 'b'))[0])
    ['c']
    """
    matches = self.filter(lambda kv: kv[0] == key).values()

    if self.partitioner is None:
        # No partitioner known: every partition has to be scanned.
        return matches.collect()

    # Only run the job on the single partition the key is mapped to.
    target = self.partitioner(key)
    return self.ctx.runJob(matches, lambda part: part, [target])

def _to_java_object_rdd(self):
    """
    Return a JavaRDD of Object by unpickling.

    Each Python object is converted into a Java object by Pyrolite,
    whether or not the RDD is serialized in batches.
    """
    pickled = self._pickled()
    serde = self.ctx._jvm.SerDeUtil
    return serde.pythonToJava(pickled._jrdd, True)

def countApprox(self, timeout, confidence=0.95):
    """
    Approximate variant of count(): returns a possibly incomplete result
    within a timeout, even if not all tasks have finished.

    Examples
    --------
    >>> rdd = sc.parallelize(range(1000), 10)
    >>> rdd.countApprox(1000, 1.0)
    1000
    """
    def count_partition(it):
        # One float per partition: the number of elements it holds.
        return [float(sum(1 for _ in it))]

    per_partition = self.mapPartitions(count_partition)
    return int(per_partition.sumApprox(timeout, confidence))

def sumApprox(self, timeout, confidence=0.95):
    """
    Approximate operation that returns the sum within a timeout or once
    the confidence is met.

    Examples
    --------
    >>> rdd = sc.parallelize(range(1000), 10)
    >>> r = sum(range(1000))
    >>> abs(rdd.sumApprox(1000) - r) / r < 0.05
    True
    """
    # Reduce every partition to a single float partial sum before the
    # data is handed to the JVM.
    partial_sums = self.mapPartitions(lambda it: [float(sum(it))])
    java_objects = partial_sums._to_java_object_rdd()
    java_doubles = self.ctx._jvm.JavaDoubleRDD.fromRDD(java_objects.rdd())
    result = java_doubles.sumApprox(timeout, confidence).getFinalValue()
    return BoundedFloat(result.mean(), result.confidence(), result.low(), result.high())

def meanApprox(self, timeout, confidence=0.95):
    """
    Approximate operation that returns the mean within a timeout or once
    the confidence is met.

    Examples
    --------
    >>> rdd = sc.parallelize(range(1000), 10)
    >>> r = sum(range(1000)) / 1000.0
    >>> abs(rdd.meanApprox(1000) - r) / r < 0.05
    True
    """
    # Every element is converted to float before being handed to the JVM.
    java_objects = self.map(float)._to_java_object_rdd()
    java_doubles = self.ctx._jvm.JavaDoubleRDD.fromRDD(java_objects.rdd())
    result = java_doubles.meanApprox(timeout, confidence).getFinalValue()
    return BoundedFloat(result.mean(), result.confidence(), result.low(), result.high())

def countApproxDistinct(self, relativeSD=0.05):
    """
    Return the approximate number of distinct elements in the RDD.

    Parameters
    ----------
    relativeSD : float, optional
        Relative accuracy. Smaller values create
        counters that require more space.
        It must be greater than 0.000017.

    Raises
    ------
    ValueError
        If `relativeSD` is below 0.000017.

    Notes
    -----
    The algorithm used is based on streamlib's implementation of
    `"HyperLogLog in Practice: Algorithmic Engineering of a State
    of The Art Cardinality Estimation Algorithm", available here
    <https://doi.org/10.1145/2452376.2452456>`_.

    Examples
    --------
    >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
    >>> 900 < n < 1100
    True
    >>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()
    >>> 16 < n < 24
    True
    """
    if relativeSD < 0.000017:
        raise ValueError("relativeSD should be greater than 0.000017")

    def hash32(element):
        # the hash space in Java is 2^32, so keep only the low 32 bits
        return portable_hash(element) & 0xFFFFFFFF

    hashed = self.map(hash32)
    return hashed._to_java_object_rdd().countApproxDistinct(relativeSD)

def toLocalIterator(self, prefetchPartitions=False):
    """
    Return an iterator over all of the elements in this RDD.

    The iterator will consume as much memory as the largest partition in this RDD.
    With prefetch it may consume up to the memory of the 2 largest partitions.

    Parameters
    ----------
    prefetchPartitions : bool, optional
        If Spark should pre-fetch the next partition
        before it is needed.

    Examples
    --------
    >>> rdd = sc.parallelize(range(10))
    >>> [x for x in rdd.toLocalIterator()]
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    """
    # The JVM call is made inside the call-site context manager; the
    # iterator itself is built after that context has been left.
    with SCCallSiteSync(self.context):
        python_rdd = self.ctx._jvm.PythonRDD
        sock_info = python_rdd.toLocalIteratorAndServe(self._jrdd.rdd(), prefetchPartitions)
    return _local_iterator_from_socket(sock_info, self._jrdd_deserializer)

def barrier(self):

"""

Marks the current stage as a barrier stage, where Spark must launch all tasks together.

In case of a task failure, instead of only restarting the failed task, Spark will abort the

entire stage and relaunch all tasks for this stage.

The barrier execution mode feature is experimental and it only handles limited scenarios.

Please read the linked SPIP and design docs to understand the limitations and future plans.

.. versionadded:: 2.4.0

Returns

-------

:class:`RDDBarrier`

instance that provides actions within a barrier stage.

Coverage for pyspark/rdd.py : 94%

932 statements 891 run 41 missing 0 excluded 29 partial