Coverage for pyspark/streaming/context.py: 83%

140 ↛ 141line 140 didn't jump to line 141, because the condition on line 140 was never true elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:

cls._activeContext = None

raise RuntimeError(

"JVM's active JavaStreamingContext is not the JavaStreamingContext "

"backing the action Python StreamingContext. This is unexpected.")

return cls._activeContext

@classmethod

def getActiveOrCreate(cls, checkpointPath, setupFunc):

"""

Either return the active StreamingContext (i.e. currently started but not stopped),

or recreate a StreamingContext from checkpoint data or create a new StreamingContext

using the provided setupFunc function. If the checkpointPath is None or does not contain

valid checkpoint data, then setupFunc will be called to create a new context and setup

DStreams.

Parameters

----------

checkpointPath : str

Checkpoint directory used in an earlier streaming program. Can be

None if the intention is to always create a new context when there

is no active context.

setupFunc : function

Function to create a new JavaStreamingContext and setup DStreams

"""

166 ↛ 167line 166 didn't jump to line 167, because the condition on line 166 was never true if not callable(setupFunc):

raise TypeError("setupFunc should be callable.")

activeContext = cls.getActive()

if activeContext is not None:

return activeContext

elif checkpointPath is not None:

return cls.getOrCreate(checkpointPath, setupFunc)

else:

return setupFunc()

@property

def sparkContext(self):

"""

Return SparkContext which is associated with this StreamingContext.

"""

return self._sc

def start(self):

"""

Start the execution of the streams.

"""

self._jssc.start()

StreamingContext._activeContext = self

def awaitTermination(self, timeout=None):

"""

Wait for the execution to stop.

Parameters

----------

timeout : int, optional

time to wait in seconds

"""

if timeout is None:

self._jssc.awaitTermination()

else:

self._jssc.awaitTerminationOrTimeout(int(timeout * 1000))

def awaitTerminationOrTimeout(self, timeout):

"""

Wait for the execution to stop. Return `true` if it's stopped; or

throw the reported error during the execution; or `false` if the

waiting time elapsed before returning from the method.

Parameters

----------

timeout : int

time to wait in seconds

"""

return self._jssc.awaitTerminationOrTimeout(int(timeout * 1000))

def stop(self, stopSparkContext=True, stopGraceFully=False):

"""

Stop the execution of the streams, with option of ensuring all

received data has been processed.

Parameters

----------

stopSparkContext : bool, optional

Stop the associated SparkContext or not

stopGracefully : bool, optional

Stop gracefully by waiting for the processing of all received

data to be completed

"""

self._jssc.stop(stopSparkContext, stopGraceFully)

StreamingContext._activeContext = None

if stopSparkContext:

self._sc.stop()

def remember(self, duration):

"""

Set each DStreams in this context to remember RDDs it generated

in the last given duration. DStreams remember RDDs only for a

limited duration of time and releases them for garbage collection.

This method allows the developer to specify how long to remember

the RDDs (if the developer wishes to query old data outside the

DStream computation).

Parameters

----------

duration : int

Minimum duration (in seconds) that each DStream should remember its RDDs

"""

self._jssc.remember(self._jduration(duration))

def checkpoint(self, directory):

"""

Sets the context to periodically checkpoint the DStream operations for master

fault-tolerance. The graph will be checkpointed every batch interval.

Parameters

----------

directory : str

HDFS-compatible directory where the checkpoint data will be reliably stored

"""

self._jssc.checkpoint(directory)

def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_2):

"""

Create an input from TCP source hostname:port. Data is received using

a TCP socket and receive byte is interpreted as UTF8 encoded ``\\n`` delimited

lines.

Parameters

----------

hostname : str

Hostname to connect to for receiving data

port : int

Port to connect to for receiving data

storageLevel : :class:`pyspark.StorageLevel`, optional

Storage level to use for storing the received objects

"""

jlevel = self._sc._getJavaStorageLevel(storageLevel)

return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,

UTF8Deserializer())

def textFileStream(self, directory):

"""

Create an input stream that monitors a Hadoop-compatible file system

for new files and reads them as text files. Files must be written to the

monitored directory by "moving" them from another location within the same

file system. File names starting with . are ignored.

The text files must be encoded as UTF-8.

"""

return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer())

def binaryRecordsStream(self, directory, recordLength):

"""

Create an input stream that monitors a Hadoop-compatible file system

for new files and reads them as flat binary files with records of

fixed length. Files must be written to the monitored directory by "moving"

them from another location within the same file system.

File names starting with . are ignored.

Parameters

----------

directory : str

Directory to load data from

recordLength : int

Length of each record in bytes

"""

return DStream(self._jssc.binaryRecordsStream(directory, recordLength), self,

NoOpSerializer())

def _check_serializers(self, rdds):

# make sure they have same serializer

if len(set(rdd._jrdd_deserializer for rdd in rdds)) > 1:

for i in range(len(rdds)):

# reset them to sc.serializer

rdds[i] = rdds[i]._reserialize()

def queueStream(self, rdds, oneAtATime=True, default=None):

"""

Create an input stream from a queue of RDDs or list. In each batch,

it will process either one or all of the RDDs returned by the queue.

Parameters

----------

rdds : list

Queue of RDDs

oneAtATime : bool, optional

pick one rdd each time or pick all of them once.

default : :class:`pyspark.RDD`, optional

The default rdd if no more in rdds

Notes

-----

Changes to the queue after the stream is created will not be recognized.

"""

335 ↛ 336line 335 didn't jump to line 336, because the condition on line 335 was never true if default and not isinstance(default, RDD):

default = self._sc.parallelize(default)

338 ↛ 339line 338 didn't jump to line 339, because the condition on line 338 was never true if not rdds and default:

rdds = [rdds]

if rdds and not isinstance(rdds[0], RDD):

rdds = [self._sc.parallelize(input) for input in rdds]

self._check_serializers(rdds)

queue = self._jvm.PythonDStream.toRDDQueue([r._jrdd for r in rdds])

346 ↛ 347line 346 didn't jump to line 347, because the condition on line 346 was never true if default:

default = default._reserialize(rdds[0]._jrdd_deserializer)

jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd)

else:

jdstream = self._jssc.queueStream(queue, oneAtATime)

return DStream(jdstream, self, rdds[0]._jrdd_deserializer)

def transform(self, dstreams, transformFunc):

"""

Create a new DStream in which each RDD is generated by applying

a function on RDDs of the DStreams. The order of the JavaRDDs in

the transform function parameter will be the same as the order

of corresponding DStreams in the list.

"""

jdstreams = [d._jdstream for d in dstreams]

# change the final serializer to sc.serializer

func = TransformFunction(self._sc,

lambda t, *rdds: transformFunc(rdds),

*[d._jrdd_deserializer for d in dstreams])

jfunc = self._jvm.TransformFunction(func)

jdstream = self._jssc.transform(jdstreams, jfunc)

return DStream(jdstream, self, self._sc.serializer)

def union(self, *dstreams):

"""

Create a unified DStream from multiple DStreams of the same

type and same slide duration.

"""

374 ↛ 375line 374 didn't jump to line 375, because the condition on line 374 was never true if not dstreams:

raise ValueError("should have at least one DStream to union")

376 ↛ 377line 376 didn't jump to line 377, because the condition on line 376 was never true if len(dstreams) == 1:

return dstreams[0]

378 ↛ 379line 378 didn't jump to line 379, because the condition on line 378 was never true if len(set(s._jrdd_deserializer for s in dstreams)) > 1:

raise ValueError("All DStreams should have same serializer")

380 ↛ 381line 380 didn't jump to line 381, because the condition on line 380 was never true if len(set(s._slideDuration for s in dstreams)) > 1:

raise ValueError("All DStreams should have same slide duration")

jdstream_cls = SparkContext._jvm.org.apache.spark.streaming.api.java.JavaDStream

jpair_dstream_cls = SparkContext._jvm.org.apache.spark.streaming.api.java.JavaPairDStream

gw = SparkContext._gateway

385 ↛ 387line 385 didn't jump to line 387, because the condition on line 385 was never false if is_instance_of(gw, dstreams[0]._jdstream, jdstream_cls):

cls = jdstream_cls

elif is_instance_of(gw, dstreams[0]._jdstream, jpair_dstream_cls):

cls = jpair_dstream_cls

else:

cls_name = dstreams[0]._jdstream.getClass().getCanonicalName()

raise TypeError("Unsupported Java DStream class %s" % cls_name)

jdstreams = gw.new_array(cls, len(dstreams))

for i in range(0, len(dstreams)):

jdstreams[i] = dstreams[i]._jdstream

return DStream(self._jssc.union(jdstreams), self, dstreams[0]._jrdd_deserializer)

def addStreamingListener(self, streamingListener):

"""

Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for

receiving system events related to streaming.

"""

self._jssc.addStreamingListener(self._jvm.JavaStreamingListenerWrapper(

self._jvm.PythonStreamingListenerWrapper(streamingListener)))

Coverage for pyspark/streaming/context.py : 83%

140 statements 120 run 20 missing 0 excluded 10 partial