Coverage for pyspark/sql/streaming.py: 88%

770 ↛ 771line 770 didn't jump to line 771, because the condition on line 770 was never true if not outputMode or type(outputMode) != str or len(outputMode.strip()) == 0:

raise ValueError('The output mode must be a non-empty string. Got: %s' % outputMode)

self._jwrite = self._jwrite.outputMode(outputMode)

return self

def format(self, source):
    """Specifies the underlying output data source.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    source : str
        string, name of the data source, e.g. 'parquet' or 'json'.

    Returns
    -------
    This writer, so that calls can be chained.

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> writer = sdf.writeStream.format('json')
    """
    # The underlying writer hands back the writer to keep using; re-bind it.
    configured = self._jwrite.format(source)
    self._jwrite = configured
    return self

def option(self, key, value):
    """Adds an output option for the underlying data source.

    The value is passed through ``to_str`` before it is handed to the
    underlying writer.

    .. versionadded:: 2.0.0

    Returns
    -------
    This writer, so that calls can be chained.

    Notes
    -----
    This API is evolving.
    """
    rendered = to_str(value)
    self._jwrite = self._jwrite.option(key, rendered)
    return self

def options(self, **options):
    """Adds output options for the underlying data source.

    Every keyword argument becomes one option; each value is passed through
    ``to_str`` before it is handed to the underlying writer.

    .. versionadded:: 2.0.0

    Returns
    -------
    This writer, so that calls can be chained.

    Notes
    -----
    This API is evolving.
    """
    for name, raw_value in options.items():
        self._jwrite = self._jwrite.option(name, to_str(raw_value))
    return self

def partitionBy(self, *cols):
    """Partitions the output by the given columns on the file system.

    If specified, the output is laid out on the file system similar
    to Hive's partitioning scheme.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    cols : str or list
        name of columns

    Returns
    -------
    This writer, so that calls can be chained.

    Notes
    -----
    This API is evolving.
    """
    # A single list/tuple argument is unpacked, so ``partitionBy(['a', 'b'])``
    # and ``partitionBy('a', 'b')`` are handled the same way.
    if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
        cols = cols[0]
    self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
    return self

def queryName(self, queryName):
    """Specifies the name of the :class:`StreamingQuery` that can be started with
    :func:`start`. This name must be unique among all the currently active queries
    in the associated SparkSession.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    queryName : str
        unique name for the query

    Returns
    -------
    This writer, so that calls can be chained.

    Raises
    ------
    ValueError
        If ``queryName`` is not a string, or is empty or only whitespace.

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> writer = sdf.writeStream.queryName('streaming_query')
    """
    if not isinstance(queryName, str) or not queryName.strip():
        # Wrap the operand in a 1-tuple: a bare tuple argument would otherwise
        # be unpacked by ``%`` and raise TypeError instead of this ValueError.
        raise ValueError(
            'The queryName must be a non-empty string. Got: %s' % (queryName,))
    self._jwrite = self._jwrite.queryName(queryName)
    return self

@keyword_only
def trigger(self, *, processingTime=None, once=None, continuous=None):
    """Set the trigger for the stream query. If this is not set it will run the query as fast
    as possible, which is equivalent to setting the trigger to ``processingTime='0 seconds'``.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    processingTime : str, optional
        a processing time interval as a string, e.g. '5 seconds', '1 minute'.
        Set a trigger that runs a microbatch query periodically based on the
        processing time. Only one trigger can be set.
    once : bool, optional
        if set to True, set a trigger that processes only one batch of data in a
        streaming query then terminates the query. Only one trigger can be set.
    continuous : str, optional
        a time interval as a string, e.g. '5 seconds', '1 minute'.
        Set a trigger that runs a continuous query with a given checkpoint
        interval. Only one trigger can be set.

    Returns
    -------
    This writer, so that calls can be chained.

    Raises
    ------
    ValueError
        If no trigger or more than one trigger is given, if ``processingTime``
        or ``continuous`` is not a non-blank string, or if ``once`` is anything
        other than ``True``.

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> # trigger the query for execution every 5 seconds
    >>> writer = sdf.writeStream.trigger(processingTime='5 seconds')
    >>> # trigger the query for just one batch of data
    >>> writer = sdf.writeStream.trigger(once=True)
    >>> # run a continuous query with a 5 second checkpoint interval
    >>> writer = sdf.writeStream.trigger(continuous='5 seconds')
    """
    params = [processingTime, once, continuous]
    unset = params.count(None)
    if unset == 3:
        raise ValueError('No trigger provided')
    elif unset < 2:
        raise ValueError('Multiple triggers not allowed.')

    # Exactly one of the three arguments is set from here on.
    if processingTime is not None:
        if not isinstance(processingTime, str) or not processingTime.strip():
            # 1-tuple operand: a tuple argument must not be unpacked by ``%``.
            raise ValueError(
                'Value for processingTime must be a non empty string. Got: %s' %
                (processingTime,))
        interval = processingTime.strip()
        jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime(
            interval)
    elif once is not None:
        # Only the literal ``True`` is accepted; truthy stand-ins are rejected.
        if once is not True:
            raise ValueError('Value for once must be True. Got: %s' % (once,))
        jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once()
    else:
        if not isinstance(continuous, str) or not continuous.strip():
            raise ValueError(
                'Value for continuous must be a non empty string. Got: %s' %
                (continuous,))
        interval = continuous.strip()
        jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous(
            interval)

    self._jwrite = self._jwrite.trigger(jTrigger)
    return self

def foreach(self, f):
    """
    Sets the output of the streaming query to be processed using the provided writer ``f``.
    This is often used to write the output of a streaming query to arbitrary storage systems.
    The processing logic can be specified in two ways.

    #. A **function** that takes a row as input.
        This is a simple way to express your processing logic. Note that this does
        not allow you to deduplicate generated data when failures cause reprocessing of
        some input data. That would require you to specify the processing logic in the next
        way.

    #. An **object** with a ``process`` method and optional ``open`` and ``close`` methods.
        The object can have the following methods.

        * ``open(partition_id, epoch_id)``: *Optional* method that initializes the processing
            (for example, open a connection, start a transaction, etc). Additionally, you can
            use the `partition_id` and `epoch_id` to deduplicate regenerated data
            (discussed later).

        * ``process(row)``: *Non-optional* method that processes each :class:`Row`.

        * ``close(error)``: *Optional* method that finalizes and cleans up (for example,
            close connection, commit transaction, etc.) after all rows have been processed.

    The object will be used by Spark in the following way.

    * A single copy of this object is responsible for all the data generated by a
        single task in a query. In other words, one instance is responsible for
        processing one partition of the data generated in a distributed manner.

    * This object must be serializable because each task will get a fresh
        serialized-deserialized copy of the provided object. Hence, it is strongly
        recommended that any initialization for writing data (e.g. opening a
        connection or starting a transaction) is done after the `open(...)`
        method has been called, which signifies that the task is ready to generate data.

    * The lifecycle of the methods are as follows.

        For each partition with ``partition_id``:

        ... For each batch/epoch of streaming data with ``epoch_id``:

        ....... Method ``open(partitionId, epochId)`` is called.

        ....... If ``open(...)`` returns true, for each row in the partition and
                batch/epoch, method ``process(row)`` is called.

        ....... Method ``close(errorOrNull)`` is called with error (if any) seen while
                processing rows.

    Important points to note:

    * The `partitionId` and `epochId` can be used to deduplicate generated data when
        failures cause reprocessing of some input data. This depends on the execution
        mode of the query. If the streaming query is being executed in the micro-batch
        mode, then every partition represented by a unique tuple (partition_id, epoch_id)
        is guaranteed to have the same data. Hence, (partition_id, epoch_id) can be used
        to deduplicate and/or transactionally commit data and achieve exactly-once
        guarantees. However, if the streaming query is being executed in the continuous
        mode, then this guarantee does not hold and therefore should not be used for
        deduplication.

    * The ``close()`` method (if exists) will be called if `open()` method exists and
        returns successfully (irrespective of the return value), except if the Python
        crashes in the middle.

    .. versionadded:: 2.4.0

    Raises
    ------
    AttributeError
        If ``f`` is not callable and has no ``process`` attribute.
    TypeError
        If ``process``, ``open`` or ``close`` exists on ``f`` but is not callable.

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> # Print every row using a function
    >>> def print_row(row):
    ...     print(row)
    ...
    >>> writer = sdf.writeStream.foreach(print_row)
    >>> # Print every row using an object with process() method
    >>> class RowPrinter:
    ...     def open(self, partition_id, epoch_id):
    ...         print("Opened %d, %d" % (partition_id, epoch_id))
    ...         return True
    ...     def process(self, row):
    ...         print(row)
    ...     def close(self, error):
    ...         print("Closed with error: %s" % str(error))
    ...
    >>> writer = sdf.writeStream.foreach(RowPrinter())
    """
    from pyspark.rdd import _wrap_function
    from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
    from pyspark.taskcontext import TaskContext

    if callable(f):
        # The provided object is a callable function that is supposed to be called on each row.
        # Construct a function that takes an iterator and calls the provided function on each
        # row.
        def func_without_process(_, iterator):
            for x in iterator:
                f(x)
            return iter([])

        func = func_without_process

    else:
        # The provided object is not a callable function. Then it is expected to have a
        # 'process(row)' method, and optional 'open(partition_id, epoch_id)' and
        # 'close(error)' methods.
        if not hasattr(f, 'process'):
            raise AttributeError("Provided object does not have a 'process' method")

        if not callable(getattr(f, 'process')):
            raise TypeError("Attribute 'process' in provided object is not callable")

        def doesMethodExist(method_name):
            # True when ``f`` has the attribute; a present but non-callable
            # attribute is reported as an error rather than ignored.
            exists = hasattr(f, method_name)
            if exists and not callable(getattr(f, method_name)):
                raise TypeError(
                    "Attribute '%s' in provided object is not callable" % method_name)
            return exists

        open_exists = doesMethodExist('open')
        close_exists = doesMethodExist('close')

        def func_with_open_process_close(partition_id, iterator):
            epoch_id = TaskContext.get().getLocalProperty('streaming.sql.batchId')
            if epoch_id:
                epoch_id = int(epoch_id)
            else:
                raise RuntimeError("Could not get batch id from TaskContext")

            # Check if the data should be processed
            should_process = True
            if open_exists:
                should_process = f.open(partition_id, epoch_id)

            error = None

            try:
                if should_process:
                    for x in iterator:
                        f.process(x)
            except Exception as ex:
                error = ex
            finally:
                if close_exists:
                    f.close(error)
                # Compare against None rather than testing truthiness: an
                # exception object that happens to be falsy (for example one
                # defining ``__len__``) must still be re-raised.
                if error is not None:
                    raise error

            return iter([])

        func = func_with_open_process_close

    serializer = AutoBatchedSerializer(PickleSerializer())
    wrapped_func = _wrap_function(self._spark._sc, func, serializer, serializer)
    jForeachWriter = \
        self._spark._sc._jvm.org.apache.spark.sql.execution.python.PythonForeachWriter(
            wrapped_func, self._df._jdf.schema())
    # NOTE(review): unlike the other builder methods, the result is not re-bound
    # to ``self._jwrite``; presumably the JVM writer is updated in place - confirm.
    self._jwrite.foreach(jForeachWriter)
    return self

def foreachBatch(self, func):
    """
    Sets the output of the streaming query to be processed using the provided
    function. This is supported only in the micro-batch execution modes (that is, when the
    trigger is not continuous). In every micro-batch, the provided function will be called
    with (i) the output rows as a DataFrame and (ii) the batch identifier.
    The batchId can be used to deduplicate and transactionally write the output
    (that is, the provided Dataset) to external systems. The output DataFrame is guaranteed
    to be exactly the same for the same batchId (assuming all operations are deterministic
    in the query).

    .. versionadded:: 2.4.0

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> def func(batch_df, batch_id):
    ...     batch_df.collect()
    ...
    >>> writer = sdf.writeStream.foreachBatch(func)
    """
    from pyspark.java_gateway import ensure_callback_server_started

    gateway = self._spark._sc._gateway
    jvm_view = gateway.jvm
    java_import(jvm_view, "org.apache.spark.sql.execution.streaming.sources.*")

    batch_callback = ForeachBatchFunction(self._spark, func)
    jvm_view.PythonForeachBatchHelper.callForeachBatch(self._jwrite, batch_callback)
    # Started only after the callback has been registered, as in the original order.
    ensure_callback_server_started(gateway)
    return self

def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None,
          **options):
    """Streams the contents of the :class:`DataFrame` to a data source.

    The data source is specified by the ``format`` and a set of ``options``.
    If ``format`` is not specified, the default data source configured by
    ``spark.sql.sources.default`` will be used.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    path : str, optional
        the path in a Hadoop supported file system
    format : str, optional
        the format used to save
    outputMode : str, optional
        specifies how data of a streaming DataFrame/Dataset is written to a
        streaming sink.

        * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to the
          sink
        * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the
          sink every time there are some updates
        * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
          written to the sink every time there are some updates. If the query doesn't contain
          aggregations, it will be equivalent to `append` mode.
    partitionBy : str or list, optional
        names of partitioning columns
    queryName : str, optional
        unique name for the query
    **options : dict
        All other string options. You may want to provide a `checkpointLocation`
        for most streams, however it is not required for a `memory` stream.

    Returns
    -------
    The result of passing the started underlying query to ``self._sq``.

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> sq = sdf.writeStream.format('memory').queryName('this_query').start()
    >>> sq.isActive
    True
    >>> sq.name
    'this_query'
    >>> sq.stop()
    >>> sq.isActive
    False
    >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start(
    ...     queryName='that_query', outputMode="append", format='memory')
    >>> sq.name
    'that_query'
    >>> sq.isActive
    True
    >>> sq.stop()
    """
    self.options(**options)
    # Apply each optional setting that was supplied, in a fixed order.
    if outputMode is not None:
        self.outputMode(outputMode)
    if partitionBy is not None:
        self.partitionBy(partitionBy)
    if format is not None:
        self.format(format)
    if queryName is not None:
        self.queryName(queryName)
    # The path is only forwarded when one was given.
    if path is None:
        return self._sq(self._jwrite.start())
    else:
        return self._sq(self._jwrite.start(path))

def toTable(self, tableName, format=None, outputMode=None, partitionBy=None, queryName=None,
            **options):
    """
    Starts the execution of the streaming query, which will continually output results to the
    given table as new data arrives.

    The returned :class:`StreamingQuery` object can be used to interact with the stream.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    tableName : str
        string, for the name of the table.
    format : str, optional
        the format used to save.
    outputMode : str, optional
        specifies how data of a streaming DataFrame/Dataset is written to a
        streaming sink.

        * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to the
          sink
        * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the
          sink every time there are some updates
        * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
          written to the sink every time there are some updates. If the query doesn't contain
          aggregations, it will be equivalent to `append` mode.
    partitionBy : str or list, optional
        names of partitioning columns
    queryName : str, optional
        unique name for the query
    **options : dict
        All other string options. You may want to provide a `checkpointLocation`.

    Notes
    -----
    This API is evolving.

    For v1 table, partitioning columns provided by `partitionBy` will be respected no matter
    whether the table exists or not. A new table will be created if the table does not exist.

    For v2 table, `partitionBy` will be ignored if the table already exists. `partitionBy` will
    be respected only if the v2 table does not exist. Besides, the v2 table created by this API
    lacks some functionalities (e.g., customized properties, options, and serde info). If you
    need them, please create the v2 table manually before the execution to avoid creating a
    table with incomplete information.

    Examples
    --------
    >>> sdf.writeStream.format('parquet').queryName('query').toTable('output_table')
    ... # doctest: +SKIP

    >>> sdf.writeStream.trigger(processingTime='5 seconds').toTable(
    ...     'output_table',
    ...     queryName='that_query',
    ...     outputMode="append",
    ...     format='parquet',
    ...     checkpointLocation='/tmp/checkpoint') # doctest: +SKIP
    """
    self.options(**options)
    # Apply each optional setting that was supplied, in a fixed order.
    if outputMode is not None:
        self.outputMode(outputMode)
    if partitionBy is not None:
        self.partitionBy(partitionBy)
    if format is not None:
        self.format(format)
    if queryName is not None:
        self.queryName(queryName)
    return self._sq(self._jwrite.toTable(tableName))

def _test():
    """Run the doctests of ``pyspark.sql.streaming`` and exit non-zero on failure.

    Changes the working directory to ``$SPARK_HOME`` so that the relative
    ``python/test_support`` fixture path resolves, builds the globals the
    doctests refer to (``spark``, ``sqlContext``, ``sdf``, ``sdf_schema``,
    ``df``, ``os``, ``tempfile``), runs ``doctest.testmod`` and stops the
    Spark session afterwards.

    Raises
    ------
    KeyError
        If the ``SPARK_HOME`` environment variable is not set.
    SystemExit
        With status ``-1`` when at least one doctest fails.
    """
    import doctest
    import os
    import tempfile
    from pyspark.sql import SparkSession, SQLContext
    import pyspark.sql.streaming

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.streaming.__dict__.copy()
    try:
        spark = SparkSession.builder.getOrCreate()
    except py4j.protocol.Py4JError:  # noqa: F821
        spark = SparkSession(sc)  # noqa: F821

    # Stop the session even when fixture setup or doctest itself raises.
    try:
        globs['tempfile'] = tempfile
        globs['os'] = os
        globs['spark'] = spark
        globs['sqlContext'] = SQLContext.getOrCreate(spark.sparkContext)
        globs['sdf'] = \
            spark.readStream.format('text').load('python/test_support/sql/streaming')
        globs['sdf_schema'] = StructType([StructField("data", StringType(), True)])
        globs['df'] = \
            globs['spark'].readStream.format('text').load('python/test_support/sql/streaming')

        failure_count, _ = doctest.testmod(
            pyspark.sql.streaming, globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    finally:
        globs['spark'].stop()

    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()

Coverage for pyspark/sql/streaming.py : 88%

298 statements, 273 run, 25 missing, 0 excluded, 23 partial