# 

# Licensed to the Apache Software Foundation (ASF) under one or more 

# contributor license agreements. See the NOTICE file distributed with 

# this work for additional information regarding copyright ownership. 

# The ASF licenses this file to You under the Apache License, Version 2.0 

# (the "License"); you may not use this file except in compliance with 

# the License. You may obtain a copy of the License at 

# 

# http://www.apache.org/licenses/LICENSE-2.0 

# 

# Unless required by applicable law or agreed to in writing, software 

# distributed under the License is distributed on an "AS IS" BASIS, 

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

# See the License for the specific language governing permissions and 

# limitations under the License. 

# 

import sys 

 

from py4j.java_gateway import JavaClass 

 

from pyspark import RDD, since 

from pyspark.sql.column import _to_seq, _to_java_column 

from pyspark.sql.types import StructType 

from pyspark.sql import utils 

from pyspark.sql.utils import to_str 

 

__all__ = ["DataFrameReader", "DataFrameWriter"] 

 

 

class OptionUtils(object): 

 

def _set_opts(self, schema=None, **options): 

""" 

Set named options (filtering out those whose value is None) 

""" 

if schema is not None: 

self.schema(schema) 

for k, v in options.items(): 

if v is not None: 

self.option(k, v) 
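# Illustrative note (not part of the original source): with the helper above, a
# call such as reader.csv(path, sep=None, header=True) forwards only
# option("header", True) to the underlying JVM reader, because options whose
# value is None are dropped before being applied.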

 

 

class DataFrameReader(OptionUtils): 

""" 

Interface used to load a :class:`DataFrame` from external storage systems 

(e.g. file systems, key-value stores, etc). Use :attr:`SparkSession.read` 

to access this. 

 

.. versionadded:: 1.4 

""" 

 

def __init__(self, spark): 

self._jreader = spark._ssql_ctx.read() 

self._spark = spark 

 

def _df(self, jdf): 

from pyspark.sql.dataframe import DataFrame 

return DataFrame(jdf, self._spark) 

 

def format(self, source): 

"""Specifies the input data source format. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

source : str 

string, name of the data source, e.g. 'json', 'parquet'. 

 

Examples 

-------- 

>>> df = spark.read.format('json').load('python/test_support/sql/people.json') 

>>> df.dtypes 

[('age', 'bigint'), ('name', 'string')] 

 

""" 

self._jreader = self._jreader.format(source) 

return self 

 

def schema(self, schema): 

"""Specifies the input schema. 

 

Some data sources (e.g. JSON) can infer the input schema automatically from data. 

By specifying the schema here, the underlying data source can skip the schema 

inference step, and thus speed up data loading. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

schema : :class:`pyspark.sql.types.StructType` or str 

a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string 

(For example ``col0 INT, col1 DOUBLE``). 

 

>>> s = spark.read.schema("col0 INT, col1 DOUBLE") 

""" 

from pyspark.sql import SparkSession 

spark = SparkSession.builder.getOrCreate() 

if isinstance(schema, StructType): 

jschema = spark._jsparkSession.parseDataType(schema.json()) 

self._jreader = self._jreader.schema(jschema) 

elif isinstance(schema, str): 

self._jreader = self._jreader.schema(schema) 

else: 

raise TypeError("schema should be StructType or string") 

return self 

 

@since(1.5) 

def option(self, key, value): 

"""Adds an input option for the underlying data source. 

""" 

self._jreader = self._jreader.option(key, to_str(value)) 

return self 

 

@since(1.4) 

def options(self, **options): 

"""Adds input options for the underlying data source. 

""" 

for k in options: 

self._jreader = self._jreader.option(k, to_str(options[k])) 

return self 
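# Illustrative sketch (not part of the original source): option() and options()
# return the reader itself, so they can be chained before load(). The path
# below is a hypothetical placeholder.
#
#   df = (spark.read.format("csv")
#         .option("header", True)
#         .options(inferSchema=True, sep=";")
#         .load("/tmp/example/people.csv"))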

 

def load(self, path=None, format=None, schema=None, **options): 

"""Loads data from a data source and returns it as a :class:`DataFrame`. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

path : str or list, optional 

optional string or list of strings for file-system backed data sources. 

format : str, optional 

optional string for the format of the data source. Defaults to 'parquet'. 

schema : :class:`pyspark.sql.types.StructType` or str, optional 

optional :class:`pyspark.sql.types.StructType` for the input schema 

or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). 

**options : dict 

all other string options 

 

Examples 

-------- 

>>> df = spark.read.format("parquet").load('python/test_support/sql/parquet_partitioned', 

... opt1=True, opt2=1, opt3='str') 

>>> df.dtypes 

[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] 

 

>>> df = spark.read.format('json').load(['python/test_support/sql/people.json', 

... 'python/test_support/sql/people1.json']) 

>>> df.dtypes 

[('age', 'bigint'), ('aka', 'string'), ('name', 'string')] 

""" 

if format is not None: 

self.format(format) 

if schema is not None: 

self.schema(schema) 

self.options(**options) 

if isinstance(path, str): 

return self._df(self._jreader.load(path)) 

elif path is not None: 

if type(path) != list: 

path = [path] 

return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path))) 

else: 

return self._df(self._jreader.load()) 

 

def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, 

allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None, 

allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, 

mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, 

multiLine=None, allowUnquotedControlChars=None, lineSep=None, samplingRatio=None, 

dropFieldIfAllNull=None, encoding=None, locale=None, pathGlobFilter=None, 

recursiveFileLookup=None, allowNonNumericNumbers=None, 

modifiedBefore=None, modifiedAfter=None): 

""" 

Loads JSON files and returns the results as a :class:`DataFrame`. 

 

`JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default. 

For JSON (one record per file), set the ``multiLine`` parameter to ``true``. 

 

If the ``schema`` parameter is not specified, this function goes 

through the input once to determine the input schema. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

path : str, list or :class:`RDD` 

string representing the path to the JSON dataset, a list of paths, 

or an RDD of strings storing JSON objects. 

schema : :class:`pyspark.sql.types.StructType` or str, optional 

an optional :class:`pyspark.sql.types.StructType` for the input schema or 

a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df1 = spark.read.json('python/test_support/sql/people.json') 

>>> df1.dtypes 

[('age', 'bigint'), ('name', 'string')] 

>>> rdd = sc.textFile('python/test_support/sql/people.json') 

>>> df2 = spark.read.json(rdd) 

>>> df2.dtypes 

[('age', 'bigint'), ('name', 'string')] 

 

""" 

self._set_opts( 

schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal, 

allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames, 

allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, 

allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, 

mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, 

timestampFormat=timestampFormat, multiLine=multiLine, 

allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, 

samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding, 

locale=locale, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup, 

modifiedBefore=modifiedBefore, modifiedAfter=modifiedAfter, 

allowNonNumericNumbers=allowNonNumericNumbers) 

if isinstance(path, str): 

path = [path] 

if type(path) == list: 

return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path))) 

elif isinstance(path, RDD): 

def func(iterator): 

for x in iterator: 

if not isinstance(x, str): 

x = str(x) 

if isinstance(x, str): 

x = x.encode("utf-8") 

yield x 

keyed = path.mapPartitions(func) 

keyed._bypass_serializer = True 

jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString()) 

return self._df(self._jreader.json(jrdd)) 

else: 

raise TypeError("path can be only string, list or RDD") 

 

def table(self, tableName): 

"""Returns the specified table as a :class:`DataFrame`. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

tableName : str 

string, name of the table. 

 

Examples 

-------- 

>>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned') 

>>> df.createOrReplaceTempView('tmpTable') 

>>> spark.read.table('tmpTable').dtypes 

[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] 

""" 

return self._df(self._jreader.table(tableName)) 

 

def parquet(self, *paths, **options): 

""" 

Loads Parquet files, returning the result as a :class:`DataFrame`. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

paths : str 

 

Other Parameters 

---------------- 

**options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned') 

>>> df.dtypes 

[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] 

""" 

mergeSchema = options.get('mergeSchema', None) 

pathGlobFilter = options.get('pathGlobFilter', None) 

modifiedBefore = options.get('modifiedBefore', None) 

modifiedAfter = options.get('modifiedAfter', None) 

recursiveFileLookup = options.get('recursiveFileLookup', None) 

datetimeRebaseMode = options.get('datetimeRebaseMode', None) 

int96RebaseMode = options.get('int96RebaseMode', None) 

self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, 

recursiveFileLookup=recursiveFileLookup, modifiedBefore=modifiedBefore, 

modifiedAfter=modifiedAfter, datetimeRebaseMode=datetimeRebaseMode, 

int96RebaseMode=int96RebaseMode) 

 

return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths))) 

 

def text(self, paths, wholetext=False, lineSep=None, pathGlobFilter=None, 

recursiveFileLookup=None, modifiedBefore=None, 

modifiedAfter=None): 

""" 

Loads text files and returns a :class:`DataFrame` whose schema starts with a 

string column named "value", followed by partitioned columns if there 

are any. 

The text files must be encoded as UTF-8. 

 

By default, each line in the text file is a new row in the resulting DataFrame. 

 

.. versionadded:: 1.6.0 

 

Parameters 

---------- 

paths : str or list 

string, or list of strings, for input path(s). 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df = spark.read.text('python/test_support/sql/text-test.txt') 

>>> df.collect() 

[Row(value='hello'), Row(value='this')] 

>>> df = spark.read.text('python/test_support/sql/text-test.txt', wholetext=True) 

>>> df.collect() 

[Row(value='hello\\nthis')] 

""" 

self._set_opts( 

wholetext=wholetext, lineSep=lineSep, pathGlobFilter=pathGlobFilter, 

recursiveFileLookup=recursiveFileLookup, modifiedBefore=modifiedBefore, 

modifiedAfter=modifiedAfter) 

 

if isinstance(paths, str): 

paths = [paths] 

return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths))) 

 

def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, 

comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, 

ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, 

negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, 

maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, 

columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, 

samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, 

pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None, 

unescapedQuoteHandling=None): 

r"""Loads a CSV file and returns the result as a :class:`DataFrame`. 

 

This function will go through the input once to determine the input schema if 

``inferSchema`` is enabled. To avoid going through the entire data once, disable 

``inferSchema`` option or specify the schema explicitly using ``schema``. 

 

.. versionadded:: 2.0.0 

 

Parameters 

---------- 

path : str or list 

string, or list of strings, for input path(s), 

or an RDD of strings storing CSV rows. 

schema : :class:`pyspark.sql.types.StructType` or str, optional 

an optional :class:`pyspark.sql.types.StructType` for the input schema 

or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df = spark.read.csv('python/test_support/sql/ages.csv') 

>>> df.dtypes 

[('_c0', 'string'), ('_c1', 'string')] 

>>> rdd = sc.textFile('python/test_support/sql/ages.csv') 

>>> df2 = spark.read.csv(rdd) 

>>> df2.dtypes 

[('_c0', 'string'), ('_c1', 'string')] 

""" 

self._set_opts( 

schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment, 

header=header, inferSchema=inferSchema, ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, 

ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, nullValue=nullValue, 

nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf, 

dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns, 

maxCharsPerColumn=maxCharsPerColumn, 

maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, 

columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, 

charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, 

enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, 

pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup, 

modifiedBefore=modifiedBefore, modifiedAfter=modifiedAfter, 

unescapedQuoteHandling=unescapedQuoteHandling) 

if isinstance(path, str): 

path = [path] 

if type(path) == list: 

return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) 

elif isinstance(path, RDD): 

def func(iterator): 

for x in iterator: 

if not isinstance(x, str): 

x = str(x) 

if isinstance(x, str): 

x = x.encode("utf-8") 

yield x 

keyed = path.mapPartitions(func) 

keyed._bypass_serializer = True 

jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString()) 

# see SPARK-22112 

# There is no JVM API for creating a DataFrame directly from an RDD storing CSV, 

# so we first create a JVM dataset of strings and then use the JVM API 

# to create a DataFrame from that dataset. 

jdataset = self._spark._ssql_ctx.createDataset( 

jrdd.rdd(), 

self._spark._jvm.Encoders.STRING()) 

return self._df(self._jreader.csv(jdataset)) 

else: 

raise TypeError("path can be only string, list or RDD") 

 

def orc(self, path, mergeSchema=None, pathGlobFilter=None, recursiveFileLookup=None, 

modifiedBefore=None, modifiedAfter=None): 

"""Loads ORC files, returning the result as a :class:`DataFrame`. 

 

.. versionadded:: 1.5.0 

 

Parameters 

---------- 

path : str or list 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df = spark.read.orc('python/test_support/sql/orc_partitioned') 

>>> df.dtypes 

[('a', 'bigint'), ('b', 'int'), ('c', 'int')] 

""" 

self._set_opts(mergeSchema=mergeSchema, pathGlobFilter=pathGlobFilter, 

modifiedBefore=modifiedBefore, modifiedAfter=modifiedAfter, 

recursiveFileLookup=recursiveFileLookup) 

if isinstance(path, str): 

path = [path] 

return self._df(self._jreader.orc(_to_seq(self._spark._sc, path))) 

 

def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None, 

predicates=None, properties=None): 

""" 

Construct a :class:`DataFrame` representing the database table named ``table`` 

accessible via JDBC URL ``url`` and connection ``properties``. 

 

Partitions of the table will be retrieved in parallel if either ``column`` or 

``predicates`` is specified. ``lowerBound``, ``upperBound`` and ``numPartitions`` 

are needed when ``column`` is specified. 

 

If both ``column`` and ``predicates`` are specified, ``column`` will be used. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

table : str 

the name of the table 

column : str, optional 

alias of ``partitionColumn`` option. Refer to ``partitionColumn`` in 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_ 

in the version you use. 

predicates : list, optional 

a list of expressions suitable for inclusion in WHERE clauses; 

each one defines one partition of the :class:`DataFrame` 

properties : dict, optional 

a dictionary of JDBC database connection arguments. Normally at 

least properties "user" and "password" with their corresponding values. 

For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Notes 

----- 

Don't create too many partitions in parallel on a large cluster; 

otherwise Spark might crash your external database systems. 

 

Returns 

------- 

:class:`DataFrame` 

""" 

if properties is None: 

properties = dict() 

jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)() 

for k in properties: 

jprop.setProperty(k, properties[k]) 

if column is not None: 

assert lowerBound is not None, "lowerBound can not be None when ``column`` is specified" 

assert upperBound is not None, "upperBound can not be None when ``column`` is specified" 

assert numPartitions is not None, \ 

"numPartitions can not be None when ``column`` is specified" 

return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound), 

int(numPartitions), jprop)) 

if predicates is not None: 

gateway = self._spark._sc._gateway 

jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates) 

return self._df(self._jreader.jdbc(url, table, jpredicates, jprop)) 

return self._df(self._jreader.jdbc(url, table, jprop)) 
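# Illustrative sketch (not part of the original source): a partitioned JDBC read.
# The URL, table name, and credentials below are hypothetical placeholders.
#
#   df = spark.read.jdbc(
#       url="jdbc:postgresql://db-host:5432/shop",
#       table="orders",
#       column="order_id", lowerBound=1, upperBound=1000000, numPartitions=8,
#       properties={"user": "SYSTEM", "password": "mypassword"})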

 

 

class DataFrameWriter(OptionUtils): 

""" 

Interface used to write a :class:`DataFrame` to external storage systems 

(e.g. file systems, key-value stores, etc). Use :attr:`DataFrame.write` 

to access this. 

 

.. versionadded:: 1.4 

""" 

def __init__(self, df): 

self._df = df 

self._spark = df.sql_ctx 

self._jwrite = df._jdf.write() 

 

def _sq(self, jsq): 

from pyspark.sql.streaming import StreamingQuery 

return StreamingQuery(jsq) 

 

def mode(self, saveMode): 

"""Specifies the behavior when data or table already exists. 

 

Options include: 

 

* `append`: Append contents of this :class:`DataFrame` to existing data. 

* `overwrite`: Overwrite existing data. 

* `error` or `errorifexists`: Throw an exception if data already exists. 

* `ignore`: Silently ignore this operation if data already exists. 

 

.. versionadded:: 1.4.0 

 

Examples 

-------- 

>>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

# At the JVM side, the default value of mode is already set to "error". 

# So, if the given saveMode is None, we will not call JVM-side's mode method. 

if saveMode is not None: 

self._jwrite = self._jwrite.mode(saveMode) 

return self 

 

def format(self, source): 

"""Specifies the underlying output data source. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

source : str 

string, name of the data source, e.g. 'json', 'parquet'. 

 

Examples 

-------- 

>>> df.write.format('json').save(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

self._jwrite = self._jwrite.format(source) 

return self 

 

@since(1.5) 

def option(self, key, value): 

"""Adds an output option for the underlying data source. 

""" 

self._jwrite = self._jwrite.option(key, to_str(value)) 

return self 

 

@since(1.4) 

def options(self, **options): 

"""Adds output options for the underlying data source. 

""" 

for k in options: 

self._jwrite = self._jwrite.option(k, to_str(options[k])) 

return self 

 

def partitionBy(self, *cols): 

"""Partitions the output by the given columns on the file system. 

 

If specified, the output is laid out on the file system similar 

to Hive's partitioning scheme. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

cols : str or list 

name of columns 

 

Examples 

-------- 

>>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

if len(cols) == 1 and isinstance(cols[0], (list, tuple)): 

cols = cols[0] 

self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols)) 

return self 

 

def bucketBy(self, numBuckets, col, *cols): 

"""Buckets the output by the given columns. If specified, 

the output is laid out on the file system similar to Hive's bucketing scheme, 

but with a different bucket hash function, and it is not compatible with Hive's bucketing. 

 

.. versionadded:: 2.3.0 

 

Parameters 

---------- 

numBuckets : int 

the number of buckets to save 

col : str, list or tuple 

a name of a column, or a list of names. 

cols : str 

additional names (optional). If `col` is a list it should be empty. 

 

Notes 

----- 

Applicable for file-based data sources in combination with 

:py:meth:`DataFrameWriter.saveAsTable`. 

 

Examples 

-------- 

>>> (df.write.format('parquet') # doctest: +SKIP 

... .bucketBy(100, 'year', 'month') 

... .mode("overwrite") 

... .saveAsTable('bucketed_table')) 

""" 

if not isinstance(numBuckets, int): 

raise TypeError("numBuckets should be an int, got {0}.".format(type(numBuckets))) 

 

if isinstance(col, (list, tuple)): 

if cols: 

raise ValueError("col is a {0} but cols are not empty".format(type(col))) 

 

col, cols = col[0], col[1:] 

 

if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)): 

raise TypeError("all names should be `str`") 

 

self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols)) 

return self 

 

def sortBy(self, col, *cols): 

"""Sorts the output in each bucket by the given columns on the file system. 

 

.. versionadded:: 2.3.0 

 

Parameters 

---------- 

col : str, tuple or list 

a name of a column, or a list of names. 

cols : str 

additional names (optional). If `col` is a list it should be empty. 

 

Examples 

-------- 

>>> (df.write.format('parquet') # doctest: +SKIP 

... .bucketBy(100, 'year', 'month') 

... .sortBy('day') 

... .mode("overwrite") 

... .saveAsTable('sorted_bucketed_table')) 

""" 

if isinstance(col, (list, tuple)): 

if cols: 

raise ValueError("col is a {0} but cols are not empty".format(type(col))) 

 

col, cols = col[0], col[1:] 

 

if not all(isinstance(c, str) for c in cols) or not(isinstance(col, str)): 

raise TypeError("all names should be `str`") 

 

self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols)) 

return self 

 

def save(self, path=None, format=None, mode=None, partitionBy=None, **options): 

"""Saves the contents of the :class:`DataFrame` to a data source. 

 

The data source is specified by the ``format`` and a set of ``options``. 

If ``format`` is not specified, the default data source configured by 

``spark.sql.sources.default`` will be used. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

path : str, optional 

the path in a Hadoop supported file system 

format : str, optional 

the format used to save 

mode : str, optional 

specifies the behavior of the save operation when data already exists. 

 

* ``append``: Append contents of this :class:`DataFrame` to existing data. 

* ``overwrite``: Overwrite existing data. 

* ``ignore``: Silently ignore this operation if data already exists. 

* ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ 

exists. 

partitionBy : list, optional 

names of partitioning columns 

**options : dict 

all other string options 

 

Examples 

-------- 

>>> df.write.mode("append").save(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

self.mode(mode).options(**options) 

if partitionBy is not None: 

self.partitionBy(partitionBy) 

if format is not None: 

self.format(format) 

if path is None: 

self._jwrite.save() 

else: 

self._jwrite.save(path) 

 

@since(1.4) 

def insertInto(self, tableName, overwrite=None): 

"""Inserts the content of the :class:`DataFrame` to the specified table. 

 

It requires that the schema of the :class:`DataFrame` is the same as the 

schema of the table. 

 

Parameters 

---------- 

overwrite : bool, optional 

If true, overwrites existing data. Disabled by default 

 

Notes 

----- 

Unlike :meth:`DataFrameWriter.saveAsTable`, :meth:`DataFrameWriter.insertInto` ignores 

the column names and just uses position-based resolution. 

 

""" 

if overwrite is not None: 

self.mode("overwrite" if overwrite else "append") 

self._jwrite.insertInto(tableName) 
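# Illustrative sketch (not part of the original source): because insertInto()
# resolves columns by position, the DataFrame's column order must match the
# target table's. The table name below is a hypothetical placeholder.
#
#   df.select("name", "year", "month", "day").write.insertInto("events", overwrite=False)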

 

def saveAsTable(self, name, format=None, mode=None, partitionBy=None, **options): 

"""Saves the content of the :class:`DataFrame` as the specified table. 

 

If the table already exists, the behavior of this function depends on the 

save mode specified by the `mode` function (defaulting to throwing an exception). 

When `mode` is `Overwrite`, the schema of the :class:`DataFrame` does not need to be 

the same as that of the existing table. 

 

* `append`: Append contents of this :class:`DataFrame` to existing data. 

* `overwrite`: Overwrite existing data. 

* `error` or `errorifexists`: Throw an exception if data already exists. 

* `ignore`: Silently ignore this operation if data already exists. 

 

.. versionadded:: 1.4.0 

 

Notes 

----- 

When `mode` is `Append`, if there is an existing table, we will use the format and 

options of the existing table. The column order in the schema of the :class:`DataFrame` 

doesn't need to be the same as that of the existing table. Unlike 

:meth:`DataFrameWriter.insertInto`, :meth:`DataFrameWriter.saveAsTable` will use the 

column names to find the correct column positions. 

 

Parameters 

---------- 

name : str 

the table name 

format : str, optional 

the format used to save 

mode : str, optional 

one of `append`, `overwrite`, `error`, `errorifexists`, `ignore` \ 

(default: error) 

partitionBy : str or list 

names of partitioning columns 

**options : dict 

all other string options 

""" 

self.mode(mode).options(**options) 

if partitionBy is not None: 

self.partitionBy(partitionBy) 

if format is not None: 

self.format(format) 

self._jwrite.saveAsTable(name) 
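# Illustrative sketch (not part of the original source): saveAsTable() resolves
# columns by name. The table name below is a hypothetical placeholder.
#
#   df.write.format("parquet").mode("overwrite").saveAsTable("warehouse_events")
#   # or, equivalently, using the keyword arguments of this method:
#   df.write.saveAsTable("warehouse_events", format="parquet", mode="overwrite")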

 

def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None, 

lineSep=None, encoding=None, ignoreNullFields=None): 

"""Saves the content of the :class:`DataFrame` in JSON format 

(`JSON Lines text format or newline-delimited JSON <http://jsonlines.org/>`_) at the 

specified path. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

path : str 

the path in any Hadoop supported file system 

mode : str, optional 

specifies the behavior of the save operation when data already exists. 

 

* ``append``: Append contents of this :class:`DataFrame` to existing data. 

* ``overwrite``: Overwrite existing data. 

* ``ignore``: Silently ignore this operation if data already exists. 

* ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ 

exists. 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

self.mode(mode) 

self._set_opts( 

compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat, 

lineSep=lineSep, encoding=encoding, ignoreNullFields=ignoreNullFields) 

self._jwrite.json(path) 

 

def parquet(self, path, mode=None, partitionBy=None, compression=None): 

"""Saves the content of the :class:`DataFrame` in Parquet format at the specified path. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

path : str 

the path in any Hadoop supported file system 

mode : str, optional 

specifies the behavior of the save operation when data already exists. 

 

* ``append``: Append contents of this :class:`DataFrame` to existing data. 

* ``overwrite``: Overwrite existing data. 

* ``ignore``: Silently ignore this operation if data already exists. 

* ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ 

exists. 

partitionBy : str or list, optional 

names of partitioning columns 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

self.mode(mode) 

if partitionBy is not None: 

self.partitionBy(partitionBy) 

self._set_opts(compression=compression) 

self._jwrite.parquet(path) 

 

def text(self, path, compression=None, lineSep=None): 

"""Saves the content of the DataFrame in a text file at the specified path. 

The text files will be encoded as UTF-8. 

 

.. versionadded:: 1.6.0 

 

Parameters 

---------- 

path : str 

the path in any Hadoop supported file system 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-text.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

The DataFrame must have only one column that is of string type. 

Each row becomes a new line in the output file. 

""" 

self._set_opts(compression=compression, lineSep=lineSep) 

self._jwrite.text(path) 
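# Illustrative sketch (not part of the original source): the DataFrame passed to
# text() must have exactly one string column. The output path is hypothetical.
#
#   df.select("name").write.text(os.path.join(tempfile.mkdtemp(), "data"))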

 

def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None, 

header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None, 

timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, 

charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None, lineSep=None): 

r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path. 

 

.. versionadded:: 2.0.0 

 

Parameters 

---------- 

path : str 

the path in any Hadoop supported file system 

mode : str, optional 

specifies the behavior of the save operation when data already exists. 

 

* ``append``: Append contents of this :class:`DataFrame` to existing data. 

* ``overwrite``: Overwrite existing data. 

* ``ignore``: Silently ignore this operation if data already exists. 

* ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ 

exists. 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

self.mode(mode) 

self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header, 

nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll, 

dateFormat=dateFormat, timestampFormat=timestampFormat, 

ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, 

ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, 

charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, 

encoding=encoding, emptyValue=emptyValue, lineSep=lineSep) 

self._jwrite.csv(path) 

 

def orc(self, path, mode=None, partitionBy=None, compression=None): 

"""Saves the content of the :class:`DataFrame` in ORC format at the specified path. 

 

.. versionadded:: 1.5.0 

 

Parameters 

---------- 

path : str 

the path in any Hadoop supported file system 

mode : str, optional 

specifies the behavior of the save operation when data already exists. 

 

* ``append``: Append contents of this :class:`DataFrame` to existing data. 

* ``overwrite``: Overwrite existing data. 

* ``ignore``: Silently ignore this operation if data already exists. 

* ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ 

exists. 

partitionBy : str or list, optional 

names of partitioning columns 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-orc.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Examples 

-------- 

>>> orc_df = spark.read.orc('python/test_support/sql/orc_partitioned') 

>>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data')) 

""" 

self.mode(mode) 

if partitionBy is not None: 

self.partitionBy(partitionBy) 

self._set_opts(compression=compression) 

self._jwrite.orc(path) 

 

def jdbc(self, url, table, mode=None, properties=None): 

"""Saves the content of the :class:`DataFrame` to an external database table via JDBC. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

table : str 

Name of the table in the external database. 

mode : str, optional 

specifies the behavior of the save operation when data already exists. 

 

* ``append``: Append contents of this :class:`DataFrame` to existing data. 

* ``overwrite``: Overwrite existing data. 

* ``ignore``: Silently ignore this operation if data already exists. 

* ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ 

exists. 

properties : dict 

a dictionary of JDBC database connection arguments. Normally at 

least properties "user" and "password" with their corresponding values. 

For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } 

 

Other Parameters 

---------------- 

Extra options 

For the extra options, refer to 

`Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_ 

in the version you use. 

 

.. # noqa 

 

Notes 

----- 

Don't create too many partitions in parallel on a large cluster; 

otherwise Spark might crash your external database systems. 

""" 

if properties is None: 

properties = dict() 

jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)() 

for k in properties: 

jprop.setProperty(k, properties[k]) 

self.mode(mode)._jwrite.jdbc(url, table, jprop) 
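# Illustrative sketch (not part of the original source): appending a DataFrame
# to an external table over JDBC. The URL, table name, and credentials are
# hypothetical placeholders.
#
#   df.write.jdbc(
#       url="jdbc:postgresql://db-host:5432/shop",
#       table="orders_archive",
#       mode="append",
#       properties={"user": "SYSTEM", "password": "mypassword"})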

 

 

class DataFrameWriterV2(object): 

""" 

Interface used to write a :class:`pyspark.sql.dataframe.DataFrame` 

to external storage using the v2 API. 

 

.. versionadded:: 3.1.0 

""" 

 

def __init__(self, df, table): 

self._df = df 

self._spark = df.sql_ctx 

self._jwriter = df._jdf.writeTo(table) 

 

@since(3.1) 

def using(self, provider): 

""" 

Specifies a provider for the underlying output data source. 

Spark's default catalog supports "parquet", "json", etc. 

""" 

self._jwriter.using(provider) 

return self 

 

@since(3.1) 

def option(self, key, value): 

""" 

Add a write option. 

""" 

self._jwriter.option(key, to_str(value)) 

return self 

 

@since(3.1) 

def options(self, **options): 

""" 

Add write options. 

""" 

options = {k: to_str(v) for k, v in options.items()} 

self._jwriter.options(options) 

return self 

 

@since(3.1) 

def tableProperty(self, property, value): 

""" 

Add table property. 

""" 

self._jwriter.tableProperty(property, value) 

return self 

 

@since(3.1) 

def partitionedBy(self, col, *cols): 

""" 

Partition the output table created by `create`, `createOrReplace`, or `replace` using 

the given columns or transforms. 

 

When specified, the table data will be stored by these values for efficient reads. 

 

For example, when a table is partitioned by day, it may be stored 

in a directory layout like: 

 

* `table/day=2019-06-01/` 

* `table/day=2019-06-02/` 

 

Partitioning is one of the most widely used techniques to optimize physical data layout. 

It provides a coarse-grained index for skipping unnecessary data reads when queries have 

predicates on the partitioned columns. In order for partitioning to work well, the number 

of distinct values in each column should typically be less than tens of thousands. 

 

`col` and `cols` support only the following functions: 

 

* :py:func:`pyspark.sql.functions.years` 

* :py:func:`pyspark.sql.functions.months` 

* :py:func:`pyspark.sql.functions.days` 

* :py:func:`pyspark.sql.functions.hours` 

* :py:func:`pyspark.sql.functions.bucket` 

 

""" 

col = _to_java_column(col) 

cols = _to_seq(self._spark._sc, [_to_java_column(c) for c in cols]) 

# Pass the converted partition expressions through to the underlying Java writer. 
self._jwriter.partitionedBy(col, cols) 

return self 
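# Illustrative sketch (not part of the original source): creating a table
# partitioned by day through the v2 writer. The catalog/table name and the
# "ts" column are hypothetical placeholders.
#
#   from pyspark.sql.functions import days
#   df.writeTo("catalog.db.events").using("parquet").partitionedBy(days("ts")).create()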

 

@since(3.1) 

def create(self): 

""" 

Create a new table from the contents of the data frame. 

 

The new table's schema, partition layout, properties, and other configuration will be 

based on the configuration set on this writer. 

""" 

self._jwriter.create() 

 

@since(3.1) 

def replace(self): 

""" 

Replace an existing table with the contents of the data frame. 

 

The existing table's schema, partition layout, properties, and other configuration will be 

replaced with the contents of the data frame and the configuration set on this writer. 

""" 

self._jwriter.replace() 

 

@since(3.1) 

def createOrReplace(self): 

""" 

Create a new table or replace an existing table with the contents of the data frame. 

 

The output table's schema, partition layout, properties, 

and other configuration will be based on the contents of the data frame 

and the configuration set on this writer. 

If the table exists, its configuration and data will be replaced. 

""" 

self._jwriter.createOrReplace() 

 

@since(3.1) 

def append(self): 

""" 

Append the contents of the data frame to the output table. 

""" 

self._jwriter.append() 

 

@since(3.1) 

def overwrite(self, condition): 

""" 

Overwrite rows matching the given filter condition with the contents of the data frame in 

the output table. 

""" 

self._jwriter.overwrite(condition) 
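# Illustrative sketch (not part of the original source): replacing a single
# day's rows in the output table. The table name and column are hypothetical.
#
#   from pyspark.sql.functions import col, lit
#   df.writeTo("catalog.db.events").overwrite(col("day") == lit("2019-06-01"))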

 

@since(3.1) 

def overwritePartitions(self): 

""" 

Overwrite all partitions for which the data frame contains at least one row with the contents 

of the data frame in the output table. 

 

This operation is equivalent to Hive's `INSERT OVERWRITE ... PARTITION`, which replaces 

partitions dynamically depending on the contents of the data frame. 

""" 

self._jwriter.overwritePartitions() 

 

 

def _test(): 

import doctest 

import os 

import tempfile 

import py4j 

from pyspark.context import SparkContext 

from pyspark.sql import SparkSession 

import pyspark.sql.readwriter 

 

os.chdir(os.environ["SPARK_HOME"]) 

 

globs = pyspark.sql.readwriter.__dict__.copy() 

sc = SparkContext('local[4]', 'PythonTest') 

try: 

spark = SparkSession.builder.getOrCreate() 

except py4j.protocol.Py4JError: 

spark = SparkSession(sc) 

 

globs['tempfile'] = tempfile 

globs['os'] = os 

globs['sc'] = sc 

globs['spark'] = spark 

globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned') 

(failure_count, test_count) = doctest.testmod( 

pyspark.sql.readwriter, globs=globs, 

optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) 

sc.stop() 

if failure_count: 

sys.exit(-1) 

 

 

if __name__ == "__main__": 

_test()