#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
UTF8Deserializer
"""A distributed collection of data grouped into named columns.
A :class:`DataFrame` is equivalent to a relational table in Spark SQL, and can be created using various functions in :class:`SparkSession`::
people = spark.read.parquet("...")
Once created, it can be manipulated using the various domain-specific-language (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.
To select a column from the :class:`DataFrame`, use the apply method::
ageCol = people.age
A more concrete example::
    # To create DataFrame using SparkSession
    people = spark.read.parquet("...")
    department = spark.read.parquet("...")

    people.filter(people.age > 30).join(department, people.deptId == department.id) \\
        .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
.. versionadded:: 1.3.0 """
# Check whether _repr_html is supported or not; we use it to avoid calling _jdf twice
# by __repr__ and _repr_html_ while eager evaluation is enabled.
def rdd(self): """Returns the content as an :class:`pyspark.RDD` of :class:`Row`. """
def na(self): """Returns a :class:`DataFrameNaFunctions` for handling missing values. """
def stat(self): """Returns a :class:`DataFrameStatFunctions` for statistic functions. """
"""Converts a :class:`DataFrame` into a :class:`RDD` of string.
Each row is turned into a JSON document as one element in the returned RDD.
.. versionadded:: 1.3.0
Examples
--------
>>> df.toJSON().first()
'{"age":2,"name":"Alice"}'
"""
"""Registers this :class:`DataFrame` as a temporary table using the given name.
The lifetime of this temporary table is tied to the :class:`SparkSession` that was used to create this :class:`DataFrame`.
.. versionadded:: 1.3.0
.. deprecated:: 2.0.0
    Use :meth:`DataFrame.createOrReplaceTempView` instead.

Examples
--------
>>> df.registerTempTable("people")
>>> df2 = spark.sql("select * from people")
>>> sorted(df.collect()) == sorted(df2.collect())
True
>>> spark.catalog.dropTempView("people")
"""
warnings.warn(
    "Deprecated in 2.0, use createOrReplaceTempView instead.",
    FutureWarning
)
"""Creates a local temporary view with this :class:`DataFrame`.
The lifetime of this temporary table is tied to the :class:`SparkSession` that was used to create this :class:`DataFrame`. throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the catalog.
.. versionadded:: 2.0.0
Examples
--------
>>> df.createTempView("people")
>>> df2 = spark.sql("select * from people")
>>> sorted(df.collect()) == sorted(df2.collect())
True
>>> df.createTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
...
AnalysisException: u"Temporary table 'people' already exists;"
>>> spark.catalog.dropTempView("people")
"""
"""Creates or replaces a local temporary view with this :class:`DataFrame`.
The lifetime of this temporary table is tied to the :class:`SparkSession` that was used to create this :class:`DataFrame`.
.. versionadded:: 2.0.0
Examples -------- >>> df.createOrReplaceTempView("people") >>> df2 = df.filter(df.age > 3) >>> df2.createOrReplaceTempView("people") >>> df3 = spark.sql("select * from people") >>> sorted(df3.collect()) == sorted(df2.collect()) True >>> spark.catalog.dropTempView("people")
"""
"""Creates a global temporary view with this :class:`DataFrame`.
The lifetime of this temporary view is tied to this Spark application. throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the catalog.
.. versionadded:: 2.1.0
Examples -------- >>> df.createGlobalTempView("people") >>> df2 = spark.sql("select * from global_temp.people") >>> sorted(df.collect()) == sorted(df2.collect()) True >>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... AnalysisException: u"Temporary table 'people' already exists;" >>> spark.catalog.dropGlobalTempView("people")
"""
"""Creates or replaces a global temporary view using the given name.
The lifetime of this temporary view is tied to this Spark application.
.. versionadded:: 2.2.0
Examples -------- >>> df.createOrReplaceGlobalTempView("people") >>> df2 = df.filter(df.age > 3) >>> df2.createOrReplaceGlobalTempView("people") >>> df3 = spark.sql("select * from global_temp.people") >>> sorted(df3.collect()) == sorted(df2.collect()) True >>> spark.catalog.dropGlobalTempView("people")
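As a rough illustration of how the two view scopes above differ, a session-scoped view disappears with the :class:`SparkSession` that created it, while a global temporary view lives in the reserved ``global_temp`` database for the lifetime of the Spark application. A minimal sketch, assuming a running ``spark`` session::

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])

    # Session-scoped: visible only through this SparkSession.
    df.createOrReplaceTempView("people")
    spark.sql("SELECT * FROM people").show()

    # Application-scoped: visible to other sessions via the global_temp database.
    df.createOrReplaceGlobalTempView("people_global")
    other_session = spark.newSession()
    other_session.sql("SELECT * FROM global_temp.people_global").show()

    spark.catalog.dropTempView("people")
    spark.catalog.dropGlobalTempView("people_global")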
"""
def write(self): """ Interface for saving the content of the non-streaming :class:`DataFrame` out into external storage.
.. versionadded:: 1.4.0
Returns ------- :class:`DataFrameWriter` """
def writeStream(self): """ Interface for saving the content of the streaming :class:`DataFrame` out into external storage.
.. versionadded:: 2.0.0
Notes ----- This API is evolving.
Returns ------- :class:`DataStreamWriter` """
def schema(self): """Returns the schema of this :class:`DataFrame` as a :class:`pyspark.sql.types.StructType`.
.. versionadded:: 1.3.0
Examples
--------
>>> df.schema
StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
"""
except Exception as e:
    raise ValueError(
        "Unable to parse datatype from schema. %s" % e) from e
"""Prints out the schema in the tree format.
.. versionadded:: 1.3.0
Examples
--------
>>> df.printSchema()
root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)
<BLANKLINE>
"""
"""Prints the (logical and physical) plans to the console for debugging purpose.
.. versionadded:: 1.3.0
Parameters
----------
extended : bool, optional
    default ``False``. If ``False``, prints only the physical plan.
    When this is a string without specifying the ``mode``, it works as the
    mode is specified.
mode : str, optional
    specifies the expected output format of plans.

    * ``simple``: Print only a physical plan.
    * ``extended``: Print both logical and physical plans.
    * ``codegen``: Print a physical plan and generated codes if they are available.
    * ``cost``: Print a logical plan and statistics if they are available.
    * ``formatted``: Split explain output into two sections: a physical plan outline
      and node details.
.. versionchanged:: 3.0.0 Added optional argument `mode` to specify the expected output format of plans.
Examples
--------
>>> df.explain()
== Physical Plan ==
*(1) Scan ExistingRDD[age#0,name#1]

>>> df.explain(True)
== Parsed Logical Plan ==
...
== Analyzed Logical Plan ==
...
== Optimized Logical Plan ==
...
== Physical Plan ==
...

>>> df.explain(mode="formatted")
== Physical Plan ==
* Scan ExistingRDD (1)
(1) Scan ExistingRDD [codegen id : 1]
Output [2]: [age#0, name#1]
...

>>> df.explain("cost")
== Optimized Logical Plan ==
...Statistics...
...
"""
raise ValueError("extended and mode should not be set together.")
# For the no argument case: df.explain()
# For the cases below: # explain(True) # explain(extended=False)
# For the case when extended is mode: # df.explain("formatted")
# For the mode specified: # df.explain(mode="formatted")
argtypes = [ str(type(arg)) for arg in [extended, mode] if arg is not None] raise TypeError( "extended (optional) and mode (optional) should be a string " "and bool; however, got [%s]." % ", ".join(argtypes))
# Sets an explain mode depending on a given argument
"""Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but not in another :class:`DataFrame` while preserving duplicates.
This is equivalent to `EXCEPT ALL` in SQL. As standard in SQL, this function resolves columns by position (not by name).
.. versionadded:: 2.4.0
Examples -------- >>> df1 = spark.createDataFrame( ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]) >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
>>> df1.exceptAll(df2).show()
+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  a|  2|
|  c|  4|
+---+---+
"""
def isLocal(self): """Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally (without any Spark executors). """ return self._jdf.isLocal()
def isStreaming(self): """Returns ``True`` if this :class:`DataFrame` contains one or more sources that continuously return data as it arrives. A :class:`DataFrame` that reads data from a streaming source must be executed as a :class:`StreamingQuery` using the :func:`start` method in :class:`DataStreamWriter`. Methods that return a single answer, (e.g., :func:`count` or :func:`collect`) will throw an :class:`AnalysisException` when there is a streaming source present.
.. versionadded:: 2.0.0
Notes ----- This API is evolving. """
"""Prints the first ``n`` rows to the console.
.. versionadded:: 1.3.0
Parameters
----------
n : int, optional
    Number of rows to show.
truncate : bool or int, optional
    If set to ``True``, truncate strings longer than 20 chars by default.
    If set to a number greater than one, truncates long strings to length ``truncate``
    and aligns cells right.
vertical : bool, optional
    If set to ``True``, print output rows vertically (one line per column value).
Examples
--------
>>> df
DataFrame[age: int, name: string]
>>> df.show()
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+
>>> df.show(truncate=3)
+---+----+
|age|name|
+---+----+
|  2| Ali|
|  5| Bob|
+---+----+
>>> df.show(vertical=True)
-RECORD 0-----
 age  | 2
 name | Alice
-RECORD 1-----
 age  | 5
 name | Bob
"""
else: "Parameter 'truncate={}' should be either bool or int.".format(truncate))
self.sql_ctx._conf.replEagerEvalMaxNumRows(), self.sql_ctx._conf.replEagerEvalTruncate(), vertical) else:
"""Returns a :class:`DataFrame` as HTML when eager evaluation is enabled via
'spark.sql.repl.eagerEval.enabled'; this is only called by REPLs that support
eager evaluation with HTML output.
"""
max_num_rows, self.sql_ctx._conf.replEagerEvalTruncate())
# generate table head # generate table rows map(lambda x: html_escape(x), row)) max_num_rows, "row" if max_num_rows == 1 else "rows") else:
"""Returns a checkpointed version of this :class:`DataFrame`. Checkpointing can be used to truncate the logical plan of this :class:`DataFrame`, which is especially useful in iterative algorithms where the plan may grow exponentially. It will be saved to files inside the checkpoint directory set with :meth:`SparkContext.setCheckpointDir`.
.. versionadded:: 2.1.0
Parameters ---------- eager : bool, optional Whether to checkpoint this :class:`DataFrame` immediately
Notes ----- This API is experimental. """ jdf = self._jdf.checkpoint(eager) return DataFrame(jdf, self.sql_ctx)
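A minimal sketch of the iterative use case mentioned above, assuming a running ``spark`` session; the checkpoint directory path is hypothetical::

    from pyspark.sql import functions as F

    spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")  # hypothetical path

    df = spark.range(1000)
    for i in range(10):
        # Each iteration grows the logical plan; checkpointing every few iterations
        # truncates the plan so query planning does not become the bottleneck.
        df = df.withColumn("id", F.col("id") + 1)
        if i % 3 == 2:
            df = df.checkpoint(eager=True)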
"""Returns a locally checkpointed version of this :class:`DataFrame`. Checkpointing can be used to truncate the logical plan of this :class:`DataFrame`, which is especially useful in iterative algorithms where the plan may grow exponentially. Local checkpoints are stored in the executors using the caching subsystem and therefore they are not reliable.
.. versionadded:: 2.3.0
Parameters ---------- eager : bool, optional Whether to checkpoint this :class:`DataFrame` immediately
Notes ----- This API is experimental. """ jdf = self._jdf.localCheckpoint(eager) return DataFrame(jdf, self.sql_ctx)
"""Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point in time before which we assume no more late data is going to arrive.
Spark will use this watermark for several purposes: - To know when a given time window aggregation can be finalized and thus can be emitted when using output modes that do not allow updates.
- To minimize the amount of state that we need to keep for on-going aggregations.
The current watermark is computed by looking at the `MAX(eventTime)` seen across all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost of coordinating this value across partitions, the actual watermark used is only guaranteed to be at least `delayThreshold` behind the actual event time. In some cases we may still process records that arrive more than `delayThreshold` late.
.. versionadded:: 2.1.0
Parameters
----------
eventTime : str
    the name of the column that contains the event time of the row.
delayThreshold : str
    the minimum delay to wait for data to arrive late, relative to the latest
    record that has been processed in the form of an interval
    (e.g. "1 minute" or "5 hours").
Notes ----- This API is evolving.
>>> from pyspark.sql.functions import timestamp_seconds >>> sdf.select( ... 'name', ... timestamp_seconds(sdf.time).alias('time')).withWatermark('time', '10 minutes') DataFrame[name: string, time: timestamp] """ raise TypeError("eventTime should be provided as a string") raise TypeError("delayThreshold should be provided as a string interval")
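A sketch of how a watermark is typically combined with a windowed aggregation in Structured Streaming; the rate source, window size and delay below are only illustrative::

    from pyspark.sql.functions import window

    # The rate source emits rows with "timestamp" and "value" columns, handy for demos.
    events = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

    counts = (events
              .withWatermark("timestamp", "10 minutes")
              .groupBy(window(events.timestamp, "5 minutes"))
              .count())

    query = (counts.writeStream
             .outputMode("update")
             .format("console")
             .start())
    # query.awaitTermination() in a real job; query.stop() to end the sketch.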
"""Specifies some hint on the current :class:`DataFrame`.
.. versionadded:: 2.2.0
Parameters ---------- name : str A name of the hint. parameters : str, list, float or int Optional parameters.
Returns ------- :class:`DataFrame`
Examples
--------
>>> df.join(df2.hint("broadcast"), "name").show()
+----+---+------+
|name|age|height|
+----+---+------+
| Bob|  5|    85|
+----+---+------+
"""
raise TypeError("name should be provided as str, got {0}".format(type(name)))
raise TypeError( "all parameters should be in {0}, got {1} of type {2}".format( allowed_types, p, type(p)))
"""Returns the number of rows in this :class:`DataFrame`.
.. versionadded:: 1.3.0
Examples -------- >>> df.count() 2 """
"""Returns all the records as a list of :class:`Row`.
.. versionadded:: 1.3.0
Examples -------- >>> df.collect() [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """
""" Returns an iterator that contains all of the rows in this :class:`DataFrame`. The iterator will consume as much memory as the largest partition in this :class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest partitions.
.. versionadded:: 2.0.0
Parameters ---------- prefetchPartitions : bool, optional If Spark should pre-fetch the next partition before it is needed.
Examples -------- >>> list(df.toLocalIterator()) [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """
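When the full result would not fit in driver memory, iterating partition by partition keeps the driver footprint bounded to roughly one partition at a time; a small sketch reusing the two-row ``df`` from the examples above::

    total_age = 0
    for row in df.toLocalIterator(prefetchPartitions=False):
        # Rows stream to the driver one partition at a time instead of all at once.
        total_age += row.age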
"""Limits the result count to the number specified.
.. versionadded:: 1.3.0
Examples -------- >>> df.limit(1).collect() [Row(age=2, name='Alice')] >>> df.limit(0).collect() [] """
"""Returns the first ``num`` rows as a :class:`list` of :class:`Row`.
.. versionadded:: 1.3.0
Examples -------- >>> df.take(2) [Row(age=2, name='Alice'), Row(age=5, name='Bob')] """
""" Returns the last ``num`` rows as a :class:`list` of :class:`Row`.
Running tail requires moving data into the application's driver process, and doing so with a very large ``num`` can crash the driver process with OutOfMemoryError.
.. versionadded:: 3.0.0
Examples -------- >>> df.tail(1) [Row(age=5, name='Bob')] """
"""Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`.
This is a shorthand for ``df.rdd.foreach()``.
.. versionadded:: 1.3.0
Examples -------- >>> def f(person): ... print(person.name) >>> df.foreach(f) """
"""Applies the ``f`` function to each partition of this :class:`DataFrame`.
This is a shorthand for ``df.rdd.foreachPartition()``.
.. versionadded:: 1.3.0
Examples -------- >>> def f(people): ... for person in people: ... print(person.name) >>> df.foreachPartition(f) """
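A common pattern is to open one connection per partition inside ``foreachPartition`` rather than one per row. The ``get_connection`` helper below is hypothetical and stands in for whatever sink client you use::

    def save_partition(rows):
        conn = get_connection()  # hypothetical helper, not part of PySpark
        try:
            for row in rows:
                conn.write((row.name, row.age))
        finally:
            conn.close()

    df.foreachPartition(save_partition)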
"""Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK`).
.. versionadded:: 1.3.0
Notes ----- The default storage level has changed to `MEMORY_AND_DISK` to match Scala in 2.0. """
"""Sets the storage level to persist the contents of the :class:`DataFrame` across operations after the first time it is computed. This can only be used to assign a new storage level if the :class:`DataFrame` does not have a storage level set yet. If no storage level is specified defaults to (`MEMORY_AND_DISK_DESER`)
.. versionadded:: 1.3.0
Notes ----- The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0. """
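A brief sketch of choosing a non-default storage level explicitly, for instance when the cached data is too large to keep in memory::

    from pyspark import StorageLevel

    df.persist(StorageLevel.DISK_ONLY)  # spill entirely to disk instead of memory
    df.count()                          # the first action materializes the persisted data
    df.unpersist()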
def storageLevel(self): """Get the :class:`DataFrame`'s current storage level.
.. versionadded:: 2.1.0
Examples -------- >>> df.storageLevel StorageLevel(False, False, False, False, 1) >>> df.cache().storageLevel StorageLevel(True, True, False, True, 1) >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel StorageLevel(True, False, False, False, 2) """ java_storage_level.useMemory(), java_storage_level.useOffHeap(), java_storage_level.deserialized(), java_storage_level.replication())
"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from memory and disk.
.. versionadded:: 1.3.0
Notes ----- `blocking` default has changed to ``False`` to match Scala in 2.0. """
""" Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.
Similar to coalesce defined on an :class:`RDD`, this operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions. If a larger number of partitions is requested, it will stay at the current number of partitions.
However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, this may result in your computation taking place on fewer nodes than you like (e.g. one node in the case of numPartitions = 1). To avoid this, you can call repartition(). This will add a shuffle step, but means the current upstream partitions will be executed in parallel (per whatever the current partitioning is).
.. versionadded:: 1.4.0
Parameters ---------- numPartitions : int specify the target number of partitions
Examples -------- >>> df.coalesce(1).rdd.getNumPartitions() 1 """
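To make the narrow-versus-shuffle distinction above concrete, a small sketch comparing the two, assuming a running ``spark`` session (partition counts are only illustrative)::

    df = spark.range(1000).repartition(100)

    # coalesce() only merges existing partitions: no shuffle, never increases the count.
    print(df.coalesce(10).rdd.getNumPartitions())    # 10
    print(df.coalesce(200).rdd.getNumPartitions())   # stays at 100

    # repartition() always shuffles and can scale the count up or down.
    print(df.repartition(200).rdd.getNumPartitions())  # 200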
""" Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The resulting :class:`DataFrame` is hash partitioned.
.. versionadded:: 1.3.0
Parameters ---------- numPartitions : int can be an int to specify the target number of partitions or a Column. If it is a Column, it will be used as the first partitioning column. If not specified, the default number of partitions is used. cols : str or :class:`Column` partitioning columns.
.. versionchanged:: 1.6 Added optional arguments to specify the partitioning columns. Also made numPartitions optional if partitioning columns are specified.
Examples
--------
>>> df.repartition(10).rdd.getNumPartitions()
10
>>> data = df.union(df).repartition("age")
>>> data.show()
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
|  2|Alice|
|  5|  Bob|
+---+-----+
>>> data = data.repartition(7, "age")
>>> data.show()
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
|  2|Alice|
|  5|  Bob|
+---+-----+
>>> data.rdd.getNumPartitions()
7
>>> data = data.repartition(3, "name", "age")
>>> data.show()
+---+-----+
|age| name|
+---+-----+
|  5|  Bob|
|  5|  Bob|
|  2|Alice|
|  2|Alice|
+---+-----+
"""
else:
    self._jdf.repartition(numPartitions, self._jcols(*cols)), self.sql_ctx)
else:
    raise TypeError("numPartitions should be an int or Column")
""" Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The resulting :class:`DataFrame` is range partitioned.
At least one partition-by expression must be specified. When no explicit sort order is specified, "ascending nulls first" is assumed.
.. versionadded:: 2.4.0
Parameters ---------- numPartitions : int can be an int to specify the target number of partitions or a Column. If it is a Column, it will be used as the first partitioning column. If not specified, the default number of partitions is used. cols : str or :class:`Column` partitioning columns.
Notes ----- Due to performance reasons this method uses sampling to estimate the ranges. Hence, the output may not be consistent, since sampling can return different values. The sample size can be controlled by the config `spark.sql.execution.rangeExchange.sampleSizePerPartition`.
Examples
--------
>>> df.repartitionByRange(2, "age").rdd.getNumPartitions()
2
>>> df.show()
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+
>>> df.repartitionByRange(1, "age").rdd.getNumPartitions()
1
>>> data = df.repartitionByRange("age")
>>> df.show()
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+
"""
    raise ValueError("At least one partition-by expression must be specified.")
else:
    self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), self.sql_ctx)
else:
    raise TypeError("numPartitions should be an int, string or Column")
"""Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
.. versionadded:: 1.3.0
Examples -------- >>> df.distinct().count() 2 """
"""Returns a sampled subset of this :class:`DataFrame`.
.. versionadded:: 1.3.0
Parameters ---------- withReplacement : bool, optional Sample with replacement or not (default ``False``). fraction : float, optional Fraction of rows to generate, range [0.0, 1.0]. seed : int, optional Seed for sampling (default a random seed).
Notes ----- This is not guaranteed to provide exactly the fraction specified of the total count of the given :class:`DataFrame`.
`fraction` is required; `withReplacement` and `seed` are optional.
Examples
--------
>>> df = spark.range(10)
>>> df.sample(0.5, 3).count()
7
>>> df.sample(fraction=0.5, seed=3).count()
7
>>> df.sample(withReplacement=True, fraction=0.5, seed=3).count()
1
>>> df.sample(1.0).count()
10
>>> df.sample(fraction=1.0).count()
10
>>> df.sample(False, fraction=1.0).count()
10
"""
# For the cases below: # sample(True, 0.5 [, seed]) # sample(True, fraction=0.5 [, seed]) # sample(withReplacement=False, fraction=0.5 [, seed]) type(withReplacement) == bool and isinstance(fraction, float)
# For the case below: # sample(faction=0.5 [, seed]) withReplacement is None and isinstance(fraction, float)
# For the case below: # sample(0.5 [, seed])
or is_withReplacement_omitted_kwargs or is_withReplacement_omitted_args): str(type(arg)) for arg in [withReplacement, fraction, seed] if arg is not None] "withReplacement (optional), fraction (required) and seed (optional)" " should be a bool, float and number; however, " "got [%s]." % ", ".join(argtypes))
""" Returns a stratified sample without replacement based on the fraction given on each stratum.
.. versionadded:: 1.5.0
Parameters ---------- col : :class:`Column` or str column that defines strata
.. versionchanged:: 3.0 Added sampling by a column of :class:`Column` fractions : dict sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as zero. seed : int, optional random seed
Returns ------- a new :class:`DataFrame` that represents the stratified sample
Examples -------- >>> from pyspark.sql.functions import col >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key")) >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0) >>> sampled.groupBy("key").count().orderBy("key").show() +---+-----+ |key|count| +---+-----+ | 0| 3| | 1| 6| +---+-----+ >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count() 33 """ raise TypeError("col must be a string or a column, but got %r" % type(col)) raise TypeError("fractions must be a dict but got %r" % type(fractions)) raise TypeError("key must be float, int, or string, but got %r" % type(k))
"""Randomly splits this :class:`DataFrame` with the provided weights.
.. versionadded:: 1.4.0
Parameters ---------- weights : list list of doubles as weights with which to split the :class:`DataFrame`. Weights will be normalized if they don't sum up to 1.0. seed : int, optional The seed for sampling.
Examples -------- >>> splits = df4.randomSplit([1.0, 2.0], 24) >>> splits[0].count() 2
>>> splits[1].count() 2 """ raise ValueError("Weights must be positive. Found weight value: %s" % w)
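A typical use is a reproducible train/test split; a short sketch assuming a running ``spark`` session::

    data = spark.range(100)
    train, test = data.randomSplit([0.8, 0.2], seed=42)
    # The split is random per row, so the counts are only approximately 80/20.
    print(train.count(), test.count())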
def dtypes(self): """Returns all column names and their data types as a list.
.. versionadded:: 1.3.0
Examples -------- >>> df.dtypes [('age', 'int'), ('name', 'string')] """
def columns(self): """Returns all column names as a list.
.. versionadded:: 1.3.0
Examples -------- >>> df.columns ['age', 'name'] """
""" Selects column based on the column name specified as a regex and returns it as :class:`Column`.
.. versionadded:: 2.3.0
Parameters ---------- colName : str string, column name specified as a regex.
Examples -------- >>> df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"]) >>> df.select(df.colRegex("`(Col1)?+.+`")).show() +----+ |Col2| +----+ | 1| | 2| | 3| +----+ """ raise TypeError("colName should be provided as string")
"""Returns a new :class:`DataFrame` with an alias set.
.. versionadded:: 1.3.0
Parameters ---------- alias : str an alias name to be set for the :class:`DataFrame`.
Examples -------- >>> from pyspark.sql.functions import * >>> df_as1 = df.alias("df_as1") >>> df_as2 = df.alias("df_as2") >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner') >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age") \ .sort(desc("df_as1.name")).collect() [Row(name='Bob', name='Bob', age=5), Row(name='Alice', name='Alice', age=2)] """
"""Returns the cartesian product with another :class:`DataFrame`.
.. versionadded:: 2.1.0
Parameters ---------- other : :class:`DataFrame` Right side of the cartesian product.
Examples -------- >>> df.select("age", "name").collect() [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df2.select("name", "height").collect() [Row(name='Tom', height=80), Row(name='Bob', height=85)] >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect() [Row(age=2, name='Alice', height=80), Row(age=2, name='Alice', height=85), Row(age=5, name='Bob', height=80), Row(age=5, name='Bob', height=85)] """
"""Joins with another :class:`DataFrame`, using the given join expression.
.. versionadded:: 1.3.0
Parameters ---------- other : :class:`DataFrame` Right side of the join on : str, list or :class:`Column`, optional a string for the join column name, a list of column names, a join expression (Column), or a list of Columns. If `on` is a string or a list of strings indicating the name of the join column(s), the column(s) must exist on both sides, and this performs an equi-join. how : str, optional default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``, ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``, ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``, ``anti``, ``leftanti`` and ``left_anti``.
Examples -------- The following performs a full outer join between ``df1`` and ``df2``.
>>> from pyspark.sql.functions import desc >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height) \ .sort(desc("name")).collect() [Row(name='Bob', height=85), Row(name='Alice', height=None), Row(name=None, height=80)]
>>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).collect() [Row(name='Tom', height=80), Row(name='Bob', height=85), Row(name='Alice', height=None)]
>>> cond = [df.name == df3.name, df.age == df3.age] >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect() [Row(name='Alice', age=2), Row(name='Bob', age=5)]
>>> df.join(df2, 'name').select(df.name, df2.height).collect() [Row(name='Bob', height=85)]
>>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect() [Row(name='Bob', age=5)] """
else:
else:
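A short sketch of how the ``how`` argument changes the result, using two small DataFrames built inline (names and values are only illustrative)::

    left = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ["name", "age"])
    right = spark.createDataFrame([("Bob", 85), ("Tom", 80)], ["name", "height"])

    left.join(right, on="name", how="inner").show()      # only Bob
    left.join(right, on="name", how="left").show()       # Alice kept with null height
    left.join(right, on="name", how="outer").show()      # Alice, Bob and Tom
    left.join(right, on="name", how="left_anti").show()  # names present only in `left`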
"""Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).
.. versionadded:: 1.6.0
Parameters ---------- cols : str, list or :class:`Column`, optional list of :class:`Column` or column names to sort by.
Other Parameters ---------------- ascending : bool or list, optional boolean or list of boolean (default ``True``). Sort ascending vs. descending. Specify list for multiple sort orders. If a list is specified, length of the list must equal length of the `cols`.
Examples -------- >>> df.sortWithinPartitions("age", ascending=False).show() +---+-----+ |age| name| +---+-----+ | 2|Alice| | 5| Bob| +---+-----+ """
"""Returns a new :class:`DataFrame` sorted by the specified column(s).
.. versionadded:: 1.3.0
Parameters ---------- cols : str, list, or :class:`Column`, optional list of :class:`Column` or column names to sort by.
Other Parameters ---------------- ascending : bool or list, optional boolean or list of boolean (default ``True``). Sort ascending vs. descending. Specify list for multiple sort orders. If a list is specified, length of the list must equal length of the `cols`.
Examples -------- >>> df.sort(df.age.desc()).collect() [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.sort("age", ascending=False).collect() [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.orderBy(df.age.desc()).collect() [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> from pyspark.sql.functions import * >>> df.sort(asc("age")).collect() [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df.orderBy(desc("age"), "name").collect() [Row(age=5, name='Bob'), Row(age=2, name='Alice')] >>> df.orderBy(["age", "name"], ascending=[0, 1]).collect() [Row(age=5, name='Bob'), Row(age=2, name='Alice')] """
"""Return a JVM Seq of Columns from a list of Column or names"""
"""Return a JVM Scala Map from a dict"""
"""Return a JVM Seq of Columns from a list of Column or column names
If `cols` has only one list in it, cols[0] will be used as the list. """
""" Return a JVM Seq of Columns that describes the sort order """ raise ValueError("should sort by at least one column") for asc, jc in zip(ascending, jcols)] else: raise TypeError("ascending can only be boolean or list, but got %s" % type(ascending))
"""Computes basic statistics for numeric and string columns.
.. versionadded:: 1.3.1
This includes count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns.
Notes ----- This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting :class:`DataFrame`.
Use summary for expanded statistics and control over which statistics to compute.
Examples -------- >>> df.describe(['age']).show() +-------+------------------+ |summary| age| +-------+------------------+ | count| 2| | mean| 3.5| | stddev|2.1213203435596424| | min| 2| | max| 5| +-------+------------------+ >>> df.describe().show() +-------+------------------+-----+ |summary| age| name| +-------+------------------+-----+ | count| 2| 2| | mean| 3.5| null| | stddev|2.1213203435596424| null| | min| 2|Alice| | max| 5| Bob| +-------+------------------+-----+
See Also -------- DataFrame.summary """
"""Computes specified statistics for numeric and string columns. Available statistics are: - count - mean - stddev - min - max - arbitrary approximate percentiles specified as a percentage (e.g., 75%)
If no statistics are given, this function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
.. versionadded:: 2.3.0
Notes ----- This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting :class:`DataFrame`.
Examples -------- >>> df.summary().show() +-------+------------------+-----+ |summary| age| name| +-------+------------------+-----+ | count| 2| 2| | mean| 3.5| null| | stddev|2.1213203435596424| null| | min| 2|Alice| | 25%| 2| null| | 50%| 2| null| | 75%| 5| null| | max| 5| Bob| +-------+------------------+-----+
>>> df.summary("count", "min", "25%", "75%", "max").show() +-------+---+-----+ |summary|age| name| +-------+---+-----+ | count| 2| 2| | min| 2|Alice| | 25%| 2| null| | 75%| 5| null| | max| 5| Bob| +-------+---+-----+
To do a summary for specific columns first select them:
>>> df.select("age", "name").summary("count").show() +-------+---+----+ |summary|age|name| +-------+---+----+ | count| 2| 2| +-------+---+----+
See Also
--------
DataFrame.describe
"""
statistics = statistics[0]
"""Returns the first ``n`` rows.
.. versionadded:: 1.3.0
Notes ----- This method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.
Parameters ---------- n : int, optional default 1. Number of rows to return.
Returns ------- If n is greater than 1, return a list of :class:`Row`. If n is 1, return a single Row.
Examples -------- >>> df.head() Row(age=2, name='Alice') >>> df.head(1) [Row(age=2, name='Alice')] """
"""Returns the first row as a :class:`Row`.
.. versionadded:: 1.3.0
Examples -------- >>> df.first() Row(age=2, name='Alice') """
"""Returns the column as a :class:`Column`.
.. versionadded:: 1.3.0
Examples -------- >>> df.select(df['age']).collect() [Row(age=2), Row(age=5)] >>> df[ ["name", "age"]].collect() [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df[ df.age > 3 ].collect() [Row(age=5, name='Bob')] >>> df[df[0] > 3].collect() [Row(age=5, name='Bob')] """ else:
"""Returns the :class:`Column` denoted by ``name``.
.. versionadded:: 1.3.0
Examples -------- >>> df.select(df.age).collect() [Row(age=2), Row(age=5)] """ "'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
"""Projects a set of expressions and returns a new :class:`DataFrame`.
.. versionadded:: 1.3.0
Parameters ---------- cols : str, :class:`Column`, or list column names (string) or expressions (:class:`Column`). If one of the column names is '*', that column is expanded to include all columns in the current :class:`DataFrame`.
Examples -------- >>> df.select('*').collect() [Row(age=2, name='Alice'), Row(age=5, name='Bob')] >>> df.select('name', 'age').collect() [Row(name='Alice', age=2), Row(name='Bob', age=5)] >>> df.select(df.name, (df.age + 10).alias('age')).collect() [Row(name='Alice', age=12), Row(name='Bob', age=15)] """
"""Projects a set of SQL expressions and returns a new :class:`DataFrame`.
This is a variant of :func:`select` that accepts SQL expressions.
.. versionadded:: 1.3.0
Examples -------- >>> df.selectExpr("age * 2", "abs(age)").collect() [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)] """ expr = expr[0]
"""Filters rows using the given condition.
:func:`where` is an alias for :func:`filter`.
.. versionadded:: 1.3.0
Parameters ---------- condition : :class:`Column` or str a :class:`Column` of :class:`types.BooleanType` or a string of SQL expression.
Examples -------- >>> df.filter(df.age > 3).collect() [Row(age=5, name='Bob')] >>> df.where(df.age == 2).collect() [Row(age=2, name='Alice')]
>>> df.filter("age > 3").collect() [Row(age=5, name='Bob')] >>> df.where("age = 2").collect() [Row(age=2, name='Alice')] """ else: raise TypeError("condition should be string or Column")
"""Groups the :class:`DataFrame` using the specified columns, so we can run aggregation on them. See :class:`GroupedData` for all the available aggregate functions.
:func:`groupby` is an alias for :func:`groupBy`.
.. versionadded:: 1.3.0
Parameters ---------- cols : list, str or :class:`Column` columns to group by. Each element should be a column name (string) or an expression (:class:`Column`).
Examples -------- >>> df.groupBy().avg().collect() [Row(avg(age)=3.5)] >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect()) [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)] >>> sorted(df.groupBy(df.name).avg().collect()) [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)] >>> sorted(df.groupBy(['name', df.age]).count().collect()) [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)] """
""" Create a multi-dimensional rollup for the current :class:`DataFrame` using the specified columns, so we can run aggregation on them.
.. versionadded:: 1.4.0
Examples -------- >>> df.rollup("name", df.age).count().orderBy("name", "age").show() +-----+----+-----+ | name| age|count| +-----+----+-----+ | null|null| 2| |Alice|null| 1| |Alice| 2| 1| | Bob|null| 1| | Bob| 5| 1| +-----+----+-----+ """
""" Create a multi-dimensional cube for the current :class:`DataFrame` using the specified columns, so we can run aggregations on them.
.. versionadded:: 1.4.0
Examples -------- >>> df.cube("name", df.age).count().orderBy("name", "age").show() +-----+----+-----+ | name| age|count| +-----+----+-----+ | null|null| 2| | null| 2| 1| | null| 5| 1| |Alice|null| 1| |Alice| 2| 1| | Bob|null| 1| | Bob| 5| 1| +-----+----+-----+ """
""" Aggregate on the entire :class:`DataFrame` without groups (shorthand for ``df.groupBy().agg()``).
.. versionadded:: 1.3.0
Examples -------- >>> df.agg({"age": "max"}).collect() [Row(max(age)=5)] >>> from pyspark.sql import functions as F >>> df.agg(F.min(df.age)).collect() [Row(min(age)=2)] """
def observe(self, observation, *exprs): """Observe (named) metrics through an :class:`Observation` instance.
A user can retrieve the metrics by accessing `Observation.get`.
.. versionadded:: 3.3.0
Parameters ---------- observation : :class:`Observation` an :class:`Observation` instance to obtain the metric. exprs : list of :class:`Column` column expressions (:class:`Column`).
Returns ------- :class:`DataFrame` the observed :class:`DataFrame`.
Notes ----- This method does not support streaming datasets.
Examples -------- >>> from pyspark.sql.functions import col, count, lit, max >>> from pyspark.sql import Observation >>> observation = Observation("my metrics") >>> observed_df = df.observe(observation, count(lit(1)).alias("count"), max(col("age"))) >>> observed_df.count() 2 >>> observation.get {'count': 2, 'max(age)': 5} """
def union(self, other): """ Return a new :class:`DataFrame` containing union of rows in this and another :class:`DataFrame`.
This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by :func:`distinct`.
Also as standard in SQL, this function resolves columns by position (not by name). """
def unionAll(self, other): """ Return a new :class:`DataFrame` containing union of rows in this and another :class:`DataFrame`.
This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by :func:`distinct`.
Also as standard in SQL, this function resolves columns by position (not by name). """ return self.union(other)
""" Returns a new :class:`DataFrame` containing union of rows in this and another :class:`DataFrame`.
This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by :func:`distinct`.
.. versionadded:: 2.3.0
Examples -------- The difference between this function and :func:`union` is that this function resolves columns by name (not by position):
>>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"]) >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"]) >>> df1.unionByName(df2).show() +----+----+----+ |col0|col1|col2| +----+----+----+ | 1| 2| 3| | 6| 4| 5| +----+----+----+
When the parameter `allowMissingColumns` is ``True``, the set of column names in this and other :class:`DataFrame` can differ; missing columns will be filled with null. Further, the missing columns of this :class:`DataFrame` will be added at the end in the schema of the union result:
>>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"]) >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"]) >>> df1.unionByName(df2, allowMissingColumns=True).show() +----+----+----+----+ |col0|col1|col2|col3| +----+----+----+----+ | 1| 2| 3|null| |null| 4| 5| 6| +----+----+----+----+
.. versionchanged:: 3.1.0 Added optional argument `allowMissingColumns` to specify whether to allow missing columns. """
def intersect(self, other): """ Return a new :class:`DataFrame` containing rows only in both this :class:`DataFrame` and another :class:`DataFrame`.
This is equivalent to `INTERSECT` in SQL. """ return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)
""" Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame` and another :class:`DataFrame` while preserving duplicates.
This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function resolves columns by position (not by name).
.. versionadded:: 2.4.0
Examples -------- >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"]) >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
>>> df1.intersectAll(df2).sort("C1", "C2").show() +---+---+ | C1| C2| +---+---+ | a| 1| | a| 1| | b| 3| +---+---+
"""
def subtract(self, other): """ Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but not in another :class:`DataFrame`.
This is equivalent to `EXCEPT DISTINCT` in SQL.
"""
"""Return a new :class:`DataFrame` with duplicate rows removed, optionally only considering certain columns.
For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can be and system will accordingly limit the state. In addition, too late data older than watermark will be dropped to avoid any possibility of duplicates.
:func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
.. versionadded:: 1.4.0
Examples
--------
>>> from pyspark.sql import Row
>>> df = sc.parallelize([ \\
...     Row(name='Alice', age=5, height=80), \\
...     Row(name='Alice', age=5, height=80), \\
...     Row(name='Alice', age=10, height=80)]).toDF()
>>> df.dropDuplicates().show()
+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+

>>> df.dropDuplicates(['name', 'height']).show()
+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
+-----+---+------+
"""
    not isinstance(subset, Iterable) or isinstance(subset, str)):
else:
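A sketch of the streaming deduplication pattern described above, bounding state with a watermark; the rate source and key columns are only illustrative::

    events = spark.readStream.format("rate").load()

    deduped = (events
               .withWatermark("timestamp", "10 minutes")
               .dropDuplicates(["value", "timestamp"]))

    query = (deduped.writeStream
             .outputMode("append")
             .format("console")
             .start())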
"""Returns a new :class:`DataFrame` omitting rows with null values. :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
.. versionadded:: 1.3.1
Parameters
----------
how : str, optional
    'any' or 'all'.
    If 'any', drop a row if it contains any nulls.
    If 'all', drop a row only if all its values are null.
thresh : int, optional, default None
    If specified, drop rows that have fewer than `thresh` non-null values.
    This overwrites the `how` parameter.
subset : str, tuple or list, optional
    optional list of column names to consider.
Examples -------- >>> df4.na.drop().show() +---+------+-----+ |age|height| name| +---+------+-----+ | 10| 80|Alice| +---+------+-----+ """ raise ValueError("how ('" + how + "') should be 'any' or 'all'")
subset = [subset] raise TypeError("subset should be a list or tuple of column names")
"""Replace null values, alias for ``na.fill()``. :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.
.. versionadded:: 1.3.1
Parameters ---------- value : int, float, string, bool or dict Value to replace null values with. If the value is a dict, then `subset` is ignored and `value` must be a mapping from column name (string) to replacement value. The replacement value must be an int, float, boolean, or string. subset : str, tuple or list, optional optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. For example, if `value` is a string, and subset contains a non-string column, then the non-string column is simply ignored.
Examples -------- >>> df4.na.fill(50).show() +---+------+-----+ |age|height| name| +---+------+-----+ | 10| 80|Alice| | 5| 50| Bob| | 50| 50| Tom| | 50| 50| null| +---+------+-----+
>>> df5.na.fill(False).show() +----+-------+-----+ | age| name| spy| +----+-------+-----+ | 10| Alice|false| | 5| Bob|false| |null|Mallory| true| +----+-------+-----+
>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show() +---+------+-------+ |age|height| name| +---+------+-------+ | 10| 80| Alice| | 5| null| Bob| | 50| null| Tom| | 50| null|unknown| +---+------+-------+ """ raise TypeError("value should be a float, int, string, bool or dict")
# Note that bool validates isinstance(int), but we don't want to # convert bools to floats
else: subset = [subset] raise TypeError("subset should be a list or tuple of column names")
"""Returns a new :class:`DataFrame` replacing a value with another value. :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are aliases of each other. Values `to_replace` and `value` must have the same type and can only be numerics, booleans, or strings. The value can be None. When replacing, the new value will be cast to the type of the existing column. For numeric replacements all values to be replaced should have unique floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`) an arbitrary replacement will be used.
.. versionadded:: 1.4.0
Parameters ---------- to_replace : bool, int, float, string, list or dict Value to be replaced. If the value is a dict, then `value` is ignored or can be omitted, and `to_replace` must be a mapping between a value and a replacement. value : bool, int, float, string or None, optional The replacement value must be a bool, int, float, string or None. If `value` is a list, `value` should be of the same length and type as `to_replace`. If `value` is a scalar and `to_replace` is a sequence, then `value` is used as a replacement for each item in `to_replace`. subset : list, optional optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. For example, if `value` is a string, and subset contains a non-string column, then the non-string column is simply ignored.
Examples -------- >>> df4.na.replace(10, 20).show() +----+------+-----+ | age|height| name| +----+------+-----+ | 20| 80|Alice| | 5| null| Bob| |null| null| Tom| |null| null| null| +----+------+-----+
>>> df4.na.replace('Alice', None).show() +----+------+----+ | age|height|name| +----+------+----+ | 10| 80|null| | 5| null| Bob| |null| null| Tom| |null| null|null| +----+------+----+
>>> df4.na.replace({'Alice': None}).show() +----+------+----+ | age|height|name| +----+------+----+ | 10| 80|null| | 5| null| Bob| |null| null| Tom| |null| null|null| +----+------+----+
>>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() +----+------+----+ | age|height|name| +----+------+----+ | 10| 80| A| | 5| null| B| |null| null| Tom| |null| null|null| +----+------+----+ """ else:
# Helper functions """Given a type or tuple of types and a sequence of xs check if each x is instance of type(s)
>>> all_of(bool)([True, False]) True >>> all_of(str)(["a", 1]) False """
# Validate input types "to_replace should be a bool, float, int, string, list, tuple, or dict. " "Got {0}".format(type(to_replace)))
and not isinstance(to_replace, dict): raise TypeError("If to_replace is not a dict, value should be " "a bool, float, int, string, list, tuple or None. " "Got {0}".format(type(value)))
"Got {0} and {1}".format(len(to_replace), len(value)))
"column name or None. Got {0}".format(type(subset)))
# Reshape input arguments if necessary
else:
# Verify we were not passed in mixed type generics. and all_of_type(x for x in rep_dict.values() if x is not None) for all_of_type in [all_of_bool, all_of_str, all_of_numeric]):
else: self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)
""" Calculates the approximate quantiles of numerical columns of a :class:`DataFrame`.
The result of this algorithm has the following deterministic bound: If the :class:`DataFrame` has N elements and if we request the quantile at probability `p` up to error `err`, then the algorithm will return a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is close to (p * N). More precisely,
floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
This method implements a variation of the Greenwald-Khanna algorithm (with some speed optimizations). The algorithm was first presented in [[https://doi.org/10.1145/375663.375670 Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
Note that null values will be ignored in numerical columns before calculation. For columns only containing null values, an empty list is returned.
.. versionadded:: 2.0.0
Parameters ---------- col: str, tuple or list Can be a single column name, or a list of names for multiple columns.
.. versionchanged:: 2.2 Added support for multiple columns. probabilities : list or tuple a list of quantile probabilities Each number must belong to [0, 1]. For example 0 is the minimum, 0.5 is the median, 1 is the maximum. relativeError : float The relative target precision to achieve (>= 0). If set to zero, the exact quantiles are computed, which could be very expensive. Note that values greater than 1 are accepted but give the same result as 1.
Returns ------- list the approximate quantiles at the given probabilities. If the input `col` is a string, the output is a list of floats. If the input `col` is a list or tuple of strings, the output is also a list, but each element in it is a list of floats, i.e., the output is a list of list of floats. """
raise TypeError("probabilities should be a list or tuple") probabilities = list(probabilities) raise ValueError("probabilities should be numerical (float, int) in [0,1].")
raise TypeError("relativeError should be numerical (float, int)") raise ValueError("relativeError should be >= 0.")
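A short sketch of the accuracy trade-off controlled by ``relativeError``, assuming a running ``spark`` session: a zero error computes exact quantiles, while a looser bound is cheaper::

    df = spark.range(1000).withColumnRenamed("id", "x")

    exact = df.approxQuantile("x", [0.25, 0.5, 0.75], 0.0)
    rough = df.approxQuantile("x", [0.25, 0.5, 0.75], 0.1)
    print(exact)  # exact 25th/50th/75th percentiles of 0..999
    print(rough)  # approximate; each rank may be off by up to 10% of N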
""" Calculates the correlation of two columns of a :class:`DataFrame` as a double value. Currently only supports the Pearson Correlation Coefficient. :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.
.. versionadded:: 1.4.0
Parameters ---------- col1 : str The name of the first column col2 : str The name of the second column method : str, optional The correlation method. Currently only supports "pearson" """ raise TypeError("col1 should be a string.") raise TypeError("col2 should be a string.") raise ValueError("Currently only the calculation of the Pearson Correlation " + "coefficient is supported.")
""" Calculate the sample covariance for the given columns, specified by their names, as a double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.
.. versionadded:: 1.4.0
Parameters ---------- col1 : str The name of the first column col2 : str The name of the second column """ raise TypeError("col1 should be a string.") raise TypeError("col2 should be a string.")
""" Computes a pair-wise frequency table of the given columns. Also known as a contingency table. The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero pair frequencies will be returned. The first column of each row will be the distinct values of `col1` and the column names will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no occurrences will have zero as their counts. :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
.. versionadded:: 1.4.0
Parameters ---------- col1 : str The name of the first column. Distinct items will make the first item of each row. col2 : str The name of the second column. Distinct items will make the column names of the :class:`DataFrame`. """ raise TypeError("col1 should be a string.") raise TypeError("col2 should be a string.")
""" Finding frequent items for columns, possibly with false positives. Using the frequent element count algorithm described in "https://doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
.. versionadded:: 1.4.0
Parameters ---------- cols : list or tuple Names of the columns to calculate frequent items for as a list or tuple of strings. support : float, optional The frequency with which to consider an item 'frequent'. Default is 1%. The support must be greater than 1e-4.
Notes ----- This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting :class:`DataFrame`. """ raise TypeError("cols must be a list or tuple of column names as strings.") support = 0.01
""" Returns a new :class:`DataFrame` by adding a column or replacing the existing column that has the same name.
The column expression must be an expression over this :class:`DataFrame`; attempting to add a column from some other :class:`DataFrame` will raise an error.
.. versionadded:: 1.3.0
Parameters ---------- colName : str string, name of the new column. col : :class:`Column` a :class:`Column` expression for the new column.
Notes ----- This method introduces a projection internally. Therefore, calling it multiple times, for instance, via loops in order to add multiple columns can generate big plans which can cause performance issues and even `StackOverflowException`. To avoid this, use :func:`select` with the multiple columns at once.
Examples -------- >>> df.withColumn('age2', df.age + 2).collect() [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)]
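As noted above, chaining many :func:`withColumn` calls grows the plan; a sketch of adding several derived columns in a single :func:`select` instead (the derived column names are illustrative)::

    from pyspark.sql import functions as F

    # One projection instead of three chained withColumn calls.
    df2 = df.select(
        "*",
        (F.col("age") + 2).alias("age_plus_2"),
        (F.col("age") * 12).alias("age_in_months"),
        F.upper(F.col("name")).alias("name_upper"),
    )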
""" raise TypeError("col should be Column")
"""Returns a new :class:`DataFrame` by renaming an existing column. This is a no-op if schema doesn't contain the given column name.
.. versionadded:: 1.3.0
Parameters ---------- existing : str string, name of the existing column to rename. new : str string, new name of the column.
Examples -------- >>> df.withColumnRenamed('age', 'age2').collect() [Row(age2=2, name='Alice'), Row(age2=5, name='Bob')] """
"""Returns a new :class:`DataFrame` that drops the specified column. This is a no-op if schema doesn't contain the given column name(s).
.. versionadded:: 1.4.0
Parameters ---------- cols: str or :class:`Column` a name of the column, or the :class:`Column` to drop
Examples -------- >>> df.drop('age').collect() [Row(name='Alice'), Row(name='Bob')]
>>> df.drop(df.age).collect() [Row(name='Alice'), Row(name='Bob')]
>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect() [Row(age=5, height=85, name='Bob')]
>>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect() [Row(age=5, name='Bob', height=85)]
>>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
[Row(name='Bob')]
"""
else:
    raise TypeError("col should be a string or a Column")
else:
    raise TypeError("each col in the param list should be a string")
"""Returns a new :class:`DataFrame` with the specified new column names.
Parameters ---------- cols : str new column names
Examples -------- >>> df.toDF('f1', 'f2').collect() [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')] """
"""Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.
.. versionadded:: 3.0.0
Parameters ---------- func : function a function that takes and returns a :class:`DataFrame`.
Examples -------- >>> from pyspark.sql.functions import col >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"]) >>> def cast_all_to_int(input_df): ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns]) >>> def sort_columns_asc(input_df): ... return input_df.select(*sorted(input_df.columns)) >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show() +-----+---+ |float|int| +-----+---+ | 1| 1| | 2| 2| +-----+---+ """ "should have been DataFrame." % type(result)
""" Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and therefore return same results.
.. versionadded:: 3.1.0
Notes ----- The equality comparison here is simplified by tolerating the cosmetic differences such as attribute names.
This API can compare both :class:`DataFrame`\\s very quickly but can still return `False` on :class:`DataFrame`\\s that return the same results, for instance, from different plans. Such a false negative semantic can be useful when caching, for example.
This API is a developer API.
Examples -------- >>> df1 = spark.range(10) >>> df2 = spark.range(10) >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2)) True >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id + 2)) False >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2)) True """ % type(other))
""" Returns a hash code of the logical query plan against this :class:`DataFrame`.
.. versionadded:: 3.1.0
Notes ----- Unlike the standard hash code, the hash is calculated against the query plan simplified by tolerating the cosmetic differences such as attribute names.
This API is a developer API.
Examples -------- >>> spark.range(10).selectExpr("id as col0").semanticHash() # doctest: +SKIP 1855039936 >>> spark.range(10).selectExpr("id as col1").semanticHash() # doctest: +SKIP 1855039936 """ return self._jdf.semanticHash()
""" Returns a best-effort snapshot of the files that compose this :class:`DataFrame`. This method simply asks each constituent BaseRelation for its respective files and takes the union of all results. Depending on the source relations, this may not find all input files. Duplicates are removed.
.. versionadded:: 3.1.0
Examples -------- >>> df = spark.read.load("examples/src/main/resources/people.json", format="json") >>> len(df.inputFiles()) 1 """
filter, sinceversion=1.3, doc=":func:`where` is an alias for :func:`filter`.")
# Two aliases below were added for pandas compatibility many years ago. # There are too many differences compared to pandas and we cannot just # make it "compatible" by adding aliases. Therefore, we stop adding such # aliases as of Spark 3.0. Two methods below remain just # for legacy users currently. groupBy, sinceversion=1.4, doc=":func:`groupby` is an alias for :func:`groupBy`.")
dropDuplicates, sinceversion=1.4, doc=":func:`drop_duplicates` is an alias for :func:`dropDuplicates`.")
""" Create a write configuration builder for v2 sources.
This builder is used to configure and execute write operations.
For example, to append or create or replace existing tables.
.. versionadded:: 3.1.0
Examples -------- >>> df.writeTo("catalog.db.table").append() # doctest: +SKIP >>> df.writeTo( # doctest: +SKIP ... "catalog.db.table" ... ).partitionedBy("col").createOrReplace() """
""" Converts the existing DataFrame into a pandas-on-Spark DataFrame.
If a pandas-on-Spark DataFrame is converted to a Spark DataFrame and then back to pandas-on-Spark, it will lose the index information and the original index will be turned into a normal column.
This is only available if Pandas is installed and available.
Parameters ---------- index_col: str or list of str, optional, default: None Index column of table in Spark.
See Also -------- pyspark.pandas.frame.DataFrame.to_spark
Examples -------- >>> df.show() # doctest: +SKIP +----+----+ |Col1|Col2| +----+----+ | a| 1| | b| 2| | c| 3| +----+----+
>>> df.to_pandas_on_spark() # doctest: +SKIP Col1 Col2 0 a 1 1 b 2 2 c 3
We can specify the index columns.
>>> df.to_pandas_on_spark(index_col="Col1"): # doctest: +SKIP Col2 Col1 a 1 b 2 c 3 """ from pyspark.pandas.namespace import _get_index_map from pyspark.pandas.frame import DataFrame from pyspark.pandas.internal import InternalFrame
index_spark_columns, index_names = _get_index_map(self, index_col) internal = InternalFrame( spark_frame=self, index_spark_columns=index_spark_columns, index_names=index_names ) return DataFrame(internal)
# Keep to_koalas for backward compatibility for now. warnings.warn( "DataFrame.to_koalas is deprecated. Use DataFrame.to_pandas_on_spark instead.", FutureWarning, ) return self.to_pandas_on_spark(index_col)
""" Convert a dict into a JVM Map. """
"""Functionality for working with missing data in :class:`DataFrame`.
.. versionadded:: 1.4 """
"""Functionality for statistic functions with :class:`DataFrame`.
.. versionadded:: 1.4 """
.toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) Row(age=5, name='Bob')]).toDF() Row(age=5, height=None, name='Bob'), Row(age=None, height=None, name='Tom'), Row(age=None, height=None, name=None)]).toDF() Row(age=5, name='Bob', spy=None), Row(age=None, name='Mallory', spy=True)]).toDF() Row(name='Bob', time=1479442946)]).toDF()
pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) sys.exit(-1)