Coverage for pyspark/sql/functions.py: 90%

2711 ↛ 2712line 2711 didn't jump to line 2712, because the condition on line 2711 was never true if len is not None and not isinstance(len, (int, str, Column)):

raise TypeError(

"len should be an integer or a Column / column name, got {}".format(type(len)))

pos = _create_column_from_literal(pos) if isinstance(pos, int) else _to_java_column(pos)

len = _create_column_from_literal(len) if isinstance(len, int) else _to_java_column(len)

sc = SparkContext._active_spark_context

return Column(sc._jvm.functions.overlay(

_to_java_column(src),

_to_java_column(replace),

pos,

len

))

def sentences(string, language=None, country=None):
    """
    Splits a string into arrays of sentences, where each sentence is an array of words.

    Both `language` and `country` are optional. When one of them is omitted an
    empty string literal is passed in its place and the default locale is used.

    .. versionadded:: 3.2.0

    Parameters
    ----------
    string : :class:`~pyspark.sql.Column` or str
        a string to be split
    language : :class:`~pyspark.sql.Column` or str, optional
        a language of the locale
    country : :class:`~pyspark.sql.Column` or str, optional
        a country of the locale

    Examples
    --------
    >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"])
    >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)
    +-----------------------------------+
    |sentences(string, en, US)          |
    +-----------------------------------+
    |[[This, is, an, example, sentence]]|
    +-----------------------------------+
    """
    # A missing locale component is forwarded as an empty string literal.
    language = lit("") if language is None else language
    country = lit("") if country is None else country

    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.sentences(
        _to_java_column(string),
        _to_java_column(language),
        _to_java_column(country),
    )
    return Column(jc)

def substring(str, pos, len):
    """
    Returns the substring that starts at `pos` and has length `len` when `str` is
    String type, or the slice of the byte array that starts at `pos` (in bytes) and
    has length `len` when `str` is Binary type.

    .. versionadded:: 1.5.0

    Notes
    -----
    The position is not zero based, but 1 based index.

    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s='ab')]
    """
    ctx = SparkContext._active_spark_context
    # `pos` and `len` are handed to the JVM function as given.
    jc = ctx._jvm.functions.substring(_to_java_column(str), pos, len)
    return Column(jc)

def substring_index(str, delim, count):
    """
    Returns the substring from string `str` before `count` occurrences of the
    delimiter `delim`.

    If `count` is positive, everything to the left of the final delimiter (counting
    from the left) is returned. If `count` is negative, everything to the right of the
    final delimiter (counting from the right) is returned. substring_index performs a
    case-sensitive match when searching for `delim`.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('a.b.c.d',)], ['s'])
    >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
    [Row(s='a.b')]
    >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
    [Row(s='b.c.d')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.substring_index(_to_java_column(str), delim, count)
    return Column(jc)

def levenshtein(left, right):
    """Computes the Levenshtein distance between the two given strings.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
    >>> df0.select(levenshtein('l', 'r').alias('d')).collect()
    [Row(d=3)]
    """
    ctx = SparkContext._active_spark_context
    return Column(
        ctx._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right))
    )

def locate(substr, str, pos=1):
    """
    Locate the position of the first occurrence of `substr` in a string column,
    after position `pos`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    substr : str
        a string
    str : :class:`~pyspark.sql.Column` or str
        a Column of :class:`pyspark.sql.types.StringType`
    pos : int, optional
        start position of the search (defaults to 1)

    Notes
    -----
    The position is not zero based, but 1 based index. Returns 0 if substr
    could not be found in str.

    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(locate('b', df.s, 1).alias('s')).collect()
    [Row(s=2)]
    """
    # NOTE(review): `pos` used to be documented as "zero based", which conflicts
    # with the 1 based indexing stated in Notes and the default of 1 -- confirm
    # against the Spark SQL `locate` reference.
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.locate(substr, _to_java_column(str), pos)
    return Column(jc)

def lpad(col, len, pad):
    """
    Pads the string column on the left with `pad` up to width `len`.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(lpad(df.s, 6, '#').alias('s')).collect()
    [Row(s='##abcd')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.lpad(_to_java_column(col), len, pad)
    return Column(jc)

def rpad(col, len, pad):
    """
    Pads the string column on the right with `pad` up to width `len`.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(rpad(df.s, 6, '#').alias('s')).collect()
    [Row(s='abcd##')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.rpad(_to_java_column(col), len, pad)
    return Column(jc)

def repeat(col, n):
    """
    Repeats a string column `n` times and returns the result as a new string column.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('ab',)], ['s',])
    >>> df.select(repeat(df.s, 3).alias('s')).collect()
    [Row(s='ababab')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.repeat(_to_java_column(col), n)
    return Column(jc)

def split(str, pattern, limit=-1):
    """
    Splits `str` around matches of the given pattern.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    str : :class:`~pyspark.sql.Column` or str
        a string expression to split
    pattern : str
        a string representing a regular expression. The regex string should be
        a Java regular expression.
    limit : int, optional
        an integer which controls the number of times `pattern` is applied.

        * ``limit > 0``: The resulting array's length will not be more than `limit`, and the
                         resulting array's last entry will contain all input beyond the last
                         matched pattern.
        * ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting
                          array can be of any size.

        .. versionchanged:: 3.0
           `split` now takes an optional `limit` field. If not provided, default limit value is -1.

    Examples
    --------
    >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
    >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect()
    [Row(s=['one', 'twoBthreeC'])]
    >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect()
    [Row(s=['one', 'two', 'three', ''])]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.split(_to_java_column(str), pattern, limit)
    return Column(jc)

def regexp_extract(str, pattern, idx):
    r"""Extract a specific group matched by a Java regex from the specified string column.

    An empty string is returned when the regex does not match or when the requested
    group does not match.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('100-200',)], ['str'])
    >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
    [Row(d='100')]
    >>> df = spark.createDataFrame([('foo',)], ['str'])
    >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
    [Row(d='')]
    >>> df = spark.createDataFrame([('aaaac',)], ['str'])
    >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
    [Row(d='')]
    """
    ctx = SparkContext._active_spark_context
    return Column(
        ctx._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx)
    )

def regexp_replace(str, pattern, replacement):
    r"""Replace every substring of the specified string value that matches `pattern`
    with `replacement`.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('100-200',)], ['str'])
    >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
    [Row(d='-----')]
    """
    ctx = SparkContext._active_spark_context
    return Column(
        ctx._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement)
    )

def initcap(col):
    """Translate the first letter of each word in the sentence to upper case.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()
    [Row(v='Ab Cd')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.initcap(_to_java_column(col))
    return Column(jc)

def soundex(col):
    """
    Returns the SoundEx encoding for a string.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
    >>> df.select(soundex(df.name).alias("soundex")).collect()
    [Row(soundex='P362'), Row(soundex='U612')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.soundex(_to_java_column(col))
    return Column(jc)

def bin(col):
    """Returns the string representation of the binary value of the given column.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df.select(bin(df.age).alias('c')).collect()
    [Row(c='10'), Row(c='101')]
    """
    ctx = SparkContext._active_spark_context
    return Column(ctx._jvm.functions.bin(_to_java_column(col)))

def hex(col):
    """Computes the hex value of the given column, which could be
    :class:`pyspark.sql.types.StringType`, :class:`pyspark.sql.types.BinaryType`,
    :class:`pyspark.sql.types.IntegerType` or :class:`pyspark.sql.types.LongType`.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()
    [Row(hex(a)='414243', hex(b)='3')]
    """
    ctx = SparkContext._active_spark_context
    return Column(ctx._jvm.functions.hex(_to_java_column(col)))

def unhex(col):
    """Inverse of hex. Interprets each pair of characters as a hexadecimal number
    and converts it to the byte representation of that number.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> spark.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect()
    [Row(unhex(a)=bytearray(b'ABC'))]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.unhex(_to_java_column(col))
    return Column(jc)

def length(col):
    """Computes the character length of string data or the number of bytes of binary data.

    Trailing spaces count towards the length of character data, and binary zeros
    count towards the length of binary data.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()
    [Row(length=4)]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.length(_to_java_column(col))
    return Column(jc)

def translate(srcCol, matching, replace):
    """Translates any character in `srcCol` that appears in `matching` into the
    corresponding character of `replace`.

    The characters in `replace` correspond to the characters in `matching`.
    Translation happens whenever a character in the string matches a character
    in `matching`.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\
    ...     .alias('r')).collect()
    [Row(r='1a2s3ae')]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.translate(_to_java_column(srcCol), matching, replace)
    return Column(jc)

# ---------------------- Collection functions ------------------------------

def create_map(*cols):
    """Creates a new map column.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    cols : :class:`~pyspark.sql.Column` or str
        column names or :class:`~pyspark.sql.Column`\\s that are
        grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).
        A single list or set holding the columns is accepted as well.

    Examples
    --------
    >>> df.select(create_map('name', 'age').alias("map")).collect()
    [Row(map={'Alice': 2}), Row(map={'Bob': 5})]
    >>> df.select(create_map([df.name, df.age]).alias("map")).collect()
    [Row(map={'Alice': 2}), Row(map={'Bob': 5})]
    """
    ctx = SparkContext._active_spark_context
    # A lone list/set argument is treated as the collection of columns itself.
    lone_collection = len(cols) == 1 and isinstance(cols[0], (list, set))
    columns = cols[0] if lone_collection else cols
    return Column(ctx._jvm.functions.map(_to_seq(ctx, columns, _to_java_column)))

def map_from_arrays(col1, col2):
    """Creates a new map from two arrays.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col1 : :class:`~pyspark.sql.Column` or str
        name of column containing a set of keys. All elements should not be null
    col2 : :class:`~pyspark.sql.Column` or str
        name of column containing a set of values

    Examples
    --------
    >>> df = spark.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v'])
    >>> df.select(map_from_arrays(df.k, df.v).alias("map")).show()
    +----------------+
    |             map|
    +----------------+
    |{2 -> a, 5 -> b}|
    +----------------+
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.map_from_arrays(
        _to_java_column(col1), _to_java_column(col2)
    )
    return Column(jc)

def array(*cols):
    """Creates a new array column.

    .. versionadded:: 1.4.0

    Parameters
    ----------
    cols : :class:`~pyspark.sql.Column` or str
        column names or :class:`~pyspark.sql.Column`\\s that have
        the same data type. A single list or set holding the columns is
        accepted as well.

    Examples
    --------
    >>> df.select(array('age', 'age').alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    >>> df.select(array([df.age, df.age]).alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    """
    ctx = SparkContext._active_spark_context
    # A lone list/set argument is treated as the collection of columns itself.
    lone_collection = len(cols) == 1 and isinstance(cols[0], (list, set))
    columns = cols[0] if lone_collection else cols
    return Column(ctx._jvm.functions.array(_to_seq(ctx, columns, _to_java_column)))

def array_contains(col, value):
    """
    Collection function: returns null if the array is null, true if the array
    contains the given value, and false otherwise.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing array
    value :
        value or column to check for in array

    Examples
    --------
    >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
    >>> df.select(array_contains(df.data, "a")).collect()
    [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]
    >>> df.select(array_contains(df.data, lit("a"))).collect()
    [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]
    """
    ctx = SparkContext._active_spark_context
    # A Column is unwrapped to its JVM counterpart; anything else is passed as given.
    if isinstance(value, Column):
        value = value._jc
    jc = ctx._jvm.functions.array_contains(_to_java_column(col), value)
    return Column(jc)

def arrays_overlap(a1, a2):
    """
    Collection function: returns true if the arrays contain any common non-null
    element; if not, returns null if both the arrays are non-empty and any of them
    contains a null element; returns false otherwise.

    .. versionadded:: 2.4.0

    Examples
    --------
    >>> df = spark.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ['x', 'y'])
    >>> df.select(arrays_overlap(df.x, df.y).alias("overlap")).collect()
    [Row(overlap=True), Row(overlap=False)]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.arrays_overlap(_to_java_column(a1), _to_java_column(a2))
    return Column(jc)

def slice(x, start, length):
    """
    Collection function: returns an array containing all the elements in `x` from
    index `start` (array indices start at 1, or from the end if `start` is negative)
    with the specified `length`.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    x : :class:`~pyspark.sql.Column` or str
        the array to be sliced
    start : :class:`~pyspark.sql.Column` or int
        the starting index
    length : :class:`~pyspark.sql.Column` or int
        the length of the slice

    Examples
    --------
    >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])
    >>> df.select(slice(df.x, 2, 2).alias("sliced")).collect()
    [Row(sliced=[2, 3]), Row(sliced=[5])]
    """
    ctx = SparkContext._active_spark_context
    slice_fn = ctx._jvm.functions.slice
    jx = _to_java_column(x)
    # Columns are unwrapped to their JVM counterparts; other values pass through.
    jstart = start._jc if isinstance(start, Column) else start
    jlength = length._jc if isinstance(length, Column) else length
    return Column(slice_fn(jx, jstart, jlength))

def array_join(col, delimiter, null_replacement=None):
    """
    Concatenates the elements of `col` using the `delimiter`. Null values are
    replaced with `null_replacement` if set, otherwise they are ignored.

    .. versionadded:: 2.4.0

    Examples
    --------
    >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data'])
    >>> df.select(array_join(df.data, ",").alias("joined")).collect()
    [Row(joined='a,b,c'), Row(joined='a')]
    >>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect()
    [Row(joined='a,b,c'), Row(joined='a,NULL')]
    """
    ctx = SparkContext._active_spark_context
    join_fn = ctx._jvm.functions.array_join
    jcol = _to_java_column(col)
    # The two-argument JVM overload is used when no replacement is supplied.
    if null_replacement is None:
        return Column(join_fn(jcol, delimiter))
    return Column(join_fn(jcol, delimiter, null_replacement))

def concat(*cols):
    """
    Concatenates multiple input columns together into a single column.
    The function works with strings, binary and compatible array columns.

    .. versionadded:: 1.5.0

    Examples
    --------
    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat(df.s, df.d).alias('s')).collect()
    [Row(s='abcd123')]

    >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])
    >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect()
    [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.concat(_to_seq(ctx, cols, _to_java_column))
    return Column(jc)

def array_position(col, value):
    """
    Collection function: Locates the position of the first occurrence of the given value
    in the given array. Returns null if either of the arguments are null.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing array
    value :
        value or column to look for in the array

    Notes
    -----
    The position is not zero based, but 1 based index. Returns 0 if the given
    value could not be found in the array.

    Examples
    --------
    >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data'])
    >>> df.select(array_position(df.data, "a")).collect()
    [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]
    """
    sc = SparkContext._active_spark_context
    # Unwrap a Column to its JVM counterpart, as array_contains does; a Python
    # Column object cannot be handed to the JVM call directly. Any other value
    # is passed through unchanged.
    value = value._jc if isinstance(value, Column) else value
    return Column(sc._jvm.functions.array_position(_to_java_column(col), value))

def element_at(col, extraction):
    """
    Collection function: Returns the element of the array at the index given in
    `extraction` if `col` is an array. Returns the value for the key given in
    `extraction` if `col` is a map.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing array or map
    extraction :
        index to check for in array or key to check for in map

    Notes
    -----
    The position is not zero based, but 1 based index.

    Examples
    --------
    >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
    >>> df.select(element_at(df.data, 1)).collect()
    [Row(element_at(data, 1)='a'), Row(element_at(data, 1)=None)]

    >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({},)], ['data'])
    >>> df.select(element_at(df.data, lit("a"))).collect()
    [Row(element_at(data, a)=1.0), Row(element_at(data, a)=None)]
    """
    ctx = SparkContext._active_spark_context
    # `extraction` is routed through lit() and its JVM column is what gets passed on.
    jc = ctx._jvm.functions.element_at(_to_java_column(col), lit(extraction)._jc)
    return Column(jc)

def array_remove(col, element):
    """
    Collection function: Remove all elements that equal to element from the given array.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing array
    element :
        value or column giving the element to be removed from the array

    Examples
    --------
    >>> df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ['data'])
    >>> df.select(array_remove(df.data, 1)).collect()
    [Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])]
    """
    sc = SparkContext._active_spark_context
    # Unwrap a Column to its JVM counterpart, as array_contains does; a Python
    # Column object cannot be handed to the JVM call directly. Any other value
    # is passed through unchanged.
    element = element._jc if isinstance(element, Column) else element
    return Column(sc._jvm.functions.array_remove(_to_java_column(col), element))

def array_distinct(col):
    """
    Collection function: removes duplicate values from the array.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Examples
    --------
    >>> df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data'])
    >>> df.select(array_distinct(df.data)).collect()
    [Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.array_distinct(_to_java_column(col))
    return Column(jc)

def array_intersect(col1, col2):
    """
    Collection function: returns an array of the elements in the intersection of
    col1 and col2, without duplicates.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col1 : :class:`~pyspark.sql.Column` or str
        name of column containing array
    col2 : :class:`~pyspark.sql.Column` or str
        name of column containing array

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
    >>> df.select(array_intersect(df.c1, df.c2)).collect()
    [Row(array_intersect(c1, c2)=['a', 'c'])]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.array_intersect(
        _to_java_column(col1), _to_java_column(col2)
    )
    return Column(jc)

def array_union(col1, col2):
    """
    Collection function: returns an array of the elements in the union of col1 and
    col2, without duplicates.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col1 : :class:`~pyspark.sql.Column` or str
        name of column containing array
    col2 : :class:`~pyspark.sql.Column` or str
        name of column containing array

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
    >>> df.select(array_union(df.c1, df.c2)).collect()
    [Row(array_union(c1, c2)=['b', 'a', 'c', 'd', 'f'])]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.array_union(
        _to_java_column(col1), _to_java_column(col2)
    )
    return Column(jc)

def array_except(col1, col2):
    """
    Collection function: returns an array of the elements that are in col1 but not
    in col2, without duplicates.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col1 : :class:`~pyspark.sql.Column` or str
        name of column containing array
    col2 : :class:`~pyspark.sql.Column` or str
        name of column containing array

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
    >>> df.select(array_except(df.c1, df.c2)).collect()
    [Row(array_except(c1, c2)=['b'])]
    """
    ctx = SparkContext._active_spark_context
    jc = ctx._jvm.functions.array_except(
        _to_java_column(col1), _to_java_column(col2)
    )
    return Column(jc)

def explode(col):
    """
    Returns a new row for each element in the given array or map.

    Unless specified otherwise, array elements go into a column named `col` and
    map entries go into columns named `key` and `value`.

    .. versionadded:: 1.4.0

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]

    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
    +---+-----+
    |key|value|
    +---+-----+
    |  a|    b|
    +---+-----+
    """
    ctx = SparkContext._active_spark_context
    return Column(ctx._jvm.functions.explode(_to_java_column(col)))

def posexplode(col):
    """
    Returns a new row for each element with position in the given array or map.

    Unless specified otherwise, the position goes into a column named `pos`, array
    elements into a column named `col`, and map entries into columns named `key`
    and `value`.

    .. versionadded:: 2.1.0

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(posexplode(eDF.intlist)).collect()
    [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]

    >>> eDF.select(posexplode(eDF.mapfield)).show()
    +---+---+-----+
    |pos|key|value|
    +---+---+-----+
    |  0|  a|    b|
    +---+---+-----+
    """
    ctx = SparkContext._active_spark_context
    return Column(ctx._jvm.functions.posexplode(_to_java_column(col)))

def explode_outer(col):
    """
    Returns a new row for each element in the given array or map.
    Unlike explode, if the array/map is null or empty then null is produced.
    Uses the default column name `col` for elements in the array and
    `key` and `value` for elements in the map unless specified otherwise.

    .. versionadded:: 2.3.0

    Examples
    --------
    >>> df = spark.createDataFrame(
    ...     [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ...     ("id", "an_array", "a_map")
    ... )
    >>> df.select("id", "an_array", explode_outer("a_map")).show()
    +---+----------+----+-----+
    | id|  an_array| key|value|
    +---+----------+----+-----+
    |  1|[foo, bar]|   x|  1.0|
    |  2|        []|null| null|
    |  3|      null|null| null|
    +---+----------+----+-----+

    >>> df.select("id", "a_map", explode_outer("an_array")).show()
    +---+----------+----+
    | id|     a_map| col|
    +---+----------+----+
    |  1|{x -> 1.0}| foo|
    |  1|{x -> 1.0}| bar|
    |  2|        {}|null|
    |  3|      null|null|
    +---+----------+----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.explode_outer(_to_java_column(col))
    return Column(jc)

def posexplode_outer(col):
    """
    Returns a new row for each element with position in the given array or map.
    Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced.
    Uses the default column name `pos` for position, and `col` for elements in the
    array and `key` and `value` for elements in the map unless specified otherwise.

    .. versionadded:: 2.3.0

    Examples
    --------
    >>> df = spark.createDataFrame(
    ...     [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ...     ("id", "an_array", "a_map")
    ... )
    >>> df.select("id", "an_array", posexplode_outer("a_map")).show()
    +---+----------+----+----+-----+
    | id|  an_array| pos| key|value|
    +---+----------+----+----+-----+
    |  1|[foo, bar]|   0|   x|  1.0|
    |  2|        []|null|null| null|
    |  3|      null|null|null| null|
    +---+----------+----+----+-----+
    >>> df.select("id", "a_map", posexplode_outer("an_array")).show()
    +---+----------+----+----+
    | id|     a_map| pos| col|
    +---+----------+----+----+
    |  1|{x -> 1.0}|   0| foo|
    |  1|{x -> 1.0}|   1| bar|
    |  2|        {}|null|null|
    |  3|      null|null|null|
    +---+----------+----+----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.posexplode_outer(_to_java_column(col))
    return Column(jc)

def get_json_object(col, path):
    """
    Extracts a json object from a json string based on the json path specified, and
    returns the json string of the extracted json object. It will return null if
    the input json string is invalid.

    .. versionadded:: 1.6.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        string column in json format
    path : str
        path to the json object to extract

    Examples
    --------
    >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
    >>> df = spark.createDataFrame(data, ("key", "jstring"))
    >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\
    ...                   get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
    [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]
    """
    ctx = SparkContext._active_spark_context
    return Column(ctx._jvm.functions.get_json_object(_to_java_column(col), path))

def json_tuple(col, *fields):
    """Creates a new row for a json column according to the given field names.

    .. versionadded:: 1.6.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        string column in json format
    fields : str
        fields to extract

    Examples
    --------
    >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
    >>> df = spark.createDataFrame(data, ("key", "jstring"))
    >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()
    [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]
    """
    ctx = SparkContext._active_spark_context
    return Column(
        ctx._jvm.functions.json_tuple(_to_java_column(col), _to_seq(ctx, fields))
    )

def from_json(col, schema, options=None):
    """
    Parses a column containing a JSON string into a :class:`MapType` with
    :class:`StringType` as keys type, :class:`StructType` or :class:`ArrayType` with
    the specified schema. Returns `null`, in the case of an unparseable string.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        string column in json format
    schema : :class:`DataType` or str
        a StructType or ArrayType of StructType to use when parsing the json column.

        .. versionchanged:: 2.3
            the DDL-formatted string is also supported for ``schema``.
    options : dict, optional
        options to control parsing. accepts the same options as the json datasource.
        See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
        in the version you use.

        .. # noqa

    Examples
    --------
    >>> from pyspark.sql.types import *
    >>> data = [(1, '''{"a": 1}''')]
    >>> schema = StructType([StructField("a", IntegerType())])
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> df.select(from_json(df.value, "a INT").alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()
    [Row(json={'a': 1})]
    >>> data = [(1, '''[{"a": 1}]''')]
    >>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=[Row(a=1)])]
    >>> schema = schema_of_json(lit('''{"a": 0}'''))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=Row(a=None))]
    >>> data = [(1, '''[1, 2, 3]''')]
    >>> schema = ArrayType(IntegerType())
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=[1, 2, 3])]
    """
    ctx = SparkContext._active_spark_context

    # Normalise the schema argument: a DataType becomes its JSON string, a Column
    # becomes its JVM column, and anything else is forwarded untouched.
    if isinstance(schema, DataType):
        jschema = schema.json()
    elif isinstance(schema, Column):
        jschema = _to_java_column(schema)
    else:
        jschema = schema

    return Column(
        ctx._jvm.functions.from_json(
            _to_java_column(col), jschema, _options_to_str(options)
        )
    )

def to_json(col, options=None):
    """
    Converts a column containing a :class:`StructType`, :class:`ArrayType` or a
    :class:`MapType` into a JSON string. Throws an exception, in the case of an
    unsupported type.

    .. versionadded:: 2.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing a struct, an array or a map.
    options : dict, optional
        options to control converting. accepts the same options as the JSON datasource.
        See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
        in the version you use.
        Additionally the function supports the `pretty` option which enables
        pretty JSON generation.

        .. # noqa

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> from pyspark.sql.types import *
    >>> data = [(1, Row(age=2, name='Alice'))]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_json(df.value).alias("json")).collect()
    [Row(json='{"age":2,"name":"Alice"}')]
    >>> data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_json(df.value).alias("json")).collect()
    [Row(json='[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]
    >>> data = [(1, {"name": "Alice"})]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_json(df.value).alias("json")).collect()
    [Row(json='{"name":"Alice"}')]
    >>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_json(df.value).alias("json")).collect()
    [Row(json='[{"name":"Alice"},{"name":"Bob"}]')]
    >>> data = [(1, ["Alice", "Bob"])]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_json(df.value).alias("json")).collect()
    [Row(json='["Alice","Bob"]')]
    """
    ctx = SparkContext._active_spark_context
    return Column(
        ctx._jvm.functions.to_json(_to_java_column(col), _options_to_str(options))
    )

def schema_of_json(json, options=None):
    """
    Parses a JSON string and infers its schema in DDL format.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    json : :class:`~pyspark.sql.Column` or str
        a JSON string or a foldable string column containing a JSON string.
    options : dict, optional
        options to control parsing. accepts the same options as the JSON datasource.
        See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
        in the version you use.

        .. # noqa

        .. versionchanged:: 3.0
           It accepts `options` parameter to control schema inferring.

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Raises
    ------
    TypeError
        if `json` is neither a string nor a :class:`~pyspark.sql.Column`.

    Examples
    --------
    >>> df = spark.range(1)
    >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect()
    [Row(json='STRUCT<`a`: BIGINT>')]
    >>> schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'})
    >>> df.select(schema.alias("json")).collect()
    [Row(json='STRUCT<`a`: BIGINT>')]
    """
    if isinstance(json, str):
        # A plain Python string is the JSON sample itself, so wrap it as a literal
        # rather than treating it as a column name.
        col = _create_column_from_literal(json)
    elif isinstance(json, Column):
        col = _to_java_column(json)
    else:
        # The offending argument is `json`; the message names it accordingly.
        raise TypeError("json argument should be a column or string")

    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.schema_of_json(col, _options_to_str(options))
    return Column(jc)

def schema_of_csv(csv, options=None):
    """
    Parses a CSV string and infers its schema in DDL format.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    csv : :class:`~pyspark.sql.Column` or str
        a CSV string or a foldable string column containing a CSV string.
    options : dict, optional
        options to control parsing. accepts the same options as the CSV datasource.
        See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
        in the version you use.

        .. # noqa

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Raises
    ------
    TypeError
        if `csv` is neither a string nor a :class:`~pyspark.sql.Column`.

    Examples
    --------
    >>> df = spark.range(1)
    >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()
    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
    >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()
    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
    """
    if isinstance(csv, str):
        # A plain Python string is the CSV sample itself, so wrap it as a literal
        # rather than treating it as a column name.
        col = _create_column_from_literal(csv)
    elif isinstance(csv, Column):
        col = _to_java_column(csv)
    else:
        # The offending argument is `csv`; the message names it accordingly.
        raise TypeError("csv argument should be a column or string")

    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.schema_of_csv(col, _options_to_str(options))
    return Column(jc)

def to_csv(col, options=None):
    """
    Serializes a column holding a :class:`StructType` value into a CSV string.
    An exception is thrown for unsupported types.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing a struct.
    options : dict, optional
        options to control converting. accepts the same options as the CSV datasource.
        See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
        in the version you use.

        .. # noqa

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> data = [(1, Row(age=2, name='Alice'))]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_csv(df.value).alias("csv")).collect()
    [Row(csv='2,Alice')]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    # Options are handed to the JVM as strings via _options_to_str.
    return Column(jvm_functions.to_csv(_to_java_column(col), _options_to_str(options)))

def size(col):
    """
    Collection function: returns the length of the array or map stored in the column.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
    >>> df.select(size(df.data)).collect()
    [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.size(_to_java_column(col))
    return Column(jc)

def array_min(col):
    """
    Collection function: returns the minimum value of the array.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])
    >>> df.select(array_min(df.data).alias('min')).collect()
    [Row(min=1), Row(min=-1)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.array_min(_to_java_column(col))
    return Column(jc)

def array_max(col):
    """
    Collection function: returns the maximum value of the array.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])
    >>> df.select(array_max(df.data).alias('max')).collect()
    [Row(max=3), Row(max=10)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.array_max(_to_java_column(col))
    return Column(jc)

def sort_array(col, asc=True):
    """
    Collection function: sorts the input array in ascending or descending order according
    to the natural ordering of the array elements. Null elements will be placed at the beginning
    of the returned array in ascending order or at the end of the returned array in descending
    order.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    asc : bool, optional
        sort direction flag, forwarded unchanged to the JVM ``sort_array`` function;
        defaults to ``True`` (ascending, as in the first example below).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data'])
    >>> df.select(sort_array(df.data).alias('r')).collect()
    [Row(r=[None, 1, 2, 3]), Row(r=[1]), Row(r=[])]
    >>> df.select(sort_array(df.data, asc=False).alias('r')).collect()
    [Row(r=[3, 2, 1, None]), Row(r=[1]), Row(r=[])]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.sort_array(_to_java_column(col), asc)
    return Column(jc)

def array_sort(col):
    """
    Collection function: sorts the input array in ascending order. The elements of the input array
    must be orderable. Null elements will be placed at the end of the returned array.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data'])
    >>> df.select(array_sort(df.data).alias('r')).collect()
    [Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.array_sort(_to_java_column(col))
    return Column(jc)

def shuffle(col):
    """
    Collection function: Generates a random permutation of the given array.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Notes
    -----
    The function is non-deterministic.

    Examples
    --------
    >>> df = spark.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], ['data'])
    >>> df.select(shuffle(df.data).alias('s')).collect()  # doctest: +SKIP
    [Row(s=[3, 1, 5, 20]), Row(s=[20, None, 3, 1])]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.shuffle(_to_java_column(col))
    return Column(jc)

def reverse(col):
    """
    Collection function: returns a reversed string or an array with reverse order of elements.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([('Spark SQL',)], ['data'])
    >>> df.select(reverse(df.data).alias('s')).collect()
    [Row(s='LQS krapS')]
    >>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data'])
    >>> df.select(reverse(df.data).alias('r')).collect()
    [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.reverse(_to_java_column(col))
    return Column(jc)

def flatten(col):
    """
    Collection function: creates a single array from an array of arrays.
    If a structure of nested arrays is deeper than two levels,
    only one level of nesting is removed.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ['data'])
    >>> df.select(flatten(df.data).alias('r')).collect()
    [Row(r=[1, 2, 3, 4, 5, 6]), Row(r=None)]
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.flatten(_to_java_column(col))
    return Column(jc)

def map_keys(col):
    """
    Collection function: Returns an unordered array containing the keys of the map.

    .. versionadded:: 2.3.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql.functions import map_keys
    >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
    >>> df.select(map_keys("data").alias("keys")).show()
    +------+
    |  keys|
    +------+
    |[1, 2]|
    +------+
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.map_keys(_to_java_column(col))
    return Column(jc)

def map_values(col):
    """
    Collection function: Returns an unordered array containing the values of the map.

    .. versionadded:: 2.3.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql.functions import map_values
    >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
    >>> df.select(map_values("data").alias("values")).show()
    +------+
    |values|
    +------+
    |[a, b]|
    +------+
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.map_values(_to_java_column(col))
    return Column(jc)

def map_entries(col):
    """
    Collection function: Returns an unordered array of all entries in the given map.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql.functions import map_entries
    >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
    >>> df.select(map_entries("data").alias("entries")).show()
    +----------------+
    |         entries|
    +----------------+
    |[{1, a}, {2, b}]|
    +----------------+
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.map_entries(_to_java_column(col))
    return Column(jc)

def map_from_entries(col):
    """
    Collection function: Returns a map created from the given array of entries.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql.functions import map_from_entries
    >>> df = spark.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data")
    >>> df.select(map_from_entries("data").alias("map")).show()
    +----------------+
    |             map|
    +----------------+
    |{1 -> a, 2 -> b}|
    +----------------+
    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.map_from_entries(_to_java_column(col))
    return Column(jc)

def array_repeat(col, count):
    """
    Collection function: creates an array containing a column repeated count times.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression to repeat
    count : :class:`~pyspark.sql.Column` or int
        number of repetitions. A :class:`~pyspark.sql.Column` is converted to its
        JVM counterpart; any other value is passed to the JVM function unchanged.

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([('ab',)], ['data'])
    >>> df.select(array_repeat(df.data, 3).alias('r')).collect()
    [Row(r=['ab', 'ab', 'ab'])]
    """
    jvm_array_repeat = SparkContext._active_spark_context._jvm.functions.array_repeat
    jcol = _to_java_column(col)
    if isinstance(count, Column):
        jcount = _to_java_column(count)
    else:
        jcount = count
    return Column(jvm_array_repeat(jcol, jcount))

def arrays_zip(*cols):
    """
    Collection function: Returns a merged array of structs in which the N-th struct contains all
    N-th values of input arrays.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    cols : :class:`~pyspark.sql.Column` or str
        columns of arrays to be merged.

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql.functions import arrays_zip
    >>> df = spark.createDataFrame([(([1, 2, 3], [2, 3, 4]))], ['vals1', 'vals2'])
    >>> df.select(arrays_zip(df.vals1, df.vals2).alias('zipped')).collect()
    [Row(zipped=[Row(vals1=1, vals2=2), Row(vals1=2, vals2=3), Row(vals1=3, vals2=4)])]
    """
    sc = SparkContext._active_spark_context
    jvm_arrays_zip = sc._jvm.functions.arrays_zip
    # Each input is converted to a Java column and packed into a single JVM sequence.
    jcols_seq = _to_seq(sc, cols, _to_java_column)
    return Column(jvm_arrays_zip(jcols_seq))

def map_concat(*cols):
    """Returns the union of all the given maps.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    cols : :class:`~pyspark.sql.Column` or str
        column names or :class:`~pyspark.sql.Column`\\s. A single list or set of
        columns is also accepted and is unpacked.

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> from pyspark.sql.functions import map_concat
    >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")
    >>> df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False)
    +------------------------+
    |map3                    |
    +------------------------+
    |{1 -> a, 2 -> b, 3 -> c}|
    +------------------------+
    """
    sc = SparkContext._active_spark_context
    # Allow map_concat([a, b]) / map_concat({a, b}) in addition to map_concat(a, b).
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.map_concat(_to_seq(sc, cols, _to_java_column))
    return Column(jc)

def sequence(start, stop, step=None):
    """
    Generate a sequence of integers from `start` to `stop`, incrementing by `step`.
    If `step` is not set, incrementing by 1 if `start` is less than or equal to `stop`,
    otherwise -1.

    .. versionadded:: 2.4.0

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df1 = spark.createDataFrame([(-2, 2)], ('C1', 'C2'))
    >>> df1.select(sequence('C1', 'C2').alias('r')).collect()
    [Row(r=[-2, -1, 0, 1, 2])]
    >>> df2 = spark.createDataFrame([(4, -4, -2)], ('C1', 'C2', 'C3'))
    >>> df2.select(sequence('C1', 'C2', 'C3').alias('r')).collect()
    [Row(r=[4, 2, 0, -2, -4])]
    """
    sc = SparkContext._active_spark_context
    # `step` is only forwarded when given, so the two-argument JVM call is used otherwise.
    bounds = (start, stop) if step is None else (start, stop, step)
    jvm_sequence = sc._jvm.functions.sequence
    return Column(jvm_sequence(*[_to_java_column(bound) for bound in bounds]))

def from_csv(col, schema, options=None):
    """
    Parses a column containing a CSV string to a row with the specified schema.
    Returns `null`, in the case of an unparseable string.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        string column in CSV format
    schema : :class:`~pyspark.sql.Column` or str
        a string with schema in DDL format to use when parsing the CSV column.
    options : dict, optional
        options to control parsing. accepts the same options as the CSV datasource.
        See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
        in the version you use.

        .. # noqa

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Raises
    ------
    TypeError
        if `schema` is neither a string nor a :class:`~pyspark.sql.Column`.

    Examples
    --------
    >>> data = [("1,2,3",)]
    >>> df = spark.createDataFrame(data, ("value",))
    >>> df.select(from_csv(df.value, "a INT, b INT, c INT").alias("csv")).collect()
    [Row(csv=Row(a=1, b=2, c=3))]
    >>> value = data[0][0]
    >>> df.select(from_csv(df.value, schema_of_csv(value)).alias("csv")).collect()
    [Row(csv=Row(_c0=1, _c1=2, _c2=3))]
    >>> data = [("   abc",)]
    >>> df = spark.createDataFrame(data, ("value",))
    >>> options = {'ignoreLeadingWhiteSpace': True}
    >>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect()
    [Row(csv=Row(s='abc'))]
    """
    sc = SparkContext._active_spark_context
    if isinstance(schema, str):
        # A plain string is the DDL schema text itself, so wrap it as a literal
        # rather than treating it as a column name.
        schema = _create_column_from_literal(schema)
    elif isinstance(schema, Column):
        schema = _to_java_column(schema)
    else:
        raise TypeError("schema argument should be a column or string")

    jc = sc._jvm.functions.from_csv(_to_java_column(col), schema, _options_to_str(options))
    return Column(jc)

def _unresolved_named_lambda_variable(*name_parts):
    """
    Create `o.a.s.sql.expressions.UnresolvedNamedLambdaVariable`,
    convert it to o.s.sql.Column and wrap in Python `Column`

    Parameters
    ----------
    name_parts : str
        parts of the variable name; they are packed into a JVM sequence and
        passed to the ``UnresolvedNamedLambdaVariable`` constructor.

    Returns
    -------
    :class:`~pyspark.sql.Column`
    """
    sc = SparkContext._active_spark_context
    name_parts_seq = _to_seq(sc, name_parts)
    expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions
    # Catalyst expression -> JVM Column -> Python Column.
    return Column(
        sc._jvm.Column(
            expressions.UnresolvedNamedLambdaVariable(name_parts_seq)
        )
    )

def _get_lambda_parameters(f):

import inspect

signature = inspect.signature(f)

parameters = signature.parameters.values()

# We should exclude functions that use

# variable args and keyword argnames

# as well as keyword only args

supported_parameter_types = {

inspect.Parameter.POSITIONAL_OR_KEYWORD,

inspect.Parameter.POSITIONAL_ONLY,

}

# Validate that

# function arity is between 1 and 3

if not (1 <= len(parameters) <= 3):

raise ValueError(

"f should take between 1 and 3 arguments, but provided function takes {}".format(

len(parameters)

)

# and all arguments can be used as positional

if not all(p.kind in supported_parameter_types for p in parameters):

raise ValueError(

"f should use only POSITIONAL or POSITIONAL OR KEYWORD arguments"

)

return parameters

def _create_lambda(f):
    """
    Create `o.a.s.sql.expressions.LambdaFunction` corresponding
    to transformation described by f

    :param f: A Python function of one of the following forms:
            - (Column) -> Column: ...
            - (Column, Column) -> Column: ...
            - (Column, Column, Column) -> Column: ...
    :raises ValueError: if calling `f` does not produce a `Column`
    """
    arity = len(_get_lambda_parameters(f))

    sc = SparkContext._active_spark_context
    expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions

    # One placeholder variable per parameter of f, named from x, y, z via freshVarName.
    lambda_vars = []
    for base_name in ["x", "y", "z"][:arity]:
        fresh_name = expressions.UnresolvedNamedLambdaVariable.freshVarName(base_name)
        lambda_vars.append(_unresolved_named_lambda_variable(fresh_name))

    # Calling f on the placeholders yields the Column expression that forms the lambda body.
    body = f(*lambda_vars)
    if not isinstance(body, Column):
        raise ValueError("f should return Column, got {}".format(type(body)))

    body_expr = body._jc.expr()
    var_exprs = _to_seq(sc, [var._jc.expr() for var in lambda_vars])
    return expressions.LambdaFunction(body_expr, var_exprs, False)

def _invoke_higher_order_function(name, cols, funs):
    """
    Invokes expression identified by name,
    (relative to ``org.apache.spark.sql.catalyst.expressions``)
    and wraps the result with Column (first Scala one, then Python).

    :param name: Name of the expression
    :param cols: a list of columns
    :param funs: a list of ``(*Column) -> Column`` functions.

    :return: a Column
    """
    sc = SparkContext._active_spark_context
    catalyst_expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions
    expression_ctor = getattr(catalyst_expressions, name)

    # Column arguments come first, followed by the lambda functions.
    column_exprs = [_to_java_column(column).expr() for column in cols]
    lambda_exprs = [_create_lambda(fun) for fun in funs]
    ctor_args = column_exprs + lambda_exprs

    return Column(sc._jvm.Column(expression_ctor(*ctor_args)))

def transform(col, f):
    """
    Returns an array of elements after applying a transformation to each element in the input array.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        a function that is applied to each element of the input array.
        Can take one of the following forms:

        - Unary ``(x: Column) -> Column: ...``
        - Binary ``(x: Column, i: Column) -> Column...``, where the second argument is
            a 0-based index of the element.

        and can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, [1, 2, 3, 4])], ("key", "values"))
    >>> df.select(transform("values", lambda x: x * 2).alias("doubled")).show()
    +------------+
    |     doubled|
    +------------+
    |[2, 4, 6, 8]|
    +------------+

    >>> def alternate(x, i):
    ...     return when(i % 2 == 0, x).otherwise(-x)
    >>> df.select(transform("values", alternate).alias("alternated")).show()
    +--------------+
    |    alternated|
    +--------------+
    |[1, -2, 3, -4]|
    +--------------+
    """
    return _invoke_higher_order_function(name="ArrayTransform", cols=[col], funs=[f])

def exists(col, f):
    """
    Returns whether a predicate holds for one or more elements in the array.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        ``(x: Column) -> Column: ...``  returning the Boolean expression.
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],("key", "values"))
    >>> df.select(exists("values", lambda x: x < 0).alias("any_negative")).show()
    +------------+
    |any_negative|
    +------------+
    |       false|
    |        true|
    +------------+
    """
    return _invoke_higher_order_function(name="ArrayExists", cols=[col], funs=[f])

def forall(col, f):
    """
    Returns whether a predicate holds for every element in the array.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        ``(x: Column) -> Column: ...``  returning the Boolean expression.
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame(
    ...     [(1, ["bar"]), (2, ["foo", "bar"]), (3, ["foobar", "foo"])],
    ...     ("key", "values")
    ... )
    >>> df.select(forall("values", lambda x: x.rlike("foo")).alias("all_foo")).show()
    +-------+
    |all_foo|
    +-------+
    |  false|
    |  false|
    |   true|
    +-------+
    """
    return _invoke_higher_order_function(name="ArrayForAll", cols=[col], funs=[f])

def filter(col, f):
    """
    Returns an array of elements for which a predicate holds in a given array.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        A function that returns the Boolean expression.
        Can take one of the following forms:

        - Unary ``(x: Column) -> Column: ...``
        - Binary ``(x: Column, i: Column) -> Column...``, where the second argument is
            a 0-based index of the element.

        and can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame(
    ...     [(1, ["2018-09-20",  "2019-02-03", "2019-07-01", "2020-06-01"])],
    ...     ("key", "values")
    ... )
    >>> def after_second_quarter(x):
    ...     return month(to_date(x)) > 6
    >>> df.select(
    ...     filter("values", after_second_quarter).alias("after_second_quarter")
    ... ).show(truncate=False)
    +------------------------+
    |after_second_quarter    |
    +------------------------+
    |[2018-09-20, 2019-07-01]|
    +------------------------+
    """
    return _invoke_higher_order_function(name="ArrayFilter", cols=[col], funs=[f])

def aggregate(col, initialValue, merge, finish=None):
    """
    Applies a binary operator to an initial state and all elements in the array,
    and reduces this to a single state. The final state is converted into the final result
    by applying a finish function.

    Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in
    :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
    Python ``UserDefinedFunctions`` are not supported
    (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    initialValue : :class:`~pyspark.sql.Column` or str
        initial value. Name of column or expression
    merge : function
        a binary function ``(acc: Column, x: Column) -> Column...`` returning expression
        of the same type as ``initialValue``
    finish : function
        an optional unary function ``(x: Column) -> Column: ...``
        used to convert accumulated value.

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values"))
    >>> df.select(aggregate("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show()
    +----+
    | sum|
    +----+
    |42.0|
    +----+

    >>> def merge(acc, x):
    ...     count = acc.count + 1
    ...     sum = acc.sum + x
    ...     return struct(count.alias("count"), sum.alias("sum"))
    >>> df.select(
    ...     aggregate(
    ...         "values",
    ...         struct(lit(0).alias("count"), lit(0.0).alias("sum")),
    ...         merge,
    ...         lambda acc: acc.sum / acc.count,
    ...     ).alias("mean")
    ... ).show()
    +----+
    |mean|
    +----+
    | 8.4|
    +----+
    """
    # The finish function is only passed on when the caller supplied one.
    funs = [merge] if finish is None else [merge, finish]
    return _invoke_higher_order_function("ArrayAggregate", [col, initialValue], funs)

def zip_with(left, right, f):
    """
    Merge two given arrays, element-wise, into a single array using a function.
    If one array is shorter, nulls are appended at the end to match the length of the longer
    array, before applying the function.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    left : :class:`~pyspark.sql.Column` or str
        name of the first column or expression
    right : :class:`~pyspark.sql.Column` or str
        name of the second column or expression
    f : function
        a binary function ``(x1: Column, x2: Column) -> Column...``
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], ("id", "xs", "ys"))
    >>> df.select(zip_with("xs", "ys", lambda x, y: x ** y).alias("powers")).show(truncate=False)
    +---------------------------+
    |powers                     |
    +---------------------------+
    |[1.0, 9.0, 625.0, 262144.0]|
    +---------------------------+

    >>> df = spark.createDataFrame([(1, ["foo", "bar"], [1, 2, 3])], ("id", "xs", "ys"))
    >>> df.select(zip_with("xs", "ys", lambda x, y: concat_ws("_", x, y)).alias("xs_ys")).show()
    +-----------------+
    |            xs_ys|
    +-----------------+
    |[foo_1, bar_2, 3]|
    +-----------------+
    """
    return _invoke_higher_order_function(name="ZipWith", cols=[left, right], funs=[f])

def transform_keys(col, f):
    """
    Applies a function to every key-value pair in a map and returns
    a map with the results of those applications as the new keys for the pairs.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        a binary function ``(k: Column, v: Column) -> Column...``
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, {"foo": -2.0, "bar": 2.0})], ("id", "data"))
    >>> df.select(transform_keys(
    ...     "data", lambda k, _: upper(k)).alias("data_upper")
    ... ).show(truncate=False)
    +-------------------------+
    |data_upper               |
    +-------------------------+
    |{BAR -> 2.0, FOO -> -2.0}|
    +-------------------------+
    """
    return _invoke_higher_order_function(name="TransformKeys", cols=[col], funs=[f])

def transform_values(col, f):
    """
    Applies a function to every key-value pair in a map and returns
    a map with the results of those applications as the new values for the pairs.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        a binary function ``(k: Column, v: Column) -> Column...``
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, {"IT": 10.0, "SALES": 2.0, "OPS": 24.0})], ("id", "data"))
    >>> df.select(transform_values(
    ...     "data", lambda k, v: when(k.isin("IT", "OPS"), v + 10.0).otherwise(v)
    ... ).alias("new_data")).show(truncate=False)
    +---------------------------------------+
    |new_data                               |
    +---------------------------------------+
    |{OPS -> 34.0, IT -> 20.0, SALES -> 2.0}|
    +---------------------------------------+
    """
    return _invoke_higher_order_function(name="TransformValues", cols=[col], funs=[f])

def map_filter(col, f):
    """
    Returns a map whose key-value pairs satisfy a predicate.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column or expression
    f : function
        a binary function ``(k: Column, v: Column) -> Column...``
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data"))
    >>> df.select(map_filter(
    ...     "data", lambda _, v: v > 30.0).alias("data_filtered")
    ... ).show(truncate=False)
    +--------------------------+
    |data_filtered             |
    +--------------------------+
    |{baz -> 32.0, foo -> 42.0}|
    +--------------------------+
    """
    return _invoke_higher_order_function(name="MapFilter", cols=[col], funs=[f])

def map_zip_with(col1, col2, f):
    """
    Merge two given maps, key-wise into a single map using a function.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col1 : :class:`~pyspark.sql.Column` or str
        name of the first column or expression
    col2 : :class:`~pyspark.sql.Column` or str
        name of the second column or expression
    f : function
        a ternary function ``(k: Column, v1: Column, v2: Column) -> Column...``
        Can use methods of :class:`~pyspark.sql.Column`, functions defined in
        :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.
        Python ``UserDefinedFunctions`` are not supported
        (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).

    Returns
    -------
    :class:`~pyspark.sql.Column`

    Examples
    --------
    >>> df = spark.createDataFrame([
    ...     (1, {"IT": 24.0, "SALES": 12.00}, {"IT": 2.0, "SALES": 1.4})],
    ...     ("id", "base", "ratio")
    ... )
    >>> df.select(map_zip_with(
    ...     "base", "ratio", lambda k, v1, v2: round(v1 * v2, 2)).alias("updated_data")
    ... ).show(truncate=False)
    +---------------------------+
    |updated_data               |
    +---------------------------+
    |{SALES -> 16.8, IT -> 48.0}|
    +---------------------------+
    """
    return _invoke_higher_order_function(name="MapZipWith", cols=[col1, col2], funs=[f])

# ---------------------- Partition transform functions --------------------------------

def years(col):
    """
    Partition transform function: A transform for timestamps and dates
    to partition data into years.

    .. versionadded:: 3.1.0

    Examples
    --------
    >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
    ...     years("ts")
    ... ).createOrReplace()

    Notes
    -----
    This function can be used only in combination with
    :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
    method of the `DataFrameWriterV2`.

    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.years(_to_java_column(col))
    return Column(jc)

def months(col):
    """
    Partition transform function: A transform for timestamps and dates
    to partition data into months.

    .. versionadded:: 3.1.0

    Examples
    --------
    >>> df.writeTo("catalog.db.table").partitionedBy(
    ...     months("ts")
    ... ).createOrReplace()  # doctest: +SKIP

    Notes
    -----
    This function can be used only in combination with
    :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
    method of the `DataFrameWriterV2`.

    """
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    jc = jvm_functions.months(_to_java_column(col))
    return Column(jc)

def days(col):
    """
    Partition transform function: A transform for timestamps and dates
    to partition data into days.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        the date or timestamp column (or its name) to transform.

    Returns
    -------
    :class:`~pyspark.sql.Column`
        the result of the JVM ``functions.days`` call wrapped in a Column.

    Examples
    --------
    >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
    ...     days("ts")
    ... ).createOrReplace()

    Notes
    -----
    This function can be used only in combination with
    :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
    method of the `DataFrameWriterV2`.
    """
    # Look up the JVM-side function before converting the argument, so the
    # evaluation order matches a direct ``sc._jvm.functions.days(...)`` call.
    jvm_days = SparkContext._active_spark_context._jvm.functions.days
    return Column(jvm_days(_to_java_column(col)))

def hours(col):
    """
    Partition transform function: A transform for timestamps
    to partition data into hours.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        the timestamp column (or its name) to transform.

    Returns
    -------
    :class:`~pyspark.sql.Column`
        the result of the JVM ``functions.hours`` call wrapped in a Column.

    Examples
    --------
    >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
    ...     hours("ts")
    ... ).createOrReplace()

    Notes
    -----
    This function can be used only in combination with
    :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
    method of the `DataFrameWriterV2`.
    """
    # Look up the JVM-side function before converting the argument, so the
    # evaluation order matches a direct ``sc._jvm.functions.hours(...)`` call.
    jvm_hours = SparkContext._active_spark_context._jvm.functions.hours
    return Column(jvm_hours(_to_java_column(col)))

def bucket(numBuckets, col):
    """
    Partition transform function: A transform for any type that partitions
    by a hash of the input column.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    numBuckets : :class:`~pyspark.sql.Column` or int
        the number of buckets; an ``int`` is turned into a literal column.
    col : :class:`~pyspark.sql.Column` or str
        the column (or its name) to transform.

    Returns
    -------
    :class:`~pyspark.sql.Column`
        the result of the JVM ``functions.bucket`` call wrapped in a Column.

    Raises
    ------
    TypeError
        if ``numBuckets`` is neither an ``int`` nor a
        :class:`~pyspark.sql.Column`.

    Examples
    --------
    >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
    ...     bucket(42, "ts")
    ... ).createOrReplace()

    Notes
    -----
    This function can be used only in combination with
    :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
    method of the `DataFrameWriterV2`.
    """
    # Reject anything that is neither a plain integer nor a Column up front.
    if not isinstance(numBuckets, (int, Column)):
        raise TypeError(
            "numBuckets should be a Column or an int, got {}".format(type(numBuckets))
        )

    # Integers become literal columns; Column objects are unwrapped to their
    # JVM counterpart.
    if isinstance(numBuckets, int):
        jvm_num_buckets = _create_column_from_literal(numBuckets)
    else:
        jvm_num_buckets = _to_java_column(numBuckets)

    jvm_bucket = SparkContext._active_spark_context._jvm.functions.bucket
    return Column(jvm_bucket(jvm_num_buckets, _to_java_column(col)))

# ---------------------------- User Defined Function ----------------------------------

def udf(f=None, returnType=StringType()):
    """Creates a user defined function (UDF).

    .. versionadded:: 1.3.0

    Parameters
    ----------
    f : function
        python function if used as a standalone function. When ``udf`` is used
        as a decorator factory this slot may instead be ``None`` or carry the
        return type (a :class:`pyspark.sql.types.DataType` or a ``str``).
    returnType : :class:`pyspark.sql.types.DataType` or str
        the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    Returns
    -------
    function
        the result of ``_create_udf`` when ``f`` is a function; otherwise a
        :func:`functools.partial` of ``_create_udf`` that still expects the
        function to wrap (the decorator forms).

    Examples
    --------
    >>> from pyspark.sql.types import IntegerType
    >>> slen = udf(lambda s: len(s), IntegerType())
    >>> @udf
    ... def to_upper(s):
    ...     if s is not None:
    ...         return s.upper()
    ...
    >>> @udf(returnType=IntegerType())
    ... def add_one(x):
    ...     if x is not None:
    ...         return x + 1
    ...
    >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
    >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()
    +----------+--------------+------------+
    |slen(name)|to_upper(name)|add_one(age)|
    +----------+--------------+------------+
    |         8|      JOHN DOE|          22|
    +----------+--------------+------------+

    Notes
    -----
    The user-defined functions are considered deterministic by default. Due to
    optimization, duplicate invocations may be eliminated or the function may even be invoked
    more times than it is present in the query. If your function is not deterministic, call
    `asNondeterministic` on the user defined function. E.g.:

    >>> from pyspark.sql.types import IntegerType
    >>> import random
    >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()

    The user-defined functions do not support conditional expressions or short circuiting
    in boolean expressions and it ends up with being executed all internally. If the functions
    can fail on special rows, the workaround is to incorporate the condition into the functions.

    The user-defined functions do not take keyword arguments on the calling side.
    """

    # The following table shows most of Python data and SQL type conversions in normal UDFs that
    # are not yet visible to the user. Some of the behaviors are buggy and might be changed in the
    # near future. The table might have to be eventually documented externally.
    # Please see SPARK-28131's PR to see the codes in order to generate the table below.
    #
    # +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+  # noqa
    # |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)|         a(str)|    1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)|         (1,)(tuple)|bytearray(b'ABC')(bytearray)|  1(Decimal)|{'a': 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)|  # noqa
    # |                       string|          None|    'true'|   '1'|            'a'|'java.util.Gregor...|         'java.util.Gregor...|     '1.0'|         '[I@66cbb73a'|    '[1]'|'[Ljava.lang.Obje...|               '[B@5a51eb1a'|         '1'|       '{a=1}'|                 X|                     X|  # noqa
    # |                         date|          None|         X|     X|              X|datetime.date(197...|         datetime.date(197...|         X|                     X|        X|                   X|                           X|           X|             X|                 X|                     X|  # noqa
    # |                    timestamp|          None|         X|     X|              X|                   X|         datetime.datetime...|         X|                     X|        X|                   X|                           X|           X|             X|                 X|                     X|  # noqa
    # |              map<string,int>|          None|      None|  None|           None|                None|                         None|      None|                  None|     None|                None|                        None|        None|      {'a': 1}|                 X|                     X|  # noqa
    # |               struct<_1:int>|          None|         X|     X|              X|                   X|                            X|         X|                     X|Row(_1=1)|           Row(_1=1)|                           X|           X|  Row(_1=None)|         Row(_1=1)|             Row(_1=1)|  # noqa
    #
    # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be
    #       used in `returnType`.
    # Note: The values inside of the table are generated by `repr`.
    # Note: 'X' means it throws an exception during the conversion.
    # Note: Python 3.7.3 is used.

    # Supported spellings: udf(func, ...), @udf, @udf(), @udf(dataType())
    called_as_decorator_factory = f is None or isinstance(f, (str, DataType))

    if not called_as_decorator_factory:
        # A function was handed in directly: wrap it right away.
        return _create_udf(
            f=f,
            returnType=returnType,
            evalType=PythonEvalType.SQL_BATCHED_UDF,
        )

    # No function yet. If a type (DataType or DDL string) was passed in the
    # first positional slot, it takes the place of ``returnType``; a falsy
    # value there falls back to ``returnType``.
    return functools.partial(
        _create_udf,
        returnType=f or returnType,
        evalType=PythonEvalType.SQL_BATCHED_UDF,
    )

def _test():
    """Run the doctests of ``pyspark.sql.functions`` against a local session.

    A ``local[4]`` Spark session is created (or reused), and the doctest
    namespace is seeded with ``sc``, ``spark`` and a small two-row ``df`` on
    top of a copy of the module's globals. The session is stopped after the
    run, and the process exits with status ``-1`` if any doctest failed.
    """
    import doctest
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.functions

    target_module = pyspark.sql.functions
    # Work on a copy so the fixtures below never leak into the real module.
    namespace = target_module.__dict__.copy()

    session = (
        SparkSession.builder
        .master("local[4]")
        .appName("sql.functions tests")
        .getOrCreate()
    )

    # Fixtures that the docstring examples refer to by name.
    namespace.update(
        sc=session.sparkContext,
        spark=session,
        df=session.createDataFrame([Row(age=2, name='Alice'), Row(age=5, name='Bob')]),
    )

    failures, _attempted = doctest.testmod(
        target_module,
        globs=namespace,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    session.stop()

    if failures:
        sys.exit(-1)

# Run the module's doctest suite only when this file is executed as a script.
if __name__ == "__main__":
    _test()

Coverage for pyspark/sql/functions.py : 90%

866 statements 788 run 78 missing 0 excluded 21 partial