# 

# Licensed to the Apache Software Foundation (ASF) under one or more 

# contributor license agreements. See the NOTICE file distributed with 

# this work for additional information regarding copyright ownership. 

# The ASF licenses this file to You under the Apache License, Version 2.0 

# (the "License"); you may not use this file except in compliance with 

# the License. You may obtain a copy of the License at 

# 

# http://www.apache.org/licenses/LICENSE-2.0 

# 

# Unless required by applicable law or agreed to in writing, software 

# distributed under the License is distributed on an "AS IS" BASIS, 

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

# See the License for the specific language governing permissions and 

# limitations under the License. 

# 

 

import sys 

import json 

import warnings 

 

from pyspark import copy_func 

from pyspark.context import SparkContext 

from pyspark.sql.types import DataType, StructField, StructType, IntegerType, StringType 

 

__all__ = ["Column"] 

 

 

def _create_column_from_literal(literal): 

sc = SparkContext._active_spark_context 

return sc._jvm.functions.lit(literal) 

 

 

def _create_column_from_name(name): 

sc = SparkContext._active_spark_context 

return sc._jvm.functions.col(name) 

 

 

def _to_java_column(col): 

if isinstance(col, Column): 

jcol = col._jc 

elif isinstance(col, str): 

jcol = _create_column_from_name(col) 

else: 

raise TypeError( 

"Invalid argument, not a string or column: " 

"{0} of type {1}. " 

"For column literals, use 'lit', 'array', 'struct' or 'create_map' " 

"function.".format(col, type(col))) 

return jcol 
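
# A minimal usage sketch, assuming an active SparkContext and a DataFrame
# ``df`` with a ``name`` column; this helper is hypothetical and is not used
# by Spark itself. Both calls resolve to the same kind of JVM column.
def _to_java_column_sketch(df):
    from_name = _to_java_column("name")        # built via sc._jvm.functions.col
    from_column = _to_java_column(df["name"])  # unwraps the existing Column's _jc
    return from_name, from_column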

 

 

def _to_seq(sc, cols, converter=None): 

""" 

Convert a list of Column (or names) into a JVM Seq of Column. 

 

    An optional `converter` can be used to convert items in `cols`

into JVM Column objects. 

""" 

if converter: 

cols = [converter(c) for c in cols] 

return sc._jvm.PythonUtils.toSeq(cols) 

 

 

def _to_list(sc, cols, converter=None): 

""" 

Convert a list of Column (or names) into a JVM (Scala) List of Column. 

 

    An optional `converter` can be used to convert items in `cols`

into JVM Column objects. 

""" 

    if converter:

cols = [converter(c) for c in cols] 

return sc._jvm.PythonUtils.toList(cols) 
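
# A sketch of how these helpers are typically combined, assuming an active
# SparkContext ``sc``; ``_to_seq_sketch`` is hypothetical, though the DataFrame
# APIs do pass _to_java_column as the converter when inputs mix names and
# Columns.
def _to_seq_sketch(sc, df):
    cols = ["name", df["age"]]                 # column names and Columns mixed
    return _to_seq(sc, cols, _to_java_column)  # -> JVM Seq[Column]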

 

 

def _unary_op(name, doc="unary operator"): 

""" Create a method for given unary operator """ 

def _(self): 

jc = getattr(self._jc, name)() 

return Column(jc) 

_.__doc__ = doc 

return _ 

 

 

def _func_op(name, doc=''): 

def _(self): 

sc = SparkContext._active_spark_context 

jc = getattr(sc._jvm.functions, name)(self._jc) 

return Column(jc) 

_.__doc__ = doc 

return _ 

 

 

def _bin_func_op(name, reverse=False, doc="binary function"): 

def _(self, other): 

sc = SparkContext._active_spark_context 

fn = getattr(sc._jvm.functions, name) 

jc = other._jc if isinstance(other, Column) else _create_column_from_literal(other) 

njc = fn(self._jc, jc) if not reverse else fn(jc, self._jc) 

return Column(njc) 

_.__doc__ = doc 

return _ 

 

 

def _bin_op(name, doc="binary operator"): 

""" Create a method for given binary operator 

""" 

def _(self, other): 

jc = other._jc if isinstance(other, Column) else other 

njc = getattr(self._jc, name)(jc) 

return Column(njc) 

_.__doc__ = doc 

return _ 

 

 

def _reverse_op(name, doc="binary operator"): 

""" Create a method for binary operator (this object is on right side) 

""" 

def _(self, other): 

jother = _create_column_from_literal(other) 

jc = getattr(jother, name)(self._jc) 

return Column(jc) 

_.__doc__ = doc 

return _ 
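
# A sketch of the factory pattern above: each helper manufactures a method that
# forwards to the JVM Column. With ``__add__ = _bin_op("plus")`` (assigned in
# the class below), ``df.age + 1`` is roughly equivalent to:
#
#     Column(df.age._jc.plus(1))
#
# while a reverse form such as ``__rsub__ = _reverse_op("minus")`` first lifts
# the Python literal with functions.lit before invoking minus on it.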

 

 

class Column(object): 

 

""" 

A column in a DataFrame. 

 

:class:`Column` instances can be created by:: 

 

# 1. Select a column out of a DataFrame 

 

df.colName 

df["colName"] 

 

# 2. Create from an expression 

df.colName + 1 

1 / df.colName 

 

.. versionadded:: 1.3.0 

""" 

 

def __init__(self, jc): 

self._jc = jc 

 

# arithmetic operators 

__neg__ = _func_op("negate") 

__add__ = _bin_op("plus") 

__sub__ = _bin_op("minus") 

__mul__ = _bin_op("multiply") 

__div__ = _bin_op("divide") 

__truediv__ = _bin_op("divide") 

__mod__ = _bin_op("mod") 

__radd__ = _bin_op("plus") 

__rsub__ = _reverse_op("minus") 

__rmul__ = _bin_op("multiply") 

__rdiv__ = _reverse_op("divide") 

__rtruediv__ = _reverse_op("divide") 

__rmod__ = _reverse_op("mod") 

__pow__ = _bin_func_op("pow") 

__rpow__ = _bin_func_op("pow", reverse=True) 

 

    # comparison operators

__eq__ = _bin_op("equalTo") 

__ne__ = _bin_op("notEqual") 

__lt__ = _bin_op("lt") 

__le__ = _bin_op("leq") 

__ge__ = _bin_op("geq") 

__gt__ = _bin_op("gt") 

 

_eqNullSafe_doc = """ 

Equality test that is safe for null values. 

 

.. versionadded:: 2.3.0 

 

Parameters 

---------- 

other 

a value or :class:`Column` 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df1 = spark.createDataFrame([ 

... Row(id=1, value='foo'), 

... Row(id=2, value=None) 

... ]) 

>>> df1.select( 

... df1['value'] == 'foo', 

... df1['value'].eqNullSafe('foo'), 

... df1['value'].eqNullSafe(None) 

... ).show() 

+-------------+---------------+----------------+ 

|(value = foo)|(value <=> foo)|(value <=> NULL)| 

+-------------+---------------+----------------+ 

| true| true| false| 

| null| false| true| 

+-------------+---------------+----------------+ 

>>> df2 = spark.createDataFrame([ 

... Row(value = 'bar'), 

... Row(value = None) 

... ]) 

>>> df1.join(df2, df1["value"] == df2["value"]).count() 

0 

>>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count() 

1 

>>> df2 = spark.createDataFrame([ 

... Row(id=1, value=float('NaN')), 

... Row(id=2, value=42.0), 

... Row(id=3, value=None) 

... ]) 

>>> df2.select( 

... df2['value'].eqNullSafe(None), 

... df2['value'].eqNullSafe(float('NaN')), 

... df2['value'].eqNullSafe(42.0) 

... ).show() 

+----------------+---------------+----------------+ 

|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)| 

+----------------+---------------+----------------+ 

| false| true| false| 

| false| false| true| 

| true| false| false| 

+----------------+---------------+----------------+ 

 

Notes 

----- 

Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the 

`NaN Semantics <https://spark.apache.org/docs/latest/sql-ref-datatypes.html#nan-semantics>`_ 

for details. 

""" 

eqNullSafe = _bin_op("eqNullSafe", _eqNullSafe_doc) 

 

# `and`, `or`, `not` cannot be overloaded in Python, 

# so use bitwise operators as boolean operators 

__and__ = _bin_op('and') 

__or__ = _bin_op('or') 

__invert__ = _func_op('not') 

__rand__ = _bin_op("and") 

__ror__ = _bin_op("or") 
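
    # Note that & and | bind more tightly than comparisons in Python, so
    # compound predicates need parentheses. A minimal sketch, assuming the
    # sample df used by this module's doctests:
    #
    #     df.filter((df.age > 3) & (df.name == 'Bob'))  # correct
    #     df.filter(df.age > 3 & df.name == 'Bob')      # wrong: 3 & df.name binds first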

 

# container operators 

def __contains__(self, item): 

raise ValueError("Cannot apply 'in' operator against a column: please use 'contains' " 

"in a string column or 'array_contains' function for an array column.") 

 

# bitwise operators 

_bitwiseOR_doc = """ 

Compute bitwise OR of this expression with another expression. 

 

Parameters 

---------- 

other 

a value or :class:`Column` to calculate bitwise or(|) with 

this :class:`Column`. 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([Row(a=170, b=75)]) 

>>> df.select(df.a.bitwiseOR(df.b)).collect() 

[Row((a | b)=235)] 

""" 

_bitwiseAND_doc = """ 

Compute bitwise AND of this expression with another expression. 

 

Parameters 

---------- 

other 

a value or :class:`Column` to calculate bitwise and(&) with 

this :class:`Column`. 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([Row(a=170, b=75)]) 

>>> df.select(df.a.bitwiseAND(df.b)).collect() 

[Row((a & b)=10)] 

""" 

_bitwiseXOR_doc = """ 

Compute bitwise XOR of this expression with another expression. 

 

Parameters 

---------- 

other 

a value or :class:`Column` to calculate bitwise xor(^) with 

this :class:`Column`. 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([Row(a=170, b=75)]) 

>>> df.select(df.a.bitwiseXOR(df.b)).collect() 

[Row((a ^ b)=225)] 

""" 

 

bitwiseOR = _bin_op("bitwiseOR", _bitwiseOR_doc) 

bitwiseAND = _bin_op("bitwiseAND", _bitwiseAND_doc) 

bitwiseXOR = _bin_op("bitwiseXOR", _bitwiseXOR_doc) 

 

def getItem(self, key): 

""" 

        An expression that gets an item at position ``key`` out of a list,

or gets an item by key out of a dict. 

 

.. versionadded:: 1.3.0 

 

Examples 

-------- 

>>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"]) 

>>> df.select(df.l.getItem(0), df.d.getItem("key")).show() 

+----+------+ 

|l[0]|d[key]| 

+----+------+ 

| 1| value| 

+----+------+ 

""" 

        if isinstance(key, Column):

warnings.warn( 

"A column as 'key' in getItem is deprecated as of Spark 3.0, and will not " 

"be supported in the future release. Use `column[key]` or `column.key` syntax " 

"instead.", 

FutureWarning 

) 

return self[key] 
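
    # Equivalent sugar via __getitem__ (defined below), assuming the df from
    # the example above: df.l[0] and df.d["key"] build the same expressions.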

 

def getField(self, name): 

""" 

An expression that gets a field by name in a :class:`StructType`. 

 

.. versionadded:: 1.3.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))]) 

>>> df.select(df.r.getField("b")).show() 

+---+ 

|r.b| 

+---+ 

| b| 

+---+ 

>>> df.select(df.r.a).show() 

+---+ 

|r.a| 

+---+ 

| 1| 

+---+ 

""" 

        if isinstance(name, Column):

warnings.warn( 

"A column as 'name' in getField is deprecated as of Spark 3.0, and will not " 

"be supported in the future release. Use `column[name]` or `column.name` syntax " 

"instead.", 

FutureWarning 

) 

return self[name] 

 

def withField(self, fieldName, col): 

""" 

An expression that adds/replaces a field in :class:`StructType` by name. 

 

.. versionadded:: 3.1.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> from pyspark.sql.functions import lit 

>>> df = spark.createDataFrame([Row(a=Row(b=1, c=2))]) 

>>> df.withColumn('a', df['a'].withField('b', lit(3))).select('a.b').show() 

+---+ 

| b| 

+---+ 

| 3| 

+---+ 

>>> df.withColumn('a', df['a'].withField('d', lit(4))).select('a.d').show() 

+---+ 

| d| 

+---+ 

| 4| 

+---+ 

""" 

if not isinstance(fieldName, str): 

raise TypeError("fieldName should be a string") 

 

if not isinstance(col, Column): 

raise TypeError("col should be a Column") 

 

return Column(self._jc.withField(fieldName, col._jc)) 

 

def dropFields(self, *fieldNames): 

""" 

An expression that drops fields in :class:`StructType` by name. 

        This is a no-op if the schema doesn't contain the field name(s).

 

.. versionadded:: 3.1.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> from pyspark.sql.functions import col, lit 

>>> df = spark.createDataFrame([ 

... Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))]) 

>>> df.withColumn('a', df['a'].dropFields('b')).show() 

+-----------------+ 

| a| 

+-----------------+ 

|{2, 3, {4, 5, 6}}| 

+-----------------+ 

 

>>> df.withColumn('a', df['a'].dropFields('b', 'c')).show() 

+--------------+ 

| a| 

+--------------+ 

|{3, {4, 5, 6}}| 

+--------------+ 

 

This method supports dropping multiple nested fields directly e.g. 

 

>>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show() 

+--------------+ 

| a| 

+--------------+ 

|{1, 2, 3, {4}}| 

+--------------+ 

 

However, if you are going to add/replace multiple nested fields, 

it is preferred to extract out the nested struct before 

adding/replacing multiple fields e.g. 

 

>>> df.select(col("a").withField( 

... "e", col("a.e").dropFields("g", "h")).alias("a") 

... ).show() 

+--------------+ 

| a| 

+--------------+ 

|{1, 2, 3, {4}}| 

+--------------+ 

 

""" 

sc = SparkContext._active_spark_context 

 

jc = self._jc.dropFields(_to_seq(sc, fieldNames)) 

return Column(jc) 

 

def __getattr__(self, item): 

if item.startswith("__"): 

raise AttributeError(item) 

return self[item] 

 

def __getitem__(self, k): 

if isinstance(k, slice): 

if k.step is not None: 

raise ValueError("slice with step is not supported.") 

return self.substr(k.start, k.stop) 

else: 

return _bin_op("apply")(self, k) 
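
    # A sketch of the slice behavior above: the stop value is forwarded to
    # substr as a *length*, so df.name[1:3] equals df.name.substr(1, 3), a
    # 3-character substring starting at 1-based position 1, not a Python
    # half-open slice.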

 

def __iter__(self): 

raise TypeError("Column is not iterable") 

 

# string methods 

_contains_doc = """ 

Contains the other element. Returns a boolean :class:`Column` based on a string match. 

 

Parameters 

---------- 

other 

        the string to search for, as a literal value or a :class:`Column`.

 

Examples 

-------- 

>>> df.filter(df.name.contains('o')).collect() 

[Row(age=5, name='Bob')] 

""" 

_rlike_doc = """ 

SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex 

match. 

 

Parameters 

---------- 

other : str 

an extended regex expression 

 

Examples 

-------- 

>>> df.filter(df.name.rlike('ice$')).collect() 

[Row(age=2, name='Alice')] 

""" 

_like_doc = """ 

SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. 

 

Parameters 

---------- 

other : str 

a SQL LIKE pattern 

 

See Also 

-------- 

pyspark.sql.Column.rlike 

 

Examples 

-------- 

>>> df.filter(df.name.like('Al%')).collect() 

[Row(age=2, name='Alice')] 

""" 

_startswith_doc = """ 

String starts with. Returns a boolean :class:`Column` based on a string match. 

 

Parameters 

---------- 

other : :class:`Column` or str 

string at start of line (do not use a regex `^`) 

 

Examples 

-------- 

>>> df.filter(df.name.startswith('Al')).collect() 

[Row(age=2, name='Alice')] 

>>> df.filter(df.name.startswith('^Al')).collect() 

[] 

""" 

_endswith_doc = """ 

String ends with. Returns a boolean :class:`Column` based on a string match. 

 

Parameters 

---------- 

other : :class:`Column` or str 

string at end of line (do not use a regex `$`) 

 

Examples 

-------- 

>>> df.filter(df.name.endswith('ice')).collect() 

[Row(age=2, name='Alice')] 

>>> df.filter(df.name.endswith('ice$')).collect() 

[] 

""" 

 

contains = _bin_op("contains", _contains_doc) 

rlike = _bin_op("rlike", _rlike_doc) 

like = _bin_op("like", _like_doc) 

startswith = _bin_op("startsWith", _startswith_doc) 

endswith = _bin_op("endsWith", _endswith_doc) 

 

def substr(self, startPos, length): 

""" 

Return a :class:`Column` which is a substring of the column. 

 

.. versionadded:: 1.3.0 

 

Parameters 

---------- 

startPos : :class:`Column` or int 

start position 

length : :class:`Column` or int 

length of the substring 

 

Examples 

-------- 

>>> df.select(df.name.substr(1, 3).alias("col")).collect() 

[Row(col='Ali'), Row(col='Bob')] 

""" 

if type(startPos) != type(length): 

raise TypeError( 

"startPos and length must be the same type. " 

"Got {startPos_t} and {length_t}, respectively." 

.format( 

startPos_t=type(startPos), 

length_t=type(length), 

)) 

        if isinstance(startPos, int):

jc = self._jc.substr(startPos, length) 

elif isinstance(startPos, Column): 

jc = self._jc.substr(startPos._jc, length._jc) 

else: 

raise TypeError("Unexpected type: %s" % type(startPos)) 

return Column(jc) 

 

def isin(self, *cols): 

""" 

A boolean expression that is evaluated to true if the value of this 

expression is contained by the evaluated values of the arguments. 

 

.. versionadded:: 1.5.0 

 

Examples 

-------- 

>>> df[df.name.isin("Bob", "Mike")].collect() 

[Row(age=5, name='Bob')] 

>>> df[df.age.isin([1, 2, 3])].collect() 

[Row(age=2, name='Alice')] 

""" 

if len(cols) == 1 and isinstance(cols[0], (list, set)): 

cols = cols[0] 

cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols] 

sc = SparkContext._active_spark_context 

jc = getattr(self._jc, "isin")(_to_seq(sc, cols)) 

return Column(jc) 

 

# order 

_asc_doc = """ 

Returns a sort expression based on ascending order of the column. 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) 

>>> df.select(df.name).orderBy(df.name.asc()).collect() 

[Row(name='Alice'), Row(name='Tom')] 

""" 

_asc_nulls_first_doc = """ 

Returns a sort expression based on ascending order of the column, and null values 

    appear before non-null values.

 

.. versionadded:: 2.4.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) 

>>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect() 

[Row(name=None), Row(name='Alice'), Row(name='Tom')] 

 

""" 

_asc_nulls_last_doc = """ 

Returns a sort expression based on ascending order of the column, and null values 

appear after non-null values. 

 

.. versionadded:: 2.4.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) 

>>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect() 

[Row(name='Alice'), Row(name='Tom'), Row(name=None)] 

 

""" 

_desc_doc = """ 

Returns a sort expression based on the descending order of the column. 

 

.. versionadded:: 2.4.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) 

>>> df.select(df.name).orderBy(df.name.desc()).collect() 

[Row(name='Tom'), Row(name='Alice')] 

""" 

_desc_nulls_first_doc = """ 

Returns a sort expression based on the descending order of the column, and null values 

appear before non-null values. 

 

.. versionadded:: 2.4.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) 

>>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect() 

[Row(name=None), Row(name='Tom'), Row(name='Alice')] 

 

""" 

_desc_nulls_last_doc = """ 

Returns a sort expression based on the descending order of the column, and null values 

appear after non-null values. 

 

.. versionadded:: 2.4.0 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) 

>>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect() 

[Row(name='Tom'), Row(name='Alice'), Row(name=None)] 

""" 

 

asc = _unary_op("asc", _asc_doc) 

asc_nulls_first = _unary_op("asc_nulls_first", _asc_nulls_first_doc) 

asc_nulls_last = _unary_op("asc_nulls_last", _asc_nulls_last_doc) 

desc = _unary_op("desc", _desc_doc) 

desc_nulls_first = _unary_op("desc_nulls_first", _desc_nulls_first_doc) 

desc_nulls_last = _unary_op("desc_nulls_last", _desc_nulls_last_doc) 

 

_isNull_doc = """ 

True if the current expression is null. 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) 

>>> df.filter(df.height.isNull()).collect() 

[Row(name='Alice', height=None)] 

""" 

_isNotNull_doc = """ 

True if the current expression is NOT null. 

 

Examples 

-------- 

>>> from pyspark.sql import Row 

>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)]) 

>>> df.filter(df.height.isNotNull()).collect() 

[Row(name='Tom', height=80)] 

""" 

 

isNull = _unary_op("isNull", _isNull_doc) 

isNotNull = _unary_op("isNotNull", _isNotNull_doc) 

 

def alias(self, *alias, **kwargs): 

""" 

Returns this column aliased with a new name or names (in the case of expressions that 

return more than one column, such as explode). 

 

.. versionadded:: 1.3.0 

 

Parameters 

---------- 

alias : str 

desired column names (collects all positional arguments passed) 

 

Other Parameters 

---------------- 

metadata: dict 

a dict of information to be stored in ``metadata`` attribute of the 

corresponding :class:`StructField <pyspark.sql.types.StructField>` (optional, keyword 

only argument) 

 

.. versionchanged:: 2.2.0 

Added optional ``metadata`` argument. 

 

Examples 

-------- 

>>> df.select(df.age.alias("age2")).collect() 

[Row(age2=2), Row(age2=5)] 

>>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max'] 

99 

""" 

 

metadata = kwargs.pop('metadata', None) 

        assert not kwargs, 'Unexpected kwargs were passed: %s' % kwargs

 

sc = SparkContext._active_spark_context 

if len(alias) == 1: 

if metadata: 

jmeta = sc._jvm.org.apache.spark.sql.types.Metadata.fromJson( 

json.dumps(metadata)) 

return Column(getattr(self._jc, "as")(alias[0], jmeta)) 

else: 

return Column(getattr(self._jc, "as")(alias[0])) 

else: 

            if metadata:

raise ValueError('metadata can only be provided for a single column') 

return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias)))) 

 

name = copy_func(alias, sinceversion=2.0, doc=":func:`name` is an alias for :func:`alias`.") 

 

def cast(self, dataType): 

""" 

Casts the column into type ``dataType``. 

 

.. versionadded:: 1.3.0 

 

Examples 

-------- 

>>> df.select(df.age.cast("string").alias('ages')).collect() 

[Row(ages='2'), Row(ages='5')] 

>>> df.select(df.age.cast(StringType()).alias('ages')).collect() 

[Row(ages='2'), Row(ages='5')] 

""" 

if isinstance(dataType, str): 

jc = self._jc.cast(dataType) 

        elif isinstance(dataType, DataType):

from pyspark.sql import SparkSession 

spark = SparkSession.builder.getOrCreate() 

jdt = spark._jsparkSession.parseDataType(dataType.json()) 

jc = self._jc.cast(jdt) 

else: 

raise TypeError("unexpected type: %s" % type(dataType)) 

return Column(jc) 

 

astype = copy_func(cast, sinceversion=1.4, doc=":func:`astype` is an alias for :func:`cast`.") 

 

def between(self, lowerBound, upperBound): 

""" 

True if the current column is between the lower bound and upper bound, inclusive. 

 

.. versionadded:: 1.3.0 

 

Examples 

-------- 

>>> df.select(df.name, df.age.between(2, 4)).show() 

+-----+---------------------------+ 

| name|((age >= 2) AND (age <= 4))| 

+-----+---------------------------+ 

|Alice| true| 

| Bob| false| 

+-----+---------------------------+ 
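
        The bounds are inclusive, so ``age == 2`` is kept here:

        >>> df.filter(df.age.between(2, 4)).collect()
        [Row(age=2, name='Alice')]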

""" 

return (self >= lowerBound) & (self <= upperBound) 

 

def when(self, condition, value): 

""" 

Evaluates a list of conditions and returns one of multiple possible result expressions. 

If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

condition : :class:`Column` 

a boolean :class:`Column` expression. 

value 

a literal value, or a :class:`Column` expression. 

 

Examples 

-------- 

>>> from pyspark.sql import functions as F 

>>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show() 

+-----+------------------------------------------------------------+ 

| name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END| 

+-----+------------------------------------------------------------+ 

|Alice| -1| 

| Bob| 1| 

+-----+------------------------------------------------------------+ 

 

See Also 

-------- 

pyspark.sql.functions.when 

""" 

        if not isinstance(condition, Column):

raise TypeError("condition should be a Column") 

v = value._jc if isinstance(value, Column) else value 

jc = self._jc.when(condition._jc, v) 

return Column(jc) 

 

def otherwise(self, value): 

""" 

Evaluates a list of conditions and returns one of multiple possible result expressions. 

If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

value 

a literal value, or a :class:`Column` expression. 

 

Examples 

-------- 

>>> from pyspark.sql import functions as F 

>>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show() 

+-----+-------------------------------------+ 

| name|CASE WHEN (age > 3) THEN 1 ELSE 0 END| 

+-----+-------------------------------------+ 

|Alice| 0| 

| Bob| 1| 

+-----+-------------------------------------+ 

 

See Also 

-------- 

pyspark.sql.functions.when 

""" 

v = value._jc if isinstance(value, Column) else value 

jc = self._jc.otherwise(v) 

return Column(jc) 

 

def over(self, window): 

""" 

Define a windowing column. 

 

.. versionadded:: 1.4.0 

 

Parameters 

---------- 

window : :class:`WindowSpec` 

 

Returns 

------- 

:class:`Column` 

 

Examples 

-------- 

>>> from pyspark.sql import Window 

>>> window = Window.partitionBy("name").orderBy("age") \ 

.rowsBetween(Window.unboundedPreceding, Window.currentRow) 

>>> from pyspark.sql.functions import rank, min 

>>> from pyspark.sql.functions import desc 

>>> df.withColumn("rank", rank().over(window)) \ 

.withColumn("min", min('age').over(window)).sort(desc("age")).show() 

+---+-----+----+---+ 

|age| name|rank|min| 

+---+-----+----+---+ 

| 5| Bob| 1| 5| 

| 2|Alice| 1| 2| 

+---+-----+----+---+ 

""" 

from pyspark.sql.window import WindowSpec 

        if not isinstance(window, WindowSpec):

raise TypeError("window should be WindowSpec") 

jc = self._jc.over(window._jspec) 

return Column(jc) 

 

def __nonzero__(self): 

raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', " 

"'~' for 'not' when building DataFrame boolean expressions.") 

__bool__ = __nonzero__ 
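
    # A sketch of why the guard above exists: Python calls __bool__ implicitly,
    # so with the sample df both lines below raise the ValueError.
    #
    #     if df.age > 3: ...             # truth value of a Column is undefined
    #     (df.age > 3) and (df.age < 5)  # use & and | instead of `and`/`or`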

 

def __repr__(self): 

return "Column<'%s'>" % self._jc.toString() 

 

 

def _test(): 

import doctest 

from pyspark.sql import SparkSession 

import pyspark.sql.column 

globs = pyspark.sql.column.__dict__.copy() 

spark = SparkSession.builder\ 

.master("local[4]")\ 

.appName("sql.column tests")\ 

.getOrCreate() 

sc = spark.sparkContext 

globs['spark'] = spark 

globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ 

.toDF(StructType([StructField('age', IntegerType()), 

StructField('name', StringType())])) 

 

(failure_count, test_count) = doctest.testmod( 

pyspark.sql.column, globs=globs, 

optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) 

spark.stop() 

    if failure_count:

sys.exit(-1) 

 

 

if __name__ == "__main__": 

_test()