Coverage for pyspark/sql/pandas/functions.py: 18%

# +-----------------------------+----------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+--------------+--------------+--------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+--------------+-----------------+------------------+---------------+--------------------------------+ # noqa

# | boolean| None| True| True| True| True| True| True| True| True| True| True| True| True| X| X| X| X| X| X| X| X| X| X| # noqa

# | tinyint| None| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| X| X| X| 1| X| X| X| X| X| X| # noqa

# | smallint| None| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| X| X| X| 1| X| X| X| X| X| X| # noqa

# | int| None| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| X| X| X| 1| X| X| X| X| X| X| # noqa

# | bigint| None| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1| 0| 18000000000000| X| 1| X| X| X| X| X| 86400000000000| # noqa

# | float| None| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| X| X| X| X| X| X| X| X| X| X| # noqa

# | double| None| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| X| X| X| X| X| X| X| X| X| X| # noqa

# | date| None| X| X| X|datetime.date(197...| X| X| X| X| X| X| X| X| datetime.date(197...| datetime.date(197...| X|datetime.date(197...| X| X| X| X| X| X| # noqa

# | timestamp| None| X| X| X| X|datetime.datetime...| X| X| X| X| X| X| X| datetime.datetime...| datetime.datetime...| X|datetime.datetime...| X| X| X| X| X| X| # noqa

# | string| None| X| X| X| X| X| X| X| X| X| X| X| X| X| X| 'a'| X| X| X| X| X| 'A'| X| # noqa

# | decimal(10,0)| None| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| Decimal('1')| X| X| X| X| X| X| # noqa

# | array<int>| None| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| [1, 2, 3]| X| X| X| X| X| # noqa

# | map<string,int>| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| # noqa

# | struct<_1:int>| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| # noqa

# Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be

# used in `returnType`.

# Note: The values inside of the table are generated by `repr`.

# Note: Python 3.7.3, Pandas 1.1.1 and PyArrow 1.0.1 are used.

# Note: Timezone is KST.

# Note: 'X' means it throws an exception during the conversion.

require_minimum_pandas_version()

require_minimum_pyarrow_version()

# decorator @pandas_udf(returnType, functionType)

is_decorator = f is None or isinstance(f, (str, DataType))

if is_decorator:

# If DataType has been passed as a positional argument

# for decorator use it as a returnType

return_type = f or returnType

if functionType is not None:

# @pandas_udf(dataType, functionType=functionType)

# @pandas_udf(returnType=dataType, functionType=functionType)

eval_type = functionType

elif returnType is not None and isinstance(returnType, int):

# @pandas_udf(dataType, functionType)

eval_type = returnType

else:

# @pandas_udf(dataType) or @pandas_udf(returnType=dataType)

eval_type = None

else:

return_type = returnType

if functionType is not None:

eval_type = functionType

else:

eval_type = None

if return_type is None:

raise ValueError("Invalid return type: returnType can not be None")

if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,

PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,

PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,

PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,

PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,

PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,

None]: # None means it should infer the type from type hints.

raise ValueError("Invalid function type: "

"functionType must be one the values from PandasUDFType")

if is_decorator:

return functools.partial(_create_pandas_udf, returnType=return_type, evalType=eval_type)

else:

return _create_pandas_udf(f=f, returnType=return_type, evalType=eval_type)

def _create_pandas_udf(f, returnType, evalType):
    """Resolve the evaluation type for ``f``, check its arity, and build the UDF.

    :param f: the user function to wrap; it is inspected with
        ``getfullargspec`` and, when it carries annotations and no evaluation
        type was given, with ``inspect.signature``.
    :param returnType: passed through unchanged to ``_create_udf``.
    :param evalType: one of the ``PythonEvalType`` pandas UDF constants, or
        ``None`` to have it inferred from type hints (falling back to
        ``SQL_SCALAR_PANDAS_UDF``).
    :return: whatever ``_create_udf(f, returnType, evalType)`` returns.
    :raises ValueError: if the positional-argument count of ``f`` does not fit
        the resolved evaluation type.
    """
    spec = getfullargspec(f)

    # pandas UDF by type hints.
    from inspect import signature

    # Evaluation types for which an explicit function type triggers a warning.
    warned_types = (
        PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
        PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
    )
    # Evaluation types that are accepted as given, without a warning here:
    # for 'SQL_GROUPED_MAP_PANDAS_UDF' the deprecation warning is triggered at
    # `apply` instead, and for 'SQL_MAP_PANDAS_ITER_UDF' and
    # 'SQL_COGROUPED_MAP_PANDAS_UDF' the evaluation type is always set.
    passthrough_types = (
        PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
        PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
    )

    if evalType in warned_types:
        warnings.warn(
            "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for "
            "pandas UDF instead of specifying pandas UDF type which will be deprecated "
            "in the future releases. See SPARK-28264 for more details.", UserWarning)
    elif evalType not in passthrough_types and spec.annotations:
        # No recognised evaluation type was supplied but the function is
        # annotated, so derive the evaluation type from its signature.
        evalType = infer_eval_type(signature(f))
        assert evalType is not None

    if evalType is None:
        # Nothing was given and nothing could be inferred: default to a scalar UDF.
        evalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF

    positional_count = len(spec.args)

    is_scalar_kind = evalType in (
        PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
    )
    if is_scalar_kind and positional_count == 0 and spec.varargs is None:
        raise ValueError(
            "Invalid function: 0-arg pandas_udfs are not supported. "
            "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
        )

    if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        if positional_count not in (1, 2):
            raise ValueError(
                "Invalid function: pandas_udf with function type GROUPED_MAP or "
                "the function in groupby.applyInPandas "
                "must take either one argument (data) or two arguments (key, data).")

    if evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        if positional_count not in (2, 3):
            raise ValueError(
                "Invalid function: the function in cogroup.applyInPandas "
                "must take either two arguments (left, right) "
                "or three arguments (key, left, right).")

    return _create_udf(f, returnType, evalType)

Coverage for pyspark/sql/pandas/functions.py: 18%

54 statements 15 run 39 missing 0 excluded 0 partial