Coverage for pyspark/sql/pandas/conversion.py: 59%

317 ↛ 318line 317 didn't jump to line 318, because the condition on line 317 was never true if self._wrapped._conf.arrowPySparkEnabled() and len(data) > 0:

try:

return self._create_from_pandas_with_arrow(data, schema, timezone)

except Exception as e:

if self._wrapped._conf.arrowPySparkFallbackEnabled():

msg = (

"createDataFrame attempted Arrow optimization because "

"'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "

"failed by the reason below:\n %s\n"

"Attempting non-optimization as "

"'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "

"true." % str(e))

warnings.warn(msg)

else:

msg = (

"createDataFrame attempted Arrow optimization because "

"'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "

"reached the error below and will not continue because automatic "

"fallback with 'spark.sql.execution.arrow.pyspark.fallback.enabled' "

"has been set to false.\n %s" % str(e))

warnings.warn(msg)

raise

data = self._convert_from_pandas(data, schema, timezone)

return self._create_dataframe(data, schema, samplingRatio, verifySchema)

def _convert_from_pandas(self, pdf, schema, timezone):

"""

Convert a pandas.DataFrame to list of records that can be used to make a DataFrame

Returns

-------

list

list of records

"""

from pyspark.sql import SparkSession

assert isinstance(self, SparkSession)

355 ↛ 382line 355 didn't jump to line 382, because the condition on line 355 was never false if timezone is not None:

from pyspark.sql.pandas.types import _check_series_convert_timestamps_tz_local

copied = False

if isinstance(schema, StructType):

for field in schema:

# TODO: handle nested timestamps, such as ArrayType(TimestampType())?

if isinstance(field.dataType, TimestampType):

s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)

363 ↛ 359line 363 didn't jump to line 359, because the condition on line 363 was never false if s is not pdf[field.name]:

364 ↛ 369line 364 didn't jump to line 369, because the condition on line 364 was never false if not copied:

# Copy once if the series is modified to prevent the original

# Pandas DataFrame from being updated

pdf = pdf.copy()

copied = True

pdf[field.name] = s

else:

for column, series in pdf.iteritems():

s = _check_series_convert_timestamps_tz_local(series, timezone)

if s is not series:

374 ↛ 379line 374 didn't jump to line 379, because the condition on line 374 was never false if not copied:

# Copy once if the series is modified to prevent the original

# Pandas DataFrame from being updated

pdf = pdf.copy()

copied = True

pdf[column] = s

# Convert pandas.DataFrame to list of numpy records

np_records = pdf.to_records(index=False)

# Check if any columns need to be fixed for Spark to infer properly

385 ↛ 391line 385 didn't jump to line 391, because the condition on line 385 was never false if len(np_records) > 0:

record_dtype = self._get_numpy_record_dtype(np_records[0])

387 ↛ 391line 387 didn't jump to line 391, because the condition on line 387 was never false if record_dtype is not None:

return [r.astype(record_dtype).tolist() for r in np_records]

# Convert list of numpy records to python lists

return [r.tolist() for r in np_records]

def _get_numpy_record_dtype(self, rec):

"""

Used when converting a pandas.DataFrame to Spark using to_records(), this will correct

the dtypes of fields in a record so they can be properly loaded into Spark.

Parameters

----------

rec : numpy.record

a numpy record to check field dtypes

Returns

-------

numpy.dtype

corrected dtype for a numpy.record or None if no correction needed

"""

import numpy as np

cur_dtypes = rec.dtype

col_names = cur_dtypes.names

record_type_list = []

has_rec_fix = False

for i in range(len(cur_dtypes)):

curr_type = cur_dtypes[i]

# If type is a datetime64 timestamp, convert to microseconds

# NOTE: if dtype is datetime[ns] then np.record.tolist() will output values as longs,

# conversion from [us] or lower will lead to py datetime objects, see SPARK-22417

if curr_type == np.dtype('datetime64[ns]'):

curr_type = 'datetime64[us]'

has_rec_fix = True

record_type_list.append((str(col_names[i]), curr_type))

return np.dtype(record_type_list) if has_rec_fix else None

def _create_from_pandas_with_arrow(self, pdf, schema, timezone):

"""

Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting

to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the

data types will be used to coerce the data in Pandas to Arrow conversion.

"""

from pyspark.sql import SparkSession

from pyspark.sql.dataframe import DataFrame

assert isinstance(self, SparkSession)

from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer

from pyspark.sql.types import TimestampType

from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type

from pyspark.sql.pandas.utils import require_minimum_pandas_version, \

require_minimum_pyarrow_version

require_minimum_pandas_version()

require_minimum_pyarrow_version()

from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

import pyarrow as pa

# Create the Spark schema from list of names passed in with Arrow types

if isinstance(schema, (list, tuple)):

arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)

struct = StructType()

for name, field in zip(schema, arrow_schema):

struct.add(name, from_arrow_type(field.type), nullable=field.nullable)

schema = struct

# Determine arrow types to coerce data when creating batches

if isinstance(schema, StructType):

arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]

elif isinstance(schema, DataType):

raise ValueError("Single data type %s is not supported with Arrow" % str(schema))

else:

# Any timestamps must be coerced to be compatible with Spark

arrow_types = [to_arrow_type(TimestampType())

if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None

for t in pdf.dtypes]

# Slice the DataFrame to be batched

step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up

pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step))

# Create list of Arrow (columns, type) for serializer dump_stream

arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]

for pdf_slice in pdf_slices]

jsqlContext = self._wrapped._jsqlContext

safecheck = self._wrapped._conf.arrowSafeTypeConversion()

col_by_name = True # col by name only applies to StructType columns, can't happen here

ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

def reader_func(temp_filename):

return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

def create_RDD_server():

return self._jvm.ArrowRDDServer(jsqlContext)

# Create Spark DataFrame from Arrow stream file, using one batch per partition

jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)

jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)

df = DataFrame(jdf, self._wrapped)

df._schema = schema

return df

def _test():

import doctest

from pyspark.sql import SparkSession

import pyspark.sql.pandas.conversion

globs = pyspark.sql.pandas.conversion.__dict__.copy()

spark = SparkSession.builder\

.master("local[4]")\

.appName("sql.pandas.conversion tests")\

.getOrCreate()

globs['spark'] = spark

(failure_count, test_count) = doctest.testmod(

pyspark.sql.pandas.conversion, globs=globs,

optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)

spark.stop()

508 ↛ 509line 508 didn't jump to line 509, because the condition on line 508 was never true if failure_count:

sys.exit(-1)

if __name__ == "__main__":

_test()

Coverage for pyspark/sql/pandas/conversion.py : 59%

243 statements 146 run 97 missing 0 excluded 11 partial