Coverage for pyspark/serializers.py: 81%

193 ↛ 194line 193 didn't jump to line 194, because the condition on line 193 was never true elif hasattr(iterator, "__len__") and hasattr(iterator, "__getslice__"):

n = len(iterator)

for i in range(0, n, self.batchSize):

yield iterator[i: i + self.batchSize]

else:

items = []

count = 0

for item in iterator:

items.append(item)

count += 1

if count == self.batchSize:

yield items

items = []

count = 0

if items:

yield items

def dump_stream(self, iterator, stream):
    """
    Write `iterator` to `stream` in batches.

    The items are first grouped by `self._batched`, and the resulting
    batches are handed to the wrapped serializer's `dump_stream`.
    """
    batches = self._batched(iterator)
    self.serializer.dump_stream(batches, stream)

def load_stream(self, stream):
    """
    Return an iterator over the individual items read from `stream`.

    Batches come from `self._load_stream_without_unbatching` and are
    flattened lazily, one batch after another.
    """
    batches = self._load_stream_without_unbatching(stream)
    return chain.from_iterable(batches)

def _load_stream_without_unbatching(self, stream):
    """
    Return the wrapped serializer's `load_stream(stream)` result as-is,
    i.e. without flattening the batches it produces.
    """
    wrapped = self.serializer
    return wrapped.load_stream(stream)

def __repr__(self):
    """Return a string showing the wrapped serializer and the batch size."""
    wrapped_text = str(self.serializer)
    return "BatchedSerializer(%s, %d)" % (wrapped_text, self.batchSize)

class FlattenedValuesSerializer(BatchedSerializer):
    """
    Serializes a stream of (key, values) pairs.

    A list of values holding more than `batchSize` objects is split into
    several pairs that share the same key, so that the emitted pairs have
    similar sizes.
    """

    def __init__(self, serializer, batchSize=10):
        BatchedSerializer.__init__(self, serializer, batchSize)

    def _batched(self, iterator):
        """Yield `(key, chunk)` pairs, each chunk holding at most `batchSize` values."""
        step = self.batchSize
        for key, values in iterator:
            yield from ((key, values[start:start + step])
                        for start in range(0, len(values), step))

    def load_stream(self, stream):
        """Return what the wrapped serializer's `load_stream` returns, unflattened."""
        wrapped = self.serializer
        return wrapped.load_stream(stream)

    def __repr__(self):
        details = (self.serializer, self.batchSize)
        return "FlattenedValuesSerializer(%s, %d)" % details

class AutoBatchedSerializer(BatchedSerializer):
    """
    Choose the size of batch automatically based on the size of object
    """

    def __init__(self, serializer, bestSize=1 << 16):
        """
        :param serializer: wrapped serializer; its `dumps` is used for each batch
        :param bestSize: target size in bytes of one serialized batch
        """
        BatchedSerializer.__init__(self, serializer, self.UNKNOWN_BATCH_SIZE)
        self.bestSize = bestSize

    def dump_stream(self, iterator, stream):
        """
        Write `iterator` to `stream` as length-prefixed serialized batches.

        The number of items per batch starts at 1. It is doubled while a
        serialized batch is smaller than `bestSize`, and halved (never
        below 1) when a batch exceeds ten times `bestSize`.
        """
        batch, best = 1, self.bestSize
        iterator = iter(iterator)
        while True:
            vs = list(itertools.islice(iterator, batch))
            if not vs:
                # the iterator is exhausted
                break

            payload = self.serializer.dumps(vs)
            size = len(payload)
            write_int(size, stream)
            stream.write(payload)

            if size < best:
                batch *= 2
            elif size > best * 10 and batch > 1:
                batch //= 2

    def __repr__(self):
        return "AutoBatchedSerializer(%s)" % self.serializer

class CartesianDeserializer(Serializer):
    """
    Deserializes the JavaRDD cartesian() of two PythonRDDs.

    Because pyspark batches its data, the result of the Java RDD cartesian
    cannot be used directly: the cartesian product also has to be computed
    inside each pair of batches.
    """

    def __init__(self, key_ser, val_ser):
        self.key_ser = key_ser
        self.val_ser = val_ser

    def _load_stream_without_unbatching(self, stream):
        """Yield one `itertools.product` per (key batch, value batch) pair."""
        key_batches = self.key_ser._load_stream_without_unbatching(stream)
        val_batches = self.val_ser._load_stream_without_unbatching(stream)
        for keys, vals in zip(key_batches, val_batches):
            # for correctness with repeated cartesian/zip this must be returned as one batch
            yield product(keys, vals)

    def load_stream(self, stream):
        """Return an iterator over all pairs, flattening the per-batch products."""
        batches = self._load_stream_without_unbatching(stream)
        return chain.from_iterable(batches)

    def __repr__(self):
        parts = (str(self.key_ser), str(self.val_ser))
        return "CartesianDeserializer(%s, %s)" % parts

class PairDeserializer(Serializer):
    """
    Deserializes the JavaRDD zip() of two PythonRDDs.
    Due to pyspark batching we cannot simply use the result of the Java RDD zip,
    we additionally need to do the zip within each pair of batches.
    """

    def __init__(self, key_ser, val_ser):
        self.key_ser = key_ser
        self.val_ser = val_ser

    def _load_stream_without_unbatching(self, stream):
        """
        Yield one `zip` object per (key batch, value batch) pair.

        :raises ValueError: if the two batches of a pair differ in length
        """
        key_batch_stream = self.key_ser._load_stream_without_unbatching(stream)
        val_batch_stream = self.val_ser._load_stream_without_unbatching(stream)
        for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream):
            # For double-zipped RDDs, the batches can be iterators from other PairDeserializer,
            # instead of lists. We need to convert them to lists if needed.
            key_batch = key_batch if hasattr(key_batch, '__len__') else list(key_batch)
            val_batch = val_batch if hasattr(val_batch, '__len__') else list(val_batch)
            if len(key_batch) != len(val_batch):
                raise ValueError("Can not deserialize PairRDD with different number of items"
                                 " in batches: (%d, %d)" % (len(key_batch), len(val_batch)))
            # for correctness with repeated cartesian/zip this must be returned as one batch
            yield zip(key_batch, val_batch)

    def load_stream(self, stream):
        """Return an iterator over all (key, value) pairs, flattening the batches."""
        return chain.from_iterable(self._load_stream_without_unbatching(stream))

    def __repr__(self):
        return "PairDeserializer(%s, %s)" % (str(self.key_ser), str(self.val_ser))

class NoOpSerializer(FramedSerializer):
    """Serializer whose `dumps` and `loads` hand back their argument unchanged."""

    def dumps(self, obj):
        """Return `obj` as-is."""
        return obj

    def loads(self, obj):
        """Return `obj` as-is."""
        return obj

# Hack namedtuple, make it picklable

# Cache of the namedtuple classes rebuilt by _restore, keyed by (name, fields).
__cls = {}  # type: ignore


def _restore(name, fields, value):
    """ Restore an object of namedtuple

    The class for (name, fields) is created with `collections.namedtuple`
    on first use and then reused from the module-level cache.
    """
    cache_key = (name, fields)
    if cache_key not in __cls:
        __cls[cache_key] = collections.namedtuple(name, fields)
    return __cls[cache_key](*value)

def _hack_namedtuple(cls):
    """ Make class generated by namedtuple picklable

    Installs a `__reduce__` on `cls` that pickles an instance as a call to
    `_restore(name, fields, values)`, marks the class with
    `_is_namedtuple_ = True`, and returns the same class object.
    """
    type_name, field_names = cls.__name__, cls._fields

    def __reduce__(self):
        # pickle as: _restore(type_name, field_names, <values as plain tuple>)
        return (_restore, (type_name, field_names, tuple(self)))

    cls._is_namedtuple_ = True
    cls.__reduce__ = __reduce__
    return cls

def _hijack_namedtuple():
    """ Hack namedtuple() to make it picklable

    Swaps the code of `collections.namedtuple` in place for a wrapper that
    builds the class with the original implementation and then passes it
    through `_hack_namedtuple`. Namedtuple classes already present in the
    `__main__` module are hacked as well. Calling this more than once is a
    no-op.
    """
    # hijack only one time
    if hasattr(collections.namedtuple, "__hijack"):
        return

    global _old_namedtuple  # or it will put in closure
    global _old_namedtuple_kwdefaults  # or it will put in closure too

    def _copy_func(f):
        return types.FunctionType(f.__code__, f.__globals__, f.__name__,
                                  f.__defaults__, f.__closure__)

    # The copy is built without keyword-only defaults, so they are saved
    # separately and re-applied by the wrapper below.
    _old_namedtuple = _copy_func(collections.namedtuple)
    _old_namedtuple_kwdefaults = collections.namedtuple.__kwdefaults__

    def namedtuple(*args, **kwargs):
        for k, v in _old_namedtuple_kwdefaults.items():
            kwargs[k] = kwargs.get(k, v)
        cls = _old_namedtuple(*args, **kwargs)
        return _hack_namedtuple(cls)

    # replace namedtuple with the new one
    # The wrapper's code will run with collections.namedtuple's globals, so
    # the three names it looks up are published there first.
    collections.namedtuple.__globals__["_old_namedtuple_kwdefaults"] = _old_namedtuple_kwdefaults
    collections.namedtuple.__globals__["_old_namedtuple"] = _old_namedtuple
    collections.namedtuple.__globals__["_hack_namedtuple"] = _hack_namedtuple
    collections.namedtuple.__code__ = namedtuple.__code__
    collections.namedtuple.__hijack = 1

    # hack the cls already generated by namedtuple.
    # Those created in other modules can be pickled as normal,
    # so only hack those in __main__ module
    for o in sys.modules["__main__"].__dict__.values():
        if (type(o) is type and o.__base__ is tuple
                and hasattr(o, "_fields")
                and "__reduce__" not in o.__dict__):
            _hack_namedtuple(o)  # hack inplace

# Install the namedtuple hack as soon as this module is imported.
_hijack_namedtuple()

class PickleSerializer(FramedSerializer):
    """
    Serializes objects using Python's pickle serializer:

        https://docs.python.org/3/library/pickle.html

    This serializer supports nearly any Python object, but may
    not be as fast as more specialized serializers.
    """

    def dumps(self, obj):
        """Pickle `obj` with the module-level `pickle_protocol`."""
        return pickle.dumps(obj, protocol=pickle_protocol)

    def loads(self, obj, encoding="bytes"):
        """Unpickle `obj`, passing `encoding` through to `pickle.loads`."""
        return pickle.loads(obj, encoding=encoding)

class CloudPickleSerializer(PickleSerializer):
    """Serializer whose `dumps` uses cloudpickle instead of pickle."""

    def dumps(self, obj):
        """
        Serialize `obj` with `cloudpickle.dumps` and `pickle_protocol`.

        :raises pickle.PickleError: re-raised unchanged when cloudpickle
            raises one
        :raises pickle.PicklingError: for any other exception, with a
            message describing the failure
        """
        try:
            return cloudpickle.dumps(obj, pickle_protocol)
        except pickle.PickleError:
            raise
        except Exception as e:
            emsg = str(e)
            # NOTE(review): "'i' format requires" looks like the struct error
            # for a value that does not fit a 4-byte int -- confirm.
            if "'i' format requires" in emsg:
                msg = "Object too large to serialize: %s" % emsg
            else:
                msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
            # print_exec is defined outside this block
            print_exec(sys.stderr)
            raise pickle.PicklingError(msg)

class MarshalSerializer(FramedSerializer):
    """
    Serializes objects using Python's Marshal serializer:

        https://docs.python.org/3/library/marshal.html

    This serializer is faster than PickleSerializer but supports fewer datatypes.
    """

    def dumps(self, obj):
        """Return `obj` encoded by `marshal.dumps`."""
        encoded = marshal.dumps(obj)
        return encoded

    def loads(self, obj):
        """Return the object decoded from `obj` by `marshal.loads`."""
        decoded = marshal.loads(obj)
        return decoded

class AutoSerializer(FramedSerializer):
    """
    Choose marshal or pickle as serialization protocol automatically

    Every payload starts with a one-byte tag: b'M' for marshal data and
    b'P' for pickle data.
    """

    def __init__(self):
        FramedSerializer.__init__(self)
        # None until marshal fails once; b'P' from then on
        self._type = None

    def dumps(self, obj):
        """
        Serialize `obj` with marshal, falling back to pickle.

        After the first marshal failure every later call uses pickle
        directly.
        """
        if self._type is not None:
            return b'P' + pickle.dumps(obj, -1)
        try:
            return b'M' + marshal.dumps(obj)
        except Exception:
            self._type = b'P'
            return b'P' + pickle.dumps(obj, -1)

    def loads(self, obj):
        """
        Deserialize a payload produced by `dumps`.

        :raises ValueError: if the leading tag is neither b'M' nor b'P'
            (including an empty payload)
        """
        # Slice rather than index: indexing bytes yields an int on Python 3,
        # which never compares equal to b'M' or b'P'.
        _type = obj[:1]
        if _type == b'M':
            return marshal.loads(obj[1:])
        elif _type == b'P':
            return pickle.loads(obj[1:])
        else:
            raise ValueError("invalid serialization type: %s" % _type)

class CompressedSerializer(FramedSerializer):
    """
    Compress the serialized data

    Wraps another FramedSerializer and passes its output through zlib.
    """

    def __init__(self, serializer):
        FramedSerializer.__init__(self)
        assert isinstance(serializer, FramedSerializer), "serializer must be a FramedSerializer"
        self.serializer = serializer

    def dumps(self, obj):
        """Serialize `obj` with the wrapped serializer, then compress at zlib level 1."""
        serialized = self.serializer.dumps(obj)
        return zlib.compress(serialized, 1)

    def loads(self, obj):
        """Decompress `obj`, then deserialize it with the wrapped serializer."""
        inflated = zlib.decompress(obj)
        return self.serializer.loads(inflated)

    def __repr__(self):
        return "CompressedSerializer(%s)" % self.serializer

class UTF8Deserializer(Serializer):
    """
    Deserializes streams written by String.getBytes.
    """

    def __init__(self, use_unicode=True):
        """
        :param use_unicode: when true, decode each string as UTF-8;
            otherwise return the raw bytes read from the stream
        """
        self.use_unicode = use_unicode

    def loads(self, stream):
        """
        Read one length-prefixed string from `stream`.

        :return: the string (or raw bytes), or None when the length prefix
            equals `SpecialLengths.NULL`
        :raises EOFError: when the length prefix equals
            `SpecialLengths.END_OF_DATA_SECTION`
        """
        length = read_int(stream)
        if length == SpecialLengths.END_OF_DATA_SECTION:
            raise EOFError
        elif length == SpecialLengths.NULL:
            return None
        s = stream.read(length)
        return s.decode("utf-8") if self.use_unicode else s

    def load_stream(self, stream):
        """Yield strings from `stream` until `loads` raises EOFError or struct.error."""
        try:
            while True:
                yield self.loads(stream)
        except (struct.error, EOFError):
            # either error ends the stream silently
            return

    def __repr__(self):
        return "UTF8Deserializer(%s)" % self.use_unicode

def read_long(stream):
    """
    Read a big-endian signed 8-byte integer from `stream`.

    :raises EOFError: if the read returns no data
    """
    raw = stream.read(8)
    if raw:
        (value,) = struct.unpack("!q", raw)
        return value
    raise EOFError

def write_long(value, stream):
    """Write `value` to `stream` as a big-endian signed 8-byte integer."""
    encoded = struct.pack("!q", value)
    stream.write(encoded)

def pack_long(value):
    """Return `value` encoded as a big-endian signed 8-byte integer."""
    encoded = struct.pack("!q", value)
    return encoded

def read_int(stream):
    """
    Read a big-endian signed 4-byte integer from `stream`.

    :raises EOFError: if the read returns no data
    """
    raw = stream.read(4)
    if raw:
        (value,) = struct.unpack("!i", raw)
        return value
    raise EOFError

def write_int(value, stream):
    """Write `value` to `stream` as a big-endian signed 4-byte integer."""
    encoded = struct.pack("!i", value)
    stream.write(encoded)

def read_bool(stream):
    """
    Read one byte from `stream` and return it as a bool.

    :raises EOFError: if the read returns no data
    """
    raw = stream.read(1)
    if raw:
        (flag,) = struct.unpack("!?", raw)
        return flag
    raise EOFError

def write_with_length(obj, stream):
    """Write `len(obj)` to `stream` with `write_int`, followed by `obj` itself."""
    size = len(obj)
    write_int(size, stream)
    stream.write(obj)

class ChunkedStream(object):
    """
    A file-like object that takes a stream of data of unknown length and
    breaks it into fixed-length frames. The intended use case is serializing
    large data and sending it immediately over a socket -- we do not want to
    buffer the entire data before sending it, but the receiving end needs to
    know whether or not there is more data coming.

    It works by buffering the incoming data in fixed-size chunks. Whenever
    the buffer is full, it first sends the buffer size, then the data. This
    repeats as long as there is more data to send. When this is closed, it
    sends the length of whatever data is left in the buffer, then that data,
    and finally a "length" of -1 to indicate the stream has completed.
    """

    def __init__(self, wrapped, buffer_size):
        self.wrapped = wrapped
        self.buffer_size = buffer_size
        self.buffer = bytearray(buffer_size)
        self.current_pos = 0

    def write(self, bytes):
        """Buffer `bytes`, emitting one length-prefixed frame each time the buffer fills."""
        offset = 0
        pending = len(bytes)
        while pending > 0:
            end = self.current_pos + pending
            if end < self.buffer_size:
                # just put it in our buffer
                self.buffer[self.current_pos:end] = bytes[offset:]
                self.current_pos = end
                break

            # fill the buffer, send the length then the contents, and start filling again
            room = self.buffer_size - self.current_pos
            next_offset = offset + room
            self.buffer[self.current_pos:self.buffer_size] = bytes[offset:next_offset]
            write_int(self.buffer_size, self.wrapped)
            self.wrapped.write(self.buffer)
            pending -= room
            offset = next_offset
            self.current_pos = 0

    def close(self):
        """Flush any buffered data as a final frame, write the -1 marker, close `wrapped`."""
        # if there is anything left in the buffer, write it out first
        if self.current_pos > 0:
            leftover = self.buffer[:self.current_pos]
            write_int(self.current_pos, self.wrapped)
            self.wrapped.write(leftover)
        # -1 length indicates to the receiving end that we're done.
        write_int(-1, self.wrapped)
        self.wrapped.close()

    @property
    def closed(self):
        """
        Return True if the `wrapped` object has been closed.

        NOTE: this property is required by pyarrow to be used as a file-like object in
        pyarrow.RecordBatchStreamWriter from ArrowStreamSerializer
        """
        return self.wrapped.closed

if __name__ == '__main__':
    # Run this module's doctests when executed as a script and exit with a
    # non-zero status if any of them fail.
    import doctest
    (failure_count, test_count) = doctest.testmod()
    if failure_count:
        sys.exit(-1)

Coverage for pyspark/serializers.py : 81%

339 statements 289 run 50 missing 0 excluded 13 partial