Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #
MLlib utilities for linear algebra. For dense vectors, MLlib uses the NumPy `array` type, so you can simply pass NumPy arrays around. For sparse vectors, users can construct a :class:`SparseVector` object from MLlib or pass SciPy `scipy.sparse` column vectors if SciPy is available in their environment. """
IntegerType, ByteType, BooleanType
'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices']
# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, # such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
except: # No SciPy in environment, but that's okay _have_scipy = False
return l elif _have_scipy and scipy.sparse.issparse(l): assert l.shape[1] == 1, "Expected column vector" # Make sure the converted csc_matrix has sorted indices. csc = l.tocsc() if not csc.has_sorted_indices: csc.sort_indices() return SparseVector(l.shape[0], csc.indices, csc.data) else: raise TypeError("Cannot convert type %s into Vector" % type(l))
""" Returns the size of the vector.
Examples -------- >>> _vector_size([1., 2., 3.]) 3 >>> _vector_size((1., 2., 3.)) 3 >>> _vector_size(array.array('d', [1., 2., 3.])) 3 >>> _vector_size(np.zeros(3)) 3 >>> _vector_size(np.zeros((3, 1))) 3 >>> _vector_size(np.zeros((1, 3))) Traceback (most recent call last): ... ValueError: Cannot treat an ndarray of shape (1, 3) as a vector """ else: elif _have_scipy and scipy.sparse.issparse(v): assert v.shape[1] == 1, "Expected column vector" return v.shape[0] else: raise TypeError("Cannot treat type %s as a vector" % type(v))
value = float('nan') # pack double into 64 bits, then unpack as long int
""" SQL user-defined type (UDT) for Vector. """
def sqlType(cls): StructField("type", ByteType(), False), StructField("size", IntegerType(), True), StructField("indices", ArrayType(IntegerType(), False), True), StructField("values", ArrayType(DoubleType(), False), True)])
def module(cls):
def scalaUDT(cls):
else: raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
"VectorUDT.deserialize given row with length %d but requires 4" % len(datum) else: raise ValueError("do not recognize type %r" % tpe)
return "vector"
""" SQL user-defined type (UDT) for Matrix. """
def sqlType(cls): StructField("type", ByteType(), False), StructField("numRows", IntegerType(), False), StructField("numCols", IntegerType(), False), StructField("colPtrs", ArrayType(IntegerType(), False), True), StructField("rowIndices", ArrayType(IntegerType(), False), True), StructField("values", ArrayType(DoubleType(), False), True), StructField("isTransposed", BooleanType(), False)])
def module(cls):
def scalaUDT(cls):
rowIndices, values, bool(obj.isTransposed)) bool(obj.isTransposed)) else: raise TypeError("cannot serialize type %r" % (type(obj)))
"MatrixUDT.deserialize given row with length %d but requires 7" % len(datum) else: raise ValueError("do not recognize type %r" % tpe)
return "matrix"
""" Abstract class for DenseVector and SparseVector """ """ Convert the vector into an numpy.ndarray
:return: numpy.ndarray """ raise NotImplementedError
""" A dense vector represented by a value array. We use numpy array for storage and arithmetics will be delegated to the underlying numpy array.
Examples -------- >>> v = Vectors.dense([1.0, 2.0]) >>> u = Vectors.dense([3.0, 4.0]) >>> v + u DenseVector([4.0, 6.0]) >>> 2 - v DenseVector([1.0, 0.0]) >>> v / 2 DenseVector([0.5, 1.0]) >>> v * u DenseVector([3.0, 8.0]) >>> u / v DenseVector([3.0, 2.0]) >>> u % 2 DenseVector([1.0, 0.0]) >>> -v DenseVector([-1.0, -2.0]) """
""" Number of nonzero elements. This scans all active values and count non zeros """ return np.count_nonzero(self.array)
""" Calculates the norm of a DenseVector.
Examples -------- >>> a = DenseVector([0, -1, 2, -3]) >>> a.norm(2) 3.7... >>> a.norm(1) 6.0 """
""" Compute the dot product of two Vectors. We support (Numpy array, list, SparseVector, or SciPy sparse) and a target NumPy array that is either 1- or 2-dimensional. Equivalent to calling numpy.dot of the two vectors.
Examples -------- >>> dense = DenseVector(array.array('d', [1., 2.])) >>> dense.dot(dense) 5.0 >>> dense.dot(SparseVector(2, [0, 1], [2., 1.])) 4.0 >>> dense.dot(range(1, 3)) 5.0 >>> dense.dot(np.array(range(1, 3))) 5.0 >>> dense.dot([1.,]) Traceback (most recent call last): ... AssertionError: dimension mismatch >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) array([ 5., 11.]) >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F')) Traceback (most recent call last): ... AssertionError: dimension mismatch """ assert len(self) == other.shape[0], "dimension mismatch" return other.transpose().dot(self.toArray()) else: else:
""" Squared distance of two Vectors.
Examples -------- >>> dense1 = DenseVector(array.array('d', [1., 2.])) >>> dense1.squared_distance(dense1) 0.0 >>> dense2 = np.array([2., 1.]) >>> dense1.squared_distance(dense2) 2.0 >>> dense3 = [2., 1.] >>> dense1.squared_distance(dense3) 2.0 >>> sparse1 = SparseVector(2, [0, 1], [2., 1.]) >>> dense1.squared_distance(sparse1) 2.0 >>> dense1.squared_distance([1.,]) Traceback (most recent call last): ... AssertionError: dimension mismatch >>> dense1.squared_distance(SparseVector(1, [0,], [1.,])) Traceback (most recent call last): ... AssertionError: dimension mismatch """ return _convert_to_vector(other).squared_distance(self)
""" Returns the underlying numpy.ndarray """
def values(self):
    """
    Return the array that backs this vector.

    Returns
    -------
    numpy.ndarray
        The object stored in ``self.array``; it is handed back as-is,
        not copied.
    """
    backing_array = self.array
    return backing_array
return "[" + ",".join([str(v) for v in self.array]) + "]"
return False return False
""" A simple sparse vector class for passing data to MLlib. Users may alternatively pass SciPy's {scipy.sparse} data types. """ """ Create a sparse vector, using either a dictionary, a list of (index, value) pairs, or two separate arrays of indices and values (sorted by index).
Parameters ---------- size : int Size of the vector. args Active entries, as a dictionary {index: value, ...}, a list of tuples [(index, value), ...], or a list of strictly increasing indices and a list of corresponding values [index, ...], [value, ...]. Inactive entries are treated as zeros.
Examples -------- >>> SparseVector(4, {1: 1.0, 3: 5.5}) SparseVector(4, {1: 1.0, 3: 5.5}) >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) SparseVector(4, {1: 1.0, 3: 5.5}) >>> SparseVector(4, [1, 3], [1.0, 5.5]) SparseVector(4, {1: 1.0, 3: 5.5}) >>> SparseVector(4, {1:1.0, 6:2.0}) Traceback (most recent call last): ... AssertionError: Index 6 is out of the size of vector with size=4 >>> SparseVector(4, {-1:1.0}) Traceback (most recent call last): ... AssertionError: Contains negative index -1 """ """ Size of the vector. """ """ A list of indices corresponding to active entries. """ else: else: # np.frombuffer() doesn't work well with empty string in older version else: raise TypeError( "Indices %s and %s are not strictly increasing" % (self.indices[i], self.indices[i + 1]))
"Index %d is out of the size of vector with size=%d" \ % (np.max(self.indices), self.size) "Contains negative index %d" % (np.min(self.indices))
""" Number of nonzero elements. This scans all active values and count non zeros. """
""" Calculates the norm of a SparseVector.
Examples -------- >>> a = SparseVector(4, [0, 1], [3., -4.]) >>> a.norm(1) 7.0 >>> a.norm(2) 5.0 """
SparseVector, (self.size, self.indices.tostring(), self.values.tostring()))
""" Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
Examples -------- >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) >>> a.dot(a) 25.0 >>> a.dot(array.array('d', [1., 2., 3., 4.])) 22.0 >>> b = SparseVector(4, [2], [1.0]) >>> a.dot(b) 0.0 >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) array([ 22., 22.]) >>> a.dot([1., 2., 3.]) Traceback (most recent call last): ... AssertionError: dimension mismatch >>> a.dot(np.array([1., 2.])) Traceback (most recent call last): ... AssertionError: dimension mismatch >>> a.dot(DenseVector([1., 2.])) Traceback (most recent call last): ... AssertionError: dimension mismatch >>> a.dot(np.zeros((3, 2))) Traceback (most recent call last): ... AssertionError: dimension mismatch """
raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
# Find out common indices. else:
else:
""" Squared distance from a SparseVector or 1-dimensional NumPy array.
Examples -------- >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) >>> a.squared_distance(a) 0.0 >>> a.squared_distance(array.array('d', [1., 2., 3., 4.])) 11.0 >>> a.squared_distance(np.array([1., 2., 3., 4.])) 11.0 >>> b = SparseVector(4, [2], [1.0]) >>> a.squared_distance(b) 26.0 >>> b.squared_distance(a) 26.0 >>> b.squared_distance([1., 2.]) Traceback (most recent call last): ... AssertionError: dimension mismatch >>> b.squared_distance(SparseVector(3, [1,], [1.0,])) Traceback (most recent call last): ... AssertionError: dimension mismatch """
raise ValueError("Cannot call squared_distance with %d-dimensional array" % other.ndim)
else: else:
""" Returns a copy of this SparseVector as a 1-dimensional numpy.ndarray. """
inds = "[" + ",".join([str(i) for i in self.indices]) + "]" vals = "[" + ",".join([str(v) for v in self.values]) + "]" return "(" + ",".join((str(self.size), inds, vals)) + ")"
for i in range(len(inds))])
and np.array_equal(other.values, self.values) elif isinstance(other, DenseVector): if self.size != len(other): return False return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array) return False
"Indices must be of type integer, got type %s" % type(index))
return not self.__eq__(other)
""" Factory methods for working with vectors.
Notes ----- Dense vectors are simply represented as NumPy array objects, so there is no need to convert them for use in MLlib. For sparse vectors, the factory methods in this class create an MLlib-compatible type, or users can pass in SciPy's `scipy.sparse` column vectors. """
def sparse(size, *args): """ Create a sparse vector, using either a dictionary, a list of (index, value) pairs, or two separate arrays of indices and values (sorted by index).
Parameters ---------- size : int Size of the vector. args Non-zero entries, as a dictionary, list of tuples, or two sorted lists containing indices and values.
Examples -------- >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) SparseVector(4, {1: 1.0, 3: 5.5}) >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) SparseVector(4, {1: 1.0, 3: 5.5}) >>> Vectors.sparse(4, [1, 3], [1.0, 5.5]) SparseVector(4, {1: 1.0, 3: 5.5}) """
def dense(*elements): """ Create a dense vector of 64-bit floats from a Python list or numbers.
Examples -------- >>> Vectors.dense([1, 2, 3]) DenseVector([1.0, 2.0, 3.0]) >>> Vectors.dense(1.0, 2.0) DenseVector([1.0, 2.0]) """ # it's list, numpy.array or other iterable object.
def squared_distance(v1, v2):
    """
    Squared distance between two vectors.
    v1 and v2 can be of type SparseVector, DenseVector, np.ndarray
    or array.array.

    Parameters
    ----------
    v1
        First vector; it is passed through ``_convert_to_vector`` and its
        ``squared_distance`` method performs the computation.
    v2
        Second vector; it is also passed through ``_convert_to_vector``.

    Returns
    -------
    The value returned by ``squared_distance`` on the converted ``v1``.

    Examples
    --------
    >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
    >>> b = Vectors.dense([2, 5, 4, 1])
    >>> a.squared_distance(b)
    51.0
    """
    # Normalise both operands first (v1, then v2) so the computation is
    # always dispatched on a vector object.
    lhs = _convert_to_vector(v1)
    rhs = _convert_to_vector(v2)
    return lhs.squared_distance(rhs)
def norm(vector, p):
    """
    Find norm of the given vector.

    Parameters
    ----------
    vector
        The input; it is passed through ``_convert_to_vector`` before the
        norm is taken.
    p
        Forwarded unchanged to the converted vector's ``norm`` method.

    Returns
    -------
    The value returned by ``norm(p)`` on the converted vector.
    """
    as_vector = _convert_to_vector(vector)
    return as_vector.norm(p)
def zeros(size):
    """
    Create a DenseVector filled with zeros.

    Parameters
    ----------
    size
        Forwarded to ``np.zeros`` to build the backing array.

    Returns
    -------
    DenseVector
        A dense vector wrapping the zero-filled array.
    """
    zero_values = np.zeros(size)
    return DenseVector(zero_values)
def _equals(v1_indices, v1_values, v2_indices, v2_values): """ Check equality between sparse/dense vectors, v1_indices and v2_indices assume to be strictly increasing. """
""" Represents a local matrix. """
""" Returns its elements in a numpy.ndarray. """ raise NotImplementedError
def _convert_to_array(array_like, dtype): """ Convert Matrix attributes which are array-like or buffer to array. """
""" Column-major dense matrix. """
self.numRows, self.numCols, self.values.tostring(), int(self.isTransposed))
""" Pretty printing of a DenseMatrix
Examples -------- >>> dm = DenseMatrix(2, 2, range(4)) >>> print(dm) DenseMatrix([[ 0., 2.], [ 1., 3.]]) >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True) >>> print(dm) DenseMatrix([[ 0., 1.], [ 2., 3.]]) """ # Inspired by __repr__ in scipy matrices.
# We need to adjust six spaces which is the difference in number # of letters between "DenseMatrix" and "array"
""" Representation of a DenseMatrix
Examples -------- >>> dm = DenseMatrix(2, 2, range(4)) >>> dm DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) """ # If the number of values are less than seventeen then return as it is. # Else return first eight values and last eight values. else: _format_float_list(self.values[:8]) + ["..."] + _format_float_list(self.values[-8:]) )
self.numRows, self.numCols, entries, self.isTransposed)
""" Return a :py:class:`numpy.ndarray`
Examples -------- >>> m = DenseMatrix(2, 2, range(4)) >>> m.toArray() array([[ 0., 2.], [ 1., 3.]]) """ self.values.reshape((self.numRows, self.numCols))) else:
"""Convert to SparseMatrix""" else: (0, colCounts, np.zeros(self.numCols - colCounts.size))))
% (i, self.numRows)) raise IndexError("Column index %d is out of range [0, %d)" % (j, self.numCols))
else:
return False
"""Sparse Matrix stored in CSC format.""" isTransposed=False):
raise ValueError("Expected colPtrs of size %d, got %d." % (numRows + 1, self.colPtrs.size)) else: raise ValueError("Expected colPtrs of size %d, got %d." % (numCols + 1, self.colPtrs.size)) raise ValueError("Expected rowIndices of length %d, got %d." % (self.rowIndices.size, self.values.size))
""" Pretty printing of a SparseMatrix
Examples -------- >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) >>> print(sm1) 2 X 2 CSCMatrix (0,0) 2.0 (1,0) 3.0 (1,1) 4.0 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) >>> print(sm1) 2 X 2 CSRMatrix (0,0) 2.0 (0,1) 3.0 (1,1) 4.0 """ else:
# Display first 16 values. else: cur_col, rowInd, _format_float(value))) else: rowInd, cur_col, _format_float(value)))
""" Representation of a SparseMatrix
Examples -------- >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) >>> sm1 SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) """
else: _format_float_list(self.values[:8]) + ["..."] + _format_float_list(self.values[-8:]) )
self.numRows, self.numCols, colPtrs, rowIndices, values, self.isTransposed)
self.numRows, self.numCols, self.colPtrs.tostring(), self.rowIndices.tostring(), self.values.tostring(), int(self.isTransposed))
% (i, self.numRows)) raise IndexError("Column index %d is out of range [0, %d)" % (j, self.numCols))
# If a CSR matrix is given, then the row index should be searched # for in ColPtrs, and the column index should be searched for in the # corresponding slice obtained from rowIndices.
else:
""" Return a numpy.ndarray """ else:
# TODO: More efficient implementation:
def dense(numRows, numCols, values): """ Create a DenseMatrix """
def sparse(numRows, numCols, colPtrs, rowIndices, values): """ Create a SparseMatrix """
# Numpy 1.14+ changed it's string format. except TypeError: pass sys.exit(-1)
|