Skip to content

Commit

Permalink
Robust parser and removed unnecessary methods
Browse files Browse the repository at this point in the history
  • Loading branch information
MechCoder committed May 6, 2015
1 parent f779561 commit 1bd3c04
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 103 deletions.
129 changes: 35 additions & 94 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

import sys
import array
from math import sqrt

if sys.version >= '3':
basestring = str
Expand Down Expand Up @@ -209,35 +208,24 @@ def __init__(self, ar):
ar = ar.astype(np.float64)
self.array = ar

def toString(self):
"""
Convert DenseVector to string representation.
>>> a = DenseVector([0, 1, 2, 3])
>>> a.toString()
'[0.0,1.0,2.0,3.0]'
"""
return str(self)

def copy(self):
return DenseVector(np.copy(self.array))

@staticmethod
def parse(vectorString):
"""
Parse string representation back into the DenseVector.
>>> DenseVector.parse('[0.0,1.0,2.0,3.0]')
>>> DenseVector.parse(' [ 0.0,1.0,2.0, 3.0]')
DenseVector([0.0, 1.0, 2.0, 3.0])
"""
vectorString = vectorString[1:-1]
start = vectorString.find('[')
end = vectorString.find(']')
vectorString = vectorString[start + 1: end]
return DenseVector([float(val) for val in vectorString.split(',')])

def __reduce__(self):
return DenseVector, (self.array.tostring(),)

def numNonzeros(self):
return np.nonzero(self.array)[0].size
return np.count_nonzero(self.array)

def norm(self, p):
"""
Expand All @@ -249,14 +237,7 @@ def norm(self, p):
>>> a.norm(1)
6.0
"""
if p == 1:
return np.sum(np.abs(self.array))
elif p == 2:
return sqrt(np.dot(self.array, self.array))
elif p == np.inf:
return np.max(np.abs(self.array))
else:
return pow(np.power(self.array, p), 1.0 / p)
return np.linalg.norm(self.array, p)

def dot(self, other):
"""
Expand Down Expand Up @@ -434,11 +415,8 @@ def __init__(self, size, *args):
if self.indices[i] >= self.indices[i + 1]:
raise TypeError("indices array must be sorted")

def copy(self):
return SparseVector(self.size, np.copy(self.indices), np.copy(self.values))

def numNonzeros(self):
return np.nonzero(self.values)[0].size
return np.count_nonzero(self.values)

def norm(self, p):
"""
Expand All @@ -450,42 +428,36 @@ def norm(self, p):
>>> a.norm(2)
5.0
"""
if p == 1:
return np.sum(np.abs(self.values))
elif p == 2:
return sqrt(np.dot(self.values, self.values))
elif p == np.inf:
return np.max(np.abs(self.values))
else:
return pow(np.power(self.values, p), 1.0 / p)
return np.linalg.norm(self.values, p)

def __reduce__(self):
return (SparseVector, (self.size, self.indices.tostring(), self.values.tostring()))

def toString(self):
"""
Convert SparseVector to string representation.
>>> a = SparseVector(4, [0, 1], [4, 5])
>>> a.toString()
'(4,[0,1],[4.0,5.0])'
"""
return str(self)
return (
SparseVector,
(self.size, self.indices.tostring(), self.values.tostring()))

@staticmethod
def parse(vectorString):
"""
Parse string representation back into the SparseVector.
>>> SparseVector.parse('(4,[0,1],[4.0,5.0])')
>>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )')
SparseVector(4, {0: 4.0, 1: 5.0})
"""
size = int(vectorString[1])
start = vectorString.find('(')
end = vectorString.find(')')
vectorString = vectorString[start+1: end].strip()
size = int(vectorString[0])

ind_start = vectorString.find('[')
ind_end = vectorString.find(']')
index_string = vectorString[4: ind_end]
indices = [int(ind) for ind in index_string.split(',')]
value_string = vectorString[ind_end + 3: -2]
values = [float(val) for val in value_string.split(',')]
ind_list = vectorString[ind_start + 1: ind_end].split(',')
indices = [int(ind) for ind in ind_list]
vectorString = vectorString[ind_end + 1:].strip()

val_start = vectorString.find('[')
val_end = vectorString.find(']')
val_list = vectorString[val_start + 1: val_end].split(',')
values = [float(val) for val in val_list]
return SparseVector(size, indices, values)

def dot(self, other):
Expand Down Expand Up @@ -528,15 +500,12 @@ def dot(self, other):

assert len(self) == _vector_size(other), "dimension mismatch"

if type(other) in (np.ndarray, array.array):
if type(other) in (np.ndarray, array.array, DenseVector):
result = 0.0
for i, ind in enumerate(self.indices):
result += self.values[i] * other[ind]
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
return result

elif isinstance(other, DenseVector):
return np.dot(other.toArray()[self.indices], self.values)

elif type(other) is SparseVector:
result = 0.0
i, j = 0, 0
Expand Down Expand Up @@ -580,28 +549,19 @@ def squared_distance(self, other):
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if type(other) in (list, array.array, np.array, np.ndarray):
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
if type(other) is np.array and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
other.ndim)
result = 0.0
j = 0 # index into our own array
for i, other_ind in enumerate(other):
for i in xrange(len(other)):
if j < len(self.indices) and self.indices[j] == i:
diff = self.values[j] - other_ind
diff = self.values[j] - other[i]
result += diff * diff
j += 1
else:
result += other_ind * other_ind
return result

elif isinstance(other, DenseVector):
bool_ind = np.zeros(len(other), dtype=bool)
bool_ind[self.indices] = True
dist = other.toArray()[bool_ind] - self.values
result = np.dot(dist, dist)
other_values = other.toArray()[~bool_ind]
result += np.dot(other_values, other_values)
result += other[i] * other[i]
return result

elif type(other) is SparseVector:
Expand Down Expand Up @@ -743,30 +703,11 @@ def stringify(vector):
"""
return str(vector)

@staticmethod
def dot(a, b):
"""
Dot product between two vectors.
a and b can be of type, SparseVector, DenseVector, np.ndarray
or array.array.
>>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
>>> b = Vectors.dense([23, 41, 9, 1])
>>> Vectors.dot(a, b)
27.0
>>> Vectors.dot(a, a)
17.0
>>> Vectors.dot(a, np.array([0, 1, 2, 4]))
16.0
"""
a, b = _convert_to_vector(a), _convert_to_vector(b)
return a.dot(b)

@staticmethod
def squared_distance(a, b):
"""
Squared distance between two vectors.
a and b can be of type, SparseVector, DenseVector, np.ndarray
a and b can be of type SparseVector, DenseVector, np.ndarray
or array.array.
>>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
Expand All @@ -786,7 +727,7 @@ def norm(vec, p):

@staticmethod
def parse(vectorString):
if vectorString[0] == '[':
if vectorString.find('(') == -1:
return DenseVector.parse(vectorString)
return SparseVector.parse(vectorString)

Expand Down
14 changes: 5 additions & 9 deletions python/pyspark/mllib/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ def test_dot(self):
self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
self.assertEquals(30.0, lst.dot(dv))
self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
self.assertEquals(Vectors.dot(sv, sv), 5.)
self.assertEquals(Vectors.dot(sv, dv), 10.)
self.assertEquals(Vectors.dot(dv, sv), 10.)
self.assertEquals(Vectors.dot(sv, array([2, 5, 7, 8])), 21.0)

def test_squared_distance(self):
sv = SparseVector(4, {1: 1, 3: 2})
Expand Down Expand Up @@ -224,13 +220,13 @@ def test_dense_matrix_is_transposed(self):
self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

def test_parse_matrix(self):
def test_parse_vector(self):
a = DenseVector([3, 4, 6, 7])
self.assertTrue(a.toString(), '[3.0,4.0,6.0,7.0]')
self.assertTrue(Vectors.parse(a.toString()), a)
self.assertTrue(str(a), '[3.0,4.0,6.0,7.0]')
self.assertTrue(Vectors.parse(str(a)), a)
a = SparseVector(4, [0, 2], [3, 4])
self.assertTrue(a.toString(), '(4,[0,2],[3.0,4.0])')
self.assertTrue(Vectors.parse(a.toString()), a)
self.assertTrue(str(a), '(4,[0,2],[3.0,4.0])')
self.assertTrue(Vectors.parse(str(a)), a)

def test_norms(self):
a = DenseVector([0, 2, 3, -1])
Expand Down

0 comments on commit 1bd3c04

Please sign in to comment.