像下面这样的代码应该可以工作:
from pyspark.mllib.linalg import Vectors, SparseVector, DenseVector
import numpy as np
def add(v1, v2):
    """Add two sparse vectors of equal size, dropping entries that sum to zero.

    >>> v1 = Vectors.sparse(3, {0: 1.0, 2: 1.0})
    >>> v2 = Vectors.sparse(3, {1: 1.0})
    >>> add(v1, v2)
    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0})
    """
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    # Not particularly efficient but we are limited by SPARK-10973
    # Create index -> value dicts for each vector
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    # Sum over the union of indices, computing each sum ONCE
    # (the original comprehension evaluated it twice: filter + value),
    # and keep only entries that do not cancel to zero.
    values = {}
    for i in set(v1d).union(v2d):
        total = v1d.get(i, zero) + v2d.get(i, zero)
        if total != zero:
            values[i] = total
    return Vectors.sparse(v1.size, values)
如果你只想单次遍历、并且不在意结果中出现显式的零值,可以把上面的代码修改成这样:
from collections import defaultdict
def add(v1, v2):
    """Return the element-wise sum of two equally-sized SparseVectors.

    Single pass over each vector's stored entries; entries that cancel
    to zero are NOT removed from the result.
    """
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    # Accumulate index -> running total, defaulting to 0.0
    totals = defaultdict(float)
    for vec in (v1, v2):
        for idx, val in zip(vec.indices, vec.values):
            totals[idx] += val
    return Vectors.sparse(v1.size, dict(totals))
如果你愿意,也可以尝试给 SparseVector 打猴子补丁(monkey-patch):
# Monkey-patch SparseVector so the `+` operator dispatches to add() above.
SparseVector.__add__ = add
v1 = Vectors.sparse(5, {0: 1.0, 2: 3.0})
v2 = Vectors.sparse(5, {0: -3.0, 2: -3.0, 4: 10})
# Index 2 cancels out (3.0 + -3.0), so it is dropped from the result.
v1 + v2
## SparseVector(5, {0: -2.0, 4: 10.0})
或者,你也可以使用 scipy.sparse:
from scipy.sparse import csc_matrix
from pyspark.mllib.regression import LabeledPoint
# Build each SparseVector as a (size x 1) CSC column matrix:
# data = vector values, row indices = vector indices, all in column 0.
m1 = csc_matrix((
    v1.values,
    (v1.indices, [0] * v1.numNonzeros())),
    shape=(v1.size, 1))
m2 = csc_matrix((
    v2.values,
    (v2.indices, [0] * v2.numNonzeros())),
    shape=(v2.size, 1))
# scipy sparse matrices support `+` natively; the sum is passed directly
# as the features of a LabeledPoint (label 0 here is just a placeholder).
LabeledPoint(0, m1 + m2)
感谢您的回答,有效。你能解释一下第一种方法是如何完成加法计算的吗? – Nick
它只是创建两个字典{index:value}并添加相应的值来创建输出字典。我已经更新了一个应该更易于阅读的解决方案。 – zero323