用Python实现K-means算法时候,要计算随机两个数之间的欧氏距离,数据量为5000行,但计算的时间却有500多秒,不知道有什么能优化,求指教,代码如下循环
for i in range(len(data)): # 计算任意两点距离和
for j in range(i+1, len(data)):
random_sum += ed_relate(data[i][2:], data[j][2:])
ed_relate
def ed_relate(dataX, dataY):
'''
:param dataX:第一行
:param dataY: 第二行
:return: 之间的相似度
'''
sum = 0
if len(dataX) == len(dataY):
for a in range(0, len(dataX)):
sum += (float(dataX[a])-float(dataY[a])) ** 2
relate = math.sqrt(sum)
return relate
else:
print 'len is not equal'
return 0
数据data
[['3', '0010000000000', '1', '1', '4', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '4', '4', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000000010', '1', '0', '4', '2', '1', '3', '3', '2', '3', '5', '3', '2', '2', '3', '4', '2', '2', '4', '1', '1', '1', '1', '3', '2', '3', '2', '2', '3', '2', '2', '3']
['3', '0010000000000', '1', '3', '2', '3', '3', '3', '3', '2', '3', '2', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '2', '2', '2', '3']
['2', '1000000000000', '2', '1', '3', '4', '2', '2', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '2', '2', '3', '3', '2', '2', '2', '2', '3', '2', '2']
['2', '1000000000000', '1', '1', '5', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '3']
['3', '0000000100000', '1', '0', '5', '2', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '2', '3', '3', '2']
['3', '0000000100000', '1', '0', '4', '2', '3', '3', '3', '2', '2', '2', '2', '2', '2', '1', '1', '2', '2', '2', '2', '4', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0010000000000', '2', '1', '3', '4', '2', '2', '3', '2', '2', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '2', '3', '2', '2', '3', '2', '2', '3']
['3', '0000010000000', '1', '1', '3', '2', '2', '2', '3', '2', '2', '2', '2', '2', '3', '2', '2', '3', '2', '4', '2', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '3', '2']
['3', '0010000000000', '3', '1', '4', '3', '3', '3', '4', '3', '3', '2', '3', '3', '2', '1', '1', '1', '4', '4', '4', '4', '4', '4', '3', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '0100000000000', '3', '4', '1', '2', '3', '4', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '4', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['2', '0000000000100', '1', '2', '3', '4', '3', '2', '3', '1', '2', '2', '2', '2', '2', '2', '2', '4', '2', '2', '2', '3', '3', '2', '3', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000000010', '1', '3', '3', '2', '2', '3', '2', '3', '3', '3', '3', '3', '2', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['1', '0100000000000', '1', '1', '3', '2', '3', '3', '3', '2', '3', '3', '3', '3', '3', '3', '2', '1', '1', '3', '2', '2', '3', '1', '1', '1', '1', '1', '2', '3', '3', '1', '2']
['1', '0100000000000', '1', '2', '4', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000100000', '1', '1', '3', '3', '3', '2', '4', '4', '4', '4', '4', '2', '2', '1', '1', '3', '3', '4', '3', '4', '3', '1', '2', '1', '1', '1', '2', '2', '1', '1', '1']
['3', '0010000000000', '1', '2', '3', '3', '3', '2', '2', '2', '2', '3', '2', '2', '2', '2', '3', '2', '2', '3', '3', '3', '2', '2', '2', '2', '3', '3', '3', '2', '2', '2', '2']
['3', '0000010000000', '1', '1', '5', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000010000000', '1', '1', '5', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['3', '0000000100000', '1', '4', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']]
只给出了前20行
The solution has been found. The problem lies in the two floats used to calculate the Euclidean distance
In calculation, the speed is increased by 10 times🎜sum += (float(dataX[a])-float(dataY[a])) ** 2
sum += (float(dataX[a])-float(dataY[a])) ** 2
先将data全转成 int
data = [[int(x) for x in row] for row in data]
First convert all data into intdata = [[int(x) for x in row] for row in data]
Not tested, it should improve some performance.
It seems that there is no big problem with your existing code. I repeated your calculation 1,000 times, which is roughly equivalent to 20,000 pieces of data, and the time is 4.4 seconds.
You'd better profile it to see where the problem is.
There seems to be no better way in terms of performance, but I think your double loop can be written more elegantly, such as:
Using numpy and pandas