How can I optimize these nested for loops?
I have a distance matrix BD_D_R that contains the distance from each element of the main matrix psnr_bitrate to every other element. On each iteration we have 6 clusters of the main elements, and for every cluster we have to find the element with the lowest total distance to all the other elements in that cluster. To do this I currently run three nested for loops, which is very time-consuming. Is there a way to find this point for each cluster without those loops and without recomputing the distances? The distances are computed once, but the number of elements is high, so this step dominates the runtime. psnr_bitrate is an array of size (700, 6) and BD_D_R is an array of size 700x700. Each cluster has about 200 members or fewer. If I could extract the indices of the elements in each cluster and compute the distances from the precomputed distance matrix without loops, I think that would work, but I do not know how to do this in Python.
I use the following code for updating the points. As you can see, I implement k-means clustering with my own distance function BD_RATE, and the bottleneck is the centroid update, where I have to compute the distance of each node to all other nodes in its cluster. Do you have any suggestions for doing this part without nested for loops?
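For reference, here is a minimal sketch of the lookup being asked about, assuming BD_D_R is the precomputed 700x700 distance matrix and labels is a length-700 array of 0-based cluster ids produced by the assignment step (labels and update_centroids_from_matrix are illustrative names, not part of the code below):

import numpy as np

def update_centroids_from_matrix(BD_D_R, labels, K):
    # For each cluster, pick the member with the lowest total distance to
    # all other members, using only the precomputed matrix.
    centroid_idx = np.empty(K, dtype=int)
    for k in range(K):
        members = np.where(labels == k)[0]       # row indices of this cluster's members
        sub = BD_D_R[np.ix_(members, members)]   # pairwise distances within the cluster
        centroid_idx[k] = members[np.argmin(sub.sum(axis=1))]
    return centroid_idx

The only Python-level loop runs K = 6 times; the per-member sums over the up-to-200x200 pairs are done by NumPy on the submatrix, which is what removes the two inner loops.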
#####################################################################################
import glob
import random

import numpy as np
import pandas as pd

# BD_RATE: Bjontegaard-delta metric function, defined elsewhere in the project.

def centroid_initiation(centroid, K, label, psnr_bitrate):
    array_part = np.array_split(psnr_bitrate, K)
    if label == 0:
        for i in range(K):
            tmp = array_part[i]
            # sort rows by the first column without changing the other columns
            tmp = tmp[np.lexsort(np.fliplr(tmp).T)]
            # take the median row of this part as the initial centroid
            centroid[i, :] = tmp[int(np.floor(len(array_part[i]) / 2)), :]
        return centroid
    else:
        # re-initialize with K randomly chosen samples
        rnd = random.sample(range(0, psnr_bitrate.shape[0]), K)
        centroid = psnr_bitrate[rnd]
        return centroid
###################################################################################
def kmeans_BD(psnr_bitrate, K, centroid):
    # note: uses the globals `rate` (bitrate ladder) and `label` (init mode)
    m = psnr_bitrate.shape[0]  # number of samples
    n = psnr_bitrate.shape[1]  # number of bitrates
    BD = np.zeros((m, K))      # distance of every sample to every centroid
    wr = 0.5  # weight of BD_rate
    wq = 0.5  # weight of BD_Q
    n_itr = 1000
    for itr in range(n_itr):
        # assignment step: distance from each sample to each centroid
        for k in range(K):
            for i in range(len(psnr_bitrate)):
                tmp_return = BD_RATE('VMAF_Y', rate, centroid[k, :], rate, psnr_bitrate[i, :])
                if tmp_return == np.NINF:
                    BD[i, k] = np.inf  # curves do not overlap
                else:
                    BD[i, k] = np.abs(tmp_return)
                # alternative weighted distance:
                # BD_R = bd_rate(rate, psnr_bitrate[i, :], rate, centroid[k, :], i)
                # BD_Q = bd_PSNR(rate, psnr_bitrate[i, :], rate, centroid[k, :], i)
                # BD[i, k] = wr * BD_R + wq * BD_Q
        # set aside rows whose distance to every centroid is inf
        finding_non_infValue = np.asarray(BD[BD.min(axis=1) != np.inf])
        indx_removed = np.where(BD.min(axis=1) == np.inf)[0]
        indx_nonremoved = np.where(BD.min(axis=1) != np.inf)[0]
        removed_member = [psnr_bitrate[x] for x in indx_removed]
        saved_member = [psnr_bitrate[x] for x in indx_nonremoved]
        # store the closest centroid (1-based label) and its distance
        minimum = np.argmin(finding_non_infValue, axis=1) + 1
        minimum_distance = finding_non_infValue.min(axis=1)
        minimum_merge = np.zeros((minimum.shape[0], 2))
        minimum_merge[:, 0] = minimum
        minimum_merge[:, 1] = minimum_distance
        # gather the members of each cluster
        clusters = {}
        for itr1 in range(K):
            clusters[itr1 + 1] = np.array([]).reshape(n, 0)
        for itr1 in range(len(saved_member)):
            clusters[minimum[itr1]] = np.c_[clusters[minimum[itr1]], saved_member[itr1]]
        for itr1 in range(K):
            clusters[itr1 + 1] = clusters[itr1 + 1].T
        # record cluster sizes; -1 marks an empty cluster
        clusters_tmp = np.zeros((K, 1))
        for itr1 in range(K):
            if clusters[itr1 + 1].shape[0] != 0:  # was `!= []`, which is always True
                clusters_tmp[itr1] = clusters[itr1 + 1].shape[0]
            else:
                clusters_tmp[itr1] = -1
        num = (clusters_tmp == -1).sum()  # number of empty clusters
        if num == K:
            centroid = centroid_initiation(centroid, K, label, psnr_bitrate)
        elif num > 0:
            # refill each empty cluster with the farthest member of the biggest cluster
            tmp_idx = 0
            while num > 0:
                indx = np.where(clusters_tmp == -1)[0] + 1  # labels of empty clusters
                H_cluster = np.argmax(clusters_tmp) + 1     # label of the biggest cluster
                tmp = np.where(minimum_merge == max([i for i in minimum_merge if i[0] == H_cluster], key=lambda x: x[1]))
                idx, = np.where(tmp[1] == 1)
                clusters[indx[tmp_idx]] = clusters[H_cluster][idx]  # the moved member becomes the empty cluster's sole member
                clusters[H_cluster] = np.delete(clusters[H_cluster], idx, axis=0)  # remove it from its old cluster
                num -= 1
                tmp_idx += 1
        # update step: the new centroid of each cluster is the member with the
        # smallest mean BD distance to all other members (this double loop is
        # the bottleneck described above)
        for itr2 in range(K):
            tmp_cl = clusters[itr2 + 1]
            if len(tmp_cl) > 1:
                BD_cent = np.zeros((len(tmp_cl), 1))
                for itr3 in range(len(tmp_cl)):
                    sumv = 0
                    for itr5 in range(len(tmp_cl)):
                        value = BD_RATE('VMAF_Y', rate, tmp_cl[itr3, :], rate, tmp_cl[itr5, :])
                        if value != np.NINF:
                            sumv += np.abs(value)
                        else:
                            sumv += 1000  # penalty for curves with no overlap
                    BD_cent[itr3] = sumv / len(tmp_cl)
                new_centroid_index = np.argmin(BD_cent)
                centroid[itr2] = clusters[itr2 + 1][new_centroid_index]
    # visualize the final clustering
    scaled_features = pd.DataFrame(psnr_bitrate)
    scaled_features['cluster'] = minimum
    pd.plotting.parallel_coordinates(scaled_features, 'cluster')
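Since BD_RATE itself is hard to vectorize, one option is to pay its cost a single time before the iterations and only index into the result afterwards. A sketch under that assumption (precompute_bd_matrix and D are hypothetical names; 1000.0 mirrors the non-overlap penalty used in the update step above):

def precompute_bd_matrix(psnr_bitrate, rate):
    # One-time O(m^2) pass over all curve pairs; the k-means iterations then
    # read distances out of D instead of calling BD_RATE repeatedly.
    m = len(psnr_bitrate)
    D = np.full((m, m), 1000.0)  # default: penalty for curves with no overlap
    for i in range(m):
        for j in range(m):
            value = BD_RATE('VMAF_Y', rate, psnr_bitrate[i], rate, psnr_bitrate[j])
            if value != np.NINF:
                D[i, j] = np.abs(value)
    np.fill_diagonal(D, 0.0)
    return D

With D in hand, the itr3/itr5 double loop reduces to the np.ix_ submatrix lookup sketched earlier, and because every updated centroid is itself a row of psnr_bitrate, the assignment distances could likewise be read as columns of D rather than recomputed with BD_RATE.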
psnr_bitrate = np.vstack([np.loadtxt(path, dtype='float')
                          for path in glob.iglob(r'C:/Users/jamalm8/ffmpeg/UGCvideos/*.txt')])
# remove non-monotonic curves
itr = 0
m = len(psnr_bitrate)
while itr < len(psnr_bitrate):
    brqtypairs1 = psnr_bitrate[itr, :]
    rd1_monotonic = all(x < y for x, y in zip(brqtypairs1, brqtypairs1[1:]))
    if not rd1_monotonic:
        print(itr)
        psnr_bitrate = np.delete(psnr_bitrate, itr, axis=0)
        itr = itr - 1
    itr = itr + 1
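# Aside: the while loop above can be replaced by a vectorized filter; an
# equivalent sketch using np.diff (rows with strictly increasing values kept):
#   keep = np.all(np.diff(psnr_bitrate, axis=1) > 0, axis=1)
#   psnr_bitrate = psnr_bitrate[keep]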
m = psnr_bitrate.shape[0]  # number of samples
n = psnr_bitrate.shape[1]  # number of bitrates
K = 6  # number of clusters
rate = [1000, 2000, 3000, 4000, 5000, 6000]
# first initialization of the cluster centroids:
# the median row of each of the K parts is taken as the centroid
array_part = np.array_split(psnr_bitrate, K)
centroid = np.zeros((K, 6))
label = 0
centroid = centroid_initiation(centroid, K, label, psnr_bitrate)
label += 1
wr = 0.5  # weight of BD_rate
wq = 0.5  # weight of BD_Q
kmeans_BD(psnr_bitrate, K, centroid)