Scatter plot of text clustering results
I am trying to scatter plot the clusters produced by the K-means clustering algorithm, but I am getting an error. Should I use the embeddings instead, or is it an indexing problem? The clustering results were OK, and I want a way to visualize the clusters, so I tried the scatter plot approach. The error message:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 15
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-63-52bc7659b609> in <module>()
2
3 for i in u_labels:
----> 4 plt.scatter(df_clusters [cluster_labels[i] ] , df_clusters [cluster_labels[i] ] , label = i)
5 #plt.scatter(df_clusters[u_labels == i , 0] , df_clusters[u_labels == i , 1] , u_labels = i)
6 plt.legend()
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in __getitem__(self, key)
3456 if self.columns.nlevels > 1:
3457 return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
3459 if is_integer(indexer):
3460 indexer = [indexer]
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 15
# Clustering our data using K-means and Word2Vec embeddings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim
from gensim import corpora
from google.colab import drive
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import re
import string
drive.mount('/content/gdrive/',force_remount=True)
txt_reader=pd.read_csv('/content/gdrive/MyDrive/project_raw_data/preprocessing_results.txt',sep=";")#,header=None
nltk.download('punkt')
from nltk.tokenize import word_tokenize
tokenized_text=[]
for prescription in txt_reader:
tokenized=word_tokenize(prescription)
cleaned= [word for word in tokenized if len(word)>3]
tokenized_text.append(cleaned)
model = Word2Vec(sentences=tokenized_text, size=300, workers=64,seed=1)
#This function will vectorize the prescriptions and return features
def vectorize(list_of_docs, model): # edit
features = []
for tokens in list_of_docs:
zero_vector = np.zeros(model.vector_size)
vectors = []
for token in tokens:
if token in model.wv:
try:
vectors.append(model.wv[token])
except KeyError:
continue
if vectors:
vectors = np.asarray(vectors)
avg_vec = vectors.mean(axis=0)
features.append(avg_vec)
else:
features.append(zero_vector)
return features
vectorized_docs = vectorize(tokenized_text, model=model)
len(vectorized_docs)
len(vectorized_docs[0])
#K-means Clustering using mini batches
def mbkmeans_clusters(
X, # Matrix of features
k, # Clusters number
mb, # mini batch size
print_silhouette_values, #per cluster
):
# Generate clusters and use MBKmeans to print Silhouette metrics
km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
print(f"For n_clusters = {k}")
print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
print(f"Inertia:{km.inertia_}")
if print_silhouette_values:
sample_silhouette_values = silhouette_samples(X, km.labels_)
print(f"Silhouette values:")
silhouette_values = []
for i in range(k):
cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
silhouette_values.append(
(
i,
cluster_silhouette_values.shape[0],
cluster_silhouette_values.mean(),
cluster_silhouette_values.min(),
cluster_silhouette_values.max(),
)
)
silhouette_values = sorted(
silhouette_values, key=lambda tup: tup[2], reverse=True
)
for s in silhouette_values:
print(
f" Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
)
return km, km.labels_ # Trained clustering model and labels based on X
clustering, cluster_labels = mbkmeans_clusters(
X=vectorized_docs,
k=17,
mb=1000,
print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
"text": tokenized_text,
"tokens": [" ".join(text) for text in tokenized_text],
"cluster": cluster_labels
})
'''
For n_clusters = 20
Silhouette coefficient: 0.11
Inertia:973821.9586816286
'''
# Presenting the most relevant words in each cluster and examining them
print("Most representative terms in each cluster:")
for i in range(17): #50
tokens_per_cluster = ""
most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=20)
for t in most_representative:
tokens_per_cluster += f"{t[0]} "
print(f"Cluster {i}: {tokens_per_cluster}")
Clustering our data using K-means and Word2Vec embeddings
Necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim
from gensim import corpora
from google.colab import drive
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import re
import string
Mounting the drive and reading the csv file
drive.mount('/content/gdrive/',force_remount=True)
Mounted at /content/gdrive/
txt_reader=pd.read_csv('/content/gdrive/MyDrive/project_raw_data/preprocessing_results.txt',sep=";")#,header=None
Tokenize the prescriptions
nltk.download('punkt')
from nltk.tokenize import word_tokenize
tokenized_text=[]
for prescription in txt_reader:
tokenized=word_tokenize(prescription)
cleaned= [word for word in tokenized if len(word)>3]
tokenized_text.append(cleaned)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
Initializing an instance of the Word2Vec model with our tokenized text
model = Word2Vec(sentences=tokenized_text, size=300, workers=64,seed=1)
# sentences is a list of lists containing the tokenized text.
# Error encountered earlier: this gensim version expects the parameter size rather than vector_size; it is the length of the vector representing each word (see the small version guard sketched below).
# workers is the number of worker threads used to parallelize training and speed it up.
# By fixing the seed I tried to make the results as consistent as possible across executions of the notebook.
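In case the gensim version matters, a minimal guard I could add (my own sketch, not part of the notebook; it assumes only the 3.x-to-4.x rename of size to vector_size):

import gensim
from gensim.models import Word2Vec

# gensim 3.x expects `size`; gensim 4.x renamed the parameter to `vector_size`.
if int(gensim.__version__.split(".")[0]) >= 4:
    model = Word2Vec(sentences=tokenized_text, vector_size=300, workers=64, seed=1)
else:
    model = Word2Vec(sentences=tokenized_text, size=300, workers=64, seed=1)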
This function will vectorize the prescriptions and return features
def vectorize(list_of_docs, model): # edit
features = []
for tokens in list_of_docs:
zero_vector = np.zeros(model.vector_size)
vectors = []
for token in tokens:
if token in model.wv:
try:
vectors.append(model.wv[token])
except KeyError:
continue
if vectors:
vectors = np.asarray(vectors)
avg_vec = vectors.mean(axis=0)
features.append(avg_vec)
else:
features.append(zero_vector)
return features
Vectorizing the prescriptions
vectorized_docs = vectorize(tokenized_text, model=model)
len(vectorized_docs)
len(vectorized_docs[0])
300
K-means Clustering using mini batches
def mbkmeans_clusters(
X, # Matrix of features
k, # Clusters number
mb, # mini batch size
print_silhouette_values, #per cluster
):
# Generate clusters and use MBKmeans to print Silhouette metrics
km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
print(f"For n_clusters = {k}")
print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
print(f"Inertia:{km.inertia_}")
if print_silhouette_values:
sample_silhouette_values = silhouette_samples(X, km.labels_)
print(f"Silhouette values:")
silhouette_values = []
for i in range(k):
cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
silhouette_values.append(
(
i,
cluster_silhouette_values.shape[0],
cluster_silhouette_values.mean(),
cluster_silhouette_values.min(),
cluster_silhouette_values.max(),
)
)
silhouette_values = sorted(
silhouette_values, key=lambda tup: tup[2], reverse=True
)
for s in silhouette_values:
print(
f" Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
)
return km, km.labels_ # Trained clustering model and labels based on X
clustering, cluster_labels = mbkmeans_clusters(
X=vectorized_docs,
k=17,
mb=1000,
print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
"text": tokenized_text,
"tokens": [" ".join(text) for text in tokenized_text],
"cluster": cluster_labels
})
'''
For n_clusters = 20
Silhouette coefficient: 0.11
Inertia:973821.9586816286
'''
#trying to scatter plot results
#Getting unique labels
u_labels = np.unique(cluster_labels)
df_clusters
#plotting the results:
for i in u_labels:
#plt.scatter(df_clusters [cluster_labels[i] ] , df_clusters [cluster_labels[i] ] , label = i)
plt.scatter(df_clusters[u_labels == i , 0] , df_clusters[u_labels == i , 1] , u_labels = i)
plt.legend()
plt.show()
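What I was hoping to end up with is one point per prescription, coloured by its cluster. A rough sketch of what I think the indexing should look like instead (my own guess, not code from the notebook above: it assumes the 300-dimensional document vectors in vectorized_docs are reduced to two components with PCA purely for plotting, and that points are selected with a boolean mask over cluster_labels rather than by indexing df_clusters with a label):

from sklearn.decomposition import PCA

X = np.asarray(vectorized_docs)                                # (n_docs, 300) averaged Word2Vec vectors
coords = PCA(n_components=2, random_state=1).fit_transform(X)  # 2-D projection just for visualization

for i in u_labels:
    mask = cluster_labels == i                                 # documents assigned to cluster i
    plt.scatter(coords[mask, 0], coords[mask, 1], label=i, s=10)
plt.legend()
plt.show()

Is something along these lines the right way to visualize the clusters, or should I be plotting the embeddings differently?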