Using inertia instead of silhouette
I have a problem. I am working with k-means and would like to find the optimal number of clusters. Unfortunately, my data set is too large to apply the silhouette score. Is there an option to adapt this code and replace the silhouette with the inertia?
MCVE:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import silhouette_score
import matplotlib as mpl
import matplotlib.pyplot as plt

X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0],
              [10, 2], [10, 4], [10, 0],
              [1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0],
              [10, 2], [10, 4], [10, 0],
              [1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0],
              [10, 2], [10, 4], [10, 0],
              [1, 2], [1, 4], [1, 0]])

kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
                for k in range(1, 10)]
inertias = [model.inertia_ for model in kmeans_per_k]
silhouette_scores = [silhouette_score(X, model.labels_)
                     for model in kmeans_per_k[1:]]

from sklearn.metrics import silhouette_samples
from matplotlib.ticker import FixedLocator, FixedFormatter

plt.figure(figsize=(11, 9))

for k in (3, 4, 5, 6):
    plt.subplot(2, 2, k - 2)

    y_pred = kmeans_per_k[k - 1].labels_
    silhouette_coefficients = silhouette_samples(X, y_pred)

    padding = len(X) // 30
    pos = padding
    ticks = []
    for i in range(k):
        coeffs = silhouette_coefficients[y_pred == i]
        coeffs.sort()

        color = mpl.cm.Spectral(i / k)
        plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs,
                          facecolor=color, edgecolor=color, alpha=0.7)
        ticks.append(pos + len(coeffs) // 2)
        pos += len(coeffs) + padding

    plt.gca().yaxis.set_major_locator(FixedLocator(ticks))
    plt.gca().yaxis.set_major_formatter(FixedFormatter(range(k)))
    if k in (3, 5):
        plt.ylabel("Cluster")

    if k in (5, 6):
        plt.gca().set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        plt.xlabel("Silhouette Coefficient")
    else:
        plt.tick_params(labelbottom=False)

    plt.axvline(x=silhouette_scores[k - 2], color="red", linestyle="--")
    plt.title("$k={}$".format(k), fontsize=16)

# save_fig("silhouette_analysis_plot")
plt.show()
Comments (1)
First of all, I suggest calculating the silhouette score on a subset of the data by passing the sample_size and random_state arguments (the latter for reproducibility) to silhouette_score. This may save you some time while still letting you compute and plot fairly comprehensive information. But as you know, there are plenty of options for measuring clustering quality, along with visualizations. The one you mentioned is the elbow method (inertia), which can be used as sketched below. This article introduces several useful yet simple techniques for assessing clustering quality.
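
A minimal sketch of both suggestions, assuming the same scikit-learn KMeans setup as in the question; the random toy data and the sample_size value of 200 are placeholders for your own (much larger) data set, not part of the original answer. Inertia is read off the fitted models to draw an elbow curve, and the silhouette score is computed on a random subsample only.

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Placeholder data standing in for the real, much larger data set.
X = np.random.RandomState(42).rand(1000, 2)

ks = list(range(1, 10))
kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X) for k in ks]

# Elbow method: inertia is stored on every fitted model, so it costs
# nothing extra regardless of the size of X.
inertias = [model.inertia_ for model in kmeans_per_k]

# Silhouette on a subsample: sample_size restricts the pairwise-distance
# computation to a random subset; random_state makes it reproducible.
silhouette_scores = [
    silhouette_score(X, model.labels_, sample_size=200, random_state=42)
    for model in kmeans_per_k[1:]  # the silhouette score is undefined for k=1
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4))

ax1.plot(ks, inertias, "bo-")
ax1.set_xlabel("$k$")
ax1.set_ylabel("Inertia")
ax1.set_title("Elbow curve")

ax2.plot(ks[1:], silhouette_scores, "rs-")
ax2.set_xlabel("$k$")
ax2.set_ylabel("Silhouette score (sampled)")
ax2.set_title("Sampled silhouette")

plt.show()

Note that inertia always decreases as k grows, so with the elbow curve you look for the k at which the decrease flattens out, rather than for a maximum as with the silhouette score.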