ExcelでデータをCSV形式で作成し
それを読み込む形で
混合ガウス分布のそれぞれの中心μの座標を
KMeans法を用いて求めてみました
まず、仮の中心座標μを指定し(図表1)
そこから最小2乗法を6回繰り返すことにより
それぞれの本来の中心点を求めています
下にその結果とコードを載せておきます
殆どのコード内容はこの本から引用しています
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, preprocessing
data = pd.read_csv('data.csv')
data_s = data.sort_values('X0')
X =np.c_[data["X0"], data["X1"]]
X_range0 = [-3, 3]
X_range1 = [-3, 3]
N = X.shape[0]
K = 3
X_col = ['cyan', 'black', 'green']
# MuとRの初期化
Mu = np.array([[-2, 1,], [-2, 0], [-2, -1]])
R = np.c_[np.ones((N, 1), dtype=int), np.zeros((N, 2), dtype=int)]
def show_prm(x, r, mu, col):
for k in range(K):
# データ分布の描写
plt.plot(x[r[:, k] == 1, 0], x[r[:, k] == 1, 1],
marker='o',
markerfacecolor=X_col[k], markeredgecolor='k',
markersize=6, alpha=0.5, linestyle='none')
# データの平均を「星マーク」で描写
plt.plot(mu[k, 0], mu[k, 1], marker='*',
markerfacecolor='red', markersize=30,
markeredgecolor='k', markeredgewidth=1)
plt.xlim(X_range0)
plt.ylim(X_range1)
plt.grid(True)
# r を決める
def step1_kmeans(x0, x1, mu):
N = len(x0)
r = np.zeros((N, K))
for n in range(N):
wk = np.zeros(K)
for k in range(K):
wk[k] = (x0[n] - mu[k, 0])**2 + (x1[n] - mu[k, 1])**2
r[n, np.argmin(wk)] = 1
return r
R = step1_kmeans(X[:, 0], X[:, 1], Mu)
# Mu を決める
def step2_kmeans(x0, x1, r):
mu = np.zeros((K, 2))
for k in range(K):
mu[k, 0] = np.sum(r[:, k] * x0) / np.sum(r[:, k])
mu[k, 1] = np.sum(r[:, k] * x1) / np.sum(r[:, k])
return mu
Mu = step2_kmeans(X[:, 0], X[:, 1], R)
Mu = np.array([[-2, 1], [-2, 0], [-2, -1]])
max_it = 6 # 繰り返しの回数
plt.rcParams["font.size"] = 25
plt.figure(figsize=(10, 6.5))
for it in range(0, max_it):
plt.subplot(2, 3, it + 1)
R = step1_kmeans(X[:, 0], X[:, 1], Mu)
show_prm(X, R, Mu, X_col)
plt.title("{0:d}".format(it + 1))
plt.xticks(range(X_range0[0], X_range0[1]), "")
plt.yticks(range(X_range1[0], X_range1[1]), "")
Mu = step2_kmeans(X[:, 0], X[:, 1], R)
plt.show()
data_s = data.sort_values('X0')
X =np.c_[data["X0"], data["X1"]]
X_range0 = [-3, 3]
X_range1 = [-3, 3]
N = X.shape[0]
K = 3
X_col = ['cyan', 'black', 'green']
# MuとRの初期化
Mu = np.array([[-2, 1,], [-2, 0], [-2, -1]])
R = np.c_[np.ones((N, 1), dtype=int), np.zeros((N, 2), dtype=int)]
def show_prm(x, r, mu, col):
for k in range(K):
# データ分布の描写
plt.plot(x[r[:, k] == 1, 0], x[r[:, k] == 1, 1],
marker='o',
markerfacecolor=X_col[k], markeredgecolor='k',
markersize=6, alpha=0.5, linestyle='none')
# データの平均を「星マーク」で描写
plt.plot(mu[k, 0], mu[k, 1], marker='*',
markerfacecolor='red', markersize=30,
markeredgecolor='k', markeredgewidth=1)
plt.xlim(X_range0)
plt.ylim(X_range1)
plt.grid(True)
# r を決める
def step1_kmeans(x0, x1, mu):
N = len(x0)
r = np.zeros((N, K))
for n in range(N):
wk = np.zeros(K)
for k in range(K):
wk[k] = (x0[n] - mu[k, 0])**2 + (x1[n] - mu[k, 1])**2
r[n, np.argmin(wk)] = 1
return r
R = step1_kmeans(X[:, 0], X[:, 1], Mu)
# Mu を決める
def step2_kmeans(x0, x1, r):
mu = np.zeros((K, 2))
for k in range(K):
mu[k, 0] = np.sum(r[:, k] * x0) / np.sum(r[:, k])
mu[k, 1] = np.sum(r[:, k] * x1) / np.sum(r[:, k])
return mu
Mu = step2_kmeans(X[:, 0], X[:, 1], R)
Mu = np.array([[-2, 1], [-2, 0], [-2, -1]])
max_it = 6 # 繰り返しの回数
plt.rcParams["font.size"] = 25
plt.figure(figsize=(10, 6.5))
for it in range(0, max_it):
plt.subplot(2, 3, it + 1)
R = step1_kmeans(X[:, 0], X[:, 1], Mu)
show_prm(X, R, Mu, X_col)
plt.title("{0:d}".format(it + 1))
plt.xticks(range(X_range0[0], X_range0[1]), "")
plt.yticks(range(X_range1[0], X_range1[1]), "")
Mu = step2_kmeans(X[:, 0], X[:, 1], R)
plt.show()
