4 и 5
This commit is contained in:
parent
7aff499269
commit
9eba71f6ee
4081
data/fish_data.csv
Normal file
4081
data/fish_data.csv
Normal file
File diff suppressed because it is too large
Load Diff
1003
lec3.ipynb
1003
lec3.ipynb
File diff suppressed because one or more lines are too long
2310
lec4.ipynb
2310
lec4.ipynb
File diff suppressed because one or more lines are too long
2490
lec4_1.ipynb
Normal file
2490
lec4_1.ipynb
Normal file
File diff suppressed because one or more lines are too long
3719
lec4_reg.ipynb
Normal file
3719
lec4_reg.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
1339
lec5.ipynb
Normal file
1339
lec5.ipynb
Normal file
File diff suppressed because one or more lines are too long
13
utils.py
13
utils.py
@ -7,6 +7,7 @@ from sklearn.model_selection import train_test_split
|
|||||||
|
|
||||||
def split_stratified_into_train_val_test(
|
def split_stratified_into_train_val_test(
|
||||||
df_input,
|
df_input,
|
||||||
|
target_colname="z",
|
||||||
stratify_colname="y",
|
stratify_colname="y",
|
||||||
frac_train=0.6,
|
frac_train=0.6,
|
||||||
frac_val=0.15,
|
frac_val=0.15,
|
||||||
@ -51,14 +52,16 @@ def split_stratified_into_train_val_test(
|
|||||||
if stratify_colname not in df_input.columns:
|
if stratify_colname not in df_input.columns:
|
||||||
raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
|
raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
|
||||||
|
|
||||||
|
if target_colname not in df_input.columns:
|
||||||
|
raise ValueError("%s is not a column in the dataframe" % (target_colname))
|
||||||
|
|
||||||
X = df_input # Contains all columns.
|
X = df_input # Contains all columns.
|
||||||
y = df_input[
|
y = df_input[[target_colname]] # Dataframe of just the column on which to stratify.
|
||||||
[stratify_colname]
|
z = df_input[[stratify_colname]]
|
||||||
] # Dataframe of just the column on which to stratify.
|
|
||||||
|
|
||||||
# Split original dataframe into train and temp dataframes.
|
# Split original dataframe into train and temp dataframes.
|
||||||
df_train, df_temp, y_train, y_temp = train_test_split(
|
df_train, df_temp, y_train, y_temp = train_test_split(
|
||||||
X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
|
X, y, stratify=z, test_size=(1.0 - frac_train), random_state=random_state
|
||||||
)
|
)
|
||||||
|
|
||||||
if frac_val <= 0:
|
if frac_val <= 0:
|
||||||
@ -70,7 +73,7 @@ def split_stratified_into_train_val_test(
|
|||||||
df_val, df_test, y_val, y_test = train_test_split(
|
df_val, df_test, y_val, y_test = train_test_split(
|
||||||
df_temp,
|
df_temp,
|
||||||
y_temp,
|
y_temp,
|
||||||
stratify=y_temp,
|
stratify=df_temp[[stratify_colname]],
|
||||||
test_size=relative_frac_test,
|
test_size=relative_frac_test,
|
||||||
random_state=random_state,
|
random_state=random_state,
|
||||||
)
|
)
|
||||||
|
100
utils_clusters.py
Normal file
100
utils_clusters.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
import math
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from pandas import DataFrame
|
||||||
|
from sklearn import cluster
|
||||||
|
from sklearn.metrics import silhouette_samples, silhouette_score
|
||||||
|
|
||||||
|
|
||||||
|
def run_agglomerative(
    df: DataFrame, num_clusters: int | None = 2
) -> cluster.AgglomerativeClustering:
    """Fit an agglomerative clustering model on ``df`` and return it.

    Args:
        df: Data passed unchanged to ``AgglomerativeClustering.fit``.
        num_clusters: Forwarded as ``n_clusters`` (defaults to 2).

    Returns:
        The fitted estimator. It is built with ``compute_distances=True``;
        ``get_linkage_matrix`` in this module reads ``distances_`` from it.
    """
    model = cluster.AgglomerativeClustering(
        compute_distances=True,
        n_clusters=num_clusters,
    )
    # ``fit`` returns the estimator itself, so returning ``model`` is equivalent.
    model.fit(df)
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
    """Build a float matrix ``[children, distances, counts]`` from a fitted model.

    For every merge row in ``model.children_`` the number of original samples
    under that merge is counted: a child index below ``len(model.labels_)``
    counts as one sample, any other index refers to an earlier merge whose
    count is reused.

    Returns:
        ``np.column_stack([children_, distances_, counts])`` cast to float.
    """
    merges = model.children_  # type: ignore
    n_leaves = len(model.labels_)
    subtree_sizes = np.zeros(merges.shape[0])

    for row, pair in enumerate(merges):
        # Earlier rows are already filled in, so referenced merges are known.
        subtree_sizes[row] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    stacked = np.column_stack([merges, model.distances_, subtree_sizes])
    return stacked.astype(float)
|
||||||
|
|
||||||
|
|
||||||
|
def print_cluster_result(
    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
):
    """Print, for each cluster id in ``range(clusters_num)``, its member index labels.

    For every cluster a header ``Cluster <id + 1> (<size>):`` is printed,
    followed by the ``df.index`` values of the rows whose label equals the
    cluster id joined with ``separator``, an empty line and a dashed rule.
    """
    for number in range(1, clusters_num + 1):
        positions = np.nonzero(labels == number - 1)[0]
        members = separator.join(str(df.index[pos]) for pos in positions)
        print(f"Cluster {number} ({len(positions)}):")
        print(members)
        print("")
        print("--------")
|
||||||
|
|
||||||
|
|
||||||
|
def run_kmeans(
    df: DataFrame, num_clusters: int, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster ``df`` with KMeans and return ``(labels, cluster_centers)``.

    Args:
        df: Data to cluster.
        num_clusters: Forwarded as ``n_clusters``.
        random_state: Forwarded to ``KMeans`` as ``random_state``.
    """
    model = cluster.KMeans(random_state=random_state, n_clusters=num_clusters)
    # ``fit_predict`` yields the training labels, i.e. ``fit(df).labels_``.
    assignments = model.fit(df).labels_
    centers = model.cluster_centers_
    return assignments, centers
|
||||||
|
|
||||||
|
|
||||||
|
def fit_kmeans(
    reduced_data: np.ndarray, num_clusters: int, random_state: int
) -> cluster.KMeans:
    """Return a KMeans estimator fitted on ``reduced_data``.

    ``num_clusters`` and ``random_state`` are forwarded to the ``KMeans``
    constructor as ``n_clusters`` and ``random_state``.
    """
    # ``fit`` returns the estimator itself, so the chain yields the model.
    return cluster.KMeans(
        random_state=random_state,
        n_clusters=num_clusters,
    ).fit(reduced_data)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_kmeans_range(
    df: DataFrame | np.ndarray, random_state: int
) -> Tuple[List, range]:
    """Fit one KMeans model per ``k`` in ``2 .. int(sqrt(len(df)))``.

    Returns:
        ``(models, ks)`` where ``ks`` is the range of cluster counts tried and
        ``models`` holds the fitted estimator for each ``k`` in the same order.
    """
    upper_k = int(math.sqrt(len(df)))
    candidate_ks = range(2, upper_k + 1)

    fitted_models = []
    for k in candidate_ks:
        estimator = cluster.KMeans(n_clusters=k, random_state=random_state)
        fitted_models.append(estimator.fit(df))

    return fitted_models, candidate_ks
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
    """Return ``(inertias, ks)`` for the models built by ``_get_kmeans_range``.

    ``inertias[i]`` is the ``inertia_`` attribute of the model fitted for
    ``ks[i]``.
    """
    models, ks = _get_kmeans_range(df, random_state)
    inertias = []
    for fitted in models:
        inertias.append(fitted.inertia_)
    return inertias, ks
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusters_silhouette_scores(
    df: DataFrame, random_state: int
) -> Tuple[List, range]:
    """Return ``(scores, ks)`` with one silhouette score per fitted model.

    The models come from ``_get_kmeans_range``; each score is
    ``silhouette_score(df, model.labels_)`` converted to a plain ``float``.
    """
    models, ks = _get_kmeans_range(df, random_state)
    scores = []
    for fitted in models:
        score = silhouette_score(df, fitted.labels_)
        scores.append(float(score))
    return scores, ks
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
    """Collect silhouette data for every model built by ``_get_kmeans_range``.

    Returns:
        A dict keyed by ``model.n_clusters`` whose values are tuples
        ``(silhouette_score, silhouette_samples, model)`` computed on ``df``
        with the model's ``labels_``.
    """
    models, _ = _get_kmeans_range(df, random_state)
    result: Dict = {}
    for fitted in models:
        assignments = fitted.labels_
        overall = silhouette_score(df, assignments)
        per_sample = silhouette_samples(df, assignments)
        result[fitted.n_clusters] = (overall, per_sample, fitted)
    return result
|
242
visual.py
Normal file
242
visual.py
Normal file
@ -0,0 +1,242 @@
|
|||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import matplotlib.cm as cm
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
from pandas import DataFrame
|
||||||
|
from scipy.cluster import hierarchy
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
|
||||||
|
|
||||||
|
def draw_data_2d(
|
||||||
|
df: DataFrame,
|
||||||
|
col1: int,
|
||||||
|
col2: int,
|
||||||
|
y: List | None = None,
|
||||||
|
classes: List | None = None,
|
||||||
|
subplot: Any | None = None,
|
||||||
|
):
|
||||||
|
ax = None
|
||||||
|
if subplot is None:
|
||||||
|
_, ax = plt.subplots()
|
||||||
|
else:
|
||||||
|
ax = subplot
|
||||||
|
scatter = ax.scatter(df[df.columns[col1]], df[df.columns[col2]], c=y)
|
||||||
|
ax.set(xlabel=df.columns[col1], ylabel=df.columns[col2])
|
||||||
|
if classes is not None:
|
||||||
|
ax.legend(
|
||||||
|
scatter.legend_elements()[0], classes, loc="lower right", title="Classes"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def draw_dendrogram(linkage_matrix: np.ndarray):
    """Draw a dendrogram for ``linkage_matrix``, truncated to level 3.

    Delegates to ``scipy.cluster.hierarchy.dendrogram`` with
    ``truncate_mode="level"`` and ``p=3``.
    """
    truncation = {"truncate_mode": "level", "p": 3}
    hierarchy.dendrogram(linkage_matrix, **truncation)
|
||||||
|
|
||||||
|
|
||||||
|
def draw_cluster_results(
|
||||||
|
df: DataFrame,
|
||||||
|
col1: int,
|
||||||
|
col2: int,
|
||||||
|
labels: np.ndarray,
|
||||||
|
cluster_centers: np.ndarray,
|
||||||
|
subplot: Any | None = None,
|
||||||
|
):
|
||||||
|
ax = None
|
||||||
|
if subplot is None:
|
||||||
|
ax = plt
|
||||||
|
else:
|
||||||
|
ax = subplot
|
||||||
|
|
||||||
|
centroids = cluster_centers
|
||||||
|
u_labels = np.unique(labels)
|
||||||
|
|
||||||
|
for i in u_labels:
|
||||||
|
ax.scatter(
|
||||||
|
df[labels == i][df.columns[col1]],
|
||||||
|
df[labels == i][df.columns[col2]],
|
||||||
|
label=i,
|
||||||
|
)
|
||||||
|
|
||||||
|
ax.scatter(centroids[:, col1], centroids[:, col2], s=80, color="k")
|
||||||
|
|
||||||
|
|
||||||
|
def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Plot KMeans decision regions, the data points and the centroids.

    The first two columns of ``reduced_data`` span the plot. A mesh with
    step 0.02 covering the data range padded by 1 on each side is classified
    with ``kmeans.predict`` and shown as a coloured image on figure 1; data
    points are drawn as small black dots and centroids as white crosses.
    """
    step = 0.02
    first = reduced_data[:, 0]
    second = reduced_data[:, 1]

    x_min, x_max = first.min() - 1, first.max() + 1
    y_min, y_max = second.min() - 1, second.max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, step),
        np.arange(y_min, y_max, step),
    )

    # Predict a cluster for every mesh node, then restore the grid shape.
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    regions = kmeans.predict(mesh_points).reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(
        regions,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )
    plt.plot(first, second, "k.", markersize=2)

    centers = kmeans.cluster_centers_
    plt.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )

    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    # Empty tuples hide the tick marks on both axes.
    plt.xticks(())
    plt.yticks(())
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Plot ``data`` against ``clusters_range`` on a new 8x5 figure.

    Args:
        data: Score per cluster count, aligned with ``clusters_range``.
        clusters_range: Cluster counts used as x values.
        score_name: Label for the y-axis.
        title: Figure title.
    """
    label_size = 8
    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")
    for set_label, text in ((plt.xlabel, "$k$"), (plt.ylabel, score_name)):
        set_label(text, fontsize=label_size)
    plt.title(title)
|
||||||
|
|
||||||
|
|
||||||
|
def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia per cluster count as "The Elbow Diagram"."""
    _draw_cluster_scores(
        data=inertias,
        clusters_range=clusters_range,
        score_name="Inertia",
        title="The Elbow Diagram",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot the silhouette score per cluster count."""
    _draw_cluster_scores(
        data=silhouette,
        clusters_range=clusters_range,
        score_name="Silhouette score",
        title="The Silhouette score",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: List,
    cluster_labels: List,
):
    """Draw the per-cluster silhouette bands on ``ax``.

    Each cluster ``0 .. n_clusters - 1`` gets a horizontal band made of its
    sorted per-sample silhouette values; bands are stacked with a gap of 10
    between them, and a dashed red vertical line marks ``silhouette_avg``.

    NOTE(review): ``sample_silhouette_values`` is indexed with the mask
    ``cluster_labels == i``, so both are presumably numpy arrays despite the
    ``List`` annotations -- confirm against callers.
    """
    gap = 10
    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * gap])

    band_start = gap
    for idx in range(n_clusters):
        # Mask indexing yields a fresh array; sort ascending for the band shape.
        band_values = np.sort(sample_silhouette_values[cluster_labels == idx])
        band_size = band_values.shape[0]
        band_end = band_start + band_size

        color = cm.nipy_spectral(float(idx) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(band_start, band_end),
            0,
            band_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number, vertically centred on its band.
        ax.text(-0.05, band_start + 0.5 * band_size, str(idx))

        # Leave a blank strip before the next band.
        band_start = band_end + gap

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the clustered points and numbered cluster centres on ``ax``.

    Points use the first two columns of ``reduced_data`` and are coloured by
    ``cluster_labels / n_clusters`` through the ``nipy_spectral`` colormap.
    Each centre is drawn as a white circle with its index rendered on top.
    """
    point_colors = cm.nipy_spectral(  # type: ignore
        cluster_labels.astype(float) / n_clusters
    )
    xs, ys = reduced_data[:, 0], reduced_data[:, 1]
    ax.scatter(
        xs,
        ys,
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=point_colors,
        edgecolor="k",
    )

    # White discs first, then the centre's index as a text marker on top.
    ax.scatter(
        cluster_centers[:, 0],
        cluster_centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for number, center in enumerate(cluster_centers):
        ax.scatter(
            center[0],
            center[1],
            marker="$%d$" % number,
            alpha=1,
            s=50,
            edgecolor="k",
        )

    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
|
||||||
|
|
||||||
|
|
||||||
|
def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Draw one two-panel silhouette figure per entry of ``silhouettes``.

    Args:
        reduced_data: Data passed through to both panel helpers.
        silhouettes: Mapping from cluster count to a sequence whose items 0, 1
            and 2 are the average silhouette score, the per-sample silhouette
            values and a fitted model exposing ``labels_`` and
            ``cluster_centers_``.

    For every entry an 18x7 inch figure is created: the left panel is drawn by
    ``_draw_silhouette`` and the right panel by ``_draw_cluster_data``.
    """
    for n_clusters, stats in silhouettes.items():
        silhouette_avg, sample_values, model = stats[0], stats[1], stats[2]
        cluster_labels = model.labels_
        cluster_centers = model.cluster_centers_

        fig, (left_ax, right_ax) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        _draw_silhouette(
            left_ax,
            reduced_data,
            n_clusters,
            silhouette_avg,
            sample_values,
            cluster_labels,
        )
        _draw_cluster_data(
            right_ax,
            reduced_data,
            n_clusters,
            cluster_labels,
            cluster_centers,
        )

        plt.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters,
            fontsize=14,
            fontweight="bold",
        )
|
Loading…
Reference in New Issue
Block a user