This commit is contained in:
antoc0der 2024-11-15 23:06:57 +04:00
parent 7aff499269
commit 9eba71f6ee
10 changed files with 13585 additions and 1712 deletions

4081
data/fish_data.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 186 KiB

1003
lec3.ipynb

File diff suppressed because one or more lines are too long

2310
lec4.ipynb

File diff suppressed because one or more lines are too long

2490
lec4_1.ipynb Normal file

File diff suppressed because one or more lines are too long

3719
lec4_reg.ipynb Normal file

File diff suppressed because it is too large Load Diff

1339
lec5.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -7,6 +7,7 @@ from sklearn.model_selection import train_test_split
def split_stratified_into_train_val_test( def split_stratified_into_train_val_test(
df_input, df_input,
target_colname="z",
stratify_colname="y", stratify_colname="y",
frac_train=0.6, frac_train=0.6,
frac_val=0.15, frac_val=0.15,
@ -51,14 +52,16 @@ def split_stratified_into_train_val_test(
if stratify_colname not in df_input.columns: if stratify_colname not in df_input.columns:
raise ValueError("%s is not a column in the dataframe" % (stratify_colname)) raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
if target_colname not in df_input.columns:
raise ValueError("%s is not a column in the dataframe" % (target_colname))
X = df_input # Contains all columns. X = df_input # Contains all columns.
y = df_input[ y = df_input[[target_colname]] # Dataframe of just the column on which to stratify.
[stratify_colname] z = df_input[[stratify_colname]]
] # Dataframe of just the column on which to stratify.
# Split original dataframe into train and temp dataframes. # Split original dataframe into train and temp dataframes.
df_train, df_temp, y_train, y_temp = train_test_split( df_train, df_temp, y_train, y_temp = train_test_split(
X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state X, y, stratify=z, test_size=(1.0 - frac_train), random_state=random_state
) )
if frac_val <= 0: if frac_val <= 0:
@ -70,7 +73,7 @@ def split_stratified_into_train_val_test(
df_val, df_test, y_val, y_test = train_test_split( df_val, df_test, y_val, y_test = train_test_split(
df_temp, df_temp,
y_temp, y_temp,
stratify=y_temp, stratify=df_temp[[stratify_colname]],
test_size=relative_frac_test, test_size=relative_frac_test,
random_state=random_state, random_state=random_state,
) )

100
utils_clusters.py Normal file
View File

@ -0,0 +1,100 @@
import math
from typing import Dict, List, Tuple
import numpy as np
from pandas import DataFrame
from sklearn import cluster
from sklearn.metrics import silhouette_samples, silhouette_score
def run_agglomerative(
    df: DataFrame, num_clusters: int | None = 2
) -> cluster.AgglomerativeClustering:
    """Fit agglomerative (hierarchical) clustering on *df* and return the model.

    ``compute_distances=True`` is set so a linkage matrix can later be built
    from ``distances_`` (see ``get_linkage_matrix``).
    """
    model = cluster.AgglomerativeClustering(
        n_clusters=num_clusters,
        compute_distances=True,
    )
    model.fit(df)
    return model
def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
counts = np.zeros(model.children_.shape[0]) # type: ignore
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_): # type: ignore
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
return np.column_stack([model.children_, model.distances_, counts]).astype(float)
def print_cluster_result(
    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
):
    """Print the members of every cluster (index labels of *df*) to stdout."""
    for cluster_id in range(clusters_num):
        member_positions = np.where(labels == cluster_id)[0]
        member_names = separator.join(str(df.index[pos]) for pos in member_positions)
        print(f"Cluster {cluster_id + 1} ({len(member_positions)}):")
        print(member_names)
        print("")
        print("--------")
def run_kmeans(
    df: DataFrame, num_clusters: int, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster *df* with k-means and return ``(labels, cluster_centers)``."""
    model = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
    model.fit(df)
    return model.labels_, model.cluster_centers_
def fit_kmeans(
    reduced_data: np.ndarray, num_clusters: int, random_state: int
) -> cluster.KMeans:
    """Return a KMeans model fitted on *reduced_data*."""
    return cluster.KMeans(
        n_clusters=num_clusters, random_state=random_state
    ).fit(reduced_data)
def _get_kmeans_range(
    df: DataFrame | np.ndarray, random_state: int
) -> Tuple[List, range]:
    """Fit KMeans for every k in ``[2, sqrt(n_rows)]``; return (models, k range)."""
    upper = int(math.sqrt(len(df)))
    ks = range(2, upper + 1)
    models = []
    for k in ks:
        models.append(
            cluster.KMeans(n_clusters=k, random_state=random_state).fit(df)
        )
    return models, ks
def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
    """Return k-means inertia per candidate cluster count (elbow-method data)."""
    models, ks = _get_kmeans_range(df, random_state)
    inertias = [m.inertia_ for m in models]
    return inertias, ks
def get_clusters_silhouette_scores(
    df: DataFrame, random_state: int
) -> Tuple[List, range]:
    """Return the mean silhouette score per candidate cluster count."""
    models, ks = _get_kmeans_range(df, random_state)
    scores = [float(silhouette_score(df, m.labels_)) for m in models]
    return scores, ks
def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
    """Map each candidate k to ``(avg silhouette, per-sample silhouettes, model)``.

    The dict feeds ``visual.draw_silhouettes``.
    """
    models, _ = _get_kmeans_range(df, random_state)
    result: Dict = {}
    for model in models:
        avg = silhouette_score(df, model.labels_)
        per_sample = silhouette_samples(df, model.labels_)
        result[model.n_clusters] = (avg, per_sample, model)
    return result

242
visual.py Normal file
View File

@ -0,0 +1,242 @@
from typing import Any, Dict, List
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans
def draw_data_2d(
    df: DataFrame,
    col1: int,
    col2: int,
    y: List | None = None,
    classes: List | None = None,
    subplot: Any | None = None,
):
    """Scatter-plot two columns of *df* (selected by position), colored by *y*.

    Draws on *subplot* when given, otherwise on a fresh axes; a legend is
    added only when *classes* is provided.
    """
    if subplot is None:
        _, ax = plt.subplots()
    else:
        ax = subplot
    x_name, y_name = df.columns[col1], df.columns[col2]
    scatter = ax.scatter(df[x_name], df[y_name], c=y)
    ax.set(xlabel=x_name, ylabel=y_name)
    if classes is not None:
        handles = scatter.legend_elements()[0]
        ax.legend(handles, classes, loc="lower right", title="Classes")
def draw_dendrogram(linkage_matrix: np.ndarray):
    """Plot a dendrogram truncated to the top three merge levels."""
    hierarchy.dendrogram(linkage_matrix, p=3, truncate_mode="level")
def draw_cluster_results(
    df: DataFrame,
    col1: int,
    col2: int,
    labels: np.ndarray,
    cluster_centers: np.ndarray,
    subplot: Any | None = None,
):
    """Scatter each cluster in its own color and mark the centroids in black.

    Plots the *col1*/*col2* columns (by position); draws on *subplot* when
    given, otherwise on the current pyplot figure.
    """
    ax = plt if subplot is None else subplot
    x_name, y_name = df.columns[col1], df.columns[col2]
    for label in np.unique(labels):
        members = df[labels == label]
        ax.scatter(members[x_name], members[y_name], label=label)
    ax.scatter(cluster_centers[:, col1], cluster_centers[:, col2], s=80, color="k")
def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Draw the k-means decision regions over 2-D data, plus points and centroids.

    Predicts the cluster of every node of a fine mesh covering the data, shows
    the result as a colored background image, then overlays the samples as
    black dots and the centroids as white crosses.
    """
    step = 0.02  # mesh resolution
    x_min = reduced_data[:, 0].min() - 1
    x_max = reduced_data[:, 0].max() + 1
    y_min = reduced_data[:, 1].min() - 1
    y_max = reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))
    mesh_labels = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(
        mesh_labels,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Line-plot a per-k clustering score (shared by elbow/silhouette diagrams)."""
    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")
    plt.title(title)
    plt.xlabel("$k$", fontsize=8)
    plt.ylabel(score_name, fontsize=8)
def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia against k (elbow method)."""
    _draw_cluster_scores(
        inertias, clusters_range, "Inertia", "The Elbow Diagram"
    )
def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot the mean silhouette score against k."""
    _draw_cluster_scores(silhouette, clusters_range, "Silhouette score", "The Silhouette score")
def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: List,
    cluster_labels: List,
):
    """Render the per-cluster silhouette profile on *ax*.

    Each cluster gets a horizontal band of its sorted per-sample silhouette
    values; a dashed red line marks the dataset-wide average.
    """
    ax.set_xlim([-0.1, 1])
    # Reserve a 10-unit vertical gap between consecutive cluster bands.
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])
    y_lower = 10
    for cluster_id in range(n_clusters):
        values = sample_silhouette_values[cluster_labels == cluster_id]
        values.sort()
        band_height = values.shape[0]
        y_upper = y_lower + band_height
        color = cm.nipy_spectral(float(cluster_id) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        ax.text(-0.05, y_lower + 0.5 * band_height, str(cluster_id))
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the 2-D samples colored by cluster and number each centroid."""
    point_colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)  # type: ignore
    ax.scatter(
        reduced_data[:, 0],
        reduced_data[:, 1],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=point_colors,
        edgecolor="k",
    )
    # White discs as a background for the centroid numbers.
    ax.scatter(
        cluster_centers[:, 0],
        cluster_centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for idx, center in enumerate(cluster_centers):
        ax.scatter(
            center[0], center[1], marker="$%d$" % idx, alpha=1, s=50, edgecolor="k"
        )
    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Draw a silhouette plot next to the clustered data for each entry.

    *silhouettes* maps a cluster count to a tuple of
    ``(avg silhouette, per-sample silhouettes, fitted KMeans model)``,
    as produced by ``get_clusters_silhouettes``.
    """
    for n_clusters, (avg_score, sample_values, model) in silhouettes.items():
        fig, (left_ax, right_ax) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        _draw_silhouette(
            left_ax,
            reduced_data,
            n_clusters,
            avg_score,
            sample_values,
            model.labels_,
        )
        _draw_cluster_data(
            right_ax,
            reduced_data,
            n_clusters,
            model.labels_,
            model.cluster_centers_,
        )
        plt.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters,
            fontsize=14,
            fontweight="bold",
        )