lab5-6
This commit is contained in:
parent
338e0b0ad8
commit
f08c12ac81
File diff suppressed because it is too large
Load Diff
90837
data/neo.csv
Normal file
90837
data/neo.csv
Normal file
File diff suppressed because it is too large
Load Diff
1818
notebooks/lab4.ipynb
1818
notebooks/lab4.ipynb
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
1195
notebooks/lab5_1.ipynb
Normal file
1195
notebooks/lab5_1.ipynb
Normal file
File diff suppressed because one or more lines are too long
13326
notebooks/lab6_1.ipynb
Normal file
13326
notebooks/lab6_1.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
100
notebooks/utils_clusters.py
Normal file
100
notebooks/utils_clusters.py
Normal file
@ -0,0 +1,100 @@
|
||||
import math
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
from pandas import DataFrame
|
||||
from sklearn import cluster
|
||||
from sklearn.metrics import silhouette_samples, silhouette_score
|
||||
|
||||
|
||||
def run_agglomerative(
    df: DataFrame, num_clusters: int | None = 2
) -> cluster.AgglomerativeClustering:
    """Fit an agglomerative clustering model on ``df``.

    Args:
        df: Data passed unchanged to ``AgglomerativeClustering.fit``.
        num_clusters: Number of clusters to find. ``None`` asks for the
            full merge tree instead of a fixed number of clusters.

    Returns:
        The fitted model, created with ``compute_distances=True``.
    """
    # scikit-learn requires exactly one of ``n_clusters`` and
    # ``distance_threshold`` to be set. Without a threshold,
    # ``num_clusters=None`` (which the signature allows) would always
    # raise; a threshold of 0 makes the estimator build the whole tree.
    distance_threshold = 0 if num_clusters is None else None
    agglomerative = cluster.AgglomerativeClustering(
        n_clusters=num_clusters,
        distance_threshold=distance_threshold,
        compute_distances=True,
    )
    return agglomerative.fit(df)
|
||||
|
||||
|
||||
def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
    """Build a linkage matrix from a fitted agglomerative model.

    Each output row holds the two merged node ids from ``model.children_``,
    the matching entry of ``model.distances_`` and the number of original
    samples under the merged node.

    Args:
        model: Fitted model exposing ``children_``, ``distances_`` and
            ``labels_``.

    Returns:
        Float array with one row per merge and four columns.
    """
    merges = model.children_  # type: ignore
    leaf_total = len(model.labels_)
    counts = np.zeros(merges.shape[0])

    for row, pair in enumerate(merges):
        # Ids below ``leaf_total`` are single samples; larger ids refer to
        # an earlier merge whose sample count is already stored.
        counts[row] = sum(
            1 if node < leaf_total else counts[node - leaf_total]
            for node in pair
        )

    columns = [merges, model.distances_, counts]
    return np.column_stack(columns).astype(float)
|
||||
|
||||
|
||||
def print_cluster_result(
    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
):
    """Print the index values of ``df`` grouped by cluster label.

    For every cluster id in ``range(clusters_num)`` a header with the
    1-based cluster number and its size is printed, followed by the joined
    index values of its members, an empty line and a dashed divider.

    Args:
        df: Frame whose index supplies the printed member names.
        clusters_num: Number of cluster ids to report, starting from 0.
        labels: Cluster label per row position of ``df``.
        separator: Text placed between member names.
    """
    for cluster_id in range(clusters_num):
        cluster_indices = np.nonzero(labels == cluster_id)[0]
        members = separator.join(str(df.index[pos]) for pos in cluster_indices)

        print(f"Cluster {cluster_id + 1} ({len(cluster_indices)}):")
        print(members)
        print("")
        print("--------")
|
||||
|
||||
|
||||
def run_kmeans(
    df: DataFrame, num_clusters: int, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster ``df`` with KMeans and return labels and centers.

    Args:
        df: Data to cluster.
        num_clusters: Value passed as ``n_clusters``.
        random_state: Seed passed to the estimator.

    Returns:
        A pair ``(labels, cluster_centers)`` taken from ``fit_predict`` and
        the estimator's ``cluster_centers_`` attribute.
    """
    model = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
    # fit_predict runs first, so the centers exist when the tuple is built.
    return model.fit_predict(df), model.cluster_centers_
|
||||
|
||||
|
||||
def fit_kmeans(
    reduced_data: np.ndarray, num_clusters: int, random_state: int
) -> cluster.KMeans:
    """Create a KMeans estimator, fit it on ``reduced_data`` and return it.

    Args:
        reduced_data: Data to fit on.
        num_clusters: Value passed as ``n_clusters``.
        random_state: Seed passed to the estimator.

    Returns:
        The fitted estimator.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return cluster.KMeans(
        n_clusters=num_clusters,
        random_state=random_state,
    ).fit(reduced_data)
|
||||
|
||||
|
||||
def _get_kmeans_range(
    df: DataFrame | np.ndarray, random_state: int
) -> Tuple[List, range]:
    """Fit one KMeans model per candidate cluster count.

    Candidate counts run from 2 up to and including the truncated square
    root of ``len(df)``.

    Args:
        df: Data every model is fitted on.
        random_state: Seed passed to each estimator.

    Returns:
        A pair ``(models, clusters_range)`` where ``models[i]`` was fitted
        with ``n_clusters == clusters_range[i]``.
    """
    upper_k = int(math.sqrt(len(df)))
    clusters_range = range(2, upper_k + 1)

    fitted_models = []
    for k in clusters_range:
        estimator = cluster.KMeans(n_clusters=k, random_state=random_state)
        fitted_models.append(estimator.fit(df))

    return fitted_models, clusters_range
|
||||
|
||||
|
||||
def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
    """Return the ``inertia_`` of each model fitted by ``_get_kmeans_range``.

    Args:
        df: Data the models are fitted on.
        random_state: Seed forwarded to ``_get_kmeans_range``.

    Returns:
        A pair ``(inertias, clusters_range)`` in matching order.
    """
    models, clusters_range = _get_kmeans_range(df, random_state)
    inertias = [fitted.inertia_ for fitted in models]
    return inertias, clusters_range
|
||||
|
||||
|
||||
def get_clusters_silhouette_scores(
    df: DataFrame, random_state: int
) -> Tuple[List, range]:
    """Return the silhouette score of each model from ``_get_kmeans_range``.

    Args:
        df: Data the models are fitted on and scored against.
        random_state: Seed forwarded to ``_get_kmeans_range``.

    Returns:
        A pair ``(scores, clusters_range)`` in matching order; every score
        is converted to a plain ``float``.
    """
    models, clusters_range = _get_kmeans_range(df, random_state)
    scores = [float(silhouette_score(df, fitted.labels_)) for fitted in models]
    return scores, clusters_range
|
||||
|
||||
|
||||
def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
    """Collect silhouette data for each model from ``_get_kmeans_range``.

    Args:
        df: Data the models are fitted on and scored against.
        random_state: Seed forwarded to ``_get_kmeans_range``.

    Returns:
        Mapping from a model's ``n_clusters`` to the tuple
        ``(silhouette_score, silhouette_samples, model)``.
    """
    models, _ = _get_kmeans_range(df, random_state)
    return {
        fitted.n_clusters: (
            silhouette_score(df, fitted.labels_),
            silhouette_samples(df, fitted.labels_),
            fitted,
        )
        for fitted in models
    }
|
242
notebooks/visual.py
Normal file
242
notebooks/visual.py
Normal file
@ -0,0 +1,242 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import matplotlib.cm as cm
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from pandas import DataFrame
|
||||
from scipy.cluster import hierarchy
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
|
||||
def draw_data_2d(
|
||||
df: DataFrame,
|
||||
col1: int,
|
||||
col2: int,
|
||||
y: List | None = None,
|
||||
classes: List | None = None,
|
||||
subplot: Any | None = None,
|
||||
):
|
||||
ax = None
|
||||
if subplot is None:
|
||||
_, ax = plt.subplots()
|
||||
else:
|
||||
ax = subplot
|
||||
scatter = ax.scatter(df[df.columns[col1]], df[df.columns[col2]], c=y)
|
||||
ax.set(xlabel=df.columns[col1], ylabel=df.columns[col2])
|
||||
if classes is not None:
|
||||
ax.legend(
|
||||
scatter.legend_elements()[0], classes, loc="lower right", title="Classes"
|
||||
)
|
||||
|
||||
|
||||
def draw_dendrogram(
    linkage_matrix: np.ndarray,
    truncate_mode: str | None = "level",
    p: int = 3,
):
    """Draw a dendrogram for ``linkage_matrix``.

    Args:
        linkage_matrix: Linkage matrix passed to
            ``scipy.cluster.hierarchy.dendrogram``.
        truncate_mode: Truncation mode forwarded to ``dendrogram``. The
            default ``"level"`` keeps the previous hard-coded behavior.
        p: The ``p`` argument forwarded to ``dendrogram``; the default 3
            matches the previous hard-coded value.
    """
    hierarchy.dendrogram(linkage_matrix, truncate_mode=truncate_mode, p=p)
|
||||
|
||||
|
||||
def draw_cluster_results(
|
||||
df: DataFrame,
|
||||
col1: int,
|
||||
col2: int,
|
||||
labels: np.ndarray,
|
||||
cluster_centers: np.ndarray,
|
||||
subplot: Any | None = None,
|
||||
):
|
||||
ax = None
|
||||
if subplot is None:
|
||||
ax = plt
|
||||
else:
|
||||
ax = subplot
|
||||
|
||||
centroids = cluster_centers
|
||||
u_labels = np.unique(labels)
|
||||
|
||||
for i in u_labels:
|
||||
ax.scatter(
|
||||
df[labels == i][df.columns[col1]],
|
||||
df[labels == i][df.columns[col2]],
|
||||
label=i,
|
||||
)
|
||||
|
||||
ax.scatter(centroids[:, col1], centroids[:, col2], s=80, color="k")
|
||||
|
||||
|
||||
def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Draw KMeans regions, the data points and the centroids.

    The model's predictions are evaluated on a mesh covering the first two
    columns of ``reduced_data`` (padded by 1 on every side) and shown as an
    image, with the points and centroids drawn on top.

    Args:
        reduced_data: Array whose columns 0 and 1 are the plotted
            coordinates.
        kmeans: Fitted model providing ``predict`` and
            ``cluster_centers_``.
    """
    step = 0.02  # mesh resolution

    xs, ys = reduced_data[:, 0], reduced_data[:, 1]
    x_min, x_max = xs.min() - 1, xs.max() + 1
    y_min, y_max = ys.min() - 1, ys.max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, step),
        np.arange(y_min, y_max, step),
    )

    # Predict a label for every mesh point, then restore the mesh shape.
    mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
    regions = kmeans.predict(mesh_points).reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(
        regions,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )

    plt.plot(xs, ys, "k.", markersize=2)

    centers = kmeans.cluster_centers_
    plt.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
|
||||
|
||||
|
||||
def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Plot one score value per cluster count on a new 8x5 figure.

    Args:
        data: Score values drawn on the y axis.
        clusters_range: Cluster counts drawn on the x axis.
        score_name: Label of the y axis.
        title: Figure title.
    """
    label_font = {"fontsize": 8}

    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")
    plt.xlabel("$k$", **label_font)
    plt.ylabel(score_name, **label_font)
    plt.title(title)
|
||||
|
||||
|
||||
def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia against the cluster count via ``_draw_cluster_scores``.

    Args:
        inertias: Inertia value for each entry of ``clusters_range``.
        clusters_range: Cluster counts the inertias belong to.
    """
    _draw_cluster_scores(
        data=inertias,
        clusters_range=clusters_range,
        score_name="Inertia",
        title="The Elbow Diagram",
    )
|
||||
|
||||
|
||||
def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot silhouette scores against the cluster count.

    Delegates the drawing to ``_draw_cluster_scores``.

    Args:
        silhouette: Silhouette score for each entry of ``clusters_range``.
        clusters_range: Cluster counts the scores belong to.
    """
    _draw_cluster_scores(
        data=silhouette,
        clusters_range=clusters_range,
        score_name="Silhouette score",
        title="The Silhouette score",
    )
|
||||
|
||||
|
||||
def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: List | np.ndarray,
    cluster_labels: List | np.ndarray,
):
    """Draw a per-cluster silhouette plot on ``ax``.

    For each cluster id in ``range(n_clusters)`` the sorted silhouette
    values of its samples are drawn as a filled horizontal band, with the
    bands stacked vertically and separated by a gap of 10. A dashed red
    vertical line marks ``silhouette_avg``.

    Args:
        ax: Axes to draw on.
        reduced_data: Clustered data; only its length is used, to size the
            y axis.
        n_clusters: Number of cluster ids to draw.
        silhouette_avg: Position of the vertical average line.
        sample_silhouette_values: Silhouette value per sample.
        cluster_labels: Cluster label per sample.
    """
    # The signature advertises lists, but boolean masking and ``.shape``
    # need arrays; coerce once so both lists and arrays are accepted.
    sample_silhouette_values = np.asarray(sample_silhouette_values)
    cluster_labels = np.asarray(cluster_labels)

    ax.set_xlim([-0.1, 1])
    # Leave room for a gap of 10 around every cluster band.
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # np.sort returns a sorted copy, so the caller's data is untouched.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i]
        )

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Put the cluster id at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next band: previous end plus the gap of 10.
        y_lower = y_upper + 10

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
|
||||
|
||||
|
||||
def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the clustered points and numbered cluster centers on ``ax``.

    Args:
        ax: Axes to draw on.
        reduced_data: Array whose columns 0 and 1 are the plotted
            coordinates.
        n_clusters: Divisor used to map labels onto the colormap.
        cluster_labels: Cluster label per row of ``reduced_data``.
        cluster_centers: Array of centers; coordinates 0 and 1 are plotted.
    """
    point_colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)  # type: ignore
    ax.scatter(
        reduced_data[:, 0],
        reduced_data[:, 1],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=point_colors,
        edgecolor="k",
    )

    # White discs first, then each center's index drawn on top as a marker.
    center_x, center_y = cluster_centers[:, 0], cluster_centers[:, 1]
    ax.scatter(
        center_x,
        center_y,
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for index, center in enumerate(cluster_centers):
        ax.scatter(
            center[0],
            center[1],
            marker="$%d$" % index,
            alpha=1,
            s=50,
            edgecolor="k",
        )

    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
|
||||
|
||||
|
||||
def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Draw one two-panel silhouette figure per entry of ``silhouettes``.

    The left panel is drawn by ``_draw_silhouette`` and the right panel by
    ``_draw_cluster_data``.

    Args:
        reduced_data: Data forwarded to both panel helpers.
        silhouettes: Mapping from cluster count to a sequence whose items
            0, 1 and 2 are the average silhouette value, the per-sample
            values and a model exposing ``labels_`` and
            ``cluster_centers_``.
    """
    for n_clusters, entry in silhouettes.items():
        silhouette_avg = entry[0]
        sample_silhouette_values = entry[1]
        model = entry[2]

        fig, (left_ax, right_ax) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        _draw_silhouette(
            left_ax,
            reduced_data,
            n_clusters,
            silhouette_avg,
            sample_silhouette_values,
            model.labels_,
        )
        _draw_cluster_data(
            right_ax,
            reduced_data,
            n_clusters,
            model.labels_,
            model.cluster_centers_,
        )

        plt.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters,
            fontsize=14,
            fontweight="bold",
        )
|
54
poetry.lock
generated
54
poetry.lock
generated
@ -672,6 +672,17 @@ files = [
|
||||
[package.extras]
|
||||
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
|
||||
|
||||
[[package]]
|
||||
name = "farama-notifications"
|
||||
version = "0.0.4"
|
||||
description = "Notifications for all Farama Foundation maintained libraries."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "Farama-Notifications-0.0.4.tar.gz", hash = "sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18"},
|
||||
{file = "Farama_Notifications-0.0.4-py3-none-any.whl", hash = "sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastjsonschema"
|
||||
version = "2.20.0"
|
||||
@ -868,6 +879,36 @@ files = [
|
||||
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gymnasium"
|
||||
version = "1.0.0"
|
||||
description = "A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "gymnasium-1.0.0-py3-none-any.whl", hash = "sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad"},
|
||||
{file = "gymnasium-1.0.0.tar.gz", hash = "sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cloudpickle = ">=1.2.0"
|
||||
farama-notifications = ">=0.0.1"
|
||||
numpy = ">=1.21.0"
|
||||
typing-extensions = ">=4.3.0"
|
||||
|
||||
[package.extras]
|
||||
all = ["ale-py (>=0.9)", "box2d-py (==2.3.5)", "cython (<3)", "flax (>=0.5.0)", "imageio (>=2.14.1)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)", "matplotlib (>=3.0)", "moviepy (>=1.0.0)", "mujoco (>=2.1.5)", "mujoco-py (>=2.1,<2.2)", "opencv-python (>=3.0)", "pygame (>=2.1.3)", "swig (==4.*)", "torch (>=1.0.0)"]
|
||||
atari = ["ale-py (>=0.9)"]
|
||||
box2d = ["box2d-py (==2.3.5)", "pygame (>=2.1.3)", "swig (==4.*)"]
|
||||
classic-control = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
|
||||
jax = ["flax (>=0.5.0)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)"]
|
||||
mujoco = ["imageio (>=2.14.1)", "mujoco (>=2.1.5)"]
|
||||
mujoco-py = ["cython (<3)", "cython (<3)", "mujoco-py (>=2.1,<2.2)", "mujoco-py (>=2.1,<2.2)"]
|
||||
other = ["matplotlib (>=3.0)", "moviepy (>=1.0.0)", "opencv-python (>=3.0)"]
|
||||
testing = ["dill (>=0.3.7)", "pytest (==7.1.3)", "scipy (>=1.7.3)"]
|
||||
torch = ["torch (>=1.0.0)"]
|
||||
toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.14.0"
|
||||
@ -3091,6 +3132,17 @@ files = [
|
||||
{file = "types_python_dateutil-2.9.0.20240821-py3-none-any.whl", hash = "sha256:f5889fcb4e63ed4aaa379b44f93c32593d50b9a94c9a60a0c854d8cc3511cd57"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.12.2"
|
||||
description = "Backported and Experimental Type Hints for Python 3.8+"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
|
||||
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2024.1"
|
||||
@ -3265,4 +3317,4 @@ updater = ["alteryx-open-src-update-checker (>=3.1.0)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.12"
|
||||
content-hash = "ddd000b70cadbcdb2463cdb4e0be8181c6dab001dd368a95bd2caa73a3085aa5"
|
||||
content-hash = "76a7ecc0524f2a9a187e4242566cf9813bf2265aa4176553ea4f33c9a4c78f17"
|
||||
|
@ -19,6 +19,8 @@ scikit-learn = "^1.5.2"
|
||||
imbalanced-learn = "^0.12.3"
|
||||
featuretools = "^1.31.0"
|
||||
seaborn = "^0.13.2"
|
||||
gymnasium = "^1.0.0"
|
||||
scipy = "^1.14.1"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
Loading…
x
Reference in New Issue
Block a user