lab5
This commit is contained in:
parent
ba19602e47
commit
833b09e809
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
10
.idea/MII.iml
Normal file
10
.idea/MII.iml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.12 (2)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
7
.idea/misc.xml
Normal file
7
.idea/misc.xml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="Black">
|
||||||
|
<option name="sdkName" value="Python 3.12 (2)" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (2)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/MII.iml" filepath="$PROJECT_DIR$/.idea/MII.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
90837
mai/data/neo.csv
Normal file
90837
mai/data/neo.csv
Normal file
File diff suppressed because it is too large
Load Diff
1777
mai/lab4.ipynb
1777
mai/lab4.ipynb
File diff suppressed because one or more lines are too long
815
mai/lab5.ipynb
Normal file
815
mai/lab5.ipynb
Normal file
File diff suppressed because one or more lines are too long
100
mai/utils_clusters.py
Normal file
100
mai/utils_clusters.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
import math
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from pandas import DataFrame
|
||||||
|
from sklearn import cluster
|
||||||
|
from sklearn.metrics import silhouette_samples, silhouette_score
|
||||||
|
|
||||||
|
|
||||||
|
def run_agglomerative(
    df: DataFrame, num_clusters: int | None = 2
) -> cluster.AgglomerativeClustering:
    """Fit an agglomerative clustering model on ``df``.

    Args:
        df: Samples to cluster, one row per sample.
        num_clusters: Number of clusters to find. When ``None`` the complete
            merge tree is built instead by setting ``distance_threshold=0``.

    Returns:
        The fitted estimator. It is created with ``compute_distances=True``
        so that merge distances are stored on the model.
    """
    # scikit-learn requires exactly one of ``n_clusters`` and
    # ``distance_threshold`` to be set; ``n_clusters=None`` on its own is
    # rejected when fitting, so supply a zero threshold in that case.
    distance_threshold = 0 if num_clusters is None else None
    agglomerative = cluster.AgglomerativeClustering(
        n_clusters=num_clusters,
        distance_threshold=distance_threshold,
        compute_distances=True,
    )
    return agglomerative.fit(df)
|
||||||
|
|
||||||
|
|
||||||
|
def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
    """Assemble a linkage matrix from a fitted agglomerative model.

    Each output row describes one merge: the two child node ids taken from
    ``model.children_``, the merge distance from ``model.distances_`` and the
    number of original samples below the node created by that merge.

    Returns:
        A float array with one row per merge and four columns.
    """
    merges = model.children_  # type: ignore
    leaf_total = len(model.labels_)
    sizes = np.zeros(merges.shape[0])

    for row, pair in enumerate(merges):
        # Ids below ``leaf_total`` are original samples and count as one;
        # larger ids point at the node built by merge ``id - leaf_total``,
        # whose size has already been stored in ``sizes``.
        sizes[row] = sum(
            1 if node < leaf_total else sizes[node - leaf_total] for node in pair
        )

    stacked = np.column_stack([merges, model.distances_, sizes])
    return stacked.astype(float)
|
||||||
|
|
||||||
|
|
||||||
|
def print_cluster_result(
    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
):
    """Print, for every cluster, the index labels of the rows assigned to it.

    Args:
        df: Frame whose index supplies the printed row names.
        clusters_num: Number of clusters; ids ``0 .. clusters_num - 1`` are
            reported and shown 1-based in the header.
        labels: Cluster id for each row position of ``df``.
        separator: Text placed between row names inside one cluster.
    """
    for cluster_id in range(clusters_num):
        # Row positions (not index labels) belonging to this cluster.
        member_positions = np.where(labels == cluster_id)[0]
        header = f"Cluster {cluster_id + 1} ({len(member_positions)}):"
        names = separator.join(str(df.index[pos]) for pos in member_positions)

        print(header)
        print(names)
        print("")
        print("--------")
|
||||||
|
|
||||||
|
|
||||||
|
def run_kmeans(
    df: DataFrame, num_clusters: int, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster ``df`` with k-means.

    Args:
        df: Samples to cluster.
        num_clusters: Number of clusters to form.
        random_state: Seed forwarded to the estimator.

    Returns:
        A pair ``(labels, cluster_centers)`` taken from the fitted estimator.
    """
    model = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
    assignments = model.fit_predict(df)
    centers = model.cluster_centers_
    return assignments, centers
|
||||||
|
|
||||||
|
|
||||||
|
def fit_kmeans(
    reduced_data: np.ndarray, num_clusters: int, random_state: int
) -> cluster.KMeans:
    """Fit a k-means estimator on ``reduced_data`` and return the estimator.

    Args:
        reduced_data: Samples to cluster.
        num_clusters: Number of clusters to form.
        random_state: Seed forwarded to the estimator.
    """
    model = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
    # ``fit`` returns the estimator itself, so the fitted model is returned.
    return model.fit(reduced_data)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_kmeans_range(
    df: DataFrame | np.ndarray, random_state: int
) -> Tuple[List, range]:
    """Fit one k-means model for every candidate cluster count.

    The candidates run from 2 up to and including ``int(sqrt(len(df)))``.

    Returns:
        A pair ``(models, clusters_range)`` where ``models[i]`` was fitted
        with ``n_clusters == clusters_range[i]``.
    """
    upper_k = int(math.sqrt(len(df)))
    clusters_range = range(2, upper_k + 1)

    fitted_models: List = []
    for k in clusters_range:
        estimator = cluster.KMeans(n_clusters=k, random_state=random_state)
        fitted_models.append(estimator.fit(df))

    return fitted_models, clusters_range
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
    """Return the k-means inertia for every candidate cluster count.

    Returns:
        A pair ``(inertias, clusters_range)`` aligned position by position.
    """
    fitted_models, k_values = _get_kmeans_range(df, random_state)
    inertias = [fitted.inertia_ for fitted in fitted_models]
    return inertias, k_values
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusters_silhouette_scores(
    df: DataFrame, random_state: int
) -> Tuple[List, range]:
    """Return the mean silhouette score for every candidate cluster count.

    Returns:
        A pair ``(scores, clusters_range)`` aligned position by position;
        every score is converted to a plain ``float``.
    """
    fitted_models, k_values = _get_kmeans_range(df, random_state)

    scores = []
    for fitted in fitted_models:
        scores.append(float(silhouette_score(df, fitted.labels_)))

    return scores, k_values
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
    """Collect silhouette data for every candidate cluster count.

    Returns:
        A dict keyed by the model's ``n_clusters``. Each value is the tuple
        ``(mean silhouette score, per-sample silhouette values, fitted model)``.
    """
    fitted_models, _ = _get_kmeans_range(df, random_state)
    return {
        fitted.n_clusters: (
            silhouette_score(df, fitted.labels_),
            silhouette_samples(df, fitted.labels_),
            fitted,
        )
        for fitted in fitted_models
    }
|
242
mai/visual.py
Normal file
242
mai/visual.py
Normal file
@ -0,0 +1,242 @@
|
|||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import matplotlib.cm as cm
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
from pandas import DataFrame
|
||||||
|
from scipy.cluster import hierarchy
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
|
||||||
|
|
||||||
|
def draw_data_2d(
    df: DataFrame,
    col1: int,
    col2: int,
    y: List | None = None,
    classes: List | None = None,
    subplot: Any | None = None,
):
    """Scatter two columns of ``df`` against each other.

    Args:
        df: Source frame.
        col1: Position of the column plotted on the x axis.
        col2: Position of the column plotted on the y axis.
        y: Optional per-point values passed to the scatter as colors.
        classes: Optional legend entries; a legend is drawn only when given.
        subplot: Axes to draw on; a new figure and axes are created when
            omitted.
    """
    ax = subplot if subplot is not None else plt.subplots()[1]

    x_name, y_name = df.columns[col1], df.columns[col2]
    points = ax.scatter(df[x_name], df[y_name], c=y)
    ax.set(xlabel=x_name, ylabel=y_name)

    if classes is None:
        return

    handles = points.legend_elements()[0]
    ax.legend(handles, classes, loc="lower right", title="Classes")
|
||||||
|
|
||||||
|
|
||||||
|
def draw_dendrogram(linkage_matrix: np.ndarray):
    """Plot a dendrogram for ``linkage_matrix``, truncated by level with ``p=3``."""
    truncation = {"truncate_mode": "level", "p": 3}
    hierarchy.dendrogram(linkage_matrix, **truncation)
|
||||||
|
|
||||||
|
|
||||||
|
def draw_cluster_results(
    df: DataFrame,
    col1: int,
    col2: int,
    labels: np.ndarray,
    cluster_centers: np.ndarray,
    subplot: Any | None = None,
):
    """Scatter the rows of ``df`` per cluster and overlay the cluster centers.

    Args:
        df: Clustered samples.
        col1: Position of the column (and center coordinate) used for x.
        col2: Position of the column (and center coordinate) used for y.
        labels: Cluster id of every row of ``df``.
        cluster_centers: 2-D array of centers, indexed by the same column
            positions as ``df``.
        subplot: Axes to draw on; ``matplotlib.pyplot`` is used when omitted.
    """
    target = plt if subplot is None else subplot
    x_name, y_name = df.columns[col1], df.columns[col2]

    # One scatter call per cluster so each cluster gets its own label.
    for label in np.unique(labels):
        members = df[labels == label]
        target.scatter(members[x_name], members[y_name], label=label)

    target.scatter(
        cluster_centers[:, col1], cluster_centers[:, col2], s=80, color="k"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Draw the k-means decision regions together with samples and centroids.

    Only columns 0 and 1 of ``reduced_data`` are used. The regions are
    obtained by predicting a cluster for every point of a grid with step
    0.02 that extends one unit beyond the data on each side. Everything is
    drawn on figure 1, which is cleared first.

    Args:
        reduced_data: Samples to plot.
        kmeans: Fitted estimator used for prediction and for its centers.
    """
    step = 0.02
    xs, ys = reduced_data[:, 0], reduced_data[:, 1]

    x_min, x_max = xs.min() - 1, xs.max() + 1
    y_min, y_max = ys.min() - 1, ys.max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, step), np.arange(y_min, y_max, step)
    )

    # Predict one cluster id per grid point, then restore the grid shape.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    regions = kmeans.predict(grid_points).reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(
        regions,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )

    plt.plot(xs, ys, "k.", markersize=2)

    centers = kmeans.cluster_centers_
    cross_style = {
        "marker": "x",
        "s": 169,
        "linewidths": 3,
        "color": "w",
        "zorder": 10,
    }
    plt.scatter(centers[:, 0], centers[:, 1], **cross_style)

    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    # Empty tuples remove the tick marks on both axes.
    plt.xticks(())
    plt.yticks(())
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Plot a per-cluster-count score as a line chart on a new figure.

    Args:
        data: Score values, one per entry of ``clusters_range``.
        clusters_range: Cluster counts used as x values.
        score_name: Label of the y axis.
        title: Title of the plot.
    """
    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")

    for set_label, text in ((plt.xlabel, "$k$"), (plt.ylabel, score_name)):
        set_label(text, fontsize=8)

    plt.title(title)
|
||||||
|
|
||||||
|
|
||||||
|
def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia against the number of clusters."""
    _draw_cluster_scores(
        data=inertias,
        clusters_range=clusters_range,
        score_name="Inertia",
        title="The Elbow Diagram",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot the silhouette score against the number of clusters."""
    _draw_cluster_scores(
        data=silhouette,
        clusters_range=clusters_range,
        score_name="Silhouette score",
        title="The Silhouette score",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: List,
    cluster_labels: List,
):
    """Draw one horizontal silhouette band per cluster on ``ax``.

    Args:
        ax: Axes to draw on.
        reduced_data: Clustered samples; only its length is used, to size
            the y axis.
        n_clusters: Number of clusters; ids ``0 .. n_clusters - 1`` are drawn.
        silhouette_avg: Mean silhouette score, shown as a dashed red line.
        sample_silhouette_values: Silhouette value of every sample (list or
            array).
        cluster_labels: Cluster id of every sample (list or array), aligned
            with ``sample_silhouette_values``.
    """
    # The signature advertises plain lists, but the mask indexing below only
    # works on arrays: for a list, ``cluster_labels == i`` is a single bool
    # rather than an element-wise mask. Coerce both inputs once up front.
    sample_values = np.asarray(sample_silhouette_values)
    labels = np.asarray(cluster_labels)

    ax.set_xlim([-0.1, 1])
    # Leave a 10-unit gap below, between and above the cluster bands.
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Mask indexing yields a copy, so sorting does not touch the input.
        ith_cluster_silhouette_values = sample_values[labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Put the cluster id at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10  # 10 for the 0 samples

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the clustered samples and mark every cluster center on ``ax``.

    Only the first two columns of ``reduced_data`` and ``cluster_centers``
    are plotted. Each center is drawn as a white disc with its position in
    ``cluster_centers`` rendered on top as a numeric marker.
    """
    point_colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)  # type: ignore
    sample_style = {
        "marker": ".",
        "s": 30,
        "lw": 0,
        "alpha": 0.7,
        "c": point_colors,
        "edgecolor": "k",
    }
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], **sample_style)

    centers_x, centers_y = cluster_centers[:, 0], cluster_centers[:, 1]
    ax.scatter(
        centers_x, centers_y, marker="o", c="white", alpha=1, s=200, edgecolor="k"
    )

    for number, center in enumerate(cluster_centers):
        digit_marker = "$%d$" % number
        ax.scatter(
            center[0], center[1], marker=digit_marker, alpha=1, s=50, edgecolor="k"
        )

    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
|
||||||
|
|
||||||
|
|
||||||
|
def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Draw one two-panel silhouette figure for every entry of ``silhouettes``.

    Args:
        reduced_data: Clustered samples passed on to both panels.
        silhouettes: Mapping from cluster count to a tuple of
            ``(mean silhouette score, per-sample silhouette values, model)``,
            where ``model`` exposes ``labels_`` and ``cluster_centers_``.
    """
    for n_clusters, entry in silhouettes.items():
        fig, (silhouette_ax, scatter_ax) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        silhouette_avg, sample_values, model = entry[0], entry[1], entry[2]
        labels = model.labels_
        centers = model.cluster_centers_

        # Left panel: silhouette bands. Right panel: the clustered samples.
        _draw_silhouette(
            silhouette_ax,
            reduced_data,
            n_clusters,
            silhouette_avg,
            sample_values,
            labels,
        )
        _draw_cluster_data(scatter_ax, reduced_data, n_clusters, labels, centers)

        plt.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters,
            fontsize=14,
            fontweight="bold",
        )
|
Loading…
Reference in New Issue
Block a user