lab5-6

2025-02-28 16:02:44 +04:00 · 2025-02-28 16:02:44 +04:00 · f08c12ac81
commit f08c12ac81
parent 338e0b0ad8
10 changed files with 110484 additions and 2882 deletions
--- a/data/car-price-prediction.csv
+++ b/data/car-price-prediction.csv
--- a/data/neo.csv
+++ b/data/neo.csv
--- a/notebooks/lab4.ipynb
+++ b/notebooks/lab4.ipynb
--- a/notebooks/lab4_pipeline.ipynb
+++ b/notebooks/lab4_pipeline.ipynb
--- a/notebooks/lab5_1.ipynb
+++ b/notebooks/lab5_1.ipynb
--- a/notebooks/lab6_1.ipynb
+++ b/notebooks/lab6_1.ipynb
--- a/notebooks/utils_clusters.py
+++ b/notebooks/utils_clusters.py
@ -0,0 +1,100 @@
+import math
+from typing import Dict, List, Tuple
+
+import numpy as np
+from pandas import DataFrame
+from sklearn import cluster
+from sklearn.metrics import silhouette_samples, silhouette_score
+
+
+def run_agglomerative(
+    df: DataFrame, num_clusters: int | None = 2
+) -> cluster.AgglomerativeClustering:
+    agglomerative = cluster.AgglomerativeClustering(
+        n_clusters=num_clusters,
+        compute_distances=True,
+    )
+    return agglomerative.fit(df)
+
+
+def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
+    counts = np.zeros(model.children_.shape[0])  # type: ignore
+    n_samples = len(model.labels_)
+    for i, merge in enumerate(model.children_):  # type: ignore
+        current_count = 0
+        for child_idx in merge:
+            if child_idx < n_samples:
+                current_count += 1
+            else:
+                current_count += counts[child_idx - n_samples]
+        counts[i] = current_count
+
+    return np.column_stack([model.children_, model.distances_, counts]).astype(float)
+
+
+def print_cluster_result(
+    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
+):
+    for cluster_id in range(clusters_num):
+        cluster_indices = np.where(labels == cluster_id)[0]
+        print(f"Cluster {cluster_id + 1} ({len(cluster_indices)}):")
+        rules = [str(df.index[idx]) for idx in cluster_indices]
+        print(separator.join(rules))
+        print("")
+        print("--------")
+
+
+def run_kmeans(
+    df: DataFrame, num_clusters: int, random_state: int
+) -> Tuple[np.ndarray, np.ndarray]:
+    kmeans = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
+    labels = kmeans.fit_predict(df)
+    return labels, kmeans.cluster_centers_
+
+
+def fit_kmeans(
+    reduced_data: np.ndarray, num_clusters: int, random_state: int
+) -> cluster.KMeans:
+    kmeans = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
+    kmeans.fit(reduced_data)
+    return kmeans
+
+
+def _get_kmeans_range(
+    df: DataFrame | np.ndarray, random_state: int
+) -> Tuple[List, range]:
+    max_clusters = int(math.sqrt(len(df)))
+    clusters_range = range(2, max_clusters + 1)
+    kmeans_per_k = [
+        cluster.KMeans(n_clusters=k, random_state=random_state).fit(df)
+        for k in clusters_range
+    ]
+    return kmeans_per_k, clusters_range
+
+
+def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
+    kmeans_per_k, clusters_range = _get_kmeans_range(df, random_state)
+    return [model.inertia_ for model in kmeans_per_k], clusters_range
+
+
+def get_clusters_silhouette_scores(
+    df: DataFrame, random_state: int
+) -> Tuple[List, range]:
+    kmeans_per_k, clusters_range = _get_kmeans_range(df, random_state)
+    return [
+        float(silhouette_score(df, model.labels_)) for model in kmeans_per_k
+    ], clusters_range
+
+
+def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
+    kmeans_per_k, _ = _get_kmeans_range(df, random_state)
+    clusters_silhouettes: Dict = {}
+    for model in kmeans_per_k:
+        silhouette_value = silhouette_score(df, model.labels_)
+        sample_silhouette_values = silhouette_samples(df, model.labels_)
+        clusters_silhouettes[model.n_clusters] = (
+            silhouette_value,
+            sample_silhouette_values,
+            model,
+        )
+    return clusters_silhouettes
--- a/notebooks/visual.py
+++ b/notebooks/visual.py
@ -0,0 +1,242 @@
+from typing import Any, Dict, List
+
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+import numpy as np
+from pandas import DataFrame
+from scipy.cluster import hierarchy
+from sklearn.cluster import KMeans
+
+
+def draw_data_2d(
+    df: DataFrame,
+    col1: int,
+    col2: int,
+    y: List | None = None,
+    classes: List | None = None,
+    subplot: Any | None = None,
+):
+    ax = None
+    if subplot is None:
+        _, ax = plt.subplots()
+    else:
+        ax = subplot
+    scatter = ax.scatter(df[df.columns[col1]], df[df.columns[col2]], c=y)
+    ax.set(xlabel=df.columns[col1], ylabel=df.columns[col2])
+    if classes is not None:
+        ax.legend(
+            scatter.legend_elements()[0], classes, loc="lower right", title="Classes"
+        )
+
+
+def draw_dendrogram(linkage_matrix: np.ndarray):
+    hierarchy.dendrogram(linkage_matrix, truncate_mode="level", p=3)
+
+
+def draw_cluster_results(
+    df: DataFrame,
+    col1: int,
+    col2: int,
+    labels: np.ndarray,
+    cluster_centers: np.ndarray,
+    subplot: Any | None = None,
+):
+    ax = None
+    if subplot is None:
+        ax = plt
+    else:
+        ax = subplot
+
+    centroids = cluster_centers
+    u_labels = np.unique(labels)
+
+    for i in u_labels:
+        ax.scatter(
+            df[labels == i][df.columns[col1]],
+            df[labels == i][df.columns[col2]],
+            label=i,
+        )
+
+    ax.scatter(centroids[:, col1], centroids[:, col2], s=80, color="k")
+
+
+def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
+    h = 0.02
+
+    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
+    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
+
+    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
+
+    Z = Z.reshape(xx.shape)
+    plt.figure(1)
+    plt.clf()
+    plt.imshow(
+        Z,
+        interpolation="nearest",
+        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+        cmap=plt.cm.Paired,  # type: ignore
+        aspect="auto",
+        origin="lower",
+    )
+
+    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
+    centroids = kmeans.cluster_centers_
+    plt.scatter(
+        centroids[:, 0],
+        centroids[:, 1],
+        marker="x",
+        s=169,
+        linewidths=3,
+        color="w",
+        zorder=10,
+    )
+    plt.title(
+        "K-means clustering (PCA-reduced data)\n"
+        "Centroids are marked with white cross"
+    )
+    plt.xlim(x_min, x_max)
+    plt.ylim(y_min, y_max)
+    plt.xticks(())
+    plt.yticks(())
+
+
+def _draw_cluster_scores(
+    data: List,
+    clusters_range: range,
+    score_name: str,
+    title: str,
+):
+    plt.figure(figsize=(8, 5))
+    plt.plot(clusters_range, data, "bo-")
+    plt.xlabel("$k$", fontsize=8)
+    plt.ylabel(score_name, fontsize=8)
+    plt.title(title)
+
+
+def draw_elbow_diagram(inertias: List, clusters_range: range):
+    _draw_cluster_scores(inertias, clusters_range, "Inertia", "The Elbow Diagram")
+
+
+def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
+    _draw_cluster_scores(
+        silhouette, clusters_range, "Silhouette score", "The Silhouette score"
+    )
+
+
+def _draw_silhouette(
+    ax: Any,
+    reduced_data: np.ndarray,
+    n_clusters: int,
+    silhouette_avg: float,
+    sample_silhouette_values: List,
+    cluster_labels: List,
+):
+    ax.set_xlim([-0.1, 1])
+    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])
+
+    y_lower = 10
+    for i in range(n_clusters):
+        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
+
+        ith_cluster_silhouette_values.sort()
+
+        size_cluster_i = ith_cluster_silhouette_values.shape[0]
+        y_upper = y_lower + size_cluster_i
+
+        color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
+        ax.fill_betweenx(
+            np.arange(y_lower, y_upper),
+            0,
+            ith_cluster_silhouette_values,
+            facecolor=color,
+            edgecolor=color,
+            alpha=0.7,
+        )
+
+        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+        y_lower = y_upper + 10  # 10 for the 0 samples
+
+    ax.set_title("The silhouette plot for the various clusters.")
+    ax.set_xlabel("The silhouette coefficient values")
+    ax.set_ylabel("Cluster label")
+
+    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
+
+    ax.set_yticks([])
+    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+
+def _draw_cluster_data(
+    ax: Any,
+    reduced_data: np.ndarray,
+    n_clusters: int,
+    cluster_labels: np.ndarray,
+    cluster_centers: np.ndarray,
+):
+    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)  # type: ignore
+    ax.scatter(
+        reduced_data[:, 0],
+        reduced_data[:, 1],
+        marker=".",
+        s=30,
+        lw=0,
+        alpha=0.7,
+        c=colors,
+        edgecolor="k",
+    )
+
+    ax.scatter(
+        cluster_centers[:, 0],
+        cluster_centers[:, 1],
+        marker="o",
+        c="white",
+        alpha=1,
+        s=200,
+        edgecolor="k",
+    )
+
+    for i, c in enumerate(cluster_centers):
+        ax.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
+
+    ax.set_title("The visualization of the clustered data.")
+    ax.set_xlabel("Feature space for the 1st feature")
+    ax.set_ylabel("Feature space for the 2nd feature")
+
+
+def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
+    for key, value in silhouettes.items():
+        fig, (ax1, ax2) = plt.subplots(1, 2)
+        fig.set_size_inches(18, 7)
+
+        n_clusters = key
+        silhouette_avg = value[0]
+        sample_silhouette_values = value[1]
+        cluster_labels = value[2].labels_
+        cluster_centers = value[2].cluster_centers_
+
+        _draw_silhouette(
+            ax1,
+            reduced_data,
+            n_clusters,
+            silhouette_avg,
+            sample_silhouette_values,
+            cluster_labels,
+        )
+
+        _draw_cluster_data(
+            ax2,
+            reduced_data,
+            n_clusters,
+            cluster_labels,
+            cluster_centers,
+        )
+
+        plt.suptitle(
+            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
+            % n_clusters,
+            fontsize=14,
+            fontweight="bold",
+        )
--- a/poetry.lock
+++ b/poetry.lock
@ -672,6 +672,17 @@ files = [
 [package.extras]
 tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]

+[[package]]
+name = "farama-notifications"
+version = "0.0.4"
+description = "Notifications for all Farama Foundation maintained libraries."
+optional = false
+python-versions = "*"
+files = [
+    {file = "Farama-Notifications-0.0.4.tar.gz", hash = "sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18"},
+    {file = "Farama_Notifications-0.0.4-py3-none-any.whl", hash = "sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae"},
+]
+
 [[package]]
 name = "fastjsonschema"
 version = "2.20.0"
@ -868,6 +879,36 @@ files = [
    {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
 ]

+[[package]]
+name = "gymnasium"
+version = "1.0.0"
+description = "A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "gymnasium-1.0.0-py3-none-any.whl", hash = "sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad"},
+    {file = "gymnasium-1.0.0.tar.gz", hash = "sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403"},
+]
+
+[package.dependencies]
+cloudpickle = ">=1.2.0"
+farama-notifications = ">=0.0.1"
+numpy = ">=1.21.0"
+typing-extensions = ">=4.3.0"
+
+[package.extras]
+all = ["ale-py (>=0.9)", "box2d-py (==2.3.5)", "cython (<3)", "flax (>=0.5.0)", "imageio (>=2.14.1)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)", "matplotlib (>=3.0)", "moviepy (>=1.0.0)", "mujoco (>=2.1.5)", "mujoco-py (>=2.1,<2.2)", "opencv-python (>=3.0)", "pygame (>=2.1.3)", "swig (==4.*)", "torch (>=1.0.0)"]
+atari = ["ale-py (>=0.9)"]
+box2d = ["box2d-py (==2.3.5)", "pygame (>=2.1.3)", "swig (==4.*)"]
+classic-control = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
+jax = ["flax (>=0.5.0)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)"]
+mujoco = ["imageio (>=2.14.1)", "mujoco (>=2.1.5)"]
+mujoco-py = ["cython (<3)", "cython (<3)", "mujoco-py (>=2.1,<2.2)", "mujoco-py (>=2.1,<2.2)"]
+other = ["matplotlib (>=3.0)", "moviepy (>=1.0.0)", "opencv-python (>=3.0)"]
+testing = ["dill (>=0.3.7)", "pytest (==7.1.3)", "scipy (>=1.7.3)"]
+torch = ["torch (>=1.0.0)"]
+toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
+
 [[package]]
 name = "h11"
 version = "0.14.0"
@ -3091,6 +3132,17 @@ files = [
    {file = "types_python_dateutil-2.9.0.20240821-py3-none-any.whl", hash = "sha256:f5889fcb4e63ed4aaa379b44f93c32593d50b9a94c9a60a0c854d8cc3511cd57"},
 ]

+[[package]]
+name = "typing-extensions"
+version = "4.12.2"
+description = "Backported and Experimental Type Hints for Python 3.8+"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
+    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
+]
+
 [[package]]
 name = "tzdata"
 version = "2024.1"
@ -3265,4 +3317,4 @@ updater = ["alteryx-open-src-update-checker (>=3.1.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "ddd000b70cadbcdb2463cdb4e0be8181c6dab001dd368a95bd2caa73a3085aa5"
+content-hash = "76a7ecc0524f2a9a187e4242566cf9813bf2265aa4176553ea4f33c9a4c78f17"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -19,6 +19,8 @@ scikit-learn = "^1.5.2"
 imbalanced-learn = "^0.12.3"
 featuretools = "^1.31.0"
 seaborn = "^0.13.2"
+gymnasium = "^1.0.0"
+scipy = "^1.14.1"


 [tool.poetry.group.dev.dependencies]