lab5

2024-11-23 11:56:30 +04:00 · 2024-11-23 11:56:30 +04:00 · 833b09e809
commit 833b09e809
parent ba19602e47
11 changed files with 92984 additions and 840 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,8 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Editor-based HTTP Client requests
 /httpRequests/
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
--- a/.idea/MII.iml
+++ b/.idea/MII.iml
@ -0,0 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.12 (2)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
 <component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
 </component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.12 (2)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (2)" project-jdk-type="Python SDK" />
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/MII.iml" filepath="$PROJECT_DIR$/.idea/MII.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
 </project>
--- a/mai/data/neo.csv
+++ b/mai/data/neo.csv
--- a/mai/lab4.ipynb
+++ b/mai/lab4.ipynb
--- a/mai/lab5.ipynb
+++ b/mai/lab5.ipynb
--- a/mai/utils_clusters.py
+++ b/mai/utils_clusters.py
@ -0,0 +1,100 @@
 import math
 from typing import Dict, List, Tuple
 import numpy as np
 from pandas import DataFrame
 from sklearn import cluster
 from sklearn.metrics import silhouette_samples, silhouette_score
 def run_agglomerative(
     df: DataFrame, num_clusters: int | None = 2
 ) -> cluster.AgglomerativeClustering:
     """Fit an agglomerative clustering model on ``df`` and return it.

     Merge distances are always requested (``compute_distances=True``) so the
     fitted model exposes ``distances_``.

     Args:
         df: Data to cluster; passed unchanged to ``fit``.
         num_clusters: Number of clusters to find. When ``None``, the model is
             built with ``distance_threshold=0`` instead, because scikit-learn
             requires exactly one of ``n_clusters`` / ``distance_threshold``
             to be set and would otherwise reject the configuration.

     Returns:
         The fitted ``AgglomerativeClustering`` estimator.
     """
     # Only supply a threshold when no cluster count was given; with a count
     # the estimator is configured exactly as before.
     distance_threshold = 0 if num_clusters is None else None
     agglomerative = cluster.AgglomerativeClustering(
         n_clusters=num_clusters,
         distance_threshold=distance_threshold,
         compute_distances=True,
     )
     return agglomerative.fit(df)
 def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
     """Build a linkage matrix from a fitted agglomerative model.

     Each output row holds the two merged children, the merge distance and
     the number of original samples contained in the merged node, in that
     column order, all cast to ``float``.

     Args:
         model: Fitted estimator exposing ``children_``, ``distances_`` and
             ``labels_``.

     Returns:
         Array stacking ``children_``, ``distances_`` and the per-merge
         sample counts column-wise.
     """
     merges = model.children_  # type: ignore
     leaf_total = len(model.labels_)
     sizes = np.zeros(merges.shape[0])
     for row, pair in enumerate(merges):
         # An index below ``leaf_total`` is an original sample (size 1); any
         # other index refers to the merge stored at ``index - leaf_total``,
         # whose size has already been filled in.
         sizes[row] = sum(
             1 if node < leaf_total else sizes[node - leaf_total] for node in pair
         )
     return np.column_stack([merges, model.distances_, sizes]).astype(float)
 def print_cluster_result(
     df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
 ):
     """Print, per cluster, the index labels of the rows assigned to it.

     For every cluster id in ``range(clusters_num)`` a header line
     ``Cluster <id + 1> (<member count>):`` is printed, followed by the
     members' ``df.index`` values joined with ``separator``, an empty line
     and a ``--------`` divider.

     Args:
         df: Frame whose index supplies the printed member names.
         clusters_num: Number of cluster ids to report, starting at 0.
         labels: Cluster id of each row, positionally aligned with ``df``.
         separator: String placed between member names.
     """
     for cluster_id in range(clusters_num):
         member_positions = np.nonzero(labels == cluster_id)[0]
         header = f"Cluster {cluster_id + 1} ({len(member_positions)}):"
         members = separator.join(str(df.index[pos]) for pos in member_positions)
         # One print call, newline-separated, emits the same four lines.
         print(header, members, "", "--------", sep="\n")
 def run_kmeans(
     df: DataFrame, num_clusters: int, random_state: int
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Cluster ``df`` with K-means and return labels and centroids.

     Args:
         df: Data to cluster.
         num_clusters: Value passed as ``n_clusters``.
         random_state: Seed passed to the estimator.

     Returns:
         ``(labels, cluster_centers)`` where ``labels`` is the result of
         ``fit_predict`` and ``cluster_centers`` is the fitted estimator's
         ``cluster_centers_``.
     """
     model = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
     assignments = model.fit_predict(df)
     centers = model.cluster_centers_
     return assignments, centers
 def fit_kmeans(
     reduced_data: np.ndarray, num_clusters: int, random_state: int
 ) -> cluster.KMeans:
     """Fit a K-means estimator on ``reduced_data`` and return the estimator.

     Args:
         reduced_data: Data to fit on.
         num_clusters: Value passed as ``n_clusters``.
         random_state: Seed passed to the estimator.

     Returns:
         The fitted ``KMeans`` instance.
     """
     estimator = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
     # ``fit`` returns the estimator itself, so it can be returned directly.
     return estimator.fit(reduced_data)
 def _get_kmeans_range(
     df: DataFrame | np.ndarray, random_state: int
 ) -> Tuple[List, range]:
     """Fit one K-means model per candidate cluster count.

     Candidate counts run from 2 up to and including
     ``int(sqrt(len(df)))``.

     Args:
         df: Data every model is fitted on.
         random_state: Seed shared by all models.

     Returns:
         ``(models, ks)``: the fitted models in the order of ``ks``, and the
         range of cluster counts that was tried.
     """
     upper_k = int(math.sqrt(len(df)))
     candidate_ks = range(2, upper_k + 1)
     fitted_models = []
     for k in candidate_ks:
         estimator = cluster.KMeans(n_clusters=k, random_state=random_state)
         fitted_models.append(estimator.fit(df))
     return fitted_models, candidate_ks
 def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
     """Return the K-means inertia for every candidate cluster count.

     Args:
         df: Data to cluster.
         random_state: Seed forwarded to ``_get_kmeans_range``.

     Returns:
         ``(inertias, ks)``: each fitted model's ``inertia_`` and the range
         of cluster counts they correspond to.
     """
     models, ks = _get_kmeans_range(df, random_state)
     inertias = [fitted.inertia_ for fitted in models]
     return inertias, ks
 def get_clusters_silhouette_scores(
     df: DataFrame, random_state: int
 ) -> Tuple[List, range]:
     """Return the silhouette score for every candidate cluster count.

     Args:
         df: Data to cluster and to score against.
         random_state: Seed forwarded to ``_get_kmeans_range``.

     Returns:
         ``(scores, ks)``: one ``float`` silhouette score per fitted model
         and the range of cluster counts they correspond to.
     """
     models, ks = _get_kmeans_range(df, random_state)
     scores = [float(silhouette_score(df, fitted.labels_)) for fitted in models]
     return scores, ks
 def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
     """Collect silhouette statistics for every candidate cluster count.

     Args:
         df: Data to cluster and to score against.
         random_state: Seed forwarded to ``_get_kmeans_range``.

     Returns:
         Dict keyed by each model's ``n_clusters``; every value is the tuple
         ``(mean silhouette, per-sample silhouette values, fitted model)``.
     """
     kmeans_per_k, _ = _get_kmeans_range(df, random_state)
     clusters_silhouettes: Dict = {}
     for model in kmeans_per_k:
         sample_silhouette_values = silhouette_samples(df, model.labels_)
         # The silhouette score is the mean of the per-sample coefficients,
         # so derive it from the samples instead of recomputing all pairwise
         # distances with a second ``silhouette_score`` call.
         silhouette_value = float(np.mean(sample_silhouette_values))
         clusters_silhouettes[model.n_clusters] = (
             silhouette_value,
             sample_silhouette_values,
             model,
         )
     return clusters_silhouettes
--- a/mai/visual.py
+++ b/mai/visual.py
@ -0,0 +1,242 @@
 from typing import Any, Dict, List
 import matplotlib.cm as cm
 import matplotlib.pyplot as plt
 import numpy as np
 from pandas import DataFrame
 from scipy.cluster import hierarchy
 from sklearn.cluster import KMeans
 def draw_data_2d(
    df: DataFrame,
    col1: int,
    col2: int,
    y: List | None = None,
    classes: List | None = None,
    subplot: Any | None = None,
 ):
    ax = None
    if subplot is None:
        _, ax = plt.subplots()
    else:
        ax = subplot
    scatter = ax.scatter(df[df.columns[col1]], df[df.columns[col2]], c=y)
    ax.set(xlabel=df.columns[col1], ylabel=df.columns[col2])
    if classes is not None:
        ax.legend(
            scatter.legend_elements()[0], classes, loc="lower right", title="Classes"
        )
 def draw_dendrogram(
     linkage_matrix: np.ndarray,
     truncate_mode: str | None = "level",
     p: int = 3,
 ):
     """Plot a dendrogram for ``linkage_matrix``.

     Args:
         linkage_matrix: Linkage matrix passed to
             ``scipy.cluster.hierarchy.dendrogram``.
         truncate_mode: Truncation mode forwarded to SciPy. The default
             ``"level"`` keeps the previous hard-coded behaviour; pass
             ``None`` to draw the tree without truncation.
         p: Truncation parameter forwarded to SciPy (default 3, as before).
     """
     hierarchy.dendrogram(linkage_matrix, truncate_mode=truncate_mode, p=p)
 def draw_cluster_results(
    df: DataFrame,
    col1: int,
    col2: int,
    labels: np.ndarray,
    cluster_centers: np.ndarray,
    subplot: Any | None = None,
 ):
    ax = None
    if subplot is None:
        ax = plt
    else:
        ax = subplot
    centroids = cluster_centers
    u_labels = np.unique(labels)
    for i in u_labels:
        ax.scatter(
            df[labels == i][df.columns[col1]],
            df[labels == i][df.columns[col2]],
            label=i,
        )
    ax.scatter(centroids[:, col1], centroids[:, col2], s=80, color="k")
 def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
     """Draw K-means decision regions, the data points and the centroids.

     A grid with step 0.02 is laid over the first two columns of
     ``reduced_data`` (padded by 1 on every side), each grid point is
     classified with ``kmeans.predict``, and the result is shown as an image
     on figure 1 (which is cleared first). Data points are drawn as small
     black dots and centroids as white crosses; axis ticks are hidden.

     Args:
         reduced_data: Array whose columns 0 and 1 are the plotted
             coordinates.
         kmeans: Fitted model providing ``predict`` and ``cluster_centers_``.
     """
     step = 0.02
     xs, ys = reduced_data[:, 0], reduced_data[:, 1]
     x_min, x_max = xs.min() - 1, xs.max() + 1
     y_min, y_max = ys.min() - 1, ys.max() + 1
     grid_x, grid_y = np.meshgrid(
         np.arange(x_min, x_max, step), np.arange(y_min, y_max, step)
     )
     # Classify every grid node, then fold the flat labels back to grid shape.
     grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
     regions = kmeans.predict(grid_points).reshape(grid_x.shape)
     plt.figure(1)
     plt.clf()
     image_extent = (grid_x.min(), grid_x.max(), grid_y.min(), grid_y.max())
     plt.imshow(
         regions,
         interpolation="nearest",
         extent=image_extent,
         cmap=plt.cm.Paired,  # type: ignore
         aspect="auto",
         origin="lower",
     )
     plt.plot(xs, ys, "k.", markersize=2)
     centers = kmeans.cluster_centers_
     plt.scatter(
         centers[:, 0],
         centers[:, 1],
         marker="x",
         s=169,
         linewidths=3,
         color="w",
         zorder=10,
     )
     plt.title(
         "K-means clustering (PCA-reduced data)\n"
         "Centroids are marked with white cross"
     )
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
     plt.xticks(())
     plt.yticks(())
 def _draw_cluster_scores(
     data: List,
     clusters_range: range,
     score_name: str,
     title: str,
 ):
     """Plot a per-cluster-count score curve on a new 8x5 figure.

     Args:
         data: Score values, one per entry of ``clusters_range``.
         clusters_range: Cluster counts used as x values.
         score_name: Label of the y axis.
         title: Figure title.
     """
     axis_font = {"fontsize": 8}
     plt.figure(figsize=(8, 5))
     plt.plot(clusters_range, data, "bo-")
     plt.title(title)
     plt.xlabel("$k$", **axis_font)
     plt.ylabel(score_name, **axis_font)
 def draw_elbow_diagram(inertias: List, clusters_range: range):
     """Plot inertia against the number of clusters ("The Elbow Diagram").

     Args:
         inertias: Inertia values, one per entry of ``clusters_range``.
         clusters_range: Cluster counts used as x values.
     """
     _draw_cluster_scores(
         data=inertias,
         clusters_range=clusters_range,
         score_name="Inertia",
         title="The Elbow Diagram",
     )
 def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
     """Plot the silhouette score against the number of clusters.

     Args:
         silhouette: Silhouette scores, one per entry of ``clusters_range``.
         clusters_range: Cluster counts used as x values.
     """
     _draw_cluster_scores(
         data=silhouette,
         clusters_range=clusters_range,
         score_name="Silhouette score",
         title="The Silhouette score",
     )
 def _draw_silhouette(
     ax: Any,
     reduced_data: np.ndarray,
     n_clusters: int,
     silhouette_avg: float,
     sample_silhouette_values: List | np.ndarray,
     cluster_labels: List | np.ndarray,
 ):
     """Draw the per-cluster silhouette bands on ``ax``.

     For each cluster id in ``range(n_clusters)`` the cluster's silhouette
     values are sorted and drawn as a filled horizontal band, labelled with
     the cluster id. A dashed red vertical line marks ``silhouette_avg``.

     Args:
         ax: Axes-like object to draw on.
         reduced_data: Clustered data; only its length is used, to size the
             y axis.
         n_clusters: Number of clusters to draw.
         silhouette_avg: Mean silhouette value, drawn as the vertical line.
         sample_silhouette_values: Silhouette value of every sample.
         cluster_labels: Cluster id of every sample, aligned with
             ``sample_silhouette_values``.
     """
     # Boolean-mask selection below needs arrays; plain lists (which the
     # signature advertises) would otherwise be mis-indexed and fail.
     sample_values = np.asarray(sample_silhouette_values)
     labels = np.asarray(cluster_labels)
     ax.set_xlim([-0.1, 1])
     # Room for every sample plus a 10-row gap around each cluster band.
     ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])
     y_lower = 10
     for i in range(n_clusters):
         ith_cluster_silhouette_values = np.sort(sample_values[labels == i])
         size_cluster_i = ith_cluster_silhouette_values.shape[0]
         y_upper = y_lower + size_cluster_i
         color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
         ax.fill_betweenx(
             np.arange(y_lower, y_upper),
             0,
             ith_cluster_silhouette_values,
             facecolor=color,
             edgecolor=color,
             alpha=0.7,
         )
         # Put the cluster id at the vertical middle of its band.
         ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
         # Leave a 10-row gap before the next cluster's band.
         y_lower = y_upper + 10
     ax.set_title("The silhouette plot for the various clusters.")
     ax.set_xlabel("The silhouette coefficient values")
     ax.set_ylabel("Cluster label")
     ax.axvline(x=silhouette_avg, color="red", linestyle="--")
     ax.set_yticks([])
     ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
 def _draw_cluster_data(
     ax: Any,
     reduced_data: np.ndarray,
     n_clusters: int,
     cluster_labels: np.ndarray,
     cluster_centers: np.ndarray,
 ):
     """Scatter the clustered points and mark the cluster centres on ``ax``.

     Points are coloured from the ``nipy_spectral`` colormap by
     ``label / n_clusters``. Every centre is drawn as a white circle with a
     black edge and then overdrawn with its position in ``cluster_centers``
     rendered as the marker.

     Args:
         ax: Axes-like object to draw on.
         reduced_data: Array whose columns 0 and 1 are the plotted
             coordinates.
         n_clusters: Number of clusters, used to scale the colours.
         cluster_labels: Cluster id of every row of ``reduced_data``.
         cluster_centers: Array of centres; columns 0 and 1 are plotted.
     """
     point_colors = cm.nipy_spectral(  # type: ignore
         cluster_labels.astype(float) / n_clusters
     )
     xs, ys = reduced_data[:, 0], reduced_data[:, 1]
     ax.scatter(xs, ys, marker=".", s=30, lw=0, alpha=0.7, c=point_colors, edgecolor="k")
     center_xs, center_ys = cluster_centers[:, 0], cluster_centers[:, 1]
     ax.scatter(
         center_xs, center_ys, marker="o", c="white", alpha=1, s=200, edgecolor="k"
     )
     for position, center in enumerate(cluster_centers):
         ax.scatter(
             center[0],
             center[1],
             marker="$%d$" % position,
             alpha=1,
             s=50,
             edgecolor="k",
         )
     ax.set_title("The visualization of the clustered data.")
     ax.set_xlabel("Feature space for the 1st feature")
     ax.set_ylabel("Feature space for the 2nd feature")
 def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
     """Draw one silhouette-analysis figure per entry of ``silhouettes``.

     Each figure is 18x7 inches with two side-by-side axes: the silhouette
     bands on the left and the clustered data with centres on the right.

     Args:
         reduced_data: Data that was clustered; forwarded to both panels.
         silhouettes: Mapping from cluster count to a sequence whose items
             0, 1 and 2 are the mean silhouette value, the per-sample
             silhouette values and the fitted model (read for ``labels_``
             and ``cluster_centers_``).
     """
     for n_clusters, entry in silhouettes.items():
         silhouette_avg, sample_values, model = entry[0], entry[1], entry[2]
         fig, (left_ax, right_ax) = plt.subplots(1, 2)
         fig.set_size_inches(18, 7)
         labels = model.labels_
         centers = model.cluster_centers_
         _draw_silhouette(
             left_ax, reduced_data, n_clusters, silhouette_avg, sample_values, labels
         )
         _draw_cluster_data(right_ax, reduced_data, n_clusters, labels, centers)
         plt.suptitle(
             "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
             % n_clusters,
             fontsize=14,
             fontweight="bold",
         )