313 lines
125 KiB
Plaintext
313 lines
125 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "e7893b9e",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная работа: Методы искусственного интеллекта\n",
|
|||
|
"## Задача кластеризации продуктов с использованием cuML\n",
|
|||
|
"### Вариант: Продукты\n",
|
|||
|
"В данной работе используется библиотека cuML для GPU-ускоренного анализа данных. Цель: провести кластеризацию продуктов на основе их характеристик."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "e3834005",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Загрузка и исследование данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"id": "5530d138",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'cudf.core.dataframe.DataFrame'>\n",
|
|||
|
"RangeIndex: 162313 entries, 0 to 162312\n",
|
|||
|
"Data columns (total 5 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype\n",
|
|||
|
"--- ------ -------------- -----\n",
|
|||
|
" 0 category 162313 non-null object\n",
|
|||
|
" 1 sub_category 162313 non-null object\n",
|
|||
|
" 2 href 162313 non-null object\n",
|
|||
|
" 3 items 162280 non-null object\n",
|
|||
|
" 4 price 162282 non-null float64\n",
|
|||
|
"dtypes: float64(1), object(4)\n",
|
|||
|
"memory usage: 28.9+ MB\n",
|
|||
|
"None\n",
|
|||
|
" category sub_category \\\n",
|
|||
|
"0 Groceries Fruits & Vegetables \n",
|
|||
|
"1 Groceries Fruits & Vegetables \n",
|
|||
|
"2 Groceries Fruits & Vegetables \n",
|
|||
|
"3 Groceries Fruits & Vegetables \n",
|
|||
|
"4 Groceries Fruits & Vegetables \n",
|
|||
|
"\n",
|
|||
|
" href \\\n",
|
|||
|
"0 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"1 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"2 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"3 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"4 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"\n",
|
|||
|
" items price \n",
|
|||
|
"0 Fresh Dates (Pack) (Approx 450 g - 500 g) 109.0 \n",
|
|||
|
"1 Tender Coconut Cling Wrapped (1 pc) (Approx 90... 49.0 \n",
|
|||
|
"2 Mosambi 1 kg 69.0 \n",
|
|||
|
"3 Orange Imported 1 kg 125.0 \n",
|
|||
|
"4 Banana Robusta 6 pcs (Box) (Approx 800 g - 110... 44.0 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import cudf\n",
|
|||
|
"import cuml\n",
|
|||
|
"from cuml.preprocessing import LabelEncoder\n",
|
|||
|
"from cuml.decomposition import PCA\n",
|
|||
|
"from cuml.cluster import KMeans\n",
|
|||
|
"import cupy as cp\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = cudf.read_csv('/mnt/c/3curse/mii/AIM-PIbd-31-Medvedkov-A-D/data/jio_mart_items.csv')\n",
|
|||
|
"print(df.info())\n",
|
|||
|
"print(df.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"id": "b5ea4ef3",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "49112908",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Предварительная обработка данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"id": "1e3ef9fa",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Обработка пропущенных значений\n",
|
|||
|
"df = df.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование категориального признака 'items'\n",
|
|||
|
"label_encoder = LabelEncoder()\n",
|
|||
|
"df['items_encoded'] = label_encoder.fit_transform(df['items'])\n",
|
|||
|
"\n",
|
|||
|
"# Нормализация числовых признаков\n",
|
|||
|
"numeric_features = ['items_encoded', 'price']\n",
|
|||
|
"df_scaled = df[numeric_features].astype('float32')\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование данных в формат cupy\n",
|
|||
|
"X = cp.asarray(df_scaled.values)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "ff5f1f8f",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Понижение размерности и визуализация данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"id": "e15c80bb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAHHCAYAAABDUnkqAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAWDtJREFUeJzt3XdcU+fiBvAngASQIaiAAxW3iHtih9pi0VornZafLa7a6tVbrV5baas4qtjhba119ypWHL1aq3WB1lG1Yp1UwT1xMFQ0YcjM+/vDS0pMQhIMGYfn+/nk0+bkzeE9x4wn7zoyIYQAERERkUQ4WLsCRERERObEcENERESSwnBDREREksJwQ0RERJLCcENERESSwnBDREREksJwQ0RERJLCcENERESSwnBDREREksJwQ0RERJLCcEM2JzY2FjKZTOPm6+uL3r17Y8eOHdauHhFVsl69emm8/318fNClSxcsX74cKpVKq/y+ffvw6quvwt/fH87OzvD19cWAAQOwceNGnfs/e/YsZDIZXFxc8ODBg0o+GrIGhhuyWTNmzMCqVavw448/4qOPPsKdO3fw4osvYuvWrdauGhFVsvr162PVqlVYtWoVpkyZguLiYowYMQKffPKJRrno6Gj07t0bycnJeP/997F48WJMmjQJOTk5eO2117BmzRqtfcfFxcHf3x8AsGHDBoscD1mWjBfOJFsTGxuLYcOG4ejRo+jcubN6+/379+Hn54c33ngDq1evtmINiagy9erVC3fv3kVycrJ6W15eHlq0aIH79+/j/v37qFatGjZs2IA33ngDr7/+OtasWYNq1app7CchIQFFRUV46aWX1NuEEGjcuDFeffVVXL16Fffv38fevXstdmxkGWy5IbtRo0YNuLq6wsnJSb3t2rVrkMlkiI2N1Sg7ZswYyGQyDB06VL1t48aN6Nq1K3x8fODq6oqWLVviiy++QGm+37t3L2QyGX755Retv71mzRrIZDIkJiYCAE6dOoWhQ4eicePGcHFxgb+/P4YPH4579+7prHujRo20utpkMhn27dunUaZsfQFg/fr1kMlkaNSokXrb+fPn8dxzz8Hf3x9yuRwBAQEYNWoUsrKy1GUKCwsxdepUdOrUCV5eXqhevTqeeeYZrQ/x0vP39ddfa9U5ODgYvXr10tjWq1cvrW1Hjx5VH09ZOTk5mDhxIho3boxq1appHPfdu3d1nqfH6Tpnj5+3yjjWffv2QSaT6fxV7+7urvHvVNqNeuzYMb3H8fh5GzJkCFxcXHD27FmNcmFhYfD29sbt27f17qv0OPTdHv/3yczMxIgRI+Dn5wcXFxe0a9cOK1eu1NqvSqXCvHnz0KZNG7i4uKB27dro27ev1nHp6jbW9XcfPHiA8ePHIyAgAHK5HE2bNsUXX3yhs1vJGG5ubujevTtyc3Nx584dAMCUKVPg4+OD5cuXawUb4NH5LBtsAOCPP/7AtWvX8NZbb+Gtt97C/v37cfPmzQrViWyXk+EiRNahUChw9+5dCCGQmZmJ+fPnIycnB2+//Xa5z7t06RKWLVumtV2pVKJbt24YMmQIqlWrhvj4eEyePBlOTk6YOHEievXqhYCAAKxevRqvvPKKxnNXr16NJk2aICQkBACwa9cuXLlyBcOGDYO/vz9SUlKwdOlSpKSk4PDhw1pf9ADwzDPP4L333gPwqM9/9uzZ5R5HcXExPv30U63tubm5qF+/PgYMGABPT08kJydjwYIFuHXrFrZs2aI+1h9++AEREREYOXIksrOz8Z///AdhYWE4cuQI2rdvX+7fNsXHH3+sc/ukSZOwePFijBgxAk899RSqVauGjRs36gyP5enTpw8iIyMBPApS3333ncbjljxWc5k3bx727NmDIUOGIDExEY6OjliyZAl27tyJVatWoW7dugb3ERERgRdffFFjW1RUlMb9hw8folevXrh06RLGjh2LwMBArF+/HkOHDsWDBw8wbtw4ddkRI0YgNjYW/fr1w7vvvovi4mIcOHAAhw8f1mhBLfXNN9+gVq1aAIBZs2ZpPJaXl4eePXvi1q1beP/999GgQQMcOnQIUVFRSEtLw7fffmvsqdJw5coVODo6okaNGrh48SLOnTuH4cOHw8PDw+h9lL6Xu3TpguDgYLi5uWHt2rWYNGlShepENkoQ2ZgVK1YIAFo3uVwuYmNjNcpevXpVABArVqxQb3vzzTdFcHCwCAgIEEOGDCn3bwUFBYmXXnpJfT8qKkrI5XLx4MED9bbMzEzh5OQkoqOj1dvy8vK09rV27VoBQOzfv1/rsXr16olhw4ap7+/du1cAEHv37lVva9iwoUZ9Fy5cKORyuejdu7do2LBhucfxj3/8Q7i7u6vvFxcXi4KCAo0y9+/fF35+fmL48OHqbaXn76uvvtLaZ+vWrUXPnj01tvXs2VNj2/bt2wUA0bdvX/H4x0mdOnVEWFiYxrbo6GgBQNy5c6fc4xFCiMLCQgFAjB07Vr1t/fr1WuetMo619N9n/fr1WmWrV6+u8e9U+no9evSo3mN5/LwJIURCQoIAID7//HNx5coV4e7uLsLDw/XuoyLH8e233woAIi4uTr2tsLBQhISECHd3d6FUKoUQQuzZs0cAEB988IHWPlUqlcb9ZcuWCQDi+vXreo9v5syZonr16uLChQsaz508ebJwdHQUqamp5R5jz549RcuWLcWdO3fEnTt3xNmzZ8UHH3wgAIgBAwYIIYTYvHmzACC++eabcvdVVmFhoahZs6b49NNP1dv+7//+T7Rr187ofZB9YLcU2awFCxZg165d2LVrF+Li4tC7d2+8++67emdAAMDx48exfv16xMTEwMFB98v77t27uHnzJmJjY3Hp0iU8++yz6sciIyNRUFCg0R3x008/obi4WKPFyNXVVf3/+fn5uHv3Lrp37w4AOHHihNbfLCwshFwuN/rY8/LyMGPGDIwdOxYNGjTQWUahUCAjIwO7d+/Gtm3bNI7D0dERzs7OAB51N2RlZaG4uBidO3fWWb+KEEIgKioKr732Grp166b1eHZ2NmrWrFnh/efn5wMAXFxcyi1n6rHm5eXh7t27GreSkhKd+87OztYqq09pS2N2drZRx/fCCy/g/fffx4wZM/Dqq6/CxcUFS5YsMeq5xtq+fTv8/f0RERGh3latWjV88MEHyMnJwe+//w4A+PnnnyGTyRAdHa21j8dbIQsLCwGg3Nfz+vXr8cwzz8Db21vj3IWGhqKkpAT79+83WPdz586hdu3aqF27Nlq1aoX58+ejf//+WL58OYBHLXYATGq12bFjB+7du6dxPiIiIvDXX38hJSXF6P2Q7avS4Wb//v0YMGAA6tatC5lMhk2bNpm8DyEEvv76azRv3hxyuRz16tXTaqKliunatStCQ0MRGhqKwYMHY9u2bQgKCsLYsWPVH7CPmzx5Mp555hmtfvZS+fn5qF27NgICAjB8+HBMmjRJozm6ZcuW6NKli8aA5dWrV6N79+5o2rSpeltWVhbGjRsHPz8/uLq6onbt2ggMDATw6EvucQqFAu7u7kYf+7///W/k5+drzQwpKywsDP7+/ggNDUWrVq3w008/aTy+cuVKtG3bFi4uLqhZsyZq166Nbdu26axfRaxevRopKSl6u9dCQkLwyy+/YMOGDUhLS8Pdu3eRl5dn9P5Lg4SXl5fBsqYca3R0tPpLs/R27tw5nfsdPny4Vtnc3FydZUNDQ1G7dm14enrC29sb//jHP/SWLfX111/Dx8cHSUlJ+O677+Dr62vwWE1x/fp1NGvWTCvot2rVSv04AFy+fBl169aFj4+PwX2WTp0u7/V88eJFxMfHa5270NBQAI/GARnSqFEj7Nq1C7/99hsOHjyI9PR0bN26Vd0V5unpCQBGh0ng0SypwMBAyOVyXLp0CZcuXUKTJk3g5ubGSQoSU6XH3OTm5qJdu3YYPnw4Xn311QrtY9y4cdi5cye+/vprtGnTBllZWRoDO8l8HBwc0Lt3b8ybNw8XL15E69atNR7fuXMnfvvtN/WgX12cnZ2xa9cu5OXl4cCBA/jiiy8QEBCA999/X10mMjIS48a
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Применение PCA для понижения размерности\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"reduced_data = pca.fit_transform(X)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразуем данные из cupy в numpy\n",
|
|||
|
"reduced_data_np = reduced_data.get()\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных\n",
|
|||
|
"plt.scatter(reduced_data_np[:, 0], reduced_data_np[:, 1])\n",
|
|||
|
"plt.title('Визуализация данных после PCA')\n",
|
|||
|
"plt.xlabel('PC1')\n",
|
|||
|
"plt.ylabel('PC2')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "f2eef505",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Выбор оптимального количества кластеров"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"id": "f72195d2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оценка числа кластеров: 100%|█████████████████████████████████████████████████████████████| 9/9 [01:08<00:00, 7.67s/it]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAHWCAYAAAAGrFJtAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAlmNJREFUeJzs3Xd4VGX+9/HPmbRJQgoBUkhCF0IJRRQEVFBRsLPuWlAXRUVF0VV2VbCh7rOLZVF3LahYsP4Wu2tDEUUsCNIEhFAjJaZBSO8z5/kjmYEhCYSQ5Mxk3q/rmg1z5pyZ70nWzMln7vt7G6ZpmgIAAAAAAIDfsFldAAAAAAAAAFoXgRAAAAAAAICfIRACAAAAAADwMwRCAAAAAAAAfoZACAAAAAAAwM8QCAEAAAAAAPgZAiEAAAAAAAA/QyAEAAAAAADgZwiEAAAAAAAA/AyBEACf8/TTTys/P999/8knn1RJSYl1BQEAAACAjyEQArzA/PnzZRiGVq5cWeexefPmyTAMTZgwQQ6Hw4LqvM/HH3+sBx54QLt379abb76p++67T6GhoVaXBQAAjoK3Xf+cd9556tat21EfN23aNBmG0fwFAUALC7S6AAAN++CDDzR16lSdcsop+u9//6uAgACrS/IKd999ty644AL9+9//ls1m05w5c2SzkW8DANAWcP0DAK2DQAjwUkuWLNHEiRPVr18/ffzxx7Lb7VaX5DVGjx6tnTt3atOmTUpOTlZSUpLVJQEAgGbA9Q8AtB4+Uge80Nq1a3XhhRcqISFBX3zxhaKiojweHzNmjAYMGKBVq1Zp5MiRCg0NVffu3fXcc8957LdkyRIZhqElS5Z4bD/33HNlGIYeeOABSdIDDzwgwzAOezv4OZYvX67x48crKipKYWFhGj16tH744QeP13A9Z1pami655BJFRkaqQ4cO+stf/qLy8nKPfQ+uxeWxxx6TYRgaM2ZMvecTHR2tESNGKCkpqc75NKSx34+D69+7d6/HvitXrpRhGJo/f77H9rS0NP3pT39STEyM7Ha7TjjhBP3vf//z2Mc1NH7p0qW64YYb1KFDB0VGRmrSpEnav3+/x77dunXTeeedV+cc6huW7qr1YMXFxYqPj69zvmPGjPH4nkrSzz//7P45AwBglSNd/0jSO++8o6FDhyo0NFQdO3bUlVdeqYyMDPfjGRkZmjhxohITExUSEqIePXrozjvvVFFRUZ3nev3115WcnKzo6GjNnj3bvX3BggXq3LmzOnbsqEceeaTOcV988YV69+6tdu3a6dZbb5VpmpJqrjN69uypyMhITZ8+3WOqW0tdg/z222/1XpfcfPPNMgxDV199tcf2/Px83XbbbUpOTlZISIh69eqlRx55RE6ns85z/utf/6pz7gMGDHBfR7jO6XC3I12blZeX64EHHlDv3r1lt9uVkJCgiy66SNu3b2/S+Uk11zr11eJ6jlmzZikoKEi5ubl1jr3++usVHR2t8vJyfffddxo7dqw6duyo0NBQDRkyRHPnznX/vA/3WgffXF555RWdfvrpio2NVUhIiPr166e5c+ce9vsDtDRGCAFeZvv27Ro/frxCQkL0xRdfKCEhod799u/fr3POOUeXXHKJJk6cqLfffltTp05VcHCwrrnmmgaff+nSpfrss888tl100UXq1auX+/7tt9+uvn376vrrr3dv69u3ryTp66+/1tlnn62hQ4dq1qxZstls7je47777TsOGDfN47ksuuUTdunXT7Nmz9dNPP+k///mP9u/fr9dee63BGvPz8z0uzA6nvvM5Gsd6vCT9+uuvGjVqlBITEzVjxgyFh4fr7bff1oQJE/Tee+/pD3/4g8f+06ZNU3R0tB544AFt3rxZc+fO1c6dO90XVs1hzpw5ys7ObtS+d911V7O8JgAATdWY65/58+dr8uTJOvHEEzV79mxlZ2fr3//+t3744QetWbNG0dHR2r59u7Kzs3XLLbeoffv2+vXXX/Wf//xHixcv1vfff+/uOfjDDz/oqquu0siRIzVx4kS9/vrr2rFjh8rKyvTQQw/p7rvv1pdffqkZM2aoS5cumjhxoiRpx44dmjBhgnr16qV//vOfWrhwobsH0s0336xbbrlFa9as0RNPPKFOnTpp5syZDZ5zc1yD1Gfbtm2aN29ene2lpaUaPXq0MjIydMMNN6hLly768ccfNXPmTGVmZurJJ588qtfp27evXn/9dff9F154QZs2bdITTzzh3jZw4MAGj3c4HDrvvPO0ePFiXXbZZfrLX/6ioqIiLVq0SBs2bFDPnj2P6vwOlpKSonvuuUeStHfvXt1+++3ux/785z/roYce0oIFCzRt2jT39srKSr377rv64x//KLvdrh9//FGxsbG69957FRAQoG+//VY33XST1q1b5w5y7rnnHl133XUer3P99dfrlFNOqVPT3Llz1b9/f11wwQUKDAzUxx9/rJtuuklOp1M333zzYc8HaDEmAMu98sorpiTzk08+MXv27GlKMs8666wG9x89erQpyZwzZ457W0VFhTl48GAzNjbWrKysNE3TNL/55htTkvnNN9+49xs+fLh59tlnm5LMWbNm1fv8Xbt2Na+66qo6251Op3nccceZ48aNM51Op3t7aWmp2b17d/PMM890b5s1a5Ypybzgggs8nuOmm24yJZm//PKLe9uhtdx5551mbGysOXToUHP06NHu7U09n6Yc76o/NzfX4zl+/vlnU5L5yiuvuLedccYZZmpqqlleXu7xvRo5cqR53HHHube5fs5Dhw51/4xM0zQfffRRU5L50Ucfubd17drVPPfcc+ucw80332we+qvbVatLTk6OGRER4T6vg8939OjRHt/Tzz77zJRkjh8/vs7zAgDQko7m+qeystKMjY01BwwYYJaVlbm3f/LJJ6Yk8/7772/wdRYtWmRKMh966CH3tgsuuMDs3r27+727qKjI7N69uxkWFmbu2LHDNM2a9/JRo0aZgwYNch936623mhEREebevXtN0zTNqqoq86STTjIlmcuXL3fvN3HiRDM2Ntb9/C11DZKenl5n2yWXXGIOGDDATE5O9rie+/vf/26Gh4ebW7Zs8XjeGTNmmAEBAeauXbs8nvOxxx6r873s37+/x3XEwa666iqza9eu9T5Wn5dfftmUZD7++ON1HnNdZx7N+bmMGjXKPO2009z363uOESNGmMOHD/c47v3336/zMzrUPffcY0oyly5dWuex+l7nYKWlpXW2jRs3zuzRo0eDrwe0NKaMAV7k6quv1u7du3X55Zfryy+/1DvvvNPgvoGBgbrhhhvc94ODg3XDDTcoJydHq1atqveY999/Xz///LMefvjhJtW3du1abd26VZdffrn27dunvXv3au/evSopKdEZZ5yhpUuXegw5llTnE49bbrlFkhr8RCwjI0NPPfWU7rvvPrVr1+6w9Rzr+TTm+Ly8PPd57t27VwUFBXUe//rrr3XJJZeoqKjIvd++ffs0btw4bd261WMou1QzHDkoKMh9f+rUqQoMDGy2Twn//ve/KyoqSrfeeuth9zNNUzNnztQf//hHDR8+vFleGwCAo9WY65+VK1cqJydHN910k0dfoXPPPVcpKSn69NNP3duqqqo83rsHDx6sE044weN5Fy9erHPOOUchISGSpHbt2qlfv37q1KmTunfvLknuVc5++eUX7du3z33cqaeeqg4dOkiquR4bOnSoJHmMkr7ooouUk5OjDRs21HvOx3oN05BVq1bpnXfe0ezZs+ssuPHOO+/olFNOUfv27T2+P2PHjpXD4dDSpUs99i8tLfXYb+/evc264tt7772njh07uq8ND9bQiOnDnZ9LZWWl++fakEmTJmn58uXuqWmS9Oabbyo5OVmjR492bzv0ezBlyhQFBQUd9hq9IQeviFtQUKC9e/dq9OjR2rFjR53rS6C1+HUgtHTpUp1//vnq3LmzDMPQhx9+eFTHl5eX6+qrr1ZqaqoCAwM1YcKEw+7/ww8/KDAwUIMHD25yzWjb8vLy9MYbb+jVV1/V4MGD9Ze//KXBN4jOnTsrPDzcY1vv3r0l1cy
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1400x500 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Оценка инерции и коэффициента силуэта\n",
|
|||
|
"from cuml.metrics.cluster import silhouette_score\n",
|
|||
|
"from tqdm import tqdm # Импорт библиотеки для отображения прогресса\n",
|
|||
|
"\n",
|
|||
|
"# Оценка инерции и коэффициента силуэта\n",
|
|||
|
"inertia = []\n",
|
|||
|
"silhouette_scores = []\n",
|
|||
|
"k_range = range(2, 11)\n",
|
|||
|
"\n",
|
|||
|
"# tqdm для отображения прогресса\n",
|
|||
|
"for k in tqdm(k_range, desc=\"Оценка числа кластеров\"):\n",
|
|||
|
" kmeans = KMeans(n_clusters=k, random_state=42)\n",
|
|||
|
" kmeans.fit(reduced_data)\n",
|
|||
|
" inertia.append(kmeans.inertia_)\n",
|
|||
|
" silhouette_scores.append(silhouette_score(reduced_data, kmeans.labels_))\n",
|
|||
|
"\n",
|
|||
|
"# Построение графиков\n",
|
|||
|
"plt.figure(figsize=(14, 5))\n",
|
|||
|
"\n",
|
|||
|
"# График инерции\n",
|
|||
|
"plt.subplot(1, 2, 1)\n",
|
|||
|
"plt.plot(k_range, inertia, marker='o')\n",
|
|||
|
"plt.title('Критерий инерции')\n",
|
|||
|
"plt.xlabel('Число кластеров')\n",
|
|||
|
"plt.ylabel('Инерция')\n",
|
|||
|
"\n",
|
|||
|
"# График коэффициента силуэта\n",
|
|||
|
"plt.subplot(1, 2, 2)\n",
|
|||
|
"plt.plot(k_range, silhouette_scores, marker='o')\n",
|
|||
|
"plt.title('Коэффициент силуэта')\n",
|
|||
|
"plt.xlabel('Число кластеров')\n",
|
|||
|
"plt.ylabel('Силуэт')\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "180e85ac",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Кластерный анализ"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "dd573024",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAHHCAYAAABDUnkqAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAaNNJREFUeJzt3Xd8E/X/B/DXpSNtadOW0UEps+xZdlFBlCGiggvEgSjiAr8gTlwI/LQqoKACispQlgICUpGyRaBsyh4tq6V0UNomnUmT+/z+KA0NXSm0SXp9PX3kgbn73OV9l97dO5/7fD4nCSEEiIiIiBRCZe8AiIiIiCoTkxsiIiJSFCY3REREpChMboiIiEhRmNwQERGRojC5ISIiIkVhckNERESKwuSGiIiIFIXJDRERESkKkxsiqnGysrLg5+eHpUuX2jsUh/LUU09h2LBh9g6D6I4xuSFyAIsWLYIkSTh48GCxeT/99BMkScLQoUNhMpnsEJ3yzJ49G15eXnjqqafM0z799FNIkoTU1FSLsvHx8WjWrBlq166Nw4cPAwBGjRoFSZKg0WiQm5tbbP0xMTGQJAmSJGHGjBlVuzGV6L333sPq1atx9OhRe4dCdEeY3BA5sDVr1uC1117DPffcgxUrVsDJycneIVV7+fn5mD17Nl566aVy92dCQgL69u2LtLQ0bN68GZ07dzbPc3Z2Rk5ODtavX19suaVLl8LNza3SY69qoaGh6Nq1K2bOnGnvUIjuCJMbIge1Y8cOjBgxAm3atMH69eur5cXSEUVERODatWvl3n65evUq+vbti+vXr2Pz5s3o0qWLxXy1Wo37778fy5cvL7bssmXLMHjw4EqN21aGDRuGP//8E1lZWfYOhei2MbkhckDR0dEYMmQIAgMDERkZCW9v72JlLl26ZL71ceurqBkzZqBXr16oU6cO3N3d0aVLF6xatarEz12yZAm6d+8ODw8P+Pr6onfv3ti0aRMAoHHjxqV+niRJaNy4sXk9sixj1qxZaNu2Ldzc3ODv749XXnkF6enpFp/XuHFjPPTQQ9i0aRM6deoENzc3tGnTBn/++adFubJu2xW69957ce+995a1WwEAa9euRePGjdGsWbNSyyQmJqJv375ISUnBpk2b0LVr1xLLPf300/jnn3+QkZFhnnbgwAHExMTg6aefLnGZjIwMTJgwAcHBwVCr1QgJCcGXX34JWZYtyln7vUmShHHjxmHt2rVo164d1Go12rZti40bN1qUy8zMxIQJE9C4cWOo1Wr4+fmhf//+5ltthfr374/s7Gxs3ry51P1D5OiY3BA5mPPnz+OBBx6AWq1GZGQkAgMDyyz/8ssv47fffsNvv/2GRx99tNj82bNnIzQ0FFOnTsXnn38OZ2dnPPnkk/j7778tyk2ZMgXPPfccXFxcMHXqVEyZMgXBwcHYtm0bAGDWrFnmz/nggw8AAB988IF52qxZs8zreuWVV/DOO+/grrvuwuzZs/HCCy9g6dKlGDhwIPLz8y0+NyYmBsOHD8egQYMQHh5ujq+qLq579uyxuL10q+TkZNx3331ISkpCZGQkunXrVmrZxx57DJIkWSRjy5YtQ6tWrUr8jJycHPTp0wdLlizByJEj8e233+Kuu+7CpEmTMHHiRIuy1n5vALBr1y68/vrreOqpp/DVV18hLy8Pjz/+OK5fv24u8+qrr2LevHl4/PHHMXfuXLz99ttwd3fH6dOnLdbVpk0buLu7Y/fu3aVuN5HDE0RkdwsXLhQAREREhGjWrJkAIAYMGFDmMjExMQKAWLx4sXna5MmTxa2HdU5OjsV7g8Eg2rVrJ+677z6LdalUKvHoo48Kk8lkUV6W5WKfvX37dgFAbN++vdi8//77TwAQS5cutZi+cePGYtMbNWokAIjVq1ebp2m1WhEYGChCQ0PN0wr3z4EDB0raFUIIIfr06SP69OlT6nwhhMjPzxeSJIm33nqr2LzCfdeoUSOh0WhEVFRUqet5/vnnRa1atYQQQjzxxBPi/vvvF0IIYTKZREBAgJgyZYq4ePGiACCmT59uXm7atGmiVq1a4ty5cxbre//994WTk5OIi4szT7PmexNCCADC1dVVxMbGmqcdPXpUABDfffedeZq3t7cYO3ZsqdtUVIsWLcSgQYOsKkvkiFhzQ+RARo0ahfj4eDz99NPYtGkTVq5cWWpZg8EAoKDtR1nc3d3N/5+eng6tVot77rnH4nbE2rVrIcsyPvnkE6hUlqeFW29zlWflypXw9vZG//79kZqaan516dIFnp6e2L59u0X5+vXrW9Q4aTQajBw5EkeOHEFSUpJFWa1Wi9TUVGRmZlYopkJpaWkQQsDX17fUMsnJyfD09Cy3xqzQ008/jR07diApKQnbtm1DUlJSqbekVq5ciXvuuQe+vr4W+6Zfv34wmUzYuXOnuaw131uhfv36Wdxm69ChAzQaDS5cuGCe5uPjg3379uHq1avlblNhfETVVY1Obnbu3ImHH34Y9evXhyRJWLt2bYXXIYTAjBkz0KJFC6jVagQFBeGzzz6r/GCpRkhLS8OSJUuwePFidOrUCePHj4dWqy2xbGE7D09PzzLXGRERgZ49e8LNzQ21a9dGvXr1MG/ePIv1nj9/HiqVCm3atLnjbYiJiYFWq4Wfnx/q1atn8crKykJKSopF+ZCQkGIJVIsWLQAUtCsqql+/fqhXrx40Gg18fX3x+uuvIzs7u8IxCiFKnbdkyRKkpaWhf//+xWItyYMPPggvLy/8/vvvWLp0Kbp164aQkJASy8bExGDjxo3F9ku/fv0AwOLzrPneCjVs2LDYNF9fX4s2Tl999RVOnDiB4OBgdO/eHZ9++qlF8lOUEKLCSS2RI3G2dwD2lJ2djY4dO+LFF1/EY489dlvrGD9+PDZt2oQZM2agffv2SEtLQ1paWiVHSjXF9OnT8eSTTwIA5s+fj549e2LSpEmYO3dusbKFtRoBAQGlru+///7DI488gt69e2Pu3LkIDAyEi4sLFi5ciGXLllXJNsiyXOYAefXq1bvtdc+ZMwctWrSAXq/Hjh07zGPIlLR/SlK7dm1IklSsYXNRffr0wR9//IHHHnsMAwcOxI4dO0ps0F1IrVbjsccew+LFi3HhwgV8+umnpZaVZRn9+/fHu+++W+L8wqSuot9baV3aiyZxw4YNwz333IM1a9Zg06ZNmD59Or788kv8+eefGDRokMVy6enpaN68eanbQeToanRyM2jQoGIHdVF6vR4ffvghli9fjoyMDLRr1w5ffvmluUfG6dOnMW/ePJw4cQItW7YEADRp0sQWoZNC9e7d2/z/3bp1w9ixYzFnzhyMHDkSPXv2tCh76tQpSJJk/tsryerVq+Hm5obIyEiL21cLFy60KNesWTPIsoxTp06hU6dOd7QNzZo1w5YtW3DXXXdZ3FopTWxsbLGagnPnzgGARQ8sAOjevbu559LgwYNx9OjRYr2CyuLs7IxmzZrh4sWLZZZ7+OGHsWDBAjz//PPm3lxlbcvTTz+NBQsWQKVSWQwMeKtmzZohKyvLXFNTGmu/t4oKDAzE66+/jtdffx0pKSno3LkzPvvsM4vzoNFoRHx8PB555JE7+iwie6rRt6XKM27cOERFRWHFihU4duwYnnzySTzwwAOIiYkBAKxfvx5NmzZFREQEmjRpgsaNG+Oll15izQ1Vms8++wyBgYF4+eWXYTQazdONRiNWr16N7t27l3lbysnJCZIkWYxsfOnSpWK3YIcOHQqVSoWpU6cW65Jc1i2ckgwbNgwmkwnTpk0rNs9oNFp0mwYKxpNZs2aN+b1Op8Ovv/6KTp06lVkrBRTUhFR0YMOwsLAyu5QXeu655zBr1izs2rULjz/+eLFeXkX17dsX06ZNw/fff19mzMOGDUNUVBQiIyOLzcvIyDB/x9Z+b9YymUzFbmf5+fmhfv360Ov1FtNPnTqFvLw89OrV67Y+i8gR1Oiam7LExcVh4cKFiIuLQ/369QEAb7/9NjZu3IiFCxfi888/x4ULF3D58mWsXLkSv/76K0wmE95880088cQT5u6zRHfCy8sL3333HR577DH
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Кластеризация с использованием KMeans\n",
|
|||
|
"optimal_k = 4 # Выбираем на основе графиков\n",
|
|||
|
"kmeans = KMeans(n_clusters=optimal_k, random_state=42)\n",
|
|||
|
"labels = kmeans.fit_predict(reduced_data)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразуем данные из cupy в numpy\n",
|
|||
|
"reduced_data_np = reduced_data.get()\n",
|
|||
|
"labels_np = labels.get()\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"plt.scatter(reduced_data_np[:, 0], reduced_data_np[:, 1], c=labels_np, cmap='viridis')\n",
|
|||
|
"plt.title('Кластеры (KMeans)')\n",
|
|||
|
"plt.xlabel('PC1')\n",
|
|||
|
"plt.ylabel('PC2')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "407d268e",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Оценка качества кластеризации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"id": "d00795e2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Силуэт для кластеризации: 0.58\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Оценка коэффициента силуэта\n",
|
|||
|
"silhouette = silhouette_score(reduced_data, labels)\n",
|
|||
|
"print(f'Силуэт для кластеризации: {silhouette:.2f}')"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|