Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
110f79e4f5 | ||
| 639e381daa | |||
| a610d16a7f | |||
| 08bd1f76c0 | |||
| 95f9913c6f | |||
| 061214e244 | |||
| 80e6ee0e8f | |||
| 3312b4f4d2 | |||
| b817368d6c | |||
| 37103ea009 | |||
| 9dd4777138 |
930
lab_10/lab10.ipynb
Normal file
930
lab_10/lab10.ipynb
Normal file
@@ -0,0 +1,930 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae6b4270",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Лабораторная работа 10"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b4b9ee35",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"В качестве задачи оптимизации была выбрана классическая вариация задачи о рюкзаке: дан набор предметов, каждый с определенным весом и ценностью. Требуется определить, какие предметы взять с собой в рюкзак, чтобы их суммарная ценность была максимальной, а суммарный вес не превышал заданную грузоподъемность рюкзака. При этом каждый предмет можно взять только один раз или не брать вовсе (0/1).\n",
|
||||
"\n",
|
||||
"Используем соответствующий датасет, в котором имеется большое число вариантов задачи с различными параметрами: https://www.kaggle.com/datasets/warcoder/knapsack-problem?select=knapsack_5_items.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"id": "80d638c3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
|
||||
"columns": [
|
||||
{
|
||||
"name": "index",
|
||||
"rawType": "int64",
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"name": "Weights",
|
||||
"rawType": "object",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "Prices",
|
||||
"rawType": "object",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "Capacity",
|
||||
"rawType": "int64",
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"name": "Best picks",
|
||||
"rawType": "object",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "Best price",
|
||||
"rawType": "float64",
|
||||
"type": "float"
|
||||
}
|
||||
],
|
||||
"conversionMethod": "pd.DataFrame",
|
||||
"ref": "f950228b-7bd0-4f67-b42e-48be43e881b9",
|
||||
"rows": [
|
||||
[
|
||||
"0",
|
||||
"[46 40 42 38 10]",
|
||||
"[12 19 19 15 8]",
|
||||
"40",
|
||||
"[0. 1. 0. 0. 0.]",
|
||||
"19.0"
|
||||
],
|
||||
[
|
||||
"1",
|
||||
"[11 31 4 6 7]",
|
||||
"[ 2 8 18 16 3]",
|
||||
"64",
|
||||
"[1. 1. 1. 1. 1.]",
|
||||
"47.0"
|
||||
],
|
||||
[
|
||||
"2",
|
||||
"[32 49 27 37 24]",
|
||||
"[19 16 16 4 1]",
|
||||
"87",
|
||||
"[1. 0. 1. 0. 1.]",
|
||||
"36.0"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"[20 35 22 23 16]",
|
||||
"[19 17 19 9 1]",
|
||||
"21",
|
||||
"[1. 0. 0. 0. 0.]",
|
||||
"19.0"
|
||||
],
|
||||
[
|
||||
"4",
|
||||
"[ 7 12 19 13 20]",
|
||||
"[10 11 18 15 5]",
|
||||
"50",
|
||||
"[0. 1. 1. 1. 0.]",
|
||||
"44.0"
|
||||
],
|
||||
[
|
||||
"9995",
|
||||
"[18 12 11 49 32]",
|
||||
"[12 3 17 19 7]",
|
||||
"41",
|
||||
"[1. 1. 1. 0. 0.]",
|
||||
"32.0"
|
||||
],
|
||||
[
|
||||
"9996",
|
||||
"[20 2 24 7 7]",
|
||||
"[17 12 4 3 8]",
|
||||
"17",
|
||||
"[0. 1. 0. 1. 1.]",
|
||||
"23.0"
|
||||
],
|
||||
[
|
||||
"9997",
|
||||
"[43 43 5 15 23]",
|
||||
"[15 5 7 2 7]",
|
||||
"62",
|
||||
"[1. 0. 1. 0. 0.]",
|
||||
"22.0"
|
||||
],
|
||||
[
|
||||
"9998",
|
||||
"[49 9 15 21 39]",
|
||||
"[11 15 3 12 19]",
|
||||
"65",
|
||||
"[0. 1. 1. 0. 1.]",
|
||||
"37.0"
|
||||
],
|
||||
[
|
||||
"9999",
|
||||
"[25 36 42 19 39]",
|
||||
"[15 12 7 18 12]",
|
||||
"79",
|
||||
"[1. 0. 0. 1. 0.]",
|
||||
"33.0"
|
||||
]
|
||||
],
|
||||
"shape": {
|
||||
"columns": 5,
|
||||
"rows": 10
|
||||
}
|
||||
},
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Weights</th>\n",
|
||||
" <th>Prices</th>\n",
|
||||
" <th>Capacity</th>\n",
|
||||
" <th>Best picks</th>\n",
|
||||
" <th>Best price</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>[46 40 42 38 10]</td>\n",
|
||||
" <td>[12 19 19 15 8]</td>\n",
|
||||
" <td>40</td>\n",
|
||||
" <td>[0. 1. 0. 0. 0.]</td>\n",
|
||||
" <td>19.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>[11 31 4 6 7]</td>\n",
|
||||
" <td>[ 2 8 18 16 3]</td>\n",
|
||||
" <td>64</td>\n",
|
||||
" <td>[1. 1. 1. 1. 1.]</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>[32 49 27 37 24]</td>\n",
|
||||
" <td>[19 16 16 4 1]</td>\n",
|
||||
" <td>87</td>\n",
|
||||
" <td>[1. 0. 1. 0. 1.]</td>\n",
|
||||
" <td>36.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>[20 35 22 23 16]</td>\n",
|
||||
" <td>[19 17 19 9 1]</td>\n",
|
||||
" <td>21</td>\n",
|
||||
" <td>[1. 0. 0. 0. 0.]</td>\n",
|
||||
" <td>19.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>[ 7 12 19 13 20]</td>\n",
|
||||
" <td>[10 11 18 15 5]</td>\n",
|
||||
" <td>50</td>\n",
|
||||
" <td>[0. 1. 1. 1. 0.]</td>\n",
|
||||
" <td>44.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9995</th>\n",
|
||||
" <td>[18 12 11 49 32]</td>\n",
|
||||
" <td>[12 3 17 19 7]</td>\n",
|
||||
" <td>41</td>\n",
|
||||
" <td>[1. 1. 1. 0. 0.]</td>\n",
|
||||
" <td>32.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9996</th>\n",
|
||||
" <td>[20 2 24 7 7]</td>\n",
|
||||
" <td>[17 12 4 3 8]</td>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>[0. 1. 0. 1. 1.]</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9997</th>\n",
|
||||
" <td>[43 43 5 15 23]</td>\n",
|
||||
" <td>[15 5 7 2 7]</td>\n",
|
||||
" <td>62</td>\n",
|
||||
" <td>[1. 0. 1. 0. 0.]</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9998</th>\n",
|
||||
" <td>[49 9 15 21 39]</td>\n",
|
||||
" <td>[11 15 3 12 19]</td>\n",
|
||||
" <td>65</td>\n",
|
||||
" <td>[0. 1. 1. 0. 1.]</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9999</th>\n",
|
||||
" <td>[25 36 42 19 39]</td>\n",
|
||||
" <td>[15 12 7 18 12]</td>\n",
|
||||
" <td>79</td>\n",
|
||||
" <td>[1. 0. 0. 1. 0.]</td>\n",
|
||||
" <td>33.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Weights Prices Capacity Best picks \\\n",
|
||||
"0 [46 40 42 38 10] [12 19 19 15 8] 40 [0. 1. 0. 0. 0.] \n",
|
||||
"1 [11 31 4 6 7] [ 2 8 18 16 3] 64 [1. 1. 1. 1. 1.] \n",
|
||||
"2 [32 49 27 37 24] [19 16 16 4 1] 87 [1. 0. 1. 0. 1.] \n",
|
||||
"3 [20 35 22 23 16] [19 17 19 9 1] 21 [1. 0. 0. 0. 0.] \n",
|
||||
"4 [ 7 12 19 13 20] [10 11 18 15 5] 50 [0. 1. 1. 1. 0.] \n",
|
||||
"9995 [18 12 11 49 32] [12 3 17 19 7] 41 [1. 1. 1. 0. 0.] \n",
|
||||
"9996 [20 2 24 7 7] [17 12 4 3 8] 17 [0. 1. 0. 1. 1.] \n",
|
||||
"9997 [43 43 5 15 23] [15 5 7 2 7] 62 [1. 0. 1. 0. 0.] \n",
|
||||
"9998 [49 9 15 21 39] [11 15 3 12 19] 65 [0. 1. 1. 0. 1.] \n",
|
||||
"9999 [25 36 42 19 39] [15 12 7 18 12] 79 [1. 0. 0. 1. 0.] \n",
|
||||
"\n",
|
||||
" Best price \n",
|
||||
"0 19.0 \n",
|
||||
"1 47.0 \n",
|
||||
"2 36.0 \n",
|
||||
"3 19.0 \n",
|
||||
"4 44.0 \n",
|
||||
"9995 32.0 \n",
|
||||
"9996 23.0 \n",
|
||||
"9997 22.0 \n",
|
||||
"9998 37.0 \n",
|
||||
"9999 33.0 "
|
||||
]
|
||||
},
|
||||
"execution_count": 70,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
import pandas as pd

# Load the knapsack instances dataset: each row is one problem with item
# weights, item prices, the knapsack capacity, and the known optimal answer.
csv_path = "..//..//static//csv//knapsack_5_items.csv"
df = pd.read_csv(csv_path)

# Preview the first and last five problems as a quick sanity check.
pd.concat([df.head(5), df.tail(5)])
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "884f0cd1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Структура хромосомы и тип данных гена\n",
|
||||
"\n",
|
||||
"В данном случае хромосома будет представлять собой список длины n (количество предметов в конкретной задаче), который представляет собой решение задачи рюкзака — то есть указывает, какие предметы включить в рюкзак.\n",
|
||||
"\n",
|
||||
"Пример: [1, 0, 1, 0, 0]. В примере выбраны первый и третий предметы.\n",
|
||||
"\n",
|
||||
"Ген же — это одно значение в хромосоме. \n",
|
||||
"\n",
|
||||
"Тип данных: int.\n",
|
||||
" \n",
|
||||
"Возможные значения:\n",
|
||||
"* 1 — предмет в рюкзаке;\n",
|
||||
"* 0 — предмет не в рюкзаке."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "92661ad8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Реализация функции генерации начальной популяции и ее тест:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a7b0970",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[[0, 1, 0, 0, 0],\n",
|
||||
" [1, 0, 0, 0, 1],\n",
|
||||
" [1, 0, 0, 0, 0],\n",
|
||||
" [1, 0, 1, 0, 1],\n",
|
||||
" [1, 1, 1, 1, 0],\n",
|
||||
" [0, 1, 0, 0, 1],\n",
|
||||
" [1, 0, 1, 1, 1],\n",
|
||||
" [1, 0, 0, 0, 1],\n",
|
||||
" [1, 0, 1, 1, 0],\n",
|
||||
" [1, 1, 0, 1, 1]]"
|
||||
]
|
||||
},
|
||||
"execution_count": 71,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
import random

def create_individual(elements_num):
    """Build one random chromosome: a 0/1 gene per item (1 = item taken)."""
    genes = []
    for _ in range(elements_num):
        genes.append(random.randint(0, 1))
    return genes

def create_population(elements_num, population_size):
    """Build the initial population of `population_size` random chromosomes."""
    return [create_individual(elements_num) for _ in range(population_size)]

create_population(5, 10)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bc114201",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Реализация фитнес-функции и ее тест:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0809a0b1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"44"
|
||||
]
|
||||
},
|
||||
"execution_count": 72,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
def evaluate_fitness(individual, weights, prices, capacity):
    """Fitness of a knapsack solution.

    Returns the total price of the selected items, or 0 when their total
    weight exceeds the capacity (infeasible solutions score nothing).
    """
    chosen = [i for i, gene in enumerate(individual) if gene == 1]
    total_weight = sum(weights[i] for i in chosen)
    total_value = sum(prices[i] for i in chosen)
    if total_weight > capacity:
        return 0
    return total_value

evaluate_fitness([0, 1, 1, 1, 0], [7, 12, 19, 13, 20], [10, 11, 18, 15, 5], 50)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cccc6557",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Реализация оператора кроссинговера и его тест:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4d5a13d7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"([0, 1, 1, 0, 0], [1, 0, 1, 1, 0])"
|
||||
]
|
||||
},
|
||||
"execution_count": 73,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
def crossover(parent1, parent2):
    """Single-point crossover.

    Picks a cut point in [1, len-1] and exchanges the tails of the two
    parents.  Returns two new child lists; the parents are not modified.

    Fix: the original called random.randint(1, len(parent1) - 1), which
    raises ValueError for chromosomes of length < 2.  Such chromosomes have
    no interior cut point, so the children are simply copies of the parents.
    """
    if len(parent1) < 2:
        return (parent1[:], parent2[:])
    point = random.randint(1, len(parent1) - 1)
    return (parent1[:point] + parent2[point:], parent2[:point] + parent1[point:])

crossover([0, 1, 1, 1, 0], [1, 0, 1, 0, 0])
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "08c626b5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Реализация двух операторов мутации и их тест:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "66021b53",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[0, 1, 1, 1, 0]\n",
|
||||
"================\n",
|
||||
"[0, 1, 0, 1, 0]\n",
|
||||
"================\n",
|
||||
"[0, 0, 0, 1, 1]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Mutation 1: independently invert each gene with probability mutation_rate.
def mutate_flip_bits(individual, mutation_rate):
    """In-place bit-flip mutation: each gene is flipped with the given rate."""
    for idx, gene in enumerate(individual):
        if random.random() < mutation_rate:
            individual[idx] = 1 - gene

# Mutation 2: with probability mutation_rate, swap two randomly chosen genes.
def mutate_swap_genes(individual, mutation_rate):
    """In-place swap mutation: exchanges the values at two distinct positions."""
    if random.random() < mutation_rate:
        first, second = random.sample(range(len(individual)), 2)
        individual[first], individual[second] = individual[second], individual[first]

individual = [0, 1, 1, 1, 0]
print(individual)
mutate_flip_bits(individual, 0.5)
print("================")
print(individual)
print("================")
mutate_swap_genes(individual, 1)
print(individual)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d199e789",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### И наконец реализуем сам генетический алгоритм:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "17093d62",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
population_size = 100
num_generations = 10
mutation_rate = 0.1
mutation_strategy = 'flip'

def select_parents(population, weights, prices, capacity):
    """Roulette-wheel selection of two parents, proportional to fitness.

    Falls back to uniform random choice when every individual has zero
    fitness (e.g. all candidates exceed the knapsack capacity), since
    fitness-proportional weights would be undefined in that case.
    """
    fitness_values = [evaluate_fitness(ind, weights, prices, capacity) for ind in population]
    total_fitness = sum(fitness_values)
    if total_fitness == 0:
        # No fitness signal: pick uniformly (possibly the same parent twice).
        return random.choice(population), random.choice(population)
    probabilities = [f / total_fitness for f in fitness_values]
    return random.choices(population, weights=probabilities, k=2)

def genetic_algorithm(weights, prices, capacity, population_size = 100, num_generations = 10, mutation_rate = 0.1, mutation_strategy='flip'):
    """Solve one 0/1 knapsack instance with a genetic algorithm.

    Parameters: item `weights` and `prices` (equal-length sequences), the
    knapsack `capacity`, and the GA hyper-parameters.  `mutation_strategy`
    is 'flip' (per-gene bit flip) or 'swap' (exchange two genes).
    Returns (best_individual, best_value).

    Fix: the original generated population_size // 2 pairs per generation,
    so an odd population_size silently shrank the population by one; the
    pair count is now rounded up and the result trimmed back to size.
    """
    elements_num = len(weights)
    population = create_population(elements_num, population_size)

    for _ in range(num_generations):
        new_population = []
        for _ in range((population_size + 1) // 2):
            p1, p2 = select_parents(population, weights, prices, capacity)
            c1, c2 = crossover(p1, p2)
            if mutation_strategy == 'flip':
                mutate_flip_bits(c1, mutation_rate)
                mutate_flip_bits(c2, mutation_rate)
            elif mutation_strategy == 'swap':
                mutate_swap_genes(c1, mutation_rate)
                mutate_swap_genes(c2, mutation_rate)
            new_population.extend([c1, c2])
        # Trim the possible extra child so the population size stays constant.
        population = new_population[:population_size]

    best = max(population, key=lambda ind: evaluate_fitness(ind, weights, prices, capacity))
    best_value = evaluate_fitness(best, weights, prices, capacity)
    return best, best_value
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f241602a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Применим его для всех случаев из датасета:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"id": "9ef718ac",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
|
||||
"columns": [
|
||||
{
|
||||
"name": "index",
|
||||
"rawType": "int64",
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"name": "Weights",
|
||||
"rawType": "object",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "Prices",
|
||||
"rawType": "object",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "Capacity",
|
||||
"rawType": "int64",
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"name": "Best picks",
|
||||
"rawType": "object",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "Best price",
|
||||
"rawType": "float64",
|
||||
"type": "float"
|
||||
},
|
||||
{
|
||||
"name": "algorithmPicks",
|
||||
"rawType": "object",
|
||||
"type": "unknown"
|
||||
},
|
||||
{
|
||||
"name": "algorithmPrice",
|
||||
"rawType": "int64",
|
||||
"type": "integer"
|
||||
}
|
||||
],
|
||||
"conversionMethod": "pd.DataFrame",
|
||||
"ref": "767fbcfd-78f3-4b7e-a80d-6735c17b7fbd",
|
||||
"rows": [
|
||||
[
|
||||
"0",
|
||||
"[46 40 42 38 10]",
|
||||
"[12 19 19 15 8]",
|
||||
"40",
|
||||
"[0. 1. 0. 0. 0.]",
|
||||
"19.0",
|
||||
"[0, 1, 0, 0, 0]",
|
||||
"19"
|
||||
],
|
||||
[
|
||||
"1",
|
||||
"[11 31 4 6 7]",
|
||||
"[ 2 8 18 16 3]",
|
||||
"64",
|
||||
"[1. 1. 1. 1. 1.]",
|
||||
"47.0",
|
||||
"[1, 1, 1, 1, 1]",
|
||||
"47"
|
||||
],
|
||||
[
|
||||
"2",
|
||||
"[32 49 27 37 24]",
|
||||
"[19 16 16 4 1]",
|
||||
"87",
|
||||
"[1. 0. 1. 0. 1.]",
|
||||
"36.0",
|
||||
"[1, 0, 1, 0, 1]",
|
||||
"36"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"[20 35 22 23 16]",
|
||||
"[19 17 19 9 1]",
|
||||
"21",
|
||||
"[1. 0. 0. 0. 0.]",
|
||||
"19.0",
|
||||
"[1, 0, 0, 0, 0]",
|
||||
"19"
|
||||
],
|
||||
[
|
||||
"4",
|
||||
"[ 7 12 19 13 20]",
|
||||
"[10 11 18 15 5]",
|
||||
"50",
|
||||
"[0. 1. 1. 1. 0.]",
|
||||
"44.0",
|
||||
"[0, 1, 1, 1, 0]",
|
||||
"44"
|
||||
],
|
||||
[
|
||||
"9995",
|
||||
"[18 12 11 49 32]",
|
||||
"[12 3 17 19 7]",
|
||||
"41",
|
||||
"[1. 1. 1. 0. 0.]",
|
||||
"32.0",
|
||||
"[1, 1, 1, 0, 0]",
|
||||
"32"
|
||||
],
|
||||
[
|
||||
"9996",
|
||||
"[20 2 24 7 7]",
|
||||
"[17 12 4 3 8]",
|
||||
"17",
|
||||
"[0. 1. 0. 1. 1.]",
|
||||
"23.0",
|
||||
"[0, 1, 0, 1, 1]",
|
||||
"23"
|
||||
],
|
||||
[
|
||||
"9997",
|
||||
"[43 43 5 15 23]",
|
||||
"[15 5 7 2 7]",
|
||||
"62",
|
||||
"[1. 0. 1. 0. 0.]",
|
||||
"22.0",
|
||||
"[1, 0, 1, 0, 0]",
|
||||
"22"
|
||||
],
|
||||
[
|
||||
"9998",
|
||||
"[49 9 15 21 39]",
|
||||
"[11 15 3 12 19]",
|
||||
"65",
|
||||
"[0. 1. 1. 0. 1.]",
|
||||
"37.0",
|
||||
"[0, 1, 1, 0, 1]",
|
||||
"37"
|
||||
],
|
||||
[
|
||||
"9999",
|
||||
"[25 36 42 19 39]",
|
||||
"[15 12 7 18 12]",
|
||||
"79",
|
||||
"[1. 0. 0. 1. 0.]",
|
||||
"33.0",
|
||||
"[1, 0, 0, 1, 0]",
|
||||
"33"
|
||||
]
|
||||
],
|
||||
"shape": {
|
||||
"columns": 7,
|
||||
"rows": 10
|
||||
}
|
||||
},
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Weights</th>\n",
|
||||
" <th>Prices</th>\n",
|
||||
" <th>Capacity</th>\n",
|
||||
" <th>Best picks</th>\n",
|
||||
" <th>Best price</th>\n",
|
||||
" <th>algorithmPicks</th>\n",
|
||||
" <th>algorithmPrice</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>[46 40 42 38 10]</td>\n",
|
||||
" <td>[12 19 19 15 8]</td>\n",
|
||||
" <td>40</td>\n",
|
||||
" <td>[0. 1. 0. 0. 0.]</td>\n",
|
||||
" <td>19.0</td>\n",
|
||||
" <td>[0, 1, 0, 0, 0]</td>\n",
|
||||
" <td>19</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>[11 31 4 6 7]</td>\n",
|
||||
" <td>[ 2 8 18 16 3]</td>\n",
|
||||
" <td>64</td>\n",
|
||||
" <td>[1. 1. 1. 1. 1.]</td>\n",
|
||||
" <td>47.0</td>\n",
|
||||
" <td>[1, 1, 1, 1, 1]</td>\n",
|
||||
" <td>47</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>[32 49 27 37 24]</td>\n",
|
||||
" <td>[19 16 16 4 1]</td>\n",
|
||||
" <td>87</td>\n",
|
||||
" <td>[1. 0. 1. 0. 1.]</td>\n",
|
||||
" <td>36.0</td>\n",
|
||||
" <td>[1, 0, 1, 0, 1]</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>[20 35 22 23 16]</td>\n",
|
||||
" <td>[19 17 19 9 1]</td>\n",
|
||||
" <td>21</td>\n",
|
||||
" <td>[1. 0. 0. 0. 0.]</td>\n",
|
||||
" <td>19.0</td>\n",
|
||||
" <td>[1, 0, 0, 0, 0]</td>\n",
|
||||
" <td>19</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>[ 7 12 19 13 20]</td>\n",
|
||||
" <td>[10 11 18 15 5]</td>\n",
|
||||
" <td>50</td>\n",
|
||||
" <td>[0. 1. 1. 1. 0.]</td>\n",
|
||||
" <td>44.0</td>\n",
|
||||
" <td>[0, 1, 1, 1, 0]</td>\n",
|
||||
" <td>44</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9995</th>\n",
|
||||
" <td>[18 12 11 49 32]</td>\n",
|
||||
" <td>[12 3 17 19 7]</td>\n",
|
||||
" <td>41</td>\n",
|
||||
" <td>[1. 1. 1. 0. 0.]</td>\n",
|
||||
" <td>32.0</td>\n",
|
||||
" <td>[1, 1, 1, 0, 0]</td>\n",
|
||||
" <td>32</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9996</th>\n",
|
||||
" <td>[20 2 24 7 7]</td>\n",
|
||||
" <td>[17 12 4 3 8]</td>\n",
|
||||
" <td>17</td>\n",
|
||||
" <td>[0. 1. 0. 1. 1.]</td>\n",
|
||||
" <td>23.0</td>\n",
|
||||
" <td>[0, 1, 0, 1, 1]</td>\n",
|
||||
" <td>23</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9997</th>\n",
|
||||
" <td>[43 43 5 15 23]</td>\n",
|
||||
" <td>[15 5 7 2 7]</td>\n",
|
||||
" <td>62</td>\n",
|
||||
" <td>[1. 0. 1. 0. 0.]</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>[1, 0, 1, 0, 0]</td>\n",
|
||||
" <td>22</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9998</th>\n",
|
||||
" <td>[49 9 15 21 39]</td>\n",
|
||||
" <td>[11 15 3 12 19]</td>\n",
|
||||
" <td>65</td>\n",
|
||||
" <td>[0. 1. 1. 0. 1.]</td>\n",
|
||||
" <td>37.0</td>\n",
|
||||
" <td>[0, 1, 1, 0, 1]</td>\n",
|
||||
" <td>37</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9999</th>\n",
|
||||
" <td>[25 36 42 19 39]</td>\n",
|
||||
" <td>[15 12 7 18 12]</td>\n",
|
||||
" <td>79</td>\n",
|
||||
" <td>[1. 0. 0. 1. 0.]</td>\n",
|
||||
" <td>33.0</td>\n",
|
||||
" <td>[1, 0, 0, 1, 0]</td>\n",
|
||||
" <td>33</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Weights Prices Capacity Best picks \\\n",
|
||||
"0 [46 40 42 38 10] [12 19 19 15 8] 40 [0. 1. 0. 0. 0.] \n",
|
||||
"1 [11 31 4 6 7] [ 2 8 18 16 3] 64 [1. 1. 1. 1. 1.] \n",
|
||||
"2 [32 49 27 37 24] [19 16 16 4 1] 87 [1. 0. 1. 0. 1.] \n",
|
||||
"3 [20 35 22 23 16] [19 17 19 9 1] 21 [1. 0. 0. 0. 0.] \n",
|
||||
"4 [ 7 12 19 13 20] [10 11 18 15 5] 50 [0. 1. 1. 1. 0.] \n",
|
||||
"9995 [18 12 11 49 32] [12 3 17 19 7] 41 [1. 1. 1. 0. 0.] \n",
|
||||
"9996 [20 2 24 7 7] [17 12 4 3 8] 17 [0. 1. 0. 1. 1.] \n",
|
||||
"9997 [43 43 5 15 23] [15 5 7 2 7] 62 [1. 0. 1. 0. 0.] \n",
|
||||
"9998 [49 9 15 21 39] [11 15 3 12 19] 65 [0. 1. 1. 0. 1.] \n",
|
||||
"9999 [25 36 42 19 39] [15 12 7 18 12] 79 [1. 0. 0. 1. 0.] \n",
|
||||
"\n",
|
||||
" Best price algorithmPicks algorithmPrice \n",
|
||||
"0 19.0 [0, 1, 0, 0, 0] 19 \n",
|
||||
"1 47.0 [1, 1, 1, 1, 1] 47 \n",
|
||||
"2 36.0 [1, 0, 1, 0, 1] 36 \n",
|
||||
"3 19.0 [1, 0, 0, 0, 0] 19 \n",
|
||||
"4 44.0 [0, 1, 1, 1, 0] 44 \n",
|
||||
"9995 32.0 [1, 1, 1, 0, 0] 32 \n",
|
||||
"9996 23.0 [0, 1, 0, 1, 1] 23 \n",
|
||||
"9997 22.0 [1, 0, 1, 0, 0] 22 \n",
|
||||
"9998 37.0 [0, 1, 1, 0, 1] 37 \n",
|
||||
"9999 33.0 [1, 0, 0, 1, 0] 33 "
|
||||
]
|
||||
},
|
||||
"execution_count": 80,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import ast\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"picks = []\n",
|
||||
"best_prices = []\n",
|
||||
"\n",
|
||||
def fix_list_string(s):
    """Turn numpy-style array text like '[ 7 12 19]' into a Python list
    literal '[7,12,19]' suitable for ast.literal_eval."""
    # Strip whitespace directly inside the brackets, then turn the
    # remaining whitespace separators into commas.
    without_padding = re.sub(r'\s*\]', ']', re.sub(r'\[\s*', '[', s))
    return re.sub(r'\s+', ',', without_padding)
|
||||
"\n",
|
||||
for _, row in df.iterrows():
    # The CSV stores arrays as numpy-style strings; normalize and parse them.
    weights = ast.literal_eval(fix_list_string(row['Weights']))
    prices = ast.literal_eval(fix_list_string(row['Prices']))
    capacity = row['Capacity']

    solution, value = genetic_algorithm(weights, prices, capacity, population_size, num_generations, mutation_rate, mutation_strategy)

    picks.append(solution)
    best_prices.append(value)

# Attach the GA answers next to the dataset's known optima for comparison.
df['algorithmPicks'] = picks
df['algorithmPrice'] = best_prices

pd.concat([df.head(5), df.tail(5)])
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "72958862",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"По полученным результатам видно, что ответы алгоритма совпадают с теми ответами, которые уже имелись в наборе данных. Поэтому можно сказать, что для таких условий задачи алгоритм работает успешно даже с 10 поколениями."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "aimenv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
BIN
lab_10/requirements.txt
Normal file
BIN
lab_10/requirements.txt
Normal file
Binary file not shown.
1345
lab_11/lab11.ipynb
Normal file
1345
lab_11/lab11.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
lab_11/requirements.txt
Normal file
BIN
lab_11/requirements.txt
Normal file
Binary file not shown.
2061
lab_12/lab12.ipynb
Normal file
2061
lab_12/lab12.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
lab_12/requirements.txt
Normal file
BIN
lab_12/requirements.txt
Normal file
Binary file not shown.
134
lab_2/lab2.ipynb
134
lab_2/lab2.ipynb
@@ -15,7 +15,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 532,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -35,7 +35,6 @@
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"# вывод всех столбцов\n",
|
||||
"df = pd.read_csv(\"..//..//static//csv//flavors_of_cacao.csv\")\n",
|
||||
"df.columns = df.columns.str.replace('\\n', '')\n",
|
||||
"print(df.columns)"
|
||||
@@ -85,7 +84,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 533,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -108,13 +107,10 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Удаляем символ '%' и преобразуем столбец CocoaPercent в числовой формат\n",
|
||||
"df['CocoaPercent'] = df['CocoaPercent'].str.replace('%', '').astype(float)\n",
|
||||
"\n",
|
||||
"# Выбираем столбцы для анализа\n",
|
||||
"columns_to_check = ['CocoaPercent', 'Rating']\n",
|
||||
"\n",
|
||||
"# Функция для подсчета выбросов\n",
|
||||
"def count_outliers(df, columns):\n",
|
||||
" outliers_count = {}\n",
|
||||
" for col in columns:\n",
|
||||
@@ -123,21 +119,17 @@
|
||||
" IQR = Q3 - Q1\n",
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
" \n",
|
||||
" # Считаем количество выбросов\n",
|
||||
" \n",
|
||||
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
|
||||
" outliers_count[col] = len(outliers)\n",
|
||||
" \n",
|
||||
" return outliers_count\n",
|
||||
"\n",
|
||||
"# Подсчитываем выбросы\n",
|
||||
"outliers_count = count_outliers(df, columns_to_check)\n",
|
||||
"\n",
|
||||
"# Выводим количество выбросов для каждого столбца\n",
|
||||
"for col, count in outliers_count.items():\n",
|
||||
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
||||
"\n",
|
||||
"# Создаем диаграммы размахов\n",
|
||||
"plt.figure(figsize=(15, 10))\n",
|
||||
"for i, col in enumerate(columns_to_check, 1):\n",
|
||||
" plt.subplot(2, 2, i)\n",
|
||||
@@ -158,7 +150,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 534,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -180,10 +172,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Выбираем столбцы для очистки\n",
|
||||
"columns_to_clean = ['CocoaPercent']\n",
|
||||
"\n",
|
||||
"# Функция для удаления выбросов\n",
|
||||
"def remove_outliers(df, columns):\n",
|
||||
" for col in columns:\n",
|
||||
" Q1 = df[col].quantile(0.25)\n",
|
||||
@@ -192,21 +182,15 @@
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
" \n",
|
||||
" # Удаляем строки, содержащие выбросы\n",
|
||||
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
|
||||
" \n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"# Удаляем выбросы\n",
|
||||
"df_cleaned = remove_outliers(df, columns_to_clean)\n",
|
||||
"\n",
|
||||
"# Выводим количество удаленных строк\n",
|
||||
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
|
||||
"\n",
|
||||
"# Создаем диаграммы размаха для очищенных данных\n",
|
||||
"plt.figure(figsize=(15, 6))\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для CocoaPercent\n",
|
||||
"plt.subplot(1, 2, 1)\n",
|
||||
"sns.boxplot(x=df_cleaned['CocoaPercent'])\n",
|
||||
"plt.title('Box Plot of CocoaPercent (Cleaned)')\n",
|
||||
@@ -215,7 +199,6 @@
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Сохраняем очищенный датасет\n",
|
||||
"df_cleaned.to_csv(\"..//..//static//csv//flavors_of_cacao_cleaned.csv\", index=False)\n",
|
||||
"df = df_cleaned"
|
||||
]
|
||||
@@ -231,7 +214,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 535,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -266,17 +249,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Количество пустых значений признаков\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
@@ -292,7 +272,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 536,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -315,13 +295,10 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Удаление пропущенных значений в столбцах BeanType и Broad BeanOrigin\n",
|
||||
"df = df.dropna(subset=['BeanType', 'Broad BeanOrigin'])\n",
|
||||
"\n",
|
||||
"# Проверка на пропущенные значения после удаления\n",
|
||||
"missing_values_after_drop = df.isnull().sum()\n",
|
||||
"\n",
|
||||
"# Вывод результатов после удаления\n",
|
||||
"print(\"\\nКоличество пустых значений в каждом столбце после удаления:\")\n",
|
||||
"print(missing_values_after_drop)"
|
||||
]
|
||||
@@ -337,7 +314,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 537,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -351,18 +328,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Разделение на признаки (X) и целевую переменную (y)\n",
|
||||
"# Предположим, что Rating - это целевая переменная\n",
|
||||
"X = df.drop('Rating', axis=1)\n",
|
||||
"y = df['Rating']\n",
|
||||
"\n",
|
||||
"# Разбиение на обучающую и остальную выборку (контрольную + тестовую)\n",
|
||||
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
|
||||
"\n",
|
||||
"# Разбиение остатка на контрольную и тестовую выборки\n",
|
||||
"X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
|
||||
"\n",
|
||||
"# Вывод размеров выборок\n",
|
||||
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
|
||||
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
|
||||
"print(\"Размер тестовой выборки:\", X_test.shape)"
|
||||
@@ -370,7 +342,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 538,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -426,7 +398,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Функция для анализа сбалансированности\n",
|
||||
"def analyze_balance(y_train, y_val, y_test):\n",
|
||||
" print(\"Распределение классов в обучающей выборке:\")\n",
|
||||
" print(y_train.value_counts(normalize=True))\n",
|
||||
@@ -437,7 +408,6 @@
|
||||
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
||||
" print(y_test.value_counts(normalize=True))\n",
|
||||
"\n",
|
||||
"# Анализ сбалансированности\n",
|
||||
"analyze_balance(y_train, y_val, y_test)"
|
||||
]
|
||||
},
|
||||
@@ -465,7 +435,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 539,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -485,7 +455,6 @@
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from imblearn.over_sampling import SMOTE\n",
|
||||
"\n",
|
||||
"# вывод всех столбцов\n",
|
||||
"df = pd.read_csv(\"..//..//static//csv//water_potability.csv\")\n",
|
||||
"print(df.columns)"
|
||||
]
|
||||
@@ -538,7 +507,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 540,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -562,10 +531,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Выбираем столбцы для анализа\n",
|
||||
"columns_to_check = ['Hardness', 'Solids', 'Organic_carbon']\n",
|
||||
"\n",
|
||||
"# Функция для подсчета выбросов\n",
|
||||
"def count_outliers(df, columns):\n",
|
||||
" outliers_count = {}\n",
|
||||
" for col in columns:\n",
|
||||
@@ -575,20 +542,16 @@
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
" \n",
|
||||
" # Считаем количество выбросов\n",
|
||||
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
|
||||
" outliers_count[col] = len(outliers)\n",
|
||||
" \n",
|
||||
" return outliers_count\n",
|
||||
"\n",
|
||||
"# Подсчитываем выбросы\n",
|
||||
"outliers_count = count_outliers(df, columns_to_check)\n",
|
||||
"\n",
|
||||
"# Выводим количество выбросов для каждого столбца\n",
|
||||
"for col, count in outliers_count.items():\n",
|
||||
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
||||
"\n",
|
||||
"# Создаем диаграммы размахов\n",
|
||||
"plt.figure(figsize=(15, 10))\n",
|
||||
"for i, col in enumerate(columns_to_check, 1):\n",
|
||||
" plt.subplot(2, 2, i)\n",
|
||||
@@ -607,7 +570,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 541,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -629,10 +592,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Выбираем столбцы для очистки\n",
|
||||
"columns_to_clean = ['Hardness', 'Solids', 'Organic_carbon']\n",
|
||||
"\n",
|
||||
"# Функция для удаления выбросов\n",
|
||||
"def remove_outliers(df, columns):\n",
|
||||
" for col in columns:\n",
|
||||
" Q1 = df[col].quantile(0.25)\n",
|
||||
@@ -641,33 +602,25 @@
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
" \n",
|
||||
" # Удаляем строки, содержащие выбросы\n",
|
||||
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
|
||||
" \n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"# Удаляем выбросы\n",
|
||||
"df_cleaned = remove_outliers(df, columns_to_clean)\n",
|
||||
"\n",
|
||||
"# Выводим количество удаленных строк\n",
|
||||
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
|
||||
"\n",
|
||||
"# Создаем диаграммы размаха для очищенных данных\n",
|
||||
"plt.figure(figsize=(15, 6))\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для Hardness\n",
|
||||
"plt.subplot(1, 3, 1)\n",
|
||||
"sns.boxplot(x=df_cleaned['Hardness'])\n",
|
||||
"plt.title('Box Plot of Hardness (Cleaned)')\n",
|
||||
"plt.xlabel('Hardness')\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для Solids\n",
|
||||
"plt.subplot(1, 3, 2)\n",
|
||||
"sns.boxplot(x=df_cleaned['Solids'])\n",
|
||||
"plt.title('Box Plot of Solids (Cleaned)')\n",
|
||||
"plt.xlabel('Solids')\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для Organic_carbon\n",
|
||||
"plt.subplot(1, 3, 3)\n",
|
||||
"sns.boxplot(x=df_cleaned['Organic_carbon'])\n",
|
||||
"plt.title('Box Plot of Organic_carbon (Cleaned)')\n",
|
||||
@@ -676,7 +629,6 @@
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Сохраняем очищенный датасет\n",
|
||||
"df_cleaned.to_csv(\"..//..//static//csv//water_potability_cleaned.csv\", index=False)\n",
|
||||
"df = df_cleaned"
|
||||
]
|
||||
@@ -692,7 +644,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 542,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -730,17 +682,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Количество пустых значений признаков\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
@@ -756,7 +705,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 543,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -780,15 +729,12 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Замена значений\n",
|
||||
"df[\"ph\"] = df[\"ph\"].fillna(df[\"ph\"].median())\n",
|
||||
"df[\"Sulfate\"] = df[\"Sulfate\"].fillna(df[\"Sulfate\"].median())\n",
|
||||
"df[\"Trihalomethanes\"] = df[\"Trihalomethanes\"].fillna(df[\"Trihalomethanes\"].median())\n",
|
||||
"\n",
|
||||
"# Проверка на пропущенные значения после замены\n",
|
||||
"missing_values_after_drop = df.isnull().sum()\n",
|
||||
"\n",
|
||||
"# Вывод результатов после замены\n",
|
||||
"print(\"\\nКоличество пустых значений в каждом столбце после замены:\")\n",
|
||||
"print(missing_values_after_drop)"
|
||||
]
|
||||
@@ -804,7 +750,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 544,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -818,18 +764,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Разделение на признаки (X) и целевую переменную (y)\n",
|
||||
"# Предположим, что Potability - это целевая переменная\n",
|
||||
"X = df.drop('Potability', axis=1)\n",
|
||||
"y = df['Potability']\n",
|
||||
"\n",
|
||||
"# Разбиение на обучающую и остальную выборку (контрольную + тестовую)\n",
|
||||
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
|
||||
"\n",
|
||||
"# Разбиение остатка на контрольную и тестовую выборки\n",
|
||||
"X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
|
||||
"\n",
|
||||
"# Вывод размеров выборок\n",
|
||||
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
|
||||
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
|
||||
"print(\"Размер тестовой выборки:\", X_test.shape)"
|
||||
@@ -837,7 +778,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 545,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -865,7 +806,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Функция для анализа сбалансированности\n",
|
||||
"def analyze_balance(y_train, y_val, y_test):\n",
|
||||
" print(\"Распределение классов в обучающей выборке:\")\n",
|
||||
" print(y_train.value_counts(normalize=True))\n",
|
||||
@@ -876,7 +816,6 @@
|
||||
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
||||
" print(y_test.value_counts(normalize=True))\n",
|
||||
"\n",
|
||||
"# Анализ сбалансированности\n",
|
||||
"analyze_balance(y_train, y_val, y_test)"
|
||||
]
|
||||
},
|
||||
@@ -889,7 +828,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 546,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -907,10 +846,8 @@
|
||||
"source": [
|
||||
"smote = SMOTE(random_state=42)\n",
|
||||
"\n",
|
||||
"# Применение SMOTE для балансировки обучающей выборки\n",
|
||||
"X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)\n",
|
||||
"\n",
|
||||
"# Проверка сбалансированности после SMOTE\n",
|
||||
"print(\"Сбалансированность обучающей выборки после SMOTE:\")\n",
|
||||
"print(y_train_resampled.value_counts(normalize=True))"
|
||||
]
|
||||
@@ -938,7 +875,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 547,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -958,7 +895,6 @@
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||
"\n",
|
||||
"# вывод всех столбцов\n",
|
||||
"df = pd.read_csv(\"..//..//static//csv//diabetes.csv\")\n",
|
||||
"print(df.columns)"
|
||||
]
|
||||
@@ -1009,7 +945,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 548,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1033,10 +969,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Выбираем столбцы для анализа\n",
|
||||
"columns_to_check = ['Age', 'BloodPressure', 'BMI']\n",
|
||||
"\n",
|
||||
"# Функция для подсчета выбросов\n",
|
||||
"def count_outliers(df, columns):\n",
|
||||
" outliers_count = {}\n",
|
||||
" for col in columns:\n",
|
||||
@@ -1046,20 +980,16 @@
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
" \n",
|
||||
" # Считаем количество выбросов\n",
|
||||
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
|
||||
" outliers_count[col] = len(outliers)\n",
|
||||
" \n",
|
||||
" return outliers_count\n",
|
||||
"\n",
|
||||
"# Подсчитываем выбросы\n",
|
||||
"outliers_count = count_outliers(df, columns_to_check)\n",
|
||||
"\n",
|
||||
"# Выводим количество выбросов для каждого столбца\n",
|
||||
"for col, count in outliers_count.items():\n",
|
||||
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
||||
"\n",
|
||||
"# Создаем диаграммы размахов\n",
|
||||
"plt.figure(figsize=(15, 10))\n",
|
||||
"for i, col in enumerate(columns_to_check, 1):\n",
|
||||
" plt.subplot(2, 2, i)\n",
|
||||
@@ -1078,7 +1008,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 549,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1100,10 +1030,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Выбираем столбцы для очистки\n",
|
||||
"columns_to_clean = ['Age', 'BloodPressure', 'BMI']\n",
|
||||
"\n",
|
||||
"# Функция для удаления выбросов\n",
|
||||
"def remove_outliers(df, columns):\n",
|
||||
" for col in columns:\n",
|
||||
" Q1 = df[col].quantile(0.25)\n",
|
||||
@@ -1112,33 +1040,26 @@
|
||||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||||
" \n",
|
||||
" # Удаляем строки, содержащие выбросы\n",
|
||||
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
|
||||
" \n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"# Удаляем выбросы\n",
|
||||
"df_cleaned = remove_outliers(df, columns_to_clean)\n",
|
||||
"\n",
|
||||
"# Выводим количество удаленных строк\n",
|
||||
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
|
||||
"\n",
|
||||
"# Создаем диаграммы размаха для очищенных данных\n",
|
||||
"plt.figure(figsize=(15, 6))\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для Age\n",
|
||||
"plt.subplot(1, 3, 1)\n",
|
||||
"sns.boxplot(x=df_cleaned['Age'])\n",
|
||||
"plt.title('Box Plot of Age (Cleaned)')\n",
|
||||
"plt.xlabel('Age')\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для BloodPressure\n",
|
||||
"plt.subplot(1, 3, 2)\n",
|
||||
"sns.boxplot(x=df_cleaned['BloodPressure'])\n",
|
||||
"plt.title('Box Plot of BloodPressure (Cleaned)')\n",
|
||||
"plt.xlabel('BloodPressure')\n",
|
||||
"\n",
|
||||
"# Диаграмма размаха для BMI\n",
|
||||
"plt.subplot(1, 3, 3)\n",
|
||||
"sns.boxplot(x=df_cleaned['BMI'])\n",
|
||||
"plt.title('Box Plot of BMI (Cleaned)')\n",
|
||||
@@ -1147,7 +1068,6 @@
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Сохраняем очищенный датасет\n",
|
||||
"df_cleaned.to_csv(\"..//..//static//csv//diabetes_cleaned.csv\", index=False)\n",
|
||||
"df = df_cleaned"
|
||||
]
|
||||
@@ -1163,7 +1083,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 550,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1196,17 +1116,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Количество пустых значений признаков\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
@@ -1224,7 +1141,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 551,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1238,18 +1155,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Разделение на признаки (X) и целевую переменную (y)\n",
|
||||
"# Предположим, что Outcome - это целевая переменная\n",
|
||||
"X = df.drop('Outcome', axis=1)\n",
|
||||
"y = df['Outcome']\n",
|
||||
"\n",
|
||||
"# Разбиение на обучающую и остальную выборку (контрольную + тестовую)\n",
|
||||
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)\n",
|
||||
"\n",
|
||||
"# Разбиение остатка на контрольную и тестовую выборки\n",
|
||||
"X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)\n",
|
||||
"\n",
|
||||
"# Вывод размеров выборок\n",
|
||||
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
|
||||
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
|
||||
"print(\"Размер тестовой выборки:\", X_test.shape)"
|
||||
@@ -1257,7 +1169,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 552,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1285,7 +1197,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Функция для анализа сбалансированности\n",
|
||||
"def analyze_balance(y_train, y_val, y_test):\n",
|
||||
" print(\"Распределение классов в обучающей выборке:\")\n",
|
||||
" print(y_train.value_counts(normalize=True))\n",
|
||||
@@ -1296,7 +1207,6 @@
|
||||
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
||||
" print(y_test.value_counts(normalize=True))\n",
|
||||
"\n",
|
||||
"# Анализ сбалансированности\n",
|
||||
"analyze_balance(y_train, y_val, y_test)"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -204,7 +204,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 137,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -244,17 +244,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Количество пустых значений признаков\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
@@ -270,7 +267,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 138,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -296,13 +293,10 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Замена значений\n",
|
||||
"df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
|
||||
"\n",
|
||||
"# Проверка на пропущенные значения после замены\n",
|
||||
"missing_values_after_drop = df.isnull().sum()\n",
|
||||
"\n",
|
||||
"# Вывод результатов после замены\n",
|
||||
"print(\"\\nКоличество пустых значений в каждом столбце после замены:\")\n",
|
||||
"print(missing_values_after_drop)"
|
||||
]
|
||||
@@ -344,7 +338,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 140,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -358,19 +352,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Разделение данных на признаки (X) и целевую переменную (y)\n",
|
||||
"# В данном случае мы хотим предсказать 'stroke'\n",
|
||||
"X = df.drop(columns=['stroke'])\n",
|
||||
"y = df['stroke']\n",
|
||||
"\n",
|
||||
"# Разбиение данных на обучающую и тестовую выборки\n",
|
||||
"# Сначала разделим на обучающую и тестовую\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
|
||||
"\n",
|
||||
"# Затем разделим обучающую выборку на обучающую и контрольную\n",
|
||||
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
|
||||
"\n",
|
||||
"# Проверка размеров выборок\n",
|
||||
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
|
||||
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
|
||||
"print(\"Размер тестовой выборки:\", X_test.shape)"
|
||||
@@ -385,7 +373,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 141,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -423,9 +411,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Функция для анализа сбалансированности\n",
|
||||
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
|
||||
" # Распределение классов\n",
|
||||
" print(\"Распределение классов в обучающей выборке:\")\n",
|
||||
" print(y_train.value_counts(normalize=True))\n",
|
||||
" \n",
|
||||
@@ -435,22 +421,18 @@
|
||||
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
||||
" print(y_test.value_counts(normalize=True))\n",
|
||||
"\n",
|
||||
" # Создание фигуры и осей для трех столбчатых диаграмм\n",
|
||||
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
|
||||
" fig.suptitle('Распределение в различных выборках')\n",
|
||||
"\n",
|
||||
" # Обучающая выборка\n",
|
||||
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
|
||||
" axes[0].set_title('Обучающая выборка')\n",
|
||||
" axes[0].set_xlabel(y_name)\n",
|
||||
" axes[0].set_ylabel('Доля')\n",
|
||||
"\n",
|
||||
" # Контрольная выборка\n",
|
||||
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
|
||||
" axes[1].set_title('Контрольная выборка')\n",
|
||||
" axes[1].set_xlabel(y_name)\n",
|
||||
"\n",
|
||||
" # Тестовая выборка\n",
|
||||
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
|
||||
" axes[2].set_title('Тестовая выборка')\n",
|
||||
" axes[2].set_xlabel(y_name)\n",
|
||||
@@ -469,7 +451,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 142,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -509,11 +491,9 @@
|
||||
"source": [
|
||||
"ros = RandomOverSampler(random_state=42)\n",
|
||||
"\n",
|
||||
"# Применение RandomOverSampler для балансировки выборок\n",
|
||||
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
||||
"X_val_resampled, y_val_resampled = ros.fit_resample(X_val, y_val)\n",
|
||||
"\n",
|
||||
"# Проверка сбалансированности после RandomOverSampler\n",
|
||||
"analyze_balance(y_train_resampled, y_val_resampled, y_test, 'stroke')"
|
||||
]
|
||||
},
|
||||
@@ -530,7 +510,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 143,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -575,16 +555,12 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Определение категориальных признаков\n",
|
||||
"categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
|
||||
"\n",
|
||||
"# Применение one-hot encoding к обучающей выборке\n",
|
||||
"X_train_encoded = pd.get_dummies(X_train_resampled, columns=categorical_features, drop_first=True)\n",
|
||||
"\n",
|
||||
"# Применение one-hot encoding к контрольной выборке\n",
|
||||
"X_val_encoded = pd.get_dummies(X_val_resampled, columns=categorical_features, drop_first=True)\n",
|
||||
"\n",
|
||||
"# Применение one-hot encoding к тестовой выборке\n",
|
||||
"X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)\n",
|
||||
"\n",
|
||||
"print(X_train_encoded.head())"
|
||||
@@ -599,7 +575,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 144,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -644,21 +620,17 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Определение числовых признаков для дискретизации\n",
|
||||
"numerical_features = ['age']\n",
|
||||
"\n",
|
||||
"# Функция для дискретизации числовых признаков\n",
|
||||
"def discretize_features(df, features, bins, labels):\n",
|
||||
" for feature in features:\n",
|
||||
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
||||
" df.drop(columns=[feature], inplace=True)\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"# Заданные интервалы и метки\n",
|
||||
"age_bins = [0, 25, 55, 100]\n",
|
||||
"age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
||||
"\n",
|
||||
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
|
||||
"X_train_encoded = discretize_features(X_train_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
|
||||
"X_val_encoded = discretize_features(X_val_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
|
||||
"X_test_encoded = discretize_features(X_test_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
|
||||
@@ -741,7 +713,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 146,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -786,7 +758,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Пример масштабирования числовых признаков\n",
|
||||
"numerical_features = ['avg_glucose_level', 'bmi', 'glucose_age_deviation']\n",
|
||||
"\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
@@ -806,7 +777,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 147,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -872,7 +843,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data = X_train_encoded.copy() # Используем предобработанные данные\n",
|
||||
"data = X_train_encoded.copy()\n",
|
||||
"\n",
|
||||
"es = ft.EntitySet(id=\"patients\")\n",
|
||||
"\n",
|
||||
@@ -918,7 +889,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 148,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -930,23 +901,15 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train_encoded = pd.get_dummies(X_train_encoded, drop_first=True)\n",
|
||||
"X_val_encoded = pd.get_dummies(X_val_encoded, drop_first=True)\n",
|
||||
"X_test_encoded = pd.get_dummies(X_test_encoded, drop_first=True)\n",
|
||||
"\n",
|
||||
"all_columns = X_train_encoded.columns\n",
|
||||
"X_train_encoded = X_train_encoded.reindex(columns=all_columns, fill_value=0)\n",
|
||||
"X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)\n",
|
||||
"X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)\n",
|
||||
"\n",
|
||||
"# Выбор модели\n",
|
||||
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
||||
"\n",
|
||||
"# Начинаем отсчет времени\n",
|
||||
"start_time = time.time()\n",
|
||||
"model.fit(X_train_encoded, y_train_resampled)\n",
|
||||
"\n",
|
||||
"# Время обучения модели\n",
|
||||
"train_time = time.time() - start_time\n",
|
||||
"\n",
|
||||
"print(f'Время обучения модели: {train_time:.2f} секунд')"
|
||||
@@ -954,7 +917,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 149,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -985,11 +948,9 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Получение важности признаков\n",
|
||||
"importances = model.feature_importances_\n",
|
||||
"feature_names = X_train_encoded.columns\n",
|
||||
"\n",
|
||||
"# Сортировка признаков по важности\n",
|
||||
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
||||
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
||||
"\n",
|
||||
@@ -999,7 +960,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 150,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1037,7 +998,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Предсказание и оценка\n",
|
||||
"y_pred = model.predict(X_test_encoded)\n",
|
||||
"\n",
|
||||
"accuracy = accuracy_score(y_test, y_pred)\n",
|
||||
@@ -1052,12 +1012,10 @@
|
||||
"print(f\"F1 Score: {f1}\")\n",
|
||||
"print(f\"ROC AUC: {roc_auc}\")\n",
|
||||
"\n",
|
||||
"# Кросс-валидация\n",
|
||||
"scores = cross_val_score(model, X_train_encoded, y_train_resampled, cv=5, scoring='accuracy')\n",
|
||||
"accuracy_cv = scores.mean()\n",
|
||||
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
|
||||
"\n",
|
||||
"# Анализ важности признаков\n",
|
||||
"feature_importances = model.feature_importances_\n",
|
||||
"feature_names = X_train_encoded.columns\n",
|
||||
"\n",
|
||||
@@ -1069,7 +1027,6 @@
|
||||
"plt.title('Feature Importance')\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Проверка на переобучение\n",
|
||||
"y_train_pred = model.predict(X_train_encoded)\n",
|
||||
"\n",
|
||||
"accuracy_train = accuracy_score(y_train_resampled, y_train_pred)\n",
|
||||
|
||||
@@ -1153,7 +1153,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -1171,13 +1171,10 @@
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score\n",
|
||||
"\n",
|
||||
"# Получаем уникальные классы для целевого признака из тренировочного набора данных\n",
|
||||
"unique_classes = np.unique(y_train)\n",
|
||||
"\n",
|
||||
"# Генерируем случайные предсказания, выбирая случайное значение из области значений целевого признака\n",
|
||||
"random_predictions = np.random.choice(unique_classes, size=len(y_test))\n",
|
||||
"\n",
|
||||
"# Вычисление метрик для ориентира\n",
|
||||
"baseline_accuracy = accuracy_score(y_test, random_predictions)\n",
|
||||
"baseline_precision = precision_score(y_test, random_predictions)\n",
|
||||
"baseline_recall = recall_score(y_test, random_predictions)\n",
|
||||
@@ -1624,7 +1621,6 @@
|
||||
"from sklearn.model_selection import GridSearchCV\n",
|
||||
"from sklearn import neighbors, ensemble, neural_network\n",
|
||||
"\n",
|
||||
"# Словарь с вариантами гиперпараметров для каждой модели\n",
|
||||
"param_grids = {\n",
|
||||
" \"knn\": {\n",
|
||||
" \"n_neighbors\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], \n",
|
||||
@@ -1648,22 +1644,17 @@
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Создаем экземпляры моделей\n",
|
||||
"models = {\n",
|
||||
" \"knn\": neighbors.KNeighborsClassifier(),\n",
|
||||
" \"random_forest\": ensemble.RandomForestClassifier(),\n",
|
||||
" \"mlp\": neural_network.MLPClassifier()\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Словарь для хранения моделей с их лучшими параметрами\n",
|
||||
"class_models = {}\n",
|
||||
"\n",
|
||||
"# Выполнение поиска по сетке для каждой модели\n",
|
||||
"for model_name, model in models.items():\n",
|
||||
" # Создаем GridSearchCV для текущей модели\n",
|
||||
" gs_optimizer = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring=\"f1\", n_jobs=-1)\n",
|
||||
" \n",
|
||||
" # Обучаем GridSearchCV\n",
|
||||
" gs_optimizer.fit(preprocessed_df, y_train.values.ravel())\n",
|
||||
" \n",
|
||||
" # Получаем лучшие параметры\n",
|
||||
@@ -1671,7 +1662,7 @@
|
||||
" print(f\"Лучшие параметры для {model_name}: {best_params}\")\n",
|
||||
" \n",
|
||||
" class_models[model_name] = {\n",
|
||||
" \"model\": model.set_params(**best_params) # Настраиваем модель с лучшими параметрами\n",
|
||||
" \"model\": model.set_params(**best_params) \n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
@@ -2586,7 +2577,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 94,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -2603,10 +2594,8 @@
|
||||
"import math\n",
|
||||
"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
|
||||
"\n",
|
||||
"# Базовое предсказание: среднее значение по y_train\n",
|
||||
"baseline_predictions = [y_train.mean()] * len(y_test)\n",
|
||||
"\n",
|
||||
"# Вычисление метрик качества для ориентира\n",
|
||||
"baseline_rmse = math.sqrt(\n",
|
||||
" mean_squared_error(y_test, baseline_predictions)\n",
|
||||
" )\n",
|
||||
@@ -3111,7 +3100,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Словарь с вариантами гиперпараметров для каждой модели\n",
|
||||
"param_grids = {\n",
|
||||
" \"knn\": {\n",
|
||||
" \"n_neighbors\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], \n",
|
||||
@@ -3136,30 +3124,24 @@
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Создаем экземпляры моделей\n",
|
||||
"models = {\n",
|
||||
" \"knn\": neighbors.KNeighborsRegressor(),\n",
|
||||
" \"random_forest\": ensemble.RandomForestRegressor(),\n",
|
||||
" \"mlp\": neural_network.MLPRegressor()\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Словарь для хранения моделей с их лучшими параметрами\n",
|
||||
"class_models = {}\n",
|
||||
"\n",
|
||||
"# Выполнение поиска по сетке для каждой модели\n",
|
||||
"for model_name, model in models.items():\n",
|
||||
" # Создаем GridSearchCV для текущей модели\n",
|
||||
" gs_optimizer = GridSearchCV(estimator=model, param_grid=param_grids[model_name], scoring='neg_mean_squared_error', n_jobs=-1)\n",
|
||||
" \n",
|
||||
" # Обучаем GridSearchCV\n",
|
||||
" gs_optimizer.fit(preprocessed_df, y_train.values.ravel())\n",
|
||||
" \n",
|
||||
" # Получаем лучшие параметры\n",
|
||||
" best_params = gs_optimizer.best_params_\n",
|
||||
" print(f\"Лучшие параметры для {model_name}: {best_params}\")\n",
|
||||
" \n",
|
||||
" class_models[model_name] = {\n",
|
||||
" \"model\": model.set_params(**best_params) # Настраиваем модель с лучшими параметрами\n",
|
||||
" \"model\": model.set_params(**best_params)\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
@@ -3323,7 +3305,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 100,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -3379,7 +3361,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Создаем графики для всех моделей\n",
|
||||
"for model_name, model_data in class_models.items():\n",
|
||||
" print(f\"Model: {model_name}\")\n",
|
||||
" y_pred = model_data[\"preds\"]\n",
|
||||
|
||||
@@ -308,7 +308,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 353,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -348,17 +348,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Количество пустых значений признаков\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
@@ -367,11 +364,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 354,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Замена значений\n",
|
||||
"df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -114,7 +114,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -122,15 +122,9 @@
|
||||
"import emoji\n",
|
||||
"from num2words import num2words\n",
|
||||
"\n",
|
||||
"# Функция для преобразования эмоджи в слова\n",
|
||||
"def emojis_words(text):\n",
|
||||
" \n",
|
||||
" # Модуль emoji: преобразование эмоджи в их словесные описания\n",
|
||||
" text = emoji.demojize(text, delimiters=(\" \", \" \"))\n",
|
||||
" \n",
|
||||
" # Редактирование текста путём замены \":\" и\" _\", а так же - путём добавления пробела между отдельными словами\n",
|
||||
" text = text.replace(\":\", \"\").replace(\"_\", \" \")\n",
|
||||
" \n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"def transform_text(text):\n",
|
||||
@@ -140,22 +134,17 @@
|
||||
" # Удаление из текста всех URL и ссылок\n",
|
||||
" text = re.sub(r'http\\S+', '', text)\n",
|
||||
"\n",
|
||||
" # Преобразование эмоджи в текст\n",
|
||||
" text = emojis_words(text)\n",
|
||||
"\n",
|
||||
" # Приведение к нижнему регистру\n",
|
||||
" text = text.lower()\n",
|
||||
"\n",
|
||||
" # Удаление лишних пробелов\n",
|
||||
" text = re.sub(r'\\s+', ' ', text) \n",
|
||||
" \n",
|
||||
" # Преобразование \"ё\" в \"е\"\n",
|
||||
" text = text.replace(\"ё\", \"е\")\n",
|
||||
"\n",
|
||||
" # Удаление всех специальных символов\n",
|
||||
" text = re.sub(r'[^a-zA-Zа-яА-Я0-9\\s]', '', text)\n",
|
||||
"\n",
|
||||
" # Преобразование чисел в слова\n",
|
||||
" words: list[str] = text.split()\n",
|
||||
" words = [num2words(word, lang=\"ru\") if word.isdigit() else word for word in words]\n",
|
||||
" text = \" \".join(words)\n",
|
||||
@@ -177,7 +166,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -197,16 +186,15 @@
|
||||
" doc = sp(text)\n",
|
||||
" \n",
|
||||
" filtered_tokens = [\n",
|
||||
" f\"{token.lemma_}_{token.pos_}_{token.morph}\" # Формирование строки с нужным форматом\n",
|
||||
" f\"{token.lemma_}_{token.pos_}_{token.morph}\"\n",
|
||||
" for token in doc\n",
|
||||
" if token.text not in stop_words and len(token.text) <= 20 # Фильтрация \n",
|
||||
" if token.text not in stop_words and len(token.text) <= 20 \n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
" return \" \".join(filtered_tokens)\n",
|
||||
"\n",
|
||||
"df[\"preprocessed_text\"] = df[\"preprocessed_text\"].apply(preprocess_text)\n",
|
||||
"\n",
|
||||
"# Выведем 10 токенов из первого текста\n",
|
||||
"first_text_tokens = df[\"preprocessed_text\"].iloc[0].split()[:10]\n",
|
||||
"print(\" \".join(first_text_tokens))"
|
||||
]
|
||||
@@ -220,7 +208,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -299,10 +287,8 @@
|
||||
" n_grams: list[tuple] = list(ngrams(tokens, n))\n",
|
||||
" return n_grams\n",
|
||||
"\n",
|
||||
"# Пример для биграмм (N=2)\n",
|
||||
"df[\"bigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=2))\n",
|
||||
"\n",
|
||||
"# Пример для триграмм (N=3)\n",
|
||||
"df[\"trigrams\"] = df[\"preprocessed_text\"].apply(lambda x: generate_ngrams(x, n=3))\n",
|
||||
"\n",
|
||||
"print(df.iloc[15:25])"
|
||||
|
||||
@@ -98,7 +98,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"id": "5b915c12",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -117,11 +117,8 @@
|
||||
"def preprocess_images(images):\n",
|
||||
" processed_images = []\n",
|
||||
" for img in images:\n",
|
||||
" # Изменение размера\n",
|
||||
" img_resized = cv2.resize(img, (128, 128))\n",
|
||||
" # Преобразование в оттенки серого\n",
|
||||
" img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)\n",
|
||||
" # Увеличение контраста с помощью выравнивания гистограммы\n",
|
||||
" img_eq = cv2.equalizeHist(img_gray)\n",
|
||||
" processed_images.append(img_eq)\n",
|
||||
" return np.array(processed_images)\n",
|
||||
@@ -156,7 +153,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"id": "7cc2f6b2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -173,12 +170,9 @@
|
||||
],
|
||||
"source": [
|
||||
"def apply_filters(img):\n",
|
||||
" # Удаление шумов\n",
|
||||
" img_blur = cv2.GaussianBlur(img, (5, 5), 0)\n",
|
||||
" # Повышение резкости\n",
|
||||
" kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])\n",
|
||||
" img_sharp = cv2.filter2D(img_blur, -1, kernel)\n",
|
||||
" # Определение границ\n",
|
||||
" img_edges = cv2.Canny(img_sharp, 100, 200)\n",
|
||||
" return img_edges\n",
|
||||
"\n",
|
||||
|
||||
Reference in New Issue
Block a user