import pandas as pd
import matplotlib.pyplot as plt

# Read the Forbes billionaires table from its CSV file and list the column
# names. The recorded cell output shows these columns: 'Rank ' (note the
# trailing space), 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'.
df = pd.read_csv(
    filepath_or_buffer="C://Users//annal//aim//static//csv//Forbes_Billionaires.csv"
)
print(df.columns)
# Discard the 'Rank ' and 'Name' columns: the notebook treats them as unique
# per-row values that do not take part in the predictions. The column label
# 'Rank ' really does end with a space in this dataset.
df = df.drop(columns=["Rank ", "Name"])

# Print the first rows to confirm that both columns are gone.
print(df.head())
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# One-hot encode the nominal 'Country' and 'Industry' columns with
# pandas.get_dummies; drop_first=True removes one indicator per column.
df_country = pd.get_dummies(df[['Country']], drop_first=True)
df_industry = pd.get_dummies(df[['Industry']], drop_first=True)

# Turn the free-text 'Source' column into TF-IDF features (at most 1000 terms).
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()

# Wrap the TF-IDF matrix in a DataFrame. Reusing df.index keeps the rows
# aligned with the other frames when they are concatenated column-wise below.
df_source_tfidf = pd.DataFrame(
    X_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=df.index,
)

# Age category boundaries. Intervals are left-closed (right=False), so the
# last bin must be open-ended: with an upper edge of 100 an age of 100 or more
# would fall outside every bin and become NaN instead of '80+'.
bins = [0, 30, 40, 50, 60, 70, 80, float("inf")]
labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']  # category labels

# Categorical Series with the age group of every row. It keeps the name 'Age',
# so it becomes the 'Age' column of the combined frame.
df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# Remove the original 'Country', 'Industry', 'Source' and 'Age' columns from
# the source frame; their encoded replacements are added back below.
df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)

# Combine the remaining column(s) with all encoded features in one DataFrame.
df_transformed = pd.concat(
    [df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1
)

# Inspect the result.
print(df_transformed.head())
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Reuse the helper from lab_4/utils.py instead of keeping a second, identical
# copy of the function inside the notebook.
from utils import split_stratified_into_train_val_test

# Split 80/20 into train and test sets (no validation part), stratified by the
# binned 'Age' target. The frame to split is df_transformed: it is the one that
# carries the categorical 'Age' column, whereas 'Age' has already been dropped
# from df, so passing df would fail the helper's column check.
X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df_transformed,
    stratify_colname="Age",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=None,
)

# NOTE(review): the helper returns X frames that still contain every column,
# including the 'Age' target; drop it from X before fitting a model.
display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Split a dataframe into stratified train, validation and test subsets.

    The split follows the fractional ratios provided by the caller, and each
    subset is stratified by the values of one column (that is, each subset has
    the same relative frequency of the values in that column). The splitting
    is performed by running train_test_split() up to two times.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (compared with a floating-point tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        The three splits of the full dataframe (all columns, including the
        stratification column), followed by the matching single-column
        dataframes holding only the stratification column. When
        ``frac_val <= 0`` no validation split is made and both ``df_val`` and
        ``y_val`` are empty dataframes.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or if ``stratify_colname`` is
        not a column of ``df_input``.
    """
    # Imported locally so the fix does not depend on the module's import block.
    import math

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 evaluate to
    # 0.9999999999999999 in binary floating point and must not be rejected.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0, rel_tol=1e-9):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Without a validation fraction the temp part is the test set as a whole.
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes. The test share is
    # rescaled because it is now a fraction of the temp part, not of the input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: every input row ended up in exactly one of the subsets.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test