коммит

2024-11-15 22:35:48 +04:00 · 2024-11-15 22:35:48 +04:00 · 41c4ab91ed
commit 41c4ab91ed
parent eadc896314
4 changed files with 320 additions and 1 deletions
--- a/lab_1/lab1.ipynb
+++ b/lab_1/lab1.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
--- a/lab_4/pycache/utils.cpython-312.pyc
+++ b/lab_4/pycache/utils.cpython-312.pyc
--- a/lab_4/lab4.ipynb
+++ b/lab_4/lab4.ipynb
@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "# Read the Forbes billionaires dataset. NOTE: this is an absolute, machine-specific path.\n",
+    "csv_path = \"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\"\n",
+    "df = pd.read_csv(csv_path)\n",
+    "# Show the column labels (in the saved output 'Rank ' carries a trailing space).\n",
+    "print(df.columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Определим бизнес-цели:\n",
+    "## 1. Прогнозирование состояния миллиардера (регрессия)\n",
+    "## 2. Прогнозирование возрастной группы миллиардера (классификация)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Подготовим данные: удалим колонки Rank и Name (в них уникальные значения, которые не участвуют в предсказаниях), преобразуем номинальные колонки (Country, Source, Industry) в числовые и разобьём колонку Age на возрастные категории"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Networth  Age        Country              Source                Industry\n",
+      "0     219.0   50  United States       Tesla, SpaceX             Automotive \n",
+      "1     171.0   58  United States              Amazon             Technology \n",
+      "2     158.0   73         France                LVMH       Fashion & Retail \n",
+      "3     129.0   66  United States           Microsoft             Technology \n",
+      "4     118.0   91  United States  Berkshire Hathaway  Finance & Investments \n"
+     ]
+    }
+   ],
+   "source": [
+    "# Drop 'Rank ' (the CSV header carries a trailing space) and 'Name': per the notebook's\n",
+    "# narrative they hold unique values that do not take part in the predictions.\n",
+    "# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run:\n",
+    "# a second execution no longer raises KeyError for columns that are already gone.\n",
+    "df = df.drop(columns=['Rank ', 'Name'], errors='ignore')\n",
+    "\n",
+    "# Confirm that the columns were removed\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Networth  Country_Argentina  Country_Australia  Country_Austria  \\\n",
+      "0     219.0              False              False            False   \n",
+      "1     171.0              False              False            False   \n",
+      "2     158.0              False              False            False   \n",
+      "3     129.0              False              False            False   \n",
+      "4     118.0              False              False            False   \n",
+      "\n",
+      "   Country_Barbados  Country_Belgium  Country_Belize  Country_Brazil  \\\n",
+      "0             False            False           False           False   \n",
+      "1             False            False           False           False   \n",
+      "2             False            False           False           False   \n",
+      "3             False            False           False           False   \n",
+      "4             False            False           False           False   \n",
+      "\n",
+      "   Country_Bulgaria  Country_Canada  ...  wind  wine  winter  wire  wireless  \\\n",
+      "0             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "1             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "2             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "3             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "4             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "\n",
+      "   yahoo  yogurt  zara  zoom    Age  \n",
+      "0    0.0     0.0   0.0   0.0  50-60  \n",
+      "1    0.0     0.0   0.0   0.0  50-60  \n",
+      "2    0.0     0.0   0.0   0.0  70-80  \n",
+      "3    0.0     0.0   0.0   0.0  60-70  \n",
+      "4    0.0     0.0   0.0   0.0    80+  \n",
+      "\n",
+      "[5 rows x 828 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# One-hot encode 'Country' and 'Industry'; drop_first=True omits the first level of each\n",
+    "df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
+    "df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
+    "\n",
+    "# Vectorise the free-text 'Source' column with TF-IDF (vocabulary capped at 1000 terms)\n",
+    "tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
+    "X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
+    "\n",
+    "# Wrap the TF-IDF matrix in a DataFrame (one column per term). Reusing df.index keeps the\n",
+    "# rows aligned with df when everything is concatenated column-wise below.\n",
+    "df_source_tfidf = pd.DataFrame(\n",
+    "    X_tfidf, columns=tfidf_vectorizer.get_feature_names_out(), index=df.index\n",
+    ")\n",
+    "\n",
+    "# Age bands are left-closed because right=False: [0, 30), [30, 40), ..., [80, +inf).\n",
+    "# The last edge is +inf so that '80+' really is open-ended; with an upper edge of 100\n",
+    "# any age of 100 or more fell outside every band and became NaN.\n",
+    "bins = [0, 30, 40, 50, 60, 70, 80, float('inf')]  # band edges\n",
+    "labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']  # band labels\n",
+    "\n",
+    "# Categorical age band for every row; the resulting Series keeps the name 'Age'\n",
+    "df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
+    "\n",
+    "\n",
+    "# Remove the original 'Country', 'Industry', 'Source' and 'Age' columns from df\n",
+    "df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
+    "\n",
+    "# Combine the remaining columns with all encoded features into one DataFrame\n",
+    "df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
+    "\n",
+    "# Preview the result\n",
+    "print(df_transformed.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Разобьём набор данных на обучающую и тестовую выборки (80/20) для задачи классификации. Целевой признак — Age (возрастная группа)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'df_transformed' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m     42\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m     43\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m     45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m     \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m     47\u001b[0m )\n\u001b[0;32m     49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m     50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "from typing import Tuple\n",
+    "import pandas as pd\n",
+    "from pandas import DataFrame\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "def split_stratified_into_train_val_test(\n",
+    "    df_input,\n",
+    "    stratify_colname=\"y\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.15,\n",
+    "    frac_test=0.25,\n",
+    "    random_state=None,\n",
+    ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+    "    \"\"\"Split a dataframe into stratified train / validation / test subsets.\n",
+    "\n",
+    "    Runs train_test_split() once to carve off the training part and, when\n",
+    "    frac_val > 0, a second time to divide the remainder into validation and test.\n",
+    "    Both splits are stratified on `stratify_colname`.\n",
+    "\n",
+    "    Returns (df_train, df_val, df_test, y_train, y_val, y_test). The df_* frames keep\n",
+    "    every input column (including the stratification column); the y_* frames hold only\n",
+    "    that column. When frac_val <= 0 both validation entries are empty DataFrames.\n",
+    "\n",
+    "    Raises ValueError if the fractions do not sum to 1.0 or the column is missing.\n",
+    "    \"\"\"\n",
+    "    import math  # local import: keeps this definition self-contained\n",
+    "\n",
+    "    # Tolerant comparison: e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999 in\n",
+    "    # floating point and was wrongly rejected by an exact `!= 1.0` check.\n",
+    "    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
+    "        raise ValueError(\n",
+    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+    "            % (frac_train, frac_val, frac_test)\n",
+    "        )\n",
+    "    if stratify_colname not in df_input.columns:\n",
+    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+    "    X = df_input  # Contains all columns.\n",
+    "    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
+    "    # First split: train versus everything else.\n",
+    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
+    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+    "    )\n",
+    "    if frac_val <= 0:\n",
+    "        # No validation part requested: the whole remainder is the test set.\n",
+    "        assert len(df_input) == len(df_train) + len(df_temp)\n",
+    "        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
+    "    # Second split: divide the remainder into validation and test, using the share of\n",
+    "    # the remainder (not of the whole frame) that belongs to the test set.\n",
+    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+    "    df_val, df_test, y_val, y_test = train_test_split(\n",
+    "        df_temp,\n",
+    "        y_temp,\n",
+    "        stratify=y_temp,\n",
+    "        test_size=relative_frac_test,\n",
+    "        random_state=random_state,\n",
+    "    )\n",
+    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+    "    return df_train, df_val, df_test, y_train, y_val, y_test\n",
+    "\n",
+    "# Stratify on the binned 'Age' column, which lives in df_transformed: the raw 'Age' column\n",
+    "# was dropped from df during preprocessing, so passing df fails the column check.\n",
+    "# A fixed random_state makes the 80/20 split reproducible across kernel restarts.\n",
+    "# NOTE: X_train / X_test keep every column of the input, including the 'Age' target.\n",
+    "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+    "    df_transformed, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n",
+    ")\n",
+    "\n",
+    "display(\"X_train\", X_train)\n",
+    "display(\"y_train\", y_train)\n",
+    "\n",
+    "display(\"X_test\", X_test)\n",
+    "display(\"y_test\", y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/lab_4/utils.py
+++ b/lab_4/utils.py
@ -0,0 +1,79 @@
+from typing import Tuple
+
+import pandas as pd
+from pandas import DataFrame
+from sklearn.model_selection import train_test_split
+
+
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice; when ``frac_val <= 0`` only
    the first split is performed and the validation outputs are empty.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (checked with a floating-point tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits. They keep every column of
        ``df_input``, including ``stratify_colname``.
    y_train, y_val, y_test :
        Single-column dataframes holding ``stratify_colname`` for the
        corresponding split.
        When ``frac_val <= 0``, ``df_val`` and ``y_val`` are empty dataframes.

    Raises
    ------
    ValueError
        If the three fractions do not sum to 1.0, or if ``stratify_colname``
        is not a column of ``df_input``.
    """
    # Imported here so the function does not depend on edits to the module header.
    import math

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 evaluate to
    # 0.9999999999999999 in binary floating point and were wrongly rejected by
    # the previous exact `!= 1.0` test.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        # No validation part requested: the whole remainder becomes the test set.
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes. The test share is
    # expressed relative to the remainder, not to the whole input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test