From 41c4ab91edd307d6fc96e24670a1b7639b6a9bbb Mon Sep 17 00:00:00 2001
From: "annalyovushkina@yandex.ru" <annalyovushkina@yandex.ru>
Date: Fri, 15 Nov 2024 22:35:48 +0400
Subject: [PATCH] =?UTF-8?q?=D0=BA=D0=BE=D0=BC=D0=BC=D0=B8=D1=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lab_1/lab1.ipynb                        |   2 +-
 lab_4/__pycache__/utils.cpython-312.pyc | Bin 0 -> 2826 bytes
 lab_4/lab4.ipynb                        | 240 ++++++++++++++++++++++++
 lab_4/utils.py                          |  79 ++++++++
 4 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 lab_4/__pycache__/utils.cpython-312.pyc
 create mode 100644 lab_4/lab4.ipynb
 create mode 100644 lab_4/utils.py

diff --git a/lab_1/lab1.ipynb b/lab_1/lab1.ipynb
index acd4cbb..716945b 100644
--- a/lab_1/lab1.ipynb
+++ b/lab_1/lab1.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
diff --git a/lab_4/__pycache__/utils.cpython-312.pyc b/lab_4/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d965a85f693d9f63443297b54e49ad0ec6741b7
GIT binary patch
literal 2826
zcma)8O>7fK6yEjP>$M&KBnX00RHg;e7#RZv8k!nV3IxKRfbi2sk*tk(94}e#nw@om
zqqP*F9wKqct*Jn2r1p>s4v|__tw1VLFO_<+>7mvNq)ME63nC{@eKX#59Jny{?##}-
z_vXFteQ$REiblg6w8YK9sb669l0JAX<PKhs!{8fE;S^ruO8hv_)3HzUjSJ8Vny=&^
z_p>>_7AT41BF}}|KP9LHFpvEkzk-+qdj+2&ejOMOR(&byKe|zhcLLWcWes0~QRtLx
z$s?v*!tju=Oj*@)7A97Xlr`0IL=#&TQ_qW3(QQHK1-M>^ebigqXA5)iX#441;im^V
z&g`~$cwkNOQ3=~$5az(jLwEizZ}T>H3oN|dW_UB!p)J__TY6kgv^krf;%RiXpcU{e
z*rFY@g9>l^<^W+k@W}trU7PTOq*M#l!nKGkD1qr6;NumGM#7ri_N!w~@_g->3wd)i
z*69I`BffJS#3ET<4!K(_&#qy&d+7Fd$PT}!WTYUD0j3l=y4PF;BIxRfHDpKN*Y!@6
z(ORq)uO&Kse&5)low0T3$2#XbW45Dq%nsXeJ3)O-iE=-{97fQ_D0v8y(PddzWP%hR
zS%H!ask&t#t7u}3NM({>kQ!jDOQRWCOCyj+lz%B^r(kHBF{|oR2(<H-YUr|tOc@3Q
zl}%$tRj`64t6&hL3c+R?%@(nV5tj2s*FI7SGeuw*OD(7y3b0=V6YvQ9BxRh3#XQOz
zTBW3ePes`R)$}G)MY&~*ZbxV|Cf2~{3<me`?F!cORb&*HHyg&OaLZ~cgGMb>#%94V
zO9aF`*C<*RjS@mJE4ogHZ<pdg3R$yi9%sTVoXawW)52!UqkZkhR+U0d)yowN4ZV{A
zwoY?2$^<qt19_Q*u&6JYPQ<D?@Ls3U4j}-st$z*TV$*KKE^-XGSyj`hHOPsAAX=)(
z(9hG{=^1pLRAf!7wvACNJnUzUiUwH)P(hKRr^%C8YxzWJkYmt*HwDd*EoOpE1LP3a
zJJXa+o2O~?)O(jmv#M1D-l#wsy4waKUK9)ym$~$S+@R`)|8x_onPHaQ?31Dw3cgb|
zF`;NcH88*1jD|7+&bLI{_DH3KARz}c2U>BPAY+tr#F8zV2dJVnx@71$jR2#oY;laP
zjp`tx=W*N64N7uq3h*q;%mZ40Z=Lf4DVJ7P*>S&#Q_K_)is_^gj37GJGh~O_Tb=TX
zoMuwD04T;|fYhybb||#snQ9$~YbWMXUS$)sr;v6pq!>syEF>!ms+3(M=63CYY9*}&
zk1SBVo066i9LXhH)d{uUWa=Dtx;haw>1!t?I$?&+Y11@JC)naIG3hJA5j3njzOrJ%
ziN>Y69m%UGC*fK5%EAe6kkXMnCOV<kEbzjlF9e7G(&4Mj4g=B&Qb=*B>;$VVXdMv-
zbfU*QRdQ_;j@W|T)gqP?sM0jFbcjW|VGBy@P3=uKKQwV2jzuyd>u_>T$ZBa~c=TfK
z^5~?p@6f^Ag{m=gofM~4T^`O1Pe7gJ4o_4pRU?^l)!EH>51jK3PB>F6qvb5!oP8&l
z2zHj>BIn@EKF_T|C-p3zZ3O$4s*T{l%E@QJ{m*0Dn_Yv=cyj6J@>liP{#ODYJ@8r(
z;^8$;2!+?eoV0!E&ObZ%H{j32>&fHwFRnI{V}Hl4)un6yN{RX5rj%%gl67gvE5R2M
z=ZDwB+}@$46k0gAczbzlWvH>|$kT~+!M9gz`1|M2E=(=XE{kgtCnXjS-mN{4_beX0
zSNlO&KKp(2XJPfsPhAa^eHPEwrEEQ(ZN_^a^nBO%ZQuP5R(e-Ie)7rh+268{kN^2;
zJ$|JwU0D;k9lMr`tI0<HP$T(y-M{Vmo+C{u-jrg^?%rn4?&j7H*CS#-Z2i&v$ikV$
z>q{f+39c)?aOd8!2Pc=M`xhUbTooUlZ}bg5+4bwx<8-5M<XL2-?jNZ~Mw*c=i#L{T
zE>EpgRtxn=w(ifq21vP7z}yPm)FIYho^S_$oN&vhe&SL}azv|IhDtN(H<=@r;V_Vi
z8G{E#UZ<B#Ygjh*OvzBNmLpih>`lsu&eHcIXMRY<cf#$Blm)qphL4;5@JPSb$Z5E(
z2|Ulg;I_Zyw!Gl_UT}vYY1@*`cuzCh-Hi1#ySKiIp5ggpYcWoYEPS@8E+<z;SF2A`
Q^~m`~;KF*GF0yU@2OB6GD*ylh

literal 0
HcmV?d00001

diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb
new file mode 100644
index 0000000..f016a48
--- /dev/null
+++ b/lab_4/lab4.ipynb
@@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
+    "print(df.columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Определим бизнес-цели:\n",
+    "## 1. Прогнозирование состояния миллиардера (регрессия)\n",
+    "## 2. Прогнозирование возрастной группы миллиардера (классификация)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Подготовим данные: удалим колонки rank и name (в них уникальные значения, которые не участвуют в предсказаниях). Также преобразуем номинальные колонки (country, source, industry) в числовые и категоризируем колонку age"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Networth  Age        Country              Source                Industry\n",
+      "0     219.0   50  United States       Tesla, SpaceX             Automotive \n",
+      "1     171.0   58  United States              Amazon             Technology \n",
+      "2     158.0   73         France                LVMH       Fashion & Retail \n",
+      "3     129.0   66  United States           Microsoft             Technology \n",
+      "4     118.0   91  United States  Berkshire Hathaway  Finance & Investments \n"
+     ]
+    }
+   ],
+   "source": [
+    "# Удаление колонок 'rank' и 'name'\n",
+    "df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
+    "\n",
+    "# Проверка, что колонки были удалены\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Networth  Country_Argentina  Country_Australia  Country_Austria  \\\n",
+      "0     219.0              False              False            False   \n",
+      "1     171.0              False              False            False   \n",
+      "2     158.0              False              False            False   \n",
+      "3     129.0              False              False            False   \n",
+      "4     118.0              False              False            False   \n",
+      "\n",
+      "   Country_Barbados  Country_Belgium  Country_Belize  Country_Brazil  \\\n",
+      "0             False            False           False           False   \n",
+      "1             False            False           False           False   \n",
+      "2             False            False           False           False   \n",
+      "3             False            False           False           False   \n",
+      "4             False            False           False           False   \n",
+      "\n",
+      "   Country_Bulgaria  Country_Canada  ...  wind  wine  winter  wire  wireless  \\\n",
+      "0             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "1             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "2             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "3             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "4             False           False  ...   0.0   0.0     0.0   0.0       0.0   \n",
+      "\n",
+      "   yahoo  yogurt  zara  zoom    Age  \n",
+      "0    0.0     0.0   0.0   0.0  50-60  \n",
+      "1    0.0     0.0   0.0   0.0  50-60  \n",
+      "2    0.0     0.0   0.0   0.0  70-80  \n",
+      "3    0.0     0.0   0.0   0.0  60-70  \n",
+      "4    0.0     0.0   0.0   0.0    80+  \n",
+      "\n",
+      "[5 rows x 828 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n",
+    "df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
+    "df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
+    "\n",
+    "# Преобразуем колонку 'source' с помощью TF-IDF\n",
+    "tfidf_vectorizer = TfidfVectorizer(max_features=1000)  \n",
+    "X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
+    "\n",
+    "# Создаем DataFrame с результатами TF-IDF\n",
+    "df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
+    "\n",
+    "bins = [0, 30, 40, 50, 60, 70, 80, 100]  # границы для возрастных категорий\n",
+    "labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']  # метки для категорий\n",
+    "\n",
+    "# Создаем новую колонку 'age_group', где будет храниться категория\n",
+    "df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
+    "\n",
+    "\n",
+    "# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame\n",
+    "df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
+    "\n",
+    "# Объединяем все преобразованные данные в один DataFrame\n",
+    "df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
+    "\n",
+    "# Просмотр результата\n",
+    "print(df_transformed.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Разобьём набор данных на обучающую и тестовую выборки (80/20) для задачи классификации. Целевой признак — Age"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'df_transformed' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m     42\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m     43\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m     45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m     \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m     47\u001b[0m )\n\u001b[0;32m     49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m     50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "from typing import Tuple\n",
+    "import pandas as pd\n",
+    "from pandas import DataFrame\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "def split_stratified_into_train_val_test(\n",
+    "    df_input,\n",
+    "    stratify_colname=\"y\",\n",
+    "    frac_train=0.6,\n",
+    "    frac_val=0.15,\n",
+    "    frac_test=0.25,\n",
+    "    random_state=None,\n",
+    ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+    "    \"\"\"Return stratified (df_train, df_val, df_test, y_train, y_val, y_test); the val parts are empty if frac_val <= 0.\"\"\"\n",
+    "    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:  # tolerance: 0.7 + 0.2 + 0.1 is not exactly 1.0 in floats\n",
+    "        raise ValueError(\n",
+    "            \"fractions %f, %f, %f do not add up to 1.0\"\n",
+    "            % (frac_train, frac_val, frac_test)\n",
+    "        )\n",
+    "    if stratify_colname not in df_input.columns:\n",
+    "        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+    "    X = df_input  # Contains all columns.\n",
+    "    y = df_input[\n",
+    "        [stratify_colname]\n",
+    "    ]  # Dataframe of just the column on which to stratify.\n",
+    "    # Split original dataframe into train and temp dataframes.\n",
+    "    df_train, df_temp, y_train, y_temp = train_test_split(\n",
+    "        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+    "    )\n",
+    "    if frac_val <= 0:\n",
+    "        assert len(df_input) == len(df_train) + len(df_temp)\n",
+    "        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
+    "    # Split the temp dataframe into val and test dataframes.\n",
+    "    relative_frac_test = frac_test / (frac_val + frac_test)\n",
+    "    df_val, df_test, y_val, y_test = train_test_split(\n",
+    "        df_temp,\n",
+    "        y_temp,\n",
+    "        stratify=y_temp,\n",
+    "        test_size=relative_frac_test,\n",
+    "        random_state=random_state,\n",
+    "    )\n",
+    "    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+    "    return df_train, df_val, df_test, y_train, y_val, y_test\n",
+    "\n",
+    "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+    "    df_transformed, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
+    ")  # df_transformed holds the binned 'Age' column; 'Age' was dropped from df itself\n",
+    "\n",
+    "display(\"X_train\", X_train)\n",
+    "display(\"y_train\", y_train)\n",
+    "\n",
+    "display(\"X_test\", X_test)\n",
+    "display(\"y_test\", y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/lab_4/utils.py b/lab_4/utils.py
new file mode 100644
index 0000000..cb8c396
--- /dev/null
+++ b/lab_4/utils.py
@@ -0,0 +1,79 @@
+from typing import Tuple
+
+import pandas as pd
+from pandas import DataFrame
+from sklearn.model_selection import train_test_split
+
+
+def split_stratified_into_train_val_test(
+    df_input,
+    stratify_colname="y",
+    frac_train=0.6,
+    frac_val=0.15,
+    frac_test=0.25,
+    random_state=None,
+) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
+    """
+    Splits a Pandas dataframe into three subsets (train, val, and test)
+    following fractional ratios provided by the user, where each subset is
+    stratified by the values in a specific column (that is, each subset has
+    the same relative frequency of the values in the column). It performs this
+    splitting by running train_test_split() twice.
+
+    Parameters
+    ----------
+    df_input : Pandas dataframe
+        Input dataframe to be split.
+    stratify_colname : str
+        The name of the column that will be used for stratification. Usually
+        this column would be for the label.
+    frac_train : float
+    frac_val   : float
+    frac_test  : float
+        The ratios with which the dataframe will be split into train, val, and
+        test data. The values should be expressed as float fractions and should
+        sum to 1.0.
+    random_state : int, None, or RandomStateInstance
+        Value to be passed to train_test_split().
+
+    Returns
+    -------
+    df_train, df_val, df_test :
+        Dataframes containing the three splits.
+    """
+
+    if frac_train + frac_val + frac_test != 1.0:
+        raise ValueError(
+            "fractions %f, %f, %f do not add up to 1.0"
+            % (frac_train, frac_val, frac_test)
+        )
+
+    if stratify_colname not in df_input.columns:
+        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
+
+    X = df_input  # Contains all columns.
+    y = df_input[
+        [stratify_colname]
+    ]  # Dataframe of just the column on which to stratify.
+
+    # Split original dataframe into train and temp dataframes.
+    df_train, df_temp, y_train, y_temp = train_test_split(
+        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
+    )
+
+    if frac_val <= 0:
+        assert len(df_input) == len(df_train) + len(df_temp)
+        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
+
+    # Split the temp dataframe into val and test dataframes.
+    relative_frac_test = frac_test / (frac_val + frac_test)
+    df_val, df_test, y_val, y_test = train_test_split(
+        df_temp,
+        y_temp,
+        stratify=y_temp,
+        test_size=relative_frac_test,
+        random_state=random_state,
+    )
+
+    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
+    return df_train, df_val, df_test, y_train, y_val, y_test