коммит
This commit is contained in:
parent
eadc896314
commit
41c4ab91ed
@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
BIN
lab_4/__pycache__/utils.cpython-312.pyc
Normal file
BIN
lab_4/__pycache__/utils.cpython-312.pyc
Normal file
Binary file not shown.
240
lab_4/lab4.ipynb
Normal file
240
lab_4/lab4.ipynb
Normal file
@ -0,0 +1,240 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
|
||||||
|
"print(df.columns)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Определим бизнес цели:\n",
|
||||||
|
"## 1- Прогнозирование состояния миллиардера(регрессия)\n",
|
||||||
|
"## 2- Прогнозирование возраста миллиардера(классификация)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Подготовим данные: удалим колонки rank и name(в них уникальные значения, которые не участвуют в предсказаниях). А также преобразуем номинальные колонки в числовые(country, source, industry) и категоризируем колонку age"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Networth Age Country Source Industry\n",
|
||||||
|
"0 219.0 50 United States Tesla, SpaceX Automotive \n",
|
||||||
|
"1 171.0 58 United States Amazon Technology \n",
|
||||||
|
"2 158.0 73 France LVMH Fashion & Retail \n",
|
||||||
|
"3 129.0 66 United States Microsoft Technology \n",
|
||||||
|
"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Удаление колонок 'rank' и 'name'\n",
|
||||||
|
"df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Проверка, что колонки были удалены\n",
|
||||||
|
"print(df.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Networth Country_Argentina Country_Australia Country_Austria \\\n",
|
||||||
|
"0 219.0 False False False \n",
|
||||||
|
"1 171.0 False False False \n",
|
||||||
|
"2 158.0 False False False \n",
|
||||||
|
"3 129.0 False False False \n",
|
||||||
|
"4 118.0 False False False \n",
|
||||||
|
"\n",
|
||||||
|
" Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
|
||||||
|
"0 False False False False \n",
|
||||||
|
"1 False False False False \n",
|
||||||
|
"2 False False False False \n",
|
||||||
|
"3 False False False False \n",
|
||||||
|
"4 False False False False \n",
|
||||||
|
"\n",
|
||||||
|
" Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
|
||||||
|
"0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"\n",
|
||||||
|
" yahoo yogurt zara zoom Age \n",
|
||||||
|
"0 0.0 0.0 0.0 0.0 50-60 \n",
|
||||||
|
"1 0.0 0.0 0.0 0.0 50-60 \n",
|
||||||
|
"2 0.0 0.0 0.0 0.0 70-80 \n",
|
||||||
|
"3 0.0 0.0 0.0 0.0 60-70 \n",
|
||||||
|
"4 0.0 0.0 0.0 0.0 80+ \n",
|
||||||
|
"\n",
|
||||||
|
"[5 rows x 828 columns]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"\n",
|
||||||
|
"# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n",
|
||||||
|
"df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
|
||||||
|
"df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Преобразуем колонку 'source' с помощью TF-IDF\n",
|
||||||
|
"tfidf_vectorizer = TfidfVectorizer(max_features=1000) \n",
|
||||||
|
"X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
|
||||||
|
"\n",
|
||||||
|
"# Создаем DataFrame с результатами TF-IDF\n",
|
||||||
|
"df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
|
||||||
|
"\n",
|
||||||
|
"bins = [0, 30, 40, 50, 60, 70, 80, 100] # границы для возрастных категорий\n",
|
||||||
|
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # метки для категорий\n",
|
||||||
|
"\n",
|
||||||
|
"# Создаем новую колонку 'age_group', где будет храниться категория\n",
|
||||||
|
"df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame\n",
|
||||||
|
"df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Объединяем все преобразованные данные в один DataFrame\n",
|
||||||
|
"df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"# Просмотр результата\n",
|
||||||
|
"print(df_transformed.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Разобьём набор данных на обучающую и тестовые выборки (80/20) для задачи классификации. Целевой признак- Age"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "NameError",
|
||||||
|
"evalue": "name 'df_transformed' is not defined",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
|
||||||
|
"\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from typing import Tuple\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from pandas import DataFrame\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"def split_stratified_into_train_val_test(\n",
|
||||||
|
" df_input,\n",
|
||||||
|
" stratify_colname=\"y\",\n",
|
||||||
|
" frac_train=0.6,\n",
|
||||||
|
" frac_val=0.15,\n",
|
||||||
|
" frac_test=0.25,\n",
|
||||||
|
" random_state=None,\n",
|
||||||
|
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
|
||||||
|
" \n",
|
||||||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
||||||
|
" raise ValueError(\n",
|
||||||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||||||
|
" % (frac_train, frac_val, frac_test)\n",
|
||||||
|
" )\n",
|
||||||
|
" if stratify_colname not in df_input.columns:\n",
|
||||||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||||||
|
" X = df_input # Contains all columns.\n",
|
||||||
|
" y = df_input[\n",
|
||||||
|
" [stratify_colname]\n",
|
||||||
|
" ] # Dataframe of just the column on which to stratify.\n",
|
||||||
|
" # Split original dataframe into train and temp dataframes.\n",
|
||||||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||||||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||||||
|
" )\n",
|
||||||
|
" if frac_val <= 0:\n",
|
||||||
|
" assert len(df_input) == len(df_train) + len(df_temp)\n",
|
||||||
|
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
|
||||||
|
" # Split the temp dataframe into val and test dataframes.\n",
|
||||||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||||||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||||||
|
" df_temp,\n",
|
||||||
|
" y_temp,\n",
|
||||||
|
" stratify=y_temp,\n",
|
||||||
|
" test_size=relative_frac_test,\n",
|
||||||
|
" random_state=random_state,\n",
|
||||||
|
" )\n",
|
||||||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||||||
|
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
|
||||||
|
"\n",
|
||||||
|
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||||||
|
" df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"display(\"X_train\", X_train)\n",
|
||||||
|
"display(\"y_train\", y_train)\n",
|
||||||
|
"\n",
|
||||||
|
"display(\"X_test\", X_test)\n",
|
||||||
|
"display(\"y_test\", y_test)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
79
lab_4/utils.py
Normal file
79
lab_4/utils.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from pandas import DataFrame
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        Feature and label dataframes for the three splits. When
        ``frac_val <= 0``, the val outputs are empty DataFrames and all
        remaining rows go to the test split.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if ``stratify_colname`` is
        not a column of ``df_input``.
    """
    # Tolerant sum check: the naive exact comparison (``!= 1.0``) wrongly
    # rejects valid splits such as 0.7/0.1/0.2, whose binary floating-point
    # sum is not exactly 1.0.
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # No validation split requested: everything left over becomes the test set.
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test
|
Loading…
x
Reference in New Issue
Block a user