коммит
This commit is contained in:
parent
eadc896314
commit
41c4ab91ed
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
BIN
lab_4/__pycache__/utils.cpython-312.pyc
Normal file
BIN
lab_4/__pycache__/utils.cpython-312.pyc
Normal file
Binary file not shown.
240
lab_4/lab4.ipynb
Normal file
240
lab_4/lab4.ipynb
Normal file
@ -0,0 +1,240 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
|
||||
"print(df.columns)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Определим бизнес-цели:\n",
|
||||
"## 1. Прогнозирование состояния миллиардера (регрессия)\n",
|
||||
"## 2. Прогнозирование возрастной группы миллиардера (классификация)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Подготовим данные: удалим колонки Rank и Name (в них уникальные значения, которые не участвуют в предсказаниях), преобразуем номинальные колонки в числовые (Country, Source, Industry) и категоризируем колонку Age"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Networth Age Country Source Industry\n",
|
||||
"0 219.0 50 United States Tesla, SpaceX Automotive \n",
|
||||
"1 171.0 58 United States Amazon Technology \n",
|
||||
"2 158.0 73 France LVMH Fashion & Retail \n",
|
||||
"3 129.0 66 United States Microsoft Technology \n",
|
||||
"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Удаление колонок 'rank' и 'name'\n",
|
||||
"df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
|
||||
"\n",
|
||||
"# Проверка, что колонки были удалены\n",
|
||||
"print(df.head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Networth Country_Argentina Country_Australia Country_Austria \\\n",
|
||||
"0 219.0 False False False \n",
|
||||
"1 171.0 False False False \n",
|
||||
"2 158.0 False False False \n",
|
||||
"3 129.0 False False False \n",
|
||||
"4 118.0 False False False \n",
|
||||
"\n",
|
||||
" Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
|
||||
"0 False False False False \n",
|
||||
"1 False False False False \n",
|
||||
"2 False False False False \n",
|
||||
"3 False False False False \n",
|
||||
"4 False False False False \n",
|
||||
"\n",
|
||||
" Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
|
||||
"0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
" yahoo yogurt zara zoom Age \n",
|
||||
"0 0.0 0.0 0.0 0.0 50-60 \n",
|
||||
"1 0.0 0.0 0.0 0.0 50-60 \n",
|
||||
"2 0.0 0.0 0.0 0.0 70-80 \n",
|
||||
"3 0.0 0.0 0.0 0.0 60-70 \n",
|
||||
"4 0.0 0.0 0.0 0.0 80+ \n",
|
||||
"\n",
|
||||
"[5 rows x 828 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"# Преобразуем 'country' и 'industry' в бинарные матрицы с помощью One-Hot Encoding\n",
|
||||
"df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
|
||||
"df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
|
||||
"\n",
|
||||
"# Преобразуем колонку 'source' с помощью TF-IDF\n",
|
||||
"tfidf_vectorizer = TfidfVectorizer(max_features=1000) \n",
|
||||
"X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
|
||||
"\n",
|
||||
"# Создаем DataFrame с результатами TF-IDF\n",
|
||||
"df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
|
||||
"\n",
|
||||
"bins = [0, 30, 40, 50, 60, 70, 80, 100] # границы для возрастных категорий\n",
|
||||
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # метки для категорий\n",
|
||||
"\n",
|
||||
"# Создаем новую колонку 'age_group', где будет храниться категория\n",
|
||||
"df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame\n",
|
||||
"df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
|
||||
"\n",
|
||||
"# Объединяем все преобразованные данные в один DataFrame\n",
|
||||
"df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
|
||||
"\n",
|
||||
"# Просмотр результата\n",
|
||||
"print(df_transformed.head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Разобьём набор данных на обучающую и тестовую выборки (80/20) для задачи классификации. Целевой признак — Age"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'df_transformed' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
|
||||
"\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
"import math\n",
"from typing import Tuple\n",
"\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
"    \"\"\"Split df_input into train/val/test parts stratified by stratify_colname.\n",
"\n",
"    Returns (df_train, df_val, df_test, y_train, y_val, y_test). The df_* frames\n",
"    keep every column of df_input (including the stratification column); the y_*\n",
"    frames hold only the stratification column. When frac_val <= 0 the val\n",
"    outputs are empty DataFrames and only one split is performed.\n",
"\n",
"    Raises ValueError if the fractions do not sum to 1.0 or the column is missing.\n",
"    \"\"\"\n",
"    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is 0.9999999999999999 in\n",
"    # floating point and must not be rejected.\n",
"    if not math.isclose(frac_train + frac_val + frac_test, 1.0, rel_tol=0.0, abs_tol=1e-9):\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[\n",
"        [stratify_colname]\n",
"    ]  # Dataframe of just the column on which to stratify.\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"    if frac_val <= 0:\n",
"        assert len(df_input) == len(df_train) + len(df_temp)\n",
"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
"    # Split the temp dataframe into val and test dataframes.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, y_val, y_test = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"\n",
"# Use df_transformed: the previous cell dropped 'Age' from df in place and put the\n",
"# binned age group into df_transformed, so df no longer has the target column.\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
"    df_transformed, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
79
lab_4/utils.py
Normal file
79
lab_4/utils.py
Normal file
@ -0,0 +1,79 @@
|
||||
from typing import Tuple
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice (once when frac_val <= 0).

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (checked with a small absolute tolerance, so sums that
        differ from 1.0 only by floating-point rounding are accepted).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        The df_* dataframes contain the three splits with all columns of
        df_input (the stratification column included). The y_* dataframes
        contain only the stratification column for the matching split.
        When frac_val <= 0, df_val and y_val are empty dataframes.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or stratify_colname is not a
        column of df_input.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    import math

    # Exact float equality rejects valid inputs: 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999, so compare against 1.0 with a tolerance instead.
    if not math.isclose(
        frac_train + frac_val + frac_test, 1.0, rel_tol=0.0, abs_tol=1e-9
    ):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # No validation part requested: the "temp" remainder is the test split.
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes.
    # test_size here is relative to the remainder, not to the full input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test
|
Loading…
x
Reference in New Issue
Block a user