{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define the business goals:\n",
"## 1 - Predict a billionaire's net worth (regression; an 80/20 split sketch for this task appears in the final cells)\n",
"## 2 - Predict a billionaire's age (classification)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare the data: drop the Rank and Name columns (they contain unique values that do not help prediction), convert the nominal columns (Country, Source, Industry) into numeric features, and bin the Age column into categories"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Networth Age Country Source Industry\n",
"0 219.0 50 United States Tesla, SpaceX Automotive \n",
"1 171.0 58 United States Amazon Technology \n",
"2 158.0 73 France LVMH Fashion & Retail \n",
"3 129.0 66 United States Microsoft Technology \n",
"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
]
}
],
"source": [
"# Drop the 'Rank ' and 'Name' columns\n",
"df.drop(columns=['Rank ', 'Name'], inplace=True)\n",
"\n",
"# Verify that the columns were dropped\n",
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Networth Country_Argentina Country_Australia Country_Austria \\\n",
"0 219.0 False False False \n",
"1 171.0 False False False \n",
"2 158.0 False False False \n",
"3 129.0 False False False \n",
"4 118.0 False False False \n",
"\n",
" Country_Barbados Country_Belgium Country_Belize Country_Brazil \\\n",
"0 False False False False \n",
"1 False False False False \n",
"2 False False False False \n",
"3 False False False False \n",
"4 False False False False \n",
"\n",
" Country_Bulgaria Country_Canada ... wind wine winter wire wireless \\\n",
"0 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
"1 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
"2 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
"3 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
"4 False False ... 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" yahoo yogurt zara zoom Age \n",
"0 0.0 0.0 0.0 0.0 50-60 \n",
"1 0.0 0.0 0.0 0.0 50-60 \n",
"2 0.0 0.0 0.0 0.0 70-80 \n",
"3 0.0 0.0 0.0 0.0 60-70 \n",
"4 0.0 0.0 0.0 0.0 80+ \n",
"\n",
"[5 rows x 828 columns]\n"
]
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# One-hot encode 'Country' and 'Industry' into binary indicator columns\n",
"df_country = pd.get_dummies(df[['Country']], drop_first=True)\n",
"df_industry = pd.get_dummies(df[['Industry']], drop_first=True)\n",
"\n",
"# Vectorize the 'Source' column with TF-IDF\n",
"tfidf_vectorizer = TfidfVectorizer(max_features=1000)\n",
"X_tfidf = tfidf_vectorizer.fit_transform(df['Source']).toarray()\n",
"\n",
"# Build a DataFrame from the TF-IDF matrix\n",
"df_source_tfidf = pd.DataFrame(X_tfidf, columns=tfidf_vectorizer.get_feature_names_out())\n",
"\n",
"bins = [0, 30, 40, 50, 60, 70, 80, 100] # age-bin boundaries\n",
"labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'] # labels for the age bins\n",
"\n",
"# Bin 'Age' into a categorical series of age groups (the result keeps the name 'Age')\n",
"df_age_group = pd.cut(df['Age'], bins=bins, labels=labels, right=False)\n",
"\n",
"# Drop the original 'Country', 'Industry', 'Source' and numeric 'Age' columns\n",
"df.drop(columns=['Country', 'Industry', 'Source', 'Age'], inplace=True)\n",
"\n",
"# Combine all transformed features into a single DataFrame\n",
"df_transformed = pd.concat([df, df_country, df_industry, df_source_tfidf, df_age_group], axis=1)\n",
"\n",
"# Inspect the result\n",
"print(df_transformed.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split the dataset into training and test sets (80/20) for the classification task. The target feature is Age"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df_transformed' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[18], line 46\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(df_input) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_train) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_val) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlen\u001b[39m(df_test)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df_train, df_val, df_test, y_train, y_val, y_test\n\u001b[0;32m 45\u001b[0m X_train, X_val, X_test, y_train, y_val, y_test \u001b[38;5;241m=\u001b[39m split_stratified_into_train_val_test(\n\u001b[1;32m---> 46\u001b[0m \u001b[43mdf_transformed\u001b[49m, stratify_colname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, frac_train\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.80\u001b[39m, frac_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, frac_test\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 47\u001b[0m )\n\u001b[0;32m 49\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, X_train)\n\u001b[0;32m 50\u001b[0m display(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_train\u001b[39m\u001b[38;5;124m\"\u001b[39m, y_train)\n",
"\u001b[1;31mNameError\u001b[0m: name 'df_transformed' is not defined"
]
}
],
"source": [
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
"    \n",
"    if frac_train + frac_val + frac_test != 1.0:\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"    X = df_input # Contains all columns.\n",
"    y = df_input[\n",
"        [stratify_colname]\n",
"    ] # Dataframe of just the column on which to stratify.\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"    if frac_val <= 0:\n",
"        assert len(df_input) == len(df_train) + len(df_temp)\n",
"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
"    # Split the temp dataframe into val and test dataframes.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, y_val, y_test = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||
" df, stratify_colname=\"Age\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=None\n",
|
||
")\n",
|
||
"\n",
|
||
"display(\"X_train\", X_train)\n",
|
||
"display(\"y_train\", y_train)\n",
|
||
"\n",
|
||
"display(\"X_test\", X_test)\n",
|
||
"display(\"y_test\", y_test)"
|
||
]
|
||
}
|
||
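{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Business goal 1 (regression on Networth): a minimal, hedged sketch of the matching 80/20 split, assuming df_transformed from the cells above. The names X_reg, y_reg, X_train_reg, X_test_reg, y_train_reg and y_test_reg are illustrative, not from the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Illustrative sketch, not the original author's code: split for the\n",
"# regression task, with 'Networth' as the target and the remaining\n",
"# columns of df_transformed as features. The binned 'Age' column stays\n",
"# among the features here and would still need encoding before fitting a regressor.\n",
"X_reg = df_transformed.drop(columns=['Networth'])\n",
"y_reg = df_transformed['Networth']\n",
"\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n",
"    X_reg, y_reg, test_size=0.20, random_state=None\n",
")\n",
"\n",
"display(\"X_train_reg\", X_train_reg)\n",
"display(\"y_train_reg\", y_train_reg)"
]
}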
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}