{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n" ] } ], "source": [ "import os\n", "\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "\n", "# Dataset location. Defaults to the original local path; set the FORBES_CSV\n", "# environment variable to run the notebook on another machine.\n", "DATA_PATH = os.environ.get(\n", "    \"FORBES_CSV\",\n", "    \"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\",\n", ")\n", "df = pd.read_csv(DATA_PATH)\n", "\n", "# NOTE: the printed header shows that the 'Rank ' column name carries a trailing space.\n", "print(df.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Определим бизнес-цели:\n", "## 1 - Прогнозирование состояния миллиардера (регрессия)\n", "## 2 - Прогнозирование возраста миллиардера (классификация)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Подготовим данные: категоризируем колонку Age" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rank 0\n", "Name 0\n", "Networth 0\n", "Age 0\n", "Country 0\n", "Source 0\n", "Industry 0\n", "dtype: int64\n", "\n", "Rank False\n", "Name False\n", "Networth False\n", "Age False\n", "Country False\n", "Source False\n", "Industry False\n", "dtype: bool\n", "\n" ] } ], "source": [ "# Number of missing values in each column\n", "print(df.isnull().sum())\n", "\n", "print()\n", "\n", "# Whether each column contains at least one missing value\n", "print(df.isnull().any())\n", "\n", "print()\n", "\n", "# Percentage of missing values, reported only for columns that actually have some\n", "for column in df.columns:\n", "    null_rate = df[column].isnull().sum() / len(df) * 100\n", "    if null_rate > 0:\n", "        print(f\"{column} процент пустых значений: %{null_rate:.2f}\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Rank Name Networth Country \\\n", "0 1 Elon Musk 219.0 United States \n", "1 2 Jeff Bezos 171.0 United States \n", "2 3 Bernard Arnault & family 158.0 France \n", "3 4 Bill Gates 129.0 United States \n", "4 5 Warren Buffett 118.0 United States \n", "\n", " Source Industry Age_category \n", "0 Tesla, SpaceX Automotive 50-60 \n", "1 Amazon 
Technology 50-60 \n", "2 LVMH Fashion & Retail 70-80 \n", "3 Microsoft Technology 60-70 \n", "4 Berkshire Hathaway Finance & Investments 80+ \n" ] } ], "source": [ "# Half-open age intervals [lower, upper); the last edge is unbounded so that\n", "# every age of 80 or more falls into '80+' instead of becoming NaN.\n", "bins = [0, 30, 40, 50, 60, 70, 80, float(\"inf\")]\n", "labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']  # one label per interval\n", "\n", "# Guard on the presence of 'Age' so that re-running this cell does not raise KeyError.\n", "if \"Age\" in df.columns:\n", "    df[\"Age_category\"] = pd.cut(df[\"Age\"], bins=bins, labels=labels, right=False)\n", "    # Drop the original numeric 'Age' column now that it is replaced by 'Age_category'\n", "    df = df.drop(columns=[\"Age\"])\n", "\n", "# Preview the result\n", "print(df.head())" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'X_train'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", " | Rank | \n", "Name | \n", "Networth | \n", "Country | \n", "Source | \n", "Industry | \n", "Age_category | \n", "
---|---|---|---|---|---|---|---|
1909 | \n", "1818 | \n", "Tran Ba Duong & family | \n", "1.6 | \n", "Vietnam | \n", "automotive | \n", "Automotive | \n", "60-70 | \n", "
2099 | \n", "2076 | \n", "Mark Dixon | \n", "1.4 | \n", "United Kingdom | \n", "office real estate | \n", "Real Estate | \n", "60-70 | \n", "
1392 | \n", "1341 | \n", "Yingzhuo Xu | \n", "2.3 | \n", "China | \n", "agribusiness | \n", "Food & Beverage | \n", "50-60 | \n", "
627 | \n", "622 | \n", "Bruce Flatt | \n", "4.6 | \n", "Canada | \n", "money management | \n", "Finance & Investments | \n", "50-60 | \n", "
527 | \n", "523 | \n", "Li Liangbin | \n", "5.2 | \n", "China | \n", "lithium | \n", "Manufacturing | \n", "50-60 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
84 | \n", "85 | \n", "Theo Albrecht, Jr. & family | \n", "18.7 | \n", "Germany | \n", "Aldi, Trader Joe's | \n", "Fashion & Retail | \n", "70-80 | \n", "
633 | \n", "622 | \n", "Tony Tamer | \n", "4.6 | \n", "United States | \n", "private equity | \n", "Finance & Investments | \n", "60-70 | \n", "
922 | \n", "913 | \n", "Bob Gaglardi | \n", "3.3 | \n", "Canada | \n", "hotels | \n", "Real Estate | \n", "80+ | \n", "
2178 | \n", "2076 | \n", "Eugene Wu | \n", "1.4 | \n", "Taiwan | \n", "finance | \n", "Finance & Investments | \n", "70-80 | \n", "
415 | \n", "411 | \n", "Leonard Stern | \n", "6.2 | \n", "United States | \n", "real estate | \n", "Real Estate | \n", "80+ | \n", "
2080 rows × 7 columns
\n", "\n", " | Age_category | \n", "
---|---|
1909 | \n", "60-70 | \n", "
2099 | \n", "60-70 | \n", "
1392 | \n", "50-60 | \n", "
627 | \n", "50-60 | \n", "
527 | \n", "50-60 | \n", "
... | \n", "... | \n", "
84 | \n", "70-80 | \n", "
633 | \n", "60-70 | \n", "
922 | \n", "80+ | \n", "
2178 | \n", "70-80 | \n", "
415 | \n", "80+ | \n", "
2080 rows × 1 columns
\n", "\n", " | Rank | \n", "Name | \n", "Networth | \n", "Country | \n", "Source | \n", "Industry | \n", "Age_category | \n", "
---|---|---|---|---|---|---|---|
2075 | \n", "2076 | \n", "Radhe Shyam Agarwal | \n", "1.4 | \n", "India | \n", "consumer goods | \n", "Fashion & Retail | \n", "70-80 | \n", "
1529 | \n", "1513 | \n", "Robert Duggan | \n", "2.0 | \n", "United States | \n", "pharmaceuticals | \n", "Healthcare | \n", "70-80 | \n", "
1803 | \n", "1729 | \n", "Yao Kuizhang | \n", "1.7 | \n", "China | \n", "beverages | \n", "Food & Beverage | \n", "50-60 | \n", "
425 | \n", "424 | \n", "Alexei Kuzmichev | \n", "6.0 | \n", "Russia | \n", "oil, banking, telecom | \n", "Energy | \n", "50-60 | \n", "
2597 | \n", "2578 | \n", "Ramesh Genomal | \n", "1.0 | \n", "Philippines | \n", "apparel | \n", "Fashion & Retail | \n", "70-80 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
935 | \n", "913 | \n", "Alfred Oetker | \n", "3.3 | \n", "Germany | \n", "consumer goods | \n", "Fashion & Retail | \n", "50-60 | \n", "
1541 | \n", "1513 | \n", "Thomas Lee | \n", "2.0 | \n", "United States | \n", "private equity | \n", "Finance & Investments | \n", "70-80 | \n", "
1646 | \n", "1645 | \n", "Roberto Angelini Rossi | \n", "1.8 | \n", "Chile | \n", "forestry, mining | \n", "diversified | \n", "70-80 | \n", "
376 | \n", "375 | \n", "Patrick Drahi | \n", "6.6 | \n", "France | \n", "telecom | \n", "Telecom | \n", "50-60 | \n", "
1894 | \n", "1818 | \n", "Gerald Schwartz | \n", "1.6 | \n", "Canada | \n", "finance | \n", "Finance & Investments | \n", "80+ | \n", "
520 rows × 7 columns
\n", "\n", " | Age_category | \n", "
---|---|
2075 | \n", "70-80 | \n", "
1529 | \n", "70-80 | \n", "
1803 | \n", "50-60 | \n", "
425 | \n", "50-60 | \n", "
2597 | \n", "70-80 | \n", "
... | \n", "... | \n", "
935 | \n", "50-60 | \n", "
1541 | \n", "70-80 | \n", "
1646 | \n", "70-80 | \n", "
376 | \n", "50-60 | \n", "
1894 | \n", "80+ | \n", "
520 rows × 1 columns
\n", "\n", " | prepocessing_num__Networth | \n", "prepocessing_cat__Country_Argentina | \n", "prepocessing_cat__Country_Australia | \n", "prepocessing_cat__Country_Austria | \n", "prepocessing_cat__Country_Barbados | \n", "prepocessing_cat__Country_Belgium | \n", "prepocessing_cat__Country_Belize | \n", "prepocessing_cat__Country_Brazil | \n", "prepocessing_cat__Country_Bulgaria | \n", "prepocessing_cat__Country_Canada | \n", "... | \n", "prepocessing_cat__Industry_Logistics | \n", "prepocessing_cat__Industry_Manufacturing | \n", "prepocessing_cat__Industry_Media & Entertainment | \n", "prepocessing_cat__Industry_Metals & Mining | \n", "prepocessing_cat__Industry_Real Estate | \n", "prepocessing_cat__Industry_Service | \n", "prepocessing_cat__Industry_Sports | \n", "prepocessing_cat__Industry_Technology | \n", "prepocessing_cat__Industry_Telecom | \n", "prepocessing_cat__Industry_diversified | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1909 | \n", "-0.309917 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2099 | \n", "-0.329245 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1392 | \n", "-0.242268 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
627 | \n", "-0.019995 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
527 | \n", "0.037990 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
84 | \n", "1.342637 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
633 | \n", "-0.019995 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
922 | \n", "-0.145628 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2178 | \n", "-0.329245 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
415 | \n", "0.134630 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
2080 rows × 860 columns
\n", "