ну вроде бы всё

This commit is contained in:
annalyovushkina@yandex.ru 2024-11-02 00:19:57 +04:00
parent d4cdc8ab91
commit 7f7102c559

View File

@ -436,7 +436,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Приведём пример использования future tools" "# Приведём пример использования Featuretools\n",
"## Попробую вынести страну в отдельную таблицу"
] ]
}, },
{ {
@ -544,6 +545,628 @@
"source": [ "source": [
"pip install --upgrade setuptools" "pip install --upgrade setuptools"
] ]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index billioner_id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function max at 0x000001952157A520> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function std at 0x000001952157B060> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"std\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function sum at 0x0000019521579B20> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function min at 0x000001952157A660> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"min\" instead.\n",
" ).agg(to_agg)\n",
"c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001952157AF20> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
" ).agg(to_agg)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Networth</th>\n",
" <th>Age</th>\n",
" <th>Industry</th>\n",
" <th>id</th>\n",
" <th>country_id</th>\n",
" <th>country_table.id</th>\n",
" <th>country_table.Country</th>\n",
" <th>country_table.COUNT(other_about_billioner)</th>\n",
" <th>country_table.MAX(other_about_billioner.Age)</th>\n",
" <th>...</th>\n",
" <th>country_table.SKEW(other_about_billioner.Rank )</th>\n",
" <th>country_table.SKEW(other_about_billioner.id)</th>\n",
" <th>country_table.STD(other_about_billioner.Age)</th>\n",
" <th>country_table.STD(other_about_billioner.Networth)</th>\n",
" <th>country_table.STD(other_about_billioner.Rank )</th>\n",
" <th>country_table.STD(other_about_billioner.id)</th>\n",
" <th>country_table.SUM(other_about_billioner.Age)</th>\n",
" <th>country_table.SUM(other_about_billioner.Networth)</th>\n",
" <th>country_table.SUM(other_about_billioner.Rank )</th>\n",
" <th>country_table.SUM(other_about_billioner.id)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>billioner_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>219</td>\n",
" <td>50</td>\n",
" <td>Automotive</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>50.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>50.0</td>\n",
" <td>219.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>171</td>\n",
" <td>58</td>\n",
" <td>Technology</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>58.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>58.0</td>\n",
" <td>171.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>158</td>\n",
" <td>73</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>France</td>\n",
" <td>1</td>\n",
" <td>73.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>73.0</td>\n",
" <td>158.0</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>129</td>\n",
" <td>66</td>\n",
" <td>Technology</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>66.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>66.0</td>\n",
" <td>129.0</td>\n",
" <td>4.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>118</td>\n",
" <td>91</td>\n",
" <td>Finance &amp; Investments</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>United States</td>\n",
" <td>1</td>\n",
" <td>91.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>91.0</td>\n",
" <td>118.0</td>\n",
" <td>5.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2595</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>80</td>\n",
" <td>Healthcare</td>\n",
" <td>2595</td>\n",
" <td>2595</td>\n",
" <td>2595</td>\n",
" <td>Spain</td>\n",
" <td>1</td>\n",
" <td>80.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2595.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2596</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>82</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2596</td>\n",
" <td>2596</td>\n",
" <td>2596</td>\n",
" <td>Philippines</td>\n",
" <td>1</td>\n",
" <td>82.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>82.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2596.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2597</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>71</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2597</td>\n",
" <td>2597</td>\n",
" <td>2597</td>\n",
" <td>Philippines</td>\n",
" <td>1</td>\n",
" <td>71.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>71.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2597.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2598</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>68</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" <td>2598</td>\n",
" <td>2598</td>\n",
" <td>2598</td>\n",
" <td>Philippines</td>\n",
" <td>1</td>\n",
" <td>68.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>68.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2598.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2599</th>\n",
" <td>2578</td>\n",
" <td>1</td>\n",
" <td>69</td>\n",
" <td>Food &amp; Beverage</td>\n",
" <td>2599</td>\n",
" <td>2599</td>\n",
" <td>2599</td>\n",
" <td>Germany</td>\n",
" <td>1</td>\n",
" <td>69.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>69.0</td>\n",
" <td>1.0</td>\n",
" <td>2578.0</td>\n",
" <td>2599.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2600 rows × 35 columns</p>\n",
"</div>"
],
"text/plain": [
" Rank Networth Age Industry id country_id \\\n",
"billioner_id \n",
"0 1 219 50 Automotive 0 0 \n",
"1 2 171 58 Technology 1 1 \n",
"2 3 158 73 Fashion & Retail 2 2 \n",
"3 4 129 66 Technology 3 3 \n",
"4 5 118 91 Finance & Investments 4 4 \n",
"... ... ... ... ... ... ... \n",
"2595 2578 1 80 Healthcare 2595 2595 \n",
"2596 2578 1 82 Fashion & Retail 2596 2596 \n",
"2597 2578 1 71 Fashion & Retail 2597 2597 \n",
"2598 2578 1 68 Fashion & Retail 2598 2598 \n",
"2599 2578 1 69 Food & Beverage 2599 2599 \n",
"\n",
" country_table.id country_table.Country \\\n",
"billioner_id \n",
"0 0 United States \n",
"1 1 United States \n",
"2 2 France \n",
"3 3 United States \n",
"4 4 United States \n",
"... ... ... \n",
"2595 2595 Spain \n",
"2596 2596 Philippines \n",
"2597 2597 Philippines \n",
"2598 2598 Philippines \n",
"2599 2599 Germany \n",
"\n",
" country_table.COUNT(other_about_billioner) \\\n",
"billioner_id \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"2595 1 \n",
"2596 1 \n",
"2597 1 \n",
"2598 1 \n",
"2599 1 \n",
"\n",
" country_table.MAX(other_about_billioner.Age) ... \\\n",
"billioner_id ... \n",
"0 50.0 ... \n",
"1 58.0 ... \n",
"2 73.0 ... \n",
"3 66.0 ... \n",
"4 91.0 ... \n",
"... ... ... \n",
"2595 80.0 ... \n",
"2596 82.0 ... \n",
"2597 71.0 ... \n",
"2598 68.0 ... \n",
"2599 69.0 ... \n",
"\n",
" country_table.SKEW(other_about_billioner.Rank ) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.SKEW(other_about_billioner.id) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.Age) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.Networth) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.Rank ) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.STD(other_about_billioner.id) \\\n",
"billioner_id \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"2595 NaN \n",
"2596 NaN \n",
"2597 NaN \n",
"2598 NaN \n",
"2599 NaN \n",
"\n",
" country_table.SUM(other_about_billioner.Age) \\\n",
"billioner_id \n",
"0 50.0 \n",
"1 58.0 \n",
"2 73.0 \n",
"3 66.0 \n",
"4 91.0 \n",
"... ... \n",
"2595 80.0 \n",
"2596 82.0 \n",
"2597 71.0 \n",
"2598 68.0 \n",
"2599 69.0 \n",
"\n",
" country_table.SUM(other_about_billioner.Networth) \\\n",
"billioner_id \n",
"0 219.0 \n",
"1 171.0 \n",
"2 158.0 \n",
"3 129.0 \n",
"4 118.0 \n",
"... ... \n",
"2595 1.0 \n",
"2596 1.0 \n",
"2597 1.0 \n",
"2598 1.0 \n",
"2599 1.0 \n",
"\n",
" country_table.SUM(other_about_billioner.Rank ) \\\n",
"billioner_id \n",
"0 1.0 \n",
"1 2.0 \n",
"2 3.0 \n",
"3 4.0 \n",
"4 5.0 \n",
"... ... \n",
"2595 2578.0 \n",
"2596 2578.0 \n",
"2597 2578.0 \n",
"2598 2578.0 \n",
"2599 2578.0 \n",
"\n",
" country_table.SUM(other_about_billioner.id) \n",
"billioner_id \n",
"0 0.0 \n",
"1 1.0 \n",
"2 2.0 \n",
"3 3.0 \n",
"4 4.0 \n",
"... ... \n",
"2595 2595.0 \n",
"2596 2596.0 \n",
"2597 2597.0 \n",
"2598 2598.0 \n",
"2599 2599.0 \n",
"\n",
"[2600 rows x 35 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import featuretools as ft\n",
"from woodwork.logical_types import Categorical, Integer\n",
"import pandas as pd\n",
"\n",
"# NOTE(review): absolute, machine-specific path; consider a path relative to the project's data directory.\n",
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
"\n",
"# Lookup table with exactly one row per distinct country.\n",
"# Deduplicating on ['id', 'Country'] never removed a row because 'id' is unique per record,\n",
"# so every billionaire ended up with a private \"country\" and each aggregate covered one row\n",
"# (COUNT == 1, STD / SKEW == NaN). Deduplicate on the country name alone instead.\n",
"country_df = df[['Country']].drop_duplicates().reset_index(drop=True)\n",
"country_df['country_id'] = country_df.index\n",
"\n",
"# Child table: every column except the country name, plus the foreign key into country_df.\n",
"other_df = df.drop(columns=['Country'])\n",
"other_df['country_id'] = df['Country'].map(country_df.set_index('Country')['country_id'])\n",
"# Fail early if some row could not be linked to a country (e.g. a missing Country value).\n",
"assert other_df['country_id'].notna().all(), \"every row must map to a country_id\"\n",
"\n",
"es = ft.EntitySet(id=\"orders\")\n",
"es = es.add_dataframe(\n",
"    dataframe_name=\"country_table\",\n",
"    dataframe=country_df,\n",
"    index=\"country_id\",  # primary key of the country lookup table\n",
"    logical_types={\n",
"        \"Country\": Categorical,  # country name is a categorical label\n",
"    },\n",
")\n",
"es = es.add_dataframe(\n",
"    dataframe_name=\"other_about_billioner\",\n",
"    dataframe=other_df,\n",
"    index=\"billioner_id\",\n",
"    # The data has no 'billioner_id' column; ask featuretools to create the row key explicitly\n",
"    # instead of relying on its 'index not found' fallback (and its warning).\n",
"    make_index=True,\n",
"    logical_types={\n",
"        \"Rank \": Integer,  # note: the column name carries a trailing space\n",
"        \"Networth\": Integer,\n",
"        \"Age\": Integer,\n",
"        \"country_id\": Integer,  # foreign key into country_table\n",
"    },\n",
")\n",
"# One country (parent) -> many billionaires (children).\n",
"es = es.add_relationship(\"country_table\", \"country_id\", \"other_about_billioner\", \"country_id\")\n",
"\n",
"# Deep feature synthesis: per-billionaire rows enriched with per-country aggregates.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    entityset=es,\n",
"    target_dataframe_name=\"other_about_billioner\"\n",
")\n",
"\n",
"feature_matrix"
]
} }
], ],
"metadata": { "metadata": {