diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb index 02e514a..2761062 100644 --- a/lab_3/lab3.ipynb +++ b/lab_3/lab3.ipynb @@ -436,7 +436,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Приведём пример использования future tools" + "# Приведём пример использования featuretools\n", + "## Попробую вынести страну в отдельную таблицу" ] }, { @@ -544,6 +545,628 @@ "source": [ "pip install --upgrade setuptools" ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index billioner_id not found in dataframe, creating new integer column\n", + " warnings.warn(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`.
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n", + " ).agg(to_agg)\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"std\" instead.\n", + " ).agg(to_agg)\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n", + " ).agg(to_agg)\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.min. 
In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"min\" instead.\n", + " ).agg(to_agg)\n", + "c:\\Users\\annal\\aim\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n", + " ).agg(to_agg)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankNetworthAgeIndustryidcountry_idcountry_table.idcountry_table.Countrycountry_table.COUNT(other_about_billioner)country_table.MAX(other_about_billioner.Age)...country_table.SKEW(other_about_billioner.Rank )country_table.SKEW(other_about_billioner.id)country_table.STD(other_about_billioner.Age)country_table.STD(other_about_billioner.Networth)country_table.STD(other_about_billioner.Rank )country_table.STD(other_about_billioner.id)country_table.SUM(other_about_billioner.Age)country_table.SUM(other_about_billioner.Networth)country_table.SUM(other_about_billioner.Rank )country_table.SUM(other_about_billioner.id)
billioner_id
0121950Automotive000United States150.0...NaNNaNNaNNaNNaNNaN50.0219.01.00.0
1217158Technology111United States158.0...NaNNaNNaNNaNNaNNaN58.0171.02.01.0
2315873Fashion & Retail222France173.0...NaNNaNNaNNaNNaNNaN73.0158.03.02.0
3412966Technology333United States166.0...NaNNaNNaNNaNNaNNaN66.0129.04.03.0
4511891Finance & Investments444United States191.0...NaNNaNNaNNaNNaNNaN91.0118.05.04.0
..................................................................
25952578180Healthcare259525952595Spain180.0...NaNNaNNaNNaNNaNNaN80.01.02578.02595.0
25962578182Fashion & Retail259625962596Philippines182.0...NaNNaNNaNNaNNaNNaN82.01.02578.02596.0
25972578171Fashion & Retail259725972597Philippines171.0...NaNNaNNaNNaNNaNNaN71.01.02578.02597.0
25982578168Fashion & Retail259825982598Philippines168.0...NaNNaNNaNNaNNaNNaN68.01.02578.02598.0
25992578169Food & Beverage259925992599Germany169.0...NaNNaNNaNNaNNaNNaN69.01.02578.02599.0
\n", + "

2600 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " Rank Networth Age Industry id country_id \\\n", + "billioner_id \n", + "0 1 219 50 Automotive 0 0 \n", + "1 2 171 58 Technology 1 1 \n", + "2 3 158 73 Fashion & Retail 2 2 \n", + "3 4 129 66 Technology 3 3 \n", + "4 5 118 91 Finance & Investments 4 4 \n", + "... ... ... ... ... ... ... \n", + "2595 2578 1 80 Healthcare 2595 2595 \n", + "2596 2578 1 82 Fashion & Retail 2596 2596 \n", + "2597 2578 1 71 Fashion & Retail 2597 2597 \n", + "2598 2578 1 68 Fashion & Retail 2598 2598 \n", + "2599 2578 1 69 Food & Beverage 2599 2599 \n", + "\n", + " country_table.id country_table.Country \\\n", + "billioner_id \n", + "0 0 United States \n", + "1 1 United States \n", + "2 2 France \n", + "3 3 United States \n", + "4 4 United States \n", + "... ... ... \n", + "2595 2595 Spain \n", + "2596 2596 Philippines \n", + "2597 2597 Philippines \n", + "2598 2598 Philippines \n", + "2599 2599 Germany \n", + "\n", + " country_table.COUNT(other_about_billioner) \\\n", + "billioner_id \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "... ... \n", + "2595 1 \n", + "2596 1 \n", + "2597 1 \n", + "2598 1 \n", + "2599 1 \n", + "\n", + " country_table.MAX(other_about_billioner.Age) ... \\\n", + "billioner_id ... \n", + "0 50.0 ... \n", + "1 58.0 ... \n", + "2 73.0 ... \n", + "3 66.0 ... \n", + "4 91.0 ... \n", + "... ... ... \n", + "2595 80.0 ... \n", + "2596 82.0 ... \n", + "2597 71.0 ... \n", + "2598 68.0 ... \n", + "2599 69.0 ... \n", + "\n", + " country_table.SKEW(other_about_billioner.Rank ) \\\n", + "billioner_id \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "2595 NaN \n", + "2596 NaN \n", + "2597 NaN \n", + "2598 NaN \n", + "2599 NaN \n", + "\n", + " country_table.SKEW(other_about_billioner.id) \\\n", + "billioner_id \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... 
\n", + "2595 NaN \n", + "2596 NaN \n", + "2597 NaN \n", + "2598 NaN \n", + "2599 NaN \n", + "\n", + " country_table.STD(other_about_billioner.Age) \\\n", + "billioner_id \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "2595 NaN \n", + "2596 NaN \n", + "2597 NaN \n", + "2598 NaN \n", + "2599 NaN \n", + "\n", + " country_table.STD(other_about_billioner.Networth) \\\n", + "billioner_id \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "2595 NaN \n", + "2596 NaN \n", + "2597 NaN \n", + "2598 NaN \n", + "2599 NaN \n", + "\n", + " country_table.STD(other_about_billioner.Rank ) \\\n", + "billioner_id \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "2595 NaN \n", + "2596 NaN \n", + "2597 NaN \n", + "2598 NaN \n", + "2599 NaN \n", + "\n", + " country_table.STD(other_about_billioner.id) \\\n", + "billioner_id \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "2595 NaN \n", + "2596 NaN \n", + "2597 NaN \n", + "2598 NaN \n", + "2599 NaN \n", + "\n", + " country_table.SUM(other_about_billioner.Age) \\\n", + "billioner_id \n", + "0 50.0 \n", + "1 58.0 \n", + "2 73.0 \n", + "3 66.0 \n", + "4 91.0 \n", + "... ... \n", + "2595 80.0 \n", + "2596 82.0 \n", + "2597 71.0 \n", + "2598 68.0 \n", + "2599 69.0 \n", + "\n", + " country_table.SUM(other_about_billioner.Networth) \\\n", + "billioner_id \n", + "0 219.0 \n", + "1 171.0 \n", + "2 158.0 \n", + "3 129.0 \n", + "4 118.0 \n", + "... ... \n", + "2595 1.0 \n", + "2596 1.0 \n", + "2597 1.0 \n", + "2598 1.0 \n", + "2599 1.0 \n", + "\n", + " country_table.SUM(other_about_billioner.Rank ) \\\n", + "billioner_id \n", + "0 1.0 \n", + "1 2.0 \n", + "2 3.0 \n", + "3 4.0 \n", + "4 5.0 \n", + "... ... 
import featuretools as ft
import pandas as pd
from woodwork.logical_types import Categorical, Integer

# Location of the Forbes billionaires CSV, given relative to the notebook's
# working directory instead of one machine's absolute path.
# NOTE(review): assumes the kernel runs from lab_3/ with static/ one level up
# (layout inferred from the previous absolute path) - adjust if yours differs.
DATA_PATH = "../static/csv/Forbes_Billionaires.csv"

df = pd.read_csv(DATA_PATH)
# Surrogate row key: one consecutive integer per CSV row.
df["id"] = range(len(df))

# Country dimension table: exactly ONE row per distinct country.
# (De-duplicating on ['id', 'Country'] removes nothing because 'id' is unique
# per row, which made every country-level aggregate cover a single person.)
# Codes follow first-appearance order; use_na_sentinel=False gives a missing
# country its own code instead of -1, so every child row has a valid parent.
country_codes, country_names = pd.factorize(df["Country"], use_na_sentinel=False)
country_df = pd.DataFrame(
    {
        "country_id": range(len(country_names)),
        "Country": country_names,
    }
)

# Billionaire table: everything except the country name, keyed by the
# surrogate id (renamed so the EntitySet index column really exists and the
# row key is not aggregated as if it were a numeric feature) plus the foreign
# key pointing into country_df.
other_df = (
    df.drop(columns=["Country"])
    .rename(columns={"id": "billioner_id"})
    .assign(country_id=country_codes)
)

# Sanity checks on the parent/child link before handing it to featuretools.
assert country_df["country_id"].is_unique
assert other_df["billioner_id"].is_unique
assert other_df["country_id"].isin(country_df["country_id"]).all()

es = ft.EntitySet(id="orders")
es = es.add_dataframe(
    dataframe_name="country_table",
    dataframe=country_df,
    index="country_id",  # primary key of the country table
    logical_types={
        "Country": Categorical,  # country name is a label, not free text
    },
)
es = es.add_dataframe(
    dataframe_name="other_about_billioner",
    dataframe=other_df,
    index="billioner_id",  # primary key: one row per billionaire
    logical_types={
        "Rank ": Integer,  # column name carries a trailing space in the CSV header
        "Networth": Integer,
        "Age": Integer,
        "country_id": Integer,  # foreign key into country_table
    },
)
# One country (parent) -> many billionaires (children).
es = es.add_relationship(
    parent_dataframe_name="country_table",
    parent_column_name="country_id",
    child_dataframe_name="other_about_billioner",
    child_column_name="country_id",
)

# Deep feature synthesis: per-billionaire features plus country-level
# aggregates (COUNT / MAX / MEAN / STD / ...) over each country's billionaires.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="other_about_billioner",
)

# Show only the first rows; the full matrix has one row per billionaire.
feature_matrix.head()