diff --git a/lab_4/lab_4.ipynb b/lab_4/lab_4.ipynb index e7d76c9..6838ed8 100644 --- a/lab_4/lab_4.ipynb +++ b/lab_4/lab_4.ipynb @@ -2112,22 +2112,345 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 191, "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "Shape of passed values is (8000, 21), indices imply (8000, 19)", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[184], line 123\u001b[0m\n\u001b[0;32m 121\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(preprocessing_result, columns\u001b[38;5;241m=\u001b[39mnum_columns \u001b[38;5;241m+\u001b[39m cat_columns \u001b[38;5;241m+\u001b[39m cols)\n\u001b[0;32m 122\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m features_engineering\u001b[38;5;241m.\u001b[39mfit_transform(preprocessing_result)\n\u001b[1;32m--> 123\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpreprocessing_result\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_columns\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcat_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 125\u001b[0m \u001b[38;5;66;03m# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\u001b[39;00m\n\u001b[0;32m 126\u001b[0m \n\u001b[0;32m 127\u001b[0m \u001b[38;5;66;03m# preprocessing_result = pipeline_end.fit_transform(X_train)\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 132\u001b[0m \u001b[38;5;66;03m# preprocessed_df\u001b[39;00m\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\frame.py:827\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 816\u001b[0m mgr \u001b[38;5;241m=\u001b[39m dict_to_mgr(\n\u001b[0;32m 817\u001b[0m \u001b[38;5;66;03m# error: Item \"ndarray\" of \"Union[ndarray, Series, Index]\" has no\u001b[39;00m\n\u001b[0;32m 818\u001b[0m \u001b[38;5;66;03m# attribute \"name\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 824\u001b[0m copy\u001b[38;5;241m=\u001b[39m_copy,\n\u001b[0;32m 825\u001b[0m )\n\u001b[0;32m 826\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 827\u001b[0m mgr \u001b[38;5;241m=\u001b[39m \u001b[43mndarray_to_mgr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 828\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 829\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 830\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 831\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 832\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 833\u001b[0m \u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmanager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 834\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 836\u001b[0m \u001b[38;5;66;03m# For data is list-like, or Iterable (will consume into list)\u001b[39;00m\n\u001b[0;32m 837\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(data):\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:336\u001b[0m, in \u001b[0;36mndarray_to_mgr\u001b[1;34m(values, index, columns, dtype, copy, typ)\u001b[0m\n\u001b[0;32m 331\u001b[0m \u001b[38;5;66;03m# _prep_ndarraylike ensures that values.ndim == 2 at this point\u001b[39;00m\n\u001b[0;32m 332\u001b[0m index, columns \u001b[38;5;241m=\u001b[39m _get_axes(\n\u001b[0;32m 333\u001b[0m values\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m], values\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m], index\u001b[38;5;241m=\u001b[39mindex, columns\u001b[38;5;241m=\u001b[39mcolumns\n\u001b[0;32m 334\u001b[0m )\n\u001b[1;32m--> 336\u001b[0m \u001b[43m_check_values_indices_shape_match\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 338\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m typ \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n", - "File \u001b[1;32md:\\Study\\3 курс 5 семестр\\AIM\\AIM-PIbd-31-Yakovlev-M-G\\kernel\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:420\u001b[0m, in \u001b[0;36m_check_values_indices_shape_match\u001b[1;34m(values, index, columns)\u001b[0m\n\u001b[0;32m 418\u001b[0m passed \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mshape\n\u001b[0;32m 419\u001b[0m implied \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlen\u001b[39m(index), \u001b[38;5;28mlen\u001b[39m(columns))\n\u001b[1;32m--> 420\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShape of passed values is \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpassed\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, indices imply \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mimplied\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[1;31mValueError\u001b[0m: Shape of passed values is (8000, 21), indices imply (8000, 19)" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpricebedroomsbathroomssqft_livingsqft_lotfloorsconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15price_category
033260000340732600.042.5213073001.0471230900196309800547.605-122.16721307560
129828702055358000.021.596018082.0379600199309812247.6183-122.29812901668
213438500625210000.031.01080210431.03610800194209810647.5515-122.35713807620
322422029094517534.021.08331439471.0358330200609807047.3889-122.4821380143947
427462900015387000.032.251760451332.03717600198409806547.5124-121.866191051773
............................................................
799522787720140416000.032.51790115421.0571190600196909805947.5124-122.1617909131
799636192400400775000.042.5309071122.03930900200109805247.705-122.11830506000
799722296500036450000.042.752980132601.04818001180197909805647.5152-122.197192010731
799812787310130289950.041.75209074161.04710501040197009803147.4107-122.17917107527
799928567300110485000.032.52340590581.03823400198509803847.4052-122.028270037263
\n", + "

8000 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "0 3 3260000340 732600.0 4 2.5 2130 7300 \n", + "1 2 9828702055 358000.0 2 1.5 960 1808 \n", + "2 1 3438500625 210000.0 3 1.0 1080 21043 \n", + "3 2 2422029094 517534.0 2 1.0 833 143947 \n", + "4 2 7462900015 387000.0 3 2.25 1760 45133 \n", + "... .. ... ... ... ... ... ... \n", + "7995 2 2787720140 416000.0 3 2.5 1790 11542 \n", + "7996 3 6192400400 775000.0 4 2.5 3090 7112 \n", + "7997 2 2296500036 450000.0 4 2.75 2980 13260 \n", + "7998 1 2787310130 289950.0 4 1.75 2090 7416 \n", + "7999 2 8567300110 485000.0 3 2.5 2340 59058 \n", + "\n", + " condition grade sqft_above sqft_basement yr_built yr_renovated zipcode \\\n", + "0 1.0 4 7 1230 900 1963 0 \n", + "1 2.0 3 7 960 0 1993 0 \n", + "2 1.0 3 6 1080 0 1942 0 \n", + "3 1.0 3 5 833 0 2006 0 \n", + "4 2.0 3 7 1760 0 1984 0 \n", + "... ... ... ... ... ... ... ... \n", + "7995 1.0 5 7 1190 600 1969 0 \n", + "7996 2.0 3 9 3090 0 2001 0 \n", + "7997 1.0 4 8 1800 1180 1979 0 \n", + "7998 1.0 4 7 1050 1040 1970 0 \n", + "7999 1.0 3 8 2340 0 1985 0 \n", + "\n", + " lat long sqft_living15 sqft_lot15 price_category \n", + "0 98005 47.605 -122.167 2130 7560 \n", + "1 98122 47.6183 -122.298 1290 1668 \n", + "2 98106 47.5515 -122.357 1380 7620 \n", + "3 98070 47.3889 -122.482 1380 143947 \n", + "4 98065 47.5124 -121.866 1910 51773 \n", + "... ... ... ... ... ... \n", + "7995 98059 47.5124 -122.16 1790 9131 \n", + "7996 98052 47.705 -122.118 3050 6000 \n", + "7997 98056 47.5152 -122.197 1920 10731 \n", + "7998 98031 47.4107 -122.179 1710 7527 \n", + "7999 98038 47.4052 -122.028 2700 37263 \n", + "\n", + "[8000 rows x 19 columns]" + ] + }, + "execution_count": 191, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -2209,17 +2532,18 @@ " transformers=[\n", " (\"prepocessing_num\", preprocessing_num, num_columns),\n", " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " # (\"prepocessing_features\", cat_imputer, [\"price_category\"]),\n", " ],\n", " remainder=\"passthrough\"\n", ")\n", "\n", - "# features_engineering = ColumnTransformer(\n", - "# verbose_feature_names_out=False,\n", - "# transformers=[\n", - "# (\"add_features\", HousesFeatures(), [\"price_category\"]),\n", - "# ],\n", - "# remainder=\"passthrough\",\n", - "# )\n", + "features_engineering = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"add_features\", HousesFeatures(), [\"price_category\"]),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", "\n", "drop_columns = ColumnTransformer(\n", " verbose_feature_names_out=False,\n", @@ -2249,10 +2573,11 @@ "cols = ['a', 'b']\n", "preprocessing_result = drop_columns.fit_transform(X_train)\n", "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", - "preprocessing_result = features_preprocessing.fit_transform(preprocessing_result)\n", - "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns + cols)\n", "preprocessing_result = features_engineering.fit_transform(preprocessing_result)\n", "preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", + "preprocessing_result\n", + "# # preprocessing_result = features_preprocessing.fit_transform(preprocessing_result)\n", + "# # preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns)\n", "\n", "# preprocessing_result = features_postprocessing.fit_transform(preprocessing_result)\n", "\n",