303 KiB
303 KiB
Тестим Pandas
Считываем csv:
In [30]:
import pandas
# Считаем csv
data_frame = pandas.read_csv("data/kc_house_data.csv", index_col="id")
# Сохраняем data_frame в новый csv
data_frame.to_csv("data/new_kc_house_data.csv")
Получение сведений о data_frame:
In [31]:
# Общая информация о data_frame
print("Общая информация о data_frame:")
print(data_frame.info(), "\n")
# Первые строки
print("Первые строки data_frame:")
print(data_frame.head(), "\n")
# Описание данных
print("Описание данных data_frame:")
print(data_frame.describe(), "\n")
# Количество строк и столбцов
print(f"Количество строк и столбцов data_frame: {data_frame.shape}\n")
# Названия столбцов
print(f"Названия столбцов: {data_frame.columns}\n")
# Типы данных каждого столбца
print("Типы данных каждого столбца:")
print(data_frame.dtypes, "\n")
# Количество пропущенных значений
print("Количество пропущенных значений в каждом столбце:")
print(data_frame.isnull().sum(), "\n")
Общая информация о data_frame:
<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 21613 non-null object
1 price 21613 non-null float64
2 bedrooms 21613 non-null int64
3 bathrooms 21613 non-null float64
4 sqft_living 21613 non-null int64
5 sqft_lot 21613 non-null int64
6 floors 21613 non-null float64
7 waterfront 21613 non-null int64
8 view 21613 non-null int64
9 condition 21613 non-null int64
10 grade 21613 non-null int64
11 sqft_above 21613 non-null int64
12 sqft_basement 21613 non-null int64
13 yr_built 21613 non-null int64
14 yr_renovated 21613 non-null int64
15 zipcode 21613 non-null int64
16 lat 21613 non-null float64
17 long 21613 non-null float64
18 sqft_living15 21613 non-null int64
19 sqft_lot15 21613 non-null int64
dtypes: float64(5), int64(14), object(1)
memory usage: 3.5+ MB
None
Первые строки data_frame:
date price bedrooms bathrooms sqft_living \
id
7129300520 20141013T000000 221900.0 3 1.00 1180
6414100192 20141209T000000 538000.0 3 2.25 2570
5631500400 20150225T000000 180000.0 2 1.00 770
2487200875 20141209T000000 604000.0 4 3.00 1960
1954400510 20150218T000000 510000.0 3 2.00 1680
sqft_lot floors waterfront view condition grade sqft_above \
id
7129300520 5650 1.0 0 0 3 7 1180
6414100192 7242 2.0 0 0 3 7 2170
5631500400 10000 1.0 0 0 3 6 770
2487200875 5000 1.0 0 0 5 7 1050
1954400510 8080 1.0 0 0 3 8 1680
sqft_basement yr_built yr_renovated zipcode lat long \
id
7129300520 0 1955 0 98178 47.5112 -122.257
6414100192 400 1951 1991 98125 47.7210 -122.319
5631500400 0 1933 0 98028 47.7379 -122.233
2487200875 910 1965 0 98136 47.5208 -122.393
1954400510 0 1987 0 98074 47.6168 -122.045
sqft_living15 sqft_lot15
id
7129300520 1340 5650
6414100192 1690 7639
5631500400 2720 8062
2487200875 1360 5000
1954400510 1800 7503
Описание данных data_frame:
price bedrooms bathrooms sqft_living sqft_lot \
count 2.161300e+04 21613.000000 21613.000000 21613.000000 2.161300e+04
mean 5.400881e+05 3.370842 2.114757 2079.899736 1.510697e+04
std 3.671272e+05 0.930062 0.770163 918.440897 4.142051e+04
min 7.500000e+04 0.000000 0.000000 290.000000 5.200000e+02
25% 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03
50% 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03
75% 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04
max 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06
floors waterfront view condition grade \
count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 1.494309 0.007542 0.234303 3.409430 7.656873
std 0.539989 0.086517 0.766318 0.650743 1.175459
min 1.000000 0.000000 0.000000 1.000000 1.000000
25% 1.000000 0.000000 0.000000 3.000000 7.000000
50% 1.500000 0.000000 0.000000 3.000000 7.000000
75% 2.000000 0.000000 0.000000 4.000000 8.000000
max 3.500000 1.000000 4.000000 5.000000 13.000000
sqft_above sqft_basement yr_built yr_renovated zipcode \
count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 1788.390691 291.509045 1971.005136 84.402258 98077.939805
std 828.090978 442.575043 29.373411 401.679240 53.505026
min 290.000000 0.000000 1900.000000 0.000000 98001.000000
25% 1190.000000 0.000000 1951.000000 0.000000 98033.000000
50% 1560.000000 0.000000 1975.000000 0.000000 98065.000000
75% 2210.000000 560.000000 1997.000000 0.000000 98118.000000
max 9410.000000 4820.000000 2015.000000 2015.000000 98199.000000
lat long sqft_living15 sqft_lot15
count 21613.000000 21613.000000 21613.000000 21613.000000
mean 47.560053 -122.213896 1986.552492 12768.455652
std 0.138564 0.140828 685.391304 27304.179631
min 47.155900 -122.519000 399.000000 651.000000
25% 47.471000 -122.328000 1490.000000 5100.000000
50% 47.571800 -122.230000 1840.000000 7620.000000
75% 47.678000 -122.125000 2360.000000 10083.000000
max 47.777600 -121.315000 6210.000000 871200.000000
Количество строк и столбцов data_frame: (21613, 20)
Названия столбцов: Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
'sqft_living15', 'sqft_lot15'],
dtype='object')
Типы данных каждого столбца:
date object
price float64
bedrooms int64
bathrooms float64
sqft_living int64
sqft_lot int64
floors float64
waterfront int64
view int64
condition int64
grade int64
sqft_above int64
sqft_basement int64
yr_built int64
yr_renovated int64
zipcode int64
lat float64
long float64
sqft_living15 int64
sqft_lot15 int64
dtype: object
Количество пропущенных значений в каждом столбце:
date 0
price 0
bedrooms 0
bathrooms 0
sqft_living 0
sqft_lot 0
floors 0
waterfront 0
view 0
condition 0
grade 0
sqft_above 0
sqft_basement 0
yr_built 0
yr_renovated 0
zipcode 0
lat 0
long 0
sqft_living15 0
sqft_lot15 0
dtype: int64
Получение сведений о колонках
In [32]:
# Список всех столбцов
print(f"Список всех столбцов: {data_frame.columns}\n")
# Типы данных каждого столбца
print("Типы данных каждого столбца:")
print(data_frame.dtypes, "\n")
# Описание всех столбцов
print("Описание всех столбцов DataFrame:")
print(data_frame.describe(include="all"), "\n")
# Количество пропущенных значений в каждом столбце
print("Количество пропущенных значений в каждом столбце:")
print(data_frame.isnull().sum(), "\n")
# Количество уникальных значений в столбце 'date'
print("Количество уникальных значений в столбце 'date':")
print(data_frame["date"].value_counts(), "\n")
Список всех столбцов: Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'], dtype='object') Типы данных каждого столбца: date object price float64 bedrooms int64 bathrooms float64 sqft_living int64 sqft_lot int64 floors float64 waterfront int64 view int64 condition int64 grade int64 sqft_above int64 sqft_basement int64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 sqft_living15 int64 sqft_lot15 int64 dtype: object Описание всех столбцов DataFrame: date price bedrooms bathrooms \ count 21613 2.161300e+04 21613.000000 21613.000000 unique 372 NaN NaN NaN top 20140623T000000 NaN NaN NaN freq 142 NaN NaN NaN mean NaN 5.400881e+05 3.370842 2.114757 std NaN 3.671272e+05 0.930062 0.770163 min NaN 7.500000e+04 0.000000 0.000000 25% NaN 3.219500e+05 3.000000 1.750000 50% NaN 4.500000e+05 3.000000 2.250000 75% NaN 6.450000e+05 4.000000 2.500000 max NaN 7.700000e+06 33.000000 8.000000 sqft_living sqft_lot floors waterfront view \ count 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 unique NaN NaN NaN NaN NaN top NaN NaN NaN NaN NaN freq NaN NaN NaN NaN NaN mean 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 std 918.440897 4.142051e+04 0.539989 0.086517 0.766318 min 290.000000 5.200000e+02 1.000000 0.000000 0.000000 25% 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 50% 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 75% 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 max 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 condition grade sqft_above sqft_basement yr_built \ count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 unique NaN NaN NaN NaN NaN top NaN NaN NaN NaN NaN freq NaN NaN NaN NaN NaN mean 3.409430 7.656873 1788.390691 291.509045 1971.005136 std 0.650743 1.175459 828.090978 442.575043 29.373411 min 1.000000 1.000000 290.000000 0.000000 1900.000000 25% 3.000000 7.000000 1190.000000 0.000000 1951.000000 50% 3.000000 7.000000 1560.000000 0.000000 1975.000000 75% 4.000000 8.000000 2210.000000 560.000000 1997.000000 max 5.000000 13.000000 9410.000000 4820.000000 2015.000000 yr_renovated zipcode lat long sqft_living15 \ count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 unique NaN NaN NaN NaN NaN top NaN NaN NaN NaN NaN freq NaN NaN NaN NaN NaN mean 84.402258 98077.939805 47.560053 -122.213896 1986.552492 std 401.679240 53.505026 0.138564 0.140828 685.391304 min 0.000000 98001.000000 47.155900 -122.519000 399.000000 25% 0.000000 98033.000000 47.471000 -122.328000 1490.000000 50% 0.000000 98065.000000 47.571800 -122.230000 1840.000000 75% 0.000000 98118.000000 47.678000 -122.125000 2360.000000 max 2015.000000 98199.000000 47.777600 -121.315000 6210.000000 sqft_lot15 count 21613.000000 unique NaN top NaN freq NaN mean 12768.455652 std 27304.179631 min 651.000000 25% 5100.000000 50% 7620.000000 75% 10083.000000 max 871200.000000 Количество пропущенных значений в каждом столбце: date 0 price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 sqft_living15 0 sqft_lot15 0 dtype: int64 Количество уникальных значений в столбце 'date': date 20140623T000000 142 20140626T000000 131 20140625T000000 131 20140708T000000 127 20150427T000000 126 ... 20150131T000000 1 20150117T000000 1 20150308T000000 1 20150515T000000 1 20140803T000000 1 Name: count, Length: 372, dtype: int64
Вывод строки и стобца
In [33]:
# Вывод столбца 'date'
print("Столбец 'date':")
print(data_frame["date"], "\n")
# Вывод строки с индексом 2
print("Строка с индексом 2:")
print(data_frame.iloc[2], "\n")
# Вывод значения в первой строке и столбце 'date'
print("Значение в первой строке и столбце 'date':")
print(data_frame.iloc[0, 2], "\n")
# Вывод конкретного значения по метке строки и имени столбца
print("Значение в строке с индексом 1 и столбце 'date':")
print(data_frame.loc[7129300520, "date"], "\n")
# Вывод нескольких столбцов 'date' и 'price'
print("Столбцы 'date' и 'price':")
print(data_frame[["date", "price"]], "\n")
# Вывод первых двух строк
print("Первые две строки DataFrame:")
print(data_frame.iloc[:2], "\n")
Столбец 'date': id 7129300520 20141013T000000 6414100192 20141209T000000 5631500400 20150225T000000 2487200875 20141209T000000 1954400510 20150218T000000 ... 263000018 20140521T000000 6600060120 20150223T000000 1523300141 20140623T000000 291310100 20150116T000000 1523300157 20141015T000000 Name: date, Length: 21613, dtype: object Строка с индексом 2: date 20150225T000000 price 180000.0 bedrooms 2 bathrooms 1.0 sqft_living 770 sqft_lot 10000 floors 1.0 waterfront 0 view 0 condition 3 grade 6 sqft_above 770 sqft_basement 0 yr_built 1933 yr_renovated 0 zipcode 98028 lat 47.7379 long -122.233 sqft_living15 2720 sqft_lot15 8062 Name: 5631500400, dtype: object Значение в первой строке и столбце 'date': 3 Значение в строке с индексом 1 и столбце 'date': 20141013T000000 Столбцы 'date' и 'price': date price id 7129300520 20141013T000000 221900.0 6414100192 20141209T000000 538000.0 5631500400 20150225T000000 180000.0 2487200875 20141209T000000 604000.0 1954400510 20150218T000000 510000.0 ... ... ... 263000018 20140521T000000 360000.0 6600060120 20150223T000000 400000.0 1523300141 20140623T000000 402101.0 291310100 20150116T000000 400000.0 1523300157 20141015T000000 325000.0 [21613 rows x 2 columns] Первые две строки DataFrame: date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 sqft_lot floors waterfront view condition grade sqft_above \ id 7129300520 5650 1.0 0 0 3 7 1180 6414100192 7242 2.0 0 0 3 7 2170 sqft_basement yr_built yr_renovated zipcode lat long \ id 7129300520 0 1955 0 98178 47.5112 -122.257 6414100192 400 1951 1991 98125 47.7210 -122.319 sqft_living15 sqft_lot15 id 7129300520 1340 5650 6414100192 1690 7639
Группировка и агрегация данных
In [34]:
# 1. Средняя цена по количеству спален
print("Средняя цена по количеству спален:")
print(data_frame.groupby("bedrooms")["price"].mean())
# 2. Количество продаж по почтовому индексу
print("\nКоличество продаж по почтовому индексу:")
print(data_frame.groupby("zipcode")["price"].count())
# 3. Максимальная цена по количеству ванных комнат
print("\nМаксимальная цена по количеству ванных комнат:")
print(data_frame.groupby("bathrooms")["price"].max())
# 4. Общая цена по количеству этажей
print("\nОбщая цена по количеству этажей:")
print(data_frame.groupby("floors")["price"].sum())
Средняя цена по количеству спален: bedrooms 0 4.095038e+05 1 3.176429e+05 2 4.013727e+05 3 4.662321e+05 4 6.354195e+05 5 7.865998e+05 6 8.255206e+05 7 9.511847e+05 8 1.105077e+06 9 8.939998e+05 10 8.193333e+05 11 5.200000e+05 33 6.400000e+05 Name: price, dtype: float64 Количество продаж по почтовому индексу: zipcode 98001 362 98002 199 98003 280 98004 317 98005 168 ... 98177 255 98178 262 98188 136 98198 280 98199 317 Name: price, Length: 70, dtype: int64 Максимальная цена по количеству ванных комнат: bathrooms 0.00 1295650.0 0.50 312500.0 0.75 785000.0 1.00 1300000.0 1.25 1388000.0 1.50 1500000.0 1.75 3278000.0 2.00 2200000.0 2.25 2400000.0 2.50 3070000.0 2.75 2700000.0 3.00 4489000.0 3.25 3640900.0 3.50 3710000.0 3.75 3650000.0 4.00 3400000.0 4.25 3850000.0 4.50 7062500.0 4.75 3650000.0 5.00 5350000.0 5.25 5110800.0 5.50 4500000.0 5.75 5570000.0 6.00 5300000.0 6.25 3300000.0 6.50 2238890.0 6.75 4668000.0 7.50 450000.0 7.75 6885000.0 8.00 7700000.0 Name: price, dtype: float64 Общая цена по количеству этажей: floors 1.0 4.722489e+09 1.5 1.067653e+09 2.0 5.347512e+09 2.5 1.707158e+08 3.0 3.570885e+08 3.5 7.466500e+06 Name: price, dtype: float64
Сортировка данных
In [35]:
print("Сортировка по цене (возрастание):")
print(data_frame.sort_values(by="price"))
# 2. Сортировка по цене (убывание)
print("\nСортировка по цене (убывание):")
print(data_frame.sort_values(by="price", ascending=False))
# 3. Сортировка по количеству спален и затем по цене (возрастание)
print("\nСортировка по количеству спален и затем по цене (возрастание):")
print(data_frame.sort_values(by=["bedrooms", "price"]))
# 4. Сортировка по почтовому индексу и количеству ванных комнат (убывание)
print("\nСортировка по почтовому индексу и количеству ванных комнат (убывание):")
print(data_frame.sort_values(by=["zipcode", "bathrooms"], ascending=[False, False]))
Сортировка по цене (возрастание): date price bedrooms bathrooms sqft_living \ id 3421079032 20150217T000000 75000.0 1 0.00 670 40000362 20140506T000000 78000.0 2 1.00 780 8658300340 20140523T000000 80000.0 1 0.75 430 3028200080 20150324T000000 81000.0 2 1.00 730 3883800011 20141105T000000 82000.0 3 1.00 860 ... ... ... ... ... ... 8907500070 20150413T000000 5350000.0 5 5.00 8000 2470100110 20140804T000000 5570000.0 5 5.75 9200 9208900037 20140919T000000 6885000.0 6 7.75 9890 9808700762 20140611T000000 7062500.0 5 4.50 10040 6762700020 20141013T000000 7700000.0 6 8.00 12050 sqft_lot floors waterfront view condition grade sqft_above \ id 3421079032 43377 1.0 0 0 3 3 670 40000362 16344 1.0 0 0 1 5 780 8658300340 5050 1.0 0 0 2 4 430 3028200080 9975 1.0 0 0 1 5 730 3883800011 10426 1.0 0 0 3 6 860 ... ... ... ... ... ... ... ... 8907500070 23985 2.0 0 4 3 12 6720 2470100110 35069 2.0 0 0 3 13 6200 9208900037 31374 2.0 0 4 3 13 8860 9808700762 37325 2.0 1 2 3 11 7680 6762700020 27600 2.5 0 3 4 13 8570 sqft_basement yr_built yr_renovated zipcode lat long \ id 3421079032 0 1966 0 98022 47.2638 -121.906 40000362 0 1942 0 98168 47.4739 -122.280 8658300340 0 1912 0 98014 47.6499 -121.909 3028200080 0 1943 0 98168 47.4808 -122.315 3883800011 0 1954 0 98146 47.4987 -122.341 ... ... ... ... ... ... ... 8907500070 1280 2009 0 98004 47.6232 -122.220 2470100110 3000 2001 0 98039 47.6289 -122.233 9208900037 1030 2001 0 98039 47.6305 -122.240 9808700762 2360 1940 2001 98004 47.6500 -122.214 6762700020 3480 1910 1987 98102 47.6298 -122.323 sqft_living15 sqft_lot15 id 3421079032 1160 42882 40000362 1700 10387 8658300340 1200 7500 3028200080 860 9000 3883800011 1140 11250 ... ... ... 8907500070 4600 21750 2470100110 3560 24345 9208900037 4540 42730 9808700762 3930 25449 6762700020 3940 8800 [21613 rows x 20 columns] Сортировка по цене (убывание): date price bedrooms bathrooms sqft_living \ id 6762700020 20141013T000000 7700000.0 6 8.00 12050 9808700762 20140611T000000 7062500.0 5 4.50 10040 9208900037 20140919T000000 6885000.0 6 7.75 9890 2470100110 20140804T000000 5570000.0 5 5.75 9200 8907500070 20150413T000000 5350000.0 5 5.00 8000 ... ... ... ... ... ... 3883800011 20141105T000000 82000.0 3 1.00 860 3028200080 20150324T000000 81000.0 2 1.00 730 8658300340 20140523T000000 80000.0 1 0.75 430 40000362 20140506T000000 78000.0 2 1.00 780 3421079032 20150217T000000 75000.0 1 0.00 670 sqft_lot floors waterfront view condition grade sqft_above \ id 6762700020 27600 2.5 0 3 4 13 8570 9808700762 37325 2.0 1 2 3 11 7680 9208900037 31374 2.0 0 4 3 13 8860 2470100110 35069 2.0 0 0 3 13 6200 8907500070 23985 2.0 0 4 3 12 6720 ... ... ... ... ... ... ... ... 3883800011 10426 1.0 0 0 3 6 860 3028200080 9975 1.0 0 0 1 5 730 8658300340 5050 1.0 0 0 2 4 430 40000362 16344 1.0 0 0 1 5 780 3421079032 43377 1.0 0 0 3 3 670 sqft_basement yr_built yr_renovated zipcode lat long \ id 6762700020 3480 1910 1987 98102 47.6298 -122.323 9808700762 2360 1940 2001 98004 47.6500 -122.214 9208900037 1030 2001 0 98039 47.6305 -122.240 2470100110 3000 2001 0 98039 47.6289 -122.233 8907500070 1280 2009 0 98004 47.6232 -122.220 ... ... ... ... ... ... ... 3883800011 0 1954 0 98146 47.4987 -122.341 3028200080 0 1943 0 98168 47.4808 -122.315 8658300340 0 1912 0 98014 47.6499 -121.909 40000362 0 1942 0 98168 47.4739 -122.280 3421079032 0 1966 0 98022 47.2638 -121.906 sqft_living15 sqft_lot15 id 6762700020 3940 8800 9808700762 3930 25449 9208900037 4540 42730 2470100110 3560 24345 8907500070 4600 21750 ... ... ... 3883800011 1140 11250 3028200080 860 9000 8658300340 1200 7500 40000362 1700 10387 3421079032 1160 42882 [21613 rows x 20 columns] Сортировка по количеству спален и затем по цене (возрастание): date price bedrooms bathrooms sqft_living \ id 9543000205 20150413T000000 139950.0 0 0.00 844 3980300371 20140926T000000 142000.0 0 0.00 290 6896300380 20141002T000000 228000.0 0 1.00 390 7849202190 20141223T000000 235000.0 0 0.00 1470 2310060040 20140925T000000 240000.0 0 2.50 1810 ... ... ... ... ... ... 5566100170 20141029T000000 650000.0 10 2.00 3610 8812401450 20141229T000000 660000.0 10 3.00 2920 627300145 20140814T000000 1148000.0 10 5.25 4590 1773100755 20140821T000000 520000.0 11 3.00 3000 2402100895 20140625T000000 640000.0 33 1.75 1620 sqft_lot floors waterfront view condition grade sqft_above \ id 9543000205 4269 1.0 0 0 4 7 844 3980300371 20875 1.0 0 0 1 1 290 6896300380 5900 1.0 0 0 2 4 390 7849202190 4800 2.0 0 0 3 7 1470 2310060040 5669 2.0 0 0 3 7 1810 ... ... ... ... ... ... ... ... 5566100170 11914 2.0 0 0 4 7 3010 8812401450 3745 2.0 0 0 4 7 1860 627300145 10920 1.0 0 2 3 9 2500 1773100755 4960 2.0 0 0 3 7 2400 2402100895 6000 1.0 0 0 5 7 1040 sqft_basement yr_built yr_renovated zipcode lat long \ id 9543000205 0 1913 0 98001 47.2781 -122.250 3980300371 0 1963 0 98024 47.5308 -121.888 6896300380 0 1953 0 98118 47.5260 -122.261 7849202190 0 1996 0 98065 47.5265 -121.828 2310060040 0 2003 0 98038 47.3493 -122.053 ... ... ... ... ... ... ... 5566100170 600 1958 0 98006 47.5705 -122.175 8812401450 1060 1913 0 98105 47.6635 -122.320 627300145 2090 2008 0 98004 47.5861 -122.113 1773100755 600 1918 1999 98106 47.5560 -122.363 2402100895 580 1947 0 98103 47.6878 -122.331 sqft_living15 sqft_lot15 id 9543000205 1380 9600 3980300371 1620 22850 6896300380 2170 6000 7849202190 1060 7200 2310060040 1810 5685 ... ... ... 5566100170 2040 11914 8812401450 1810 3745 627300145 2730 10400 1773100755 1420 4960 2402100895 1330 4700 [21613 rows x 20 columns] Сортировка по почтовому индексу и количеству ванных комнат (убывание): date price bedrooms bathrooms sqft_living \ id 1068000375 20140923T000000 3200000.0 6 5.00 7100 3271800295 20150203T000000 1569500.0 5 4.50 5620 1370802115 20141205T000000 1925000.0 3 4.50 3950 1370802455 20140813T000000 1050000.0 4 4.50 3180 2771604190 20140617T000000 824000.0 7 4.25 3670 ... ... ... ... ... ... 1312900180 20150325T000000 225000.0 3 1.00 1250 3356403400 20140724T000000 159000.0 3 1.00 1360 1278000210 20150311T000000 110000.0 2 1.00 828 4045700455 20150316T000000 363000.0 3 0.75 2510 9543000205 20150413T000000 139950.0 0 0.00 844 sqft_lot floors waterfront view condition grade sqft_above \ id 1068000375 18200 2.5 0 0 3 13 5240 3271800295 5800 3.0 0 3 3 11 4700 1370802115 6134 2.0 0 3 3 11 2880 1370802455 4606 2.0 0 3 4 9 1990 2771604190 4000 2.0 0 1 3 8 2800 ... ... ... ... ... ... ... ... 1312900180 7820 1.0 0 0 3 7 1250 3356403400 20000 1.0 0 0 4 7 1360 1278000210 4524 1.0 0 0 3 6 828 4045700455 20000 2.0 0 0 4 7 2510 9543000205 4269 1.0 0 0 4 7 844 sqft_basement yr_built yr_renovated zipcode lat long \ id 1068000375 1860 1933 2002 98199 47.6427 -122.408 3271800295 920 1999 0 98199 47.6482 -122.412 1370802115 1070 1998 0 98199 47.6413 -122.405 1370802455 1190 1929 0 98199 47.6402 -122.405 2771604190 870 1964 0 98199 47.6375 -122.388 ... ... ... ... ... ... ... 1312900180 0 1967 0 98001 47.3397 -122.291 3356403400 0 1953 0 98001 47.2861 -122.253 1278000210 0 1968 2007 98001 47.2655 -122.244 4045700455 0 1961 0 98001 47.2871 -122.287 9543000205 0 1913 0 98001 47.2781 -122.250 sqft_living15 sqft_lot15 id 1068000375 3130 6477 3271800295 2360 5800 1370802115 3050 5281 1370802455 2110 5323 2771604190 2010 4000 ... ... ... 1312900180 1300 7920 3356403400 1530 9997 1278000210 828 5402 4045700455 2130 20000 9543000205 1380 9600 [21613 rows x 20 columns]
Удаление строк и столбцов
In [36]:
print("Удаление строки с индексом 1736800520:")
print(data_frame.drop(index=1736800520))
# 2. Удаление нескольких строк по индексам (например, удаляем строки с индексами 0 и 2)
print("\nУдаление строк с индексами 1736800520 и 6300500875:")
print(data_frame.drop(index=[1736800520, 6300500875]))
# 3. Удаление столбца по имени (например, удаляем столбец 'zipcode')
print("\nУдаление столбца 'zipcode':")
print(data_frame.drop(columns="zipcode"))
# 4. Удаление нескольких столбцов (например, 'bathrooms' и 'floors')
print("\nУдаление столбцов 'bathrooms' и 'floors':")
print(data_frame.drop(columns=["bathrooms", "floors"]))
Удаление строки с индексом 1736800520: date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition grade sqft_above \ id 7129300520 5650 1.0 0 0 3 7 1180 6414100192 7242 2.0 0 0 3 7 2170 5631500400 10000 1.0 0 0 3 6 770 2487200875 5000 1.0 0 0 5 7 1050 1954400510 8080 1.0 0 0 3 8 1680 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 8 1530 6600060120 5813 2.0 0 0 3 8 2310 1523300141 1350 2.0 0 0 3 7 1020 291310100 2388 2.0 0 0 3 8 1600 1523300157 1076 2.0 0 0 3 7 1020 sqft_basement yr_built yr_renovated zipcode lat long \ id 7129300520 0 1955 0 98178 47.5112 -122.257 6414100192 400 1951 1991 98125 47.7210 -122.319 5631500400 0 1933 0 98028 47.7379 -122.233 2487200875 910 1965 0 98136 47.5208 -122.393 1954400510 0 1987 0 98074 47.6168 -122.045 ... ... ... ... ... ... ... 263000018 0 2009 0 98103 47.6993 -122.346 6600060120 0 2014 0 98146 47.5107 -122.362 1523300141 0 2009 0 98144 47.5944 -122.299 291310100 0 2004 0 98027 47.5345 -122.069 1523300157 0 2008 0 98144 47.5941 -122.299 sqft_living15 sqft_lot15 id 7129300520 1340 5650 6414100192 1690 7639 5631500400 2720 8062 2487200875 1360 5000 1954400510 1800 7503 ... ... ... 263000018 1530 1509 6600060120 1830 7200 1523300141 1020 2007 291310100 1410 1287 1523300157 1020 1357 [21612 rows x 20 columns] Удаление строк с индексами 1736800520 и 6300500875: date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition grade sqft_above \ id 7129300520 5650 1.0 0 0 3 7 1180 6414100192 7242 2.0 0 0 3 7 2170 5631500400 10000 1.0 0 0 3 6 770 2487200875 5000 1.0 0 0 5 7 1050 1954400510 8080 1.0 0 0 3 8 1680 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 8 1530 6600060120 5813 2.0 0 0 3 8 2310 1523300141 1350 2.0 0 0 3 7 1020 291310100 2388 2.0 0 0 3 8 1600 1523300157 1076 2.0 0 0 3 7 1020 sqft_basement yr_built yr_renovated zipcode lat long \ id 7129300520 0 1955 0 98178 47.5112 -122.257 6414100192 400 1951 1991 98125 47.7210 -122.319 5631500400 0 1933 0 98028 47.7379 -122.233 2487200875 910 1965 0 98136 47.5208 -122.393 1954400510 0 1987 0 98074 47.6168 -122.045 ... ... ... ... ... ... ... 263000018 0 2009 0 98103 47.6993 -122.346 6600060120 0 2014 0 98146 47.5107 -122.362 1523300141 0 2009 0 98144 47.5944 -122.299 291310100 0 2004 0 98027 47.5345 -122.069 1523300157 0 2008 0 98144 47.5941 -122.299 sqft_living15 sqft_lot15 id 7129300520 1340 5650 6414100192 1690 7639 5631500400 2720 8062 2487200875 1360 5000 1954400510 1800 7503 ... ... ... 263000018 1530 1509 6600060120 1830 7200 1523300141 1020 2007 291310100 1410 1287 1523300157 1020 1357 [21611 rows x 20 columns] Удаление столбца 'zipcode': date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition grade sqft_above \ id 7129300520 5650 1.0 0 0 3 7 1180 6414100192 7242 2.0 0 0 3 7 2170 5631500400 10000 1.0 0 0 3 6 770 2487200875 5000 1.0 0 0 5 7 1050 1954400510 8080 1.0 0 0 3 8 1680 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 8 1530 6600060120 5813 2.0 0 0 3 8 2310 1523300141 1350 2.0 0 0 3 7 1020 291310100 2388 2.0 0 0 3 8 1600 1523300157 1076 2.0 0 0 3 7 1020 sqft_basement yr_built yr_renovated lat long \ id 7129300520 0 1955 0 47.5112 -122.257 6414100192 400 1951 1991 47.7210 -122.319 5631500400 0 1933 0 47.7379 -122.233 2487200875 910 1965 0 47.5208 -122.393 1954400510 0 1987 0 47.6168 -122.045 ... ... ... ... ... ... 263000018 0 2009 0 47.6993 -122.346 6600060120 0 2014 0 47.5107 -122.362 1523300141 0 2009 0 47.5944 -122.299 291310100 0 2004 0 47.5345 -122.069 1523300157 0 2008 0 47.5941 -122.299 sqft_living15 sqft_lot15 id 7129300520 1340 5650 6414100192 1690 7639 5631500400 2720 8062 2487200875 1360 5000 1954400510 1800 7503 ... ... ... 263000018 1530 1509 6600060120 1830 7200 1523300141 1020 2007 291310100 1410 1287 1523300157 1020 1357 [21613 rows x 19 columns] Удаление столбцов 'bathrooms' и 'floors': date price bedrooms sqft_living sqft_lot \ id 7129300520 20141013T000000 221900.0 3 1180 5650 6414100192 20141209T000000 538000.0 3 2570 7242 5631500400 20150225T000000 180000.0 2 770 10000 2487200875 20141209T000000 604000.0 4 1960 5000 1954400510 20150218T000000 510000.0 3 1680 8080 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 1530 1131 6600060120 20150223T000000 400000.0 4 2310 5813 1523300141 20140623T000000 402101.0 2 1020 1350 291310100 20150116T000000 400000.0 3 1600 2388 1523300157 20141015T000000 325000.0 2 1020 1076 waterfront view condition grade sqft_above sqft_basement \ id 7129300520 0 0 3 7 1180 0 6414100192 0 0 3 7 2170 400 5631500400 0 0 3 6 770 0 2487200875 0 0 5 7 1050 910 1954400510 0 0 3 8 1680 0 ... ... ... ... ... ... ... 263000018 0 0 3 8 1530 0 6600060120 0 0 3 8 2310 0 1523300141 0 0 3 7 1020 0 291310100 0 0 3 8 1600 0 1523300157 0 0 3 7 1020 0 yr_built yr_renovated zipcode lat long sqft_living15 \ id 7129300520 1955 0 98178 47.5112 -122.257 1340 6414100192 1951 1991 98125 47.7210 -122.319 1690 5631500400 1933 0 98028 47.7379 -122.233 2720 2487200875 1965 0 98136 47.5208 -122.393 1360 1954400510 1987 0 98074 47.6168 -122.045 1800 ... ... ... ... ... ... ... 263000018 2009 0 98103 47.6993 -122.346 1530 6600060120 2014 0 98146 47.5107 -122.362 1830 1523300141 2009 0 98144 47.5944 -122.299 1020 291310100 2004 0 98027 47.5345 -122.069 1410 1523300157 2008 0 98144 47.5941 -122.299 1020 sqft_lot15 id 7129300520 5650 6414100192 7639 5631500400 8062 2487200875 5000 1954400510 7503 ... ... 263000018 1509 6600060120 7200 1523300141 2007 291310100 1287 1523300157 1357 [21613 rows x 18 columns]
Создание новых столбцов на основе данных из существующих столбцов
In [37]:
# 1. Создание нового столбца 'price_per_bedroom'
print("Создание нового столбца 'price_per_bedroom':")
data_frame["price_per_bedroom"] = data_frame["price"] / data_frame["bedrooms"]
print(data_frame)
# 2. Создание нового столбца 'total_rooms' (сумма спален и ванных комнат)
print("\nСоздание нового столбца 'total_rooms':")
data_frame["total_rooms"] = data_frame["bedrooms"] + data_frame["bathrooms"]
print(data_frame)
# 3. Создание нового столбца 'is_expensive' (определяем, дорогой ли дом)
print("\nСоздание нового столбца 'is_expensive':")
data_frame["is_expensive"] = data_frame["price"] > 300000
print(data_frame)
# 4. Создание нового столбца 'floor_area_ratio' (соотношение этажей к количеству спален)
print("\nСоздание нового столбца 'floor_area_ratio':")
data_frame["floor_area_ratio"] = data_frame["floors"] / data_frame["bedrooms"]
print(data_frame)
Создание нового столбца 'price_per_bedroom': date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... sqft_above \ id ... 7129300520 5650 1.0 0 0 3 ... 1180 6414100192 7242 2.0 0 0 3 ... 2170 5631500400 10000 1.0 0 0 3 ... 770 2487200875 5000 1.0 0 0 5 ... 1050 1954400510 8080 1.0 0 0 3 ... 1680 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 1530 6600060120 5813 2.0 0 0 3 ... 2310 1523300141 1350 2.0 0 0 3 ... 1020 291310100 2388 2.0 0 0 3 ... 1600 1523300157 1076 2.0 0 0 3 ... 1020 sqft_basement yr_built yr_renovated zipcode lat long \ id 7129300520 0 1955 0 98178 47.5112 -122.257 6414100192 400 1951 1991 98125 47.7210 -122.319 5631500400 0 1933 0 98028 47.7379 -122.233 2487200875 910 1965 0 98136 47.5208 -122.393 1954400510 0 1987 0 98074 47.6168 -122.045 ... ... ... ... ... ... ... 263000018 0 2009 0 98103 47.6993 -122.346 6600060120 0 2014 0 98146 47.5107 -122.362 1523300141 0 2009 0 98144 47.5944 -122.299 291310100 0 2004 0 98027 47.5345 -122.069 1523300157 0 2008 0 98144 47.5941 -122.299 sqft_living15 sqft_lot15 price_per_bedroom id 7129300520 1340 5650 73966.666667 6414100192 1690 7639 179333.333333 5631500400 2720 8062 90000.000000 2487200875 1360 5000 151000.000000 1954400510 1800 7503 170000.000000 ... ... ... ... 263000018 1530 1509 120000.000000 6600060120 1830 7200 100000.000000 1523300141 1020 2007 201050.500000 291310100 1410 1287 133333.333333 1523300157 1020 1357 162500.000000 [21613 rows x 21 columns] Создание нового столбца 'total_rooms': date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... sqft_basement \ id ... 7129300520 5650 1.0 0 0 3 ... 0 6414100192 7242 2.0 0 0 3 ... 400 5631500400 10000 1.0 0 0 3 ... 0 2487200875 5000 1.0 0 0 5 ... 910 1954400510 8080 1.0 0 0 3 ... 0 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 0 6600060120 5813 2.0 0 0 3 ... 0 1523300141 1350 2.0 0 0 3 ... 0 291310100 2388 2.0 0 0 3 ... 0 1523300157 1076 2.0 0 0 3 ... 0 yr_built yr_renovated zipcode lat long sqft_living15 \ id 7129300520 1955 0 98178 47.5112 -122.257 1340 6414100192 1951 1991 98125 47.7210 -122.319 1690 5631500400 1933 0 98028 47.7379 -122.233 2720 2487200875 1965 0 98136 47.5208 -122.393 1360 1954400510 1987 0 98074 47.6168 -122.045 1800 ... ... ... ... ... ... ... 263000018 2009 0 98103 47.6993 -122.346 1530 6600060120 2014 0 98146 47.5107 -122.362 1830 1523300141 2009 0 98144 47.5944 -122.299 1020 291310100 2004 0 98027 47.5345 -122.069 1410 1523300157 2008 0 98144 47.5941 -122.299 1020 sqft_lot15 price_per_bedroom total_rooms id 7129300520 5650 73966.666667 4.00 6414100192 7639 179333.333333 5.25 5631500400 8062 90000.000000 3.00 2487200875 5000 151000.000000 7.00 1954400510 7503 170000.000000 5.00 ... ... ... ... 263000018 1509 120000.000000 5.50 6600060120 7200 100000.000000 6.50 1523300141 2007 201050.500000 2.75 291310100 1287 133333.333333 5.50 1523300157 1357 162500.000000 2.75 [21613 rows x 22 columns] Создание нового столбца 'is_expensive': date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... yr_built \ id ... 7129300520 5650 1.0 0 0 3 ... 1955 6414100192 7242 2.0 0 0 3 ... 1951 5631500400 10000 1.0 0 0 3 ... 1933 2487200875 5000 1.0 0 0 5 ... 1965 1954400510 8080 1.0 0 0 3 ... 1987 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 2009 6600060120 5813 2.0 0 0 3 ... 2014 1523300141 1350 2.0 0 0 3 ... 2009 291310100 2388 2.0 0 0 3 ... 2004 1523300157 1076 2.0 0 0 3 ... 2008 yr_renovated zipcode lat long sqft_living15 \ id 7129300520 0 98178 47.5112 -122.257 1340 6414100192 1991 98125 47.7210 -122.319 1690 5631500400 0 98028 47.7379 -122.233 2720 2487200875 0 98136 47.5208 -122.393 1360 1954400510 0 98074 47.6168 -122.045 1800 ... ... ... ... ... ... 263000018 0 98103 47.6993 -122.346 1530 6600060120 0 98146 47.5107 -122.362 1830 1523300141 0 98144 47.5944 -122.299 1020 291310100 0 98027 47.5345 -122.069 1410 1523300157 0 98144 47.5941 -122.299 1020 sqft_lot15 price_per_bedroom total_rooms is_expensive id 7129300520 5650 73966.666667 4.00 False 6414100192 7639 179333.333333 5.25 True 5631500400 8062 90000.000000 3.00 False 2487200875 5000 151000.000000 7.00 True 1954400510 7503 170000.000000 5.00 True ... ... ... ... ... 263000018 1509 120000.000000 5.50 True 6600060120 7200 100000.000000 6.50 True 1523300141 2007 201050.500000 2.75 True 291310100 1287 133333.333333 5.50 True 1523300157 1357 162500.000000 2.75 True [21613 rows x 23 columns] Создание нового столбца 'floor_area_ratio': date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... yr_renovated \ id ... 7129300520 5650 1.0 0 0 3 ... 0 6414100192 7242 2.0 0 0 3 ... 1991 5631500400 10000 1.0 0 0 3 ... 0 2487200875 5000 1.0 0 0 5 ... 0 1954400510 8080 1.0 0 0 3 ... 0 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 0 6600060120 5813 2.0 0 0 3 ... 0 1523300141 1350 2.0 0 0 3 ... 0 291310100 2388 2.0 0 0 3 ... 0 1523300157 1076 2.0 0 0 3 ... 0 zipcode lat long sqft_living15 sqft_lot15 \ id 7129300520 98178 47.5112 -122.257 1340 5650 6414100192 98125 47.7210 -122.319 1690 7639 5631500400 98028 47.7379 -122.233 2720 8062 2487200875 98136 47.5208 -122.393 1360 5000 1954400510 98074 47.6168 -122.045 1800 7503 ... ... ... ... ... ... 263000018 98103 47.6993 -122.346 1530 1509 6600060120 98146 47.5107 -122.362 1830 7200 1523300141 98144 47.5944 -122.299 1020 2007 291310100 98027 47.5345 -122.069 1410 1287 1523300157 98144 47.5941 -122.299 1020 1357 price_per_bedroom total_rooms is_expensive floor_area_ratio id 7129300520 73966.666667 4.00 False 0.333333 6414100192 179333.333333 5.25 True 0.666667 5631500400 90000.000000 3.00 False 0.500000 2487200875 151000.000000 7.00 True 0.250000 1954400510 170000.000000 5.00 True 0.333333 ... ... ... ... ... 263000018 120000.000000 5.50 True 1.000000 6600060120 100000.000000 6.50 True 0.500000 1523300141 201050.500000 2.75 True 1.000000 291310100 133333.333333 5.50 True 0.666667 1523300157 162500.000000 2.75 True 1.000000 [21613 rows x 24 columns]
Удаление строк с пустыми значениями
In [38]:
# 1. Исходный DataFrame с пустыми значениями
print("Исходный DataFrame с пустыми значениями:")
print(data_frame)
# 2. Удаление строк с любыми пустыми значениями
print("\nУдаление строк с любыми пустыми значениями:")
print(data_frame.dropna())
# 3. Удаление строк только с пустыми значениями в определенном столбце (например, 'price')
print("\nУдаление строк с пустыми значениями в столбце 'price':")
print(data_frame.dropna(subset=["price"]))
# 4. Удаление строк, где все значения пустые
print("\nУдаление строк, где все значения пустые:")
print(data_frame.dropna(how="all"))
Исходный DataFrame с пустыми значениями: date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... yr_renovated \ id ... 7129300520 5650 1.0 0 0 3 ... 0 6414100192 7242 2.0 0 0 3 ... 1991 5631500400 10000 1.0 0 0 3 ... 0 2487200875 5000 1.0 0 0 5 ... 0 1954400510 8080 1.0 0 0 3 ... 0 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 0 6600060120 5813 2.0 0 0 3 ... 0 1523300141 1350 2.0 0 0 3 ... 0 291310100 2388 2.0 0 0 3 ... 0 1523300157 1076 2.0 0 0 3 ... 0 zipcode lat long sqft_living15 sqft_lot15 \ id 7129300520 98178 47.5112 -122.257 1340 5650 6414100192 98125 47.7210 -122.319 1690 7639 5631500400 98028 47.7379 -122.233 2720 8062 2487200875 98136 47.5208 -122.393 1360 5000 1954400510 98074 47.6168 -122.045 1800 7503 ... ... ... ... ... ... 263000018 98103 47.6993 -122.346 1530 1509 6600060120 98146 47.5107 -122.362 1830 7200 1523300141 98144 47.5944 -122.299 1020 2007 291310100 98027 47.5345 -122.069 1410 1287 1523300157 98144 47.5941 -122.299 1020 1357 price_per_bedroom total_rooms is_expensive floor_area_ratio id 7129300520 73966.666667 4.00 False 0.333333 6414100192 179333.333333 5.25 True 0.666667 5631500400 90000.000000 3.00 False 0.500000 2487200875 151000.000000 7.00 True 0.250000 1954400510 170000.000000 5.00 True 0.333333 ... ... ... ... ... 263000018 120000.000000 5.50 True 1.000000 6600060120 100000.000000 6.50 True 0.500000 1523300141 201050.500000 2.75 True 1.000000 291310100 133333.333333 5.50 True 0.666667 1523300157 162500.000000 2.75 True 1.000000 [21613 rows x 24 columns] Удаление строк с любыми пустыми значениями: date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... yr_renovated \ id ... 7129300520 5650 1.0 0 0 3 ... 0 6414100192 7242 2.0 0 0 3 ... 1991 5631500400 10000 1.0 0 0 3 ... 0 2487200875 5000 1.0 0 0 5 ... 0 1954400510 8080 1.0 0 0 3 ... 0 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 0 6600060120 5813 2.0 0 0 3 ... 0 1523300141 1350 2.0 0 0 3 ... 0 291310100 2388 2.0 0 0 3 ... 0 1523300157 1076 2.0 0 0 3 ... 0 zipcode lat long sqft_living15 sqft_lot15 \ id 7129300520 98178 47.5112 -122.257 1340 5650 6414100192 98125 47.7210 -122.319 1690 7639 5631500400 98028 47.7379 -122.233 2720 8062 2487200875 98136 47.5208 -122.393 1360 5000 1954400510 98074 47.6168 -122.045 1800 7503 ... ... ... ... ... ... 263000018 98103 47.6993 -122.346 1530 1509 6600060120 98146 47.5107 -122.362 1830 7200 1523300141 98144 47.5944 -122.299 1020 2007 291310100 98027 47.5345 -122.069 1410 1287 1523300157 98144 47.5941 -122.299 1020 1357 price_per_bedroom total_rooms is_expensive floor_area_ratio id 7129300520 73966.666667 4.00 False 0.333333 6414100192 179333.333333 5.25 True 0.666667 5631500400 90000.000000 3.00 False 0.500000 2487200875 151000.000000 7.00 True 0.250000 1954400510 170000.000000 5.00 True 0.333333 ... ... ... ... ... 263000018 120000.000000 5.50 True 1.000000 6600060120 100000.000000 6.50 True 0.500000 1523300141 201050.500000 2.75 True 1.000000 291310100 133333.333333 5.50 True 0.666667 1523300157 162500.000000 2.75 True 1.000000 [21613 rows x 24 columns] Удаление строк с пустыми значениями в столбце 'price': date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... yr_renovated \ id ... 7129300520 5650 1.0 0 0 3 ... 0 6414100192 7242 2.0 0 0 3 ... 1991 5631500400 10000 1.0 0 0 3 ... 0 2487200875 5000 1.0 0 0 5 ... 0 1954400510 8080 1.0 0 0 3 ... 0 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 0 6600060120 5813 2.0 0 0 3 ... 0 1523300141 1350 2.0 0 0 3 ... 0 291310100 2388 2.0 0 0 3 ... 0 1523300157 1076 2.0 0 0 3 ... 0 zipcode lat long sqft_living15 sqft_lot15 \ id 7129300520 98178 47.5112 -122.257 1340 5650 6414100192 98125 47.7210 -122.319 1690 7639 5631500400 98028 47.7379 -122.233 2720 8062 2487200875 98136 47.5208 -122.393 1360 5000 1954400510 98074 47.6168 -122.045 1800 7503 ... ... ... ... ... ... 263000018 98103 47.6993 -122.346 1530 1509 6600060120 98146 47.5107 -122.362 1830 7200 1523300141 98144 47.5944 -122.299 1020 2007 291310100 98027 47.5345 -122.069 1410 1287 1523300157 98144 47.5941 -122.299 1020 1357 price_per_bedroom total_rooms is_expensive floor_area_ratio id 7129300520 73966.666667 4.00 False 0.333333 6414100192 179333.333333 5.25 True 0.666667 5631500400 90000.000000 3.00 False 0.500000 2487200875 151000.000000 7.00 True 0.250000 1954400510 170000.000000 5.00 True 0.333333 ... ... ... ... ... 263000018 120000.000000 5.50 True 1.000000 6600060120 100000.000000 6.50 True 0.500000 1523300141 201050.500000 2.75 True 1.000000 291310100 133333.333333 5.50 True 0.666667 1523300157 162500.000000 2.75 True 1.000000 [21613 rows x 24 columns] Удаление строк, где все значения пустые: date price bedrooms bathrooms sqft_living \ id 7129300520 20141013T000000 221900.0 3 1.00 1180 6414100192 20141209T000000 538000.0 3 2.25 2570 5631500400 20150225T000000 180000.0 2 1.00 770 2487200875 20141209T000000 604000.0 4 3.00 1960 1954400510 20150218T000000 510000.0 3 2.00 1680 ... ... ... ... ... ... 263000018 20140521T000000 360000.0 3 2.50 1530 6600060120 20150223T000000 400000.0 4 2.50 2310 1523300141 20140623T000000 402101.0 2 0.75 1020 291310100 20150116T000000 400000.0 3 2.50 1600 1523300157 20141015T000000 325000.0 2 0.75 1020 sqft_lot floors waterfront view condition ... yr_renovated \ id ... 7129300520 5650 1.0 0 0 3 ... 0 6414100192 7242 2.0 0 0 3 ... 1991 5631500400 10000 1.0 0 0 3 ... 0 2487200875 5000 1.0 0 0 5 ... 0 1954400510 8080 1.0 0 0 3 ... 0 ... ... ... ... ... ... ... ... 263000018 1131 3.0 0 0 3 ... 0 6600060120 5813 2.0 0 0 3 ... 0 1523300141 1350 2.0 0 0 3 ... 0 291310100 2388 2.0 0 0 3 ... 0 1523300157 1076 2.0 0 0 3 ... 0 zipcode lat long sqft_living15 sqft_lot15 \ id 7129300520 98178 47.5112 -122.257 1340 5650 6414100192 98125 47.7210 -122.319 1690 7639 5631500400 98028 47.7379 -122.233 2720 8062 2487200875 98136 47.5208 -122.393 1360 5000 1954400510 98074 47.6168 -122.045 1800 7503 ... ... ... ... ... ... 263000018 98103 47.6993 -122.346 1530 1509 6600060120 98146 47.5107 -122.362 1830 7200 1523300141 98144 47.5944 -122.299 1020 2007 291310100 98027 47.5345 -122.069 1410 1287 1523300157 98144 47.5941 -122.299 1020 1357 price_per_bedroom total_rooms is_expensive floor_area_ratio id 7129300520 73966.666667 4.00 False 0.333333 6414100192 179333.333333 5.25 True 0.666667 5631500400 90000.000000 3.00 False 0.500000 2487200875 151000.000000 7.00 True 0.250000 1954400510 170000.000000 5.00 True 0.333333 ... ... ... ... ... 263000018 120000.000000 5.50 True 1.000000 6600060120 100000.000000 6.50 True 0.500000 1523300141 201050.500000 2.75 True 1.000000 291310100 133333.333333 5.50 True 0.666667 1523300157 162500.000000 2.75 True 1.000000 [21613 rows x 24 columns]
Matplotlib
In [39]:
import matplotlib.pyplot as plt
# 1. Столбчатая диаграмма: средняя цена по количеству спален
data_frame.groupby("bedrooms")["price"].mean().plot.bar(color="skyblue")
plt.title("Средняя цена по количеству спален")
plt.xlabel("Количество спален")
plt.ylabel("Средняя цена")
plt.show()
# 2. Гистограмма: распределение цен
data_frame["price"].plot.hist(bins=30, color="orange", alpha=0.7)
plt.title("Гистограмма цен")
plt.xlabel("Цена")
plt.ylabel("Частота")
plt.show()
# 3. Ящик с усами: цена по количеству ванных комнат
data_frame.boxplot(column="price", by="bathrooms")
plt.title("Ящик с усами цен по количеству ванных комнат")
plt.suptitle("")
plt.xlabel("Количество ванных комнат")
plt.ylabel("Цена")
plt.show()
# 4. Диаграмма с областями: суммарная цена по количеству этажей
data_frame.groupby("floors")["price"].sum().plot.area(color="lightgreen", alpha=0.5)
plt.title("Суммарная цена по количеству этажей")
plt.xlabel("Количество этажей")
plt.ylabel("Суммарная цена")
plt.show()
# 5. Диаграмма рассеяния: цена vs. площадь
data_frame.plot.scatter(x="sqft_living", y="price", color="purple", alpha=0.5)
plt.title("Диаграмма рассеяния: Цена vs Площадь")
plt.xlabel("Площадь (sqft)")
plt.ylabel("Цена")
plt.show()