Business goals for this variant's dataset:

Predicting the danger of asteroid collisions with the planet;
predicting the danger asteroids pose through natural phenomena.

Technical project goals for each business goal:

For the first business goal, the technical goal is to identify the asteroids closest to Earth. For the second business goal, the technical goal is to determine whether an asteroid is included in the collision monitoring system.

Loading the data and inspecting the dataframe

In [85]:
import pandas as pd

# Load the dataset
df = pd.read_csv("data/neo.csv")
df.head()
Out[85]:
id name est_diameter_min est_diameter_max relative_velocity miss_distance orbiting_body sentry_object absolute_magnitude hazardous
0 2162635 162635 (2000 SS164) 1.198271 2.679415 13569.249224 5.483974e+07 Earth False 16.73 False
1 2277475 277475 (2005 WK4) 0.265800 0.594347 73588.726663 6.143813e+07 Earth False 20.00 True
2 2512244 512244 (2015 YE18) 0.722030 1.614507 114258.692129 4.979872e+07 Earth False 17.83 False
3 3596030 (2012 BV13) 0.096506 0.215794 24764.303138 2.543497e+07 Earth False 22.20 False
4 3667127 (2014 GE35) 0.255009 0.570217 42737.733765 4.627557e+07 Earth False 20.09 True
In [86]:
display(df.isnull().sum())
id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64
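
The null check shows no missing values in any column, so no imputation is required. As an optional extra check (a small sketch using standard pandas calls, not part of the original run), column types and exact duplicates can be inspected the same way:

display(df.dtypes)              # column types
display(df.duplicated().sum())  # count of fully duplicated rows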
In [87]:
from sklearn.model_selection import train_test_split


# Dataset 1: three numeric features plus the sentry_object target
data1 = df[["miss_distance", "relative_velocity", "absolute_magnitude", "sentry_object"]].copy()


# Stratified 60/20/20 split: 60% train, the remaining 40% later halved into validation and test
df1_train, df1_temp = train_test_split(
    data1,
    stratify=data1["sentry_object"],
    test_size=0.4,
    random_state=42
)


df1_val, df1_test = train_test_split(
    df1_temp,
    stratify=df1_temp["sentry_object"],
    test_size=0.5,
    random_state=42
)


X1_train = df1_train.drop(columns=["sentry_object"])
y1_train = df1_train["sentry_object"]

X1_val = df1_val.drop(columns=["sentry_object"])
y1_val = df1_val["sentry_object"]

X1_test = df1_test.drop(columns=["sentry_object"])
y1_test = df1_test["sentry_object"]

display(X1_train)
display(y1_train)
display(X1_test)

display(y1_test)


# Dataset 2: two numeric features plus the hazardous target
data2 = df[["miss_distance", "est_diameter_max", "hazardous"]].copy()

df2_train, df2_temp = train_test_split(
    data2,
    stratify=data2["hazardous"],
    test_size=0.4,
    random_state=42
)

df2_val, df2_test = train_test_split(
    df2_temp,
    stratify=df2_temp["hazardous"],
    test_size=0.5,
    random_state=42
)

X2_train = df2_train.drop(columns=["hazardous"])
y2_train = df2_train["hazardous"]

X2_val = df2_val.drop(columns=["hazardous"])
y2_val = df2_val["hazardous"]

X2_test = df2_test.drop(columns=["hazardous"])
y2_test = df2_test["hazardous"]
miss_distance relative_velocity absolute_magnitude
34465 5.635134e+07 21624.928164 24.80
90701 2.472948e+07 37209.587564 22.02
2005 2.109070e+07 47978.493043 22.80
6232 3.326166e+07 39237.251103 22.80
90498 6.815431e+07 65595.360821 26.02
... ... ... ...
39769 3.949974e+07 43610.074161 18.75
33446 5.578624e+07 72819.444430 19.82
30882 5.099273e+07 27834.911966 22.89
1952 3.616413e+07 58325.583576 26.15
42479 4.748265e+06 41999.243515 22.57

54501 rows × 3 columns

34465    False
90701    False
2005     False
6232     False
90498    False
         ...  
39769    False
33446    False
30882    False
1952     False
42479    False
Name: sentry_object, Length: 54501, dtype: bool
miss_distance relative_velocity absolute_magnitude
81186 1.462177e+07 25664.748520 25.20
15352 3.205802e+07 88237.170517 22.10
22821 4.267297e+07 31651.822174 20.67
81312 4.707998e+05 32573.816362 28.90
43938 4.127747e+07 83882.960168 24.80
... ... ... ...
88920 2.400532e+06 38120.569097 26.97
9896 1.778582e+07 46403.473964 18.52
80597 4.021212e+06 36312.854593 25.80
49831 5.650808e+07 50795.212086 20.30
13003 6.619616e+07 48753.197881 22.50

18168 rows × 3 columns

81186    False
15352    False
22821    False
81312    False
43938    False
         ...  
88920    False
9896     False
80597    False
49831    False
13003    False
Name: sentry_object, Length: 18168, dtype: bool
In [88]:
display("Обучающая выборка первого набора данных: ", df1_train.shape)
display(df1_train.absolute_magnitude.value_counts())

display("Контрольная выборка первого набора данных: ", df1_val.shape)
display(df1_val.absolute_magnitude.value_counts())

display("Тестовая выборка первого набора данных: ", df1_test.shape)
display(df1_test.absolute_magnitude.value_counts())

display("Обучающая выборка второго набора данных: ", df2_train.shape)
display(df2_train.est_diameter_max.value_counts())

display("Контрольная выборка второго набора данных: ", df2_val.shape)
display(df2_val.est_diameter_max.value_counts())

display("Тестовая выборка второго набора данных: ", df2_test.shape)
display(df2_test.est_diameter_max.value_counts())
'Training set of the first dataset: '
(54501, 4)
absolute_magnitude
24.40    671
24.80    609
23.60    589
25.40    588
24.60    585
        ... 
12.76      1
30.46      1
31.18      1
29.47      1
29.32      1
Name: count, Length: 1551, dtype: int64
'Validation set of the first dataset: '
(18167, 4)
absolute_magnitude
24.800    239
24.400    230
25.300    219
23.600    203
26.100    197
         ... 
17.260      1
21.120      1
30.360      1
27.482      1
27.950      1
Name: count, Length: 1319, dtype: int64
'Test set of the first dataset: '
(18168, 4)
absolute_magnitude
24.400    229
24.600    221
24.800    210
25.300    207
25.400    200
         ... 
20.980      1
26.985      1
24.525      1
26.101      1
20.720      1
Name: count, Length: 1308, dtype: int64
'Training set of the second dataset: '
(54501, 3)
est_diameter_max
0.078350    672
0.065169    627
0.113250    600
0.051765    597
0.068240    576
           ... 
0.020019      1
5.943469      1
0.002582      1
2.767196      1
0.027406      1
Name: count, Length: 1540, dtype: int64
'Validation set of the second dataset: '
(18167, 3)
est_diameter_max
0.078350    220
0.065169    220
0.051765    212
0.049436    209
0.071456    209
           ... 
4.898239      1
0.019144      1
0.023771      1
0.023337      1
0.087910      1
Name: count, Length: 1304, dtype: int64
'Test set of the second dataset: '
(18168, 3)
est_diameter_max
0.078350    238
0.049436    215
0.065169    211
0.071456    205
0.056760    203
           ... 
0.044263      1
0.018951      1
0.026064      1
0.018969      1
9.247833      1
Name: count, Length: 1315, dtype: int64
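Since both splits are stratified on their targets, the class proportions should be nearly identical across train/validation/test. A quick way to confirm this for the second dataset (a small sketch, not part of the original notebook):

for name, part in [("train", df2_train), ("val", df2_val), ("test", df2_test)]:
    # hazardous is boolean, so the mean is the fraction of hazardous objects
    print(name, round(part["hazardous"].mean(), 4))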
In [89]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


# One-hot encode the boolean targets; drop='first' removes the first category column
encoder = OneHotEncoder(sparse_output=False, drop='first')

target_col1 = "sentry_object"
encoded_1 = encoder.fit_transform(data1[[target_col1]])
encoded_cols_1 = encoder.get_feature_names_out([target_col1])
encoded_df1 = pd.DataFrame(encoded_1, columns=encoded_cols_1, index=data1.index)

target_col2 = "hazardous"
encoded_2 = encoder.fit_transform(data2[[target_col2]])
encoded_cols_2 = encoder.get_feature_names_out([target_col2])
encoded_df2 = pd.DataFrame(encoded_2, columns=encoded_cols_2, index=data2.index)

encoded_df1, encoded_df2
Out[89]:
(Empty DataFrame
 Columns: []
 Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
 
 [90836 rows x 0 columns],
        hazardous_True
 0                 0.0
 1                 1.0
 2                 0.0
 3                 0.0
 4                 1.0
 ...               ...
 90831             0.0
 90832             0.0
 90833             0.0
 90834             0.0
 90835             0.0
 
 [90836 rows x 1 columns])
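Note that encoded_df1 comes out empty: in this dataset sentry_object holds a single category, and drop='first' removes the first (here, the only) level, leaving zero columns, while hazardous has two levels and keeps one indicator column. A minimal check that makes this visible (sketch):

print(data1["sentry_object"].nunique())  # 1 -> drop='first' leaves no columns
print(data2["hazardous"].nunique())      # 2 -> one indicator column remains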
In [90]:
import numpy as np
import pandas as pd



# Discretize absolute_magnitude into 2 equal-width bins
labels_1 = ["included", "not_included"]
num_bins_1 = 2

# fillna is a no-op guard here: the null check above showed no missing values
abs_mag_filled = data1["absolute_magnitude"].fillna(data1["absolute_magnitude"].median())
hist_1, bins_1 = np.histogram(abs_mag_filled, bins=num_bins_1)

cut_no_labels_1 = pd.cut(abs_mag_filled, bins=bins_1)
cut_with_labels_1 = pd.cut(abs_mag_filled, bins=bins_1, labels=labels_1)

display_1 = pd.concat([data1["absolute_magnitude"], cut_no_labels_1, cut_with_labels_1], axis=1).head(20)


# Discretize miss_distance into 2 equal-width bins
labels_2 = ["potentially_dangerous", "not_dangerous"]
num_bins_2 = 2

miss_dist_filled = data2["miss_distance"].fillna(data2["miss_distance"].median())
hist_2, bins_2 = np.histogram(miss_dist_filled, bins=num_bins_2)

cut_no_labels_2 = pd.cut(miss_dist_filled, bins=bins_2)
cut_with_labels_2 = pd.cut(miss_dist_filled, bins=bins_2, labels=labels_2)

display_2 = pd.concat([data2["miss_distance"], cut_no_labels_2, cut_with_labels_2], axis=1).head(20)


display_1, display_2
Out[90]:
(   absolute_magnitude absolute_magnitude absolute_magnitude
 0               16.73     (9.23, 21.215]           included
 1               20.00     (9.23, 21.215]           included
 2               17.83     (9.23, 21.215]           included
 3               22.20     (21.215, 33.2]       not_included
 4               20.09     (9.23, 21.215]           included
 5               24.32     (21.215, 33.2]       not_included
 6               20.95     (9.23, 21.215]           included
 7               28.49     (21.215, 33.2]       not_included
 8               19.40     (9.23, 21.215]           included
 9               22.00     (21.215, 33.2]       not_included
 10              20.11     (9.23, 21.215]           included
 11              21.20     (9.23, 21.215]           included
 12              22.90     (21.215, 33.2]       not_included
 13              28.18     (21.215, 33.2]       not_included
 14              19.81     (9.23, 21.215]           included
 15              18.70     (9.23, 21.215]           included
 16              27.90     (21.215, 33.2]       not_included
 17              23.90     (21.215, 33.2]       not_included
 18              24.80     (21.215, 33.2]       not_included
 19              18.80     (9.23, 21.215]           included,
    miss_distance                 miss_distance          miss_distance
 0   5.483974e+07  (37402698.492, 74798651.452]          not_dangerous
 1   6.143813e+07  (37402698.492, 74798651.452]          not_dangerous
 2   4.979872e+07  (37402698.492, 74798651.452]          not_dangerous
 3   2.543497e+07      (6745.533, 37402698.492]  potentially_dangerous
 4   4.627557e+07  (37402698.492, 74798651.452]          not_dangerous
 5   4.058569e+07  (37402698.492, 74798651.452]          not_dangerous
 6   2.906912e+07      (6745.533, 37402698.492]  potentially_dangerous
 7   5.511502e+07  (37402698.492, 74798651.452]          not_dangerous
 8   6.903598e+07  (37402698.492, 74798651.452]          not_dangerous
 9   3.835526e+07  (37402698.492, 74798651.452]          not_dangerous
 10  3.833750e+07  (37402698.492, 74798651.452]          not_dangerous
 11  7.198311e+07  (37402698.492, 74798651.452]          not_dangerous
 12  5.209302e+07  (37402698.492, 74798651.452]          not_dangerous
 13  2.461759e+07      (6745.533, 37402698.492]  potentially_dangerous
 14  6.078930e+07  (37402698.492, 74798651.452]          not_dangerous
 15  5.988081e+07  (37402698.492, 74798651.452]          not_dangerous
 16  7.138706e+07  (37402698.492, 74798651.452]          not_dangerous
 17  2.771724e+07      (6745.533, 37402698.492]  potentially_dangerous
 18  3.942128e+07  (37402698.492, 74798651.452]          not_dangerous
 19  1.883284e+07      (6745.533, 37402698.492]  potentially_dangerous)
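One subtlety of building edges with np.histogram and passing them to pd.cut: the resulting intervals are open on the left, so any row equal to the minimum (absolute_magnitude = 9.23 above) falls outside the first bin and becomes NaN. Letting pd.cut compute the equal-width bins itself avoids this, because it widens the lowest edge slightly (a more compact equivalent, as a sketch):

cut_direct_1 = pd.cut(data1["absolute_magnitude"], bins=2, labels=labels_1)
cut_direct_2 = pd.cut(data2["miss_distance"], bins=2, labels=labels_2)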
In [91]:
data1_cl = data1.copy()
data2_cl = data2.copy()

# The source columns are boolean, so cast directly to 0/1
# (comparing with the string "True" would silently yield all zeros)
data1_cl["is_included"] = data1_cl["sentry_object"].astype(int)

data2_cl["is_close"] = data2_cl["hazardous"].astype(int)
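
A quick sanity check that the new 0/1 flags mirror the boolean source columns (optional sketch):

display(data1_cl["is_included"].value_counts())
display(data2_cl["is_close"].value_counts())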
In [92]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

# Min-max scale each feature to [0, 1]
data1["absolute_magnitude_Norm"] = min_max_scaler.fit_transform(data1[["absolute_magnitude"]]).ravel()

data2["miss_distance_Norm"] = min_max_scaler.fit_transform(data2[["miss_distance"]]).ravel()

# Only the last expression of a cell is rendered, so just the second table appears below
data1[["absolute_magnitude", "absolute_magnitude_Norm"]].head(20)
data2[["miss_distance", "miss_distance_Norm"]].head(20)
Out[92]:
miss_distance miss_distance_Norm
0 5.483974e+07 0.733141
1 6.143813e+07 0.821364
2 4.979872e+07 0.665740
3 2.543497e+07 0.339986
4 4.627557e+07 0.618634
5 4.058569e+07 0.542558
6 2.906912e+07 0.388576
7 5.511502e+07 0.736821
8 6.903598e+07 0.922951
9 3.835526e+07 0.512736
10 3.833750e+07 0.512499
11 7.198311e+07 0.962355
12 5.209302e+07 0.696416
13 2.461759e+07 0.329058
14 6.078930e+07 0.812689
15 5.988081e+07 0.800542
16 7.138706e+07 0.954386
17 2.771724e+07 0.370501
18 3.942128e+07 0.526989
19 1.883284e+07 0.251713
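Min-max normalization maps each value to (x - min) / (max - min), which is why the *_Norm columns land in [0, 1]. A hand-rolled check against the scaler output (sketch):

manual = (data2["miss_distance"] - data2["miss_distance"].min()) / (
    data2["miss_distance"].max() - data2["miss_distance"].min()
)
print((manual - data2["miss_distance_Norm"]).abs().max())  # ~0 up to float error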
In [93]:
from sklearn.preprocessing import StandardScaler

# Standardize to zero mean and unit variance
scaler = StandardScaler()


abs_mag_array = data1["absolute_magnitude"].to_numpy().reshape(-1, 1)
data1["absolute_magnitude_Stand"] = scaler.fit_transform(abs_mag_array).flatten()

miss_dist_array = data2["miss_distance"].to_numpy().reshape(-1, 1)
data2["miss_distance_Stand"] = scaler.fit_transform(miss_dist_array).flatten()

data1[["absolute_magnitude", "absolute_magnitude_Stand"]].head(20), data2[["miss_distance", "miss_distance_Stand"]].head(20)
Out[93]:
(    absolute_magnitude  absolute_magnitude_Stand
 0                16.73                 -2.348632
 1                20.00                 -1.218735
 2                17.83                 -1.968544
 3                22.20                 -0.458560
 4                20.09                 -1.187637
 5                24.32                  0.273973
 6                20.95                 -0.890477
 7                28.49                  1.714850
 8                19.40                 -1.426055
 9                22.00                 -0.527666
 10               20.11                 -1.180726
 11               21.20                 -0.804094
 12               22.90                 -0.216686
 13               28.18                  1.607735
 14               19.81                 -1.284386
 15               18.70                 -1.667929
 16               27.90                  1.510985
 17               23.90                  0.128849
 18               24.80                  0.439829
 19               18.80                 -1.633376,
     miss_distance  miss_distance_Stand
 0    5.483974e+07             0.795153
 1    6.143813e+07             1.090357
 2    4.979872e+07             0.569624
 3    2.543497e+07            -0.520384
 4    4.627557e+07             0.412001
 5    4.058569e+07             0.157443
 6    2.906912e+07            -0.357796
 7    5.511502e+07             0.807469
 8    6.903598e+07             1.430277
 9    3.835526e+07             0.057656
 10   3.833750e+07             0.056861
 11   7.198311e+07             1.562128
 12   5.209302e+07             0.672268
 13   2.461759e+07            -0.556953
 14   6.078930e+07             1.061330
 15   5.988081e+07             1.020685
 16   7.138706e+07             1.535462
 17   2.771724e+07            -0.418278
 18   3.942128e+07             0.105348
 19   1.883284e+07            -0.815756)
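StandardScaler computes z = (x - mean) / std using the population standard deviation (ddof=0), whereas pandas .std() defaults to the sample estimate (ddof=1), so a manual check must pass ddof=0 (sketch):

col = data1["absolute_magnitude"]
manual_z = (col - col.mean()) / col.std(ddof=0)
print((manual_z - data1["absolute_magnitude_Stand"]).abs().max())  # ~0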
In [94]:
from sklearn.model_selection import train_test_split

# Use the min-max scaled miss distance together with the raw est_diameter_max
X2_scaled = data2[["miss_distance_Norm", "est_diameter_max"]]
y2 = data2["hazardous"]

# Stratified split
X2_train, X2_temp, y2_train, y2_temp = train_test_split(
    X2_scaled, y2, stratify=y2, test_size=0.4, random_state=42)

X2_val, X2_test, y2_val, y2_test = train_test_split(
    X2_temp, y2_temp, stratify=y2_temp, test_size=0.5, random_state=42)
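One caveat: miss_distance_Norm was fitted on the whole dataset before splitting, so the scaler has already seen validation and test statistics (mild leakage). The leakage-free pattern fits the scaler on the training split only and reuses it for the other splits (a sketch on the raw columns, not part of the original lab):

from sklearn.preprocessing import MinMaxScaler

X2_raw = data2[["miss_distance", "est_diameter_max"]]
Xr_train, Xr_temp, yr_train, yr_temp = train_test_split(
    X2_raw, y2, stratify=y2, test_size=0.4, random_state=42)

mm = MinMaxScaler().fit(Xr_train)       # min/max taken from the training split only
Xr_train_scaled = mm.transform(Xr_train)
Xr_temp_scaled = mm.transform(Xr_temp)  # validation/test reuse the training statistics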
In [95]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class (hazardous=True) on the training split only
sm = SMOTE(random_state=42)
X2_train_bal, y2_train_bal = sm.fit_resample(X2_train, y2_train)

print("До балансировки:")
print(y2_train.value_counts())
print("После балансировки:")
print(pd.Series(y2_train_bal).value_counts())
Before balancing:
hazardous
False    49197
True      5304
Name: count, dtype: int64
After balancing:
hazardous
False    49197
True     49197
Name: count, dtype: int64
In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
display(X2_train_bal)

clf = RandomForestClassifier(random_state=42)
clf.fit(X2_train_bal, y2_train_bal)


y2_pred_val = clf.predict(X2_val)
print("Валидация:")
print(classification_report(y2_val, y2_pred_val))


y2_pred_test = clf.predict(X2_test)
print("Тест:")
print(classification_report(y2_test, y2_pred_test))
miss_distance_Norm est_diameter_max
0 0.769858 0.430566
1 0.673053 0.547066
2 0.667690 0.879103
3 0.529716 0.054205
4 0.030387 0.030906
... ... ...
98389 0.624613 0.498930
98390 0.259200 0.311918
98391 0.181996 0.630802
98392 0.047292 0.378271
98393 0.916603 0.880686

98394 rows × 2 columns

Validation:
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     16399
        True       0.37      0.65      0.47      1768

    accuracy                           0.86     18167
   macro avg       0.66      0.77      0.69     18167
weighted avg       0.90      0.86      0.87     18167

Test:
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     16400
        True       0.36      0.62      0.46      1768

    accuracy                           0.86     18168
   macro avg       0.66      0.75      0.69     18168
weighted avg       0.90      0.86      0.87     18168
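
The two reports show the usual post-oversampling trade-off: recall for the True class is decent (0.65 on validation, 0.62 on test) while its precision stays low (0.37 and 0.36). If precision matters more for the business goal, the decision threshold can be raised above the default 0.5 (an illustrative sketch; the threshold value is arbitrary and should be tuned on validation):

proba_val = clf.predict_proba(X2_val)[:, 1]  # probability of the True class
threshold = 0.7                              # hypothetical value, tune on validation
y2_pred_strict = proba_val >= threshold
print(classification_report(y2_val, y2_pred_strict))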

For the first dataset, the predictive potential is fairly good: computations are fast, the data are sufficiently reliable, and the features are well correlated and internally consistent.

For the second dataset, the predictive quality is likewise decent: the model reaches 0.86 accuracy on both validation and test with very fast computation and good data reliability, correlation, and integrity, although precision for the hazardous class remains low (0.36–0.37).