Business goals for this variant's dataset:

Predicting the danger of asteroid collisions with the planet;
predicting the danger asteroids pose through natural phenomena.

Technical project goals for each business goal:

For the first business goal, the technical goal is to identify the asteroids closest to Earth. For the second business goal, the technical goal is to determine whether an asteroid is included in the collision monitoring system.

Loading the data and inspecting the dataframe

In [85]:
import pandas as pd

# Load the dataset
df = pd.read_csv("data/neo.csv")
df.head()
Out[85]:
id name est_diameter_min est_diameter_max relative_velocity miss_distance orbiting_body sentry_object absolute_magnitude hazardous
0 2162635 162635 (2000 SS164) 1.198271 2.679415 13569.249224 5.483974e+07 Earth False 16.73 False
1 2277475 277475 (2005 WK4) 0.265800 0.594347 73588.726663 6.143813e+07 Earth False 20.00 True
2 2512244 512244 (2015 YE18) 0.722030 1.614507 114258.692129 4.979872e+07 Earth False 17.83 False
3 3596030 (2012 BV13) 0.096506 0.215794 24764.303138 2.543497e+07 Earth False 22.20 False
4 3667127 (2014 GE35) 0.255009 0.570217 42737.733765 4.627557e+07 Earth False 20.09 True
In [86]:
display(df.isnull().sum())
id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64
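
The null check shows no missing values in any column, so no imputation is required. As an optional extra check (a small sketch using standard pandas calls, not part of the original run), column types and exact duplicates can be inspected the same way:

display(df.dtypes)              # column types
display(df.duplicated().sum())  # count of fully duplicated rows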
In [87]:
from sklearn.model_selection import train_test_split


# Dataset 1: three numeric features plus the sentry_object target
data1 = df[["miss_distance", "relative_velocity", "absolute_magnitude", "sentry_object"]].copy()


# Stratified 60/20/20 split: 60% train, the remaining 40% later halved into validation and test
df1_train, df1_temp = train_test_split(
    data1,
    stratify=data1["sentry_object"],
    test_size=0.4,
    random_state=42
)


df1_val, df1_test = train_test_split(
    df1_temp,
    stratify=df1_temp["sentry_object"],
    test_size=0.5,
    random_state=42
)


X1_train = df1_train.drop(columns=["sentry_object"])
y1_train = df1_train["sentry_object"]

X1_val = df1_val.drop(columns=["sentry_object"])
y1_val = df1_val["sentry_object"]

X1_test = df1_test.drop(columns=["sentry_object"])
y1_test = df1_test["sentry_object"]

display(X1_train)
display(y1_train)
display(X1_test)

display(y1_test)


# Dataset 2: two numeric features plus the hazardous target
data2 = df[["miss_distance", "est_diameter_max", "hazardous"]].copy()

df2_train, df2_temp = train_test_split(
    data2,
    stratify=data2["hazardous"],
    test_size=0.4,
    random_state=42
)

df2_val, df2_test = train_test_split(
    df2_temp,
    stratify=df2_temp["hazardous"],
    test_size=0.5,
    random_state=42
)

X2_train = df2_train.drop(columns=["hazardous"])
y2_train = df2_train["hazardous"]

X2_val = df2_val.drop(columns=["hazardous"])
y2_val = df2_val["hazardous"]

X2_test = df2_test.drop(columns=["hazardous"])
y2_test = df2_test["hazardous"]
miss_distance relative_velocity absolute_magnitude
34465 5.635134e+07 21624.928164 24.80
90701 2.472948e+07 37209.587564 22.02
2005 2.109070e+07 47978.493043 22.80
6232 3.326166e+07 39237.251103 22.80
90498 6.815431e+07 65595.360821 26.02
... ... ... ...
39769 3.949974e+07 43610.074161 18.75
33446 5.578624e+07 72819.444430 19.82
30882 5.099273e+07 27834.911966 22.89
1952 3.616413e+07 58325.583576 26.15
42479 4.748265e+06 41999.243515 22.57

54501 rows × 3 columns

34465    False
90701    False
2005     False
6232     False
90498    False
         ...  
39769    False
33446    False
30882    False
1952     False
42479    False
Name: sentry_object, Length: 54501, dtype: bool
miss_distance relative_velocity absolute_magnitude
81186 1.462177e+07 25664.748520 25.20
15352 3.205802e+07 88237.170517 22.10
22821 4.267297e+07 31651.822174 20.67
81312 4.707998e+05 32573.816362 28.90
43938 4.127747e+07 83882.960168 24.80
... ... ... ...
88920 2.400532e+06 38120.569097 26.97
9896 1.778582e+07 46403.473964 18.52
80597 4.021212e+06 36312.854593 25.80
49831 5.650808e+07 50795.212086 20.30
13003 6.619616e+07 48753.197881 22.50

18168 rows × 3 columns

81186    False
15352    False
22821    False
81312    False
43938    False
         ...  
88920    False
9896     False
80597    False
49831    False
13003    False
Name: sentry_object, Length: 18168, dtype: bool
In [88]:
display("Обучающая выборка первого набора данных: ", df1_train.shape)
display(df1_train.absolute_magnitude.value_counts())

display("Контрольная выборка первого набора данных: ", df1_val.shape)
display(df1_val.absolute_magnitude.value_counts())

display("Тестовая выборка первого набора данных: ", df1_test.shape)
display(df1_test.absolute_magnitude.value_counts())

display("Обучающая выборка второго набора данных: ", df2_train.shape)
display(df2_train.est_diameter_max.value_counts())

display("Контрольная выборка второго набора данных: ", df2_val.shape)
display(df2_val.est_diameter_max.value_counts())

display("Тестовая выборка второго набора данных: ", df2_test.shape)
display(df2_test.est_diameter_max.value_counts())
'Training set of the first dataset: '
(54501, 4)
absolute_magnitude
24.40    671
24.80    609
23.60    589
25.40    588
24.60    585
        ... 
12.76      1
30.46      1
31.18      1
29.47      1
29.32      1
Name: count, Length: 1551, dtype: int64
'Validation set of the first dataset: '
(18167, 4)
absolute_magnitude
24.800    239
24.400    230
25.300    219
23.600    203
26.100    197
         ... 
17.260      1
21.120      1
30.360      1
27.482      1
27.950      1
Name: count, Length: 1319, dtype: int64
'Test set of the first dataset: '
(18168, 4)
absolute_magnitude
24.400    229
24.600    221
24.800    210
25.300    207
25.400    200
         ... 
20.980      1
26.985      1
24.525      1
26.101      1
20.720      1
Name: count, Length: 1308, dtype: int64
'Training set of the second dataset: '
(54501, 3)
est_diameter_max
0.078350    672
0.065169    627
0.113250    600
0.051765    597
0.068240    576
           ... 
0.020019      1
5.943469      1
0.002582      1
2.767196      1
0.027406      1
Name: count, Length: 1540, dtype: int64
'Validation set of the second dataset: '
(18167, 3)
est_diameter_max
0.078350    220
0.065169    220
0.051765    212
0.049436    209
0.071456    209
           ... 
4.898239      1
0.019144      1
0.023771      1
0.023337      1
0.087910      1
Name: count, Length: 1304, dtype: int64
'Test set of the second dataset: '
(18168, 3)
est_diameter_max
0.078350    238
0.049436    215
0.065169    211
0.071456    205
0.056760    203
           ... 
0.044263      1
0.018951      1
0.026064      1
0.018969      1
9.247833      1
Name: count, Length: 1315, dtype: int64
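Since both splits are stratified on their targets, the class proportions should be nearly identical across train/validation/test. A quick way to confirm this for the second dataset (a small sketch, not part of the original notebook):

for name, part in [("train", df2_train), ("val", df2_val), ("test", df2_test)]:
    # hazardous is boolean, so the mean is the fraction of hazardous objects
    print(name, round(part["hazardous"].mean(), 4))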
In [89]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


# One-hot encode the boolean targets; drop='first' removes the first category column
encoder = OneHotEncoder(sparse_output=False, drop='first')

target_col1 = "sentry_object"
encoded_1 = encoder.fit_transform(data1[[target_col1]])
encoded_cols_1 = encoder.get_feature_names_out([target_col1])
encoded_df1 = pd.DataFrame(encoded_1, columns=encoded_cols_1, index=data1.index)

target_col2 = "hazardous"
encoded_2 = encoder.fit_transform(data2[[target_col2]])
encoded_cols_2 = encoder.get_feature_names_out([target_col2])
encoded_df2 = pd.DataFrame(encoded_2, columns=encoded_cols_2, index=data2.index)

encoded_df1, encoded_df2
Out[89]:
(Empty DataFrame
 Columns: []
 Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
 
 [90836 rows x 0 columns],
        hazardous_True
 0                 0.0
 1                 1.0
 2                 0.0
 3                 0.0
 4                 1.0
 ...               ...
 90831             0.0
 90832             0.0
 90833             0.0
 90834             0.0
 90835             0.0
 
 [90836 rows x 1 columns])
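Note that encoded_df1 comes out empty: in this dataset sentry_object holds a single category, and drop='first' removes the first (here, the only) level, leaving zero columns, while hazardous has two levels and keeps one indicator column. A minimal check that makes this visible (sketch):

print(data1["sentry_object"].nunique())  # 1 -> drop='first' leaves no columns
print(data2["hazardous"].nunique())      # 2 -> one indicator column remains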
In [90]:
import numpy as np
import pandas as pd



# Discretize absolute_magnitude into 2 equal-width bins
labels_1 = ["included", "not_included"]
num_bins_1 = 2

# fillna is a no-op guard here: the null check above showed no missing values
abs_mag_filled = data1["absolute_magnitude"].fillna(data1["absolute_magnitude"].median())
hist_1, bins_1 = np.histogram(abs_mag_filled, bins=num_bins_1)

cut_no_labels_1 = pd.cut(abs_mag_filled, bins=bins_1)
cut_with_labels_1 = pd.cut(abs_mag_filled, bins=bins_1, labels=labels_1)

display_1 = pd.concat([data1["absolute_magnitude"], cut_no_labels_1, cut_with_labels_1], axis=1).head(20)


# Discretize miss_distance into 2 equal-width bins
labels_2 = ["potentially_dangerous", "not_dangerous"]
num_bins_2 = 2

miss_dist_filled = data2["miss_distance"].fillna(data2["miss_distance"].median())
hist_2, bins_2 = np.histogram(miss_dist_filled, bins=num_bins_2)

cut_no_labels_2 = pd.cut(miss_dist_filled, bins=bins_2)
cut_with_labels_2 = pd.cut(miss_dist_filled, bins=bins_2, labels=labels_2)

display_2 = pd.concat([data2["miss_distance"], cut_no_labels_2, cut_with_labels_2], axis=1).head(20)


display_1, display_2
Out[90]:
(   absolute_magnitude absolute_magnitude absolute_magnitude
 0               16.73     (9.23, 21.215]           included
 1               20.00     (9.23, 21.215]           included
 2               17.83     (9.23, 21.215]           included
 3               22.20     (21.215, 33.2]       not_included
 4               20.09     (9.23, 21.215]           included
 5               24.32     (21.215, 33.2]       not_included
 6               20.95     (9.23, 21.215]           included
 7               28.49     (21.215, 33.2]       not_included
 8               19.40     (9.23, 21.215]           included
 9               22.00     (21.215, 33.2]       not_included
 10              20.11     (9.23, 21.215]           included
 11              21.20     (9.23, 21.215]           included
 12              22.90     (21.215, 33.2]       not_included
 13              28.18     (21.215, 33.2]       not_included
 14              19.81     (9.23, 21.215]           included
 15              18.70     (9.23, 21.215]           included
 16              27.90     (21.215, 33.2]       not_included
 17              23.90     (21.215, 33.2]       not_included
 18              24.80     (21.215, 33.2]       not_included
 19              18.80     (9.23, 21.215]           included,
    miss_distance                 miss_distance          miss_distance
 0   5.483974e+07  (37402698.492, 74798651.452]          not_dangerous
 1   6.143813e+07  (37402698.492, 74798651.452]          not_dangerous
 2   4.979872e+07  (37402698.492, 74798651.452]          not_dangerous
 3   2.543497e+07      (6745.533, 37402698.492]  potentially_dangerous
 4   4.627557e+07  (37402698.492, 74798651.452]          not_dangerous
 5   4.058569e+07  (37402698.492, 74798651.452]          not_dangerous
 6   2.906912e+07      (6745.533, 37402698.492]  potentially_dangerous
 7   5.511502e+07  (37402698.492, 74798651.452]          not_dangerous
 8   6.903598e+07  (37402698.492, 74798651.452]          not_dangerous
 9   3.835526e+07  (37402698.492, 74798651.452]          not_dangerous
 10  3.833750e+07  (37402698.492, 74798651.452]          not_dangerous
 11  7.198311e+07  (37402698.492, 74798651.452]          not_dangerous
 12  5.209302e+07  (37402698.492, 74798651.452]          not_dangerous
 13  2.461759e+07      (6745.533, 37402698.492]  potentially_dangerous
 14  6.078930e+07  (37402698.492, 74798651.452]          not_dangerous
 15  5.988081e+07  (37402698.492, 74798651.452]          not_dangerous
 16  7.138706e+07  (37402698.492, 74798651.452]          not_dangerous
 17  2.771724e+07      (6745.533, 37402698.492]  potentially_dangerous
 18  3.942128e+07  (37402698.492, 74798651.452]          not_dangerous
 19  1.883284e+07      (6745.533, 37402698.492]  potentially_dangerous)
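One subtlety of building edges with np.histogram and passing them to pd.cut: the resulting intervals are open on the left, so any row equal to the minimum (absolute_magnitude = 9.23 above) falls outside the first bin and becomes NaN. Letting pd.cut compute the equal-width bins itself avoids this, because it widens the lowest edge slightly (a more compact equivalent, as a sketch):

cut_direct_1 = pd.cut(data1["absolute_magnitude"], bins=2, labels=labels_1)
cut_direct_2 = pd.cut(data2["miss_distance"], bins=2, labels=labels_2)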
In [91]:
data1_cl = data1.copy()
data2_cl = data2.copy()

# The source columns are boolean, so cast directly to 0/1
# (comparing with the string "True" would silently yield all zeros)
data1_cl["is_included"] = data1_cl["sentry_object"].astype(int)

data2_cl["is_close"] = data2_cl["hazardous"].astype(int)
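
A quick sanity check that the new 0/1 flags mirror the boolean source columns (optional sketch):

display(data1_cl["is_included"].value_counts())
display(data2_cl["is_close"].value_counts())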
In [92]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

# Min-max scale each feature to [0, 1]
data1["absolute_magnitude_Norm"] = min_max_scaler.fit_transform(data1[["absolute_magnitude"]]).ravel()

data2["miss_distance_Norm"] = min_max_scaler.fit_transform(data2[["miss_distance"]]).ravel()

# Only the last expression of a cell is rendered, so just the second table appears below
data1[["absolute_magnitude", "absolute_magnitude_Norm"]].head(20)
data2[["miss_distance", "miss_distance_Norm"]].head(20)
Out[92]:
miss_distance miss_distance_Norm
0 5.483974e+07 0.733141
1 6.143813e+07 0.821364
2 4.979872e+07 0.665740
3 2.543497e+07 0.339986
4 4.627557e+07 0.618634
5 4.058569e+07 0.542558
6 2.906912e+07 0.388576
7 5.511502e+07 0.736821
8 6.903598e+07 0.922951
9 3.835526e+07 0.512736
10 3.833750e+07 0.512499
11 7.198311e+07 0.962355
12 5.209302e+07 0.696416
13 2.461759e+07 0.329058
14 6.078930e+07 0.812689
15 5.988081e+07 0.800542
16 7.138706e+07 0.954386
17 2.771724e+07 0.370501
18 3.942128e+07 0.526989
19 1.883284e+07 0.251713
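Min-max normalization maps each value to (x - min) / (max - min), which is why the *_Norm columns land in [0, 1]. A hand-rolled check against the scaler output (sketch):

manual = (data2["miss_distance"] - data2["miss_distance"].min()) / (
    data2["miss_distance"].max() - data2["miss_distance"].min()
)
print((manual - data2["miss_distance_Norm"]).abs().max())  # ~0 up to float error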
In [93]:
from sklearn.preprocessing import StandardScaler

# Standardize to zero mean and unit variance
scaler = StandardScaler()


abs_mag_array = data1["absolute_magnitude"].to_numpy().reshape(-1, 1)
data1["absolute_magnitude_Stand"] = scaler.fit_transform(abs_mag_array).flatten()

miss_dist_array = data2["miss_distance"].to_numpy().reshape(-1, 1)
data2["miss_distance_Stand"] = scaler.fit_transform(miss_dist_array).flatten()

data1[["absolute_magnitude", "absolute_magnitude_Stand"]].head(20), data2[["miss_distance", "miss_distance_Stand"]].head(20)
Out[93]:
(    absolute_magnitude  absolute_magnitude_Stand
 0                16.73                 -2.348632
 1                20.00                 -1.218735
 2                17.83                 -1.968544
 3                22.20                 -0.458560
 4                20.09                 -1.187637
 5                24.32                  0.273973
 6                20.95                 -0.890477
 7                28.49                  1.714850
 8                19.40                 -1.426055
 9                22.00                 -0.527666
 10               20.11                 -1.180726
 11               21.20                 -0.804094
 12               22.90                 -0.216686
 13               28.18                  1.607735
 14               19.81                 -1.284386
 15               18.70                 -1.667929
 16               27.90                  1.510985
 17               23.90                  0.128849
 18               24.80                  0.439829
 19               18.80                 -1.633376,
     miss_distance  miss_distance_Stand
 0    5.483974e+07             0.795153
 1    6.143813e+07             1.090357
 2    4.979872e+07             0.569624
 3    2.543497e+07            -0.520384
 4    4.627557e+07             0.412001
 5    4.058569e+07             0.157443
 6    2.906912e+07            -0.357796
 7    5.511502e+07             0.807469
 8    6.903598e+07             1.430277
 9    3.835526e+07             0.057656
 10   3.833750e+07             0.056861
 11   7.198311e+07             1.562128
 12   5.209302e+07             0.672268
 13   2.461759e+07            -0.556953
 14   6.078930e+07             1.061330
 15   5.988081e+07             1.020685
 16   7.138706e+07             1.535462
 17   2.771724e+07            -0.418278
 18   3.942128e+07             0.105348
 19   1.883284e+07            -0.815756)
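StandardScaler computes z = (x - mean) / std using the population standard deviation (ddof=0), whereas pandas .std() defaults to the sample estimate (ddof=1), so a manual check must pass ddof=0 (sketch):

col = data1["absolute_magnitude"]
manual_z = (col - col.mean()) / col.std(ddof=0)
print((manual_z - data1["absolute_magnitude_Stand"]).abs().max())  # ~0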
In [94]:
from sklearn.model_selection import train_test_split

# Use the min-max scaled miss distance together with the raw est_diameter_max
X2_scaled = data2[["miss_distance_Norm", "est_diameter_max"]]
y2 = data2["hazardous"]

# Stratified split
X2_train, X2_temp, y2_train, y2_temp = train_test_split(
    X2_scaled, y2, stratify=y2, test_size=0.4, random_state=42)

X2_val, X2_test, y2_val, y2_test = train_test_split(
    X2_temp, y2_temp, stratify=y2_temp, test_size=0.5, random_state=42)
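One caveat: miss_distance_Norm was fitted on the whole dataset before splitting, so the scaler has already seen validation and test statistics (mild leakage). The leakage-free pattern fits the scaler on the training split only and reuses it for the other splits (a sketch on the raw columns, not part of the original lab):

from sklearn.preprocessing import MinMaxScaler

X2_raw = data2[["miss_distance", "est_diameter_max"]]
Xr_train, Xr_temp, yr_train, yr_temp = train_test_split(
    X2_raw, y2, stratify=y2, test_size=0.4, random_state=42)

mm = MinMaxScaler().fit(Xr_train)       # min/max taken from the training split only
Xr_train_scaled = mm.transform(Xr_train)
Xr_temp_scaled = mm.transform(Xr_temp)  # validation/test reuse the training statistics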
In [95]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class (hazardous=True) on the training split only
sm = SMOTE(random_state=42)
X2_train_bal, y2_train_bal = sm.fit_resample(X2_train, y2_train)

print("До балансировки:")
print(y2_train.value_counts())
print("После балансировки:")
print(pd.Series(y2_train_bal).value_counts())
Before balancing:
hazardous
False    49197
True      5304
Name: count, dtype: int64
After balancing:
hazardous
False    49197
True     49197
Name: count, dtype: int64
In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
display(X2_train_bal)

clf = RandomForestClassifier(random_state=42)
clf.fit(X2_train_bal, y2_train_bal)


y2_pred_val = clf.predict(X2_val)
print("Валидация:")
print(classification_report(y2_val, y2_pred_val))


y2_pred_test = clf.predict(X2_test)
print("Тест:")
print(classification_report(y2_test, y2_pred_test))
miss_distance_Norm est_diameter_max
0 0.769858 0.430566
1 0.673053 0.547066
2 0.667690 0.879103
3 0.529716 0.054205
4 0.030387 0.030906
... ... ...
98389 0.624613 0.498930
98390 0.259200 0.311918
98391 0.181996 0.630802
98392 0.047292 0.378271
98393 0.916603 0.880686

98394 rows × 2 columns

Validation:
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     16399
        True       0.37      0.65      0.47      1768

    accuracy                           0.86     18167
   macro avg       0.66      0.77      0.69     18167
weighted avg       0.90      0.86      0.87     18167

Test:
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     16400
        True       0.36      0.62      0.46      1768

    accuracy                           0.86     18168
   macro avg       0.66      0.75      0.69     18168
weighted avg       0.90      0.86      0.87     18168
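
The two reports show the usual post-oversampling trade-off: recall for the True class is decent (0.65 on validation, 0.62 on test) while its precision stays low (0.37 and 0.36). If precision matters more for the business goal, the decision threshold can be raised above the default 0.5 (an illustrative sketch; the threshold value is arbitrary and should be tuned on validation):

proba_val = clf.predict_proba(X2_val)[:, 1]  # probability of the True class
threshold = 0.7                              # hypothetical value, tune on validation
y2_pred_strict = proba_val >= threshold
print(classification_report(y2_val, y2_pred_strict))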

For the first dataset, the predictive potential is fairly good: computations are fast, the data are sufficiently reliable, and the features are well correlated and internally consistent.

For the second dataset, the predictive quality is likewise decent: the model reaches 0.86 accuracy on both validation and test with very fast computation and good data reliability, correlation, and integrity, although precision for the hazardous class remains low (0.36–0.37).