Compare commits

...

18 Commits
main ... lab5-6

Author SHA1 Message Date
f08c12ac81 lab5-6 2025-02-28 16:02:44 +04:00
338e0b0ad8 feat(lab-4): make grid search 2024-12-14 15:01:59 +04:00
83031d3667 feat(lab-4): finish pipeline 2024-12-14 12:43:37 +04:00
5a6a48e622 feat(lab-4): add prediction output 2024-12-14 10:14:47 +04:00
75b0e0f580 feat(lab-4): r2 0,64 2024-12-12 23:48:52 +04:00
5ab313468c feat(lab-4): make pipeline 2024-12-07 13:00:14 +04:00
bd8c7a6d2b feat(lab3): finish preps 2024-12-07 10:51:50 +04:00
292b43e934 fix(lab3): fix age_create 2024-12-07 10:49:03 +04:00
daa238663b feat(lab3): add oversampling 2024-12-07 01:40:37 +04:00
59b6a164c8 feat(lab3): add featuretools 2024-12-07 00:18:42 +04:00
f77a5e5335 feat(lab3): make lab3 2024-12-07 00:08:27 +04:00
7aa7bd2f42 lab3 2024-12-06 18:43:41 +04:00
11ced38915 feat(lab-2): fix gitignore 2024-11-23 15:40:33 +04:00
c1ec962e77 feat(lab-2): add docs 2024-11-23 15:23:48 +04:00
f7672b7625 feat(lab-2): do lab-2, part 2 2024-11-23 15:06:07 +04:00
f249d643dc Merge branch 'main' into lab2
merge main into lab2
2024-11-10 15:10:02 +04:00
0b9d379e16 feat(lab-2): do lab-2, part 1 2024-11-10 14:56:44 +04:00
e3ad2174f2 feat(lab1): do lab1 2024-10-26 13:07:42 +04:00
25 changed files with 181956 additions and 2 deletions

4
.gitignore vendored
View File

@ -275,4 +275,6 @@ cython_debug/
# JS
node_modules/
test.csv
test.csv
описания_датасетов/.~lock.cars.odt#
описания_датасетов/.~lock.houses.odt#

14331
data/car-price-prediction.csv Normal file

File diff suppressed because it is too large Load Diff

19238
data/car_price_prediction.csv Normal file

File diff suppressed because it is too large Load Diff

244
data/dollar.csv Normal file
View File

@ -0,0 +1,244 @@
"my_date","my_value","bullet","bulletClass","label"
"28.03.2023","76.5662","","",""
"31.03.2023","77.0863","","",""
"01.04.2023","77.3233","","",""
"04.04.2023","77.9510","","",""
"05.04.2023","79.3563","","",""
"06.04.2023","79.4961","","",""
"07.04.2023","80.6713","","",""
"08.04.2023","82.3988","","",""
"11.04.2023","81.7441","","",""
"12.04.2023","82.1799","","",""
"13.04.2023","82.0934","","",""
"14.04.2023","81.6758","","",""
"15.04.2023","81.5045","","",""
"18.04.2023","81.6279","","",""
"19.04.2023","81.6028","","",""
"20.04.2023","81.6549","","",""
"21.04.2023","81.6188","","",""
"22.04.2023","81.4863","","",""
"25.04.2023","81.2745","","",""
"26.04.2023","81.5499","","",""
"27.04.2023","81.6274","","",""
"28.04.2023","81.5601","","",""
"29.04.2023","80.5093","","",""
"03.05.2023","79.9609","","",""
"04.05.2023","79.3071","","",""
"05.05.2023","78.6139","","",""
"06.05.2023","76.8207","","",""
"11.05.2023","76.6929","","",""
"12.05.2023","75.8846","round","min-pulsating-bullet","мин"
"13.05.2023","77.2041","","",""
"16.05.2023","79.1004","","",""
"17.05.2023","79.9798","","",""
"18.05.2023","80.7642","","",""
"19.05.2023","80.0366","","",""
"20.05.2023","79.9093","","",""
"23.05.2023","79.9379","","",""
"24.05.2023","80.1665","","",""
"25.05.2023","79.9669","","",""
"26.05.2023","79.9841","","",""
"27.05.2023","79.9667","","",""
"30.05.2023","80.0555","","",""
"31.05.2023","80.6872","","",""
"01.06.2023","80.9942","","",""
"02.06.2023","80.9657","","",""
"03.06.2023","80.8756","","",""
"06.06.2023","81.3294","","",""
"07.06.2023","81.2502","","",""
"08.06.2023","81.4581","","",""
"09.06.2023","82.0930","","",""
"10.06.2023","82.6417","","",""
"14.06.2023","83.6405","","",""
"15.06.2023","84.3249","","",""
"16.06.2023","83.9611","","",""
"17.06.2023","83.6498","","",""
"20.06.2023","83.9866","","",""
"21.06.2023","84.2336","","",""
"22.06.2023","84.2467","","",""
"23.06.2023","83.6077","","",""
"24.06.2023","84.0793","","",""
"27.06.2023","84.6642","","",""
"28.06.2023","85.0504","","",""
"29.06.2023","85.6192","","",""
"30.06.2023","87.0341","","",""
"01.07.2023","88.3844","","",""
"04.07.2023","89.3255","","",""
"05.07.2023","89.5450","","",""
"06.07.2023","90.3380","","",""
"07.07.2023","92.5695","","",""
"08.07.2023","91.6879","","",""
"11.07.2023","91.4931","","",""
"12.07.2023","90.5045","","",""
"13.07.2023","90.6253","","",""
"14.07.2023","90.1757","","",""
"15.07.2023","90.1190","","",""
"18.07.2023","90.4217","","",""
"19.07.2023","90.6906","","",""
"20.07.2023","91.2046","","",""
"21.07.2023","90.8545","","",""
"22.07.2023","90.3846","","",""
"25.07.2023","90.4890","","",""
"26.07.2023","90.0945","","",""
"27.07.2023","90.0468","","",""
"28.07.2023","90.0225","","",""
"29.07.2023","90.9783","","",""
"01.08.2023","91.5923","","",""
"02.08.2023","91.7755","","",""
"03.08.2023","92.8410","","",""
"04.08.2023","93.7792","","",""
"05.08.2023","94.8076","","",""
"08.08.2023","96.5668","","",""
"09.08.2023","96.0755","","",""
"10.08.2023","97.3999","","",""
"11.08.2023","97.2794","","",""
"12.08.2023","98.2066","","",""
"15.08.2023","101.0399","","",""
"16.08.2023","97.4217","","",""
"17.08.2023","96.7045","","",""
"18.08.2023","93.7460","","",""
"19.08.2023","93.4047","","",""
"22.08.2023","94.1424","","",""
"23.08.2023","94.1185","","",""
"24.08.2023","94.4421","","",""
"25.08.2023","94.4007","","",""
"26.08.2023","94.7117","","",""
"29.08.2023","95.4717","","",""
"30.08.2023","95.7070","","",""
"31.08.2023","95.9283","","",""
"01.09.2023","96.3344","","",""
"02.09.2023","96.3411","","",""
"05.09.2023","96.6199","","",""
"06.09.2023","97.5383","","",""
"07.09.2023","97.8439","","",""
"08.09.2023","98.1961","","",""
"09.09.2023","97.9241","","",""
"12.09.2023","96.5083","","",""
"13.09.2023","94.7035","","",""
"14.09.2023","95.9794","","",""
"15.09.2023","96.1609","","",""
"16.09.2023","96.6338","","",""
"19.09.2023","96.6472","","",""
"20.09.2023","96.2236","","",""
"21.09.2023","96.6172","","",""
"22.09.2023","96.0762","","",""
"23.09.2023","96.0419","","",""
"26.09.2023","96.1456","","",""
"27.09.2023","96.2378","","",""
"28.09.2023","96.5000","","",""
"29.09.2023","97.0018","","",""
"30.09.2023","97.4147","","",""
"03.10.2023","98.4785","","",""
"04.10.2023","99.2677","","",""
"05.10.2023","99.4555","","",""
"06.10.2023","99.6762","","",""
"07.10.2023","100.4911","","",""
"10.10.2023","101.3598","round","max-pulsating-bullet","макс"
"11.10.2023","99.9349","","",""
"12.10.2023","99.9808","","",""
"13.10.2023","96.9948","","",""
"14.10.2023","97.3075","","",""
"17.10.2023","97.2865","","",""
"18.10.2023","97.3458","","",""
"19.10.2023","97.3724","","",""
"20.10.2023","97.3074","","",""
"21.10.2023","95.9053","","",""
"24.10.2023","94.7081","","",""
"25.10.2023","93.5224","","",""
"26.10.2023","93.1507","","",""
"27.10.2023","93.5616","","",""
"28.10.2023","93.2174","","",""
"31.10.2023","93.2435","","",""
"01.11.2023","92.0226","","",""
"02.11.2023","93.2801","","",""
"03.11.2023","93.1730","","",""
"04.11.2023","93.0351","","",""
"08.11.2023","92.4151","","",""
"09.11.2023","92.1973","","",""
"10.11.2023","91.9266","","",""
"11.11.2023","92.0535","","",""
"14.11.2023","92.1185","","",""
"15.11.2023","91.2570","","",""
"16.11.2023","89.4565","","",""
"17.11.2023","88.9466","","",""
"18.11.2023","89.1237","","",""
"21.11.2023","88.4954","","",""
"22.11.2023","87.8701","","",""
"23.11.2023","88.1648","","",""
"24.11.2023","88.1206","","",""
"25.11.2023","88.8133","","",""
"28.11.2023","88.7045","","",""
"29.11.2023","88.6102","","",""
"30.11.2023","88.8841","","",""
"01.12.2023","88.5819","","",""
"02.12.2023","89.7619","","",""
"05.12.2023","90.6728","","",""
"06.12.2023","91.5823","","",""
"07.12.2023","92.7826","","",""
"08.12.2023","92.5654","","",""
"09.12.2023","91.6402","","",""
"12.12.2023","90.9846","","",""
"13.12.2023","90.2158","","",""
"14.12.2023","89.8926","","",""
"15.12.2023","89.6741","","",""
"16.12.2023","89.6966","","",""
"19.12.2023","90.4162","","",""
"20.12.2023","90.0870","","",""
"21.12.2023","90.4056","","",""
"22.12.2023","91.7062","","",""
"23.12.2023","91.9389","","",""
"26.12.2023","91.9690","","",""
"27.12.2023","91.7069","","",""
"28.12.2023","91.7051","","",""
"29.12.2023","90.3041","","",""
"30.12.2023","89.6883","","",""
"10.01.2024","90.4040","","",""
"11.01.2024","89.3939","","",""
"12.01.2024","88.7818","","",""
"13.01.2024","88.1324","","",""
"16.01.2024","87.6772","","",""
"17.01.2024","87.6457","","",""
"18.01.2024","88.3540","","",""
"19.01.2024","88.6610","","",""
"20.01.2024","88.5896","","",""
"23.01.2024","87.9724","","",""
"24.01.2024","87.9199","","",""
"25.01.2024","88.2829","","",""
"26.01.2024","88.6562","","",""
"27.01.2024","89.5159","","",""
"30.01.2024","89.6090","","",""
"31.01.2024","89.2887","","",""
"01.02.2024","89.6678","","",""
"02.02.2024","90.2299","","",""
"03.02.2024","90.6626","","",""
"06.02.2024","91.2434","","",""
"07.02.2024","90.6842","","",""
"08.02.2024","91.1514","","",""
"09.02.2024","91.2561","","",""
"10.02.2024","90.8901","","",""
"13.02.2024","91.0758","","",""
"14.02.2024","91.2057","","",""
"15.02.2024","91.4316","","",""
"16.02.2024","91.8237","","",""
"17.02.2024","92.5492","","",""
"20.02.2024","92.4102","","",""
"21.02.2024","92.3490","","",""
"22.02.2024","92.4387","","",""
"23.02.2024","92.7519","","",""
"27.02.2024","92.6321","","",""
"28.02.2024","92.0425","","",""
"29.02.2024","91.8692","","",""
"01.03.2024","90.8423","","",""
"02.03.2024","91.3336","","",""
"05.03.2024","91.3534","","",""
"06.03.2024","91.1604","","",""
"07.03.2024","90.3412","","",""
"08.03.2024","90.7493","","",""
"12.03.2024","90.6252","","",""
"13.03.2024","90.8818","","",""
"19.03.2024","91.9829","","",""
"20.03.2024","92.2243","","",""
"21.03.2024","92.6861","","",""
"22.03.2024","91.9499","","",""
"23.03.2024","92.6118","","",""
"26.03.2024","92.7761","","",""
1 my_date my_value bullet bulletClass label
2 28.03.2023 76.5662
3 31.03.2023 77.0863
4 01.04.2023 77.3233
5 04.04.2023 77.9510
6 05.04.2023 79.3563
7 06.04.2023 79.4961
8 07.04.2023 80.6713
9 08.04.2023 82.3988
10 11.04.2023 81.7441
11 12.04.2023 82.1799
12 13.04.2023 82.0934
13 14.04.2023 81.6758
14 15.04.2023 81.5045
15 18.04.2023 81.6279
16 19.04.2023 81.6028
17 20.04.2023 81.6549
18 21.04.2023 81.6188
19 22.04.2023 81.4863
20 25.04.2023 81.2745
21 26.04.2023 81.5499
22 27.04.2023 81.6274
23 28.04.2023 81.5601
24 29.04.2023 80.5093
25 03.05.2023 79.9609
26 04.05.2023 79.3071
27 05.05.2023 78.6139
28 06.05.2023 76.8207
29 11.05.2023 76.6929
30 12.05.2023 75.8846 round min-pulsating-bullet мин
31 13.05.2023 77.2041
32 16.05.2023 79.1004
33 17.05.2023 79.9798
34 18.05.2023 80.7642
35 19.05.2023 80.0366
36 20.05.2023 79.9093
37 23.05.2023 79.9379
38 24.05.2023 80.1665
39 25.05.2023 79.9669
40 26.05.2023 79.9841
41 27.05.2023 79.9667
42 30.05.2023 80.0555
43 31.05.2023 80.6872
44 01.06.2023 80.9942
45 02.06.2023 80.9657
46 03.06.2023 80.8756
47 06.06.2023 81.3294
48 07.06.2023 81.2502
49 08.06.2023 81.4581
50 09.06.2023 82.0930
51 10.06.2023 82.6417
52 14.06.2023 83.6405
53 15.06.2023 84.3249
54 16.06.2023 83.9611
55 17.06.2023 83.6498
56 20.06.2023 83.9866
57 21.06.2023 84.2336
58 22.06.2023 84.2467
59 23.06.2023 83.6077
60 24.06.2023 84.0793
61 27.06.2023 84.6642
62 28.06.2023 85.0504
63 29.06.2023 85.6192
64 30.06.2023 87.0341
65 01.07.2023 88.3844
66 04.07.2023 89.3255
67 05.07.2023 89.5450
68 06.07.2023 90.3380
69 07.07.2023 92.5695
70 08.07.2023 91.6879
71 11.07.2023 91.4931
72 12.07.2023 90.5045
73 13.07.2023 90.6253
74 14.07.2023 90.1757
75 15.07.2023 90.1190
76 18.07.2023 90.4217
77 19.07.2023 90.6906
78 20.07.2023 91.2046
79 21.07.2023 90.8545
80 22.07.2023 90.3846
81 25.07.2023 90.4890
82 26.07.2023 90.0945
83 27.07.2023 90.0468
84 28.07.2023 90.0225
85 29.07.2023 90.9783
86 01.08.2023 91.5923
87 02.08.2023 91.7755
88 03.08.2023 92.8410
89 04.08.2023 93.7792
90 05.08.2023 94.8076
91 08.08.2023 96.5668
92 09.08.2023 96.0755
93 10.08.2023 97.3999
94 11.08.2023 97.2794
95 12.08.2023 98.2066
96 15.08.2023 101.0399
97 16.08.2023 97.4217
98 17.08.2023 96.7045
99 18.08.2023 93.7460
100 19.08.2023 93.4047
101 22.08.2023 94.1424
102 23.08.2023 94.1185
103 24.08.2023 94.4421
104 25.08.2023 94.4007
105 26.08.2023 94.7117
106 29.08.2023 95.4717
107 30.08.2023 95.7070
108 31.08.2023 95.9283
109 01.09.2023 96.3344
110 02.09.2023 96.3411
111 05.09.2023 96.6199
112 06.09.2023 97.5383
113 07.09.2023 97.8439
114 08.09.2023 98.1961
115 09.09.2023 97.9241
116 12.09.2023 96.5083
117 13.09.2023 94.7035
118 14.09.2023 95.9794
119 15.09.2023 96.1609
120 16.09.2023 96.6338
121 19.09.2023 96.6472
122 20.09.2023 96.2236
123 21.09.2023 96.6172
124 22.09.2023 96.0762
125 23.09.2023 96.0419
126 26.09.2023 96.1456
127 27.09.2023 96.2378
128 28.09.2023 96.5000
129 29.09.2023 97.0018
130 30.09.2023 97.4147
131 03.10.2023 98.4785
132 04.10.2023 99.2677
133 05.10.2023 99.4555
134 06.10.2023 99.6762
135 07.10.2023 100.4911
136 10.10.2023 101.3598 round max-pulsating-bullet макс
137 11.10.2023 99.9349
138 12.10.2023 99.9808
139 13.10.2023 96.9948
140 14.10.2023 97.3075
141 17.10.2023 97.2865
142 18.10.2023 97.3458
143 19.10.2023 97.3724
144 20.10.2023 97.3074
145 21.10.2023 95.9053
146 24.10.2023 94.7081
147 25.10.2023 93.5224
148 26.10.2023 93.1507
149 27.10.2023 93.5616
150 28.10.2023 93.2174
151 31.10.2023 93.2435
152 01.11.2023 92.0226
153 02.11.2023 93.2801
154 03.11.2023 93.1730
155 04.11.2023 93.0351
156 08.11.2023 92.4151
157 09.11.2023 92.1973
158 10.11.2023 91.9266
159 11.11.2023 92.0535
160 14.11.2023 92.1185
161 15.11.2023 91.2570
162 16.11.2023 89.4565
163 17.11.2023 88.9466
164 18.11.2023 89.1237
165 21.11.2023 88.4954
166 22.11.2023 87.8701
167 23.11.2023 88.1648
168 24.11.2023 88.1206
169 25.11.2023 88.8133
170 28.11.2023 88.7045
171 29.11.2023 88.6102
172 30.11.2023 88.8841
173 01.12.2023 88.5819
174 02.12.2023 89.7619
175 05.12.2023 90.6728
176 06.12.2023 91.5823
177 07.12.2023 92.7826
178 08.12.2023 92.5654
179 09.12.2023 91.6402
180 12.12.2023 90.9846
181 13.12.2023 90.2158
182 14.12.2023 89.8926
183 15.12.2023 89.6741
184 16.12.2023 89.6966
185 19.12.2023 90.4162
186 20.12.2023 90.0870
187 21.12.2023 90.4056
188 22.12.2023 91.7062
189 23.12.2023 91.9389
190 26.12.2023 91.9690
191 27.12.2023 91.7069
192 28.12.2023 91.7051
193 29.12.2023 90.3041
194 30.12.2023 89.6883
195 10.01.2024 90.4040
196 11.01.2024 89.3939
197 12.01.2024 88.7818
198 13.01.2024 88.1324
199 16.01.2024 87.6772
200 17.01.2024 87.6457
201 18.01.2024 88.3540
202 19.01.2024 88.6610
203 20.01.2024 88.5896
204 23.01.2024 87.9724
205 24.01.2024 87.9199
206 25.01.2024 88.2829
207 26.01.2024 88.6562
208 27.01.2024 89.5159
209 30.01.2024 89.6090
210 31.01.2024 89.2887
211 01.02.2024 89.6678
212 02.02.2024 90.2299
213 03.02.2024 90.6626
214 06.02.2024 91.2434
215 07.02.2024 90.6842
216 08.02.2024 91.1514
217 09.02.2024 91.2561
218 10.02.2024 90.8901
219 13.02.2024 91.0758
220 14.02.2024 91.2057
221 15.02.2024 91.4316
222 16.02.2024 91.8237
223 17.02.2024 92.5492
224 20.02.2024 92.4102
225 21.02.2024 92.3490
226 22.02.2024 92.4387
227 23.02.2024 92.7519
228 27.02.2024 92.6321
229 28.02.2024 92.0425
230 29.02.2024 91.8692
231 01.03.2024 90.8423
232 02.03.2024 91.3336
233 05.03.2024 91.3534
234 06.03.2024 91.1604
235 07.03.2024 90.3412
236 08.03.2024 90.7493
237 12.03.2024 90.6252
238 13.03.2024 90.8818
239 19.03.2024 91.9829
240 20.03.2024 92.2243
241 21.03.2024 92.6861
242 22.03.2024 91.9499
243 23.03.2024 92.6118
244 26.03.2024 92.7761

3756
data/ds_salaries.csv Normal file

File diff suppressed because it is too large Load Diff

21614
data/kc_house_data.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

90837
data/neo.csv Normal file

File diff suppressed because it is too large Load Diff

848
notebooks/lab1.ipynb Normal file

File diff suppressed because one or more lines are too long

312
notebooks/lab2_1.ipynb Normal file
View File

@ -0,0 +1,312 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"../data/kc_house_data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(df.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(df.isnull().any())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[3 5 4 1 2]\n"
]
}
],
"source": [
"print(df.condition.unique())\n",
"\n",
"data = df[\n",
" [\n",
" \"price\",\n",
" \"bedrooms\",\n",
" \"bathrooms\",\n",
" \"sqft_living\",\n",
" \"sqft_lot\",\n",
" \"floors\",\n",
" \"view\",\n",
" \"condition\",\n",
" \"grade\",\n",
" \"sqft_above\",\n",
" \"sqft_basement\",\n",
" \"yr_built\",\n",
" \"yr_renovated\",\n",
" \"zipcode\",\n",
" \"lat\",\n",
" \"long\",\n",
" ]\n",
"].copy()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (12967, 16)\n",
"condition\n",
"3 8418\n",
"4 3407\n",
"5 1021\n",
"2 103\n",
"1 18\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (4323, 16)\n",
"condition\n",
"3 2806\n",
"4 1136\n",
"5 340\n",
"2 35\n",
"1 6\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (4323, 16)\n",
"condition\n",
"3 2807\n",
"4 1136\n",
"5 340\n",
"2 34\n",
"1 6\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
" data,\n",
" stratify_colname=\"condition\",\n",
" frac_train=0.60,\n",
" frac_val=0.20,\n",
" frac_test=0.20,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.condition.value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val.condition.value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test.condition.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (12967, 16)\n",
"condition\n",
"3 8418\n",
"4 3407\n",
"5 1021\n",
"2 103\n",
"1 18\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (42073, 16)\n",
"condition\n",
"5 8464\n",
"2 8421\n",
"1 8420\n",
"3 8418\n",
"4 8350\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.condition.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"condition\"])\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.condition.value_counts())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

648
notebooks/lab2_2.ipynb Normal file
View File

@ -0,0 +1,648 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"../data/car_price_prediction.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Prod_year</th>\n",
" <th>Category</th>\n",
" <th>Leather interior</th>\n",
" <th>Fuel type</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear_box_type</th>\n",
" <th>Drive_wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>45654403</td>\n",
" <td>13328</td>\n",
" <td>1399</td>\n",
" <td>LEXUS</td>\n",
" <td>RX 450</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>3.5</td>\n",
" <td>186005 km</td>\n",
" <td>6.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>44731507</td>\n",
" <td>16621</td>\n",
" <td>1018</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Equinox</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>3</td>\n",
" <td>192000 km</td>\n",
" <td>6.0</td>\n",
" <td>Tiptronic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>45774419</td>\n",
" <td>8467</td>\n",
" <td>-</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2006</td>\n",
" <td>Hatchback</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>200000 km</td>\n",
" <td>4.0</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Black</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45769185</td>\n",
" <td>3607</td>\n",
" <td>862</td>\n",
" <td>FORD</td>\n",
" <td>Escape</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>2.5</td>\n",
" <td>168966 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45809263</td>\n",
" <td>11726</td>\n",
" <td>446</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2014</td>\n",
" <td>Hatchback</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>91901 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID Price Levy Manufacturer Model Prod_year Category \\\n",
"0 45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
"1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
"2 45774419 8467 - HONDA FIT 2006 Hatchback \n",
"3 45769185 3607 862 FORD Escape 2011 Jeep \n",
"4 45809263 11726 446 HONDA FIT 2014 Hatchback \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"0 Yes Hybrid 3.5 186005 km 6.0 \n",
"1 No Petrol 3 192000 km 6.0 \n",
"2 No Petrol 1.3 200000 km 4.0 \n",
"3 Yes Hybrid 2.5 168966 km 4.0 \n",
"4 Yes Petrol 1.3 91901 km 4.0 \n",
"\n",
" Gear_box_type Drive_wheels Doors Wheel Color Airbags \n",
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
"2 Variator Front 04-May Right-hand drive Black 2 \n",
"3 Automatic 4x4 04-May Left wheel White 0 \n",
"4 Automatic Front 04-May Left wheel Silver 4 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID 0\n",
"Price 0\n",
"Levy 0\n",
"Manufacturer 0\n",
"Model 0\n",
"Prod_year 0\n",
"Category 0\n",
"Leather interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear_box_type 0\n",
"Drive_wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(df.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID False\n",
"Price False\n",
"Levy False\n",
"Manufacturer False\n",
"Model False\n",
"Prod_year False\n",
"Category False\n",
"Leather interior False\n",
"Fuel type False\n",
"Engine volume False\n",
"Mileage False\n",
"Cylinders False\n",
"Gear_box_type False\n",
"Drive_wheels False\n",
"Doors False\n",
"Wheel False\n",
"Color False\n",
"Airbags False\n",
"dtype: bool\n"
]
}
],
"source": [
"print(df.isnull().any())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['1399' '1018' '-' '862' '446' '891' '761' '751' '394' '1053' '1055'\n",
" '1079' '810' '2386' '1850' '531' '586' '1249' '2455' '583' '1537' '1288'\n",
" '915' '1750' '707' '1077' '1486' '1091' '650' '382' '1436' '1194' '503'\n",
" '1017' '1104' '639' '629' '919' '781' '530' '640' '765' '777' '779' '934'\n",
" '769' '645' '1185' '1324' '830' '1187' '1111' '760' '642' '1604' '1095'\n",
" '966' '473' '1138' '1811' '988' '917' '1156' '687' '11714' '836' '1347'\n",
" '2866' '1646' '259' '609' '697' '585' '475' '690' '308' '1823' '1361'\n",
" '1273' '924' '584' '2078' '831' '1172' '893' '1872' '1885' '1266' '447'\n",
" '2148' '1730' '730' '289' '502' '333' '1325' '247' '879' '1342' '1327'\n",
" '1598' '1514' '1058' '738' '1935' '481' '1522' '1282' '456' '880' '900'\n",
" '798' '1277' '442' '1051' '790' '1292' '1047' '528' '1211' '1493' '1793'\n",
" '574' '930' '1998' '271' '706' '1481' '1677' '1661' '1286' '1408' '1090'\n",
" '595' '1451' '1267' '993' '1714' '878' '641' '749' '1511' '603' '353'\n",
" '877' '1236' '1141' '397' '784' '1024' '1357' '1301' '770' '922' '1438'\n",
" '753' '607' '1363' '638' '490' '431' '565' '517' '833' '489' '1760' '986'\n",
" '1841' '1620' '1360' '474' '1099' '978' '1624' '1946' '1268' '1307' '696'\n",
" '649' '666' '2151' '551' '800' '971' '1323' '2377' '1845' '1083' '694'\n",
" '463' '419' '345' '1515' '1505' '2056' '1203' '729' '460' '1356' '876'\n",
" '911' '1190' '780' '448' '2410' '1848' '1148' '834' '1275' '1028' '1197'\n",
" '724' '890' '1705' '505' '789' '2959' '518' '461' '1719' '2858' '3156'\n",
" '2225' '2177' '1968' '1888' '1308' '2736' '1103' '557' '2195' '843'\n",
" '1664' '723' '4508' '562' '501' '2018' '1076' '1202' '3301' '691' '1440'\n",
" '1869' '1178' '418' '1820' '1413' '488' '1304' '363' '2108' '521' '1659'\n",
" '87' '1411' '1528' '3292' '7058' '1578' '627' '874' '1996' '1488' '5679'\n",
" '1234' '5603' '400' '889' '3268' '875' '949' '2265' '441' '742' '425'\n",
" '2476' '2971' '614' '1816' '1375' '1405' '2297' '1062' '1113' '420'\n",
" '2469' '658' '1951' '2670' '2578' '1995' '1032' '994' '1011' '2421'\n",
" '1296' '155' '494' '426' '1086' '961' '2236' '1829' '764' '1834' '1054'\n",
" '617' '1529' '2266' '637' '626' '1832' '1016' '2002' '1756' '746' '1285'\n",
" '2690' '1118' '5332' '980' '1807' '970' '1228' '1195' '1132' '1768'\n",
" '1384' '1080' '7063' '1817' '1452' '1975' '1368' '702' '1974' '1781'\n",
" '1036' '944' '663' '364' '1539' '1345' '1680' '2209' '741' '1575' '695'\n",
" '1317' '294' '1525' '424' '997' '1473' '1552' '2819' '2188' '1668' '3057'\n",
" '799' '1502' '2606' '552' '1694' '1759' '1110' '399' '1470' '1174' '5877'\n",
" '1474' '1688' '526' '686' '5908' '1107' '2070' '1468' '1246' '1685' '556'\n",
" '1533' '1917' '1346' '732' '692' '579' '421' '362' '3505' '1855' '2711'\n",
" '1586' '3739' '681' '1708' '2278' '1701' '722' '1482' '928' '827' '832'\n",
" '527' '604' '173' '1341' '3329' '1553' '859' '167' '916' '828' '2082'\n",
" '1176' '1108' '975' '3008' '1516' '2269' '1699' '2073' '1031' '1503'\n",
" '2364' '1030' '1442' '5666' '2715' '1437' '2067' '1426' '2908' '1279'\n",
" '866' '4283' '279' '2658' '3015' '2004' '1391' '4736' '748' '1466' '644'\n",
" '683' '2705' '1297' '731' '1252' '2216' '3141' '3273' '1518' '1723'\n",
" '1588' '972' '682' '1094' '668' '175' '967' '402' '3894' '1960' '1599'\n",
" '2000' '2084' '1621' '714' '1109' '3989' '873' '1572' '1163' '1991'\n",
" '1716' '1673' '2562' '2874' '965' '462' '605' '1948' '1736' '3518' '2054'\n",
" '2467' '1681' '1272' '1205' '750' '2156' '2566' '115' '524' '3184' '676'\n",
" '1678' '612' '328' '955' '1441' '1675' '3965' '2909' '623' '822' '867'\n",
" '3025' '1993' '792' '636' '4057' '3743' '2337' '2570' '2418' '2472'\n",
" '3910' '1662' '2123' '2628' '3208' '2080' '3699' '2913' '864' '2505'\n",
" '870' '7536' '1924' '1671' '1064' '1836' '1866' '4741' '841' '1369'\n",
" '5681' '3112' '1366' '2223' '1198' '1039' '3811' '3571' '1387' '1171'\n",
" '1365' '1531' '1590' '11706' '2308' '4860' '1641' '1045' '1901']\n"
]
}
],
"source": [
"print(df[\"Levy\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df[\"Levy\"] = df[\"Levy\"].replace({'-' : None})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Levy процент пустых значений: 30.25%\n"
]
}
],
"source": [
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполнение пропущенных данных"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df.fillna({\"Levy\": 0}, inplace=True)\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Automatic' 'Tiptronic' 'Variator' 'Manual']\n"
]
}
],
"source": [
"print(df.Gear_box_type.unique())\n",
"\n",
"data = df[\n",
" [\n",
" \"Price\",\n",
" \"Gear_box_type\",\n",
" ]\n",
"].copy()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (11542, 2)\n",
"Gear_box_type\n",
"Automatic 8108\n",
"Tiptronic 1861\n",
"Manual 1125\n",
"Variator 448\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (3847, 2)\n",
"Gear_box_type\n",
"Automatic 2703\n",
"Tiptronic 620\n",
"Manual 375\n",
"Variator 149\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (3848, 2)\n",
"Gear_box_type\n",
"Automatic 2703\n",
"Tiptronic 621\n",
"Manual 375\n",
"Variator 149\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
" data,\n",
" stratify_colname=\"Gear_box_type\",\n",
" frac_train=0.60,\n",
" frac_val=0.20,\n",
" frac_test=0.20,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Gear_box_type.value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val.Gear_box_type.value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test.Gear_box_type.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка с избытком (oversampling)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (11542, 2)\n",
"Gear_box_type\n",
"Automatic 8108\n",
"Tiptronic 1861\n",
"Manual 1125\n",
"Variator 448\n",
"Name: count, dtype: int64\n"
]
},
{
"ename": "ValueError",
"evalue": "could not convert string to float: 'Automatic'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_9996\\2277749880.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGear_box_type\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0mX_resampled\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mada\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Gear_box_type\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Обучающая выборка после oversampling: \"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf_train_adasyn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 204\u001b[0m \u001b[0my_resampled\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0marray\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mlike\u001b[0m \u001b[0mof\u001b[0m \u001b[0mshape\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mn_samples_new\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 206\u001b[0m \"\"\"\n\u001b[0;32m 207\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 208\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_resample\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mcorresponding\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0mof\u001b[0m \u001b[1;33m`\u001b[0m\u001b[0mX_resampled\u001b[0m\u001b[1;33m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 103\u001b[0m \"\"\"\n\u001b[0;32m 104\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[0marrays_transformer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mArraysTransformer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 106\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 107\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m self.sampling_strategy_ = check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msampling_strategy\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sampling_type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\imblearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, accept_sparse)\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_check_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"csr\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"csc\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 160\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_target_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindicate_one_vs_all\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 161\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 162\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinarize_y\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m 646\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;34m\"estimator\"\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 647\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 648\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"y\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 649\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 650\u001b[1;33m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 651\u001b[0m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 652\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 653\u001b[0m \u001b[1;32mif\u001b[0m 
\u001b[1;32mnot\u001b[0m \u001b[0mno_val_X\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"ensure_2d\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m 1297\u001b[0m raise ValueError(\n\u001b[0;32m 1298\u001b[0m \u001b[1;33mf\"\u001b[0m\u001b[1;33m{\u001b[0m\u001b[0mestimator_name\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m requires y to be passed, but the target y is None\u001b[0m\u001b[1;33m\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1299\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1300\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1301\u001b[1;33m X = check_array(\n\u001b[0m\u001b[0;32m 1302\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1303\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1304\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 1009\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1012\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1013\u001b[1;33m \u001b[1;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1014\u001b[0m raise ValueError(\n\u001b[0;32m 1015\u001b[0m \u001b[1;34m\"Complex data not 
supported\\n{}\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1016\u001b[0m \u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\sklearn\\utils\\_array_api.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[1;31m# Use NumPy API to support order\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 742\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 743\u001b[0m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 744\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 745\u001b[1;33m \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 746\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 748\u001b[0m \u001b[1;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, dtype, copy)\u001b[0m\n\u001b[0;32m 2149\u001b[0m def __array__(\n\u001b[0;32m 2150\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[1;33m|\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2151\u001b[0m \u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2152\u001b[0m \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2153\u001b[1;33m \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2154\u001b[0m if (\n\u001b[0;32m 2155\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2156\u001b[0m \u001b[1;32mand\u001b[0m 
\u001b[0musing_copy_on_write\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: could not convert string to float: 'Automatic'"
]
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Gear_box_type.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Gear_box_type\"])\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.Gear_box_type.value_counts())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

2584
notebooks/lab3_1.ipynb Normal file

File diff suppressed because one or more lines are too long

2971
notebooks/lab3_2.ipynb Normal file

File diff suppressed because one or more lines are too long

3418
notebooks/lab4.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

2811
notebooks/lab4_sandbox.ipynb Normal file

File diff suppressed because one or more lines are too long

1195
notebooks/lab5_1.ipynb Normal file

File diff suppressed because one or more lines are too long

13326
notebooks/lab6_1.ipynb Normal file

File diff suppressed because it is too large Load Diff

17
notebooks/transformers.py Normal file
View File

@ -0,0 +1,17 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class CarsFeatures(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer that derives a car's age.

    Adds an ``Age`` column computed as ``current_year - Prod. year``.
    Expects the input frame to carry a ``Prod. year`` column.
    """

    def __init__(self, current_year=2020):
        # Reference year for the age computation. Kept as a constructor
        # parameter (default 2020, the original hard-coded value) so the
        # transformer generalizes to newer data without code changes.
        self.current_year = current_year

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # Work on a copy so the caller's DataFrame is not mutated in place
        # (the original wrote the new column directly into X).
        X = X.copy()
        X["Age"] = self.current_year - X["Prod. year"]
        return X

    def get_feature_names_out(self, features_in):
        # Output features are the inputs plus the engineered "Age" column.
        return np.append(features_in, ["Age"], axis=0)

100
notebooks/utils_clusters.py Normal file
View File

@ -0,0 +1,100 @@
import math
from typing import Dict, List, Tuple
import numpy as np
from pandas import DataFrame
from sklearn import cluster
from sklearn.metrics import silhouette_samples, silhouette_score
def run_agglomerative(
    df: DataFrame, num_clusters: int | None = 2
) -> cluster.AgglomerativeClustering:
    """Fit hierarchical (agglomerative) clustering on ``df``.

    ``compute_distances=True`` makes the fitted model expose ``distances_``,
    which ``get_linkage_matrix`` needs for dendrogram construction.
    """
    model = cluster.AgglomerativeClustering(
        n_clusters=num_clusters,
        compute_distances=True,
    )
    return model.fit(df)
def get_linkage_matrix(model: cluster.AgglomerativeClustering) -> np.ndarray:
    """Convert a fitted agglomerative model into a SciPy-style linkage matrix.

    Each row holds (child_a, child_b, merge_distance, subtree_size), the
    format ``scipy.cluster.hierarchy.dendrogram`` consumes.
    """
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])  # type: ignore
    for i, (left, right) in enumerate(model.children_):  # type: ignore
        size = 0
        for child in (left, right):
            # Indices below n_samples are leaves (single observations);
            # larger indices refer to earlier merges, whose size we recorded.
            size += 1 if child < n_samples else counts[child - n_samples]
        counts[i] = size
    return np.column_stack([model.children_, model.distances_, counts]).astype(float)
def print_cluster_result(
    df: DataFrame, clusters_num: int, labels: np.ndarray, separator: str = ", "
):
    """Print, for each cluster, its member count and the members' index labels."""
    for cid in range(clusters_num):
        members = np.flatnonzero(labels == cid)
        names = separator.join(str(df.index[i]) for i in members)
        print(f"Cluster {cid + 1} ({len(members)}):")
        print(names)
        print("")
        print("--------")
def run_kmeans(
    df: DataFrame, num_clusters: int, random_state: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster ``df`` with k-means; return (per-row labels, cluster centroids)."""
    model = cluster.KMeans(n_clusters=num_clusters, random_state=random_state)
    return model.fit_predict(df), model.cluster_centers_
def fit_kmeans(
    reduced_data: np.ndarray, num_clusters: int, random_state: int
) -> cluster.KMeans:
    """Fit k-means on (typically PCA-reduced) data and return the fitted model."""
    return cluster.KMeans(
        n_clusters=num_clusters, random_state=random_state
    ).fit(reduced_data)
def _get_kmeans_range(
    df: DataFrame | np.ndarray, random_state: int
) -> Tuple[List, range]:
    """Fit one k-means model per candidate k in 2..floor(sqrt(n_samples)).

    The sqrt(n) upper bound is a common heuristic for the largest sensible
    cluster count. Returns the fitted models and the k-range they cover.
    """
    upper = int(math.sqrt(len(df)))
    clusters_range = range(2, upper + 1)
    models = []
    for k in clusters_range:
        models.append(cluster.KMeans(n_clusters=k, random_state=random_state).fit(df))
    return models, clusters_range
def get_clusters_inertia(df: DataFrame, random_state: int) -> Tuple[List, range]:
    """Collect each candidate fit's inertia (for drawing an elbow diagram)."""
    models, clusters_range = _get_kmeans_range(df, random_state)
    inertias = [m.inertia_ for m in models]
    return inertias, clusters_range
def get_clusters_silhouette_scores(
    df: DataFrame, random_state: int
) -> Tuple[List, range]:
    """Compute the mean silhouette score of each candidate k-means fit."""
    models, clusters_range = _get_kmeans_range(df, random_state)
    scores = []
    for m in models:
        scores.append(float(silhouette_score(df, m.labels_)))
    return scores, clusters_range
def get_clusters_silhouettes(df: np.ndarray, random_state: int) -> Dict:
    """Map each candidate n_clusters to (avg silhouette, per-sample silhouettes, model).

    The per-sample values feed the silhouette plots in visual.py.
    """
    models, _ = _get_kmeans_range(df, random_state)
    result: Dict = {}
    for model in models:
        avg = silhouette_score(df, model.labels_)
        per_sample = silhouette_samples(df, model.labels_)
        result[model.n_clusters] = (avg, per_sample, model)
    return result

242
notebooks/visual.py Normal file
View File

@ -0,0 +1,242 @@
from typing import Any, Dict, List
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans
def draw_data_2d(
    df: DataFrame,
    col1: int,
    col2: int,
    y: List | None = None,
    classes: List | None = None,
    subplot: Any | None = None,
):
    """Scatter-plot two dataframe columns (selected by position), colored by ``y``.

    When ``classes`` is given, a legend maps colors to class names.
    Draws onto ``subplot`` if supplied, otherwise creates fresh axes.
    """
    ax = subplot if subplot is not None else plt.subplots()[1]
    x_name, y_name = df.columns[col1], df.columns[col2]
    points = ax.scatter(df[x_name], df[y_name], c=y)
    ax.set(xlabel=x_name, ylabel=y_name)
    if classes is not None:
        handles = points.legend_elements()[0]
        ax.legend(handles, classes, loc="lower right", title="Classes")
def draw_dendrogram(linkage_matrix: np.ndarray):
    """Render the top three merge levels of a hierarchical-clustering dendrogram."""
    hierarchy.dendrogram(linkage_matrix, p=3, truncate_mode="level")
def draw_cluster_results(
    df: DataFrame,
    col1: int,
    col2: int,
    labels: np.ndarray,
    cluster_centers: np.ndarray,
    subplot: Any | None = None,
):
    """Scatter cluster members (one color per label) plus black centroid dots.

    Columns are selected by position; centroids are indexed with the same
    column positions, so they must live in the same feature space as ``df``.
    """
    ax = plt if subplot is None else subplot
    x_name, y_name = df.columns[col1], df.columns[col2]
    for label in np.unique(labels):
        members = df[labels == label]
        ax.scatter(members[x_name], members[y_name], label=label)
    ax.scatter(cluster_centers[:, col1], cluster_centers[:, col2], s=80, color="k")
def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Plot k-means decision regions over 2-D (PCA-reduced) data.

    Colors the plane by the predicted cluster on a fine mesh, overlays the
    data points, and marks centroids with white crosses.
    """
    # Mesh step; smaller values give smoother region boundaries at the cost
    # of more predict() calls.
    h = 0.02
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict a cluster for every mesh point, then reshape back onto the grid.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    # Hide tick labels; the axes are in abstract PCA-component units.
    plt.xticks(())
    plt.yticks(())
def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Line-plot a per-k clustering score against the candidate cluster counts."""
    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")
    plt.title(title)
    plt.xlabel("$k$", fontsize=8)
    plt.ylabel(score_name, fontsize=8)
def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia vs. k — the 'elbow' suggests a good cluster count."""
    _draw_cluster_scores(
        inertias, clusters_range, score_name="Inertia", title="The Elbow Diagram"
    )
def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot the mean silhouette score vs. k; higher indicates better separation."""
    _draw_cluster_scores(
        silhouette,
        clusters_range,
        score_name="Silhouette score",
        title="The Silhouette score",
    )
def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: List,
    cluster_labels: List,
):
    """Draw a silhouette plot: one horizontal band per cluster.

    Each band spans that cluster's sorted per-sample silhouette coefficients;
    the red dashed line marks the average score over all samples.
    """
    # Silhouette coefficients lie in [-1, 1]; clip the left edge at -0.1
    # to leave room for the cluster-id labels.
    ax.set_xlim([-0.1, 1])
    # Reserve a 10-unit vertical gap between consecutive cluster bands.
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * 10])
    y_lower = 10
    for i in range(n_clusters):
        # NOTE(review): boolean indexing here assumes sample_silhouette_values
        # and cluster_labels are numpy arrays despite the List annotations.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        # Label each band with its cluster id at the band's vertical midpoint.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Vertical reference line at the dataset-wide average silhouette score.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the 2-D (dimensionality-reduced) samples on ``ax``, colored by
    cluster, with each cluster center drawn as a numbered white disc."""
    point_colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)  # type: ignore
    ax.scatter(
        reduced_data[:, 0],
        reduced_data[:, 1],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=point_colors,
        edgecolor="k",
    )
    # White discs behind the numeric labels make the centers stand out.
    ax.scatter(
        cluster_centers[:, 0],
        cluster_centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for center_idx, center in enumerate(cluster_centers):
        ax.scatter(
            center[0],
            center[1],
            marker="$%d$" % center_idx,
            alpha=1,
            s=50,
            edgecolor="k",
        )
    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Draw one two-panel figure per entry of ``silhouettes``: the silhouette
    plot on the left and the clustered scatter on the right.

    ``silhouettes`` maps a cluster count to a sequence holding, in order:
    the average silhouette score, the per-sample silhouette values, and the
    fitted clustering model (exposing ``labels_`` and ``cluster_centers_``).
    """
    for n_clusters, payload in silhouettes.items():
        silhouette_avg = payload[0]
        sample_values = payload[1]
        model = payload[2]
        fig, (left_ax, right_ax) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        _draw_silhouette(
            left_ax,
            reduced_data,
            n_clusters,
            silhouette_avg,
            sample_values,
            model.labels_,
        )
        _draw_cluster_data(
            right_ax,
            reduced_data,
            n_clusters,
            model.labels_,
            model.cluster_centers_,
        )
        plt.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters,
            fontsize=14,
            fontweight="bold",
        )

206
poetry.lock generated
View File

@ -467,6 +467,17 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "cloudpickle"
version = "3.1.0"
description = "Pickler class to extend the standard pickle.Pickler functionality"
optional = false
python-versions = ">=3.8"
files = [
{file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"},
{file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"},
]
[[package]]
name = "colorama"
version = "0.4.6"
@ -661,6 +672,17 @@ files = [
[package.extras]
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
[[package]]
name = "farama-notifications"
version = "0.0.4"
description = "Notifications for all Farama Foundation maintained libraries."
optional = false
python-versions = "*"
files = [
{file = "Farama-Notifications-0.0.4.tar.gz", hash = "sha256:13fceff2d14314cf80703c8266462ebf3733c7d165336eee998fc58e545efd18"},
{file = "Farama_Notifications-0.0.4-py3-none-any.whl", hash = "sha256:14de931035a41961f7c056361dc7f980762a143d05791ef5794a751a2caf05ae"},
]
[[package]]
name = "fastjsonschema"
version = "2.20.0"
@ -675,6 +697,41 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "featuretools"
version = "1.31.0"
description = "a framework for automated feature engineering"
optional = false
python-versions = "<4,>=3.9"
files = [
{file = "featuretools-1.31.0-py3-none-any.whl", hash = "sha256:87c94e9ae959c89acd83da96bd2583f3ef0f6daaa9639cbb6e46dbde2c742a18"},
{file = "featuretools-1.31.0.tar.gz", hash = "sha256:01bfb17fcc1715b4c3623c7bc94a8982122c4a0fa03350ed478601bb81f90155"},
]
[package.dependencies]
cloudpickle = ">=1.5.0"
holidays = ">=0.17"
numpy = ">=1.25.0"
packaging = ">=20.0"
pandas = ">=2.0.0"
psutil = ">=5.7.0"
scipy = ">=1.10.0"
tqdm = ">=4.66.3"
woodwork = ">=0.28.0"
[package.extras]
autonormalize = ["autonormalize (>=2.0.1)"]
complete = ["featuretools[dask,nlp,premium]"]
dask = ["dask[dataframe] (>=2023.2.0)", "distributed (>=2023.2.0)"]
dev = ["black[jupyter] (>=23.1.0)", "featuretools[dask,docs,test]", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)"]
docs = ["Sphinx (==5.1.1)", "autonormalize (>=2.0.1)", "click (>=7.0.0)", "featuretools[dask,test]", "ipython (==8.4.0)", "jupyter (==1.0.0)", "jupyter-client (>=8.0.2)", "matplotlib (==3.7.2)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"]
nlp = ["nlp-primitives (>=2.12.0)"]
premium = ["premium-primitives (>=0.0.3)"]
sklearn = ["featuretools-sklearn-transformer (>=1.0.0)"]
sql = ["featuretools-sql (>=0.0.1)", "psycopg2-binary (>=2.9.3)"]
test = ["boto3 (>=1.34.32)", "composeml (>=0.8.0)", "graphviz (>=0.8.4)", "moto[all] (>=5.0.0)", "pip (>=23.3.0)", "pyarrow (>=14.0.1)", "pympler (>=0.8)", "pytest (>=7.1.2)", "pytest-cov (>=3.0.0)", "pytest-timeout (>=2.1.0)", "pytest-xdist (>=2.5.0)", "smart-open (>=5.0.0)", "urllib3 (>=1.26.18)"]
tsfresh = ["featuretools-tsfresh-primitives (>=1.0.0)"]
[[package]]
name = "flask"
version = "3.0.3"
@ -822,6 +879,36 @@ files = [
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
]
[[package]]
name = "gymnasium"
version = "1.0.0"
description = "A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)."
optional = false
python-versions = ">=3.8"
files = [
{file = "gymnasium-1.0.0-py3-none-any.whl", hash = "sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad"},
{file = "gymnasium-1.0.0.tar.gz", hash = "sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403"},
]
[package.dependencies]
cloudpickle = ">=1.2.0"
farama-notifications = ">=0.0.1"
numpy = ">=1.21.0"
typing-extensions = ">=4.3.0"
[package.extras]
all = ["ale-py (>=0.9)", "box2d-py (==2.3.5)", "cython (<3)", "flax (>=0.5.0)", "imageio (>=2.14.1)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)", "matplotlib (>=3.0)", "moviepy (>=1.0.0)", "mujoco (>=2.1.5)", "mujoco-py (>=2.1,<2.2)", "opencv-python (>=3.0)", "pygame (>=2.1.3)", "swig (==4.*)", "torch (>=1.0.0)"]
atari = ["ale-py (>=0.9)"]
box2d = ["box2d-py (==2.3.5)", "pygame (>=2.1.3)", "swig (==4.*)"]
classic-control = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
jax = ["flax (>=0.5.0)", "jax (>=0.4.0)", "jaxlib (>=0.4.0)"]
mujoco = ["imageio (>=2.14.1)", "mujoco (>=2.1.5)"]
mujoco-py = ["cython (<3)", "cython (<3)", "mujoco-py (>=2.1,<2.2)", "mujoco-py (>=2.1,<2.2)"]
other = ["matplotlib (>=3.0)", "moviepy (>=1.0.0)", "opencv-python (>=3.0)"]
testing = ["dill (>=0.3.7)", "pytest (==7.1.3)", "scipy (>=1.7.3)"]
torch = ["torch (>=1.0.0)"]
toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"]
[[package]]
name = "h11"
version = "0.14.0"
@ -833,6 +920,20 @@ files = [
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
]
[[package]]
name = "holidays"
version = "0.60"
description = "World Holidays Framework"
optional = false
python-versions = ">=3.9"
files = [
{file = "holidays-0.60-py3-none-any.whl", hash = "sha256:d857949c5ee35655215a10c5a26e6a856bdc3beccc4fbbc8debef98dfba17b82"},
{file = "holidays-0.60.tar.gz", hash = "sha256:495fc5123f5d92b92673237375eb8c15a03d21c647b089db509b7d9612267556"},
]
[package.dependencies]
python-dateutil = "*"
[[package]]
name = "httpcore"
version = "1.0.5"
@ -914,6 +1015,25 @@ examples = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "seab
optional = ["keras (>=2.4.3)", "pandas (>=1.0.5)", "tensorflow (>=2.4.3)"]
tests = ["black (>=23.3.0)", "flake8 (>=3.8.2)", "keras (>=2.4.3)", "mypy (>=1.3.0)", "pandas (>=1.0.5)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "tensorflow (>=2.4.3)"]
[[package]]
name = "importlib-resources"
version = "6.4.5"
description = "Read resources from Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "importlib_resources-6.4.5-py3-none-any.whl", hash = "sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717"},
{file = "importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065"},
]
[package.extras]
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
cover = ["pytest-cov"]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
enabler = ["pytest-enabler (>=2.2)"]
test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"]
type = ["pytest-mypy"]
[[package]]
name = "ipykernel"
version = "6.29.5"
@ -2708,6 +2828,11 @@ files = [
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@ -2781,6 +2906,27 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodest
doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"]
test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
[[package]]
name = "seaborn"
version = "0.13.2"
description = "Statistical data visualization"
optional = false
python-versions = ">=3.8"
files = [
{file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"},
{file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"},
]
[package.dependencies]
matplotlib = ">=3.4,<3.6.1 || >3.6.1"
numpy = ">=1.20,<1.24.0 || >1.24.0"
pandas = ">=1.2"
[package.extras]
dev = ["flake8", "flit", "mypy", "pandas-stubs", "pre-commit", "pytest", "pytest-cov", "pytest-xdist"]
docs = ["ipykernel", "nbconvert", "numpydoc", "pydata_sphinx_theme (==0.10.0rc2)", "pyyaml", "sphinx (<6.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-issues"]
stats = ["scipy (>=1.7)", "statsmodels (>=0.12)"]
[[package]]
name = "send2trash"
version = "1.8.3"
@ -2939,6 +3085,27 @@ files = [
{file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"},
]
[[package]]
name = "tqdm"
version = "4.67.0"
description = "Fast, Extensible Progress Meter"
optional = false
python-versions = ">=3.7"
files = [
{file = "tqdm-4.67.0-py3-none-any.whl", hash = "sha256:0cd8af9d56911acab92182e88d763100d4788bdf421d251616040cc4d44863be"},
{file = "tqdm-4.67.0.tar.gz", hash = "sha256:fe5a6f95e6fe0b9755e9469b77b9c3cf850048224ecaa8293d7d2d31f97d869a"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
discord = ["requests"]
notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "traitlets"
version = "5.14.3"
@ -2965,6 +3132,17 @@ files = [
{file = "types_python_dateutil-2.9.0.20240821-py3-none-any.whl", hash = "sha256:f5889fcb4e63ed4aaa379b44f93c32593d50b9a94c9a60a0c854d8cc3511cd57"},
]
[[package]]
name = "typing-extensions"
version = "4.12.2"
description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false
python-versions = ">=3.8"
files = [
{file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]
[[package]]
name = "tzdata"
version = "2024.1"
@ -3110,7 +3288,33 @@ files = [
{file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"},
]
[[package]]
name = "woodwork"
version = "0.31.0"
description = "a data typing library for machine learning"
optional = false
python-versions = "<4,>=3.9"
files = [
{file = "woodwork-0.31.0-py3-none-any.whl", hash = "sha256:5cb3370553b5f466f8c8599b1bf559584dc0b798cc1f2da26bbd7029d256c6f9"},
{file = "woodwork-0.31.0.tar.gz", hash = "sha256:6ef82af1d5b6525b02efe6417c574c810cfdcc606cb266bd0d7fb17a1d066b67"},
]
[package.dependencies]
importlib-resources = ">=5.10.0"
numpy = ">=1.25.0"
pandas = ">=2.0.0"
python-dateutil = ">=2.8.2"
scikit-learn = ">=1.1.0"
scipy = ">=1.10.0"
[package.extras]
complete = ["woodwork[updater]"]
dev = ["click (>=8.1.7)", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)", "woodwork[docs,test]"]
docs = ["Sphinx (==5.1.1)", "ipython (==8.4.0)", "jupyter (==1.0.0)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pyarrow (>=14.0.1)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"]
test = ["boto3 (>=1.34.32)", "moto[all] (>=5.0.0)", "pyarrow (>=14.0.1)", "pytest (>=7.0.1)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=2.1.0)", "smart-open (>=5.0.0)"]
updater = ["alteryx-open-src-update-checker (>=3.1.0)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.12"
content-hash = "a7e3d516bde2d6e4173d8a9770fb5337a0c806dadaeda355084b262c1995f7ea"
content-hash = "76a7ecc0524f2a9a187e4242566cf9813bf2265aa4176553ea4f33c9a4c78f17"

View File

@ -17,8 +17,15 @@ apiflask = "^2.2.0"
flask-cors = "^5.0.0"
scikit-learn = "^1.5.2"
imbalanced-learn = "^0.12.3"
featuretools = "^1.31.0"
seaborn = "^0.13.2"
gymnasium = "^1.0.0"
scipy = "^1.14.1"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

Binary file not shown.

Binary file not shown.