Compare commits
10 Commits
Author | SHA1 | Date | |
---|---|---|---|
4005e0aa20 | |||
0d8c460548 | |||
2b2d89a4da | |||
4dac1232b3 | |||
9c05f4ee16 | |||
9382cf1770 | |||
1deabcad1d | |||
0564f69345 | |||
f76f901e65 | |||
4555a2f137 |
2601
lab_2/datasetlab2/Forbes Billionaires.csv
Normal file
2601
lab_2/datasetlab2/Forbes Billionaires.csv
Normal file
File diff suppressed because it is too large
Load Diff
897
lab_2/datasetlab2/Stores.csv
Normal file
897
lab_2/datasetlab2/Stores.csv
Normal file
@ -0,0 +1,897 @@
|
||||
Store ID ,Store_Area,Items_Available,Daily_Customer_Count,Store_Sales
|
||||
1,1659,1961,530,66490
|
||||
2,1461,1752,210,39820
|
||||
3,1340,1609,720,54010
|
||||
4,1451,1748,620,53730
|
||||
5,1770,2111,450,46620
|
||||
6,1442,1733,760,45260
|
||||
7,1542,1858,1030,72240
|
||||
8,1261,1507,1020,37720
|
||||
9,1090,1321,680,46310
|
||||
10,1030,1235,1130,44150
|
||||
11,1187,1439,1090,71280
|
||||
12,1751,2098,720,57620
|
||||
13,1746,2064,1050,60470
|
||||
14,1615,1931,1160,59130
|
||||
15,1469,1756,770,66360
|
||||
16,1644,1950,790,78870
|
||||
17,1578,1907,1440,77250
|
||||
18,1703,2045,670,38170
|
||||
19,1438,1731,1030,63540
|
||||
20,1940,2340,980,40190
|
||||
21,1421,1700,370,43460
|
||||
22,1458,1746,690,68890
|
||||
23,1719,2065,950,52780
|
||||
24,1449,1752,620,50680
|
||||
25,1234,1488,840,41880
|
||||
26,1732,2073,820,70050
|
||||
27,1475,1777,1100,25820
|
||||
28,1390,1648,980,60530
|
||||
29,1642,1943,710,78100
|
||||
30,1715,2071,650,84860
|
||||
31,1439,1746,990,80140
|
||||
32,1250,1508,990,14920
|
||||
33,1331,1608,880,60460
|
||||
34,1784,2163,620,74560
|
||||
35,1375,1648,1020,72430
|
||||
36,1871,2230,700,45460
|
||||
37,1442,1744,610,41570
|
||||
38,1174,1411,1080,62870
|
||||
39,1839,2204,1010,55170
|
||||
40,1270,1516,10,45480
|
||||
41,1435,1725,1250,49550
|
||||
42,965,1152,600,48140
|
||||
43,1665,2001,730,67640
|
||||
44,1780,2117,780,39730
|
||||
45,1009,1194,520,35800
|
||||
46,1227,1471,870,49270
|
||||
47,1769,2087,690,66510
|
||||
48,1660,1982,910,62530
|
||||
49,1472,1776,1260,59980
|
||||
50,1408,1688,1040,76350
|
||||
51,1514,1820,910,81820
|
||||
52,1565,1880,1300,57830
|
||||
53,1074,1288,320,70450
|
||||
54,1864,2240,530,67000
|
||||
55,1570,1898,980,64090
|
||||
56,1417,1701,740,48670
|
||||
57,1734,2060,1240,66210
|
||||
58,1470,1763,1080,83660
|
||||
59,1761,2104,1080,70770
|
||||
60,1756,2070,460,53870
|
||||
61,1704,2045,300,71300
|
||||
62,2011,2391,530,46100
|
||||
63,1472,1748,600,49100
|
||||
64,1310,1561,860,65920
|
||||
65,1544,1821,590,58660
|
||||
66,1707,2052,920,69130
|
||||
67,1881,2262,570,49080
|
||||
68,1416,1681,290,72710
|
||||
69,1631,1941,650,33430
|
||||
70,1318,1576,710,42430
|
||||
71,1692,2019,850,56650
|
||||
72,1152,1380,530,33580
|
||||
73,891,1073,630,67370
|
||||
74,1468,1749,700,71780
|
||||
75,1539,1833,650,84840
|
||||
76,1635,1956,720,82070
|
||||
77,1267,1520,450,26770
|
||||
78,1250,1475,1390,65560
|
||||
79,1720,2044,960,38660
|
||||
80,1462,1761,600,65660
|
||||
81,1431,1711,620,40700
|
||||
82,1539,1858,1020,88910
|
||||
83,1441,1723,330,57860
|
||||
84,1572,1884,1410,42670
|
||||
85,1287,1525,1200,90180
|
||||
86,1468,1760,280,51280
|
||||
87,1931,2342,940,97260
|
||||
88,1252,1506,850,39650
|
||||
89,1238,1468,960,45720
|
||||
90,1479,1758,420,42060
|
||||
91,1590,1912,830,65350
|
||||
92,2169,2617,600,67080
|
||||
93,1838,2205,400,54030
|
||||
94,1385,1655,760,56360
|
||||
95,1921,2305,1470,77120
|
||||
96,1975,2385,500,50810
|
||||
97,1853,2235,1120,60960
|
||||
98,1816,2171,1160,61180
|
||||
99,1785,2147,820,63660
|
||||
100,1579,1899,1140,41190
|
||||
101,1096,1321,900,78420
|
||||
102,1919,2294,760,65580
|
||||
103,1262,1500,1170,89080
|
||||
104,1374,1655,1080,94170
|
||||
105,1309,1587,1000,50950
|
||||
106,1207,1434,690,65180
|
||||
107,1692,2031,810,69310
|
||||
108,1929,2311,630,79210
|
||||
109,1573,1878,650,23740
|
||||
110,1415,1700,920,36330
|
||||
111,1162,1382,1260,51700
|
||||
112,1485,1787,800,62950
|
||||
113,1897,2248,1330,56010
|
||||
114,1607,1927,940,45080
|
||||
115,1909,2287,1210,46830
|
||||
116,1274,1503,660,64750
|
||||
117,1157,1379,770,80780
|
||||
118,1712,2046,460,31180
|
||||
119,1500,1798,860,56710
|
||||
120,1682,2017,780,49390
|
||||
121,1441,1727,890,66000
|
||||
122,1525,1835,900,32770
|
||||
123,1408,1669,530,46580
|
||||
124,1947,2333,790,79780
|
||||
125,1164,1390,370,35510
|
||||
126,1787,2137,610,80970
|
||||
127,1871,2241,500,61150
|
||||
128,1718,2051,750,49210
|
||||
129,1365,1636,980,79950
|
||||
130,1368,1654,530,68740
|
||||
131,1342,1595,910,57480
|
||||
132,1076,1270,620,72630
|
||||
133,1396,1672,1170,50070
|
||||
134,1713,2071,900,40490
|
||||
135,1370,1638,980,51850
|
||||
136,1667,1993,740,42840
|
||||
137,1638,1972,810,60940
|
||||
138,1581,1905,810,62280
|
||||
139,1795,2187,300,76530
|
||||
140,1179,1412,790,85130
|
||||
141,1978,2374,800,48590
|
||||
142,1688,2042,760,73080
|
||||
143,1214,1456,530,48950
|
||||
144,1504,1805,540,48560
|
||||
145,1498,1770,620,59380
|
||||
146,1462,1762,1010,51190
|
||||
147,1442,1750,130,58920
|
||||
148,1250,1486,730,50360
|
||||
149,1229,1480,830,38070
|
||||
150,1936,2300,1060,49170
|
||||
151,1369,1629,770,39740
|
||||
152,1662,1986,70,63730
|
||||
153,1548,1855,670,85330
|
||||
154,1649,1963,490,27410
|
||||
155,1393,1663,670,37320
|
||||
156,1450,1734,380,71120
|
||||
157,1613,1921,1200,72800
|
||||
158,1408,1696,350,34410
|
||||
159,775,932,1090,42530
|
||||
160,1275,1534,1230,54300
|
||||
161,1740,2078,680,50780
|
||||
162,1372,1657,580,45020
|
||||
163,1414,1723,680,69600
|
||||
164,2044,2474,340,80340
|
||||
165,1823,2176,700,37810
|
||||
166,955,1133,580,46140
|
||||
167,1465,1763,680,99570
|
||||
168,1331,1606,630,38650
|
||||
169,1232,1487,860,49800
|
||||
170,1481,1765,490,69910
|
||||
171,1343,1599,870,44910
|
||||
172,1539,1837,990,78470
|
||||
173,1007,1207,670,47460
|
||||
174,1762,2145,490,33460
|
||||
175,1527,1832,580,44090
|
||||
176,1356,1619,700,42620
|
||||
177,1536,1848,670,69450
|
||||
178,1605,1902,390,73120
|
||||
179,1704,2032,590,48300
|
||||
180,1626,1941,1350,58090
|
||||
181,1612,1939,840,74250
|
||||
182,1174,1396,1100,40930
|
||||
183,1923,2339,950,70930
|
||||
184,1702,2053,950,64670
|
||||
185,1398,1692,650,77420
|
||||
186,1437,1717,230,32330
|
||||
187,1524,1796,1060,41080
|
||||
188,1660,1985,1180,42860
|
||||
189,1302,1569,710,68450
|
||||
190,1666,2000,480,39730
|
||||
191,1391,1649,810,83750
|
||||
192,1778,2148,1140,69940
|
||||
193,1462,1770,1070,67710
|
||||
194,1751,2115,790,67360
|
||||
195,1652,1982,690,52460
|
||||
196,1841,2215,610,88760
|
||||
197,1496,1791,1240,67030
|
||||
198,1504,1827,840,78230
|
||||
199,1524,1808,460,62270
|
||||
200,1148,1371,940,49760
|
||||
201,1468,1744,590,73660
|
||||
202,1310,1558,890,72320
|
||||
203,1321,1579,770,68890
|
||||
204,992,1192,900,34180
|
||||
205,1540,1857,1020,58260
|
||||
206,1807,2149,910,38120
|
||||
207,1526,1853,660,49070
|
||||
208,1406,1677,480,61660
|
||||
209,1703,2055,1080,37830
|
||||
210,1575,1872,690,52270
|
||||
211,1309,1572,510,52280
|
||||
212,1488,1807,1030,70810
|
||||
213,1658,1988,370,71530
|
||||
214,1863,2245,640,77260
|
||||
215,1458,1725,750,75550
|
||||
216,1604,1909,370,33730
|
||||
217,1575,1899,840,66270
|
||||
218,1525,1829,840,55820
|
||||
219,1451,1737,890,68430
|
||||
220,1390,1687,620,73990
|
||||
221,1442,1742,310,62800
|
||||
222,1620,1922,550,33740
|
||||
223,1251,1527,380,63830
|
||||
224,1318,1606,1200,24410
|
||||
225,1647,1962,800,70020
|
||||
226,1829,2175,870,92240
|
||||
227,1852,2227,1220,68230
|
||||
228,1699,2053,1080,81870
|
||||
229,1325,1595,540,73860
|
||||
230,1350,1634,880,77120
|
||||
231,1347,1628,120,72350
|
||||
232,1397,1661,1410,49160
|
||||
233,1245,1499,570,45650
|
||||
234,1366,1649,940,52780
|
||||
235,1378,1658,760,90960
|
||||
236,1767,2110,1200,64950
|
||||
237,1184,1434,670,47230
|
||||
238,1257,1505,950,83250
|
||||
239,1863,2247,480,51950
|
||||
240,1881,2244,920,66030
|
||||
241,1329,1609,1150,68590
|
||||
242,1539,1848,750,47140
|
||||
243,1557,1861,370,69940
|
||||
244,2007,2397,610,65890
|
||||
245,1185,1418,1150,89310
|
||||
246,1657,2003,1070,58540
|
||||
247,1294,1539,790,78130
|
||||
248,1296,1559,1070,92300
|
||||
249,1733,2097,730,56170
|
||||
250,1641,1976,620,46050
|
||||
251,1373,1648,530,43390
|
||||
252,1550,1845,700,61750
|
||||
253,1583,1907,680,21830
|
||||
254,1428,1719,1060,39800
|
||||
255,1604,1925,670,54370
|
||||
256,1439,1735,400,62470
|
||||
257,1648,2003,910,82930
|
||||
258,1025,1231,760,63720
|
||||
259,2001,2394,540,79180
|
||||
260,1145,1370,350,38210
|
||||
261,1174,1426,980,25950
|
||||
262,913,1106,860,56610
|
||||
263,1199,1433,1020,73710
|
||||
264,1875,2254,1120,70400
|
||||
265,1153,1397,1020,50440
|
||||
266,1240,1492,940,66840
|
||||
267,1381,1660,970,50170
|
||||
268,1701,2030,830,60140
|
||||
269,1206,1456,920,37130
|
||||
270,1476,1777,660,42890
|
||||
271,1189,1439,780,26220
|
||||
272,1837,2220,340,50840
|
||||
273,1319,1571,1190,25630
|
||||
274,1617,1901,490,60770
|
||||
275,1631,1967,1090,69600
|
||||
276,1517,1805,1040,41740
|
||||
277,1764,2109,1210,50130
|
||||
278,1572,1869,1030,21750
|
||||
279,1855,2197,1170,80490
|
||||
280,1327,1571,730,34020
|
||||
281,1270,1515,720,60240
|
||||
282,1734,2073,500,39460
|
||||
283,1533,1848,1070,56440
|
||||
284,1390,1646,800,46840
|
||||
285,1856,2216,1020,64820
|
||||
286,1000,1215,1070,52520
|
||||
287,1313,1586,420,45940
|
||||
288,1494,1799,510,38970
|
||||
289,1386,1674,1210,58610
|
||||
290,1979,2364,660,30810
|
||||
291,1057,1264,360,47730
|
||||
292,902,1093,1210,64640
|
||||
293,1347,1622,560,44860
|
||||
294,1314,1576,360,55660
|
||||
295,1513,1803,970,57530
|
||||
296,1305,1548,480,75200
|
||||
297,1180,1436,690,37330
|
||||
298,1142,1352,710,35280
|
||||
299,1471,1768,780,70610
|
||||
300,1075,1288,630,49720
|
||||
301,1578,1885,220,68850
|
||||
302,1585,1916,1110,50740
|
||||
303,1391,1648,720,77070
|
||||
304,1577,1892,560,74730
|
||||
305,1092,1314,600,76530
|
||||
306,1375,1681,440,68900
|
||||
307,1523,1813,520,44960
|
||||
308,1373,1654,410,41490
|
||||
309,1550,1871,590,74320
|
||||
310,1614,1946,740,73800
|
||||
311,1566,1889,610,56400
|
||||
312,2019,2396,540,71570
|
||||
313,1494,1806,1450,43640
|
||||
314,1659,2008,620,35120
|
||||
315,1766,2131,340,58670
|
||||
316,1293,1554,970,75800
|
||||
317,1375,1659,1080,76640
|
||||
318,1236,1484,560,31890
|
||||
319,1332,1586,630,61670
|
||||
320,1513,1825,980,75950
|
||||
321,1208,1459,930,41490
|
||||
322,1190,1429,470,66170
|
||||
323,1448,1734,680,37980
|
||||
324,1771,2147,430,62710
|
||||
325,1365,1645,830,60470
|
||||
326,1510,1810,950,35230
|
||||
327,1458,1736,870,48550
|
||||
328,1808,2157,730,56810
|
||||
329,1615,1954,760,41080
|
||||
330,1640,1948,960,51270
|
||||
331,1060,1273,860,57500
|
||||
332,1633,1968,330,81470
|
||||
333,1222,1473,630,49570
|
||||
334,1619,1957,1280,45580
|
||||
335,1624,1973,1440,44660
|
||||
336,1887,2278,570,76240
|
||||
337,1320,1583,540,43720
|
||||
338,1450,1750,480,46700
|
||||
339,1455,1764,390,84690
|
||||
340,966,1172,900,85470
|
||||
341,1922,2290,290,80410
|
||||
342,1678,1999,740,46650
|
||||
343,1638,1952,690,81840
|
||||
344,1145,1375,950,63590
|
||||
345,2004,2390,930,50130
|
||||
346,1954,2378,810,45820
|
||||
347,1577,1879,760,86710
|
||||
348,1766,2138,580,49980
|
||||
349,1362,1634,770,82940
|
||||
350,1886,2228,1530,40350
|
||||
351,1291,1546,420,93950
|
||||
352,1584,1897,1210,47310
|
||||
353,1397,1686,850,21300
|
||||
354,1445,1709,1340,62180
|
||||
355,1433,1707,1160,61460
|
||||
356,1269,1511,500,54360
|
||||
357,1798,2134,820,72050
|
||||
358,1514,1822,670,48090
|
||||
359,1015,1216,460,27310
|
||||
360,1495,1799,950,57160
|
||||
361,1759,2095,980,34190
|
||||
362,1219,1468,850,35600
|
||||
363,1571,1877,580,54670
|
||||
364,1404,1670,620,76730
|
||||
365,1124,1369,650,63540
|
||||
366,1514,1837,1130,36690
|
||||
367,1207,1476,720,87370
|
||||
368,1484,1774,940,59800
|
||||
369,1398,1678,920,48030
|
||||
370,1769,2112,660,96650
|
||||
371,1111,1322,610,65500
|
||||
372,1078,1305,1190,55530
|
||||
373,1876,2254,1340,21650
|
||||
374,1909,2306,820,31940
|
||||
375,1940,2343,1130,84690
|
||||
376,1391,1683,890,68390
|
||||
377,1496,1774,810,75490
|
||||
378,1412,1699,680,39200
|
||||
379,1121,1345,320,85670
|
||||
380,1691,2026,700,60530
|
||||
381,1599,1946,940,78090
|
||||
382,1208,1467,910,50720
|
||||
383,1454,1737,870,23090
|
||||
384,1555,1881,1260,91360
|
||||
385,1554,1852,440,48120
|
||||
386,1491,1800,980,75620
|
||||
387,1415,1697,1200,39420
|
||||
388,1487,1801,890,51130
|
||||
389,1339,1589,1050,33890
|
||||
390,1320,1562,610,87170
|
||||
391,1509,1799,960,38600
|
||||
392,1406,1680,860,60980
|
||||
393,1264,1502,800,79410
|
||||
394,1905,2280,1060,82350
|
||||
395,1209,1464,600,36740
|
||||
396,1546,1829,380,27720
|
||||
397,1689,2044,1140,32260
|
||||
398,1153,1381,730,53270
|
||||
399,2063,2493,810,51480
|
||||
400,1848,2254,1000,59970
|
||||
401,1718,2056,1220,83600
|
||||
402,1480,1773,360,63020
|
||||
403,1439,1724,900,50920
|
||||
404,1576,1896,750,56450
|
||||
405,1948,2374,930,89540
|
||||
406,1613,1893,1180,46030
|
||||
407,896,1059,870,75110
|
||||
408,1625,1943,680,74520
|
||||
409,1303,1587,1310,102310
|
||||
410,1340,1605,1000,53400
|
||||
411,1410,1704,1130,59760
|
||||
412,1432,1719,990,49540
|
||||
413,1891,2280,360,51560
|
||||
414,1322,1583,720,49510
|
||||
415,1378,1670,950,58610
|
||||
416,1462,1732,840,68260
|
||||
417,1440,1733,1120,65310
|
||||
418,1421,1724,920,52090
|
||||
419,1280,1530,1240,43860
|
||||
420,1431,1710,840,74170
|
||||
421,1161,1404,430,58380
|
||||
422,1175,1405,810,91200
|
||||
423,1395,1662,920,90940
|
||||
424,1443,1755,880,49330
|
||||
425,1247,1523,1350,53500
|
||||
426,1788,2133,1000,54590
|
||||
427,1138,1375,1220,57450
|
||||
428,1709,2042,430,33240
|
||||
429,1777,2145,520,80790
|
||||
430,1612,1918,580,61000
|
||||
431,1618,1943,460,47620
|
||||
432,1311,1571,470,72090
|
||||
433,1365,1638,680,102920
|
||||
434,1249,1503,950,61970
|
||||
435,1373,1635,840,61040
|
||||
436,1536,1836,990,52060
|
||||
437,1744,2101,520,69570
|
||||
438,1513,1820,520,66020
|
||||
439,1297,1561,1070,40000
|
||||
440,1908,2304,990,79500
|
||||
441,1721,2076,710,76300
|
||||
442,1243,1491,430,69030
|
||||
443,1472,1766,1290,57140
|
||||
444,1307,1570,1080,41710
|
||||
445,1628,1959,890,71480
|
||||
446,1556,1869,1000,33010
|
||||
447,1179,1428,1290,74570
|
||||
448,1768,2123,860,49590
|
||||
449,1378,1656,1010,73170
|
||||
450,1685,2015,1160,79220
|
||||
451,1474,1759,880,75880
|
||||
452,1794,2137,670,67610
|
||||
453,1086,1291,930,69090
|
||||
454,1808,2154,990,35220
|
||||
455,1501,1790,450,53940
|
||||
456,1353,1598,690,56660
|
||||
457,1455,1757,650,67520
|
||||
458,1165,1411,1020,38620
|
||||
459,1332,1610,880,38890
|
||||
460,1396,1668,760,79270
|
||||
461,1513,1821,690,42880
|
||||
462,1618,1940,630,44240
|
||||
463,1845,2233,370,43190
|
||||
464,1172,1411,460,74550
|
||||
465,1436,1737,770,57090
|
||||
466,1738,2065,750,56480
|
||||
467,2229,2667,660,87410
|
||||
468,1490,1783,730,81370
|
||||
469,1060,1279,670,97360
|
||||
470,2015,2436,680,77960
|
||||
471,1611,1919,960,71240
|
||||
472,1187,1417,1230,58940
|
||||
473,1430,1690,800,78950
|
||||
474,1543,1840,450,36380
|
||||
475,1836,2195,940,45160
|
||||
476,1463,1764,1060,69050
|
||||
477,1213,1462,560,56830
|
||||
478,1244,1480,860,93530
|
||||
479,1745,2108,730,46920
|
||||
480,933,1121,940,55990
|
||||
481,1764,2132,920,40840
|
||||
482,1675,2002,1050,64990
|
||||
483,1688,2046,380,53550
|
||||
484,1842,2204,930,51320
|
||||
485,1316,1597,980,36560
|
||||
486,1440,1719,580,66050
|
||||
487,1760,2111,680,52400
|
||||
488,1323,1571,850,27970
|
||||
489,1230,1466,730,67100
|
||||
490,1540,1838,570,43710
|
||||
491,1167,1388,620,38600
|
||||
492,1429,1695,890,53890
|
||||
493,1491,1770,800,52610
|
||||
494,1313,1574,920,43130
|
||||
495,1609,1943,920,40300
|
||||
496,1109,1342,760,49750
|
||||
497,1207,1440,500,43840
|
||||
498,902,1087,680,56820
|
||||
499,1191,1422,770,36350
|
||||
500,1335,1601,460,50820
|
||||
501,1382,1660,1070,83720
|
||||
502,1588,1906,450,46970
|
||||
503,1918,2284,310,78020
|
||||
504,1484,1774,880,45080
|
||||
505,1334,1607,370,55160
|
||||
506,1556,1846,760,72020
|
||||
507,1784,2142,950,64010
|
||||
508,1244,1477,890,27840
|
||||
509,1496,1787,800,58070
|
||||
510,1719,2058,700,51760
|
||||
511,1678,2022,1050,66050
|
||||
512,1247,1490,330,65750
|
||||
513,1191,1421,980,65820
|
||||
514,1832,2208,1090,46760
|
||||
515,1271,1523,1140,50940
|
||||
516,1735,2084,820,56440
|
||||
517,1627,1948,890,32610
|
||||
518,1351,1616,650,62770
|
||||
519,1520,1817,850,63600
|
||||
520,1490,1788,360,45840
|
||||
521,1777,2117,780,38280
|
||||
522,1688,2037,590,50960
|
||||
523,1537,1836,670,39480
|
||||
524,1622,1968,340,69610
|
||||
525,1148,1384,730,47800
|
||||
526,1001,1194,1210,44890
|
||||
527,1857,2236,1280,67420
|
||||
528,1552,1869,710,78870
|
||||
529,1700,2064,940,70310
|
||||
530,1554,1844,670,38530
|
||||
531,1482,1777,800,77570
|
||||
532,1275,1517,790,59920
|
||||
533,1642,1981,720,54450
|
||||
534,1381,1633,1270,50250
|
||||
535,1381,1634,930,30790
|
||||
536,1057,1262,1490,35420
|
||||
537,1192,1445,810,43470
|
||||
538,1601,1920,600,61000
|
||||
539,1622,1968,210,64780
|
||||
540,1607,1909,460,39030
|
||||
541,2214,2647,740,65900
|
||||
542,1633,1936,1320,46050
|
||||
543,1546,1845,760,59070
|
||||
544,1475,1753,920,44670
|
||||
545,1270,1519,920,58390
|
||||
546,1185,1420,880,80370
|
||||
547,1614,1938,1110,53230
|
||||
548,1141,1353,1370,72000
|
||||
549,1244,1481,410,84040
|
||||
550,869,1050,850,52540
|
||||
551,2049,2465,720,63510
|
||||
552,1883,2262,570,42240
|
||||
553,1526,1842,690,39580
|
||||
554,1165,1390,1220,54610
|
||||
555,1832,2185,840,87330
|
||||
556,1723,2072,560,88410
|
||||
557,932,1138,820,89760
|
||||
558,1137,1374,700,101780
|
||||
559,1231,1472,810,70290
|
||||
560,1237,1512,1070,88210
|
||||
561,1371,1650,540,87160
|
||||
562,1767,2158,530,41540
|
||||
563,1748,2092,580,49170
|
||||
564,1212,1440,500,63950
|
||||
565,1466,1743,1200,70810
|
||||
566,1152,1386,980,49590
|
||||
567,1439,1703,1000,67290
|
||||
568,2026,2400,720,51240
|
||||
569,1772,2146,1030,48540
|
||||
570,1511,1822,420,72410
|
||||
571,1199,1461,1070,54370
|
||||
572,1834,2184,830,94460
|
||||
573,1143,1375,940,85160
|
||||
574,1494,1794,550,52130
|
||||
575,1770,2131,1140,54650
|
||||
576,1455,1747,750,69320
|
||||
577,1141,1372,620,51480
|
||||
578,1586,1886,660,50060
|
||||
579,1701,2034,660,62180
|
||||
580,1860,2246,410,79780
|
||||
581,1167,1406,440,42860
|
||||
582,1424,1716,630,54410
|
||||
583,1710,2053,730,69390
|
||||
584,1408,1708,220,42810
|
||||
585,1517,1831,610,30840
|
||||
586,1227,1476,720,56260
|
||||
587,1609,1930,740,76470
|
||||
588,1553,1831,740,35680
|
||||
589,1814,2174,770,90070
|
||||
590,1240,1493,590,33120
|
||||
591,1206,1437,1330,54060
|
||||
592,1847,2186,910,75120
|
||||
593,1009,1202,330,41600
|
||||
594,1624,1946,870,20270
|
||||
595,1612,1931,790,60060
|
||||
596,1498,1805,1270,82270
|
||||
597,946,1125,590,29170
|
||||
598,1563,1872,1080,68420
|
||||
599,1664,2016,830,59130
|
||||
600,1619,1947,910,74330
|
||||
601,1433,1722,830,77080
|
||||
602,1241,1489,1380,76250
|
||||
603,1429,1720,1180,59540
|
||||
604,1241,1488,770,54690
|
||||
605,1078,1306,680,84360
|
||||
606,1690,2065,910,51420
|
||||
607,1289,1536,540,65120
|
||||
608,1581,1894,760,49380
|
||||
609,1608,1945,760,37830
|
||||
610,1344,1608,730,35980
|
||||
611,1513,1804,430,69190
|
||||
612,1529,1839,1000,50590
|
||||
613,1677,2014,660,60800
|
||||
614,1015,1229,930,31180
|
||||
615,1438,1751,760,77790
|
||||
616,1426,1718,370,47570
|
||||
617,1412,1701,630,69130
|
||||
618,1622,1944,360,75970
|
||||
619,1503,1791,630,68350
|
||||
620,1501,1789,670,41680
|
||||
621,1971,2342,690,86560
|
||||
622,1383,1687,830,81390
|
||||
623,1371,1635,720,50730
|
||||
624,1508,1823,520,71290
|
||||
625,1057,1284,750,70110
|
||||
626,1411,1680,1070,61590
|
||||
627,1466,1746,590,69370
|
||||
628,1545,1888,600,67110
|
||||
629,2044,2408,380,82020
|
||||
630,1887,2264,830,62050
|
||||
631,1505,1836,940,61730
|
||||
632,1422,1722,560,58660
|
||||
633,1564,1869,1030,53370
|
||||
634,1510,1810,730,39700
|
||||
635,1568,1920,890,53750
|
||||
636,1933,2338,1140,44730
|
||||
637,1501,1822,590,49350
|
||||
638,1593,1911,580,43340
|
||||
639,1812,2189,310,78090
|
||||
640,1580,1895,720,54950
|
||||
641,1440,1749,490,75530
|
||||
642,1100,1331,1010,57330
|
||||
643,1534,1841,680,87930
|
||||
644,1299,1555,1020,56850
|
||||
645,1767,2121,1050,78430
|
||||
646,1368,1649,740,63660
|
||||
647,1393,1670,410,62960
|
||||
648,1327,1590,770,81870
|
||||
649,1514,1794,1400,54820
|
||||
650,1989,2414,860,116320
|
||||
651,1334,1584,840,57200
|
||||
652,1533,1817,950,84360
|
||||
653,1809,2145,940,36530
|
||||
654,1607,1933,930,81260
|
||||
655,1165,1387,1060,82350
|
||||
656,1193,1430,560,80830
|
||||
657,1709,2065,670,30610
|
||||
658,1525,1839,540,51310
|
||||
659,1348,1623,1010,72940
|
||||
660,1132,1366,1340,52450
|
||||
661,1667,2020,980,66070
|
||||
662,1427,1720,630,43190
|
||||
663,1211,1447,1110,40730
|
||||
664,1717,2048,700,78530
|
||||
665,1766,2111,580,94690
|
||||
666,1086,1299,1050,44400
|
||||
667,1410,1692,790,73800
|
||||
668,1476,1760,600,37390
|
||||
669,1068,1278,440,64120
|
||||
670,1485,1785,1340,66160
|
||||
671,1461,1739,1250,22310
|
||||
672,1685,2010,990,62380
|
||||
673,1624,1958,290,63850
|
||||
674,1658,2000,350,36210
|
||||
675,1427,1677,210,54590
|
||||
676,1755,2072,810,69610
|
||||
677,1211,1472,790,65390
|
||||
678,1591,1896,780,78130
|
||||
679,1797,2126,730,55710
|
||||
680,1519,1823,1040,69210
|
||||
681,1637,1958,760,59940
|
||||
682,1451,1750,570,72550
|
||||
683,1203,1446,620,44260
|
||||
684,1884,2262,310,56910
|
||||
685,1540,1820,310,82390
|
||||
686,1121,1332,790,54590
|
||||
687,1307,1562,490,69990
|
||||
688,1475,1775,230,72740
|
||||
689,1160,1401,900,35360
|
||||
690,1078,1276,640,94370
|
||||
691,1191,1436,840,43520
|
||||
692,1317,1569,780,36000
|
||||
693,1548,1858,480,99480
|
||||
694,1560,1883,1040,83220
|
||||
695,1297,1529,870,52940
|
||||
696,1645,1958,530,93360
|
||||
697,1225,1455,750,73590
|
||||
698,1421,1704,840,53840
|
||||
699,1655,1956,800,47350
|
||||
700,1615,1928,660,65080
|
||||
701,1872,2262,560,62050
|
||||
702,1317,1581,910,30020
|
||||
703,1434,1729,480,49510
|
||||
704,1791,2167,700,64320
|
||||
705,932,1120,660,35590
|
||||
706,1609,1924,1170,63050
|
||||
707,1495,1793,1020,65300
|
||||
708,1769,2153,580,69560
|
||||
709,1693,2032,610,41910
|
||||
710,1247,1497,590,28330
|
||||
711,1502,1815,190,55980
|
||||
712,1360,1612,490,61080
|
||||
713,1542,1844,680,51380
|
||||
714,1631,1947,670,84410
|
||||
715,1246,1482,1070,60680
|
||||
716,1990,2384,1110,64690
|
||||
717,967,1154,560,45780
|
||||
718,1582,1894,1100,41800
|
||||
719,1430,1743,970,53230
|
||||
720,1827,2160,930,36160
|
||||
721,1118,1338,1040,40450
|
||||
722,1766,2109,1120,57910
|
||||
723,1799,2173,910,36280
|
||||
724,1167,1411,440,39190
|
||||
725,1493,1795,530,62380
|
||||
726,1445,1734,900,21470
|
||||
727,1033,1237,740,34610
|
||||
728,1440,1711,1020,88120
|
||||
729,1487,1773,970,59190
|
||||
730,1854,2205,890,36290
|
||||
731,1748,2086,550,53760
|
||||
732,1937,2310,520,66300
|
||||
733,1641,1999,950,93000
|
||||
734,1659,1999,650,65660
|
||||
735,1743,2061,860,81930
|
||||
736,1449,1733,320,60060
|
||||
737,1098,1309,860,59530
|
||||
738,1121,1351,900,46380
|
||||
739,1526,1858,550,76200
|
||||
740,1358,1645,770,56860
|
||||
741,1336,1616,710,86620
|
||||
742,1502,1802,840,49730
|
||||
743,1534,1858,860,88370
|
||||
744,1418,1699,870,49160
|
||||
745,854,1018,660,77740
|
||||
746,1450,1728,930,38560
|
||||
747,1474,1776,1020,51990
|
||||
748,1524,1819,1190,39970
|
||||
749,1361,1638,1140,46040
|
||||
750,1398,1683,490,49500
|
||||
751,1085,1308,1170,76670
|
||||
752,1660,1979,480,75800
|
||||
753,1648,2017,930,81720
|
||||
754,1453,1749,890,58440
|
||||
755,1323,1591,680,85720
|
||||
756,1385,1643,740,70940
|
||||
757,1250,1506,990,62420
|
||||
758,1389,1683,680,56880
|
||||
759,1486,1758,820,101820
|
||||
760,1655,1993,440,86890
|
||||
761,1645,1963,900,47300
|
||||
762,1464,1771,1080,31270
|
||||
763,1197,1428,830,65410
|
||||
764,1878,2264,310,54200
|
||||
765,1150,1378,730,67390
|
||||
766,1562,1881,740,54530
|
||||
767,1596,1939,960,79760
|
||||
768,1119,1345,790,78060
|
||||
769,1116,1347,700,74080
|
||||
770,1934,2349,750,52990
|
||||
771,1299,1540,590,70580
|
||||
772,1417,1689,570,34310
|
||||
773,1235,1503,660,74160
|
||||
774,1497,1815,700,59190
|
||||
775,1430,1704,1070,43370
|
||||
776,1537,1877,660,17670
|
||||
777,1444,1742,840,56710
|
||||
778,1477,1798,850,59820
|
||||
779,1041,1246,600,36190
|
||||
780,1226,1472,710,60440
|
||||
781,1489,1783,450,75300
|
||||
782,1549,1871,740,74080
|
||||
783,1073,1280,1240,60440
|
||||
784,1473,1785,570,80720
|
||||
785,2013,2396,580,47060
|
||||
786,1975,2368,450,86830
|
||||
787,1561,1877,790,56790
|
||||
788,1427,1723,1040,67090
|
||||
789,1441,1747,670,44370
|
||||
790,1275,1548,370,82970
|
||||
791,1574,1876,620,56230
|
||||
792,1511,1791,1010,53760
|
||||
793,1428,1713,550,55390
|
||||
794,1388,1672,800,73500
|
||||
795,1057,1280,610,41050
|
||||
796,1440,1747,1090,67320
|
||||
797,1349,1610,700,65890
|
||||
798,1536,1808,830,56380
|
||||
799,2019,2420,850,85670
|
||||
800,1236,1508,1260,70830
|
||||
801,1436,1715,1030,48180
|
||||
802,1862,2248,1160,51910
|
||||
803,1200,1442,880,44320
|
||||
804,1360,1650,420,58940
|
||||
805,1722,2078,770,73610
|
||||
806,1577,1902,910,54060
|
||||
807,1850,2214,1110,85000
|
||||
808,1447,1730,510,49030
|
||||
809,1496,1800,780,63300
|
||||
810,1679,2008,790,84300
|
||||
811,994,1194,1090,81390
|
||||
812,1354,1635,1270,95900
|
||||
813,1597,1918,1260,71830
|
||||
814,1873,2252,330,79310
|
||||
815,1218,1459,540,87890
|
||||
816,1458,1746,720,48610
|
||||
817,1546,1860,670,73160
|
||||
818,1608,1962,770,36280
|
||||
819,1822,2160,860,49720
|
||||
820,1716,2038,410,44400
|
||||
821,1072,1296,900,47590
|
||||
822,1330,1604,480,51460
|
||||
823,1588,1892,540,57750
|
||||
824,1425,1733,760,66000
|
||||
825,1778,2133,280,45950
|
||||
826,1363,1630,1120,53900
|
||||
827,1609,1928,160,37920
|
||||
828,1671,2024,620,63100
|
||||
829,1379,1636,440,36770
|
||||
830,1218,1452,870,43910
|
||||
831,1724,2101,900,66390
|
||||
832,986,1179,710,59160
|
||||
833,1330,1606,590,38510
|
||||
834,1437,1725,910,46220
|
||||
835,1327,1609,1320,41500
|
||||
836,1651,2009,1000,58160
|
||||
837,1211,1462,670,38530
|
||||
838,1916,2277,610,55880
|
||||
839,1638,1937,1050,70940
|
||||
840,1172,1413,480,53940
|
||||
841,1350,1606,770,43030
|
||||
842,1528,1843,340,59820
|
||||
843,1305,1557,580,55500
|
||||
844,1463,1751,900,49990
|
||||
845,1409,1727,700,42980
|
||||
846,1419,1743,860,65970
|
||||
847,1535,1819,540,59290
|
||||
848,1474,1745,970,63020
|
||||
849,919,1099,1560,73810
|
||||
850,2067,2492,790,70230
|
||||
851,1977,2362,1020,59950
|
||||
852,1293,1558,790,78100
|
||||
853,1477,1790,880,16370
|
||||
854,1582,1906,550,92640
|
||||
855,1481,1789,550,63540
|
||||
856,1214,1455,950,87220
|
||||
857,1206,1460,810,41990
|
||||
858,1653,1982,390,79410
|
||||
859,1152,1393,860,54380
|
||||
860,1458,1757,850,58600
|
||||
861,1249,1510,660,48950
|
||||
862,1939,2333,830,40670
|
||||
863,1591,1919,640,52340
|
||||
864,1180,1397,750,39140
|
||||
865,1846,2195,1170,41090
|
||||
866,780,951,790,25600
|
||||
867,1565,1854,900,100900
|
||||
868,1648,1959,370,77080
|
||||
869,1775,2104,980,105150
|
||||
870,1439,1732,1170,80580
|
||||
871,1487,1776,800,46230
|
||||
872,1800,2158,1100,98260
|
||||
873,1690,2024,1070,75930
|
||||
874,1209,1452,830,52050
|
||||
875,1859,2222,1210,87000
|
||||
876,1691,2023,540,60270
|
||||
877,1259,1493,100,88270
|
||||
878,1771,2138,820,57820
|
||||
879,1205,1468,1210,61210
|
||||
880,1792,2131,810,76420
|
||||
881,1263,1516,780,70980
|
||||
882,1344,1605,1160,76740
|
||||
883,1819,2187,590,47920
|
||||
884,1357,1625,1140,52160
|
||||
885,1396,1673,690,32740
|
||||
886,1118,1337,560,72270
|
||||
887,1655,1986,1150,77430
|
||||
888,1156,1398,140,92370
|
||||
889,1451,1734,670,34880
|
||||
890,1539,1829,650,46580
|
||||
891,1549,1851,1220,70620
|
||||
892,1582,1910,1080,66390
|
||||
893,1387,1663,850,82080
|
||||
894,1200,1436,1060,76440
|
||||
895,1299,1560,770,96610
|
||||
896,1174,1429,1110,54340
|
|
21614
lab_2/datasetlab2/kc_house_data.csv
Normal file
21614
lab_2/datasetlab2/kc_house_data.csv
Normal file
File diff suppressed because it is too large
Load Diff
599
lab_2/lab2.ipynb
Normal file
599
lab_2/lab2.ipynb
Normal file
@ -0,0 +1,599 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Lab2 Pibd-31 Malafeev**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"*Загрузка трёх других датасетов(не своего варианта)*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 97,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"df = pd.read_csv(\".//datasetlab2//kc_house_data.csv\", sep=\",\")\n",
|
||||
"df2 = pd.read_csv(\".//datasetlab2//Stores.csv\", sep=\",\")\n",
|
||||
"df3 = pd.read_csv(\".//datasetlab2//Forbes Billionaires.csv\", sep=\",\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Далее будут выполнены в Markdown пукнты лабораторной 2-8 с пометкой каждого пункта."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2.Проблемной областью явлются: датасет stores.csv - магазины, kc_house_data.csv - датасет продажи домов и Forber Billionares.csv - датасет миллионеров."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"3.Объектами наблюдениями явлются магазины, дома и миллионеры. Связи между объектами нет, единственная схожесть - магазин и дом являются зданиями."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"4.Датасет миллионеров нужны например для сайта forbes - чтобы составить тир лист. В целом, другие датасеты тоже подходят для составления тир листа)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"5.Технический проект - тир лист, на входе датасет"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"6.Пункт будем выполнять в коде, оставлю к каждому комменты:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 98,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"id 0\n",
|
||||
"date 0\n",
|
||||
"price 0\n",
|
||||
"bedrooms 0\n",
|
||||
"bathrooms 0\n",
|
||||
"sqft_living 0\n",
|
||||
"sqft_lot 0\n",
|
||||
"floors 0\n",
|
||||
"waterfront 0\n",
|
||||
"view 0\n",
|
||||
"condition 0\n",
|
||||
"grade 0\n",
|
||||
"sqft_above 0\n",
|
||||
"sqft_basement 0\n",
|
||||
"yr_built 0\n",
|
||||
"yr_renovated 0\n",
|
||||
"zipcode 0\n",
|
||||
"lat 0\n",
|
||||
"long 0\n",
|
||||
"sqft_living15 0\n",
|
||||
"sqft_lot15 0\n",
|
||||
"dtype: int64\n",
|
||||
"Store ID 0\n",
|
||||
"Store_Area 0\n",
|
||||
"Items_Available 0\n",
|
||||
"Daily_Customer_Count 0\n",
|
||||
"Store_Sales 0\n",
|
||||
"dtype: int64\n",
|
||||
"Rank 0\n",
|
||||
"Name 0\n",
|
||||
"Networth 0\n",
|
||||
"Age 0\n",
|
||||
"Country 0\n",
|
||||
"Source 0\n",
|
||||
"Industry 0\n",
|
||||
"dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Проверка на пропущенные значения\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"print(df2.isnull().sum())\n",
|
||||
"print(df3.isnull().sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 99,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" id price bedrooms bathrooms sqft_living \\\n",
|
||||
"count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 \n",
|
||||
"mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 \n",
|
||||
"std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 \n",
|
||||
"min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 \n",
|
||||
"25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 \n",
|
||||
"50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 \n",
|
||||
"75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 \n",
|
||||
"max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 \n",
|
||||
"\n",
|
||||
" sqft_lot floors waterfront view condition \\\n",
|
||||
"count 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 \n",
|
||||
"mean 1.510697e+04 1.494309 0.007542 0.234303 3.409430 \n",
|
||||
"std 4.142051e+04 0.539989 0.086517 0.766318 0.650743 \n",
|
||||
"min 5.200000e+02 1.000000 0.000000 0.000000 1.000000 \n",
|
||||
"25% 5.040000e+03 1.000000 0.000000 0.000000 3.000000 \n",
|
||||
"50% 7.618000e+03 1.500000 0.000000 0.000000 3.000000 \n",
|
||||
"75% 1.068800e+04 2.000000 0.000000 0.000000 4.000000 \n",
|
||||
"max 1.651359e+06 3.500000 1.000000 4.000000 5.000000 \n",
|
||||
"\n",
|
||||
" grade sqft_above sqft_basement yr_built yr_renovated \\\n",
|
||||
"count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n",
|
||||
"mean 7.656873 1788.390691 291.509045 1971.005136 84.402258 \n",
|
||||
"std 1.175459 828.090978 442.575043 29.373411 401.679240 \n",
|
||||
"min 1.000000 290.000000 0.000000 1900.000000 0.000000 \n",
|
||||
"25% 7.000000 1190.000000 0.000000 1951.000000 0.000000 \n",
|
||||
"50% 7.000000 1560.000000 0.000000 1975.000000 0.000000 \n",
|
||||
"75% 8.000000 2210.000000 560.000000 1997.000000 0.000000 \n",
|
||||
"max 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 \n",
|
||||
"\n",
|
||||
" zipcode lat long sqft_living15 sqft_lot15 \n",
|
||||
"count 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 \n",
|
||||
"mean 98077.939805 47.560053 -122.213896 1986.552492 12768.455652 \n",
|
||||
"std 53.505026 0.138564 0.140828 685.391304 27304.179631 \n",
|
||||
"min 98001.000000 47.155900 -122.519000 399.000000 651.000000 \n",
|
||||
"25% 98033.000000 47.471000 -122.328000 1490.000000 5100.000000 \n",
|
||||
"50% 98065.000000 47.571800 -122.230000 1840.000000 7620.000000 \n",
|
||||
"75% 98118.000000 47.678000 -122.125000 2360.000000 10083.000000 \n",
|
||||
"max 98199.000000 47.777600 -121.315000 6210.000000 871200.000000 \n",
|
||||
" Store ID Store_Area Items_Available Daily_Customer_Count \\\n",
|
||||
"count 896.000000 896.000000 896.000000 896.000000 \n",
|
||||
"mean 448.500000 1485.409598 1782.035714 786.350446 \n",
|
||||
"std 258.797218 250.237011 299.872053 265.389281 \n",
|
||||
"min 1.000000 775.000000 932.000000 10.000000 \n",
|
||||
"25% 224.750000 1316.750000 1575.500000 600.000000 \n",
|
||||
"50% 448.500000 1477.000000 1773.500000 780.000000 \n",
|
||||
"75% 672.250000 1653.500000 1982.750000 970.000000 \n",
|
||||
"max 896.000000 2229.000000 2667.000000 1560.000000 \n",
|
||||
"\n",
|
||||
" Store_Sales \n",
|
||||
"count 896.000000 \n",
|
||||
"mean 59351.305804 \n",
|
||||
"std 17190.741895 \n",
|
||||
"min 14920.000000 \n",
|
||||
"25% 46530.000000 \n",
|
||||
"50% 58605.000000 \n",
|
||||
"75% 71872.500000 \n",
|
||||
"max 116320.000000 \n",
|
||||
" Rank Networth Age\n",
|
||||
"count 2600.000000 2600.000000 2600.000000\n",
|
||||
"mean 1269.570769 4.860750 64.271923\n",
|
||||
"std 728.146364 10.659671 13.220607\n",
|
||||
"min 1.000000 1.000000 19.000000\n",
|
||||
"25% 637.000000 1.500000 55.000000\n",
|
||||
"50% 1292.000000 2.400000 64.000000\n",
|
||||
"75% 1929.000000 4.500000 74.000000\n",
|
||||
"max 2578.000000 219.000000 100.000000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Статистика по числовым данным для выявления аномальных распределений\n",
|
||||
"print(df.describe())\n",
|
||||
"print(df2.describe())\n",
|
||||
"print(df3.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"7.По перым трём строкам кода, т.е после проверки на пропущенные значения выявлено, что их нет. А дальше я обнаружил аномалию: в датасете миллионеров есть столбец networth - чистое количество денег во всех формах ( в миллиардах ), в этом солбце минимальное значение является единицей, медиана в районе 2.4, а максимальное - 219. В ЭТОМ СТОЛБЦЕ АНОМАЛИЯ"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"8.Наши датасеты довольно информационные. Например у миллионер датасета можно посмотреть фио, сколько денег, что он сделал. Датасет по продаже домов гораздо информационнее, является лидером по наполненности и соответствует реальности. А вот датасет магазинов слабоват, можно например добавить: количество филлиалов, работников, прибыль"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"9.Возьмём датасет магазинов, будем удалять столбцы, где площадь ниже 1500 (по тз надо)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 100,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales\n",
|
||||
"0 1 1659 1961 530 66490\n",
|
||||
"4 5 1770 2111 450 46620\n",
|
||||
"6 7 1542 1858 1030 72240\n",
|
||||
"11 12 1751 2098 720 57620\n",
|
||||
"12 13 1746 2064 1050 60470\n",
|
||||
".. ... ... ... ... ...\n",
|
||||
"882 883 1819 2187 590 47920\n",
|
||||
"886 887 1655 1986 1150 77430\n",
|
||||
"889 890 1539 1829 650 46580\n",
|
||||
"890 891 1549 1851 1220 70620\n",
|
||||
"891 892 1582 1910 1080 66390\n",
|
||||
"\n",
|
||||
"[415 rows x 5 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df2_filtered = df2[df2['Store_Area'] >= 1500]\n",
|
||||
"print(df2_filtered)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Теперь в датасете магазнов price заменим у всех на константное значение - 1 500 000"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 101,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||||
"0 7129300520 20141013T000000 1500000 3 1.00 1180 \n",
|
||||
"1 6414100192 20141209T000000 1500000 3 2.25 2570 \n",
|
||||
"2 5631500400 20150225T000000 1500000 2 1.00 770 \n",
|
||||
"3 2487200875 20141209T000000 1500000 4 3.00 1960 \n",
|
||||
"4 1954400510 20150218T000000 1500000 3 2.00 1680 \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"21608 263000018 20140521T000000 1500000 3 2.50 1530 \n",
|
||||
"21609 6600060120 20150223T000000 1500000 4 2.50 2310 \n",
|
||||
"21610 1523300141 20140623T000000 1500000 2 0.75 1020 \n",
|
||||
"21611 291310100 20150116T000000 1500000 3 2.50 1600 \n",
|
||||
"21612 1523300157 20141015T000000 1500000 2 0.75 1020 \n",
|
||||
"\n",
|
||||
" sqft_lot floors waterfront view ... grade sqft_above \\\n",
|
||||
"0 5650 1.0 0 0 ... 7 1180 \n",
|
||||
"1 7242 2.0 0 0 ... 7 2170 \n",
|
||||
"2 10000 1.0 0 0 ... 6 770 \n",
|
||||
"3 5000 1.0 0 0 ... 7 1050 \n",
|
||||
"4 8080 1.0 0 0 ... 8 1680 \n",
|
||||
"... ... ... ... ... ... ... ... \n",
|
||||
"21608 1131 3.0 0 0 ... 8 1530 \n",
|
||||
"21609 5813 2.0 0 0 ... 8 2310 \n",
|
||||
"21610 1350 2.0 0 0 ... 7 1020 \n",
|
||||
"21611 2388 2.0 0 0 ... 8 1600 \n",
|
||||
"21612 1076 2.0 0 0 ... 7 1020 \n",
|
||||
"\n",
|
||||
" sqft_basement yr_built yr_renovated zipcode lat long \\\n",
|
||||
"0 0 1955 0 98178 47.5112 -122.257 \n",
|
||||
"1 400 1951 1991 98125 47.7210 -122.319 \n",
|
||||
"2 0 1933 0 98028 47.7379 -122.233 \n",
|
||||
"3 910 1965 0 98136 47.5208 -122.393 \n",
|
||||
"4 0 1987 0 98074 47.6168 -122.045 \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"21608 0 2009 0 98103 47.6993 -122.346 \n",
|
||||
"21609 0 2014 0 98146 47.5107 -122.362 \n",
|
||||
"21610 0 2009 0 98144 47.5944 -122.299 \n",
|
||||
"21611 0 2004 0 98027 47.5345 -122.069 \n",
|
||||
"21612 0 2008 0 98144 47.5941 -122.299 \n",
|
||||
"\n",
|
||||
" sqft_living15 sqft_lot15 \n",
|
||||
"0 1340 5650 \n",
|
||||
"1 1690 7639 \n",
|
||||
"2 2720 8062 \n",
|
||||
"3 1360 5000 \n",
|
||||
"4 1800 7503 \n",
|
||||
"... ... ... \n",
|
||||
"21608 1530 1509 \n",
|
||||
"21609 1830 7200 \n",
|
||||
"21610 1020 2007 \n",
|
||||
"21611 1410 1287 \n",
|
||||
"21612 1020 1357 \n",
|
||||
"\n",
|
||||
"[21613 rows x 21 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df['price'] = 1500000\n",
|
||||
"print(df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Теперь у миллионеров в networth подставим среднее по столбцу:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 102,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Networth\n",
|
||||
"0 4.86075\n",
|
||||
"1 4.86075\n",
|
||||
"2 4.86075\n",
|
||||
"3 4.86075\n",
|
||||
"4 4.86075\n",
|
||||
"... ...\n",
|
||||
"2595 4.86075\n",
|
||||
"2596 4.86075\n",
|
||||
"2597 4.86075\n",
|
||||
"2598 4.86075\n",
|
||||
"2599 4.86075\n",
|
||||
"\n",
|
||||
"[2600 rows x 1 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"networth_mean = df3['Networth'].mean()\n",
|
||||
"df3['Networth'] = networth_mean\n",
|
||||
"print(df3[['Networth']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"10.КОД"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 103,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train df: (15129, 21), Validation df: (3242, 21), Test df: (3242, 21)\n",
|
||||
"Train df2: (627, 5), Validation df2: (134, 5), Test df2: (135, 5)\n",
|
||||
"Train df3: (1820, 7), Validation df3: (390, 7), Test df3: (390, 7)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
||||
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42) \n",
|
||||
"\n",
|
||||
"train_df2, temp_df2 = train_test_split(df2, test_size=0.3, random_state=42)\n",
|
||||
"val_df2, test_df2 = train_test_split(temp_df2, test_size=0.5, random_state=42)\n",
|
||||
"\n",
|
||||
"train_df3, temp_df3 = train_test_split(df3, test_size=0.3, random_state=42)\n",
|
||||
"val_df3, test_df3 = train_test_split(temp_df3, test_size=0.5, random_state=42)\n",
|
||||
"print(f\"Train df: {train_df.shape}, Validation df: {val_df.shape}, Test df: {test_df.shape}\")\n",
|
||||
"print(f\"Train df2: {train_df2.shape}, Validation df2: {val_df2.shape}, Test df2: {test_df2.shape}\")\n",
|
||||
"print(f\"Train df3: {train_df3.shape}, Validation df3: {val_df3.shape}, Test df3: {test_df3.shape}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Было сделаное разбиение на три выборки: 70%, 15% и 15%. Подключена была библиотека scikit-learn и функция train_test_split , как сказано в пункте 15. Вполне сбалансированные"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"12.Качаем библиотеку imbalanced-learn, достаём нужные функции и погнали"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Class distribution after oversampling (df):\n",
|
||||
"price_category\n",
|
||||
"Low 10787\n",
|
||||
"Medium 10787\n",
|
||||
"High 10787\n",
|
||||
"Luxury 10787\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Class distribution after undersampling (df):\n",
|
||||
"price_category\n",
|
||||
"Low 1465\n",
|
||||
"Medium 1465\n",
|
||||
"High 1465\n",
|
||||
"Luxury 1465\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||||
"df = pd.read_csv(\".//datasetlab2//kc_house_data.csv\", sep=\",\")\n",
|
||||
"df['price_category'] = pd.cut(df['price'], bins=[0, 300000, 600000, 1000000, float('inf')],\n",
|
||||
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
|
||||
"\n",
|
||||
"y = df['price_category']\n",
|
||||
"X = df.drop(columns=['price', 'price_category'])\n",
|
||||
"\n",
|
||||
"oversampler = RandomOverSampler(random_state=42)\n",
|
||||
"X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
||||
"\n",
|
||||
"undersampler = RandomUnderSampler(random_state=42)\n",
|
||||
"X_resampled_under, y_resampled_under = undersampler.fit_resample(X, y)\n",
|
||||
"\n",
|
||||
"print(\"Class distribution after oversampling (df):\")\n",
|
||||
"print(pd.Series(y_resampled).value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Class distribution after undersampling (df):\")\n",
|
||||
"print(pd.Series(y_resampled_under).value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Class distribution after oversampling (df3):\n",
|
||||
"AGE_category\n",
|
||||
"Young 1401\n",
|
||||
"Middle-aged 1401\n",
|
||||
"Senior 1401\n",
|
||||
"Elderly 1401\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Class distribution after undersampling (df3):\n",
|
||||
"AGE_category\n",
|
||||
"Young 15\n",
|
||||
"Middle-aged 15\n",
|
||||
"Senior 15\n",
|
||||
"Elderly 15\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df3 = pd.read_csv(\".//datasetlab2//Forbes Billionaires.csv\", sep=\",\")\n",
|
||||
"\n",
|
||||
"df3['AGE_category'] = pd.cut(df3['Age'], bins=[0, 30, 50, 70, float('inf')],\n",
|
||||
" labels=['Young', 'Middle-aged', 'Senior', 'Elderly'])\n",
|
||||
"\n",
|
||||
"y3 = df3['AGE_category']\n",
|
||||
"X3 = df3.drop(columns=['Age', 'AGE_category'])\n",
|
||||
"\n",
|
||||
"oversampler3 = RandomOverSampler(random_state=42)\n",
|
||||
"X_resampled_3, y_resampled_3 = oversampler3.fit_resample(X3, y3)\n",
|
||||
"\n",
|
||||
"undersampler3 = RandomUnderSampler(random_state=42)\n",
|
||||
"X_resampled_3_under, y_resampled_3_under = undersampler3.fit_resample(X3, y3)\n",
|
||||
"\n",
|
||||
"print(\"Class distribution after oversampling (df3):\")\n",
|
||||
"print(pd.Series(y_resampled_3).value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Class distribution after undersampling (df3):\")\n",
|
||||
"print(pd.Series(y_resampled_3_under).value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 106,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Class distribution after oversampling (df2):\n",
|
||||
"Sales_category\n",
|
||||
"Low 598\n",
|
||||
"Medium 598\n",
|
||||
"High 598\n",
|
||||
"Luxury 0\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Class distribution after undersampling (df2):\n",
|
||||
"Sales_category\n",
|
||||
"Low 7\n",
|
||||
"Medium 7\n",
|
||||
"High 7\n",
|
||||
"Luxury 0\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df2 = pd.read_csv(\".//datasetlab2//Stores.csv\", sep=\",\")\n",
|
||||
"\n",
|
||||
"df2['Sales_category'] = pd.cut(df2['Store_Sales'], bins=[0, 50000, 100000, 200000, float('inf')],\n",
|
||||
" labels=['Low', 'Medium', 'High', 'Luxury'])\n",
|
||||
"\n",
|
||||
"y2 = df2['Sales_category']\n",
|
||||
"X2 = df2.drop(columns=['Store_Sales', 'Sales_category'])\n",
|
||||
"\n",
|
||||
"oversampler2 = RandomOverSampler(random_state=42)\n",
|
||||
"X_resampled_2, y_resampled_2 = oversampler2.fit_resample(X2, y2)\n",
|
||||
"\n",
|
||||
"undersampler2 = RandomUnderSampler(random_state=42)\n",
|
||||
"X_resampled_2_under, y_resampled_2_under = undersampler2.fit_resample(X2, y2)\n",
|
||||
"\n",
|
||||
"print(\"Class distribution after oversampling (df2):\")\n",
|
||||
"print(pd.Series(y_resampled_2).value_counts())\n",
|
||||
"\n",
|
||||
"print(\"Class distribution after undersampling (df2):\")\n",
|
||||
"print(pd.Series(y_resampled_2_under).value_counts())"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "miivenv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
2241
lab_3/datasetlab1/marketing_campaign.csv
Normal file
2241
lab_3/datasetlab1/marketing_campaign.csv
Normal file
File diff suppressed because it is too large
Load Diff
1000
lab_3/lab3.ipynb
Normal file
1000
lab_3/lab3.ipynb
Normal file
File diff suppressed because one or more lines are too long
2241
lab_4/datasetlab1/marketing_campaign.csv
Normal file
2241
lab_4/datasetlab1/marketing_campaign.csv
Normal file
File diff suppressed because it is too large
Load Diff
2241
lab_4/datasetlab1/marketing_campaign2.csv
Normal file
2241
lab_4/datasetlab1/marketing_campaign2.csv
Normal file
File diff suppressed because it is too large
Load Diff
496
lab_4/lab4.ipynb
Normal file
496
lab_4/lab4.ipynb
Normal file
@ -0,0 +1,496 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Lab 4 Malafeev PIbd-31**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1.Для начала выберем бизнес-цели для задач регрессии и классификации."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Классификация. Цель: определить, откликнется ли клиент на маркетинговую кампанию. Столбец целевой переменной - Response, 1 - откликнулся, 0 - нет. Признаки - Возраст, Уровень дохода. (Age, Income)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Регрессия. Цель: прогноз расходов клиента. Столбец целевой переменной: Total_Spending - общие расходы, будут считаться по всем расходам. Признаки такие же."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2.Достижимый уровень качества:\n",
|
||||
"Классификация:\n",
|
||||
"Оценка метрики accuracy: ориентир 70-80% (с учетом ограниченных признаков).\n",
|
||||
"Регрессия:\n",
|
||||
"MSE (среднеквадратичная ошибка): минимизация, ориентир в зависимости от разброса целевой переменной.\n",
|
||||
"R^2 > 0.6"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"3.Ориентир. Классификация:\n",
|
||||
"DummyClassifier, предсказывающий самый частый класс, даст accuracy ~50-60%.\n",
|
||||
"Регрессия:\n",
|
||||
"Прогноз среднего значения целевой переменной."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"X_class_train: (1568, 2), y_class_train: (1568,)\n",
|
||||
"X_reg_train: (1568, 2), y_reg_train: (1568,)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"\n",
|
||||
"data = pd.read_csv(\".//datasetlab1//marketing_campaign.csv\", sep=\"\\t\")\n",
|
||||
"data2 = pd.read_csv(\".//datasetlab1//marketing_campaign2.csv\", sep=\"\\t\")\n",
|
||||
"\n",
|
||||
"# Преобразуем данные для классификации (дата для отклика на кампанию)\n",
|
||||
"data['Age'] = 2024 - data['Year_Birth'] \n",
|
||||
"data = data[['Age', 'Income', 'Response']] \n",
|
||||
"\n",
|
||||
"X_class = data[['Age', 'Income']]\n",
|
||||
"y_class = data['Response']\n",
|
||||
"\n",
|
||||
"# Преобразуем данные для регрессии (прогноз расходов)\n",
|
||||
"data2['Age'] = 2024 - data2['Year_Birth'] \n",
|
||||
"data2['Total_Spending'] = (data2['MntWines'] + data2['MntFruits'] + data2['MntMeatProducts'] +\n",
|
||||
" data2['MntFishProducts'] + data2['MntSweetProducts'] + data2['MntGoldProds'])\n",
|
||||
"data2 = data2[['Age', 'Income', 'Total_Spending']] \n",
|
||||
"\n",
|
||||
"# Разделение на признаки и целевую переменную для регрессии\n",
|
||||
"X_reg = data2[['Age', 'Income']]\n",
|
||||
"y_reg = data2['Total_Spending']\n",
|
||||
"\n",
|
||||
"# Масштабирование данных\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"X_class_scaled = scaler.fit_transform(X_class)\n",
|
||||
"X_reg_scaled = scaler.fit_transform(X_reg)\n",
|
||||
"\n",
|
||||
"# Разделение на тренировочные и тестовые выборки\n",
|
||||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class_scaled, y_class, test_size=0.3, random_state=42)\n",
|
||||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg_scaled, y_reg, test_size=0.3, random_state=42)\n",
|
||||
"\n",
|
||||
"# Проверим, что все выглядит правильно\n",
|
||||
"print(f\"X_class_train: {X_train_class.shape}, y_class_train: {y_train_class.shape}\")\n",
|
||||
"print(f\"X_reg_train: {X_train_reg.shape}, y_reg_train: {y_train_reg.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"5-6.Выбор трёх моделей и построение конвейера"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Logistic Regression - Средняя точность модели: 0.8475 ± 0.0027\n",
|
||||
"Random Forest - Средняя точность модели: 0.8258 ± 0.0099\n",
|
||||
"SVM - Средняя точность модели: 0.8529 ± 0.0027\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.svm import SVC\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"\n",
|
||||
"# Удаляем строки с пропущенными значениями\n",
|
||||
"X_class_scaled = X_class.dropna()\n",
|
||||
"y_class = y_class[X_class_scaled.index]\n",
|
||||
"\n",
|
||||
"models = [\n",
|
||||
" ('Logistic Regression', LogisticRegression(max_iter=1000)),\n",
|
||||
" ('Random Forest', RandomForestClassifier(n_estimators=100)),\n",
|
||||
" ('SVM', SVC())\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Создаем конвейер\n",
|
||||
"imputer = SimpleImputer(strategy='mean') \n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"\n",
|
||||
"for name, model in models:\n",
|
||||
" pipe = Pipeline([\n",
|
||||
" ('imputer', imputer),\n",
|
||||
" ('scaler', scaler),\n",
|
||||
" ('classifier', model)\n",
|
||||
" ])\n",
|
||||
" \n",
|
||||
" scores = cross_val_score(pipe, X_class_scaled, y_class, cv=5, scoring='accuracy')\n",
|
||||
" print(f\"{name} - Средняя точность модели: {scores.mean():.4f} ± {scores.std():.4f}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Вот такие модели и конвейр я выбрал: Imputer: Заполняет пропущенные значения средним (если они есть).\n",
|
||||
"Scaler: Масштабирует данные с помощью StandardScaler.\n",
|
||||
"Classifier: Используются три модели:\n",
|
||||
"LogisticRegression: Логистическая регрессия.\n",
|
||||
"RandomForestClassifier: Случайный лес.\n",
|
||||
"SVC: Метод опорных векторов (SVM)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"7.Теперь сделаем настройку гиперпараметров."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
|
||||
"Logistic Regression - Лучшие гиперпараметры: {'classifier__C': 0.1, 'classifier__solver': 'lbfgs'}\n",
|
||||
"Logistic Regression - Лучшая точность: 0.8484\n",
|
||||
"--------------------------------------------------\n",
|
||||
"Fitting 5 folds for each of 9 candidates, totalling 45 fits\n",
|
||||
"Random Forest - Лучшие гиперпараметры: {'classifier__max_depth': 10, 'classifier__n_estimators': 100}\n",
|
||||
"Random Forest - Лучшая точность: 0.8520\n",
|
||||
"--------------------------------------------------\n",
|
||||
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
|
||||
"SVM - Лучшие гиперпараметры: {'classifier__C': 1, 'classifier__kernel': 'rbf'}\n",
|
||||
"SVM - Лучшая точность: 0.8529\n",
|
||||
"--------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.model_selection import GridSearchCV\n",
|
||||
"models = [\n",
|
||||
" ('Logistic Regression', LogisticRegression(max_iter=1000), {'classifier__C': [0.1, 1, 10], 'classifier__solver': ['lbfgs', 'liblinear']}),\n",
|
||||
" ('Random Forest', RandomForestClassifier(n_estimators=100), {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [10, 20, None]}),\n",
|
||||
" ('SVM', SVC(), {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']})\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for name, model, param_grid in models:\n",
|
||||
" pipe = Pipeline([\n",
|
||||
" ('imputer', imputer),\n",
|
||||
" ('scaler', scaler),\n",
|
||||
" ('classifier', model)\n",
|
||||
" ])\n",
|
||||
" \n",
|
||||
" grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)\n",
|
||||
" grid_search.fit(X_class_scaled, y_class)\n",
|
||||
"\n",
|
||||
" print(f\"{name} - Лучшие гиперпараметры: {grid_search.best_params_}\")\n",
|
||||
" print(f\"{name} - Лучшая точность: {grid_search.best_score_:.4f}\")\n",
|
||||
" print(\"-\" * 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Тут мы проходим по моделям и настраиваем гиперпараметры с помощью GridSearchCV с помощью кросс-валидации. Параметры: cv=5: 5 фолдов для кросс-валидации.\n",
|
||||
"scoring='accuracy': Мы используем точность как метрику.\n",
|
||||
"n_jobs=-1: Используем все доступные процессоры для ускорения вычислений.\n",
|
||||
"verbose=1: Подробный вывод процесса."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"8.Обучим модели"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
|
||||
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
|
||||
"Fitting 5 folds for each of 6 candidates, totalling 30 fits\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"best_models = {} \n",
|
||||
"for name, model, param_grid in models: \n",
|
||||
" grid_search.fit(X_class_scaled, y_class)\n",
|
||||
" best_models[name] = grid_search.best_estimator_ \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"9.Оценим модели."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
||||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||||
"\n",
|
||||
"# Оценка качества классификации\n",
|
||||
"for name, model in best_models.items():\n",
|
||||
" y_pred_class = model.predict(X_class_scaled) # Предсказание для классификации\n",
|
||||
"\n",
|
||||
"# Оценка качества регрессии\n",
|
||||
"for name, model in best_models.items():\n",
|
||||
" y_pred_reg = model.predict(X_reg_scaled) # Предсказание для регрессии\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Т.к. вывод слишком длинный, приложу его тут(вылазит Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...):<br>Оценка качества для модели Logistic Regression:<br>\n",
|
||||
"Accuracy: 0.8528880866425993<br>\n",
|
||||
"Precision: 0.8181818181818182<br>\n",
|
||||
"Recall: 0.02702702702702703<br>\n",
|
||||
"F1-Score: 0.05232558139534884<br>\n",
|
||||
"ROC AUC: Не поддерживается для этой модели<br>\n",
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"Оценка качества для модели Random Forest:<br>\n",
|
||||
"Accuracy: 0.8528880866425993<br>\n",
|
||||
"Precision: 0.8181818181818182<br>\n",
|
||||
"Recall: 0.02702702702702703<br>\n",
|
||||
"F1-Score: 0.05232558139534884<br>\n",
|
||||
"ROC AUC: Не поддерживается для этой модели<br>\n",
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"Оценка качества для модели SVM:<br>\n",
|
||||
"Accuracy: 0.8528880866425993<br>\n",
|
||||
"Precision: 0.8181818181818182<br>\n",
|
||||
"Recall: 0.02702702702702703<br>\n",
|
||||
"F1-Score: 0.05232558139534884<br>\n",
|
||||
"ROC AUC: Не поддерживается для этой модели<br>\n",
|
||||
"<br>\n",
|
||||
"<br>Задача регрессии: <br>\n",
|
||||
"Оценка качества для модели Logistic Regression:<br>\n",
|
||||
"MAE: 605.7982142857143<br>\n",
|
||||
"MSE: 729533.7598214286<br>\n",
|
||||
"RMSE: 854.1274845252485<br>\n",
|
||||
"R²: -1.0122722045012051<br>\n",
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"Оценка качества для модели Random Forest:<br>\n",
|
||||
"MAE: 605.7982142857143<br>\n",
|
||||
"MSE: 729533.7598214286<br>\n",
|
||||
"RMSE: 854.1274845252485<br>\n",
|
||||
"R²: -1.0122722045012051<br>\n",
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"Оценка качества для модели SVM:<br>\n",
|
||||
"MAE: 605.7982142857143<br>\n",
|
||||
"MSE: 729533.7598214286<br>\n",
|
||||
"RMSE: 854.1274845252485<br>\n",
|
||||
"R²: -1.0122722045012051<br>\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Почему выбрал эти метрики:<br>Классификация (Отклик на предложение)<br>\n",
|
||||
"Целевая переменная — бинарная (0 и 1), где 1 — откликнулся, а 0 — не откликнулся. Для классификации подходящими метриками являются:<br>\n",
|
||||
"<br>\n",
|
||||
"Accuracy (Точность):\n",
|
||||
"Это доля правильно классифицированных объектов среди всех. \n",
|
||||
"Подходит для оценки общей эффективности модели. Однако важно учитывать, что если классы несбалансированы, точность может быть обманчивой.<br>\n",
|
||||
"Precision (Точность):\n",
|
||||
"\n",
|
||||
"Это доля истинных положительных случаев среди всех предсказанных положительных случаев.\n",
|
||||
"Важна для задач, где важно минимизировать количество ложных срабатываний, например, когда модель ошибочно классифицирует клиента как откликнувшегося (False Positive).<br>\n",
|
||||
"Recall (Полнота):\n",
|
||||
"\n",
|
||||
"Это доля истинных положительных случаев среди всех фактических положительных случаев.\n",
|
||||
"Важно для задач, где важно не пропустить откликнувшихся клиентов (False Negatives).<br>\n",
|
||||
"F1-Score:\n",
|
||||
"\n",
|
||||
"Это гармоническое среднее между точностью и полнотой.\n",
|
||||
"Подходит для оценки моделей в случаях, когда важно иметь баланс между точностью и полнотой, особенно в ситуациях с несбалансированными классами.<br>\n",
|
||||
"ROC AUC:\n",
|
||||
"Площадь под кривой ROC, которая отображает способность модели различать положительные и отрицательные классы.\n",
|
||||
"Чем выше значение AUC, тем лучше модель справляется с разделением классов."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Регрессия (Прогноз расходов)<br>\n",
|
||||
"Целевая переменная — это числовое значение (расходы клиента). Для задач регрессии используются другие метрики:<br>\n",
|
||||
"\n",
|
||||
"Mean Absolute Error (MAE):\n",
|
||||
"\n",
|
||||
"Это средняя абсолютная ошибка предсказания.\n",
|
||||
"Простой и интерпретируемый показатель, который описывает среднее отклонение предсказанных значений от фактических.<br>\n",
|
||||
"Mean Squared Error (MSE):\n",
|
||||
"\n",
|
||||
"Это средняя квадратичная ошибка.\n",
|
||||
"Чувствителен к большим ошибкам, так как квадратичный штраф увеличивает вес больших отклонений, что полезно, если вы хотите минимизировать большие ошибки.<br>\n",
|
||||
"Root Mean Squared Error (RMSE):\n",
|
||||
"\n",
|
||||
"Это квадратный корень из MSE.\n",
|
||||
"Подходит для задач, где важно учитывать большие ошибки, так как более чувствителен к выбросам.<br>\n",
|
||||
"R-squared (R²):\n",
|
||||
"\n",
|
||||
"Это коэффициент детерминации, который показывает, какая доля дисперсии целевой переменной объясняется моделью.\n",
|
||||
"R² может быть полезен для оценки того, насколько хорошо модель объясняет вариацию целевой переменной, но не всегда подходит, если модель имеет много выбросов или некорректно подогнана.<br>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"9.Оценка"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Evaluating model: Logistic Regression\n",
|
||||
"Train Accuracy: 0.8476, Test Accuracy: 0.8586\n",
|
||||
"Bias: 0.0000, Variance: 0.0000\n",
|
||||
"Train Error (MSE): 732205.3240, Test Error (MSE): 723300.1101\n",
|
||||
"Bias: 0.0000, Variance: 0.0000\n",
|
||||
"Evaluating model: Random Forest\n",
|
||||
"Train Accuracy: 0.8476, Test Accuracy: 0.8586\n",
|
||||
"Bias: 0.0000, Variance: 0.0000\n",
|
||||
"Train Error (MSE): 732205.3240, Test Error (MSE): 723300.1101\n",
|
||||
"Bias: 0.0000, Variance: 0.0000\n",
|
||||
"Evaluating model: SVM\n",
|
||||
"Train Accuracy: 0.8476, Test Accuracy: 0.8586\n",
|
||||
"Bias: 0.0000, Variance: 0.0000\n",
|
||||
"Train Error (MSE): 732205.3240, Test Error (MSE): 723300.1101\n",
|
||||
"Bias: 0.0000, Variance: 0.0000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import mean_squared_error, accuracy_score\n",
|
||||
"\n",
|
||||
"# Оценка смещения и дисперсии для классификации и регрессии\n",
|
||||
"def evaluate_bias_variance(model, X_train, y_train, X_test, y_test, task='classification'):\n",
|
||||
" # Прогнозы на обучающих и тестовых данных\n",
|
||||
" y_train_pred = model.predict(X_train)\n",
|
||||
" y_test_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
" if task == 'classification':\n",
|
||||
" # Для классификации считаем точность\n",
|
||||
" train_accuracy = accuracy_score(y_train, y_train_pred)\n",
|
||||
" test_accuracy = accuracy_score(y_test, y_test_pred)\n",
|
||||
" print(f\"Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}\")\n",
|
||||
" elif task == 'regression':\n",
|
||||
" # Для регрессии считаем среднеквадратичную ошибку (MSE)\n",
|
||||
" train_error = mean_squared_error(y_train, y_train_pred)\n",
|
||||
" test_error = mean_squared_error(y_test, y_test_pred)\n",
|
||||
" print(f\"Train Error (MSE): {train_error:.4f}, Test Error (MSE): {test_error:.4f}\")\n",
|
||||
"\n",
|
||||
" # Для оценки смещения и дисперсии на тестовых данных\n",
|
||||
" bias = np.mean(y_test_pred - y_train_pred[:len(y_test_pred)]) # Смещение: разница между тестом и обучением\n",
|
||||
" variance = np.var(y_test_pred - y_train_pred[:len(y_test_pred)]) # Дисперсия: варьирование прогнозов\n",
|
||||
"\n",
|
||||
" print(f\"Bias: {bias:.4f}, Variance: {variance:.4f}\")\n",
|
||||
"\n",
|
||||
"# Оценим для каждой из моделей\n",
|
||||
"for name, model in best_models.items():\n",
|
||||
" print(f\"Evaluating model: {name}\")\n",
|
||||
" # Для классификации\n",
|
||||
" evaluate_bias_variance(model, X_train_class, y_train_class, X_test_class, y_test_class, task='classification') \n",
|
||||
" # Для регрессии\n",
|
||||
" evaluate_bias_variance(model, X_train_reg, y_train_reg, X_test_reg, y_test_reg, task='regression') \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Конец"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "miivenv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
2241
lab_5/datasetlab1/marketing_campaign.csv
Normal file
2241
lab_5/datasetlab1/marketing_campaign.csv
Normal file
File diff suppressed because it is too large
Load Diff
2241
lab_5/datasetlab1/marketing_campaign2.csv
Normal file
2241
lab_5/datasetlab1/marketing_campaign2.csv
Normal file
File diff suppressed because it is too large
Load Diff
277
lab_5/lab5.ipynb
Normal file
277
lab_5/lab5.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user