Ну типо сделал
This commit is contained in:
parent
88f9aec41f
commit
3720eb2ec7
19238
data/car_price_prediction.csv
Normal file
19238
data/car_price_prediction.csv
Normal file
File diff suppressed because it is too large
Load Diff
101
data/unicorns.csv
Normal file
101
data/unicorns.csv
Normal file
@ -0,0 +1,101 @@
|
||||
Company;Valuation;Country;State;City;Industries;FoundedYear;Name of Founders;TotalFunding;Number of Employees
|
||||
Bytedance;140,00 US$;China;Beijing;Beijing;Content, Data Mining, Internet;2012;Yiming Zhang;$7,440.00M;10.000
|
||||
SpaceX;100,30 US$;United States;California;Hawthorne;Aerospace, Manufacturing, Space Travel, Transportation;2002;Elon Musk;$383.02M;5,000-10,000
|
||||
Stripe;95,00 US$;United States;California;San Francisco;Finance, FinTech, Mobile Payments, SaaS;2010;John Collison, Patrick Collison;$300.00M;1,000-5,000
|
||||
Klarna;45,60 US$;Sweden;;Stockholm;E-Commerce, FinTech, Payments, Shopping;2005;Niklas Adalberth, Sebastian Siemiatkowski, Victor Jacobsson;$3,471.72M;5,000-10,000
|
||||
Epic Games;42,00 US$;United States;North Carolina;Cary;Developer Platform, Gaming, Software, Video Games;1991;Mark Rein, Tim Sweeney;$544.93M;1,000-5,000
|
||||
Canva;40,00 US$;Australia;New South Wales;Surry Hills;Graphic Design, Photo Editing, Publishing, Software, Web Design;2012;Cameron Adams, Cliff Obrecht, Melanie Perkins;$571.26M;500-1,000
|
||||
Checkout.com;40,00 US$;United Kingdom;England;London;E-Commerce, FinTech, Payments, Transaction Processing;2012;Guillaume Pousaz;$1,830.00M;1,000-5,000
|
||||
Instacart;39,00 US$;United States;California;San Francisco;Delivery Service, E-Commerce, Grocery, Shopping;2012;Apoorva Mehta, Brandon Leonardo, Max Mullen;$2,686.01M;5,000-10,000
|
||||
Databricks;38,00 US$;United States;California;San Francisco;Analytics, Artificial Intelligence, Information Technology, Machine Learning, Software;2013;Ali Ghodsi, Andy Konwinski, Ion Stoica, Matei Zaharia, Patrick Wendell, Reynold Xin, Scott Shenker;$557.15M;1,000-5,000
|
||||
Revolut;33,00 US$;United Kingdom;England;London;Banking, Financial Services, FinTech, Mobile Payments;2015;Nikolay Storonsky, Vlad Yatsenko;$1,715.98M;1,000-5,000
|
||||
FTX;32,00 US$;Bahamas;;;Cryptocurrency, Finance, Financial Exchanges, Financial Services, Trading Platform;2018;Gary Wang, Sam Bankman-Fried;$1,828.69M;100-250
|
||||
Chime;25,00 US$;United States;California;San Francisco;Banking, Debit Cards, Financial Services, FinTech;2013;Chris Britt, Ryan King;$3,396.75M;1,000-5,000
|
||||
BYJU's;21,00 US$;India;;Bengaluru;E-Learning, EdTech, Education, Higher Education, Software;2008;Byju Raveendran, Divya Gokulnath;$5,182.78M;1,000-5,000
|
||||
J&T Express;20,00 US$;Indonesia;;Jakarta;Courier Service, E-Commerce, Freight Service;2015;Jet Lee, Tony Chen;$4,653.00M;10.000
|
||||
Xiaohongshu;20,00 US$;China;Shanghai;Shanghai;E-Commerce, Mobile Apps, Shopping, Social;2013;Charlwin Mao Wenchao, Miranda Qu;$917.50M;1,000-5,000
|
||||
Fanatics;18,00 US$;United States;Florida;Jacksonville;Manufacturing, Retail, Sporting Goods, Sports;1995;Alan Trager, Michael G. Rubin, Mitch Trager;$1,170.29M;1,000-5,000
|
||||
Miro;17,50 US$;United States;California;San Francisco;B2B, Enterprise Applications, Enterprise Software, Product Management, UX Design;2011;Andrey Khusid, Oleg Shardin, Oleg Shardin;$355.00M;1,000-5,000
|
||||
Yuanfudao;15,50 US$;China;Beijing;Beijing;E-Learning, EdTech, Tutoring;2012;Ke Shuai, Xin Li, Yong Li;$4,044.20M;10.000
|
||||
Ripple;15,00 US$;United States;California;San Francisco;Blockchain, Cryptocurrency, FinTech, Internet, Payments;2012;Arthur Britto, Chris Larsen, Jed McCaleb, Ryan Fugger;$293.90M;500-1,000
|
||||
DJI Innovations;15,00 US$;China;Guangdong;Shenzhen;Aerospace, Consumer Electronics, Drones, Manufacturing, Photography, Wireless;2006;Frank Wang;$1,135.00M;10.000
|
||||
goPuff;15,00 US$;United States;Pennsylvania;Philadelphia;Delivery Service, E-Commerce, Food Delivery, Grocery, Mobile Apps;2013;Rafael Ilishayev, Yakir Gola;$290.88M;5,000-10,000
|
||||
SHEIN;15,00 US$;China;Guangdong;Shenzhen;Consumer, E-Commerce, Fashion, Marketplace, Textiles;2008;Xiaoqing Ren, Yang Pei, Yangtian Xu;$553.36M;1,000-5,000
|
||||
Yuanqi Senlin;15,00 US$;China;Beijing;Beijing;Food and Beverage;2016;Binsen Tang;$721.31M;5,000-10,000
|
||||
Plaid;13,40 US$;United States;California;San Francisco;Finance, Financial Services, FinTech;2012;William Hockey, Zachary Perret;$734.80M;500-1,000
|
||||
OpenSea;13,30 US$;United States;New York;New York;Blockchain, Cryptocurrency, Marketplace;2017;Alex Atallah, Devin Finzer;$425.12M;100-250
|
||||
Grammarly;13,00 US$;United States;California;San Francisco;Assistive Technology, Information Technology, Productivity Tools;2009;Alex Shevchenko, Dmytro Lider, Max Lytvyn;$400.00M;500-1,000
|
||||
Devoted Health;12,60 US$;United States;Minnesota;Saint Paul;Elder Care, Elderly, Health Care, Hospital;2017;Ed Park, Jeremy Delinsky, Todd Park;$1,968.95M;1,000-5,000
|
||||
Faire;12,40 US$;United States;California;San Francisco;E-Commerce, Marketplace, Retail, Retail Technology, Wholesale;2017;Daniele Perito, Jeffrey Kolovson, Lauren Cooks Levitan, Marcelo Cortes, Max Rhodes;$1,096.12M;500-1,000
|
||||
Brex;12,30 US$;United States;Utah;Draper;Banking, Credit Cards, Financial Services, FinTech;2017;Henrique Dubugras, Pedro Franceschi;$1,490.12M;500-1,000
|
||||
Biosplice Therapeutics;12,00 US$;United States;California;San Diego;Biotechnology, Health Care, Life Science;2008;Osman Kibar;$285.71M;50-100
|
||||
Bitmain Technologies;12,00 US$;China;Beijing;Beijing;Application Specific Integrated Circuit (ASIC), Bitcoin, Electronics, Manufacturing, Semiconductor;2013;Jihan Wu, Micree Zhan;$450.00M;100-250
|
||||
GoodLeap;12,00 US$;United States;California;Roseville;Lending, Renewable Energy;2003;Hayes Barnard;$1,800.00M;1,000-5,000
|
||||
JUUL Labs;12,00 US$;United States;California;San Francisco;B2C, Consumer Electronics, Consumer Goods, Leisure, Lifestyle;2015;Adam Bowen, James Monsees, Kevin Burns, Tim Danaher;$15,371.68M;1,000-5,000
|
||||
Airtable;11,70 US$;United States;California;San Francisco;Collaboration, Database, Developer Tools, SaaS;2013;Andrew Ofstad, Emmett Nicholas, Howie Liu;$2,236.60M;250-500
|
||||
ZongMu Technology;11,40 US$;China;Shanghai;Shanghai;Automotive, Autonomous Vehicles, Robotics;2013;Rui Tang;$210.83M;10-50
|
||||
Global Switch;11,10 US$;United Kingdom;England;London;Data Center, Real Estate, Wholesale;1998;Andy Ruhan;$6,254.75M;250-500
|
||||
Bolt;11,00 US$;United States;California;San Francisco;E-Commerce, Fraud Detection, Mobile Payments, Payments;2014;Eric Feldman, Ryan Breslow;$963.00M;500-1,000
|
||||
Celonis;11,00 US$;Germany;;Munich;Analytics, Business Intelligence, SaaS, Software;2011;Alexander Rinke, Bastian Nominacher, Martin Klenk;$1,367.50M;1,000-5,000
|
||||
Weilong;10,88 US$;China;Henan;Luohe;Food and Beverage, Manufacturing, Snack Food;1999;Liu Fuping, Liu Weiping;$559.74M;No Data
|
||||
Swiggy;10,70 US$;India;;Bengaluru;E-Commerce Platforms, Food Delivery, Mobile Apps;2014;Nandan Reddy, Phani Kishan Addepalli, Rahul Jaimini, Sriharsha Majety;$3,571.00M;10.000
|
||||
Figma;10,00 US$;United States;California;San Francisco;Developer Tools, Graphic Design, Software, UX Design, Web Design;2012;Dylan Field, Evan Wallace;$333.50M;250-500
|
||||
Talkdesk;10,00 US$;United States;California;San Francisco;Cloud Computing, CRM, Customer Service, SaaS;2011;Cristina Fonseca, Tiago Paiva;$504.77M;1,000-5,000
|
||||
Digital Currency Group;10,00 US$;United States;New York;New York;Bitcoin, Blockchain, Financial Services, Venture Capital;2015;Barry Silbert;$600.00M;50-100
|
||||
Gusto;10,00 US$;United States;California;San Francisco;Employee Benefits, Enterprise Software, Financial Services, FinTech, Human Resources, SaaS;2011;Edward Kim, Joshua Reeves, Tomer London;$930.83M;1,000-5,000
|
||||
Lalamove;10,00 US$;Hong Kong;;Cheung Sha Wan;Apps, Delivery, Logistics, Supply Chain Management, Transportation;2013;Chow Shing Yuk, Gary Hui, Santit Jirawongkraisorn;$2,475.00M;500-1,000
|
||||
Notion Labs;10,00 US$;United States;California;San Francisco;Apps, Collaboration, Product Management, Real Time, Software;2016;Ivan Zhao, Simon Last;$342.00M;250-500
|
||||
reddit;10,00 US$;United States;California;San Francisco;Content, News, Social Bookmarking, Social Media, Social Network;2005;Aaron Swartz, Alexis Ohanian, Steve Huffman;$1,487.23M;500-1,000
|
||||
Thrasio;10,00 US$;United States;Massachusetts;Walpole;Brand Marketing, Consumer Goods, E-Commerce;2018;Carlos Cashman, Joshua Silberstein;$3,396.46M;1,000-5,000
|
||||
OYO Rooms;9,60 US$;India;;Gurugram;Hospitality, Travel, Travel Accommodations;2012;Ritesh Agarwal;$3,113.68M;5,000-10,000
|
||||
OutSystems;9,50 US$;United States;Massachusetts;Boston;Data Integration, Developer Platform, Developer Tools, PaaS, SaaS, Software;2001;Paulo Rosado, Rui Pereira;$208.00M;1,000-5,000
|
||||
ServiceTitan;9,50 US$;United States;California;Glendale;CRM, Home Services, Information Technology, SaaS;2012;Ara Mahdessian, Vahe Kuzoyan;$1,098.84M;1,000-5,000
|
||||
HEYTEA;9,28 US$;China;Guangdong;Shenzhen;Food and Beverage, Tea;2012;Yunqi Nie;$579.23M;1,000-5,000
|
||||
N26;9,23 US$;Germany;;Berlin;Banking, Finance, Financial Services, FinTech;2013;Maximilian Tayenthal, Valentin Stalf;$1,722.36M;1,000-5,000
|
||||
Klaviyo;9,20 US$;United States;Massachusetts;Boston;Advertising, Analytics, E-Commerce, Marketing, Marketing Automation, Software;2012;Andrew Bialecki, Ed Hallen;$678.50M;500-1,000
|
||||
Northvolt;9,08 US$;Sweden;;Stockholm;Battery, Clean Energy, CleanTech, Electronics, Manufacturing;2016;Paolo Cerruti, Peter Carlsson;$6,162.15M;1,000-5,000
|
||||
Chehaoduo;9,00 US$;China;Beijing;Beijing;Automotive, E-Commerce, Online Auctions;2015;Mark Yang;$696.76M;10.000
|
||||
Niantic;9,00 US$;United States;California;San Francisco;Augmented Reality, Software, Video Games, Virtual Reality;2015;John Hanke, Phil Keslin;$770.00M;500-1,000
|
||||
Tanium;9,00 US$;United States;Washington;Kirkland;Cyber Security, Enterprise Software, Information Technology, SaaS, Security;2007;David Hindawi, Orion Hindawi;$4,376.50M;1,000-5,000
|
||||
Rapyd;8,75 US$;United Kingdom;England;London;Financial Services, FinTech, Mobile Payments, Payments;2016;Arik Shtilman, Arkady Karpman, Omer Priel;$775.00M;250-500
|
||||
Kavak;8,70 US$;Mexico;;Lerma de Villada;Automotive, E-Commerce, E-Commerce Platforms, Online Portals;2016;Carlos Julio Garcia, Roger Laughlin;$1,188.00M;1,000-5,000
|
||||
Nuro;8,60 US$;United States;California;Mountain View;Autonomous Vehicles, Fleet Management, Information Technology, Robotics, Transportation;2016;Dave Ferguson, Jiajun Zhu;$2,132.00M;1,000-5,000
|
||||
Snyk;8,60 US$;United States;Massachusetts;Boston;Cyber Security, Internet, Security, Software;2015;Assaf Hefetz, Danny Grander, Guy Podjarny, Jacob Tarango;$1,026.09M;500-1,000
|
||||
Bolt;8,40 US$;Estonia;;Tallinn;Car Sharing, Electric Vehicle, Food Delivery, Grocery, Last Mile Transportation, Mobile Apps, Public Transportation, Ride Sharing, Transportation;2013;Markus Villig, Martin Villig, Oliver Leisalu;$1,970.53M;1,000-5,000
|
||||
Tipalti;8,30 US$;United States;California;San Mateo;Accounting, Financial Services, FinTech, Payments, Software;2010;Chen Amit, Oren Zeev;$502.50M;500-1,000
|
||||
Lacework;8,30 US$;United States;California;San Jose;Cloud Security, Compliance, Cyber Security, Developer Tools;2015;Mike Speiser, Sanjay Kalra, Vikram Kapoor;$1,906.70M;500-1,000
|
||||
Tempus;8,10 US$;United States;Illinois;Chicago;Artificial Intelligence, Biotechnology, Health Care, Machine Learning, Medical;2015;Eric Lefkofsky;$1,070.00M;1,000-5,000
|
||||
Fireblocks;8,00 US$;United States;New York;New York;Blockchain, Cryptocurrency, Cyber Security;2018;Idan Ofrat, Michael Shaulov, Pavel Berengoltz;$1,039.00M;100-250
|
||||
Dream11;8,00 US$;India;;Mumbai;Fantasy Sports, Information Technology, Sports;2007;Bhavit Sheth, Harsh Jain;$1,165.08M;500-1,000
|
||||
Xingsheng Selected;8,00 US$;China;Hunan;Changsha;E-Commerce, Food and Beverage, Grocery, Retail, Shopping;2009;Lihua Yue;$5,040.00M;10.000
|
||||
Caris Life Sciences;7,83 US$;United States;Texas;Irving;Biotechnology, Health Care, Health Diagnostics, Medical;1996;David D. Halbert;$1,314.49M;1,000-5,000
|
||||
Hopin;7,75 US$;United Kingdom;England;London;Events, Meeting Software, Video Conferencing;2019;Johnny Boufarhat;$1,021.73M;500-1,000
|
||||
Dapper Labs;7,60 US$;Canada;British Columbia;Vancouver;Blockchain, Gaming, Software;2018;Dieter Shirley, Mack Flavelle, Roham Gharegozlou;$665.07M;100-250
|
||||
Getir;7,50 US$;Turkey;;Istanbul;Delivery Service, E-Commerce, Logistics, Mobile Apps;2015;Arkady Volozh, Mert Salur, Nazım Salur, Serkan Borançılı, Tuncay Tütek;$1,172.00M;1,000-5,000
|
||||
Razorpay;7,50 US$;India;;Bengaluru;Finance, Financial Services, FinTech, Payments;2013;Harshil Mathur, Shashank Kumar;$741.62M;1,000-5,000
|
||||
Netskope;7,50 US$;United States;California;Santa Clara;Cloud Security, Cyber Security, Enterprise Software, Software;2012;Krishna Narayanaswamy, Lebin Cheng, Ravi Ithal, Sanjay Beri;$1,040.10M;1,000-5,000
|
||||
Ola Cabs;7,50 US$;India;;Bengaluru;Apps, Mobile, Ride Sharing, Transportation;2011;Ankit Bhati, Bhavish Aggarwal;$5,008.30M;5,000-10,000
|
||||
Carta;7,40 US$;United States;California;San Francisco;Finance, FinTech, Software, Stock Exchanges;2012;Henry Ward, Manu Kumar;$1,157.80M;1,000-5,000
|
||||
Toss;7,40 US$;South Korea;;Seoul;Financial Services, FinTech, Mobile Apps, Mobile Payments, Personal Finance;2013;Seunggun Lee;$844.20M;500-1,000
|
||||
Scale AI;7,30 US$;United States;California;San Francisco;Artificial Intelligence, Image Recognition, Machine Learning, SaaS;2016;Alexandr Wang, Lucy Guo;$602.82M;250-500
|
||||
TripActions;7,25 US$;United States;California;Palo Alto;Business Travel, Customer Service, Payments, Software;2015;Ariel Cohen, Ilan Twig;$1,040.48M;1,000-5,000
|
||||
Argo AI;7,25 US$;United States;Pennsylvania;Pittsburgh;Artificial Intelligence, Autonomous Vehicles, Robotics, Transportation;2016;Bryan Salesky, Peter Rander;$500.00M;1,000-5,000
|
||||
Gong;7,25 US$;United States;California;Palo Alto;Artificial Intelligence, CRM, Enterprise Software, Information Technology, Machine Learning, Sales, Software;2015;Amit Bendov, Eilon Reshef;$583.00M;500-1,000
|
||||
Gemini;7,10 US$;United States;New York;New York;Cryptocurrency, Finance, Financial Services, FinTech;2015;Cameron Winklevoss, Tyler Winklevoss;$400.00M;500-1,000
|
||||
Discord;7,00 US$;United States;California;San Francisco;Communities, Messaging, Social Network, Software, Video Chat;2012;Jason Citron, Stanislav Vishnevskiy;$979.30M;500-1,000
|
||||
We Doctor;7,00 US$;China;Zhejiang;Hangzhou;Health Care, Hospitality, Internet, Medical;2010;Liao Jieyuan;$1,786.00M;1,000-5,000
|
||||
1Password;6,80 US$;Canada;Ontario;Toronto;Cyber Security, Network Security, Privacy, Software;2005;Dave Teare, Natalia Karimov, Roustem Karimov, Sara Teare;$920.14M;250-500
|
||||
Automation Anywhere;6,80 US$;United States;California;San Jose;Artificial Intelligence, Enterprise Software, Machine Learning, SaaS, Software;2003;Ankur Kothari, Mihir Shukla, Neeti Mehta, Rushabh Parmani;$840.00M;1,000-5,000
|
||||
Ziroom;6,60 US$;China;Beijing;Beijing;Real Estate, Rental, Rental Property;2011;Lin Xiong;$2,121.00M;10.000
|
||||
National Stock Exchange of India;6,50 US$;India;;Mumbai;Financial Services, FinTech, Stock Exchanges;1992;Mukesh Agarwal;$149.50M;250-500
|
||||
Mollie;6,50 US$;Netherlands;;Amsterdam;E-Commerce, Financial Services, FinTech, Mobile Payments;2004;Adriaan Mol;$934.32M;250-500
|
||||
Rippling;6,50 US$;United States;California;San Francisco;Employment, Human Resources, Information Technology, IT Management, Productivity Tools;2017;Parker Conrad, Prasanna Sankar;$447.12M;500-1,000
|
||||
DataRobot;6,30 US$;United States;Massachusetts;Boston;Artificial Intelligence, Enterprise Software, Machine Learning, SaaS;2012;Jeremy Achin, Thomas DeGodoy;$1,089.37M;1,000-5,000
|
||||
Personio;6,30 US$;Germany;;Munich;Employment, Human Resources, Recruiting, SaaS;2015;Arseniy Vershinin, Hanno Renner, Ignaz Forstmeier, Ignaz Forstmeier, Roman Schumacher;$524.83M;500-1,000
|
||||
Upgrade;6,28 US$;United States;California;San Francisco;Banking, Credit, Financial Services, FinTech, Mobile;2016;Adelina Grozdanova, Jeff Bogan, Matt Wierman, Renaud Laplanche, Soul Htite, Visar Nimani;$562.50M;250-500
|
||||
Hinge Health;6,20 US$;United States;California;San Francisco;Health Care, Medical, Therapeutics, Wearables;2015;Daniel Perez, Gabriel Mecklenburg;$853.85M;1,000-5,000
|
||||
Benchling;6,10 US$;United States;California;San Francisco;Biotechnology, Life Science, Software;2012;Ashutosh Singhal, Cory Li, Sajith Wickramasekara;$412.00M;500-1,000
|
||||
Black Unicorn Factory;6,10 US$;United States;California;Los Angeles;;2020;Johnny Stewart;$645M;No Data
|
||||
Better.com;6,00 US$;United States;New York;New York;Consumer Lending, Financial Services, FinTech, Lending, Real Estate;2016;Eric Wilson, Erik Bernhardsson, Shawn Low, Viral Shah, Vishal Garg;$1,655.00M;5,000-10,000
|
||||
Wiz;6,00 US$;Israel;;Tel Aviv;Cloud Security, Cyber Security, Enterprise Software, Security;2020;Ami Luttwak, Assaf Rappaport, Roy Reznik, Yinon Costica;$600.00M;100-250
|
||||
iCapital Network;6,00 US$;United States;New York;New York;Asset Management, Banking, Financial Services, FinTech;2013;Dan Vene, John Robertshaw, Nick Veronis, Phil Pool;$181.50M;250-500
|
|
28
lec1.ipynb
28
lec1.ipynb
@ -9,7 +9,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -108,7 +108,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -128,7 +128,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -195,7 +195,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -311,7 +311,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -391,7 +391,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -433,7 +433,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -476,7 +476,7 @@
|
||||
"<Axes: title={'center': 'Population 2020'}, xlabel='Continent'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
@ -543,7 +543,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -552,7 +552,7 @@
|
||||
"<Axes: ylabel='Frequency'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
@ -580,7 +580,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -610,7 +610,7 @@
|
||||
"<Axes: xlabel='Country (or dependency)', ylabel='Population 2020'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
@ -639,7 +639,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -656,7 +656,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
927
lec2.ipynb
927
lec2.ipynb
File diff suppressed because it is too large
Load Diff
723
lec2unicorns.ipynb
Normal file
723
lec2unicorns.ipynb
Normal file
@ -0,0 +1,723 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Загрузка данных в DataFrame"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 76,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"Index: 100 entries, Bytedance to iCapital Network\n",
|
||||
"Data columns (total 9 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 Valuation 100 non-null object\n",
|
||||
" 1 Country 100 non-null object\n",
|
||||
" 2 State 79 non-null object\n",
|
||||
" 3 City 99 non-null object\n",
|
||||
" 4 Industries 99 non-null object\n",
|
||||
" 5 FoundedYear 100 non-null int64 \n",
|
||||
" 6 Name of Founders 100 non-null object\n",
|
||||
" 7 TotalFunding 100 non-null object\n",
|
||||
" 8 Number of Employees 100 non-null object\n",
|
||||
"dtypes: int64(1), object(8)\n",
|
||||
"memory usage: 7.8+ KB\n",
|
||||
"(100, 10)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Valuation</th>\n",
|
||||
" <th>Country</th>\n",
|
||||
" <th>State</th>\n",
|
||||
" <th>City</th>\n",
|
||||
" <th>Industries</th>\n",
|
||||
" <th>FoundedYear</th>\n",
|
||||
" <th>Name of Founders</th>\n",
|
||||
" <th>TotalFunding</th>\n",
|
||||
" <th>Number of Employees</th>\n",
|
||||
" <th>IsChina</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Company</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>Bytedance</th>\n",
|
||||
" <td>140.0</td>\n",
|
||||
" <td>China</td>\n",
|
||||
" <td>Beijing</td>\n",
|
||||
" <td>Beijing</td>\n",
|
||||
" <td>Content, Data Mining, Internet</td>\n",
|
||||
" <td>2012</td>\n",
|
||||
" <td>Yiming Zhang</td>\n",
|
||||
" <td>7440.00</td>\n",
|
||||
" <td>10.000</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>SpaceX</th>\n",
|
||||
" <td>100.3</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>California</td>\n",
|
||||
" <td>Hawthorne</td>\n",
|
||||
" <td>Aerospace, Manufacturing, Space Travel, Transp...</td>\n",
|
||||
" <td>2002</td>\n",
|
||||
" <td>Elon Musk</td>\n",
|
||||
" <td>383.02</td>\n",
|
||||
" <td>5,000-10,000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Stripe</th>\n",
|
||||
" <td>95.0</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>California</td>\n",
|
||||
" <td>San Francisco</td>\n",
|
||||
" <td>Finance, FinTech, Mobile Payments, SaaS</td>\n",
|
||||
" <td>2010</td>\n",
|
||||
" <td>John Collison, Patrick Collison</td>\n",
|
||||
" <td>300.00</td>\n",
|
||||
" <td>1,000-5,000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Klarna</th>\n",
|
||||
" <td>45.6</td>\n",
|
||||
" <td>Sweden</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Stockholm</td>\n",
|
||||
" <td>E-Commerce, FinTech, Payments, Shopping</td>\n",
|
||||
" <td>2005</td>\n",
|
||||
" <td>Niklas Adalberth, Sebastian Siemiatkowski, Vic...</td>\n",
|
||||
" <td>3471.72</td>\n",
|
||||
" <td>5,000-10,000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Epic Games</th>\n",
|
||||
" <td>42.0</td>\n",
|
||||
" <td>United States</td>\n",
|
||||
" <td>North Carolina</td>\n",
|
||||
" <td>Cary</td>\n",
|
||||
" <td>Developer Platform, Gaming, Software, Video Games</td>\n",
|
||||
" <td>1991</td>\n",
|
||||
" <td>Mark Rein, Tim Sweeney</td>\n",
|
||||
" <td>544.93</td>\n",
|
||||
" <td>1,000-5,000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Valuation Country State City \\\n",
|
||||
"Company \n",
|
||||
"Bytedance 140.0 China Beijing Beijing \n",
|
||||
"SpaceX 100.3 United States California Hawthorne \n",
|
||||
"Stripe 95.0 United States California San Francisco \n",
|
||||
"Klarna 45.6 Sweden NaN Stockholm \n",
|
||||
"Epic Games 42.0 United States North Carolina Cary \n",
|
||||
"\n",
|
||||
" Industries FoundedYear \\\n",
|
||||
"Company \n",
|
||||
"Bytedance Content, Data Mining, Internet 2012 \n",
|
||||
"SpaceX Aerospace, Manufacturing, Space Travel, Transp... 2002 \n",
|
||||
"Stripe Finance, FinTech, Mobile Payments, SaaS 2010 \n",
|
||||
"Klarna E-Commerce, FinTech, Payments, Shopping 2005 \n",
|
||||
"Epic Games Developer Platform, Gaming, Software, Video Games 1991 \n",
|
||||
"\n",
|
||||
" Name of Founders TotalFunding \\\n",
|
||||
"Company \n",
|
||||
"Bytedance Yiming Zhang 7440.00 \n",
|
||||
"SpaceX Elon Musk 383.02 \n",
|
||||
"Stripe John Collison, Patrick Collison 300.00 \n",
|
||||
"Klarna Niklas Adalberth, Sebastian Siemiatkowski, Vic... 3471.72 \n",
|
||||
"Epic Games Mark Rein, Tim Sweeney 544.93 \n",
|
||||
"\n",
|
||||
" Number of Employees IsChina \n",
|
||||
"Company \n",
|
||||
"Bytedance 10.000 1 \n",
|
||||
"SpaceX 5,000-10,000 0 \n",
|
||||
"Stripe 1,000-5,000 0 \n",
|
||||
"Klarna 5,000-10,000 0 \n",
|
||||
"Epic Games 1,000-5,000 0 "
|
||||
]
|
||||
},
|
||||
"execution_count": 76,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(\"data/unicorns.csv\", index_col=\"Company\", sep=';')\n",
|
||||
"\n",
|
||||
"df.info()\n",
|
||||
"\n",
|
||||
"df[\"Valuation\"] = df[\"Valuation\"].apply(\n",
|
||||
" lambda x: float(x[:-4].replace(',', '.')),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"df[\"TotalFunding\"] = df[\"TotalFunding\"].apply(\n",
|
||||
" lambda x: float(x.strip(\"$M\").replace(\",\", \"\")),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"df[\"IsChina\"] = [int(country == 'China') for country in df[\"Country\"]]\n",
|
||||
"print(df.shape)\n",
|
||||
"\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Получение сведений о пропущенных данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Типы пропущенных данных:\n",
|
||||
"- None - представление пустых данных в Python\n",
|
||||
"- NaN - представление пустых данных в Pandas\n",
|
||||
"- '' - пустая строка"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 77,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Valuation 0\n",
|
||||
"Country 0\n",
|
||||
"State 21\n",
|
||||
"City 1\n",
|
||||
"Industries 1\n",
|
||||
"FoundedYear 0\n",
|
||||
"Name of Founders 0\n",
|
||||
"TotalFunding 0\n",
|
||||
"Number of Employees 0\n",
|
||||
"IsChina 0\n",
|
||||
"dtype: int64\n",
|
||||
"\n",
|
||||
"Valuation False\n",
|
||||
"Country False\n",
|
||||
"State True\n",
|
||||
"City True\n",
|
||||
"Industries True\n",
|
||||
"FoundedYear False\n",
|
||||
"Name of Founders False\n",
|
||||
"TotalFunding False\n",
|
||||
"Number of Employees False\n",
|
||||
"IsChina False\n",
|
||||
"dtype: bool\n",
|
||||
"\n",
|
||||
"State процент пустых значений: %21.00\n",
|
||||
"City процент пустых значений: %1.00\n",
|
||||
"Industries процент пустых значений: %1.00\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Количество пустых значений признаков\n",
|
||||
"print(df.isnull().sum())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Есть ли пустые значения признаков\n",
|
||||
"print(df.isnull().any())\n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"# Процент пустых значений признаков\n",
|
||||
"for i in df.columns:\n",
|
||||
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
||||
" if null_rate > 0:\n",
|
||||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Заполнение пропущенных данных\n",
|
||||
"\n",
|
||||
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
||||
"\n",
|
||||
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 78,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# fillna_df = df.fillna(0)\n",
|
||||
"\n",
|
||||
"# print(fillna_df.shape)\n",
|
||||
"\n",
|
||||
"# print(fillna_df.isnull().any())\n",
|
||||
"\n",
|
||||
"# # Замена пустых данных на 0\n",
|
||||
"# df[\"AgeFillNA\"] = df[\"Age\"].fillna(0)\n",
|
||||
"\n",
|
||||
"# # Замена пустых данных на медиану\n",
|
||||
"# df[\"AgeFillMedian\"] = df[\"Age\"].fillna(df[\"Age\"].median())\n",
|
||||
"\n",
|
||||
"# df.tail()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 79,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# df[\"AgeCopy\"] = df[\"Age\"]\n",
|
||||
"\n",
|
||||
"# # Замена данных сразу в DataFrame без копирования\n",
|
||||
"# df.fillna({\"AgeCopy\": 0}, inplace=True)\n",
|
||||
"\n",
|
||||
"# df.tail()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Удаление признаков (столбцов) с пропусками"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(100, 7)\n",
|
||||
"Valuation False\n",
|
||||
"Country False\n",
|
||||
"FoundedYear False\n",
|
||||
"Name of Founders False\n",
|
||||
"TotalFunding False\n",
|
||||
"Number of Employees False\n",
|
||||
"IsChina False\n",
|
||||
"dtype: bool\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = df.dropna(axis=1)\n",
|
||||
"\n",
|
||||
"print(df.shape)\n",
|
||||
"\n",
|
||||
"print(df.isnull().any())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Создание выборок данных\n",
|
||||
"\n",
|
||||
"Библиотека scikit-learn\n",
|
||||
"\n",
|
||||
"https://scikit-learn.org/stable/index.html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 81,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Функция для создания выборок\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"\n",
|
||||
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Split a pandas dataframe into stratified train, validation and test subsets.

    The split is done by calling train_test_split() twice: first to carve off
    the training subset, then to divide the remainder into validation and test.
    Both calls stratify on the same column, so each subset keeps (as closely as
    the row counts allow) the relative class frequencies of that column.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Input dataframe to be split. All columns are kept in the outputs.
    stratify_colname : str
        Name of the column used for stratification (usually the label).
    frac_train : float
    frac_val : float
    frac_test : float
        Fractions of the rows assigned to train, validation and test.
        They must sum to 1.0 (checked with a small floating-point tolerance).
    random_state : int, None, or RandomState instance
        Passed unchanged to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test : pandas.DataFrame
        The three subsets; together they contain every row of df_input once.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if stratify_colname is not a
        column of df_input.
    """
    import math

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 evaluate to
    # 0.9999999999999999 in binary floating point and must not be rejected.
    fraction_sum = frac_train + frac_val + frac_test
    if not math.isclose(fraction_sum, 1.0, rel_tol=0.0, abs_tol=1e-9):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns, including the stratification column.
    y = df_input[[stratify_colname]]  # One-column dataframe used only to stratify.

    # First pass: training subset versus everything else.
    df_train, df_temp, _, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Second pass: divide the remainder into validation and test. The test
    # share has to be re-expressed relative to the size of the remainder.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, _, _ = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: no row was lost or duplicated by the two-stage split.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 82,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"IsChina\n",
|
||||
"0 86\n",
|
||||
"1 14\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Обучающая выборка: (60, 3)\n",
|
||||
"IsChina\n",
|
||||
"0 52\n",
|
||||
"1 8\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Контрольная выборка: (20, 3)\n",
|
||||
"IsChina\n",
|
||||
"0 17\n",
|
||||
"1 3\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Тестовая выборка: (20, 3)\n",
|
||||
"IsChina\n",
|
||||
"0 17\n",
|
||||
"1 3\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Show how many observations fall into each class of the target label.
print(df.IsChina.value_counts())

# Work on an independent copy holding only the two features and the label.
data = df[["TotalFunding", "Valuation", "IsChina"]].copy()

# A fixed seed makes the split identical on every Restart & Run All.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="IsChina",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
    random_state=42,
)

# Report the size and class balance of each subset.
for title, subset in (
    ("Обучающая выборка: ", df_train),
    ("Контрольная выборка: ", df_val),
    ("Тестовая выборка: ", df_test),
):
    print(title, subset.shape)
    print(subset.IsChina.value_counts())
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Выборка с избытком (oversampling)\n",
|
||||
"\n",
|
||||
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
||||
"\n",
|
||||
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
||||
"\n",
|
||||
"Выборка с недостатком (undersampling)\n",
|
||||
"\n",
|
||||
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
||||
"\n",
|
||||
"Библиотека imbalanced-learn\n",
|
||||
"\n",
|
||||
"https://imbalanced-learn.org/stable/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 83,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Обучающая выборка: (60, 3)\n",
|
||||
"IsChina\n",
|
||||
"0 52\n",
|
||||
"1 8\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"Обучающая выборка после oversampling: (105, 3)\n",
|
||||
"IsChina\n",
|
||||
"1 53\n",
|
||||
"0 52\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>TotalFunding</th>\n",
|
||||
" <th>Valuation</th>\n",
|
||||
" <th>IsChina</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>208.000000</td>\n",
|
||||
" <td>9.500000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>4044.200000</td>\n",
|
||||
" <td>15.500000</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>447.120000</td>\n",
|
||||
" <td>6.500000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2121.000000</td>\n",
|
||||
" <td>6.600000</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2686.010000</td>\n",
|
||||
" <td>39.000000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>100</th>\n",
|
||||
" <td>1306.334794</td>\n",
|
||||
" <td>14.179790</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>101</th>\n",
|
||||
" <td>1492.220325</td>\n",
|
||||
" <td>10.610196</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>102</th>\n",
|
||||
" <td>1125.438822</td>\n",
|
||||
" <td>16.887502</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>103</th>\n",
|
||||
" <td>1728.312129</td>\n",
|
||||
" <td>7.708914</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>104</th>\n",
|
||||
" <td>1785.708076</td>\n",
|
||||
" <td>7.004370</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>105 rows × 3 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" TotalFunding Valuation IsChina\n",
|
||||
"0 208.000000 9.500000 0\n",
|
||||
"1 4044.200000 15.500000 1\n",
|
||||
"2 447.120000 6.500000 0\n",
|
||||
"3 2121.000000 6.600000 1\n",
|
||||
"4 2686.010000 39.000000 0\n",
|
||||
".. ... ... ...\n",
|
||||
"100 1306.334794 14.179790 1\n",
|
||||
"101 1492.220325 10.610196 1\n",
|
||||
"102 1125.438822 16.887502 1\n",
|
||||
"103 1728.312129 7.708914 1\n",
|
||||
"104 1785.708076 7.004370 1\n",
|
||||
"\n",
|
||||
"[105 rows x 3 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 83,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
from imblearn.over_sampling import ADASYN

# Seeded so the synthetic minority rows are the same on every run.
ada = ADASYN(random_state=42)

print("Обучающая выборка: ", df_train.shape)
print(df_train.IsChina.value_counts())

# Keep the label out of the feature matrix: the sampler's neighbour search
# should see only the features, with the label supplied separately as y.
X_resampled, y_resampled = ada.fit_resample(  # type: ignore
    df_train.drop(columns=["IsChina"]), df_train["IsChina"]
)

# Re-attach the resampled label so the frame has the same columns as df_train.
df_train_adasyn = pd.concat([X_resampled, y_resampled], axis=1)

print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
print(df_train_adasyn.IsChina.value_counts())

df_train_adasyn
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN
Описание набора 1.docx
Normal file
BIN
Описание набора 1.docx
Normal file
Binary file not shown.
BIN
Описание набора 2.docx
Normal file
BIN
Описание набора 2.docx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user