27 lines
745 B
Python
27 lines
745 B
Python
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
from config import DATA_SIZE
|
|
|
|
|
|
def load_dataset():
    """Load the car-listings CSV and split it into train and test sets.

    Reads ``true_car_listings.csv`` from the current working directory,
    keeps the first ``DATA_SIZE`` rows, replaces the ``City``, ``State``,
    ``Vin``, ``Make`` and ``Model`` columns with integer codes via
    ``convert_to_num``, and splits features from the ``Price`` target.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)`` as produced by
        ``train_test_split`` with 5% of the rows held out for testing and
        ``random_state=42``.
    """
    # Take an explicit copy of the truncated frame: the columns are
    # overwritten in place below, and writing into a slice of the frame
    # returned by read_csv triggers pandas' SettingWithCopyWarning.
    data = pd.read_csv('true_car_listings.csv')[:DATA_SIZE].copy()

    names = ['Year', 'Mileage', 'City', 'State', 'Vin', 'Make', 'Model']

    # Same columns, in the same order, as the original one-call-per-column
    # sequence.
    for col in ('City', 'State', 'Vin', 'Make', 'Model'):
        convert_to_num(data, col)

    Y = data['Price']
    X = data[names]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.05, random_state=42)

    return x_train, x_test, y_train, y_test
|
|
|
|
|
|
def convert_to_num(data, col):
    """Replace the values of ``data[col]`` with integer codes, in place.

    Each distinct value is mapped to an integer; equal values receive the
    same code. Codes are assigned in order of first appearance in the
    column (0 for the first distinct value, 1 for the next, and so on), so
    the encoding is the same on every run for the same input.

    Args:
        data: DataFrame whose column is overwritten.
        col: Label of the column to encode.

    Returns:
        None. ``data`` is modified in place.
    """
    # pd.factorize encodes the whole column in one linear pass. The earlier
    # list(set(...)) + list.index approach scanned the list of uniques once
    # per row and depended on set iteration order, which is not stable
    # across interpreter runs for strings.
    # NOTE: missing values (NaN/None) receive the sentinel code -1.
    codes, _ = pd.factorize(data[col])
    # A plain ndarray is assigned positionally, so this is safe for any index.
    data[col] = codes