# Data preprocessing in Python (数据预处理)
import pandas as pd
import numpy as np
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split

#%% Load the iris data set into one DataFrame (features + class label)
iris = datasets.load_iris()
x = pd.DataFrame(iris['data'], columns=iris['feature_names'])
# The class label is cast to str so it stays a categorical (object) column
# and is picked up by the one-hot encoding step further down.
y = pd.DataFrame(iris['target'].astype(str), columns=['target_names'])
df = pd.concat([x, y], axis=1)

#%% Missing-value check and imputation
'檢查&補值'
df.isna().sum()  # per-column NaN count; only displayed when run as an interactive cell
# The calls below are alternative imputation strategies listed one after the
# other. Run top to bottom, the first call already removes every NaN, so the
# later ones are no-ops; pick the one that fits the data.
df = df.fillna(0)  # fill with a constant 0
df = df.ffill()    # forward fill: carry the previous row's value
df = df.bfill()    # backward fill: take the next row's value
# mode() returns a Series. Passing it to fillna() directly would align on the
# index and fill only the rows whose index matches, so take the scalar mode.
target_modes = df['target_names'].mode()
if not target_modes.empty:
    df['target_names'] = df['target_names'].fillna(target_modes.iloc[0])  # most frequent value
df['sepal length (cm)'] = df['sepal length (cm)'].fillna(
    df['sepal length (cm)'].mean()
)  # column mean

#%% Data transformation: min-max scaling
'資料轉換-標準化'
min_max_scaler = preprocessing.MinMaxScaler()
# fit_transform() returns a plain NumPy array. Scale only the numeric columns
# and write the result back so df stays a DataFrame with its column names and
# the categorical label column is left untouched.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = min_max_scaler.fit_transform(df[numeric_cols])

#%% One-hot encoding and feature/label separation
'單熱'
# dtype=float keeps the indicator columns numeric, so the feature matrix below
# is a float array rather than an object array of mixed float/bool values.
one_hot = pd.get_dummies(df, dtype=float)
labels = np.array(one_hot['sepal length (cm)'])        # dependent variable
features = one_hot.drop('sepal length (cm)', axis=1)   # everything except the dependent variable
feature_list = list(features.columns)                  # keep the feature names
features = np.array(features)                          # convert to a NumPy array

#%% Random train/test split
'隨機抽樣'
trainX, testX, trainY, testY = train_test_split(
    features, labels, test_size=0.3, random_state=42
)
trainY = trainY.reshape(-1, 1)  # reshape to a single column (n_samples, 1)
testY = testY.reshape(-1, 1)    # reshape to a single column (n_samples, 1)
# 评论 / 发表评论 (comment-section labels left over from the source web page)