1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
|
""" @author: zj @file: german.py @time: 2019-12-13 """
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split
def load_german_data(data_path, shuffle=True, tsize=0.8):
    """
    Load the whitespace-delimited German credit data and split it into
    training and test sets.

    The last column of the file is used as the label and every preceding
    column as a feature. Labels equal to 2 are mapped to 1; every other
    label value is mapped to 0.

    :param data_path: location of the data file, handed to ``pd.read_csv``
    :param shuffle: forwarded to ``train_test_split``; whether rows are
        shuffled before splitting
    :param tsize: training-set size, forwarded as ``train_size``; the rows
        not selected for training form the test set
    :return: tuple ``(x_train, x_test, y_train, y_test)``
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    data_frame = pd.read_csv(data_path, header=None, sep=r'\s+')
    data_array = data_frame.values

    # Features are all columns but the last; the last column is the label.
    data_x = data_array[:, :-1]
    data_y = data_array[:, -1]

    # Only train_size is passed so that the test set is taken as the exact
    # complement. Supplying test_size=1 - tsize is fragile: the float
    # complement can round up (1 - 0.7 == 0.30000000000000004), which can
    # push the requested train + test row counts past the number of rows
    # on small datasets and make the split fail.
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, train_size=tsize, shuffle=shuffle)

    # Binarise the labels: 2 -> 1, anything else -> 0.
    y_train = np.where(y_train == 2, 1, 0)
    y_test = np.where(y_test == 2, 1, 0)

    return x_train, x_test, y_train, y_test
if __name__ == '__main__':
    # Demo run: load the dataset, standardise the features, print the test set.
    data_path = '/home/zj/data/german/german.data-numeric'
    x_train, x_test, y_train, y_test = load_german_data(data_path)

    # Work in double precision for the statistics below.
    x_train, x_test = (split.astype(np.double) for split in (x_train, x_test))

    # Per-feature mean and variance are taken from the training split only
    # and reused for the test split; eps keeps the divisor non-zero for
    # constant columns.
    eps = 1e-8
    mu = x_train.mean(axis=0)
    var = x_train.var(axis=0)
    scale = np.sqrt(var + eps)

    x_train = (x_train - mu) / scale
    x_test = (x_test - mu) / scale

    print(x_test)
|