pandas 是处理表格数据的常用库,这里记录一下常用的指令和技巧。
索引方式
# DataFrame basics
# NOTE(review): train_data and sr are not defined in this snippet; they are
# assumed to be an existing DataFrame and Series respectively — confirm.
df = DataFrame({'a': 1,'b' :2}, index=['1', '2'])
df.columns # column labels (an Index)
df.values # underlying data as a NumPy ndarray
df.iloc[:, 1:3] # all rows, columns at positions 1-2 (end-exclusive slice); a slice returns a DataFrame, not a Series
df.columns.values[0] # name of the first column (as written this reads the name; it does not set it)
y = train_data.pop('30').values # pop removes column '30' from train_data and returns it; used here as the label vector y
col = train_data.columns
x = train_data[col].values # values of the columns left after the pop, as an ndarray
# Series (single-column data)
sr.index # row index
sr.name # name of the Series (read access)
sr.values # underlying data as a NumPy ndarray
标准化
# Min-max scaling: rescale every remaining column to the [0, 1] range.
df = df.iloc[:, 1:]  # drop the first column before scaling
col_min = df.min()
col_range = df.max() - col_min
# A constant column has zero range; dividing by it would give 0/0 = NaN.
# Divide by 1 instead so such a column becomes all zeros.
col_range = col_range.replace(0, 1)
dat = (df - col_min) / col_range
分类和回归交叉验证框架
## classification
# Repeated stratified k-fold evaluation. In every fold, features are ranked
# with an ExtraTrees model fitted on the training part only, the top ones are
# kept, and a 3-NN classifier is scored on the held-out part.
# NOTE(review): assumes `kf` holds the integer number of folds, `label_X` is a
# DataFrame (it is indexed with .iloc) and `label_Y` is array-like — confirm.
n_repeats = 10  # stated explicitly so the score buffer matches the number of splits
n_top = 80  # number of highest-importance features to keep
# Use a separate name for the splitter so `kf` remains the fold count.
cv = RepeatedStratifiedKFold(n_splits=kf, n_repeats=n_repeats)
y_all = np.asarray(label_Y)  # positional indexing below also works if label_Y is a Series
scores = np.zeros([kf * n_repeats, 2])
for i, (train_idx, test_idx) in enumerate(cv.split(label_X, y_all)):
    y_train, y_test = y_all[train_idx], y_all[test_idx]
    # select features
    model = ExtraTreesClassifier(n_estimators=100)
    model.fit(label_X.iloc[train_idx, :], y_train)
    fim = model.feature_importances_
    # argsort is ascending, so reverse it to put the most important features first
    idx = np.argsort(fim)[::-1][:n_top]
    x_train, x_test = label_X.iloc[train_idx, idx], label_X.iloc[test_idx, idx]
    # KNN classification
    neigh = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
    y_pred = neigh.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    # column 0: mean of per-class recall (confusion-matrix diagonal over row sums)
    scores[i, 0] = np.mean(np.diag(cm) / np.sum(cm, axis=1))
    # column 1: weighted F1 score
    scores[i, 1] = f1_score(y_test, y_pred, average='weighted')
# mean and standard deviation of both metrics over all splits
print(np.mean(scores, axis=0), np.std(scores, axis=0))
## regression
# Repeated random hold-out evaluation of a linear-kernel SVR: each run draws a
# fresh 70/30 split and records the test-set MSE and MAE.
n_runs = 10
scores = np.zeros((n_runs, 2))
for i in range(n_runs):
    x_train, x_test, y_train, y_test = train_test_split(
        label_X, label_Y, test_size=0.3
    )
    # SVR regression
    svr_reg = SVR(kernel='linear')
    svr_reg.fit(x_train, y_train)
    y_pred = svr_reg.predict(x_test)
    # row i holds (MSE, MAE) for this run
    scores[i, :] = (
        mean_squared_error(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
    )
# mean and standard deviation of both metrics over all runs
print(np.mean(scores, axis=0), np.std(scores, axis=0))