实验汇报

实验目的

预测血糖值

实验步骤

Step1 数据概览

导入部分包

1
2
3
4
5
6
7
8
9
10
11
12
13
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.parser import parse
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from pylab import mpl
from scipy import stats
from scipy.stats import norm, skew

数据导入

1
2
3
4
5
6
# Load the physical-examination dataset (a GBK-encoded Chinese CSV).
data = pd.read_csv('/home/hgg/Desktop/ex/data.csv', encoding='gbk')

# Configure the seaborn palette and background style used by all later plots.
color = sns.color_palette()
sns.set_style('darkgrid')

Step2 数据探索

接下来可以使用 info、describe、columns 等方法打印数据信息,基本认识数据集

利用热力图来了解特征间的相关性

1
2
3
4
# Heatmap of pairwise feature correlations, to spot related features.
corr = data.corr()  # hoisted: the original recomputed data.corr() three times
plt.figure(dpi=600)
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            cmap='vlag',
            center=0,
            annot=False)
plt.title('heatmap')
plt.show()

计算各个特征的缺失值

1
2
3
4
5
6
7
8
9
10
11
12
# Fraction of missing values per column: print it, then show a horizontal bar chart.
null_percentage = data.isnull().sum() / len(data)
print('The null data percentage is:\n', null_percentage)

# Use a font with Chinese glyphs so the axis label renders correctly.
mpl.rcParams['font.sans-serif'] = ['Simhei']

null_percentage = null_percentage.reset_index()
null_percentage.columns = ['column_name', 'column_value']

y_pos = np.arange(null_percentage.shape[0])
fig, ax = plt.subplots(figsize=(6, 8))
rects = ax.barh(y_pos, null_percentage.column_value.values, color='red')
ax.set_yticks(y_pos)
ax.set_yticklabels(null_percentage.column_name.values, rotation='horizontal')
ax.set_xlabel("各基本特征缺失数据值")
plt.show()

查看标签分布

1
2
# Frequency of each value in the '年龄' (age) column.
print(data['年龄'].value_counts())
# Pass the series via the `x=` keyword: the positional form is deprecated
# and removed in seaborn >= 0.12.
sns.countplot(x=data['年龄'], label="Count")

查看异常值

1
2
3
4
5
6
7
8
9
10
11
# Flag blood-glucose readings more than three standard deviations from the mean.
glucose = data['血糖']
mean_val = glucose.mean()   # sample mean
std_val = glucose.std()     # sample standard deviation

# Kolmogorov-Smirnov test against a normal with the sample moments.
stats.kstest(glucose, 'norm', (mean_val, std_val))
print('均值为:%.3f,标准差为:%.3f' % (mean_val, std_val))
print('------')

# `error` (outlier rows) and `data_c` (clean rows) are reused later.
deviation = np.abs(glucose - mean_val)
error = glucose[deviation > 3 * std_val]
print(error)
data_c = glucose[deviation <= 3 * std_val]
print('异常值共%i条' % len(error))

Step3 数据预处理

将性别列转化为数字

1
data['性别']=data['性别'].map({'男': 1, '女': 0,"??": 0})

剔除无关列:由于 id 和体检日期不能作为影响血糖值的因素,故删除

1
2
3
# Drop columns that cannot influence blood glucose: the exam date and record id.
for irrelevant in ('体检日期', 'id'):
    data = data.drop(irrelevant, axis=1)
print(data)

缺失值处理:由于乙肝表面抗原、乙肝表面抗体、乙肝核心抗体、乙肝e抗原、乙肝e抗体这五个基本特征缺失比例超过70%,且这五种基本特征对于预测模型的影响权重较小,所以这里选择将这几列直接删除,剩下的用均值填充

1
2
3
4
5
6
7
8
9
10
11
# Drop columns with more than 70% missing values (keep columns having at
# least 30% non-null entries), then fill remaining gaps with column means.
thresh_count = data.shape[0]*0.3
data = data.dropna(thresh=thresh_count,axis=1)
data_columns = data.columns

# Visualize the remaining per-column missingness.
import missingno as mno
p = mno.bar(data)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan,strategy='mean')
# fit_transform takes only the feature matrix; the original passed
# `data_columns` as the (ignored) target argument, which was meaningless.
data = imp.fit_transform(data)
# fit_transform returns a bare ndarray, so restore the column labels.
data=pd.DataFrame(data,columns=data_columns)

平滑处理

1
2
3
4
5
6
7
8
9
10
11
12
13
# Box-Cox transform the glucose column toward normality and inspect the fit.
tmp, lambda_ = stats.boxcox(data['血糖'])

# Histogram of the transformed values with a fitted normal overlay.
sns.distplot(tmp, fit=norm)
mu, sigma = norm.fit(tmp)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
plt.legend(
    ['Normal dist. ($\\mu=$ {:.2f} and $\\sigma=$ {:.7f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequence')
plt.title('血糖分布')

# Q-Q plot against the normal distribution.
fig = plt.figure()
res = stats.probplot(tmp, plot=plt)
plt.show()


异常值处理 删除异常行

1
2
# Remove the 3-sigma outlier rows found earlier, matched by index label.
data = data.drop(index=error.index)
print(data)

Step 4 特征工程

利用回归方法mutual_info_regression计算特征的重要程度

1
2
3
4
5
6
7
8
# Feature importance via mutual information between each feature and the target.
# NOTE(review): `part_data` and `data_y` are defined outside this excerpt —
# confirm they hold the feature frame and target series.
from sklearn.feature_selection import mutual_info_regression

mi_scores = mutual_info_regression(part_data, data_y)
mutual_info_series = pd.Series(mi_scores, index=part_data.columns)

total_mi = sum(mutual_info_series)
print(total_mi)
print(mutual_info_series)

# Normalize so the scores sum to 1; `feature_v1` is combined with another
# ranking later.
feature_v1 = mutual_info_series / total_mi
print(feature_v1)

利用单因素分析计算特征重要程度

1
2
3
4
5
6
7
8
9
10
# Feature importance via per-feature absolute Pearson correlation with the
# target. The original loop body had lost its indentation (a syntax error);
# restored here.
data_x = part_data.values
corr_values = []
for i in range(data_x.shape[1]):
    # |r| between the i-th feature column and the target.
    corr_values.append(abs(np.corrcoef(data_x[:, i], data_y)[0, 1]))
corr_series = pd.Series(corr_values, index=part_data.columns)
s = sum(corr_series)
print(s)
print(corr_series)
# Normalize so the scores sum to 1.
feature_v2 = corr_series / s
print(feature_v2)

取两种方法的均值,并选出影响大的作为特征

1
2
3
4
5
6
# Average the two normalized importance rankings and keep features whose
# combined score exceeds 1%. (The original `feature_values = []` was dead —
# it was immediately overwritten — and has been removed.)
feature_values = (feature_v1 + feature_v2) / 2
print(feature_values)
tz = part_data.loc[:, feature_values > 0.01]
importances = tz.columns
print(importances)

特征选取,数据归一化

1
2
3
4
5
6
7
8
9
10
11
# Standardize the selected features to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
# Explicit import replaces the original `from sklearn import *` wildcard,
# which pollutes the namespace and hides where names come from.
from sklearn.ensemble import ExtraTreesClassifier

# NOTE(review): this fitted classifier is never used afterwards in the
# visible script — kept to preserve the original flow, but it looks removable.
model = ExtraTreesClassifier()
model.fit(tz, data_y.astype('int'))

scaler = StandardScaler()
scaler.fit(tz)             # learn per-feature mean/std from tz
tz = scaler.transform(tz)  # apply the standardization in place of tz
print(tz)

Step 5 模型构建

划分数据集

1
2
3
4
5

# Hold out 30% of the data for testing; fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

print(data_y.describe())
x_train, x_test, y_train, y_test = train_test_split(
    tz, data_y, test_size=0.3, random_state=20)

利用线性回归方法进行建模

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Fit ordinary least squares and plot residuals for train and test sets.
# The unused `from sklearn.metrics import *` wildcard was removed:
# `mean_squared_error` is already imported explicitly at the top of the file.
from sklearn.linear_model import LinearRegression

slr = LinearRegression()
slr.fit(x_train, y_train)

y_train_pred = slr.predict(x_train)
y_test_pred = slr.predict(x_test)

# Residual plot: prediction error versus predicted value; a structureless
# cloud around zero suggests the linear fit is adequate.
plt.scatter(y_train_pred, y_train_pred - y_train,
            c='blue', marker='o', label='Training data')
plt.scatter(y_test_pred, y_test_pred - y_test,
            c='lightgreen', marker='s', label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
plt.xlim([-10, 50])
plt.tight_layout()

plt.show()

利用岭回归建模

1
2
3
4
# Ridge regression with built-in cross-validated selection of alpha.
# The unused `from numpy import genfromtxt` import was removed.
from sklearn import linear_model

model = linear_model.RidgeCV()  # cross-validated ridge regression
model.fit(x_train, y_train)

模型评估 mse

1
2
3
4
5
6
7
8
# Test-set MSE of the linear model, computed manually.
mse_test = np.sum((y_test_pred - y_test)**2 / len(y_test))
# The original bare `mse_test` expression only displays in a REPL/notebook;
# print explicitly so a plain script also reports it.
print(mse_test)

# Test-set MSE of the ridge model via sklearn.
y_predict = model.predict(x_test)
mse_predict = mean_squared_error(y_test, y_predict)
print(y_predict)
print(mse_predict)

1
2
3
4
5
6
7
8
9
10
11
12
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.HashPartitioner

/** Reads a text file, removes blank and duplicate lines, and writes the
  * distinct lines in sorted order to the "result" directory.
  *
  * The original collapsed three statements onto one line (a syntax error);
  * they are split out here.
  */
object RemDup {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("RemDup")
    val sc = new SparkContext(conf)
    val dataFile = "file:///home/charles/data"
    val data = sc.textFile(dataFile, 2)
    // Pair each trimmed non-blank line with a dummy value so groupByKey
    // deduplicates; sortByKey orders the output; keys drops the dummies.
    val res = data.filter(_.trim().length > 0)
      .map(line => (line.trim, ""))
      .partitionBy(new HashPartitioner(1))
      .groupByKey()
      .sortByKey()
      .keys
    res.saveAsTextFile("result")
  }
}