~vonfry/cpipc-2020

7768efae3595f2f84a993d0aaa3318a3a4c927fe — Vonfry 2 years ago 9bd8bd2
add roommate data and shell.nix for ga
8 files changed, 240 insertions(+), 77 deletions(-)

M .gitignore
D 0preprocess.py
M 3optimization.py
A ShubinSong/FFNN_Test.py
A ShubinSong/FFNN_Train.py
A ShubinSong/de_dim.py
A ShubinSong/units.py
M shell.nix
M .gitignore => .gitignore +5 -1
@@ 546,4 546,8 @@ $RECYCLE.BIN/
# End of https://www.toptal.com/developers/gitignore/api/windows,macos,linux,r,python,latex,office

data/
output/
\ No newline at end of file
output/
*.xlsx
*.csv
log/
*.model

D 0preprocess.py => 0preprocess.py +0 -71
@@ 1,71 0,0 @@
import pandas as pd
import numpy as np

# Preprocessing script: load the sample workbook, remove every sample that has
# a checked column outside its allowed range (widened by the listed deviation),
# then print the squared deviations of the remaining values from their column
# means.
data_sample = r'sample.xlsx'
data_285 = r'sample_285_313.xlsx'
data_354_path = r'354.xlsx'

data = pd.read_excel(data_sample, sheet_name='Sheet1')
data_285_313 = pd.read_excel(data_285, sheet_name='操作变量')
data_354 = pd.read_excel(data_354_path, sheet_name='Sheet1')

# Columns that are excluded from the range check below.
data_1 = data.drop(
    ['样本编号','时间','硫含量,μg/g','辛烷值RON','饱和烃,v%(烷烃+环烷烃)','烯烃,v%','芳烃,v%','溴值,gBr/100g','密度(20℃),kg/m³',
    '硫含量μg/g','_辛烷值RON','RON损失(不是变量)','焦炭wt%','Swt%','焦炭,wt%','S, wt%'],
    axis=1)

# .copy() makes the column assignments below write to an independent frame
# instead of a slice of the frame returned by read_excel.
data_354 = data_354[['位号','取值范围','偏差']].copy()

# Each '取值范围' cell is a "low~high" string; split it into two floats.
maxlist = []
minlist = []
for range_text in data_354['取值范围']:
    parts = range_text.split('~')
    maxlist.append(float(parts[1]))
    minlist.append(float(parts[0]))

# Share data_354's index so the assignment aligns row by row whatever the
# index of data_354 is.
ma = pd.Series(maxlist, index=data_354.index)
mi = pd.Series(minlist, index=data_354.index)

data_354['min'] = mi
data_354['max'] = ma

# Allowed interval per tag: [min - deviation, max + deviation].
dlta = pd.Series(data_354['偏差'].values, index=data_354['位号'])
during_max = pd.Series(data_354['max'].values, index=data_354['位号']) + dlta
during_min = pd.Series(data_354['min'].values, index=data_354['位号']) - dlta

# Columns that are never range-checked.
skip_columns = {'时间', 'S-ZORB.SIS_LT_1001.PV', 'S-ZORB.FT_1204.TOTAL', 'S-ZORB.AI_2903.PV'}

# Collect the index labels of all rows with at least one out-of-range value.
del_list = []
for index_y, item in data_1.iterrows():
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for index_x, row in item.items():
        if index_x in skip_columns:
            continue
        if row < during_min[index_x] or row > during_max[index_x]:
            del_list.append(index_y)
            # One violation is enough to discard the row.
            break

data_1.drop(index=del_list, inplace=True)

x = data_1.mean()
v = []
xx = []
# NOTE(review): the original line was `n = ` with no right-hand side (a syntax
# error). The row count is presumably what was meant -- confirm.
n = len(data_1)

# Deviation from the column mean, and the squared raw value, for every cell.
for index_y, item in data_1.iterrows():
    for index_x, row in item.items():
        if index_x != '时间':
            v.append(x[index_x] - row)
            xx.append(row * row)

v_0 = np.multiply(v, v)
print(v_0)


M 3optimization.py => 3optimization.py +15 -0
@@ 1,5 1,20 @@
import numpy as np

import pandas as pd
# Table of operating-variable metadata; the first spreadsheet column becomes
# the index.
action_data = pd.read_excel('./data/附件四:354个操作变量信息.xlsx', index_col=0)

# Feature names are taken from the 'name' column of Sheet1.
_feature_sheet = pd.read_excel('./data/feature.xlsx', sheet_name='Sheet1')
feature = _feature_sheet['name'].tolist()

# Sample table from Sheet2 without its '时间' column.
_sample_sheet = pd.read_excel('./data/325.xlsx', sheet_name='Sheet2', index_col=0)
df = _sample_sheet.drop(columns=['时间'])

import re
def _parse_bound(text):
    """Convert one bound to ``int`` when possible, otherwise to ``float``."""
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        # Decimal bounds such as "0.3" are not accepted by int().
        return float(text)


def get_var_range(index):
    """Return the numeric bounds listed for the operating variable *index*.

    The first row of ``action_data`` whose '位号' equals *index* is looked up
    and its '取值范围' text (for example ``"(-125)-(-20)"`` or ``"0.3-0.7"``)
    is parsed.

    Args:
        index: Tag name to look up in the '位号' column.

    Returns:
        A list with the parsed bounds in the order they appear in the text.
        Integral bounds are returned as ``int`` (as before); bounds with a
        fractional part are returned as ``float`` instead of raising.

    Raises:
        IndexError: If no row matches *index*.
        ValueError: If a bound is not a valid number.
    """
    str_range = action_data[action_data['位号'] == index].iloc[0]['取值范围']
    # Strip ASCII and full-width parentheses around the bounds.
    str_range = re.sub(r'[()()]', '', str_range)
    # The separator is the first '-' that follows a digit (optionally padded
    # with whitespace); a leading '-' is a sign and must be left alone.
    str_range = re.sub(r'(?<=\d)\s*-', '/', str_range, count=1)
    num_range = [_parse_bound(part) for part in str_range.split('/')]
    return num_range

from keras.models import load_model
# Load the saved model from ./data/modelweight.model once, at import time.
model = load_model('./data/modelweight.model')
def fitness(args):
    # step1: calculate target value
    # step2: compute RON LOSS RATE

A ShubinSong/FFNN_Test.py => ShubinSong/FFNN_Test.py +24 -0
@@ 0,0 1,24 @@
import units as units
import pandas as pd
import tensorflow as tf
import numpy as np


# Evaluation script: load the samples, the selected feature names and the
# saved model, run units.FFNN_test and print error metrics.
data_path = '325.xlsx'
feature_path = 'feature.xlsx'
model_path = 'modelweight.model'

data = pd.read_excel(data_path, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) 
feature = pd.read_excel(feature_path, sheet_name='Sheet1')['name'].tolist()
model = tf.keras.models.load_model(model_path)


def _mse(actual, predicted):
    """Return the mean squared error between two equally sized arrays."""
    residual = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)
    return float(np.mean(residual ** 2))


def _r2(actual, predicted):
    """Return the coefficient of determination of *predicted* against *actual*."""
    actual = np.asarray(actual, dtype=float)
    residual = actual - np.asarray(predicted, dtype=float)
    centered = actual - actual.mean()
    return float(1.0 - np.sum(residual ** 2) / np.sum(centered ** 2))


# units.FFNN_test returns two arrays (prediction, RON - prediction), so the
# four metrics are computed here instead of being unpacked from its result.
ron_pred, ron_dlta_pred = units.FFNN_test(model,data,feature)

# '_RON' is the default label column of units.FFNN_test.
ron_true = data['_RON'].values
ron_loss_true = data['RON损失'].values

mse_ron = _mse(ron_true, ron_pred)
r2_ron = _r2(ron_true, ron_pred)
mse_ron_dlta = _mse(ron_loss_true, ron_dlta_pred)
r2_ron_dlta = _r2(ron_loss_true, ron_dlta_pred)

print(mse_ron)
print(r2_ron)
print(mse_ron_dlta)
print(r2_ron_dlta)





A ShubinSong/FFNN_Train.py => ShubinSong/FFNN_Train.py +93 -0
@@ 0,0 1,93 @@
import pandas as pd
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam,SGD
import numpy as np
import os
import math
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.utils import shuffle

import shutil

# Training script: fit a one-hidden-layer feed-forward network that predicts
# 'RON损失' from the selected features, save it, and plot test predictions.

# Start every run with an empty TensorBoard log directory. The directory is
# created when missing, and sub-directories are removed as well as files
# (os.remove alone fails on directories).
log_dir = './log'
os.makedirs(log_dir, exist_ok=True)
for entry in os.listdir(log_dir):
    c_path = os.path.join(log_dir, entry)
    if os.path.isdir(c_path) and not os.path.islink(c_path):
        shutil.rmtree(c_path)
    else:
        os.remove(c_path)

inputfile = '325.xlsx'   # sample workbook (input)
# Workbook whose Sheet1 'name' column lists the feature columns.
# NOTE(review): the other scripts in this change read/write 'feature.xlsx' --
# confirm that 'result.xlsx' is the intended file.
featurefile = 'result.xlsx'
modelfile = 'modelweight.model'  # where the trained model is saved

da = pd.read_excel(inputfile, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1)
data = shuffle(da).reset_index(drop=True)
feature = pd.read_excel(featurefile, sheet_name='Sheet1')['name'].tolist()

label = ['RON损失']      # single target column
label_str = 'RON损失'

# First 260 shuffled rows train the model, the remaining rows test it. The
# split is disjoint: previously row 259 was in both sets and the last row in
# neither.
train_size = 260
data_train = data.iloc[:train_size].copy()
data_test_0 = data.iloc[train_size:].copy()

# Standardise with the training statistics only.
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean)/data_std
x_train = data_train[feature].values
y_train = data_train[label].values

# The test inputs must go through the same transform the model was trained
# on, so the training mean/std are reused rather than recomputed on the test
# set. y_test stays in original units for scoring.
data_test = (data_test_0 - data_mean)/data_std
x_test = data_test[feature].values
y_test = data_test_0[label].values

batch_size = 65
epochs = 4000
learning_rate = 0.005

model = Sequential()
# Hidden layer: 5 sigmoid units fed by the selected features.
model.add(Dense(5,input_dim=len(feature),kernel_initializer='uniform'))
model.add(Activation('sigmoid'))
# Output layer: one linear unit.
model.add(Dense(1,input_dim=5))
adam = Adam(learning_rate=learning_rate)
model.compile(loss='mean_squared_error', optimizer=adam)
model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size,callbacks=[TensorBoard(log_dir='./log')],validation_freq = 0)
model.save(modelfile)

# The network outputs the label in training-standardised units; undo that
# standardisation with the training statistics.
y = model.predict(x_test) * data_std[label_str] + data_mean[label_str]

y_test = pd.DataFrame(y_test)[0].tolist()
y = pd.DataFrame(y)[0].tolist()

score = mean_squared_error(y_test,y)
# r2_score expects (y_true, y_pred); the metric is not symmetric.
score2 = r2_score(y_test,y)

print('RON_均方差:' + str(score))
print('RON_R2:' + str(score2))

data_pre_loss = pd.DataFrame({
    'RON_LOSS': y_test,
    'RON_LOSS_PRED':y})

# Plot measured against predicted values for the test rows.
import matplotlib.pyplot as plt 
p = data_pre_loss[['RON_LOSS','RON_LOSS_PRED']].plot(style=['b-o','r-*'])
plt.show()
\ No newline at end of file

A ShubinSong/de_dim.py => ShubinSong/de_dim.py +62 -0
@@ 0,0 1,62 @@

from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
 
# Feature-selection script: rank columns with a random forest, keep the top
# 60, filter them by variance and pairwise correlation, and write the
# surviving names to feature.xlsx.
inputfile = '325.xlsx'   # sample workbook (input)
# read_excel has no `index` keyword, so the former index='Date' argument is
# dropped; the default integer index is used.
data = pd.read_excel(inputfile, sheet_name='Sheet2')

X = data.drop(['样本编号','时间','_硫含量','_RON','RON损失'],axis=1) 
X = pd.get_dummies(X)

Y = data['_RON']  # regression target

model = RandomForestRegressor(random_state=1, max_depth=10)
model.fit(X, Y)

features = X.columns
importances = model.feature_importances_

df = pd.DataFrame({
    'name':features,
    'importances':importances
})

df.sort_values(by="importances" , ascending=False,inplace=True)

# The 60 columns with the highest importance.
result0 = df[0:60]['name'].tolist()

XX = data[result0]

# Min-max scale each column to [0, 1] so the variances are comparable.
XX = XX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
 
var = XX.var()  # per-column variance
cols = XX.columns
# Keep columns whose scaled variance is at least 0.04. Iterating the
# (label, value) pairs avoids positional indexing into a label-indexed Series.
col = [name for name, variance in var.items() if variance >= 0.04]

XXX = data[col]
XXX = XXX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

cor2 = XXX.corr()

print(cor2)

# For each row of the correlation matrix, walk its columns in order and stop
# at the first entry whose absolute correlation exceeds 0.4; every column
# passed before that point is added to result1 once.
result1 = []
for index_y, item in cor2.iterrows():
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for index_x, row in item.items():
        if abs(row) > 0.4 :
            break
        if index_x not in result1:
            result1.append(index_x)

print(result1,len(result1))

df = pd.DataFrame({'name':result1})
df.to_excel('feature.xlsx')



A ShubinSong/units.py => ShubinSong/units.py +27 -0
@@ 0,0 1,27 @@
import pandas as pd
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential
from sklearn.metrics import r2_score,mean_squared_error

def FFNN_test(model,x,feature,label = '_RON'):
    """Predict *label* for every row of *x* and derive the RON difference.

    Every column of *x* is standardised with the mean and standard deviation
    of *x* itself; the standardised *feature* columns are passed to
    ``model.predict`` and the output is mapped back to the units of *label*.

    Args:
        model: Object with a ``predict(array)`` method whose result has the
            prediction in its first column.
        x: DataFrame containing the *feature* columns, *label* and 'RON'.
        feature: List of column names used as model inputs.
        label: Column whose mean and standard deviation are used to
            de-standardise the prediction.

    Returns:
        A tuple ``(y, RON_dlta)``: the de-standardised prediction and
        ``x['RON'] - y``, both as one-dimensional arrays.
    """
    mean = x.mean()
    std = x.std()
    # Standardise, then select the features from the *standardised* frame.
    # Previously the raw columns were selected, so the standardisation was
    # discarded although the output below is still de-standardised.
    x_norm = (x - mean)/std
    x_test = x_norm[feature].values
    y = model.predict(x_test)
    y = y * std[label] + mean[label]  # back to the units of `label`
    y = pd.DataFrame(y)[0].values     # first output column as a 1-D array

    RON = x['RON'].values
    RON_dlta = RON - y

    return y,RON_dlta
\ No newline at end of file

M shell.nix => shell.nix +14 -5
@@ 5,14 5,23 @@ let tex-combined = texlive.combine {
      inherit (texlive) scheme-medium collection-latexextra
        collection-bibtexextra collection-publishers;
    };
    rDeps = with rPackages; [ ggplot2 openxlsx
    rDeps = with rPackages; [ R
                              ggplot2 openxlsx
                              randomForest MASS factoextra pcaMethods
                              neuralnet MLmetrics
                              genalg
                            ];
    pythonWith = python3.withPackages (p: with p; [ pip numpy ipython pandas
                                                    matplotlib # scikit-opt
                                                  ]);
    pythonPackages = python3Packages;
    pythonDeps = with pythonPackages; [ python numpy ipython pandas
                                        matplotlib # scikit-opt
                                        tensorflow Keras
                                        python3Packages.venvShellHook
                                      ];
in mkShell {
  buildInputs = [ R tex-combined pythonWith ] ++ rDeps;
  venvDir = ".venv";
  buildInputs = [ tex-combined ] ++ pythonDeps ++ rDeps;

  postShellHook = ''
    pip install -r requirements.txt
  '';
}