~vonfry/cpipc-2020

96b2d3035f7e4e11bd96884eac3b9a3a15fa099a — Vonfry 2 years ago 5b16d8e
optimization and other model
M 3optimization.py => 3optimization.py +62 -34
@@ 1,56 1,84 @@
import pdb

import numpy as np

import pandas as pd
# operating-variable metadata sheet (附件四): variable id, value range, etc.
action_data = pd.read_excel('./data/附件四:354个操作变量信息.xlsx', index_col=0)
feature = pd.read_excel('./data/feature.xlsx', sheet_name='Sheet1')['name'].tolist()  # NOTE(review): immediately overwritten below — looks like a stale diff line
feature = pd.read_excel('./data/feature.xlsx', sheet_name='Sheet1')['RON_LOSS_AND_S'].dropna(axis=0, how='any').tolist()
# sample data, with the timestamp column dropped
df = pd.read_excel('./data/325.xlsx', index_col=0, sheet_name='Sheet2').drop(['时间'] ,axis=1)

# positions (within `feature`) of features the optimizer must hold fixed
unmodified_feature_index = [5]
unmodified_feature = [ feature[i] for i in unmodified_feature_index ]
modified_feature = [ feature[i] for i in range(0, len(feature)) if not i in unmodified_feature_index]

import re
# NOTE(review): `normalzation_record` (typo) appears superseded by
# `normalization_record` below — only the latter is read by
# calculate_minimum; confirm before removing.
normalzation_record = [
    {}, # min
    {}  # max
]
normalization_record = {}  # feature name -> (mean, std), used to de-normalize GA results
def get_var_range(index):
    """Return normalized [lo, hi] bounds for operating variable *index*.

    Parses the variable's raw range string from ``action_data`` and z-score
    normalizes the endpoints against column ``index`` of ``df``, recording
    (mean, std) in ``normalization_record`` for later de-normalization.

    NOTE(review): this body looks like a merged diff — two versions of
    ``num_range`` and ``norm_range`` coexist; the later definitions shadow
    the earlier ones. Reconcile against VCS history before editing.
    """
    # raw text such as "(30-50)" from the 取值范围 (value range) column
    str_range = action_data[action_data['位号'] == index].iloc[0]['取值范围']
    # strip both ASCII and full-width parentheses
    str_range = re.sub(r'\(|\)|(|)', '', str_range)
    # replace the first separator '-' (not a leading minus sign) with '/'
    str_range = re.sub(r'(?<=-)?(?<=\d)+-', '/', str_range, 1)
    num_range = list(map(int, str_range.split('/')))  # stale: superseded by the float version below
    def norm_range(idx):  # stale first version — shadowed by the second norm_range
        num = num_range[idx]
        df_ = df[index].copy()
        df_.loc[len(df) + 1] = num
        mean = df_.mean()
        std = df_.std()
        normalzation_record[idx][index] = {
            'mean': mean,
            'std':  std
        }
    num_range = list(map(float, str_range.split('/')))
    def norm_range(num):
        # z-score normalize one endpoint with the column's statistics, and
        # remember (mean, std) so calculate_minimum can undo it later
        mean = df[index].mean()
        std = df[index].std()
        global normalization_record
        normalization_record[index] = (mean, std)
        return (num - mean) / std
    num_range = list(map(norm_range , range(0, len(num_range))))  # NOTE(review): normalizes the *indices* 0..1, and the next line then normalizes again — looks like a stale hunk; confirm
    num_range = list(map(norm_range , num_range))
    return num_range

from keras.models import load_model
model = load_model('./data/modelweight.model')  # NOTE(review): immediately replaced below — looks like a stale diff line
model = load_model('./data/RON_LOSS_AND_S.h5')  # two-output model: (RON loss, sulfur)

df_ron_loss = df['RON损失']
# best observed RON loss, expressed in normalized (z-score) units
min_ron_loss = (df_ron_loss.min() - df_ron_loss.mean()) / df_ron_loss.std()  # NOTE(review): opposite sign of the next line — appears superseded; confirm
min_ron_loss = (df_ron_loss.mean() - df_ron_loss.min()) / df_ron_loss.std()
df_s = df['_硫含量']
# sulfur limit of 5 expressed in normalized units (distance from the mean)
norm_s = abs(df_s.mean() - 5) / df_s.std()

def fitness(args):
    """GA objective: negative RON-loss reduction rate for candidate *args*.

    NOTE(review): this definition is shadowed by the closure-producing
    ``fitness(fix_args)`` defined later in the file; it appears to be a
    stale hunk kept by the merged diff.

    :param args: normalized operating-variable vector fed to the model.
    :return: negated reduction rate (the GA minimizes, so better = lower).
    """
    # step1: calculate target value
    pred_ron_loss = model.predict(np.array([args]))[0][0]
    # step2: check sulfur
    # TODO?
    # step3: compute RON LOSS RATE
    # BUG FIX: the original expression `abs(pred_ron_loss - min_ron_loss / min_ron_loss)`
    # divides first, reducing to `abs(pred_ron_loss - 1)`. Parenthesize the
    # difference so this is the relative deviation from the best observed loss.
    loss_reduce_rate = abs((pred_ron_loss - min_ron_loss) / min_ron_loss)
    return -loss_reduce_rate
def merge_feature_args(unmodified, modified, indices=None):
    """Splice fixed feature values back into an optimized feature vector.

    Mutates *modified* in place, inserting ``unmodified[i]`` at position
    ``indices[i] + 1`` for each ``i`` (the original hard-coded behavior).

    :param unmodified: values of the features held fixed by the optimizer.
    :param modified: list of optimized feature values; modified in place.
    :param indices: positions of the fixed features; defaults to the
        module-level ``unmodified_feature_index`` for backward compatibility.

    NOTE(review): inserting at ``index + 1`` (rather than ``index``) looks
    like a possible off-by-one when reconstructing the model's training
    feature order — confirm against how ``feature`` was split above.
    """
    if indices is None:
        indices = unmodified_feature_index
    for i in range(len(indices)):
        modified.insert(indices[i] + 1, unmodified[i])

def fitness(fix_args):
    """Build a GA objective with the fixed feature values *fix_args* baked in.

    The returned callable accepts the optimized (modifiable) feature vector,
    reinserts the fixed values, queries the two-output model for
    (RON loss, sulfur), and returns the negated loss-reduction rate
    (the GA minimizes, so larger reductions score better).
    """
    def objective(candidate):
        full_args = candidate.tolist()
        merge_feature_args(fix_args, full_args)
        # step1: query the model — output is (predicted RON loss, predicted sulfur)
        # pdb.set_trace()
        prediction = model.predict(np.array([full_args]))[0].tolist()
        pred_loss, pred_sulfur = prediction
        # step2: penalize candidates whose sulfur exceeds the normalized limit
        # while the predicted loss is negative
        if pred_loss < 0 and pred_sulfur > norm_s:
            return 1
        # step3: reduction rate relative to the best observed loss, negated
        return -((min_ron_loss - pred_loss) / min_ron_loss)
    return objective

from sko.GA import GA
# normalized [lo, hi] bounds for every selected feature
feature_range = list(map(get_var_range, feature))
# NOTE(review): this GA call passes the factory `fitness` itself (not
# `fitness(fix_args)`) and hard-codes n_dim=11 — it matches the earlier
# single-output setup and looks like a stale hunk superseded by
# calculate_minimum below; confirm against VCS history.
ga = GA(func=fitness, n_dim=11, size_pop=50, max_iter=800,
        lb=[r[0] for r in feature_range],
        ub=[r[1] for r in feature_range],
        precision=1e-7)
best_x, best_y = ga.run()
print('best_x:', best_x, '\n', 'best_y:', best_y)
# normalized [lo, hi] bounds for each optimizable (non-fixed) feature
modified_feature_range = list(map(get_var_range, modified_feature))
def calculate_minimum(rowindex):
    """Optimize the modifiable features of sample *rowindex* with a GA.

    Holds the sample's fixed features constant, searches the normalized
    bounds of the remaining features, prints the GA result plus the model's
    predicted RON loss, and returns fixed values followed by the
    de-normalized optimal values.
    """
    sample = df.iloc[rowindex]
    fixed_values = sample[unmodified_feature].values.tolist()

    lower_bounds = [bounds[0] for bounds in modified_feature_range]
    upper_bounds = [bounds[1] for bounds in modified_feature_range]
    optimizer = GA(func=fitness(fixed_values),
                   size_pop=50, max_iter=800,
                   n_dim=len(modified_feature),
                   lb=lower_bounds,
                   ub=upper_bounds,
                   precision=1e-7)
    best_x, best_y = optimizer.run()
    print('best_x:', best_x, '\n', 'best_y:', best_y)
    # undo the z-score normalization per feature: x * std + mean
    denormalized = []
    for position, value in enumerate(best_x):
        mean, std = normalization_record[modified_feature[position]]
        denormalized.append(value * std + mean)
    print('original x:', denormalized)
    # rebuild the full (normalized) feature vector and report the model's view
    full_vector = best_x.tolist()
    merge_feature_args(fixed_values, full_vector)
    print('predicted ron loss: ', model.predict(np.array([full_vector]))[0][0])
    return fixed_values + denormalized

# run the per-row GA optimization over every sample row
# (`optimzation` [sic] keeps the original, misspelled module-level name)
optimzation = list(map(calculate_minimum, range(0, len(df))))

# column order mirrors calculate_minimum's return: fixed features first
df_original_best_x = pd.DataFrame(optimzation, columns = unmodified_feature + modified_feature)

df_original_best_x.to_csv('./data/optimization.csv')

import matplotlib.pyplot as plt

A ShubinSong/RL_S_Test.py => ShubinSong/RL_S_Test.py +36 -0
@@ 0,0 1,36 @@
import units as units  # NOTE(review): unused import — possibly a typo; confirm before removing
import pandas as pd
import tensorflow as tf
import numpy as np
from keras.models import load_model
import h5py

data_path = '325.xlsx'
feature_path = 'feature.xlsx'
model_path = 'RON_LOSS_AND_S.h5'
Label_MIX_name = 'RON_LOSS_AND_S'

# load sample data (drop id/time columns) and the selected feature names
data = pd.read_excel(data_path, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) 
feature = pd.read_excel(feature_path, sheet_name='Sheet1')[Label_MIX_name].dropna(axis=0, how='any').tolist()
model = load_model(model_path)

# z-score normalize using the full dataset's statistics
data_mean = data.mean()  
data_std = data.std() 
data_train = (data - data_mean)/data_std
x_train = data_train[feature].values

y = model.predict(x_train)

# de-normalize both model outputs back to original units
y_ron = y[:,0]* data_std['RON损失'] + data_mean['RON损失']
y_s = y[:,1]* data_std['_硫含量'] + data_mean['_硫含量']
    
RON_LOSS = pd.DataFrame(y_ron)[0].values
S = pd.DataFrame(y_s)[0].values

print(RON_LOSS)
print(S)






A ShubinSong/RON_LOSS_AND_S.h5 => ShubinSong/RON_LOSS_AND_S.h5 +0 -0
A ShubinSong/RON_LOSS_AND_S_Train.py => ShubinSong/RON_LOSS_AND_S_Train.py +104 -0
@@ 0,0 1,104 @@
import pandas as pd
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam,SGD
import numpy as np
import os
import math
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.utils import shuffle

# clear stale TensorBoard logs before training
ls = os.listdir('./log')
for i in ls:
	c_path=os.path.join('./log',i)
	os.remove(c_path)

inputfile = '325.xlsx'   # Excel input (sample data)
featurefile = 'feature.xlsx' # Excel holding the selected feature names
modelfile = 'RON_LOSS_AND_S.h5' # where the trained network weights are saved

da = pd.read_excel(inputfile, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) # read the Excel sheet into a DataFrame (id/time columns dropped)
data = shuffle(da).reset_index(drop=True)  # shuffle rows before the train/test split
feature = pd.read_excel(featurefile, sheet_name='Sheet1')['RON_LOSS_AND_S'].dropna(axis=0, how='any').tolist()

# two regression targets: RON loss and sulfur content
label = ['RON损失','_硫含量']
label_str_1 = 'RON损失'
label_str_2 = '_硫含量'

data_train = data.loc[range(0,260)].copy() # rows 0..259 form the training set (original comment said 0..520 — out of date)
data_test_0 = data.loc[range(259,324)].copy()  # NOTE(review): row 259 appears in both train and test — confirm the overlap is intended

data_mean = data_train.mean()  
data_std = data_train.std() 
data_train = (data_train - data_mean)/data_std # z-score standardization
x_train = data_train[feature].values # feature matrix
y_train = data_train[label].values # label matrix

# the test split is standardized with its own statistics
test_mean = data_test_0.mean()
test_std = data_test_0.std() 
data_test = (data_test_0 - test_mean)/test_std
x_test = data_test[feature].values

y_test_RON = data_test_0[label[0]].values
y_test_s = data_test_0[label[1]].values

batch_size = 65
epochs = 6000
learning_rate = 0.004

model = Sequential()  # feed-forward (BP) network
model.add(Dense(40,input_dim=len(feature),kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(40,input_dim=40,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(40,input_dim=40,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(20,input_dim=40,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(10,input_dim=20,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(2,input_dim=10))  # output layer: (RON loss, sulfur)
adam = Adam(learning_rate=learning_rate)
model.compile(loss='mean_absolute_error', optimizer=adam) # compile the model
model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size,callbacks=[TensorBoard(log_dir='./log')]) # train
model.save(modelfile) # save the trained model


# predict on the test split and de-normalize both outputs
y = model.predict(x_test) 
y_ron = y[:,0]* test_std[label_str_1] + test_mean[label_str_1]
y_s = y[:,1]* test_std[label_str_2] + test_mean[label_str_2]


y_test_RON = pd.DataFrame(y_test_RON)[0].tolist()
y_ron = pd.DataFrame(y_ron)[0].tolist()

score_ron = mean_squared_error(y_test_RON,y_ron)
score2_ron = r2_score(y_test_RON,y_ron)

print('RON_LOSS 均方差:' + str(score_ron))
print('RON_LOSS R2:' + str(score2_ron))

y_test_s = pd.DataFrame(y_test_s)[0].tolist()
y_s = pd.DataFrame(y_s)[0].tolist()

score_s = mean_squared_error(y_test_s,y_s)
score2_s = r2_score(y_test_s,y_s)

print('S_均方差:' + str(score_s))
print('S_R2:' + str(score2_s))

data_pre = pd.DataFrame({
    'RON_LOSS': y_test_RON,
    'RON_LOSS_PRE':y_ron})

data_pre_s = pd.DataFrame({
    'S': y_test_s,
    'S_PRED':y_s})

#6 plot predicted vs. actual for both outputs
import matplotlib.pyplot as plt 
p = data_pre[['RON_LOSS','RON_LOSS_PRE']].plot(style=['b-o','r-*'])
p = data_pre_s[['S','S_PRED']].plot(style=['b-o','r-*'])
plt.show()
\ No newline at end of file

M ShubinSong/de_dim.py => ShubinSong/de_dim.py +65 -33
@@ 2,6 2,7 @@
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import numpy as np
import pandas as pd
 


@@ 11,52 12,83 @@ data = pd.read_excel(inputfile,index='Date', sheet_name='Sheet2')
X = data.drop(['样本编号','时间','_硫含量','_RON','RON损失'],axis=1) 
X = pd.get_dummies(X)

# NOTE(review): the remainder of this span reads like a merged diff — old
# top-level statements and the new de_dim() function body are interleaved
# at mixed indentation, so it is not valid Python as shown; reconcile
# against the VCS history before editing.
Y = data['_RON'] # single label, i.e. the value to predict
Y1 = data['_RON'] # single label, i.e. the value to predict
Y2 = data['RON损失']
Y3 = data[['_硫含量','_RON']]
Y4 = data['_硫含量']
Y5 = data[['_硫含量','RON损失']]

model = RandomForestRegressor(random_state=1, max_depth=10)
model.fit(X, Y)
def de_dim(X,Y):
    # rank features by random-forest importance, filter by variance,
    # then drop highly (Spearman) correlated ones
    model = RandomForestRegressor(random_state=10, max_depth=100)
    model.fit(X, Y)

features = X.columns
importances = model.feature_importances_
    features = X.columns
    importances = model.feature_importances_

df = pd.DataFrame({
    'name':features,
    'importances':importances
})
    df = pd.DataFrame({
        'name':features,
        'importances':importances
    })

df.sort_values(by="importances" , ascending=False,inplace=True)
    df.sort_values(by="importances" , ascending=False,inplace=True)

result0 = df[0:60]['name'].tolist()
    df.to_excel('forest.xlsx')

XX = data[result0]
    result0 = df[0:100]['name'].tolist()

XX = XX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
 
var = XX.var()# variance of each column
cols = XX.columns
col = [ ]
for i in range(0,len(var)):
    if var[i]>=0.04:   # variance threshold (original comment said 0.001 — out of date)
        col.append(cols[i])
    XX = data[result0]

    XX = XX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    
    var = XX.var()# variance of each column
    cols = XX.columns
    col = [ ]
    for i in range(0,len(var)):
        if var[i]>=0.05:   # variance threshold (original comment said 0.001 — out of date)
            col.append(cols[i])

    XXX = data[col]
    XXX = XXX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

    cor2 = XXX.corr(method='spearman')
    result1 = []
    for index_y,item in cor2.iterrows():
        for index_x, row in item.iteritems():
            if abs(row) > 0.6 :
                break
            if index_x not in result1:
                result1.append(index_x)  
    # print(cor2)          
    return result1

XXX = data[col]
XXX = XXX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
f1 = de_dim(X,Y1)
f2 = de_dim(X,Y2)
f3 = de_dim(X,Y3)
f4 = de_dim(X,Y4)
f5 = de_dim(X,Y5)

cor2 = XXX.corr()
df1 = pd.DataFrame({'_RON':f1})
df2 = pd.DataFrame({'RON_LOSS':f2})
df3 = pd.DataFrame({'RON_AND_S':f3})
df4 = pd.DataFrame({'_S':f4})
df5 = pd.DataFrame({'RON_LOSS_AND_S':f5})
    

print(cor2)

result1 = []
for index_y, item in cor2.iterrows():
    for index_x, row in item.iteritems():
        if abs(row) > 0.4 :
            break
        if index_x not in result1:
            result1.append(index_x)
# d1 = pd.DataFrame({'MIX':f1})
d2 = pd.DataFrame({'MIX':f2})
# d3 = pd.DataFrame({'MIX':f3})
d4 = pd.DataFrame({'MIX':f4})

print(result1,len(result1))
d_mix = pd.concat([d2,d4], axis=0).drop_duplicates().reset_index(drop=True)

df = pd.concat([df1, df2, df3 ,df4 ,df5, d_mix], axis=1)

df = pd.DataFrame({'name':result1})
df.to_excel('feature.xlsx')
print(df)

# a = [x for x in f1 if x in f4]

# print(a)