~vonfry/cpipc-2020

86acbc1e37e4bdef9bbfe8a36c4f89ab0df6c1dc — Vonfry 2 years ago 5b123f4 + 062b245 v0.0.1
Merge branch 'release/v0.0.1'
A .gitignore => .gitignore +553 -0
@@ 0,0 1,553 @@

# Created by https://www.toptal.com/developers/gitignore/api/windows,macos,linux,r,python,latex,office
# Edit at https://www.toptal.com/developers/gitignore?templates=windows,macos,linux,r,python,latex,office

### LaTeX ###
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# todonotes
*.tdo

# vhistory
*.hst
*.ver

# easy-todo
*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

### LaTeX Patch ###
# LIPIcs / OASIcs
*.vtc

# glossaries
*.glstex

### Linux ###
*~

# temporary files which can be created if a process still has a handle open on a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

### R ###
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# User-specific files
.Ruserdata

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# R Environment Variables
.Renviron

### R.Bookdown Stack ###
# R package: bookdown caching files
/*_files/

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/windows,macos,linux,r,python,latex,office

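# Project-specific files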
data/
output/
*.xlsx
*.csv
log/
*.model

A 0preprocess.r => 0preprocess.r +68 -0
@@ 0,0 1,68 @@
library(openxlsx)

original_data <- read.xlsx("./data/附件一:325个样本数据.xlsx",
                           rowNames = T,
                           rows = c(2, 4 : 400))

quest1.s285 <- read.xlsx("./data/附件三:285号和313号样本原始数据.xlsx",
                         rowNames = T,
                         rows = c(2, 4 : 43), sheet = 5)

quest1.s313 <- read.xlsx("./data/附件三:285号和313号样本原始数据.xlsx",
                         rowNames = T,
                         rows = c(2, 45 : 84), sheet = 5)

quest1.material <- read.xlsx("./data/附件三:285号和313号样本原始数据.xlsx",
                             rowNames = T,
                             sheet = 1)

quest1.production <- read.xlsx("./data/附件三:285号和313号样本原始数据.xlsx",
                               rowNames = T,
                               rows = c(1, 3:4),
                               sheet = 2)

quest1.adsorbent.wait <- read.xlsx("./data/附件三:285号和313号样本原始数据.xlsx",
                                   rowNames = T,
                                   rows = c(1, 3:4),
                                   sheet = 3)

quest1.adsorbent.reproduce <- read.xlsx("./data/附件三:285号和313号样本原始数据.xlsx",
                                        rowNames = T,
                                        rows = c(1, 3:4),
                                        sheet = 4)

action_data <- read.xlsx("./data/附件四:354个操作变量信息.xlsx",
                         rowNames = T)

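# quest1.fun.filter drops every row of a sample sheet in which any operating
# variable falls outside the min-max range observed across the 325 samples;
# the timestamp column and two explicitly excluded tags are skipped.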
quest1.fun.filter <- function(df) {
    reduce_fn <- function(dff, colname) {
        value_range <- c(min(original_data[, colname]),
                         max(original_data[, colname]))
        # use the accumulated frame dff, not the original df, so the logical
        # index stays aligned after earlier columns have removed rows
        dfc <- dff[[colname]]
        cond <- dfc >= value_range[1] & dfc <= value_range[2]
        if (any(!cond)) {
            message(colname, " is not satisfied: ",
                    sum(!cond), " row(s) out of range.")
        }
        dff[cond, ]
    }
    Reduce(
        reduce_fn,
        colnames(df)[! colnames(df) %in% c("时间",
                                           "S-ZORB.PT_7503.DACA",
                                           "S-ZORB.TE_3111.DACA")],
        init = df
    )
}

message("filter 285")
quest1.fun.filter(quest1.s285) -> quest1.s285_filtered
message("filter 313")
quest1.fun.filter(quest1.s313) -> quest1.s313_filtered

quest1.s285_mean <- lapply(quest1.s285_filtered, mean)
quest1.s313_mean <- lapply(quest1.s313_filtered, mean)

write.csv(quest1.s285_filtered, "./data/s285_filtered.csv")
write.csv(quest1.s313_filtered, "./data/s313_filtered.csv")
write.csv(quest1.s285_mean, "./data/s285_mean.csv")
write.csv(quest1.s313_mean, "./data/s313_mean.csv")

A 1dimension_reduce_cor_and_var.r => 1dimension_reduce_cor_and_var.r +27 -0
@@ 0,0 1,27 @@
library(openxlsx)

processed <- read.xlsx("./data/processed.xlsx", rowNames = T)

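# Single-variable screening: score each of the 366 operating variables by its
# correlation with RON loss and by its variance, as a first dimensionality
# reduction step.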
# correlation
cor_cols <- colnames(processed)[3:368]
cor_processed <- lapply(
    cor_cols,
    function(col) {
        # cor() accepts a single method; passing c("pearson", ...) silently
        # uses only the first entry, so name it explicitly
        cor(processed["RON损失(不是变量)"],
            processed[col],
            method = "pearson")
    }
)
names(cor_processed) <- cor_cols  # label the output columns

write.csv(cor_processed, "./data/processed_cor.csv")
write.xlsx(cor_processed, "./data/processed_cor.xlsx")

# Variance
var_cols <- colnames(processed)[3:368]
var_processed <- lapply(
    var_cols,
    function(col) {
        var(processed[[col]])
    }
)
names(var_processed) <- var_cols  # label the output columns

write.csv(var_processed, "./data/processed_var.csv")
write.xlsx(var_processed, "./data/processed_var.xlsx")

A 1dimension_reduce_pca.r => 1dimension_reduce_pca.r +61 -0
@@ 0,0 1,61 @@
library(openxlsx)

processed <- read.xlsx("./data/processed.xlsx", rowNames = T)

prepca <- processed[, ! colnames(processed) %in% c("时间", "_辛烷值RON", "RON损失(不是变量)")]
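# scale = TRUE standardizes each variable before the rotation and
# rank. = 24 keeps only the first 24 principal components.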
pca <- prcomp(prepca, scale = TRUE, rank. = 24)

library(factoextra)
jpeg("./output/pca.jpg", width = 1360, height = 1360)
fviz_eig(pca)
dev.off()

jpeg("./output/pca-acc-contrib.jpg", width = 1360, height = 1360)
eig <- get_eig(pca)
eig_part <- eig[1:8,]
ggplot(eig_part,
       aes(x = rownames(eig_part),
           y = cumulative.variance.percent,
           group = 1)) +
    geom_bar(stat="identity",
             fill= "steelblue",
             color = "steelblue"
             ) +
    geom_line(color = "black", linetype = "solid") +
    geom_point(shape = 19, color = "black") +
    labs(title = "Scree plot",
         x = "Dimensions",
         y = "cumulative.variance.percent")
dev.off()

jpeg("./output/pca-ind.jpg", width = 1360, height = 1360)
fviz_pca_ind(pca,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )
dev.off()

jpeg("./output/pca-biplot.jpg", width = 1360, height = 1360)
fviz_pca_biplot(pca, repel = TRUE,
                col.var = "#2E9FDF", # Variables color
                col.ind = "#696969"  # Individuals color
                )
dev.off()

jpeg("./output/pca-contrib.jpg", width = 1360, height = 1360)
fviz_pca_var(pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )
dev.off()

write.csv(pca$x, "./data/data-after-pca.csv")

# non-linear pca
# library(pcaMethods)
# nlpca <- nlpca(prepca, nPcs = 24, maxSteps = 2 * prod(dim(prepca)))


# use random forest to select features

A 1dimension_reduce_randomforest.r => 1dimension_reduce_randomforest.r +17 -0
@@ 0,0 1,17 @@
library(openxlsx)

processed <- read.xlsx("./data/processed.xlsx", rowNames = T)

library(randomForest)

prerf <- processed[, ! colnames(processed) %in% c("时间", "_辛烷值RON", "RON损失(不是变量)")]
# prerf[, "RONL"] <- processed[,"RON损失(不是变量)"]

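# Fit a regression forest with RON loss as the response and rank the
# operating variables by importance (IncNodePurity for regression forests).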
rf <- randomForest(x = prerf, y = processed[, "RON损失(不是变量)"])
imp <- importance(rf)
ordered_imp <- imp[order(imp, decreasing = T),]

write.csv(ordered_imp, "./data/features_importance.csv")

# chi test
library(MASS)

A 2predict.r => 2predict.r +43 -0
@@ 0,0 1,43 @@
library(openxlsx)
read.xlsx("./data/processed.xlsx") -> processed

# method 1, linear regression
# lm_result <- lm(RONL ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
#                   PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 +
#                   PC20 + PC21 + PC22 + PC23 + PC24,
#               processed)

# method 2: BP neural network
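# train_and_test standardizes the inputs, holds out a random 20% of rows,
# fits a single-hidden-layer network (10 units) on RON loss, and reports
# train/test MSE and R2.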
train_and_test <- function(df, name = '') {
    smp_size <- floor(0.8 * nrow(df))
    set.seed(2020)
    train_ind <- sample(seq_len(nrow(df)), size = smp_size)

    dfs <- scale(df, scale = T)
    dfsdf <- as.data.frame(dfs)
    dfsdf[, "RONL"] <- processed["RON损失(不是变量)"]

    train <- dfsdf[train_ind, ]
    test <- dfsdf[-train_ind, ]

    library(neuralnet)
    net <- neuralnet(RONL ~ .,
                     train, hidden = 10, rep = 100, linear.output = T)
    saveRDS(net, sprintf("./data/bpnet_%s.rds", name))
    # compute() expects only the covariates, so drop the response column
    covars <- setdiff(names(train), "RONL")
    compute(net, train[, covars]) -> train_result
    compute(net, test[, covars]) -> test_result
    library(MLmetrics)
    message("train MSE:", MSE(train_result$net.result, train$RONL))
    message("train R2: ", R2_Score(train_result$net.result, train$RONL))
    message("test MSE: ", MSE(test_result$net.result, test$RONL))
    message("test R2:  ", R2_Score(test_result$net.result, test$RONL))
}

df <- read.csv("./data/data-after-pca.csv")
train_and_test(df, 'pca_24')

# another pca data set
df2 <- read.xlsx("./data/主成分数据(1).xlsx", cols = seq(3, 25), sheet = "主成分数据表1")

train_and_test(df2, 'pca_k')

A 3optimization.py => 3optimization.py +149 -0
@@ 0,0 1,149 @@
import pdb

import numpy as np

import pandas as pd
action_data = pd.read_excel('./data/附件四:354个操作变量信息.xlsx', index_col=0)
feature = pd.read_excel('./data/feature.xlsx', sheet_name='Sheet1')['RON_LOSS_AND_S'].dropna(axis=0, how='any').tolist()
df = pd.read_excel('./data/325.xlsx', index_col=0, sheet_name='Sheet2').drop(['时间'] ,axis=1)

unmodified_feature_index = [5]
unmodified_feature = [ feature[i] for i in unmodified_feature_index ]
modified_feature = [ feature[i] for i in range(0, len(feature)) if i not in unmodified_feature_index ]

import re
normalization_record = {}
def get_var_range(index):
    str_range = action_data[action_data['位号'] == index].iloc[0]['取值范围']
    # strip half- and full-width parentheses around the range
    str_range = re.sub(r'[()（）]', '', str_range)
    # replace the first '-' that follows a digit with '/', so a leading minus
    # sign on the lower bound survives (e.g. "-5-10" -> "-5/10")
    str_range = re.sub(r'(?<=\d)-', '/', str_range, 1)
    num_range = list(map(float, str_range.split('/')))
    def norm_range(num):
        mean = df[index].mean()
        std = df[index].std()
        global normalization_record
        normalization_record[index] = (mean, std)
        return (num - mean) / std
    num_range = list(map(norm_range , num_range))
    return num_range

from keras.models import load_model
model = load_model('./data/RON_LOSS_AND_S.h5')

df_ron_loss = df['RON损失']
df_ron_loss_norm = (df_ron_loss - df_ron_loss.mean()) / df_ron_loss.std()
norm_ron_loss_zero = (0 - df_ron_loss.mean()) / df_ron_loss.std()
df_s = df['_硫含量']
norm_s = (5 - df_s.mean()) / df_s.std()
norm_s_zero = (0 - df_s.mean()) / df_s.std()

def merge_feature_args(unmodified, modified):
    for i in range(0, len(unmodified_feature_index)):
        modified.insert(unmodified_feature_index[i], unmodified[i])

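# GA fitness (sko's GA minimizes): penalize solutions whose predicted RON
# loss is negative or whose predicted sulfur leaves the allowed band, return
# -rate for RON-loss reduction rates between 30% and 80%, and return 0 for
# rates above 80% (apparently treated as implausible).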
def fitness(fix_args, rowindex):
    def fitness_fun(args):
        pred_args = args.tolist()
        merge_feature_args(fix_args, pred_args)
        # step1: calculate target value
        # pdb.set_trace()
        (pred_ron_loss, pred_s) = model.predict(np.array([pred_args]))[0].tolist()
        # step2: check sulfur
        if pred_ron_loss < norm_ron_loss_zero:
            return 2
        if not (pred_s > norm_s_zero and pred_s <= norm_s):
            return 1
        # step3: compute RON LOSS RATE
        ron_loss = df_ron_loss_norm.iloc[rowindex]
        loss_reduce_rate = (ron_loss - pred_ron_loss ) / ron_loss
        if loss_reduce_rate > 0.8 :
            return 0
        if loss_reduce_rate < 0.3:
            return 1
        return -loss_reduce_rate
    return fitness_fun

from sko.GA import GA
modified_feature_range = list(map(get_var_range, modified_feature))
calcount = 0
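# For every sample, search the modifiable operating variables within their
# documented ranges for the setting the network predicts will minimize RON
# loss subject to the sulfur constraint.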
def calculate_minimum(rowindex):
    global calcount
    print('\ncal count: ', calcount)
    row = df.iloc[rowindex]
    fix_args = row[unmodified_feature]
    fix_means = df[unmodified_feature].mean()
    fix_std = df[unmodified_feature].std()
    norm_fix_args = ((fix_args - fix_means) / fix_std).values.tolist()  # standardize: divide by std, not multiply

    ga = GA(func=fitness(norm_fix_args, rowindex),
            size_pop=50, max_iter=800,
            n_dim=len(modified_feature),
            lb=[r[0] for r in modified_feature_range],
            ub=[r[1] for r in modified_feature_range],
            precision=1e-3)
    best_x, best_y = ga.run()
    print('best_x:', best_x, '\n', 'best_y:', best_y)
    original_best_x = [ best_x[i] * normalization_record[modified_feature[i]][1] + normalization_record[modified_feature[i]][0] for i in range(0, len(best_x))]
    print('original x:', original_best_x)
    best_x_dup = best_x.tolist()
    merge_feature_args(norm_fix_args, best_x_dup)
    pred_ron_loss =  model.predict(np.array([ best_x_dup ]))[0][0]
    print('predicted ron loss: ', pred_ron_loss)
    calcount += 1
    return fix_args.tolist() + original_best_x

optimization = list(map(calculate_minimum, range(0, len(df))))

df_original_best_x = pd.DataFrame(optimization, columns = unmodified_feature + modified_feature)

df_original_best_x.index = df.index

df_original_best_x.to_csv('./data/optimization.csv')

diff = df[feature] - df_original_best_x
diff.to_csv('./data/optimization-diff.csv')

diff_mean = diff.mean()
diff_mean.to_csv('./data/optimization-diff-mean.csv')

diff_norm = (diff - diff_mean) / diff.std()

# df_original_best_x = pd.read_csv('./data/optimization.csv', index_col = 0)
df_original_best_x_norm = (df_original_best_x - df_original_best_x.mean()) / df_original_best_x.std()

df_norm = (df - df.mean()) / df.std()

df_133_ron_loss_norm = df_norm.iloc[132]['RON损失']
df_std_s = df.std()['_硫含量']
df_mean_s = df.mean()['_硫含量']
df_std_ron_loss = df.std()['RON损失']
df_mean_ron_loss = df.mean()['RON损失']
pred_opted_133 = model.predict(np.array([df_original_best_x_norm[feature].iloc[132]]))[0]
optimization_133_analysis = pd.DataFrame({
    'optimized_s': pred_opted_133[1],
    'optimized_ron_loss': pred_opted_133[0],
    'ron_loss_rate': (pred_opted_133[0] - df_133_ron_loss_norm) / df_133_ron_loss_norm
}, index=[0])  # an all-scalar dict needs an explicit index
optimization_133_analysis.to_csv('./data/optimization_133_analysis.csv')

import matplotlib.pyplot as plt

fig_diff, axe_diff = plt.subplots()
axe_diff.boxplot(diff_norm.drop(['饱和烃'], axis=1).T,
                 vert=True,
                 patch_artist=True,
                 labels=[ 'D' + str(i+1) for i in range(0, len(modified_feature))])
axe_diff.hlines(0, 0.5, 16.5, colors='C8', zorder=3)
fig_diff.savefig('./output/optimization-diff.jpg')

fig_133, axe_133 = plt.subplots()
width_133 = 0.35
x = np.arange(1, len(feature) + 1)
rects1 = axe_133.bar(x - width_133/2, df_norm[feature].iloc[132].values, width_133, label='Original')
rects2 = axe_133.bar(x + width_133/2, df_original_best_x_norm[feature].iloc[132].values, width_133, label='Optimized')
axe_133.set_xticks(x)
axe_133.set_xticklabels([ 'D' + str(i + 1) for i in range(0, len(feature))])
axe_133.set_ylabel('norm values')
axe_133.legend()
fig_133.tight_layout()
fig_133.savefig('./output/optimization-133.jpg')

A ShubinSong/FFNN_Test.py => ShubinSong/FFNN_Test.py +24 -0
@@ 0,0 1,24 @@
import units as units
import pandas as pd
import tensorflow as tf
import numpy as np


data_path = '325.xlsx'
feature_path = 'feature.xlsx'
model_path = 'modelweight.model'

data = pd.read_excel(data_path, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) 
feature = pd.read_excel(feature_path, sheet_name='Sheet1')['name'].tolist()
model = tf.keras.models.load_model(model_path)

mse_ron,r2_ron,mse_ron_dlta,r2_ron_dlta = units.FFNN_test(model,data,feature)

print(mse_ron)
print(r2_ron)
print(mse_ron_dlta)
print(r2_ron_dlta)





A ShubinSong/FFNN_Train.py => ShubinSong/FFNN_Train.py +93 -0
@@ 0,0 1,93 @@
import pandas as pd
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam,SGD
import numpy as np
import os
import math
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.utils import shuffle

ls = os.listdir('./log')
for i in ls:
	c_path=os.path.join('./log',i)
	os.remove(c_path)

inputfile = '325.xlsx'   # Excel input
featurefile = 'result.xlsx' # feature list (Excel input)
modelfile = 'modelweight.model' # where the network weights are saved

da = pd.read_excel(inputfile, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) # read the Excel sheet into a pandas DataFrame
data = shuffle(da).reset_index(drop=True)
feature = pd.read_excel(featurefile, sheet_name='Sheet1')['name'].tolist()

label = ['RON损失'] # single label: the value to predict
# label = ['_RON']
label_str = 'RON损失'  # the label name as a plain string

# print(feature)
data_train = data.loc[range(0,260)].copy() # rows 0-259 form the training set
data_test_0 = data.loc[range(260,325)].copy() # rows 260-324 form the test set (no overlap with training)

data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean)/data_std # standardize the data
x_train = data_train[feature].values # feature matrix
y_train = data_train[label].values # labels

test_mean = data_test_0.mean()
test_std = data_test_0.std() 
data_test = (data_test_0 - test_mean)/test_std
x_test = data_test[feature].values
y_test = data_test_0[label].values

batch_size = 65
epochs = 4000
learning_rate = 0.005

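# Network: one 5-unit sigmoid hidden layer and a single linear output,
# trained with Adam on mean squared error.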
model = Sequential()  # sequential model
model.add(Dense(5,input_dim=len(feature),kernel_initializer='uniform')) # hidden layer (Dense = fully connected)
model.add(Activation('sigmoid'))  # activation
# model.add(Dense(4,input_dim=8,kernel_initializer='uniform')) # optional second hidden layer
# model.add(Activation('sigmoid'))
model.add(Dense(1,input_dim=5))  # output layer
adam = Adam(learning_rate=learning_rate)
model.compile(loss='mean_squared_error', optimizer=adam) # compile the model
model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size,callbacks=[TensorBoard(log_dir='./log')],validation_freq = 0) # train the model
model.save(modelfile) # save the model

y = model.predict(x_test) * test_std[label_str] + test_mean[label_str]

y_test = pd.DataFrame(y_test)[0].tolist()
y = pd.DataFrame(y)[0].tolist()

score = mean_squared_error(y_test,y)
score2 = r2_score(y_test,y) # r2_score expects (y_true, y_pred)

print('RON MSE: ' + str(score))
print('RON R2: ' + str(score2))

# ron_loss_pre = data_test_0['RON'].values - y
# ron_loss = data_test_0['RON损失'].values

# score = mean_squared_error(ron_loss,ron_loss_pre)
# score2 = r2_score(ron_loss,ron_loss_pre)

# print('RON_LOSS_均方差:' + str(score))
# print('RON_LOSS_R2:' + str(score2))

# data_pre = pd.DataFrame({
#     'RON': y_test,
#     'RON_PRED':y})

data_pre_loss = pd.DataFrame({
    'RON_LOSS': y_test,
    'RON_LOSS_PRED':y})

# 6. plot the predictions against the actual values
import matplotlib.pyplot as plt 
# p = data_pre[['RON','RON_PRED']].plot(style=['b-o','r-*'])
p = data_pre_loss[['RON_LOSS','RON_LOSS_PRED']].plot(style=['b-o','r-*'])
plt.show()
\ No newline at end of file

A ShubinSong/RL_S_Test.py => ShubinSong/RL_S_Test.py +36 -0
@@ 0,0 1,36 @@
import units as units
import pandas as pd
import tensorflow as tf
import numpy as np
from keras.models import load_model
import h5py

data_path = '325.xlsx'
feature_path = 'feature.xlsx'
model_path = 'RON_LOSS_AND_S.h5'
Label_MIX_name = 'RON_LOSS_AND_S'

data = pd.read_excel(data_path, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) 
feature = pd.read_excel(feature_path, sheet_name='Sheet1')[Label_MIX_name].dropna(axis=0, how='any').tolist()
model = load_model(model_path)

data_mean = data.mean()  
data_std = data.std() 
data_train = (data - data_mean)/data_std
x_train = data_train[feature].values

y = model.predict(x_train)

y_ron = y[:,0]* data_std['RON损失'] + data_mean['RON损失']
y_s = y[:,1]* data_std['_硫含量'] + data_mean['_硫含量']
    
RON_LOSS = pd.DataFrame(y_ron)[0].values
S = pd.DataFrame(y_s)[0].values

print(RON_LOSS)
print(S)






A ShubinSong/RON_LOSS_AND_S.h5 => ShubinSong/RON_LOSS_AND_S.h5 +0 -0
A ShubinSong/RON_LOSS_AND_S_Train.py => ShubinSong/RON_LOSS_AND_S_Train.py +104 -0
@@ 0,0 1,104 @@
import pandas as pd
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam,SGD
import numpy as np
import os
import math
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.utils import shuffle

ls = os.listdir('./log')
for i in ls:
	c_path=os.path.join('./log',i)
	os.remove(c_path)

inputfile = '325.xlsx'   # Excel input
featurefile = 'feature.xlsx' # feature list (Excel input)
modelfile = 'RON_LOSS_AND_S.h5' # where the trained network is saved

da = pd.read_excel(inputfile, sheet_name='Sheet2').drop(['样本编号','时间'],axis=1) # read the Excel sheet into a pandas DataFrame
data = shuffle(da).reset_index(drop=True)
feature = pd.read_excel(featurefile, sheet_name='Sheet1')['RON_LOSS_AND_S'].dropna(axis=0, how='any').tolist()

label = ['RON损失','_硫含量']
label_str_1 = 'RON损失'
label_str_2 = '_硫含量'

data_train = data.loc[range(0,260)].copy() # rows 0-259 form the training set
data_test_0 = data.loc[range(260,325)].copy() # rows 260-324 form the test set (no overlap with training)

data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean)/data_std # standardize the data
x_train = data_train[feature].values # feature matrix
y_train = data_train[label].values # labels

test_mean = data_test_0.mean()
test_std = data_test_0.std() 
data_test = (data_test_0 - test_mean)/test_std
x_test = data_test[feature].values

y_test_RON = data_test_0[label[0]].values
y_test_s = data_test_0[label[1]].values

batch_size = 65
epochs = 6000
learning_rate = 0.004

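# Network: sigmoid hidden layers of 40-40-40-20-10 units feeding a 2-unit
# linear output that predicts RON loss and sulfur content jointly, trained
# with Adam on mean absolute error.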
model = Sequential()  # sequential model
model.add(Dense(40,input_dim=len(feature),kernel_initializer='uniform')) # first hidden layer (Dense = fully connected)
model.add(Activation('sigmoid'))  # activation
model.add(Dense(40,input_dim=40,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(40,input_dim=40,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(20,input_dim=40,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(10,input_dim=20,kernel_initializer='uniform')) # hidden layer
model.add(Activation('sigmoid'))  # activation
model.add(Dense(2,input_dim=10))  # output layer: RON loss and sulfur content
adam = Adam(learning_rate=learning_rate)
model.compile(loss='mean_absolute_error', optimizer=adam) # compile the model
model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size,callbacks=[TensorBoard(log_dir='./log')]) # train the model
model.save(modelfile) # save the model


y = model.predict(x_test) 
y_ron = y[:,0]* test_std[label_str_1] + test_mean[label_str_1]
y_s = y[:,1]* test_std[label_str_2] + test_mean[label_str_2]


y_test_RON = pd.DataFrame(y_test_RON)[0].tolist()
y_ron = pd.DataFrame(y_ron)[0].tolist()

score_ron = mean_squared_error(y_test_RON,y_ron)
score2_ron = r2_score(y_test_RON,y_ron)

print('RON_LOSS MSE: ' + str(score_ron))
print('RON_LOSS R2: ' + str(score2_ron))

y_test_s = pd.DataFrame(y_test_s)[0].tolist()
y_s = pd.DataFrame(y_s)[0].tolist()

score_s = mean_squared_error(y_test_s,y_s)
score2_s = r2_score(y_test_s,y_s)

print('S MSE: ' + str(score_s))
print('S R2: ' + str(score2_s))

data_pre = pd.DataFrame({
    'RON_LOSS': y_test_RON,
    'RON_LOSS_PRE':y_ron})

data_pre_s = pd.DataFrame({
    'S': y_test_s,
    'S_PRED':y_s})

# 6. plot the predictions against the actual values
import matplotlib.pyplot as plt 
p = data_pre[['RON_LOSS','RON_LOSS_PRE']].plot(style=['b-o','r-*'])
p = data_pre_s[['S','S_PRED']].plot(style=['b-o','r-*'])
plt.show()
\ No newline at end of file

A ShubinSong/de_dim.py => ShubinSong/de_dim.py +94 -0
@@ 0,0 1,94 @@

from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import numpy as np
import pandas as pd
 
inputfile = '325.xlsx'   # Excel input
data = pd.read_excel(inputfile, sheet_name='Sheet2')  # read_excel has no `index` kwarg, so drop it

X = data.drop(['样本编号','时间','_硫含量','_RON','RON损失'],axis=1) 
X = pd.get_dummies(X)

Y1 = data['_RON'] # a label to predict
Y2 = data['RON损失']
Y3 = data[['_硫含量','_RON']]
Y4 = data['_硫含量']
Y5 = data[['_硫含量','RON损失']]

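# Three-stage feature selection: (1) keep the 100 variables with the highest
# random-forest importance, (2) drop near-constant columns with a variance
# filter on min-max-scaled values, then (3) a Spearman-correlation pass meant
# to drop variables that are strongly correlated (|rho| > 0.6) with others.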
def de_dim(X,Y):
    model = RandomForestRegressor(random_state=10, max_depth=100)
    model.fit(X, Y)

    features = X.columns
    importances = model.feature_importances_

    df = pd.DataFrame({
        'name':features,
        'importances':importances
    })

    df.sort_values(by="importances" , ascending=False,inplace=True)

    df.to_excel('forest.xlsx')

    result0 = df[0:100]['name'].tolist()

    XX = data[result0]

    XX = XX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    
    var = XX.var()  # variance of each column
    cols = XX.columns
    col = []
    for i in range(0, len(var)):
        if var[i] >= 0.05:   # keep columns whose variance is at least 0.05
            col.append(cols[i])

    XXX = data[col]
    XXX = XXX.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

    cor2 = XXX.corr(method='spearman')
    result1 = []
    for index_y, item in cor2.iterrows():
        for index_x, row in item.items():  # Series.iteritems() is deprecated
            if abs(row) > 0.6:
                break
            if index_x not in result1:
                result1.append(index_x)
    # print(cor2)
    return result1

f1 = de_dim(X,Y1)
f2 = de_dim(X,Y2)
f3 = de_dim(X,Y3)
f4 = de_dim(X,Y4)
f5 = de_dim(X,Y5)

df1 = pd.DataFrame({'_RON':f1})
df2 = pd.DataFrame({'RON_LOSS':f2})
df3 = pd.DataFrame({'RON_AND_S':f3})
df4 = pd.DataFrame({'_S':f4})
df5 = pd.DataFrame({'RON_LOSS_AND_S':f5})
    


# d1 = pd.DataFrame({'MIX':f1})
d2 = pd.DataFrame({'MIX':f2})
# d3 = pd.DataFrame({'MIX':f3})
d4 = pd.DataFrame({'MIX':f4})

d_mix = pd.concat([d2,d4], axis=0).drop_duplicates().reset_index(drop=True)

df = pd.concat([df1, df2, df3 ,df4 ,df5, d_mix], axis=1)

df.to_excel('feature.xlsx')
print(df)

# a = [x for x in f1 if x in f4]

# print(a)



A ShubinSong/units.py => ShubinSong/units.py +27 -0
@@ 0,0 1,27 @@
import pandas as pd
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential
from sklearn.metrics import r2_score,mean_squared_error

def FFNN_test(model, x, feature, label='_RON'):
    mean = x.mean()
    std = x.std()
    x_norm = (x - mean) / std          # standardize the data
    x_test = x_norm[feature].values    # feature matrix (standardized)
    y_test = x[label].values
    y = model.predict(x_test)                 # predictions
    y = y * std[label] + mean[label]          # undo the standardization
    y = pd.DataFrame(y)[0].values

    mse_ron = mean_squared_error(y_test, y)
    r2_ron = r2_score(y_test, y)

    RON = x['_RON'].values
    RON_LOSS = x['RON损失'].values
    RON_dlta = RON - y

    mse_ron_dlta = mean_squared_error(RON_LOSS, RON_dlta)
    r2_ron_dlta = r2_score(RON_LOSS, RON_dlta)

    # FFNN_Test.py unpacks four metrics, so return all of them
    return mse_ron, r2_ron, mse_ron_dlta, r2_ron_dlta
\ No newline at end of file

A doc/paper.ref.txt => doc/paper.ref.txt +2 -0
@@ 0,0 1,2 @@
Kubesh, J., King, S. R., & Liss, W. E. (1992). Effect of Gas Composition on Octane Number of Natural Gas Fuels. SAE Technical Paper Series. doi:10.4271/922359 
Pasadakis, N., Gaganis, V., & Foteinopoulos, C. (2006). Octane number prediction for gasoline blends. Fuel Processing Technology, 87(6), 505–509. doi:10.1016/j.fuproc.2005.11.006 

A readme.md => readme.md +15 -0
@@ 0,0 1,15 @@
# Huawei Cup Mathematical Modeling Contest, 2020

A team of three members. Problem selected:

## Dependencies

see [shell.nix](./shell.nix)

## Data

The data directory is listed in `.gitignore`, so you have to add it to your
checkout yourself:

- `project_dir/data`
- `project_dir/../2020年中国研究生数学建模竞赛赛题`
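
The numbered scripts follow the pipeline order; presumably one runs
`Rscript 0preprocess.r`, then the `1dimension_reduce_*.r` scripts,
`Rscript 2predict.r`, and finally `python 3optimization.py`, all from
inside `nix-shell`.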

A requirements.txt => requirements.txt +1 -0
@@ 0,0 1,1 @@
scikit-opt

A shell.nix => shell.nix +27 -0
@@ 0,0 1,27 @@
{ pkgs ? import <nixpkgs> {} }:

with pkgs;
let tex-combined = texlive.combine {
      inherit (texlive) scheme-medium collection-latexextra
        collection-bibtexextra collection-publishers;
    };
    rDeps = with rPackages; [ R
                              ggplot2 openxlsx
                              randomForest MASS factoextra pcaMethods
                              neuralnet MLmetrics
                              genalg
                            ];
    pythonPackages = python3Packages;
    pythonDeps = with pythonPackages; [ python numpy ipython pandas
                                        matplotlib # scikit-opt
                                        tensorflow Keras scikitlearn
                                        python3Packages.venvShellHook
                                      ];
in mkShell {
  venvDir = ".venv";
  buildInputs = [ tex-combined ] ++ pythonDeps ++ rDeps;

  postShellHook = ''
    pip install -r requirements.txt
  '';
}