import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from keras.utils import to_categorical
# Paths of the two input CSV data files; they are read and stacked together
# below with np.vstack.
fileName1='data1.csv'
fileName2='data2.csv'
# Samples fed per step — the graph below is hard-wired to batch size 1.
batchSize=1
# Train/test split fraction used by dataSplit (80% train, 20% test).
ratio = 0.8
def readFile(fileName):
    """Load a CSV file and return at most its first 2001 rows as an ndarray.

    Parameters
    ----------
    fileName : str
        Path to the CSV file; the first line is treated as a header by pandas.

    Returns
    -------
    numpy.ndarray
        Array built from rows with labels 0..2000 (inclusive) of the frame.
    """
    data = pd.read_csv(fileName)
    # .loc[:2000, :] caps the data at 2001 rows; the original wrapped the
    # result in np.array twice — a single conversion is sufficient.
    return np.array(data.loc[:2000, :])
# Shuffle the dataset (originally also meant to produce batches).
def batch_data(data):
    """Return a row-shuffled copy of ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of samples; rows are permuted as whole records.

    Returns
    -------
    numpy.ndarray
        The same rows in a uniformly random order.
    """
    # np.random.permutation builds the shuffled index in one call (the
    # original created an arange and shuffled it in place).
    order = np.random.permutation(data.shape[0])
    # Fix: the original also computed ``num = np.int(data.shape[0] * ratio)``
    # and threw it away; besides being dead code, np.int was removed in
    # NumPy 1.24, so that line now raises AttributeError. Dropped.
    return data[order]
def Transpose(stringArray):
    """Encode each record's two strings as one flat vector of char codes.

    Parameters
    ----------
    stringArray : numpy.ndarray
        Array whose rows hold at least two strings in columns 0 and 1.

    Returns
    -------
    numpy.ndarray
        Float array; row t is ord() of every character of stringArray[t][0]
        followed by every character of stringArray[t][1].
    """
    rows = []
    for record in stringArray:
        # Fix: the original pre-allocated EVERY row's buffer as
        # np.empty(2 * len(stringArray[0][0])) — twice the length of the very
        # first string of the very first row. That under/over-allocates
        # (IndexError or uninitialised garbage) whenever string lengths vary.
        # Size each row from its own strings instead. Also renamed the local
        # ``list`` which shadowed the builtin.
        codes = [ord(ch) for ch in record[0]] + [ord(ch) for ch in record[1]]
        rows.append(codes)
    # dtype=float matches the original np.empty (float64) output.
    return np.array(rows, dtype=float)
def one_hotEncoder(totaldata, length=None):
    """One-hot encode the concatenation of each row's first two strings.

    Parameters
    ----------
    totaldata : numpy.ndarray
        Array whose rows hold at least two strings in columns 0 and 1.
    length : unused
        Kept only for call compatibility: the original signature named this
        parameter ``len`` (shadowing the builtin) and callers pass the
        builtin ``len`` in positionally. It is ignored.

    Returns
    -------
    numpy.ndarray
        Shape (n, chars, 91); row t holds one one-hot vector per character
        of totaldata[t][0] + totaldata[t][1].
    """
    # Hoisted out of the loop — the 91-symbol vocabulary and its
    # char -> index table are constant across rows.
    alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789.,/?:;|]}[{=+-_)(*&^%$#@!`~ZXCVBNM<>ASDFGHJKLPOIUYTREWQ'
    char_to_int = {c: i for i, c in enumerate(alphabet)}
    encoded = []
    for row in totaldata:
        text = row[0] + row[1]
        one_hot = np.zeros((len(text), len(alphabet)))
        for pos, ch in enumerate(text):
            one_hot[pos][char_to_int[ch]] = 1
        # Fix: the original first appended n zero arrays of shape (60, 91)
        # and THEN appended the n real encodings, yielding a 2n-entry list
        # whose first half was all zeros (and a ragged-shape np.array unless
        # every record was exactly 60 characters long). Only the real
        # encodings are kept here.
        encoded.append(one_hot)
    return np.array(encoded)
def dataSplit(transarray, stringarray, split_ratio=None):
    """Split features and labels into train/test partitions.

    Parameters
    ----------
    transarray : numpy.ndarray
        Encoded feature array; the first axis indexes samples.
    stringarray : numpy.ndarray
        Raw data array; columns from index 2 onward are taken as labels.
    split_ratio : float, optional
        Fraction of samples assigned to the training set. Defaults to the
        module-level ``ratio`` (added as a parameter to generalize the
        hard-coded global; existing two-argument callers are unaffected).

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test).
    """
    if split_ratio is None:
        split_ratio = ratio
    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int truncates identically here.
    num = int(stringarray.shape[0] * split_ratio)
    x_train = transarray[:num, :, ]
    y_train = stringarray[:num, 2:]
    x_test = transarray[num:, :, ]
    y_test = stringarray[num:, 2:]
    return x_train, x_test, y_train, y_test
def active(n):
    """Binarize a sigmoid output: 1 when strictly above 0.5, else 0."""
    return 1 if n > 0.5 else 0
# Training hyper-parameters. Several are never referenced by the code below —
# the graph is hard-wired to batch size 1 and a 60x91 input.
roundT =1                 # number of passes over the training set
learnRateT = 0.001        # Adagrad learning rate
unitCount = 128           # not referenced in the visible code (mirrors hidden_layer_size)
rowCount = 660            # not referenced in the visible code
element_size = 28         # not referenced in the visible code
time_steps = 28           # only passed through builNetWork's return tuple
num_classes =2            # not referenced in the visible code
batch_size = 200          # not referenced — actual batch size is 1
hidden_layer_size = 128   # GRU cell width (output feature count per step)
def builNetWork():
    """Build the TF1 graph: a single GRU layer with a sigmoid scalar readout.

    Returns the placeholders, prediction, train op, loss and RNN tensors
    used by the training/evaluation code below. Requires TensorFlow 1.x
    (tf.placeholder, tf.contrib, tf.nn.dynamic_rnn).
    """
    # time_steps =cellCounts
    # One sample per step: 60 characters, each one-hot over a 91-symbol
    # alphabet; the label is a single scalar.
    x = tf.placeholder(shape=[1,60,91], dtype=tf.float32)
    yTrain = tf.placeholder(shape=[1],dtype=tf.float32)
    rnn_cell = tf.contrib.rnn.GRUCell(hidden_layer_size)  # hidden_layer_size is the per-step output width
    outputs, finalState = tf.nn.dynamic_rnn(cell=rnn_cell,inputs=x,dtype=tf.float32,time_major=False)  # outputs: (batch, time, hidden_layer_size)
    # NOTE(review): in TF1 the second argument of tf.nn.dropout is keep_prob,
    # so 0.01 KEEPS only 1% of activations — and with no training-mode flag it
    # also fires at inference time. Looks like a bug; confirm intent.
    outputs=tf.nn.dropout(outputs,0.01)
    w2 = tf.Variable(tf.constant(0.1,shape=[hidden_layer_size,1]), dtype=tf.float32)
    b2 = tf.Variable(tf.constant(0.1,shape=[1]), dtype=tf.float32)
    # outputs[-1] selects the last BATCH entry (batch size is 1 here), giving
    # a (60, hidden) matrix; matmul then reduce_sum collapses all time steps
    # and units to one scalar before the sigmoid.
    y = tf.sigmoid(tf.reduce_sum(tf.matmul(outputs[-1], w2) )+ b2)
    # Squared error against the scalar label.
    loss = tf.square(y - yTrain)
    train = tf.train.AdagradOptimizer(learnRateT).minimize(loss)
    return x, y, train, time_steps, yTrain,loss,outputs, finalState
# Read the two CSV files (first 2001 rows each).
data1=readFile(fileName1)
data2=readFile(fileName2)
# Stack the two datasets vertically into one array.
totaldata=np.vstack((data1,data2))
# Shuffle the combined rows.
randomdata=batch_data(totaldata)
# One-hot encode columns 0 and 1 of each row; the builtin ``len`` is passed
# in because the function's second parameter shadows it.
transarray=one_hotEncoder(randomdata,len)
# Alternative ordinal (char-code) encoding, currently disabled:
# transarray=Transpose(randomdata)
# Split into train/test partitions by the module-level ``ratio``.
x_train,x_test,y_train,y_test=dataSplit(transarray,randomdata)
# Build the TF1 graph.
x, y, train, time_steps, yTrain,loss,outputs, finalState=builNetWork()
# Run the session: one pass (roundT = 1) over the training samples.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(roundT):
    total_loss = 0.0
    for j in range(y_train.shape[0]):
        x_trains = np.reshape(x_train[j], (1, 60, 91))
        # NOTE(review): only [loss, y] are fetched — the ``train`` op is
        # never run, so the optimizer never updates any weights and no
        # learning actually happens. ``total_loss`` is also never
        # accumulated. Presumably ``train`` should be in the fetch list.
        result = sess.run([loss,y],
                          feed_dict={x: x_trains, yTrain: y_train[j]})
        if j % 20 == 1:
            print("i: %d, loss: %s,y:%s,ytrain:%d\n"
                  % (i, result[0],result[1],y_train[j]))
# Test data: run the (untrained) model over the test set and score it.
# NOTE(review): np.int was removed in NumPy 1.24 — this needs builtin int
# or an explicit np.int64.
result = np.empty(shape=(y_test.shape[0],1), dtype=np.int)
for i in range(y_test.shape[0]):
    x_trains = np.reshape(x_test[i], (1, 60,91))
    # print(x_trains)
    results = sess.run([y], feed_dict={x:x_trains})
    print(results)
    # NOTE(review): assigning the raw sigmoid into an int array truncates
    # every value below 1.0 down to 0 — the active() threshold helper defined
    # above is never applied, so nearly all predictions come out as 0.
    result[i][0] = results[0]
# Flatten predictions and ground-truth labels into 1-D lists.
y1=[]
for i in range(result.shape[0]):
    y1.append(result[i][0])
y2=[]
for i in range(y_test.shape[0]):
    y2.append(y_test[i][0])
y_pred=np.array(y1)
y_true=np.array(y2)
# Report standard binary-classification metrics.
print("accurcy")
print(accuracy_score(y_true, y_pred))
print("F1")
print(metrics.f1_score(y_true, y_pred, average='weighted'))
print("ROC")
print(roc_auc_score(y_true, y_pred))
# Recall (macro-averaged).
print("召回率rec")
print(metrics.recall_score(y_true, y_pred, average='macro'))
# Precision (weighted).
print("pre准确率")
print(precision_score(y_true, y_pred, average='weighted'))