관리 메뉴

솜씨좋은장씨

[Kaggle DAY19]Real or Not? NLP with Disaster Tweets! 본문

Kaggle/Real or Not? NLP with Disaster Tweets

[Kaggle DAY19]Real or Not? NLP with Disaster Tweets!

솜씨좋은장씨 2020. 3. 17. 00:51
728x90
반응형

Kaggle 대회 19회차

오늘은 Bi-LSTM 모델을 활용해 보았습니다.

from keras import optimizers
from tqdm import tqdm

def getBestParams(params_list, lstm_hidden, embedding):
  """Grid-search batch size, learning rate and epoch count for a Bi-LSTM model.

  Every combination of the candidate values in ``params_list`` is trained
  three times from scratch. Each trained model is saved as its own ``.h5``
  file under ``./day19_1_model`` and evaluated on ``x_valid_new`` /
  ``y_valid_new``; the mean of the three accuracies is recorded for the
  combination.

  Uses names defined outside this function: ``vocab_size``, ``x_train_new``,
  ``y_train_new``, ``x_valid_new``, ``y_valid_new``, the Keras layer classes,
  ``np`` and ``pd``.

  Args:
    params_list: dict with the keys ``"batch_size"``, ``"learning_rate"`` and
      ``"epochs"``, each mapping to a list of candidate values.
    lstm_hidden: number of units passed to the ``LSTM`` layer.
    embedding: output dimension passed to the ``Embedding`` layer.

  Returns:
    pandas.DataFrame with one row per combination and the columns
    ``batch_size``, ``learning_rate``, ``epoch``, ``accuracy`` (mean
    evaluation accuracy in percent), ``Embedding`` and ``lstm``. The same
    table is written to a CSV file under ``./day19_model``.
  """
  import os

  repeats = 3
  model_dir = "./day19_1_model"
  csv_dir = "./day19_model"
  # Saving fails if the target directory is missing, so create both up front.
  os.makedirs(model_dir, exist_ok=True)
  os.makedirs(csv_dir, exist_ok=True)

  my_accs = []
  my_batch = []
  my_lr = []
  my_epoch = []
  embeddings = []
  lstm_hiddens = []

  count = 0
  for batch_size in tqdm(params_list['batch_size']):
    for learning_rate in params_list['learning_rate']:
      for epoch in params_list['epochs']:
        print(str(count) + "th train")
        print("batch_size : {} / lr : {} / epoch : {}".format(batch_size, learning_rate, epoch))

        accs = []
        for index in range(1, repeats + 1):
          # A fresh optimizer per model: sharing one instance would carry its
          # internal state (step count used by `decay`, moment estimates)
          # from one model into the next.
          adam = optimizers.Adam(lr=learning_rate, decay=0.1)

          model = Sequential()
          model.add(Embedding(vocab_size, embedding, input_length=21))
          model.add(Bidirectional(LSTM(lstm_hidden)))
          model.add(Dropout(0.5))
          # NOTE(review): two sigmoid outputs with binary_crossentropy
          # presumably expect one-hot encoded labels - confirm against the
          # code that builds y_train_new.
          model.add(Dense(2, activation='sigmoid'))
          model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])
          model.fit(x_train_new, y_train_new, batch_size=batch_size, epochs=epoch, validation_split=0.1)

          # The run index is part of the file name so that the repeated runs
          # of one combination no longer overwrite each other.
          model_name = "{}/day19{}th_{}_{}_{}model.h5".format(
              model_dir, count + 1, index, lstm_hidden, embedding)
          model.save(model_name)

          # evaluate() returns [loss, acc] for this compile() call.
          acc = model.evaluate(x_valid_new, y_valid_new)[1]*100
          accs.append(acc)

        my_acc = np.array(accs).mean()
        print("{}th mean acc : {}%".format(count, my_acc))

        my_accs.append(my_acc)
        my_batch.append(batch_size)
        my_lr.append(learning_rate)
        my_epoch.append(epoch)
        embeddings.append(embedding)
        lstm_hiddens.append(lstm_hidden)

        count = count + 1

  my_result_df = pd.DataFrame({"batch_size":my_batch, "learning_rate":my_lr, "epoch":my_epoch, "accuracy":my_accs, "Embedding":embeddings, "lstm":lstm_hiddens})
  # Use the `embedding` argument (not the Embedding layer class) in the name,
  # and pass the path so the table is actually written to disk.
  csv_name = "{}/day19{}_{}model.csv".format(csv_dir, lstm_hidden, embedding)
  my_result_df.to_csv(csv_name)

  return my_result_df

기존에 만들었던 함수를 수정하여 사용해보았습니다.

# Search space for the first sweep: 6 batch sizes x 6 learning rates x 3 epoch counts.
params_list8 = dict(
    batch_size=[8, 16, 20, 24, 28, 32],
    learning_rate=[6e-7, 9e-7, 1e-7, 0.01, 0.03, 0.05],
    epochs=[1, 2, 3],
)
# 128 LSTM units, embedding dimension 100.
my_result_df_8_1 = getBestParams(params_list8, lstm_hidden=128, embedding=100)

# Search space for the second sweep, restricted to the learning rates
# 0.01, 0.03 and 0.05: 6 batch sizes x 3 learning rates x 3 epoch counts.
params_list8 = dict(
    batch_size=[8, 16, 20, 24, 28, 32],
    learning_rate=[0.01, 0.03, 0.05],
    epochs=[1, 2, 3],
)
# 128 LSTM units, embedding dimension 100.
my_result_df_8_1 = getBestParams(params_list8, lstm_hidden=128, embedding=100)

예상 소요 시간이 약 10시간 ~ 14시간이었지만, Colab 환경의 최대 사용 시간보다 더 긴 시간이라 중간에 끊어졌습니다 ㅠㅠ

그래서 함수가 실행되면서 저장된 모델을 load하고 validation 데이터로 평가하여 결과가 좋은 상위 5개의 모델을 활용해보았습니다.

from keras.models import load_model

def getModelsEvaluationAccs(model_file_path):
  """Load every saved model in a directory and score it on the validation data.

  Only entries whose name ends in a Keras model file extension are loaded,
  so other files in the directory (result CSVs, notebook checkpoints, ...)
  no longer make ``load_model`` fail. Evaluation uses the module-level
  ``x_valid_new`` / ``y_valid_new``.

  Args:
    model_file_path: path of the directory that holds the saved model files.

  Returns:
    pandas.DataFrame with the columns ``model_name`` (file name without the
    directory) and ``evaluate_accuracy`` (percent), one row per model file,
    in sorted file-name order.
  """
  model_extensions = (".h5", ".hdf5", ".keras")
  # os.listdir() returns entries in arbitrary order; sort for stable output.
  model_names = sorted(
      name for name in os.listdir(model_file_path)
      if name.lower().endswith(model_extensions)
  )

  evaluate_accs = []
  for model_name in model_names:
    model = load_model(os.path.join(model_file_path, model_name))

    # Index 1 of evaluate() is the first compiled metric (accuracy for the
    # models saved by getBestParams); scaled to percent.
    evaluation_acc = model.evaluate(x_valid_new, y_valid_new)[1]*100
    evaluate_accs.append(evaluation_acc)

  evaluation_DataFrame = pd.DataFrame({"model_name":model_names, "evaluate_accuracy":evaluate_accs})

  return evaluation_DataFrame
# Score every saved model in ./day19_1_model and rank the rows best-first.
eval_df_1 = getModelsEvaluationAccs("./day19_1_model").sort_values(
    by='evaluate_accuracy', ascending=False
)
# Bare expression: shows the table when run as a notebook cell.
eval_df_1

:

# Score every saved model in ./day19_model and rank the rows best-first.
eval_df_2 = getModelsEvaluationAccs("./day19_model").sort_values(
    by='evaluate_accuracy', ascending=False
)
# Bare expression: shows the table when run as a notebook cell.
eval_df_2

:

 

첫번째 제출

# Submission 1: the top-ranked model of ./day19_model (row 0 of eval_df_2).
model_1 = load_model("./day19_model/" + eval_df_2['model_name'].iloc[0])
predict = model_1.predict(x_test)
# argmax over axis 1 takes the index of the largest score in each row as the label.
predict_labels = np.argmax(predict, axis=1)

ids = list(test['id'])

# Write the id/target pairs without the DataFrame index column.
submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19.csv", index=False)

결과

 

두번째 제출

# Submission 2: the top-ranked model of ./day19_1_model (row 0 of eval_df_1).
model_2 = load_model("./day19_1_model/" + eval_df_1['model_name'].iloc[0])
predict = model_2.predict(x_test)
# argmax over axis 1 takes the index of the largest score in each row as the label.
predict_labels = np.argmax(predict, axis=1)

ids = list(test['id'])

# Write the id/target pairs without the DataFrame index column.
submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_2.csv", index=False)

결과

 

세번째 제출

# Submission 3: the second-ranked model of ./day19_1_model (row 1 of eval_df_1).
model_3 = load_model("./day19_1_model/" + eval_df_1['model_name'].iloc[1])
predict = model_3.predict(x_test)
# argmax over axis 1 takes the index of the largest score in each row as the label.
predict_labels = np.argmax(predict, axis=1)

ids = list(test['id'])

# Write the id/target pairs without the DataFrame index column.
submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_3.csv", index=False)

결과

 

네번째 제출

# Submission 4: the third-ranked model of ./day19_1_model (row 2 of eval_df_1).
model_4 = load_model("./day19_1_model/" + eval_df_1['model_name'].iloc[2])
predict = model_4.predict(x_test)
# argmax over axis 1 takes the index of the largest score in each row as the label.
predict_labels = np.argmax(predict, axis=1)

ids = list(test['id'])

# Write the id/target pairs without the DataFrame index column.
submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_4.csv", index=False)

결과

 

 

다섯번째 제출

# Submission 5: the second-ranked model of ./day19_model (row 1 of eval_df_2).
# The originally published code took the file name from eval_df_1, which ranks
# the models of ./day19_1_model, while loading from ./day19_model - the
# DataFrame mix-up the write-up blames for the poor score. The name now comes
# from eval_df_2, the ranking that matches the directory.
model_5 = load_model("./day19_model/" + eval_df_2['model_name'].iloc[1])

predict = model_5.predict(x_test)
# argmax over axis 1 takes the index of the largest score in each row as the label.
predict_labels = np.argmax(predict, axis=1)

ids = list(test['id'])

# Write the id/target pairs without the DataFrame index column.
submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_5.csv", index=False)

결과

다섯번째 제출은 데이터프레임을 잘못 선택하여 좋지 않은 결과를 얻었습니다.

 

Comments