Notice
Recent Posts
Recent Comments
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
1 | 2 | |||||
3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 |
24 | 25 | 26 | 27 | 28 | 29 | 30 |
Tags
- gs25
- 편스토랑 우승상품
- Git
- 편스토랑
- Docker
- 데이콘
- 금융문자분석경진대회
- 프로그래머스
- github
- 우분투
- 자연어처리
- 프로그래머스 파이썬
- 파이썬
- 캐치카페
- 백준
- 코로나19
- 맥북
- programmers
- SW Expert Academy
- Baekjoon
- 더현대서울 맛집
- Real or Not? NLP with Disaster Tweets
- hackerrank
- PYTHON
- ubuntu
- Kaggle
- dacon
- ChatGPT
- AI 경진대회
- leetcode
Archives
- Today
- Total
솜씨좋은장씨
[Kaggle DAY19]Real or Not? NLP with Disaster Tweets! 본문
Kaggle/Real or Not? NLP with Disaster Tweets
[Kaggle DAY19]Real or Not? NLP with Disaster Tweets!
솜씨좋은장씨 2020. 3. 17. 00:51
반응형
Kaggle 대회 19회차
오늘은 Bi-LSTM 모델을 활용해 보았습니다.
from keras import optimizers
from tqdm import tqdm
def getBestParams(params_list, lstm_hidden, embedding):
    """Grid-search batch size, learning rate and epoch count for a Bi-LSTM model.

    For every (batch_size, learning_rate, epochs) combination the model is
    built and trained three times from scratch on the module-level
    ``x_train_new`` / ``y_train_new`` and scored on ``x_valid_new`` /
    ``y_valid_new``. The mean validation accuracy of the three runs is
    recorded for that combination.

    Side effects:
        * Every trained model is saved to ``./day19_1_model`` (one file per
          combination *and* repeat).
        * The summary table is written as CSV into ``./day19_model``. Code
          that lists that directory to load models should skip non-model
          files.

    Args:
        params_list: Mapping with the keys ``'batch_size'``,
            ``'learning_rate'`` and ``'epochs'``, each holding a list of
            candidate values.
        lstm_hidden: Number of units passed to the ``LSTM`` layer.
        embedding: Output dimension passed to the ``Embedding`` layer.

    Returns:
        A ``pandas.DataFrame`` with one row per combination and the columns
        ``batch_size``, ``learning_rate``, ``epoch``, ``accuracy``,
        ``Embedding`` and ``lstm``.
    """
    import os

    model_dir = "./day19_1_model"
    csv_dir = "./day19_model"
    # model.save / to_csv fail when the target directory does not exist.
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)

    repeats = 3
    count = 0
    my_accs = []
    my_batch = []
    my_lr = []
    my_epoch = []
    embeddings = []
    lstm_hiddens = []

    for batch_size in tqdm(params_list['batch_size']):
        for learning_rate in params_list['learning_rate']:
            for epoch in params_list['epochs']:
                print(str(count) + "th train")
                print("batch_size : {} / lr : {} / epoch : {}".format(batch_size, learning_rate, epoch))

                accs = []
                for index in range(1, repeats + 1):
                    # A fresh optimizer per model: a shared instance would
                    # carry its step counter (and therefore the decayed
                    # learning rate) from one repeat into the next.
                    adam = optimizers.Adam(lr=learning_rate, decay=0.1)

                    model = Sequential()
                    model.add(Embedding(vocab_size, embedding, input_length=21))
                    model.add(Bidirectional(LSTM(lstm_hidden)))
                    model.add(Dropout(0.5))
                    # NOTE(review): two sigmoid units with binary_crossentropy
                    # presumably expect one-hot labels of shape (n, 2) - confirm.
                    model.add(Dense(2, activation='sigmoid'))
                    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])
                    model.fit(x_train_new, y_train_new, batch_size=batch_size, epochs=epoch, validation_split=0.1)

                    # The repeat index is part of the name so the three runs
                    # of one combination no longer overwrite each other.
                    model_name = "{}/day19{}th_{}_{}_{}model.h5".format(
                        model_dir, count + 1, index, lstm_hidden, embedding
                    )
                    model.save(model_name)

                    acc = model.evaluate(x_valid_new, y_valid_new)[1] * 100
                    accs.append(acc)

                my_acc = np.mean(accs)
                print("{}th mean acc : {}%".format(count, my_acc))

                my_accs.append(my_acc)
                my_batch.append(batch_size)
                my_lr.append(learning_rate)
                my_epoch.append(epoch)
                embeddings.append(embedding)
                lstm_hiddens.append(lstm_hidden)
                count = count + 1

    my_result_df = pd.DataFrame({
        "batch_size": my_batch,
        "learning_rate": my_lr,
        "epoch": my_epoch,
        "accuracy": my_accs,
        "Embedding": embeddings,
        "lstm": lstm_hiddens,
    })
    # Use the ``embedding`` argument (not the ``Embedding`` layer class) in the
    # file name and actually pass the path so the table is written to disk.
    csv_name = "{}/day19{}_{}model.csv".format(csv_dir, lstm_hidden, embedding)
    my_result_df.to_csv(csv_name)
    return my_result_df
기존에 만들었던 함수를 수정하여 사용해보았습니다.
# First search: six batch sizes x six learning rates x three epoch counts,
# with a 128-unit LSTM and a 100-dimensional embedding.
params_list8 = dict(
    batch_size=[8, 16, 20, 24, 28, 32],
    learning_rate=[0.0000006, 0.0000009, 0.0000001, 0.01, 0.03, 0.05],
    epochs=[1, 2, 3],
)
my_result_df_8_1 = getBestParams(params_list8, 128, 100)

# Second search: same batch sizes and epochs, only the three larger learning
# rates. Both names are rebound, so the first grid and its result are replaced.
params_list8 = dict(
    batch_size=[8, 16, 20, 24, 28, 32],
    learning_rate=[0.01, 0.03, 0.05],
    epochs=[1, 2, 3],
)
my_result_df_8_1 = getBestParams(params_list8, 128, 100)
예상 소요 시간이 약 10시간 ~ 14시간이었지만, Colab 환경의 최대 실행 시간보다 길어서 중간에 끊어졌습니다 ㅠㅠ
그래서 함수가 실행되면서 저장된 모델을 load하고 validation 데이터로 평가하여 결과가 좋은 상위 5개의 모델을 활용해보았습니다.
from keras.models import load_model
def getModelsEvaluationAccs(model_file_path):
    """Evaluate every saved Keras model in a directory on the validation set.

    Only entries whose name ends in a model-file extension (``.h5``,
    ``.hdf5`` or ``.keras``) are loaded; anything else in the directory, such
    as a results CSV, is skipped instead of being handed to ``load_model``.
    Models are processed in sorted name order so the result is deterministic.

    Args:
        model_file_path: Directory containing the saved model files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``model_name`` (file name) and
        ``evaluate_accuracy`` (second value returned by ``model.evaluate`` on
        the module-level ``x_valid_new`` / ``y_valid_new``, times 100).
    """
    model_extensions = (".h5", ".hdf5", ".keras")
    model_names = sorted(
        name
        for name in os.listdir(model_file_path)
        if name.lower().endswith(model_extensions)
    )

    evaluate_accs = []
    for model_name in model_names:
        model = load_model(os.path.join(model_file_path, model_name))
        evaluation_acc = model.evaluate(x_valid_new, y_valid_new)[1] * 100
        evaluate_accs.append(evaluation_acc)

    return pd.DataFrame({"model_name": model_names, "evaluate_accuracy": evaluate_accs})
# Score every model saved in ./day19_1_model and rank them, best accuracy first.
eval_df_1 = getModelsEvaluationAccs("./day19_1_model").sort_values(
    by='evaluate_accuracy', ascending=False
)
eval_df_1
:
# Score every model saved in ./day19_model and rank them, best accuracy first.
eval_df_2 = getModelsEvaluationAccs("./day19_model").sort_values(
    by='evaluate_accuracy', ascending=False
)
eval_df_2
:
첫번째 제출
# Submission 1: top-ranked model of ./day19_model (first row of eval_df_2).
model_1 = load_model("./day19_model/" + eval_df_2['model_name'].iloc[0])
predict = model_1.predict(x_test)
# argmax over axis 1 turns the per-class scores into a single label per row.
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19.csv", index=False)
결과
두번째 제출
# Submission 2: top-ranked model of ./day19_1_model (first row of eval_df_1).
model_2 = load_model("./day19_1_model/" + eval_df_1['model_name'].iloc[0])
predict = model_2.predict(x_test)
# argmax over axis 1 turns the per-class scores into a single label per row.
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_2.csv", index=False)
결과
세번째 제출
# Submission 3: second-ranked model of ./day19_1_model (second row of eval_df_1).
model_3 = load_model("./day19_1_model/" + eval_df_1['model_name'].iloc[1])
predict = model_3.predict(x_test)
# argmax over axis 1 turns the per-class scores into a single label per row.
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_3.csv", index=False)
결과
네번째 제출
# Submission 4: third-ranked model of ./day19_1_model (third row of eval_df_1).
model_4 = load_model("./day19_1_model/" + eval_df_1['model_name'].iloc[2])
predict = model_4.predict(x_test)
# argmax over axis 1 turns the per-class scores into a single label per row.
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_4.csv", index=False)
결과
다섯번째 제출
# Submission 5: second-ranked model of ./day19_model. The file name must come
# from eval_df_2 (the ranking of ./day19_model); eval_df_1 ranks the models of
# ./day19_1_model, so its names do not belong to this directory.
model_5 = load_model("./day19_model/" + eval_df_2['model_name'].iloc[1])
predict = model_5.predict(x_test)
# argmax over axis 1 turns the per-class scores into a single label per row.
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day19_5.csv", index=False)
결과
다섯번째 제출은 데이터프레임을 잘못 선택하여 좋지 않은 결과를 얻었습니다.
'Kaggle > Real or Not? NLP with Disaster Tweets' 카테고리의 다른 글
[Kaggle DAY21]Real or Not? NLP with Disaster Tweets! (0) | 2020.03.19 |
---|---|
[Kaggle DAY20]Real or Not? NLP with Disaster Tweets! (0) | 2020.03.18 |
[Kaggle DAY18]Real or Not? NLP with Disaster Tweets! (0) | 2020.03.15 |
[Kaggle DAY17]Real or Not? NLP with Disaster Tweets! (0) | 2020.03.14 |
[Kaggle DAY16]Real or Not? NLP with Disaster Tweets! (0) | 2020.03.13 |
Comments