[Kaggle DAY22]Real or Not? NLP with Disaster Tweets!

솜씨좋은장씨 2020. 3. 21. 07:32

Kaggle Challenge, Day 22!

Today I was short on time after getting back from my part-time job, so I took the models that had scored best among my submissions so far, retrained them on data prepared with the updated preprocessing, and generated new results.

 

The data preprocessing is the same as in Day 21; a rough sketch of that kind of cleanup is shown below.
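The actual cleaning steps live in the Day 21 post; the sketch below is only my reconstruction of that general kind of tweet cleanup. The URL, mention, and punctuation handling here is an assumption for illustration, not a record of the exact Day 21 steps.

import re

def clean_tweet(text):
    # Hypothetical cleanup for illustration only -- see the Day 21 post
    # for the preprocessing actually applied to X_train / X_test.
    text = text.lower()
    text = re.sub(r"https?://\S+", " ", text)   # drop URLs
    text = re.sub(r"@\w+", " ", text)           # drop @mentions
    text = re.sub(r"[^a-z0-9\s]", " ", text)    # keep letters and digits
    return " ".join(text.split())               # collapse whitespace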

 

from keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the training text and turn both splits into
# integer sequences.
max_words = 12396
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_vec = tokenizer.texts_to_sequences(X_train)
X_test_vec = tokenizer.texts_to_sequences(X_test)

import matplotlib.pyplot as plt

# Inspect the length distribution to pick a padding length.
print("Max sequence length:", max(len(l) for l in X_train_vec))
print("Mean sequence length:", sum(map(len, X_train_vec)) / len(X_train_vec))
plt.hist([len(s) for s in X_train_vec], bins=50)
plt.xlabel('length of Data')
plt.ylabel('number of Data')
plt.show()
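To double-check the padding length chosen below, a quick coverage count is handy. This helper is my own addition, not part of the original post:

import numpy as np

def coverage_below(sequences, max_len):
    # Fraction of samples whose token length is <= max_len.
    lengths = np.array([len(s) for s in sequences])
    return (lengths <= max_len).mean()

print("coverage at 21 tokens:", coverage_below(X_train_vec, 21))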

import numpy as np

# One-hot encode the binary target: 1 -> [0, 1], 0 -> [1, 0].
y_train = []

for i in range(len(train['target'])):
  if train['target'].iloc[i] == 1:
    y_train.append([0, 1])
  elif train['target'].iloc[i] == 0:
    y_train.append([1, 0])

y_train = np.array(y_train)
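The same encoding is available as a one-liner through Keras' built-in utility, equivalent to the loop above:

from keras.utils import to_categorical

# 0 -> [1, 0], 1 -> [0, 1], same as the manual loop.
y_train = to_categorical(train['target'].values, num_classes=2)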
from keras.layers import Embedding, Dense, GRU, Dropout, Flatten, Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

# Pad (or truncate) every sequence to a fixed length of 21 tokens,
# matching the length statistics inspected above.
max_len = 21

X_train_vec = pad_sequences(X_train_vec, maxlen=max_len)
X_test_vec = pad_sequences(X_test_vec, maxlen=max_len)

First submission

from keras import optimizers

# GRU classifier: high initial learning rate (0.05) with strong decay.
adam2 = optimizers.Adam(lr=0.05, decay=0.1)
model_3 = Sequential()
model_3.add(Embedding(max_words, 100))
model_3.add(GRU(32))
model_3.add(Dropout(0.5))
model_3.add(Dense(2, activation='sigmoid'))
model_3.compile(loss='binary_crossentropy', optimizer=adam2, metrics=['acc'])
history = model_3.fit(X_train_vec, y_train, batch_size=16, epochs=1, validation_split=0.1)
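A side note on the output layer: with two-dimensional one-hot labels, the conventional pairing is a softmax output with categorical_crossentropy. Dense(2, activation='sigmoid') with binary_crossentropy still trains, but it scores the two output units independently rather than as a single probability distribution.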

import pandas as pd

# Argmax over the two output units recovers the 0/1 label.
predict = model_3.predict(X_test_vec)
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day22.csv", index=False)
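Before uploading, it is worth a quick sanity check that the file has the shape and columns Kaggle expects; this check is my own habit, not part of the original post:

check = pd.read_csv("kaggle_day22.csv")
print(check.shape)    # one row per test id
print(check.head())   # columns: id, target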

Result

Second submission

# Bag-of-embeddings baseline: flatten the embedded sequence and classify
# with a single hidden dense layer.
model2 = Sequential()
model2.add(Embedding(max_words, 100, input_length=21))
model2.add(Flatten())
model2.add(Dense(128, activation='relu'))
model2.add(Dense(2, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model2.fit(X_train_vec, y_train, epochs=1, batch_size=32, validation_split=0.1)

predict = model2.predict(X_test_vec)
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day22_2.csv", index=False)

Result

Third submission

# 1D CNN: convolve over the embedded sequence with window size 3, then
# take the max over time with GlobalMaxPooling1D before classifying.
model2 = Sequential()
model2.add(Embedding(max_words, 128, input_length=21))
model2.add(Dropout(0.2))
model2.add(Conv1D(256, 3, padding='valid', activation='relu', strides=1))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(32, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(2, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history2 = model2.fit(X_train_vec, y_train, epochs=1, batch_size=16, validation_split=0.1)

predict = model2.predict(X_test_vec)
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day22_3.csv", index=False)

Result

Fourth submission

# Same GRU architecture as the first submission, with lr=0.03 and
# batch_size=20.
adam3 = optimizers.Adam(lr=0.03, decay=0.1)
model_4 = Sequential()
model_4.add(Embedding(max_words, 100))
model_4.add(GRU(32))
model_4.add(Dropout(0.5))
model_4.add(Dense(2, activation='sigmoid'))
model_4.compile(loss='binary_crossentropy', optimizer=adam3, metrics=['acc'])
history = model_4.fit(X_train_vec, y_train, batch_size=20, epochs=1, validation_split=0.1)

predict = model_4.predict(X_test_vec)
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day22_4.csv", index=False)

Result

Fifth submission

# Same as the first submission but with a lighter dropout rate (0.1).
adam2 = optimizers.Adam(lr=0.05, decay=0.1)
model_3 = Sequential()
model_3.add(Embedding(max_words, 100))
model_3.add(GRU(32))
model_3.add(Dropout(0.1))
model_3.add(Dense(2, activation='sigmoid'))
model_3.compile(loss='binary_crossentropy', optimizer=adam2, metrics=['acc'])
history = model_3.fit(X_train_vec, y_train, batch_size=16, epochs=1, validation_split=0.1)

predict = model_3.predict(X_test_vec)
predict_labels = np.argmax(predict, axis=1)
ids = list(test['id'])
submission_dic = {"id": ids, "target": predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day22_5.csv", index=False)

Result
