[Kaggle DAY21] Real or Not? NLP with Disaster Tweets!

솜씨좋은장씨 2020. 3. 19. 19:23

Round 20 of my Kaggle challenge!

There really isn't much time left before the competition closes.

 

Today, I first preprocessed the data using the method from round 19, embedded it with TF-IDF, and then produced results with a LightGBM model.

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# preprocessing objects, set up the same way as in the round-19 post
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

clear_text_list = list(train['clear_text'])

X_train = []
for clear_text in clear_text_list:
  word_list = word_tokenize(clear_text)
  word_list = [word for word in word_list if len(word) > 2]            # drop very short tokens
  word_list = [word for word in word_list if word not in stop_words]   # drop stopwords
  # word_list = [stemmer.stem(word) for word in word_list]
  word_list = [lemmatizer.lemmatize(word) for word in word_list]       # lemmatize
  X_train.append(' '.join(word_list))                                  # join back into a sentence
X_train[:7]
train['clear_text2'] = X_train

First, I joined the words that had been tokenized back into sentences and stored them in a new column called clear_text2.
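One thing to note: the same preprocessing also has to be applied to the test set, since a test['clear_text2'] column is used further down. A minimal sketch, assuming the same stop_words and lemmatizer as above (the X_test_clear name is just for illustration):

# build test['clear_text2'] the same way as the train column
X_test_clear = []
for clear_text in list(test['clear_text']):
  word_list = word_tokenize(clear_text)
  word_list = [word for word in word_list if len(word) > 2]
  word_list = [word for word in word_list if word not in stop_words]
  word_list = [lemmatizer.lemmatize(word) for word in word_list]
  X_test_clear.append(' '.join(word_list))
test['clear_text2'] = X_test_clear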

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = list(train['clear_text2'])
real_or_not = list(train['target'])

vectorizer = TfidfVectorizer(
            min_df=2,
            analyzer="word",
            sublinear_tf=True,
            ngram_range=(1, 3),
            max_features=10000
        )

X_data = vectorizer.fit_transform(tweets)
test_tweet = list(test['clear_text2'])
# use transform (not fit_transform) so the test set shares the training vocabulary
X_test = vectorizer.transform(test_tweet)

TF-IDF features were built from the data in the clear_text2 column.

The analysis was done at the word level, using uni-grams, bi-grams, and tri-grams.

The maximum number of features was set to 10,000.
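As a quick sanity check (not in the original notebook), the shape of the resulting matrix can be inspected; the row count should match the number of tweets and the column count should be at most 10,000:

# rough check of the TF-IDF output (exact numbers depend on the preprocessing)
print(X_data.shape)                 # (number of tweets, number of features <= 10000)
print(len(vectorizer.vocabulary_))  # vocabulary size actually kept by the vectorizer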

vectorizer.vocabulary_.items()
dict_items([('reason', 5970), ('earthquake', 2121), ('may', 4537), ('god', 2983), ('may god', 4541), ('forest', 2770), ('fire', 2624), ('near', 4910), ('canada', 1014), ('forest fire', 2771), 
('fire near', 2641), ('resident', 6226), ('asked', 371), ('shelter', 6862), ('place', 5509), ('officer', 5132), ('evacuation', 2300), ('order', 5227), ('expected', 2358), ('people', 5401), ('receive', 5981), 
('wildfire', 9639), ('california', 974), ('evacuation order', 2304), ('got', 3014), ('sent', 6739), ('photo', 5458), ('alaska', 148), ('smoke', 7158), ('school', 6622), ('rocky', 6438), ('update', 8696), 
('hwy', 3495), ('closed', 1292), ('direction', 1929), ('due', 2094), ('lake', 3919), ('county', 1527), ('rocky fire', 6439), ('fire update', 2649), ('flood', 2704), ('disaster', 1940), ('heavy', 3244), 
('rain', 5847), ('cause', 1099), ('flash', 2687), ('flooding', 2721), ('street', 7551), ('colorado', 1360), ('spring', 7392), ('area', 311), ('flood disaster', 2714), ('heavy rain', 3245), 
('flash flooding', 2690), ('top', 8208), ('hill', 3301), ('see', 6671), ('wood', 9768), ('see fire', 6672), ('emergency', 2204), ('happening', 3156), ('building', 908), ('across', 51), ('afraid', 94), 
('tornado', 8228), ('coming', 1385), ('three', 8117), ('died', 1915), ('heat', 3237), ('wave', 9465), ('far', 2481), ('three people', 8120), ('people died', 5408), ('heat wave', 3240), ('haha', 3120), 
('south', 7283), ('tampa', 7881), ('getting', 2948), ('flooded', 2720), ('hah', 3119), ('wait', 9327), ('second', 6655), ('live', 4267), ('gon', 3001), ('raining', 5850), ('florida', 2726), ('day', 1707), 
('lost', 4346), ('myanmar', 4868), ('arrived', 344), ('damage', 1672), ('bus', 929), ('multi', 4835), ('car', 1043), ('crash', 1559), ('breaking', 830), ('school bus', 6623), ('man', 4458), ('love', 4356), 
('fruit', 2845), ('summer', 7651), ('lovely', 4367), ('fast', 2498), ('goal', 2982), ('ridiculous', 6358), ('london', 4293), ('cool', 1487), ('wonderful', 9763), ('way', 9475), ('eat', 2135), ('shit', 6887), 
('new', 4950), ('york', 9937), ('city', 1254), ('last', 3957), ('week', 9538), ('new york', 4976), ('york city', 9938), ('last week', 3965), ('new york city', 4977), ('girlfriend', 2961), ('col', 1321), 
('like', 4117), ('end', 2228), ('user', 8721), ('wholesale', 9609), ('market', 4502), ('ablaze', 19), ('link', 4153), ('wholesale market', 9610), ('ablaze link', 20), ('wholesale market ablaze', 9611), 
('always', 179), ('try', 8503), ('bring', 845), ('metal', 4612), ('retweet', 6282), ('retweet link', 6285), ('news', 4986), ('nigeria', 5016), ('flag', 2675), ('set', 6781), ('aba', 0), ('breaking news', 831), 
('set ablaze', 6782), ('cry', 1632), ('plus', 5565), ('side', 6970), ('look', 4307), ('sky', 7118), ('night', 5021), ('last night', 3962), ('built', 913), ('much', 4828), ('hype', 3497), ('around', 336), 
('doubt', 2022), ('season', 6652), ('office', 5131), ('set ablaze link', 6783), ('jamaica', 3736), ('two', 8589), ('santa', 6558), ('cruz', 1629), ('head', 3205), ('police', 5577), ('two car', 8594), 
('santa cruz', 6559), ('lord', 4330), ('check', 1184), ('safe', 6509), ('work', 9774), ('check link', 1185), ('link link', 4185), ('safe work', 6510), ('link link link', 4187), ('outside', 5263), 
('alive', 157), ('dead', 1715), ('inside', 3605), ('awesome', 458), ('time', 8154), ('site', 7081), ('thanks', 8046), ('taking', 7867), ('care', 1054), ('taking care', 7868), ('wanted', 9360), 
('chicago', 1203), ('hotel', 3425), ('gained', 2883), ('follower', 2745), ('know', 3881), ('stats', 7456), ('grow', 3072), ('west', 9550), ('burned', 920), ('thousand', 8103), ('alone', 169), 
('thousand wildfire', 8107), ('wildfire ablaze', 9642), ('west burned thousand', 9553), ('thousand wildfire ablaze', 8108), ('wildfire ablaze california', 9643), ('perfect', 5432), ('life', 4087), 
('leave', 4027), ('first', 2654), ('quite', 5819), ('weird', 9543), ('better', 599), ('get', 2936), ('used', 8717), ('wear', 9516), ('every', 2319), ('single', 7030), ('next', 5006), ('year', 9896), .....

 

Now let's use LightGBM with this data.

import numpy as np
from sklearn.model_selection import GridSearchCV

y = train['target']  # training labels (1: real disaster, 0: not)

def get_best_params(model, params):
    grid_model = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    grid_model.fit(X_data, y)
    rmse = np.sqrt(-1 * grid_model.best_score_)
    print('Best mean RMSE:', np.round(rmse, 4))
    print('Best parameters:', grid_model.best_params_)
    
    return grid_model.best_estimator_

I used GridSearchCV to search for the best hyperparameters.

from lightgbm import LGBMClassifier, LGBMRegressor
lgb_for_best = LGBMClassifier()
lgb_param_grid = { 
    'n_estimators' : [100, 200, 300, 400], 
    'max_depth' : [5, 10, 15, 20],
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.1]
    }
best_lgb_param = get_best_params(lgb_for_best, lgb_param_grid)

lgb_param_grid2 = { 
    'n_estimators' : [250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350], 
    'max_depth' : [5, 10, 15, 20],
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.1]
    }
best_lgb_param = get_best_params(lgb_for_best, lgb_param_grid2)

lgb_param_grid = { 
    'n_estimators' : [500, 600, 700], 
    'max_depth' : [5, 10, 15, 20],
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.1]
    }
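As an aside, since this competition is scored with F1, a classification metric would arguably match the goal better than neg_mean_squared_error. A small variant of the helper under that assumption (reusing the same X_data and y) might look like this:

# hypothetical variant: tune directly on F1 instead of RMSE
def get_best_params_f1(model, params):
    grid_model = GridSearchCV(model, param_grid=params, scoring='f1', cv=5)
    grid_model.fit(X_data, y)
    print('Best mean F1:', np.round(grid_model.best_score_, 4))
    print('Best parameters:', grid_model.best_params_)
    return grid_model.best_estimator_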

 

This time, I reduced the number of features to 7,000.

tweets2 = list(train['clear_text2'])
real_or_not2 = list(train['target'])

vectorizer2 = TfidfVectorizer(
            min_df= 2,
            analyzer="word",
            sublinear_tf=True,
            ngram_range=(1,3),
            max_features=7000
        )

X_data2 = vectorizer2.fit_transform(tweets2)
test_tweet2 = list(test['clear_text2'])
# transform only, so the test matrix uses the same 7,000-feature vocabulary
X_test2 = vectorizer2.transform(test_tweet2)
from sklearn.model_selection import GridSearchCV

def get_best_params2(model, params):
    grid_model = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    grid_model.fit(X_data2, y)
    rmse = np.sqrt(-1 * grid_model.best_score_)
    print('Best mean RMSE:', np.round(rmse, 4))
    print('Best parameters:', grid_model.best_params_)
    
    return grid_model.best_estimator_
lgb_param_grid = { 
    'n_estimators' : [100, 200, 300, 400], 
    'max_depth' : [5, 10, 15, 20],
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.1]
    }
best_lgb_param = get_best_params2(lgb_for_best, lgb_param_grid) 

First submission

LGBMClassifier

max_features : 7000 / learning_rate : 0.05 / max_depth : 20 / n_estimators : 400

lgb_model_best_params = LGBMClassifier(learning_rate=0.05, max_depth=20, n_estimators=400)
lgb_model_best_params.fit(X_data2, y)
predict = lgb_model_best_params.predict(X_test2)
predict_labels = predict

ids = list(test['id'])
print(len(ids))

submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day21.csv", index=False)
pd.read_csv("kaggle_day21.csv")

Result

 

 

from lightgbm import LGBMClassifier, LGBMRegressor
lgb_for_best = LGBMClassifier(boosting_type= 'gbdt', objective = 'binary')
tweets3 = list(train['clear_text2'])
real_or_not3 = list(train['target'])

vectorizer3 = TfidfVectorizer(
            min_df= 2,
            analyzer="word",
            sublinear_tf=True,
            ngram_range=(1,3),
            max_features=3000
        )

X_data3 = vectorizer3.fit_transform(tweets3)
test_tweet3 = list(test['clear_text2'])
# again, only transform the test tweets with the already-fitted vectorizer
X_test3 = vectorizer3.transform(test_tweet3)
from sklearn.model_selection import GridSearchCV

def get_best_params3(model, params):
    grid_model = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    grid_model.fit(X_data3, y)
    rmse = np.sqrt(-1 * grid_model.best_score_)
    print('Best mean RMSE:', np.round(rmse, 4))
    print('Best parameters:', grid_model.best_params_)
    
    return grid_model.best_estimator_
lgb_param_grid = { 
    'n_estimators' : [100, 200, 300, 400], 
    'max_depth' : [5, 10, 15, 20],
    'learning_rate' : [0.001, 0.005, 0.01, 0.05, 0.1]
    }
best_lgb_param = get_best_params3(lgb_for_best, lgb_param_grid) 

 

Second submission

LGBMClassifier

max_features : 3000 / learning_rate : 0.1 / max_depth : 20 / n_estimators : 300

lgb_model_best_params = LGBMClassifier(boosting_type= 'gbdt', objective = 'binary', learning_rate=0.1, max_depth=20, n_estimators=300)
lgb_model_best_params.fit(X_data3, y)
predict = lgb_model_best_params.predict(X_test3)
predict_labels = predict

ids = list(test['id'])
print(len(ids))

submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day21_2.csv", index=False)
pd.read_csv("kaggle_day21_2.csv")

Result

 

Third submission

LogisticRegression

class_weight : balanced

from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight = 'balanced')
lgs.fit(X_data3, y)
predict = lgs.predict(X_test3)
predict_labels = predict
# predict_labels = np.argmax(predict, axis=1)
# for i in range(len(predict_labels)):
#   predict_labels[i] = predict_labels[i]


ids = list(test['id'])
print(len(ids))

submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day21_3.csv", index=False)
pd.read_csv("kaggle_day21_3.csv")

Result

 

The results were much..... much..... worse than I expected.

 

So this time I decided to just try an LSTM model instead.

# same preprocessing as before, but this time the token lists are kept (for the Keras Tokenizer)

clear_text_list = list(train['clear_text'])

X_train2 = []
for clear_text in clear_text_list:
  word_list = word_tokenize(clear_text)
  word_list = [word for word in word_list if len(word) > 2]
  word_list = [word for word in word_list if word not in stop_words]
  # word_list = [stemmer.stem(word) for word in word_list]
  word_list = [lemmatizer.lemmatize(word) for word in word_list]
  X_train2.append(word_list)
X_train2[:7]
clear_text_list = list(test['clear_text'])

X_test2 = []
for clear_text in clear_text_list:
  word_list = word_tokenize(clear_text)
  word_list = [word for word in word_list if len(word) > 2]
  word_list = [word for word in word_list if word not in stop_words]
  # word_list = [stemmer.stem(word) for word in word_list]
  word_list = [lemmatizer.lemmatize(word) for word in word_list]
  X_test2.append(word_list)
X_test2[:7]
import numpy as np
import pandas as pd
from keras import backend as K
from keras.layers import Embedding, Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed, Dropout
from keras.models import Model

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm

import matplotlib.pyplot as plt
# assign an integer index to every word
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train2)
X_train_vec = tokenizer.texts_to_sequences(X_train2)
X_test_vec = tokenizer.texts_to_sequences(X_test2)
print('max length :',max(len(l) for l in X_train2))
print('average length :',sum(map(len, X_train2))/len(X_train2))
plt.hist([len(s) for s in X_train2], bins=50)
plt.xlabel('length')
plt.ylabel('number')
plt.show()

max_len = 21  # pad/truncate every sequence to a fixed length of 21 tokens
x_train = pad_sequences(X_train_vec, maxlen=max_len)
x_test = pad_sequences(X_test_vec, maxlen=max_len)
from keras.utils import np_utils
import numpy as np

y_train = []

for i in range(len(train['target'])):
  if train['target'].iloc[i] == 1:
    y_train.append([0, 1])
  elif train['target'].iloc[i] == 0:
    y_train.append([1, 0])

y_train = np.array(y_train)
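As a side note, the same one-hot encoding could probably be produced in one line with the Keras helper to_categorical (which is also imported a few lines below):

from keras.utils import to_categorical

# equivalent one-hot encoding of the 0/1 labels (0 -> [1, 0], 1 -> [0, 1])
y_train = to_categorical(train['target'].values, num_classes=2)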
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
vocabs = []

for i in range(len(X_train2)):
  for j in range(len(X_train2[i])):
    vocabs.append(X_train2[i][j])
len(list(set(vocabs)))

vocab_size = 12396
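One caveat here: the Keras Tokenizer assigns word indices starting at 1 (0 is reserved for padding), so the Embedding layer's input size is usually derived as the number of unique words plus one rather than hard-coded:

# safer way to size the embedding: unique words + 1 for the padding index
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)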
from sklearn.model_selection import train_test_split
# shuffle=False keeps the x and y splits aligned row-for-row (random_state has no effect without shuffling)
x_train_new, x_valid_new = train_test_split(x_train, test_size=0.1, shuffle=False, random_state=34)
y_train_new, y_valid_new = train_test_split(y_train, test_size=0.1, shuffle=False, random_state=34)
from keras import optimizers

from tqdm import tqdm

def getBestParams(params_list, lstm_hidden, embedding, dropout):
  count = 0

  for i in tqdm(range(len(params_list['batch_size']))):
    for j in range(len(params_list['learning_rate'])):
      for k in range(len(params_list['epochs'])):
        batch_size = params_list['batch_size'][i]
        learning_rate = params_list['learning_rate'][j]
        epoch = params_list['epochs'][k]

        print(str(count) + "th train")
        print("batch_size : {} / lr : {} / epoch : {}".format(batch_size, learning_rate, epoch))

        adam = optimizers.Adam(lr=learning_rate, decay=0.1)

        accs = []
        index = 1
        for l in range(3):
          model = Sequential()
          model.add(Embedding(vocab_size, embedding))
          model.add(LSTM(lstm_hidden))
          model.add(Dropout(dropout))
          model.add(Dense(2, activation='sigmoid'))
          model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])
          history = model.fit(x_train_new, y_train_new, batch_size=batch_size, epochs=epoch, validation_split=0.1)
          acc = model.evaluate(x_valid_new, y_valid_new)[1]*100
          model_name = "./day_21_model_1/day21_"+str(count)+ "_" + str(index + 1)+"th_" + "lstm_" + str(lstm_hidden) + '_Embedding_' + str(embedding) +"model_acc_" + str(acc) +".h5"
          model.save(model_name)
          
          print("{}th acc : {}\n\n".format(index, acc))
          accs.append(acc)
          index = index + 1
        my_acc = np.array(accs).mean()
        print("{}th mean acc : {}%".format(count, my_acc))
# params_list was defined elsewhere in the notebook; for example, something like:
# params_list = {'batch_size': [32, 64], 'learning_rate': [0.001, 0.01], 'epochs': [3, 5]}
getBestParams(params_list, 128, 100, 0.1)
from keras.models import load_model

def getModelsEvaluationAccs(model_file_path):
  evaluate_accs = []
  # model_names = os.listdir("./day19_1_model")
  model_names = os.listdir(model_file_path)


  for model_name in model_names:
    model = load_model(model_file_path + "/" + model_name)
    
    evaluation_acc = model.evaluate(x_valid_new, y_valid_new)[1]*100

    evaluate_accs.append(evaluation_acc)

  evaluation_DataFrame = pd.DataFrame({"model_name":model_names, "evaluate_accuracy":evaluate_accs})

  return evaluation_DataFrame
eval_df_1 = getModelsEvaluationAccs("./day_21_model_1")
eval_df_1 = eval_df_1.sort_values(by=['evaluate_accuracy'], axis=0, ascending=False)
eval_df_1.head(10)

 

Fourth submission

model_4 = load_model("./day_21_model_1/" + eval_df_1['model_name'].iloc[0])
print(eval_df_1['model_name'].iloc[0])

predict = model_4.predict(x_test)
predict_labels = np.argmax(predict, axis=1)  # pick the class with the higher probability

ids = list(test['id'])

submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day21_4.csv", index=False)

Result

 

Fifth submission

model_5 = load_model("./day_21_model_1/" + eval_df_1['model_name'].iloc[1])
print(eval_df_1['model_name'].iloc[1])

predict = model_5.predict(x_test)
predict_labels = np.argmax(predict, axis=1)  # pick the class with the higher probability

ids = list(test['id'])

submission_dic = {"id":ids, "target":predict_labels}
submission_df = pd.DataFrame(submission_dic)
submission_df.to_csv("kaggle_day21_5.csv", index=False)

Result
