반응형
-- 본 포스팅은 파이토치로 배우는 자연어 처리 (한빛미디어) 책을 참고해서 작성된 글입니다.
-- 소스코드 ) https://github.com/rickiepark/nlp-with-pytorch
<PREVIOUS>
https://didu-story.tistory.com/83?category=952805
https://didu-story.tistory.com/86?category=952805
▶ 레스토랑 리뷰 감성 분류하기
앞선 (1), (2) 포스팅에서 데이터를 전처리하고, 데이터를 파이토치에서 활용 가능하게 만들어주는 여러 가지 클래스에 대해서 살펴보았다. (여기 이해하는 데 꽤 오래 걸림 ;;...)
이제 간단한 퍼셉트론 모델을 활용해서 본격적인 감성 분류를 진행해보자.
1. 퍼셉트론 분류기 정의하기
- ReviewClassifier 클래스는 파이토치의 Module 클래스를 상속하고 단일 출력을 만드는 Linear 층 하나를 생성하도록 할 것이다.
- 마지막에는 비선형 활성화 함수로 시그모이드 함수를 사용할 것이다.
- forward() 메서드
- 선택적으로 시그모이드 함수를 적용하는 매개변수(apply_sigmoid)를 만듦
- 이진 분류 문제에서는 이진 크로스 엔트로피 손실(BCELoss)이 가장 적절하지만, 시그모이드와 이 손실 함수를 따로 사용할 경우에는 수치 안정성에 이슈가 발생한다고 한다.
- 파이토치에서는 시그모이드 없이 간편하게 사용할 수 있고, 수치적으로 안정된 계산을 해주는 BCEWithLogitsLoss()를 사용할 수 있다.
class ReviewClassifier(nn.Module):
    """Simple perceptron-based classifier: one linear layer with a single output."""

    def __init__(self, num_features):
        """Build the single linear layer.

        Args:
            num_features (int): size of the input feature vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """Run the forward pass of the classifier.

        Args:
            x_in (torch.Tensor): input data tensor; x_in.shape is expected to
                be (batch, num_features).
            apply_sigmoid (bool): flag for the sigmoid activation. Leave it
                False when the output feeds a loss that works on raw logits
                (e.g. a cross-entropy-with-logits loss).

        Returns:
            torch.Tensor: the result tensor; its shape is (batch,).
        """
        # Drop only the trailing size-1 output axis. A bare squeeze() would
        # also drop the batch axis when batch == 1 and return a 0-dim tensor,
        # which no longer lines up with a (1,)-shaped target.
        y_out = self.fc1(x_in).squeeze(-1)
        # Optionally turn the raw logits into probabilities.
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out
2. 모델 훈련
2.1 퍼셉트론 분류기를 위한 하이퍼파라미터와 프로그램 옵션 설정
from argparse import Namespace
# All hyperparameters and program options live in one Namespace so the rest of
# the script can read them as attributes (args.batch_size, args.cuda, ...).
args = Namespace(**{
    # --- data and path information ---
    'frequency_cutoff': 25,
    'model_state_file': 'model.pth',
    # The "lite" split file is used here; the full dataset would be
    # 'data/yelp/reviews_with_splits_full.csv'.
    'review_csv': 'data/yelp/reviews_with_splits_lite.csv',
    'save_dir': 'model_storage/ch3/yelp/',
    'vectorizer_file': 'vectorizer.json',
    # --- model hyperparameters: none ---
    # --- training hyperparameters ---
    'batch_size': 128,
    'early_stopping_criteria': 5,
    'learning_rate': 0.001,
    'num_epochs': 100,
    'seed': 1337,
    # --- runtime options ---
    'catch_keyboard_interrupt': True,
    'cuda': True,
    'expand_filepaths_to_save_dir': True,
    'reload_from_files': False,
})
2.2 데이터셋, 모델, 손실, 옵티마이저, 훈련 상태 딕셔너리 생성
import torch.optim as optim
def make_train_state(args):
    """Create a fresh dictionary that tracks training progress.

    Args:
        args: program options. Accepted so callers can pass the options
            object, but not read by this function.

    Returns:
        dict: the current epoch index, per-epoch train/validation loss and
        accuracy histories (empty lists), and the test loss/accuracy, which
        hold the placeholder -1 until a test evaluation stores real values.
    """
    return {
        'epoch_index': 0,
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': [],
        'test_loss': -1,
        'test_acc': -1,
    }
train_state = make_train_state(args)

# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# Dataset and vectorizer.
# NOTE(review): ReviewDataset is defined outside this snippet; the accessor is
# assumed to be spelled get_vectorizer (the original call was misspelled).
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

# Model: one input feature per entry of the review vocabulary.
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

# Loss function and optimizer. BCEWithLogitsLoss takes raw logits, which is
# why the classifier is called without apply_sigmoid during training.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
- args๊ฐ์ฒด๋ฅผ ๋งค๊ฐ๋ณ์๋ก ๋ฐ์์ ํ๋ จ ์ํ๋ฅผ ์ด๊ธฐํํ๋ ํจ์ ์์ฑ (make_train_state)
- ๋ฐ์ดํฐ์
๊ณผ ๋ชจ๋ธ์์ฑ
- ReviewDataset ํด๋์ค์์ vectorizer๋ฅผ ์์ฑํด์ค๋ค.
- ์์คํจ์๋ BCEWithLogistitsLoss() ์ฌ์ฉ
- ์ตํฐ๋ง์ด์ ๋ Adam
2.3 ํ๋ จ ๋ฐ๋ณต
# Repeat for the number of epochs configured in args.
# NOTE(review): train_bar and val_bar are not created anywhere in the visible
# code; presumably progress-bar objects set up beforehand — confirm.
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # ---- Pass over the training split ----
    # Prepare the batch generator and reset the running loss and accuracy.
    dataset.set_split('train')
    batch_generator = generate_batches(dataset,
                                       batch_size=args.batch_size,
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # The training routine consists of five steps.
        # --------------------------------------
        # Step 1. Reset the gradients to zero.
        optimizer.zero_grad()

        # Step 2. Compute the output.
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # Step 3. Compute the loss.
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_t = loss.item()
        # Incremental mean: running_loss is the average over batches so far.
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # Step 4. Use the loss to compute the gradients.
        loss.backward()

        # Step 5. Let the optimizer update the weights.
        optimizer.step()
        # -----------------------------------------

        # Compute the accuracy (same incremental-mean update as the loss).
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # Update the progress display.
        train_bar.set_postfix(loss=running_loss,
                              acc=running_acc,
                              epoch=epoch_index)
        train_bar.update()

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    # ---- Pass over the validation split ----
    # Prepare the batch generator and reset the running loss and accuracy.
    dataset.set_split('val')
    batch_generator = generate_batches(dataset,
                                       batch_size=args.batch_size,
                                       device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    # No weights are updated here, so skip building the autograd graph.
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            # Step 1. Compute the output.
            y_pred = classifier(x_in=batch_dict['x_data'].float())

            # Step 2. Compute the loss.
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # Step 3. Compute the accuracy.
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            val_bar.set_postfix(loss=running_loss,
                                acc=running_acc,
                                epoch=epoch_index)
            val_bar.update()

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
- ๋ด๋ถ for loop : ๋ฏธ๋ ๋ฐฐ์น์ ๋ํด์ ๋ฐ๋ณต ์ํ
- ๋ฏธ๋๋ฐฐ์น : ์์ธก - ์์ค๊ณ์ฐ - ์ ํ๋ ๊ณ์ฐ
- ์ธ๋ถ for loop : ๋ด๋ถ๋ฐ๋ณต๋ฌธ์ ์ฌ๋ฌ๋ฒ ๋ฐ๋ณตํ๋ค. ๋ด๋ถ ๋ฐ๋ณต๋ฌธ์์ ๋ฏธ๋๋ฐฐ์น๋ง๋ค ์์ค์ ๊ณ์ฐํ๊ณ ์ตํฐ๋ง์ด์ ๊ฐ ๋ชจ๋ธ ํ๋ผ๋ฏธํฐ๋ฅผ ์ ๋ฐ์ดํธ ํด์ค๋ค.
2.4 ํ๊ฐ, ์ถ๋ก , ๋ถ์
2.4.1 ํ ์คํธ ๋ฐ์ดํฐ๋ก ํ๊ฐํ๊ธฐ
- ์ฌ์ฉ ๋ฐ์ดํฐ๋ฅผ val ๋์ test๋ก ์ง์
# Evaluate on the held-out test split.
dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

# Evaluation only: no gradients are needed.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        # Step 1. Compute the output.
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # Step 2. Compute the loss (incremental mean over batches).
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # Step 3. Compute the accuracy (incremental mean over batches).
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

# 'test_loss' / 'test_acc' are scalar slots (initialised to -1), not lists, so
# the final values are assigned; the format specs below need plain numbers.
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))
2.4.2 새로운 데이터 포인트를 추론하여 분류하기
# Tokenize text with regular expressions.
def preprocess_text(text):
    """Lower-case the text and space out punctuation so it can be split on spaces.

    Args:
        text (str): raw text.

    Returns:
        str: the lower-cased text in which each of . , ! ? is surrounded by
        spaces and every run of other non-letter characters (digits,
        whitespace, symbols) is collapsed into a single space. Leading and
        trailing spaces are not stripped.
    """
    text = text.lower()
    # Put spaces around sentence punctuation so each mark becomes its own token.
    text = re.sub(r"([.,!?])", r" \1 ", text)
    # Replace everything that is neither a letter nor kept punctuation.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    """Predict the rating of a review.

    Args:
        review (str): the review text.
        classifier (ReviewClassifier): the trained model.
        vectorizer (ReviewVectorizer): the Vectorizer object.
        decision_threshold (float): decision boundary that separates the
            two classes.

    Returns:
        The entry that vectorizer.rating_vocab.lookup_index returns for the
        predicted class index (0 or 1) — presumably the rating label.
    """
    review = preprocess_text(review)

    # Cast explicitly to float32 so the input matches the default dtype of
    # the model's Linear layer regardless of what the vectorizer returns.
    vectorized_review = torch.tensor(vectorizer.vectorize(review),
                                     dtype=torch.float32)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        # view(1, -1) adds the batch axis the classifier expects.
        result = classifier(vectorized_review.view(1, -1))
        probability_value = torch.sigmoid(result).item()

    # Class 1 unless the probability falls below the threshold.
    index = 0 if probability_value < decision_threshold else 1

    return vectorizer.rating_vocab.lookup_index(index)
# Predict and classify one sample sentence.
test_review = "this is a pretty awesome book"

# predict_rating builds its input tensor without a device argument, so the
# model is moved to the CPU before calling it.
classifier = classifier.cpu()
prediction = predict_rating(
    review=test_review,
    classifier=classifier,
    vectorizer=vectorizer,
    decision_threshold=0.5,
)
print("{} -> {}".format(test_review, prediction))
2.4.3 모델 가중치 분석
훈련이 끝난 뒤 모델이 잘 작동하는지 알아보기 위해 가중치를 분석해보자.
# Sort the weights of the single linear layer, largest first.
# .cpu() makes the later .numpy() call work even if the model is on a GPU.
fc1_weights = classifier.fc1.weight.detach().cpu()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# Top 20 words with the largest (most positive) weights.
# Slicing instead of range(20) avoids an IndexError on small vocabularies.
print("긍정 리뷰에 영향을 미치는 단어:")
print("--------------------------------------")
for vocab_index in indices[:20]:
    print(vectorizer.review_vocab.lookup_index(vocab_index))

print("====\n\n\n")

# Top 20 words with the smallest (most negative) weights.
print("부정 리뷰에 영향을 미치는 단어:")
print("--------------------------------------")
indices.reverse()
for vocab_index in indices[:20]:
    print(vectorizer.review_vocab.lookup_index(vocab_index))
반응형