【项目实战】Kaggle电影评论情感分析

2022-11-06,,,,

前言

这几天持续摆烂了几天,原因是我自己对于Kaggle电影评论情感分析的这个赛题敲出来的代码无论如何没办法运行,其中数据变换的维度我无法把握好,所以总是在函数中传错数据。今天痛定思痛,重新写了一遍代码,终于成功。

从国籍分类入手

在这个题目之前,给了一个按照姓名分类国籍的写法

https://www.bilibili.com/video/BV1Y7411d7Ys?p=13

按照这个写法我来写这个赛题,代码以及注释如下

'''''''''
构建一个RNN分类器
任务:一个名称分类器,根据输入的名字判断其国籍,数据集有Name与Country
在这个场景中,由于输出无法通过线性层映射到某个维度,所以可以只用hn来连接线性层,对这个输入做一个18维的分类
'''''''''
import csv
import gzip
import torch
import matplotlib.pyplot as plt
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader device = torch.device('cuda:0') HIDDEN_SIZE = 100
BATCH_SIZE = 256
N_LAYER = 2
N_EPOCHS = 100
N_CHARS = 128 # 字符集字典维度 class NameDataset(Dataset): #数据集类
def __init__(self,is_train_set = True):
filename = 'names_train.csv.gz' if is_train_set else 'names_test.csv.gz'
with gzip.open(filename,'rt') as f:
reader = csv.reader(f)
rows = list(reader)
self.names = [row[0] for row in rows] # 把名字字符串存到列表
self.len = len(self.names) # 数据集长度
self.countries = [row[1] for row in rows] # 所有国家字符串存到列表
self.country_list = list(sorted(set(self.countries))) # unique国家列表
self.country_dict = self.getCountryDict()
self.country_num = len(self.country_list) # unique国家数
def __getitem__(self,index):
return self.names[index],self.country_dict[self.countries[index]]
# 返回名称和国家,国家先通过index找到国家,再通过字典映射返回国家的序号
def __len__(self):
return self.len
def getCountryDict(self): # 把unique国家做成字典
country_dict = dict()
for idx,counrty_name in enumerate(self.country_list,0):
country_dict[counrty_name] = idx
return country_dict
def idx2country(self,index):
return self.country_list[index]
def getCountriesNum(self):
return self.country_num # 返回unique国家数 trainset = NameDataset(is_train_set=True)
trainloader = DataLoader(trainset,batch_size=BATCH_SIZE,shuffle=True)
testset = NameDataset(is_train_set=False)
testloader = DataLoader(testset,batch_size=BATCH_SIZE,shuffle=False) N_COUNTRY = trainset.getCountriesNum()
class RNNClassifier(torch.nn.Module):
def __init__(self,input_size,hidden_size,output_size,n_layers=1,bidirectional=True):
super(RNNClassifier, self).__init__()
self.hidden_size = hidden_size
self.n_layers = n_layers
self.n_directions = 2 if bidirectional else 1 self.embedding = torch.nn.Embedding(input_size,hidden_size) # inputs_size是名称字符集长度
self.gru = torch.nn.GRU(hidden_size,hidden_size,n_layers,bidirectional=bidirectional)
self.fc = torch.nn.Linear(hidden_size*self.n_directions,output_size) # output_size是N_COUNTRY def _init_hidden(self,batch_size):
hidden = torch.zeros(self.n_layers*self.n_directions,batch_size,self.hidden_size)
#layers*batch_size*hidden_size
return hidden.to(device) def forward(self,input,seq_lengths):
input = input.t() # b*s to s*b
batch_size = input.size(1)
hidden = self._init_hidden(batch_size)
embedding = self.embedding(input) gru_input = pack_padded_sequence(embedding,seq_lengths.cpu())
# 这是gru和lstm可以接受的一种输入, PackedSequence object
output,hidden = self.gru(gru_input,hidden)
if self.n_directions == 2:
hidden_cat = torch.cat((hidden[-1],hidden[-2]),dim=1)
else:
hiddden_cat = hidden[-1]
fc_output = self.fc(hidden_cat)
return fc_output def name2list(name):
arr = [ord(c) for c in name]
return arr,len(arr)
# 返回输入名字的asci码值的列表,和名字长度
def make_tensors(names,countries): # 把输入数据处理为tensor
sequences_and_lengths = [name2list(name) for name in names]
name_sequences = [sl[0] for sl in sequences_and_lengths]
seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths]) # seq_lengths to longTensor
countries = countries.long() # index of country to longTensor
# make tensor of name, batchsize*seqlen
seq_tensor = torch.zeros(len(name_sequences),seq_lengths.max()).long()
# 这一句先生成一个二维全0张量,高是名称序列数,宽是最长的名字长度,做成longTensor
for idx,(seq,seq_len) in enumerate(zip(name_sequences,seq_lengths),0):
seq_tensor[idx, :seq_len] = torch.LongTensor(seq) # 复制到上面的全0张量中
# sort sequences by length to use pack_padded_sequence
seq_lengths,perm_idx = seq_lengths.sort(dim=0,descending=True)
# torch中tensor类的tensor返回的是排完序列和索引
seq_tensor = seq_tensor[perm_idx]
countries = countries[perm_idx]
return seq_tensor.to(device),seq_lengths.to(device),countries.to(device) classifier = RNNClassifier(N_CHARS,HIDDEN_SIZE,N_COUNTRY,N_LAYER,True).to(device)
#classifier = classifier.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(),lr=0.001)
#start = time.time() def trainModel():
total_loss = 0
for i,(names,countries) in enumerate(trainloader,1): # i从1开始数
inputs,seq_lengths,target = make_tensors(names,countries)
output = classifier(inputs,seq_lengths) # 这是送的forward的参数.
loss = criterion(output,target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
if i%10 == 0:
print(f'Epoch{epoch}',end='')
print(f'[{i*len(inputs)}/{len(trainset)}]',end='')
print(f'loss={total_loss/(i*len(inputs))}')
return total_loss def testModel():
correct = 0
total = len(testset)
print('evaluating trained model ...')
with torch.no_grad():
for i,(names,countries) in enumerate(testloader,1):
inputs,seq_lengths,target = make_tensors(names,countries)
output = classifier(inputs,seq_lengths)
pred = output.max(dim=1,keepdim=True)[1]
correct += pred.eq(target.view_as(pred)).sum().item()
percent = '%.2f' % (100*correct/total)
print(f'Test set: Accuracy{correct}/{total} {percent}%')
return correct/total print('Training for %d epochs...' % N_EPOCHS)
acc_list = []
for epoch in range(1,30):
trainModel()
acc = testModel()
acc_list.append(acc) epoch = np.arange(1,len(acc_list)+1,1)
acc_list = np.array(acc_list)
plt.plot(epoch,acc_list)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

好,下面我们进入赛题部分

赛题理解

赛题以及数据的下载地址如下:https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

数据就是给出id,评论内容,以及标注好的情感极性,然后经过训练,测试集传入模型判断测试集评论的情感,给出csv格式文件,来进行评分,数据长得就是下面这个样子:

数据处理

由于给出的是tsv格式,所以我们用pandas自带的read来读取,只需要把phrase和sentiment取出来就行了,代码如下:

class NameDataset(Dataset): #数据集类
def __init__(self, is_train_set=True):
train = pd.read_csv('train.tsv', sep='\t') # 分隔符是空格
self.phrase = train['Phrase']
self.sentiment = train['Sentiment']
self.len = len(self.phrase) def __getitem__(self, index):
return self.phrase[index], self.sentiment[index] def __len__(self):
return self.len

由于评论是string字符,我们需要把它转成可以被接受的向量

def phrase2list(phrase):
arr = [ord(c) for c in phrase]
return arr, len(arr)
## 用ASCILL编码来转换字符 def make_tensors(phrase, sentiment):
sequences_and_lengths = [phrase2list(phrase) for phrase in phrase]
phrase_sequences = [sl[0] for sl in sequences_and_lengths]
seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])
sentiment = sentiment.long() seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths), 0):
seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
seq_lengths, prem_idx = seq_lengths.sort(dim=0, descending=True)
seq_tensor = seq_tensor[prem_idx]
sentiment = sentiment[prem_idx]
return seq_tensor.to(device), seq_lengths.to(device), sentiment.to(device)

然后测试集是不需要转变sentiment的,因为根本没有,所以在搞一个专门给测试集传字符

def make_tensors1(phrase):
sequences_and_lengths = [phrase2list(phrase) for phrase in phrase]
phrase_sequences = [sl[0] for sl in sequences_and_lengths]
seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths]) seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths), 0):
seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
seq_lengths, prem_idx = seq_lengths.sort(dim=0, descending=True)
seq_tensor = seq_tensor[prem_idx]
_, index = prem_idx.sort(descending=False)
return seq_tensor.to(device), seq_lengths.to(device), index

设计模型

首先我们确定一些超参数

device = torch.device('cuda:0')
NUM_CHARS = 128
HIDDEN_SIZE = 100
NUM_CLASS = 5
NUM_LAYERS = 2
NUM_EPOCHS = 30
BATCH_SIZE = 512

然后我们使用GRU

class RNNClassifier(torch.nn.Module):
def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
super(RNNClassifier, self).__init__()
self.hidden_size = hidden_size
self.n_layers = n_layers
self.n_direction = 2 if bidirectional else 1 self.embedding = torch.nn.Embedding(input_size, hidden_size)
self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
self.fc = torch.nn.Linear(hidden_size*self.n_direction, output_size) def _init_hidden(self, batch_size):
hidden = torch.zeros(self.n_layers*self.n_direction, batch_size, self.hidden_size)
return hidden.to(device) def forward(self, input, seq_lengths):
input = input.t()
batch_size = input.size(1)
hidden = self._init_hidden(batch_size)
embedding = self.embedding(input) gru_input = pack_padded_sequence(embedding, seq_lengths.cpu())
output, hidden = self.gru(gru_input, hidden)
if self.n_direction == 2:
hidden_cat = torch.cat((hidden[-1], hidden[-2]), dim=1)
else:
hidden_cat = hidden[-1]
fc_output = self.fc(hidden_cat)
return fc_output

模型实例化并设计损失函数和优化器

classifier = RNNClassifier(NUM_CHARS, HIDDEN_SIZE, NUM_CLASS, NUM_LAYERS, True).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)

模型训练与测试

首先来设计训练函数

def train():
total_loss = 0
for i, (phrase, sentiment) in enumerate(train_loader, 1):
inputs, seq_lengths, target = make_tensors(phrase, sentiment)
output = classifier(inputs, seq_lengths)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.zero_grad()
optimizer.step()
total_loss += loss.item()
if i % 10 == 0:
print(f'Epoch{epoch}', end='')
print(f'[{i * len(inputs)}/{len(train_set)}]', end='')
print(f'loss={total_loss / (i * len(inputs))}')
return total_loss

然后设计函数来获取测试集数据

def get_test_set():
test_set = pd.read_csv('test.tsv', '\t')
PhraseId = test_set['PhraseId']
test_Phrase = test_set['Phrase']
return PhraseId, test_Phrase

测试函数设计如下

def testModel():
PhraseId, test_Phrase = get_test_set()
sentiment_list = [] # 定义预测结果列表
batchNum = math.ceil(PhraseId.shape[0] / BATCH_SIZE)
with torch.no_grad():
for i in range(batchNum):
print(i)
if i == batchNum - 1:
phraseBatch = test_Phrase[BATCH_SIZE * i:] # 处理最后不足BATCH_SIZE的情况
else:
phraseBatch = test_Phrase[BATCH_SIZE * i:BATCH_SIZE * (i + 1)]
inputs, seq_lengths, org_idx = make_tensors1(phraseBatch)
output = classifier(inputs, seq_lengths)
sentiment = output.max(dim=1, keepdim=True)[1]
sentiment = sentiment[org_idx].squeeze(1)
sentiment_list.append(sentiment.cpu().numpy().tolist())
sentiment_list = list(chain.from_iterable(sentiment_list)) # 将sentiment_list按行拼成一维列表
result = pd.DataFrame({'PhraseId': PhraseId, 'Sentiment': sentiment_list})
result.to_csv('SA_predict.csv', index=False)

开始跑代码

if __name__ == '__main__':
start = time.time()
print('Training for %d epochs...' % NUM_EPOCHS)
acc_list = []
for epoch in range(1, NUM_EPOCHS + 1):
train()
acc = train()
acc_list.append(acc)
if acc <= min(acc_list):
torch.save(classifier, 'sentimentAnalyst.pkl')
print('Save Model!')
testModel()
epoch = [epoch + 1 for epoch in range(len(acc_list))]
plt.plot(epoch, acc_list)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

结果

淦,跑完忘记截图platshow的图了,但是我上传kaggle,一跑完只有0.2分,哪里出了问题呢

改成14次,这次截图了

我陷入了沉思

inputs, seq_lengths, target = make_tensors(phrase, sentiment)
output = classifier(inputs, seq_lengths)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.zero_grad()
optimizer.step()
total_loss += loss.item()

问题出在这里

我他妈的一不小心多写了一步 optimizer.zero_grad(),结果跑了几个小时都在原地踏步

现在我们用修改后的模型再试一下

跑完十次的结果和分数如下

完整代码

import math
from itertools import chain
import torch
import matplotlib.pyplot as plt
import pandas as pd
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader class NameDataset(Dataset): #数据集类
def __init__(self):
self.train = pd.read_csv('train.tsv', sep='\t')
self.phrase = self.train['Phrase']
self.sentiment = self.train['Sentiment']
self.len = self.train.shape[0] def __getitem__(self, index):
return self.phrase[index], self.sentiment[index] def __len__(self):
return self.len device = torch.device('cuda:0')
NUM_CHARS = 128
HIDDEN_SIZE = 128
NUM_LAYERS = 2
NUM_EPOCHS = 10
BATCH_SIZE = 512
train_set = NameDataset()
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
NUM_CLASS = len(set(train_set.sentiment)) class RNNClassifier(torch.nn.Module):
def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=True):
super(RNNClassifier, self).__init__()
self.hidden_size = hidden_size
self.n_layers = n_layers
self.n_direction = 2 if bidirectional else 1 self.embedding = torch.nn.Embedding(input_size, hidden_size)
self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
self.fc = torch.nn.Linear(hidden_size*self.n_direction, output_size) def _init_hidden(self, batch_size):
hidden = torch.zeros(self.n_layers*self.n_direction, batch_size, self.hidden_size)
return hidden.to(device) def forward(self, input, seq_lengths):
input = input.t()
batch_size = input.size(1)
hidden = self._init_hidden(batch_size)
embedding = self.embedding(input) gru_input = pack_padded_sequence(embedding, seq_lengths.cpu())
output, hidden = self.gru(gru_input, hidden)
if self.n_direction == 2:
hidden_cat = torch.cat((hidden[-1], hidden[-2]), dim=1)
else:
hidden_cat = hidden[-1]
fc_output = self.fc(hidden_cat)
return fc_output def phrase2list(phrase):
arr = [ord(c) for c in phrase]
return arr, len(arr) def make_tensors(phrase, sentiment):
sequences_and_lengths = [phrase2list(phrase) for phrase in phrase]
phrase_sequences = [sl[0] for sl in sequences_and_lengths]
seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths])
sentiment = sentiment.long() seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths), 0):
seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
seq_lengths, prem_idx = seq_lengths.sort(dim=0, descending=True)
seq_tensor = seq_tensor[prem_idx]
sentiment = sentiment[prem_idx]
return seq_tensor.to(device), seq_lengths.to(device), sentiment.to(device) def make_tensors1(phrase):
sequences_and_lengths = [phrase2list(phrase) for phrase in phrase]
phrase_sequences = [sl[0] for sl in sequences_and_lengths]
seq_lengths = torch.LongTensor([sl[1] for sl in sequences_and_lengths]) seq_tensor = torch.zeros(len(phrase_sequences), seq_lengths.max()).long()
for idx, (seq, seq_len) in enumerate(zip(phrase_sequences, seq_lengths), 0):
seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
seq_lengths, prem_idx = seq_lengths.sort(dim=0, descending=True)
seq_tensor = seq_tensor[prem_idx]
_, index = prem_idx.sort(descending=False)
return seq_tensor.to(device), seq_lengths.to(device), index def train():
total_loss = 0
for i, (phrase, sentiment) in enumerate(train_loader, 1):
inputs, seq_lengths, target = make_tensors(phrase, sentiment)
output = classifier(inputs, seq_lengths)
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
if i % 10 == 0:
print(f'Epoch{epoch}', end='')
print(f'[{i * len(inputs)}/{len(train_set)}]', end='')
print(f'loss={total_loss / (i * len(inputs))}')
return total_loss def get_test_set():
test_set = pd.read_csv('test.tsv', '\t')
PhraseId = test_set['PhraseId']
test_Phrase = test_set['Phrase']
return PhraseId, test_Phrase def testModel():
PhraseId, test_Phrase = get_test_set()
sentiment_list = [] # 定义预测结果列表
batchNum = math.ceil(PhraseId.shape[0] / BATCH_SIZE)
with torch.no_grad():
for i in range(batchNum):
if i == batchNum - 1:
phraseBatch = test_Phrase[BATCH_SIZE * i:] # 处理最后不足BATCH_SIZE的情况
else:
phraseBatch = test_Phrase[BATCH_SIZE * i:BATCH_SIZE * (i + 1)]
inputs, seq_lengths, org_idx = make_tensors1(phraseBatch)
output = classifier(inputs, seq_lengths)
sentiment = output.max(dim=1, keepdim=True)[1]
sentiment = sentiment[org_idx].squeeze(1)
sentiment_list.append(sentiment.cpu().numpy().tolist()) sentiment_list = list(chain.from_iterable(sentiment_list)) # 将sentiment_list按行拼成一维列表
result = pd.DataFrame({'PhraseId': PhraseId, 'Sentiment': sentiment_list})
result.to_csv('SA_predict.csv', index=False) if __name__ == '__main__':
classifier = RNNClassifier(NUM_CHARS, HIDDEN_SIZE, NUM_CLASS, NUM_LAYERS).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
print('Training for %d epochs...' % NUM_EPOCHS)
acc_list = []
for epoch in range(1, NUM_EPOCHS + 1):
train()
acc = train()
acc_list.append(acc)
if acc <= min(acc_list):
torch.save(classifier, 'sentimentAnalyst.pkl')
print('Save Model!') testModel()
epoch = [epoch + 1 for epoch in range(len(acc_list))]
plt.plot(epoch, acc_list)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

其实这里可以感觉到验证集的重要性了,因为你很容易过拟合,所以这里附上大佬写的带验证集的代码链接

https://blog.csdn.net/qq_39187959/article/details/121102959

带有验证集可以让模型达到0.7分

项目实战】Kaggle电影评论情感分析的相关教程结束。

《【项目实战】Kaggle电影评论情感分析.doc》

下载本文的Word格式文档,以方便收藏与打印。