Add a new method: Naive CNN

This commit is contained in:
2023-11-17 07:01:58 +00:00
parent af858756bb
commit 3fcf73fcad
5 changed files with 229 additions and 0 deletions

0
methods/__init__.py Normal file
View File

51
methods/model.py Normal file
View File

@ -0,0 +1,51 @@
import torch.nn as nn
import torch.nn.functional as F
import torch
from transformers import BertModel
import numpy as np
class SentimentAspectCNN(nn.Module):
    """Multi-width convolutional scorer over a sequence of token vectors.

    One convolution is created per entry in ``filter_sizes``. Every kernel
    spans the full feature width (``embedding_dim``), so each filter yields a
    single activation per window of tokens. The activations are max-pooled
    over the sequence, concatenated, passed through dropout and a linear
    layer, and finally squashed by a sigmoid.

    Args:
        embedding_dim: Width of every convolution kernel; the last dimension
            of the tensor given to ``forward`` is expected to match it.
        num_filters: Number of output channels of each convolution.
        filter_sizes: Window heights in tokens, one convolution per entry.
        output_dim: Number of values produced per example.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        conv_layers = []
        for window in filter_sizes:
            conv_layers.append(
                nn.Conv2d(
                    in_channels=1,
                    out_channels=num_filters,
                    kernel_size=(window, embedding_dim),
                )
            )
        self.convs = nn.ModuleList(conv_layers)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch shaped ``(batch_size, max_length, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch_size, output_dim)`` with sigmoid outputs.
        """
        # Insert the single input channel Conv2d expects:
        # (batch_size, 1, max_length, embedding_dim)
        images = x.unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # The kernel covers the whole feature width, leaving a size-1 last
            # axis that is dropped: (batch_size, num_filters, max_length - window + 1)
            feature_map = F.relu(conv(images)).squeeze(3)
            # Max over all window positions: (batch_size, num_filters)
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))

        # (batch_size, len(filter_sizes) * num_filters)
        features = torch.cat(pooled, dim=1)
        scores = self.fc(self.dropout(features))  # (batch_size, output_dim)
        return self.sigmoid(scores)
if __name__ == "__main__":
    # Smoke check: build the network with sample hyper-parameters and print
    # its layer summary.
    embedding_dim, num_filters, output_dim = 26, 100, 1  # 26 = 25 for word embeddings + 1 for aspect indicator
    filter_sizes = [3, 4, 5]
    dropout = 0.5
    model = SentimentAspectCNN(
        embedding_dim=embedding_dim,
        num_filters=num_filters,
        filter_sizes=filter_sizes,
        output_dim=output_dim,
        dropout=dropout,
    )
    print(model)

83
methods/tokenizer.py Normal file
View File

@ -0,0 +1,83 @@
from sawaw import SAWAWEntry, SentimentResult
import torch
import gensim
import gensim.downloader
from loguru import logger
from typing import Optional
class GensimTokenizer:
    """Turn words and sentences into word vectors using a gensim model.

    The pretrained ``glove-twitter-25`` vectors are loaded through
    ``gensim.downloader`` when the tokenizer is constructed.
    """

    def __init__(self):
        glove_vectors = gensim.downloader.load('glove-twitter-25')
        self.gensim_model: gensim.models.keyedvectors.KeyedVectors = glove_vectors
        # Take the vector width from the loaded model so the zero/padding
        # vectors built below always match it.
        self.embedding_dim: int = glove_vectors.vector_size

    def word2vec(self, word: str, as_torch_tensor: bool = False, zero_if_not_found: bool = True):
        """Look up the vector for a single word.

        Args:
            word: Token to look up in the model.
            as_torch_tensor: Convert the result to a ``torch.Tensor``.
            zero_if_not_found: Return an all-zero vector for a word the model
                does not contain; when False such a word yields ``None``.

        Returns:
            The model's vector for ``word``, a list of zeros for an unknown
            word, or ``None`` (never converted to a tensor) when the word is
            unknown and ``zero_if_not_found`` is False.
        """
        if word in self.gensim_model:
            ret = self.gensim_model[word]
        elif zero_if_not_found:
            # Float zeros: integer zeros would turn into an int64 tensor and
            # give unknown words a different dtype from floating-point vectors.
            ret = [0.0] * self.embedding_dim
        else:
            # There is nothing to convert; torch.tensor(None) would raise.
            return None
        if as_torch_tensor:
            ret = torch.tensor(ret)
        return ret

    def sentence2vec(self, sentence: str, pad_to_len: Optional[int]=None) -> torch.Tensor:
        """Convert a sentence into a ``(num_of_words, embedding_dim)`` tensor.

        The sentence is tokenized with ``gensim.utils.simple_preprocess``;
        unknown words become zero vectors. A sentence without any tokens gives
        a tensor with zero rows (before padding).

        Args:
            sentence: Raw text to encode.
            pad_to_len: When given, the result is zero-padded or truncated to
                exactly this many rows; truncation logs a warning.
        """
        list_of_words = gensim.utils.simple_preprocess(sentence)
        if list_of_words:
            vec_result = torch.stack([
                self.word2vec(word, as_torch_tensor=True, zero_if_not_found=True)
                for word in list_of_words
            ])  # shape: (num_of_words, embedding_dim)
        else:
            # torch.stack rejects an empty list, so build the empty result directly.
            vec_result = torch.zeros((0, self.embedding_dim))
        if pad_to_len is None:
            return vec_result
        num_of_words = vec_result.shape[0]
        if num_of_words < pad_to_len:
            padding = torch.zeros((pad_to_len - num_of_words, self.embedding_dim))
            vec_result = torch.cat([vec_result, padding])
        elif num_of_words > pad_to_len:
            vec_result = vec_result[:pad_to_len]
            # The word reported here is the first one that was cut off.
            logger.warning("Dropping words after '{}' from sentence '{}'", list_of_words[pad_to_len], sentence)
        return vec_result
def _to_vec(sentence: str, aspect_word: str, tokenizer: GensimTokenizer, max_len: int = 25) -> torch.Tensor:
    """Encode a sentence as word vectors plus a per-token aspect flag.

    Args:
        sentence: Text to encode.
        aspect_word: Aspect text; it is tokenized the same way as the
            sentence, so it may contain several words.
        tokenizer: Provides ``sentence2vec`` for the word vectors.
        max_len: Number of rows in the result (the sentence is padded or
            truncated to this length).

    Returns:
        The sentence vectors with one extra trailing column that is 1 for
        tokens occurring in the aspect text and 0 elsewhere.
    """
    # Padded / truncated word vectors, one row per token position.
    embedded = tokenizer.sentence2vec(sentence, pad_to_len=max_len)

    aspect_tokens = gensim.utils.simple_preprocess(aspect_word)
    flags = torch.zeros(max_len, 1)

    # Only the first max_len tokens have a row in the output.
    sentence_tokens = gensim.utils.simple_preprocess(sentence)
    for position, token in enumerate(sentence_tokens[:max_len]):
        if token in aspect_tokens:
            flags[position] = 1

    # Append the flag column to the word vectors.
    return torch.cat((embedded, flags), dim=1)
# Shared module-level tokenizer used by to_vec(). Constructing it loads the
# 'glove-twitter-25' vectors, so this happens as soon as the module is imported.
gt = GensimTokenizer()
def to_vec(entry: SAWAWEntry, max_len: int = 80, should_return_sentiment: bool=True) -> "torch.Tensor | tuple[torch.Tensor, torch.Tensor]":
    """Encode every aspect word of an entry, optionally with its sentiment label.

    Args:
        entry: Supplies ``comment``, ``aspect_words`` and (when labels are
            requested) ``sentiment_results`` aligned with ``aspect_words``.
        max_len: Sequence length each encoded comment is padded/truncated to.
        should_return_sentiment: Also return the numeric sentiment labels.

    Returns:
        A tensor stacking one encoded comment per aspect word (first
        dimension = number of aspect words). With
        ``should_return_sentiment`` a tuple ``(encoded, labels)`` is returned,
        where ``labels`` is a float32 tensor holding one value per aspect
        word: 0 negative, 1 positive, 0.5 neutral/none/undefined.

    Note:
        ``entry.aspect_words`` must not be empty; ``torch.stack`` fails on an
        empty list.
    """
    aspect_word_encoded_sentence = []
    # One label per aspect word, in the same order as the encoded sentences.
    sentiment_result = []
    for i, aspect_word in enumerate(entry.aspect_words):
        vec = _to_vec(entry.comment, aspect_word, gt, max_len=max_len)
        aspect_word_encoded_sentence.append(vec)
        if not should_return_sentiment:
            continue
        sent = entry.sentiment_results[i]
        if sent == SentimentResult.UNDEFINED:
            logger.warning("Sentiment result for aspect word '{}' is undefined, but to_vec is called with should_return_sentiment=True. Assuming neutral.", aspect_word)
            score = 0.5
        elif sent == SentimentResult.NEGATIVE:
            score = 0.0
        elif sent == SentimentResult.POSITIVE:
            score = 1.0
        elif sent == SentimentResult.NEUTRAL or sent == SentimentResult.NONE:
            score = 0.5
        else:
            # Still emit a label so labels stay aligned with the encoded rows.
            logger.warning("Unrecognised sentiment result '{}' for aspect word '{}'. Assuming neutral.", sent, aspect_word)
            score = 0.5
        # Append instead of overwriting, otherwise only the last label survives.
        sentiment_result.append(score)
    encoded = torch.stack(aspect_word_encoded_sentence)
    if should_return_sentiment:
        # Fixed float dtype: inferring it from the values would give int64
        # whenever every label happens to be 0 or 1.
        return encoded, torch.tensor(sentiment_result, dtype=torch.float32)
    return encoded
if __name__ == '__main__':
    # Demo: encode one sentence for a single aspect word.
    # Reuse the module-level tokenizer rather than loading the vectors again.
    tokenizer = gt
    sentence = "The pizza at this restaurant is amazing, but the service is slow."
    aspect_words = "service"
    # The (sentence, aspect word, tokenizer) signature belongs to _to_vec();
    # to_vec() expects a SAWAWEntry and would fail on a plain string.
    vectorized_sentence = _to_vec(sentence, aspect_words, tokenizer)
    print(vectorized_sentence.shape)  # should be (25, 26)