# sawaw/methods/tokenizer.py

from sawaw import SAWAWEntry, SentimentResult
import torch
import gensim
import gensim.downloader
from loguru import logger
from typing import Optional

class GensimTokenizer:
    """Convert words and sentences into GloVe embedding tensors.

    Wraps the ``glove-twitter-25`` vectors loaded through
    ``gensim.downloader``. Words that are missing from the vocabulary are
    represented by all-zero vectors so every token still occupies one row.
    """

    def __init__(self):
        glove_vectors = gensim.downloader.load('glove-twitter-25')
        self.gensim_model: gensim.models.keyedvectors.KeyedVectors = glove_vectors

    @property
    def vector_size(self) -> int:
        """Width of one word embedding, as reported by the loaded model."""
        return self.gensim_model.vector_size

    def word2vec(self, word: str, as_torch_tensor: bool = False, zero_if_not_found: bool = True):
        """Look up the embedding of a single word.

        Args:
            word: Token to look up in the model vocabulary.
            as_torch_tensor: When true, return a ``float32`` ``torch.Tensor``
                instead of the raw model vector / zero list.
            zero_if_not_found: When true, unknown words yield a zero vector
                of length ``vector_size``; otherwise they yield ``None``.

        Returns:
            The embedding (model vector, list of zeros, or tensor), or
            ``None`` for an unknown word when ``zero_if_not_found`` is false.
        """
        if word in self.gensim_model:
            vector = self.gensim_model[word]
        elif zero_if_not_found:
            # Float zeros keep the dtype aligned with real embeddings so the
            # resulting tensors can be stacked together.
            vector = [0.0] * self.vector_size
        else:
            # Nothing to convert; torch.tensor(None) would raise.
            return None
        if as_torch_tensor:
            return torch.tensor(vector, dtype=torch.float32)
        return vector

    def sentence2vec(self, sentence: str, pad_to_len: Optional[int] = None) -> torch.Tensor:
        """Embed every token of ``sentence`` into one 2-D tensor.

        The sentence is tokenized with ``gensim.utils.simple_preprocess``.

        Args:
            sentence: Raw text to embed.
            pad_to_len: If given, the result is zero-padded or truncated to
                exactly this many rows; a warning is logged on truncation.

        Returns:
            Tensor of shape ``(num_rows, vector_size)`` where ``num_rows`` is
            ``pad_to_len`` when provided, else the number of tokens (which
            may be zero).
        """
        list_of_words = gensim.utils.simple_preprocess(sentence)
        dim = self.vector_size
        if list_of_words:
            vec_result = torch.stack([
                self.word2vec(word, as_torch_tensor=True, zero_if_not_found=True)
                for word in list_of_words
            ])  # shape: (num_of_words, dim)
        else:
            # torch.stack rejects an empty list, so build the empty result directly.
            vec_result = torch.zeros((0, dim), dtype=torch.float32)

        if pad_to_len is None:
            return vec_result

        num_words = vec_result.shape[0]
        if num_words < pad_to_len:
            padding = torch.zeros((pad_to_len - num_words, dim), dtype=vec_result.dtype)
            vec_result = torch.cat([vec_result, padding])
        elif num_words > pad_to_len:
            vec_result = vec_result[:pad_to_len]
            # list_of_words[pad_to_len] is the first token that was cut off.
            logger.warning("Dropping words after '{}' from sentence '{}'", list_of_words[pad_to_len], sentence)
        return vec_result


def _to_vec(sentence: str, aspect_word: str, tokenizer: GensimTokenizer, max_len: int = 25) -> torch.Tensor:
    """Embed ``sentence`` and append a column flagging aspect tokens.

    Both ``sentence`` and ``aspect_word`` are tokenized with
    ``gensim.utils.simple_preprocess``. Each of the first ``max_len``
    sentence tokens that also appears among the aspect tokens gets a 1 in
    the extra column; all other rows (including padding) get 0.

    Returns:
        The ``tokenizer.sentence2vec`` output for ``pad_to_len=max_len`` with
        one indicator column concatenated on the right.
    """
    embedded = tokenizer.sentence2vec(sentence, pad_to_len=max_len)  # (max_len, embedding_dim)

    aspect_tokens = gensim.utils.simple_preprocess(aspect_word)
    flags = torch.zeros(max_len, 1)

    # Only positions that survive truncation can be flagged.
    sentence_tokens = gensim.utils.simple_preprocess(sentence)
    for position, token in enumerate(sentence_tokens[:max_len]):
        if token in aspect_tokens:
            flags[position] = 1

    # Embedding columns first, indicator column last.
    return torch.cat((embedded, flags), dim=1)


# Module-level tokenizer shared by to_vec(). Constructing it runs
# GensimTokenizer.__init__, which loads the 'glove-twitter-25' vectors, so
# importing this module triggers that load.
gt = GensimTokenizer()

def to_vec(entry: SAWAWEntry, max_len: int = 80, should_return_sentiment: bool = True) -> "torch.Tensor | tuple[torch.Tensor, torch.Tensor]":
    """Encode every aspect word of ``entry`` against its comment.

    For each item of ``entry.aspect_words`` the comment is vectorized with
    ``_to_vec`` using the module-level tokenizer ``gt``.

    Args:
        entry: Entry providing ``comment``, ``aspect_words`` and (when
            sentiments are requested) ``sentiment_results``.
        max_len: Number of token rows per encoded sentence.
        should_return_sentiment: When true, also return one numeric label
            per aspect word (negative = 0.0, positive = 1.0,
            neutral / none / undefined = 0.5).

    Returns:
        The stacked encodings, one slice per aspect word. When
        ``should_return_sentiment`` is true, a tuple of
        ``(encodings, labels)`` where ``labels`` is a ``float32`` tensor
        with one entry per aspect word.
    """
    aspect_word_encoded_sentence = []
    sentiment_scores = []
    for i, aspect_word in enumerate(entry.aspect_words):
        vec = _to_vec(entry.comment, aspect_word, gt, max_len=max_len)
        aspect_word_encoded_sentence.append(vec)
        if not should_return_sentiment:
            continue

        sent = entry.sentiment_results[i]
        if sent == SentimentResult.UNDEFINED:
            logger.warning("Sentiment result for aspect word '{}' is undefined, but to_vec is called with should_return_sentiment=True. Assuming neutral.", aspect_word)
            score = 0.5
        elif sent == SentimentResult.NEGATIVE:
            score = 0.0
        elif sent == SentimentResult.POSITIVE:
            score = 1.0
        elif sent == SentimentResult.NEUTRAL or sent == SentimentResult.NONE:
            score = 0.5
        else:
            # Keep labels aligned with encodings instead of silently reusing
            # the previous aspect word's label.
            logger.warning("Unrecognized sentiment result '{}' for aspect word '{}'. Assuming neutral.", sent, aspect_word)
            score = 0.5
        # Collect one label per aspect word rather than overwriting a scalar.
        sentiment_scores.append(score)

    if aspect_word_encoded_sentence:
        encoded = torch.stack(aspect_word_encoded_sentence)
    else:
        # torch.stack rejects an empty list; return an empty batch whose
        # trailing dims match a real encoding (embedding + indicator column).
        encoded = torch.zeros((0, max_len, gt.gensim_model.vector_size + 1))

    if should_return_sentiment:
        return encoded, torch.tensor(sentiment_scores, dtype=torch.float32)
    return encoded


if __name__ == '__main__':
    # Manual smoke test: encode one raw sentence against one aspect word.
    # to_vec() expects a SAWAWEntry, so the raw-string helper _to_vec() is
    # the right entry point here. Reuse the module-level tokenizer instead of
    # loading the vectors a second time.
    tokenizer = gt
    sentence = "The pizza at this restaurant is amazing, but the service is slow."
    aspect_words = "service"
    vectorized_sentence = _to_vec(sentence, aspect_words, tokenizer)
    print(vectorized_sentence.shape)  # should be (25, 26)