Module stylotool.src.freestylo.MGHPreprocessor

Classes

class MGHPreprocessor

This class preprocesses Middle High German text.

Constructor for the MGHPreprocessor class.

Expand source code
class MGHPreprocessor:
    """
    This class preprocesses Middle High German text.

    Instances are callable: ``preprocessor(text)`` normalizes the text,
    splits it on whitespace and returns one MGHToken per word.
    """

    # Characters the custom tokenizer treats as word separators.
    _SEPARATORS = (" ", "\n", "\t")

    def __init__(self):
        """
        Constructor for the MGHPreprocessor class.

        Loads the fastText model file "fasttext_mgh.bin" that is later used
        to look up word vectors.
        """
        self.text = ""
        self.model = fasttext.load_model(get_model_path("fasttext_mgh.bin"))

    # make class callable with ()
    def __call__(self, text):
        """
        This method preprocesses Middle High German text.

        The normalized text is stored in ``self.text``; token indices refer
        to character positions in that normalized text.

        Parameters
        ----------
        text : str
            The text to be preprocessed.

        Returns
        -------
        list
            A list of MGH tokens. Empty words (empty text, leading
            whitespace) produce no token.
        """
        self.text = normalize_middle_high_german(text)

        tokens = []
        pos_tagger = POSTag('middle_high_german')
        lemmatizer = BackoffMHGLemmatizer()

        # custom tokenizer, because I need the character index of the word
        idx = 0
        while idx is not None:
            word, next_idx = self.get_next_word(self.text, idx)

            # An empty word carries no linguistic content; skip it instead of
            # feeding it to the tagger/lemmatizer.
            if word:
                pos = pos_tagger.tag_tnt(word)[0][1]

                # Of the entries returned for this word, keep the shortest.
                lemma = min(lemmatizer.lemmatize([word])[0][1], key=len)

                # No dependency information is computed here.
                dep = ""

                vector = self.model.get_word_vector(word)

                tokens.append(MGHToken(word, pos, lemma, dep, vector, idx))

            idx = next_idx
        return tokens

    def get_next_word(self, text, idx):
        """
        This method extracts the word starting at ``idx`` and locates the
        start of the following word.

        Words are separated by spaces, newlines and tabs.

        Parameters
        ----------
        text : str
            The text to be searched.
        idx : int
            The index at which the current word starts.

        Returns
        -------
        str
            The word starting at ``idx`` (empty if ``idx`` points at a
            separator or past the end of the text).
        int or None
            The start index of the following word, or None if there is no
            further word in the text.
        """
        separators = self._SEPARATORS
        length = len(text)

        # find end of current word
        cursor = idx
        while cursor < length and text[cursor] not in separators:
            cursor += 1
        end_word = cursor

        # find start of next word
        while cursor < length and text[cursor] in separators:
            cursor += 1

        # Reaching (or starting beyond) the end of the text means there is no
        # next word; returning None lets callers terminate their loop.
        next_word = cursor if cursor < length else None

        return text[idx:end_word], next_word

Methods

def get_next_word(self, text, idx)

This method finds the next word in a text.

Parameters

text : str
The text to be searched.
idx : int
The index of the current word.

Returns

str
The word that starts at idx.
int or None
The start index of the following word, or None if there is no further word.
class MGHToken (text, pos, lemma, dep, vector, idx)

This class represents a Middle High German token.

Constructor for the MGHToken class.

Parameters

text : str
The text of the token.
pos : str
The part of speech of the token.
lemma : str
The lemma of the token.
dep : str
The dependency of the token.
vector : np.array
The vector representation of the token.
idx : int
The index of the token in the text.
Expand source code
class MGHToken:
    """
    A single Middle High German token together with its annotations.
    """
    def __init__(self, text, pos, lemma, dep, vector, idx):
        """
        Store the given annotations on the new token, unchanged.

        Parameters
        ----------
        text : str
            The text of the token.
        pos : str
            The part of speech of the token.
        lemma : str
            The lemma of the token.
        dep : str
            The dependency of the token.
        vector : np.array
            The vector representation of the token.
        idx : int
            The index of the token in the text.
        """
        # Surface form and its position in the text.
        self.text, self.pos = text, pos
        # Linguistic annotations.
        self.lemma, self.dep = lemma, dep
        # Embedding and character offset.
        self.vector, self.idx = vector, idx