Module stylotool.src.freestylo.ChiasmusAnnotation

Functions

def cosine_similarity(vec1, vec2)

This function calculates the cosine similarity between two vectors.

Parameters

vec1 : np.array
The first vector.
vec2 : np.array
The second vector.

Returns

float
The cosine similarity between vec1 and vec2.

Classes

class ChiasmusAnnotation (text: freestylo.TextObject.TextObject, window_size=30)

This class is used to find chiasmus candidates in a text. It uses the TextObject class to store the text and its annotations.

Parameters

text : TextObject
The text to be analyzed.
window_size : int, optional
The window size to search for chiasmus candidates
Expand source code
class ChiasmusAnnotation:
    """
    Find chiasmus candidates in a text.

    The annotation attaches itself to a TextObject and searches its
    POS-tagged tokens for criss-cross patterns (A B ... B' A').  Candidates
    can then be scored with a pre-trained classification model whose
    features follow Dubremetz & Nivre.
    """
    def __init__(self, text : TextObject, window_size=30):
        """
        Parameters
        ----------
        text : TextObject
            The text to be analyzed.
        window_size : int, optional
            The window size to search for chiasmus candidates.
        """
        self.text = text
        # Register this annotation on the text so it travels with it.
        text.annotations.append(self)
        self.window_size = window_size
        self.candidates = []   # list of ChiasmusCandidate, filled by find_candidates
        self.denylist = []     # POS tags never allowed as supporting words
        self.allowlist = []    # if non-empty, only these POS tags are allowed
        self.neglist = []      # negation cue words (Dubremetz lexical feature)
        self.poslist = []      # positive cue words (kept for API compatibility)
        self.conjlist = []     # conjunction cue words (Dubremetz lexical feature)
        self.type = "chiasmus"
        self.model = None      # classifier set by load_classification_model


    def find_candidates(self):
        """
        Find chiasmus candidates in the text.

        For every token, search within ``window_size`` tokens for an outer
        pair (A ... A') with matching POS tags, then look for an inner pair
        (B ... B') strictly between them.  Each (A, B, B', A') quadruple is
        appended to ``self.candidates``.
        """
        pos = self.text.pos

        outer_matches = []
        for i in range(len(pos)):
            outer_matches += self._find_matches(i, i + self.window_size)

        for A, A_ in outer_matches:
            for B, B_ in self._find_matches(A + 1, A_):
                self.candidates.append(ChiasmusCandidate(A, B, B_, A_))

    def load_classification_model(self, model_path):
        """
        Load a pickled classification model used to score the candidates.

        Parameters
        ----------
        model_path : str
            The path to the model file.
        """
        import pickle
        # NOTE(security): pickle.load executes arbitrary code; only load
        # model files from trusted sources.
        with open(get_model_path(model_path), "rb") as f:
            self.model = pickle.load(f)

    def serialize(self) -> list:
        """
        Serialize the chiasmus candidates.

        Returns
        -------
        list
            One dict per candidate with its ids, the four supporting-word
            indices and the score (None until score_candidates ran).
        """
        return [
            {
                "ids": c.ids,
                "A": c.A,
                "B": c.B,
                "B_": c.B_,
                "A_": c.A_,
                "score": c.score,
            }
            for c in self.candidates
        ]

    def _find_matches(self, start : int, end : int) -> list:
        """
        Find tokens in (start, end) whose POS tag matches the one at start.

        Parameters
        ----------
        start : int
            The index of the anchor token.
        end : int
            The exclusive end index of the search window.

        Returns
        -------
        list
            (start, i) index pairs whose POS tags are equal.
        """
        pos = self.text.pos
        # Clamp the window to the text length instead of swallowing
        # IndexError inside the loop.
        end = min(end, len(pos))

        if not self._check_pos(pos[start]):
            return []
        return [(start, i) for i in range(start + 1, end) if pos[i] == pos[start]]

    def _check_pos(self, pos):
        """
        Check whether a POS tag may serve as a supporting word.

        A tag passes when it is in the allowlist (if one is set) and not in
        the denylist (if one is set).

        Parameters
        ----------
        pos : str
            The POS tag to check.
        """
        if self.allowlist and pos not in self.allowlist:
            return False
        if self.denylist and pos in self.denylist:
            return False
        return True

    def has_candidates(self):
        """
        Return True if the text has at least one chiasmus candidate.
        """
        return len(self.candidates) > 0

    def score_candidates(self):
        """
        Score all candidates with the loaded classification model.

        Returns
        -------
        bool
            True on success, False when no model has been loaded.
        """
        # Check the model first so we do not extract features for nothing.
        if self.model is None:
            print("Load Chiasmus Model before scoring the candidates")
            return False
        features = np.stack([self.get_features(c) for c in self.candidates])
        scores = self.model.decision_function(features)
        for score, candidate in zip(scores, self.candidates):
            candidate.score = score
        return True

    def get_features(self, candidate):
        """
        Extract the full feature vector for a chiasmus candidate.

        Parameters
        ----------
        candidate : ChiasmusCandidate
            The candidate to extract features from.

        Returns
        -------
        np.array
            Concatenation of Dubremetz, lexical and semantic features.
        """
        dubremetz_features = self.get_dubremetz_features(candidate)
        lexical_features = self.get_lexical_features(candidate)
        semantic_features = self.get_semantic_features(candidate)
        return np.concatenate((dubremetz_features, lexical_features, semantic_features))

    @staticmethod
    def _ngram_overlap(s1, s2, n):
        """
        Number of shared character n-grams of s1 and s2, normalized by the
        larger n-gram set size plus one (matches the original smoothing).
        """
        grams1 = {s1[i:i + n] for i in range(len(s1) - n + 1)}
        grams2 = {s2[i:i + n] for i in range(len(s2) - n + 1)}
        shared = len(grams1 & grams2)
        return shared / max(len(grams1) + 1, len(grams2) + 1)

    def get_dubremetz_features(self, candidate):
        """
        Extract Dubremetz & Nivre surface features for a chiasmus candidate:
        punctuation counts, lemma repetition, size, similarity of the two
        outer gaps, lexical cues and dependency-tag agreement.

        NOTE(review): two bugs were fixed here (soft-punctuation block was a
        copy of the hard-punctuation one, and rep_a2 was never appended), so
        models trained on the old feature vectors should be re-validated.

        Parameters
        ----------
        candidate : ChiasmusCandidate
            The candidate to extract features from.

        Returns
        -------
        np.array
            An array of Dubremetz features.
        """
        tokens = self.text.tokens
        lemmas = self.text.lemmas
        dep = self.text.dep

        a, b, b_, a_ = candidate.ids

        hardp_list = ['.', '(', ')', "[", "]"]
        softp_list = [',', ';']

        features = []

        # --- Punctuation ---

        # Hard punctuation types present in the two outer gaps (A..B, B'..A').
        num_hardp = 0
        for h in hardp_list:
            if h in tokens[a + 1:b]:
                num_hardp += 1
            if h in tokens[b_ + 1:a_]:
                num_hardp += 1
        features.append(num_hardp)

        # Soft punctuation types present in the two outer gaps.
        # Bug fix: this block previously re-counted hardp_list.
        num_softp = 0
        for s in softp_list:
            if s in tokens[a + 1:b]:
                num_softp += 1
            if s in tokens[b_ + 1:a_]:
                num_softp += 1
        features.append(num_softp)

        # Hard punctuation types present in the central gap (B..B').
        num_central = 0
        for h in hardp_list:
            if h in tokens[b + 1:b_]:
                num_central += 1
        features.append(num_central)

        # --- Lemma repetition elsewhere in the text ---
        # Extra occurrences of each supporting lemma, excluding the
        # candidate's own matching pair.

        rep_a1 = lemmas.count(lemmas[a]) - 1
        if lemmas[a] == lemmas[a_]:
            rep_a1 -= 1
        features.append(rep_a1)

        rep_b1 = lemmas.count(lemmas[b]) - 1
        if lemmas[b] == lemmas[b_]:
            rep_b1 -= 1
        features.append(rep_b1)

        rep_b2 = lemmas.count(lemmas[b_]) - 1
        if lemmas[b] == lemmas[b_]:
            rep_b2 -= 1
        features.append(rep_b2)

        rep_a2 = lemmas.count(lemmas[a_]) - 1
        if lemmas[a] == lemmas[a_]:
            rep_a2 -= 1
        # Bug fix: rep_b2 was appended here instead of rep_a2.
        features.append(rep_a2)

        # --- Size ---

        # Asymmetry of the two outer gaps.
        features.append(abs((b - a) - (a_ - b_)))
        # Tokens spanned from B to A'.
        features.append(a_ - b)

        # --- Similarity of the two outer gaps ---

        left_tokens = tokens[a + 1:b]
        right_tokens = tokens[b_ + 1:a_]
        left_str = " ".join(left_tokens)
        right_str = " ".join(right_tokens)

        # Exact string match of the two gaps.
        features.append(left_str == right_str)

        # Left-gap lemmas that also occur in the right gap.
        right_lemmas = lemmas[b_ + 1:a_]
        same_tok = 0
        for lemma in lemmas[a + 1:b]:
            if lemma in right_lemmas:
                same_tok += 1
        features.append(same_tok)

        # Shared lemmas normalized by the left-gap length (b - a >= 1).
        features.append(same_tok / (b - a))

        # Character bigram / trigram overlap of the two gap strings.
        features.append(self._ngram_overlap(left_str, right_str, 2))
        features.append(self._ngram_overlap(left_str, right_str, 3))

        # Distinct tokens shared between the two gaps.
        features.append(len(set(left_tokens) & set(right_tokens)))

        # --- Lexical cues in the central gap ---

        central = tokens[b + 1:b_] + lemmas[b + 1:b_]
        features.append(1 if any(c in central for c in self.conjlist) else 0)
        features.append(1 if any(n in central for n in self.neglist) else 0)

        # --- Dependency-tag agreement between supporting words ---

        features.append(1 if dep[b] == dep[a_] else 0)
        features.append(1 if dep[a] == dep[b_] else 0)
        features.append(1 if dep[b] == dep[b_] else 0)
        features.append(1 if dep[a] == dep[a_] else 0)

        return np.array(features)

    def get_lexical_features(self, candidate):
        """
        Extract lexical features: pairwise lemma identity of the four
        supporting words.

        Parameters
        ----------
        candidate : ChiasmusCandidate
            The candidate to extract features from.

        Returns
        -------
        np.array
            Six binary features, one per unordered pair of supporting words.
        """
        lemmas = self.text.lemmas
        main = [lemmas[i] for i in candidate.ids]
        features = [
            1 if main[i] == main[j] else 0
            for i in range(len(main))
            for j in range(i + 1, len(main))
        ]
        return np.array(features)

    def get_semantic_features(self, candidate):
        """
        Extract semantic features: pairwise cosine similarity of the word
        vectors of the four supporting words.

        Parameters
        ----------
        candidate : ChiasmusCandidate
            The candidate to extract features from.

        Returns
        -------
        np.array
            Six cosine-similarity features, one per unordered pair.
        """
        vectors = self.text.vectors
        main = [vectors[i] for i in candidate.ids]
        features = [
            cosine_similarity(main[i], main[j])
            for i in range(len(main))
            for j in range(i + 1, len(main))
        ]
        return np.array(features)

Methods

def find_candidates(self)

This method finds chiasmus candidates in the text. It uses the window_size to search for candidates.

def get_dubremetz_features(self, candidate)

This method extracts Dubremetz features for a chiasmus candidate.

Returns

np.array
An array of Dubremetz features
def get_features(self, candidate)

This method extracts features for a chiasmus candidate.

Parameters

candidate : ChiasmusCandidate
The candidate to extract features from.

Returns

np.array
An array of features.
def get_lexical_features(self, candidate)

This method extracts lexical features for a chiasmus candidate.

Returns

np.array
An array of lexical features
def get_semantic_features(self, candidate)

This method extracts semantic features for a chiasmus candidate.

Returns

np.array
An array of semantic features
def has_candidates(self)

This method checks if the text has chiasmus candidates.

def load_classification_model(self, model_path)

This method loads a classification model to score the chiasmus candidates.

Parameters

model_path : str
The path to the model file.
def score_candidates(self)

This method scores the chiasmus candidates.

def serialize(self) ‑> list

This method serializes the chiasmus candidates.

Returns

list
A list of serialized candidates.
class ChiasmusCandidate (A, B, B_, A_)

This class represents a chiasmus candidate.

Parameters

A : int
Index of the first supporting word
B : int
Index of the second supporting word
B_ : int
Index of the third supporting word, paired with B
A_ : int
Index of the fourth supporting word, paired with A
Expand source code
class ChiasmusCandidate:
    """
    Container for one chiasmus candidate: the four supporting word indices
    A, B, B_ and A_ (A pairs with A_, B pairs with B_) plus a score slot.
    """
    def __init__(self, A, B, B_, A_):
        """
        Parameters
        ----------
        A : int
            Index of the first supporting word
        B : int
            Index of the second supporting word
        B_ : int
            Index of the third supporting word, paired with B
        A_ : int
            Index of the fourth supporting word, paired with A
        """
        self.ids = [A, B, B_, A_]
        self.A, self.B = A, B
        self.B_, self.A_ = B_, A_
        # Filled in later when the candidate is scored by a classifier.
        self.score = None

    def __str__(self):
        """
        Return the four supporting word indices as a space-separated string.
        """
        return " ".join(str(index) for index in self.ids)