from enum import Enum
from itertools import groupby
from typing import Type, cast, Iterable
from collections_extended import RangeMap
from fastapi.encoders import jsonable_encoder
from starlette.responses import Response, JSONResponse
from pydantic import BaseModel, Field
from pymultirole_plugins.v1.processor import ProcessorParameters, ProcessorBase
from pymultirole_plugins.v1.schema import Document, Annotation
class ConsolidationType(str, Enum):
    """Consolidation modes selectable through ``AFPEntitiesParameters.type``.

    The enum also derives from ``str``, so every member compares equal to its
    plain string value (e.g. ``ConsolidationType.linker == "linker"``).
    What each mode does is decided by the code that consumes the parameter.
    """

    linker = "linker"
    unknown = "unknown"
    unknown_only = "unknown_only"
class AFPEntitiesParameters(ProcessorParameters):
    """Parameters model exposed by ``AFPEntitiesProcessor.get_model``."""

    # Consolidation mode; ``ConsolidationType.unknown`` is used when omitted.
    type: ConsolidationType = Field(
        default=ConsolidationType.unknown,
        description="Type of consolidation",
    )
def keyfunc(a: Annotation):
    """Return the lexicon of an annotation's first term.

    Args:
        a: Annotation whose ``terms`` attribute is inspected.

    Returns:
        ``a.terms[0].lexicon`` when ``a.terms`` is non-empty, otherwise the
        empty string (this also covers ``terms`` being ``None``).
    """
    terms = a.terms
    if not terms:
        # No terms at all: fall back to an empty key.
        return ""
    return terms[0].lexicon
[docs]class AFPEntitiesProcessor(ProcessorBase):
"""AFPEntities processor."""
@classmethod
def get_model(cls) -> Type[BaseModel]:
    """Return the parameters model class associated with this processor.

    Returns:
        The ``AFPEntitiesParameters`` class itself (not an instance).
    """
    parameters_model: Type[BaseModel] = AFPEntitiesParameters
    return parameters_model
def filter_annotations(self, input: Document):
    """Remove overlapping annotations from a document, keeping the longest spans.

    Annotations are visited longest first; ties are broken by smallest
    ``start`` and then by ``labelName is None``.  An annotation is kept when
    neither its first nor its last offset is already covered by a kept
    annotation.  Otherwise it is dropped; if it shares a start or an end with
    the kept annotation that covers it and is more than 80% as long, its
    ``terms`` and ``properties`` are merged into that kept annotation before
    it is discarded.

    Args:
        input: Document whose ``annotations`` are filtered.  The annotation
            list itself is not modified, but kept annotations may have their
            ``terms`` / ``properties`` updated in place by the merge step.
            A missing (``None``) or empty list yields an empty result.

    Returns:
        list: The kept annotations, sorted by ``start``.
    """

    def sort_key(ann):
        # NOTE(review): combined with reverse=True, annotations whose labelName
        # is None sort ahead of labelled ones on exact span ties -- confirm
        # this preference is intended.
        return ann.end - ann.start, -ann.start, ann.labelName is None

    sorted_annotations: Iterable[Annotation] = sorted(
        input.annotations or [], key=sort_key, reverse=True
    )
    result = []
    seen_offsets = RangeMap()
    for ann in sorted_annotations:
        # ``end`` is exclusive, so the last offset of the span is ``end - 1``.
        # Testing both extremities is enough: spans arrive longest first, so a
        # later span can never strictly contain an already kept one.
        at_start = seen_offsets.get(ann.start)
        at_end = seen_offsets.get(ann.end - 1)
        if at_start is None and at_end is None:
            result.append(ann)
            seen_offsets[ann.start:ann.end] = ann
            continue

        # The span is dropped; ``target`` is the kept annotation covering it.
        target = at_start if at_start is not None else at_end
        shares_boundary = target.start == ann.start or target.end == ann.end
        # The length ratio is only evaluated when a boundary is shared.
        if shares_boundary and (ann.end - ann.start) / (target.end - target.start) > 0.8:
            if ann.terms:
                # Order-preserving de-duplication: the kept annotation's first
                # term stays first (keyfunc reads ``terms[0]``) and the result
                # does not depend on set iteration order.
                merged_terms = []
                for term in [*(target.terms or []), *ann.terms]:
                    if term not in merged_terms:
                        merged_terms.append(term)
                target.terms = merged_terms
            if ann.properties:
                # Values of the dropped annotation win on key conflicts.
                merged_props = dict(target.properties or {})
                merged_props.update(ann.properties)
                target.properties = merged_props
    return sorted(result, key=lambda ann: ann.start)