# -*- coding: utf-8 -*-
""" Process/Core functions for Named Entities Recognition.
"""
from nazca.utils.tokenizer import RichStringTokenizer


###############################################################################
# NER PROCESS #################################################################
###############################################################################
class NerProcess(object):
    """High-level process for Named Entities Recognition.

    Chains NER sources, token preprocessors and result filters together
    to extract named entities from a raw text or a token stream.
    """

    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
        """Initialise the process.

        :param ner_sources: iterable of NER sources; each source exposes a
            ``name`` attribute and a ``recognize_token(token)`` method that
            yields URIs matching the token.
        :param preprocessors: optional list of callables applied to each
            token before recognition; a preprocessor may return a falsy
            value to discard the token.
        :param filters: optional list of callables applied to the full list
            of recognized entities (see :meth:`postprocess`).
        :param unique: if True, keep at most one entity per token (the first
            URI found in the first matching source).
        """
        self.ner_sources = list(ner_sources)
        self.preprocessors = preprocessors or []
        self.filters = filters or []
        self.unique = unique

    def add_ner_source(self, process):
        """Append a NER source to the recognition pipeline."""
        self.ner_sources.append(process)

    def add_preprocessors(self, preprocessor):
        """Append a token preprocessor to the pipeline."""
        self.preprocessors.append(preprocessor)

    def add_filters(self, filter):
        # NOTE: the parameter name shadows the `filter` builtin, but it is
        # kept for backward compatibility with keyword callers.
        """Append a result filter to the pipeline."""
        self.filters.append(filter)

    def process_text(self, text):
        """High level function for analyzing a text.

        Tokenizes *text* with :class:`RichStringTokenizer` and recognizes
        named entities in the resulting token stream.

        :param text: raw string to analyze.
        :return: list of ``(uri, source_name, token)`` tuples.
        """
        tokenizer = RichStringTokenizer(text)
        return self.recognize_tokens(tokenizer)

    def recognize_tokens(self, tokens):
        """Recognize Named Entities from a tokenizer or an iterator
        yielding tokens.

        Tokens starting before the end of a previous match are skipped.
        Each remaining token is run through the preprocessors, then each
        NER source is queried in turn.

        :param tokens: iterable of tokens carrying ``start``/``end``
            offsets.
        :return: list of ``(uri, source_name, token)`` tuples, after the
            registered filters have been applied.
        """
        last_stop = 0
        named_entities = []
        for token in tokens:
            if token.start < last_stop:
                continue  # this token overlaps with a previous match
            # Applies preprocessors
            # XXX Preprocessors may be sources dependant
            for preprocessor in self.preprocessors:
                token = preprocessor(token)
                if not token:
                    break
            if not token:
                continue  # token discarded by a preprocessor
            recognized = False
            for process in self.ner_sources:
                for uri in process.recognize_token(token):
                    named_entities.append((uri, process.name, token))
                    recognized = True
                    last_stop = token.end
                    if self.unique:
                        break
                if recognized and self.unique:
                    break
        # XXX Postprocess/filters may be sources dependant
        return self.postprocess(named_entities)

    def postprocess(self, named_entities):
        """Postprocess the results by applying each registered filter in
        order; every filter receives the output of the previous one.
        """
        # loop variable renamed so it does not shadow the `filter` builtin
        for entity_filter in self.filters:
            named_entities = entity_filter(named_entities)
        return named_entities