Commit eeda2462 authored by Simon Chabot

style(black): paint it, black

I see a red door and I want it painted black.
parent 466930b1d96a
Pipeline #14026 passed with stages in 2 minutes and 17 seconds
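
The diff below is a pure formatting pass: each hunk replaces a hand-wrapped docstring or call layout with the output of the black code formatter, with no behavioural change. The commit does not record how black was invoked or which version was used, so the following is only a sketch of a typical run, not the author's actual command:

    # Hypothetical invocation -- assumed, not recorded in the commit.
    #   pip install black
    #   black nazca/            # rewrite the package in place
    #   black --check nazca/    # CI-style check: non-zero exit if reformatting is needed
    # The same normalisation is available from Python, here on one of the docstrings below:
    import black

    source = (
        "class NerProcess(object):\n"
        '    """ High-level process for Named Entities Recognition\n'
        '    """\n'
    )
    # Recent black releases strip the padding inside the quotes and, when the text fits
    # on one line, pull the closing quotes up onto it -- exactly what the hunks below show.
    print(black.format_str(source, mode=black.Mode()))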
@@ -8,11 +8,10 @@ from nazca.utils.tokenizer import RichStringTokenizer
 # NER PROCESS #################################################################
 ###############################################################################
 class NerProcess(object):
-    """ High-level process for Named Entities Recognition
-    """
+    """High-level process for Named Entities Recognition"""
     def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
-        """ Initialise the class.
+        """Initialise the class.
         :tokenizer: an instance of tokenizer
         """
@@ -22,28 +21,24 @@ class NerProcess(object):
         self.unique = unique
     def add_ner_source(self, process):
-        """ Add a ner process
-        """
+        """Add a ner process"""
         self.ner_sources.append(process)
     def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
+        """Add a preprocessor"""
         self.preprocessors.append(preprocessor)
     def add_filters(self, filter):
-        """ Add a filter
-        """
+        """Add a filter"""
         self.filters.append(filter)
     def process_text(self, text):
-        """ High level function for analyzing a text
-        """
+        """High level function for analyzing a text"""
         tokenizer = RichStringTokenizer(text)
         return self.recognize_tokens(tokenizer)
     def recognize_tokens(self, tokens):
-        """ Recognize Named Entities from a tokenizer or
+        """Recognize Named Entities from a tokenizer or
         an iterator yielding tokens.
         """
         last_stop = 0
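
All of the docstring hunks in this commit follow the same two rules: the padding just inside the triple quotes is stripped, and when the remaining text is a single line the closing quotes are pulled up onto it, while genuinely multi-line docstrings (such as `Initialise the class.` above, whose `:tokenizer:` line and closing quotes are left untouched) keep the closing quotes on their own line. A small sketch of that behaviour, assuming a black release contemporary with this commit:

    import black

    single = 'def add(self, process):\n    """ Add a ner process\n    """\n'
    multi = (
        "def recognize(self, tokens):\n"
        '    """ Recognize Named Entities from a tokenizer or\n'
        "    an iterator yielding tokens.\n"
        '    """\n'
    )
    print(black.format_str(single, mode=black.Mode()))  # quotes collapse onto one line
    print(black.format_str(multi, mode=black.Mode()))   # closing quotes stay on their own line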
@@ -8,15 +8,14 @@ from nazca.utils.dataio import sparqlquery
 # NER FILTERS #################################################################
 ###############################################################################
 class AbstractNerFilter(object):
-    """ A filter used for cleaning named entities results
-    """
+    """A filter used for cleaning named entities results"""
     def __call__(self, named_entities):
         raise NotImplementedError
 class NerOccurenceFilter(object):
-    """ A filter based on the number of occurence of
+    """A filter based on the number of occurence of
     named entities in the results.
     """
@@ -38,7 +37,7 @@ class NerOccurenceFilter(object):
 class NerRDFTypeFilter(object):
-    """ A filter based on the RDF type on entity
+    """A filter based on the RDF type on entity
     E.g.
     filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
@@ -72,7 +71,7 @@ class NerRDFTypeFilter(object):
 class NerDisambiguationWordParts(object):
-    """ Disambiguate named entities based on the words parts.
+    """Disambiguate named entities based on the words parts.
     E.g.:
     'toto tutu': 'http://example.com/toto_tutu',
     'toto': 'http://example.com/toto'
@@ -99,8 +98,7 @@ class NerDisambiguationWordParts(object):
 class NerReplacementRulesFilter(object):
-    """ Allow to define replacement rules for Named Entities
-    """
+    """Allow to define replacement rules for Named Entities"""
     def __init__(self, rules):
         self.rules = rules
@@ -11,16 +11,14 @@ STOPWORDS = {"fr": FRENCH_STOPWORDS, "en": ENGLISH_STOPWORDS}
 # NER PREPROCESSORS ###########################################################
 ###############################################################################
 class AbstractNerPreprocessor(object):
-    """ Preprocessor
-    """
+    """Preprocessor"""
     def __call__(self, token):
         raise NotImplementedError
 class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
-    """ Remove token based on the size of the word
-    """
+    """Remove token based on the size of the word"""
     def __init__(self, min_size=None, max_size=None):
         self.min_size = min_size
@@ -35,16 +33,14 @@ class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
 class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
-    """ Remove token with word in lower case
-    """
+    """Remove token with word in lower case"""
     def __call__(self, token):
         return None if token.word.islower() else token
 class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
-    """ Lower the first word of each sentence if it is a stopword.
-    """
+    """Lower the first word of each sentence if it is a stopword."""
     def __init__(self, lang="en"):
         self.lang = lang
@@ -59,8 +55,7 @@ class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
 class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
-    """ Remove stopwords
-    """
+    """Remove stopwords"""
     def __init__(self, split_words=False, lang="en"):
         self.split_words = split_words
@@ -76,8 +71,7 @@ class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
 class NerHashTagPreprocessor(AbstractNerPreprocessor):
-    """ Cleanup hashtag
-    """
+    """Cleanup hashtag"""
     def __call__(self, token):
         if token.word.startswith("@"):
@@ -8,12 +8,10 @@ from nazca.utils.dataio import sparqlquery, rqlquery
 # NER SOURCE ##################################################################
 ###############################################################################
 class AbstractNerSource(object):
-    """ High-level source for Named Entities Recognition
-    """
+    """High-level source for Named Entities Recognition"""
     def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
+        """Initialise the class."""
         self.endpoint = endpoint
         self.query = query
         self.name = name
@@ -22,13 +20,11 @@ class AbstractNerSource(object):
         self._recognized_cache = {}
     def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
+        """Add a preprocessor"""
         self.preprocessors.append(preprocessor)
     def recognize_token(self, token):
-        """ Recognize a token
-        """
+        """Recognize a token"""
         # Applies source specific preprocessors
         for preprocessor in self.preprocessors:
             token = preprocessor(token)
@@ -42,14 +38,12 @@ class AbstractNerSource(object):
         return uris
     def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
+        """Query a word for a Named Entities Recognition process"""
         raise NotImplementedError
 class NerSourceLexicon(AbstractNerSource):
-    """ Source based on a (pre-computed) dictionnary of words (token, uri)
-    """
+    """Source based on a (pre-computed) dictionnary of words (token, uri)"""
     def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
         self.lexicon = lexicon
@@ -60,17 +54,22 @@ class NerSourceLexicon(AbstractNerSource):
     def query_word(self, word):
         uri = self.lexicon.get(word)
-        return [uri,] if uri else []
+        return (
+            [
+                uri,
+            ]
+            if uri
+            else []
+        )
 class NerSourceLocalRql(AbstractNerSource):
-    """ High-level source for Named Entities Recognition
+    """High-level source for Named Entities Recognition
     Local RQL version
     """
     def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
+        """Initialise the class."""
         self.query = query
         self.session = session
         self.name = name
@@ -79,19 +78,17 @@ class NerSourceLocalRql(AbstractNerSource):
         self._recognized_cache = {}
     def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
+        """Query a word for a Named Entities Recognition process"""
         return [r[0] for r in self.session.execute(self.query, dict(word=word))]
 class NerSourceRql(AbstractNerSource):
-    """ High-level source for Named Entities Recognition
+    """High-level source for Named Entities Recognition
     Url version (distant source)
     """
     def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
+        """Query a word for a Named Entities Recognition process"""
         if self.endpoint.startswith("http://"):
             # url
             return [r[0] for r in rqlquery(self.endpoint, self.query % {"word": word})]
@@ -100,25 +97,24 @@
 class NerSourceSparql(AbstractNerSource):
-    """ High-level source for Named Entities Recognition
-    SPARQL version
-    >>> from ner.core import NerSourceSparql
-    >>> ner_source = NerSourceSparql('''SELECT ?uri
-    WHERE{
-    ?uri rdfs:label "%(word)s"@en}''',
-    'http://dbpedia.org/sparql')
-    >>> print ner_source.recognize_token('Victor Hugo')
-    ... ['http://dbpedia.org/resource/Category:Victor_Hugo',
-    'http://dbpedia.org/resource/Victor_Hugo',
-    'http://dbpedia.org/class/yago/VictorHugo',
-    'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-    'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-    'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+    """High-level source for Named Entities Recognition
+    SPARQL version
+    >>> from ner.core import NerSourceSparql
+    >>> ner_source = NerSourceSparql('''SELECT ?uri
+    WHERE{
+    ?uri rdfs:label "%(word)s"@en}''',
+    'http://dbpedia.org/sparql')
+    >>> print ner_source.recognize_token('Victor Hugo')
+    ... ['http://dbpedia.org/resource/Category:Victor_Hugo',
+    'http://dbpedia.org/resource/Victor_Hugo',
+    'http://dbpedia.org/class/yago/VictorHugo',
+    'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+    'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+    'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+    """
     def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
+        """Query a word for a Named Entities Recognition process"""
         return [r[0] for r in sparqlquery(self.endpoint, self.query % {"word": word})]
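
The hunk above that expands `return [uri,] if uri else []` into a seven-line expression, like the `SoundexBlocking.__init__` signature further down, is black's "magic trailing comma" at work: a collection or argument list that already ends with a comma is kept exploded, one element per line, while the same expression without the trailing comma is collapsed onto a single line when it fits. A minimal sketch, assuming a black release that implements this behaviour (20.8b0 or later):

    import black

    with_comma = "x = [uri,]\n"    # magic trailing comma: kept exploded, one item per line
    without_comma = "x = [uri]\n"  # no trailing comma: left on a single line
    print(black.format_str(with_comma, mode=black.Mode()))
    print(black.format_str(without_comma, mode=black.Mode()))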
@@ -28,8 +28,7 @@ from nazca.utils.dataio import parsefile
 # UTILITY FUNCTIONS ###########################################################
 ###############################################################################
 def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
-    """ Return the aligned pairs
-    """
+    """Return the aligned pairs"""
     if unique:
         for refid in global_matched:
             bestid, _ = sorted(global_matched[refid], key=lambda x: x[1])[0]
@@ -68,13 +67,13 @@ class BaseAligner(object):
         self.logger = logging.getLogger("nazca.aligner")
     def register_ref_normalizer(self, normalizer):
-        """ Register normalizers to be applied
-        before alignment """
+        """Register normalizers to be applied
+        before alignment"""
         self.ref_normalizer = normalizer
     def register_target_normalizer(self, normalizer):
-        """ Register normalizers to be applied
-        before alignment """
+        """Register normalizers to be applied
+        before alignment"""
         self.target_normalizer = normalizer
     def register_blocking(self, blocking):
@@ -86,7 +85,7 @@ class BaseAligner(object):
         return dataset
     def compute_distance_matrix(self, refset, targetset, ref_indexes, target_indexes):
-        """ Compute and return the global alignment matrix.
+        """Compute and return the global alignment matrix.
         For each `processing` a `Distancematrix` is built, then all the
         matrices are summed with their own weighting and the result is the global
         alignment matrix, which is returned.
@@ -98,7 +97,7 @@ class BaseAligner(object):
         return distmatrix
     def threshold_matched(self, distmatrix):
-        """ Return the matched elements within a dictionnary,
+        """Return the matched elements within a dictionnary,
         each key being the indice from X, and the corresponding
         values being a list of couple (indice from Y, distance)
         """
@@ -128,7 +127,7 @@ class BaseAligner(object):
         return mat, new_matched
     def align(self, refset, targetset, get_matrix=True):
-        """ Perform the alignment on the referenceset
+        """Perform the alignment on the referenceset
         and the targetset
         """
         start_time = time.time()
@@ -161,8 +160,7 @@ class BaseAligner(object):
         return global_mat, global_matched
     def get_aligned_pairs(self, refset, targetset, unique=True, use_distance=True):
-        """ Get the pairs of aligned elements
-        """
+        """Get the pairs of aligned elements"""
         if not refset or not targetset:
             return
         global_mat, global_matched = self.align(refset, targetset, get_matrix=use_distance)
@@ -185,7 +183,7 @@ class BaseAligner(object):
         target_separator="\t",
         get_matrix=True,
     ):
-        """ Align data from files
+        """Align data from files
         Parameters
         ----------
@@ -224,8 +222,7 @@ class BaseAligner(object):
         target_separator="\t",
         unique=True,
     ):
-        """ Get the pairs of aligned elements
-        """
+        """Get the pairs of aligned elements"""
         refset = parsefile(
             reffile, indexes=ref_indexes, encoding=ref_encoding, delimiter=ref_separator
         )
@@ -238,8 +235,7 @@ class BaseAligner(object):
         yield from iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique)
     def log_infos(self):
-        """ Display some info on the aligner process
-        """
+        """Display some info on the aligner process"""
         self.logger.info("Computation time : %s" % self.time)
         self.logger.info("Size reference set : %s" % self.refset_size)
         self.logger.info("Size target set : %s" % self.targetset_size)
@@ -276,7 +272,7 @@ class BaseAligner(object):
 # PIPELINE ALIGNER OBJECT ####################################################
 ###############################################################################
 class PipelineAligner(object):
-    """ This pipeline will perform iterative alignments, removing each time
+    """This pipeline will perform iterative alignments, removing each time
     the aligned results from the previous aligner.
     """
@@ -293,8 +289,7 @@ class PipelineAligner(object):
         self.logger = logging.getLogger("nazca.aligner")
     def get_aligned_pairs(self, refset, targetset, unique=True):
-        """ Get the pairs of aligned elements
-        """
+        """Get the pairs of aligned elements"""
         if not refset or not targetset:
             return
         start_time = time.time()
@@ -328,8 +323,7 @@ class PipelineAligner(object):
         self.log_infos()
     def log_infos(self):
-        """ Display some info on the aligner process
-        """
+        """Display some info on the aligner process"""
         self.logger.info("Computation time : %s" % self.time)
         self.logger.info("Size reference set : %s" % self.refset_size)
         self.logger.info("Size target set : %s" % self.targetset_size)
@@ -39,12 +39,12 @@ from nazca.utils.distances import soundexcode
 # GENERAL BLOCKING ############################################################
 ###############################################################################
 class BaseBlocking(object):
-    """ An abstract general blocking object that exposes
+    """An abstract general blocking object that exposes
     the API that should be common to all blockings object
     """
     def __init__(self, ref_attr_index, target_attr_index):
-        """ Build the blocking object
+        """Build the blocking object
         Parameters
         ----------
@@ -67,17 +67,15 @@ class BaseBlocking(object):
         raise NotImplementedError
     def _iter_blocks(self):
-        """ Internal iteration function over blocks
-        """
+        """Internal iteration function over blocks"""
         raise NotImplementedError
     def _cleanup(self):
-        """ Internal cleanup blocking for further use (e.g. in pipeline)
-        """
+        """Internal cleanup blocking for further use (e.g. in pipeline)"""
         raise NotImplementedError
     def fit(self, refset, targetset):
-        """ Fit the blocking technique on the reference and target datasets
+        """Fit the blocking technique on the reference and target datasets
         Parameters
         ----------
@@ -92,7 +90,7 @@ class BaseBlocking(object):
         self.is_fitted = True
     def iter_blocks(self):
-        """ Iterator over the different possible blocks.
+        """Iterator over the different possible blocks.
         Returns
         -------
@@ -105,7 +103,7 @@ class BaseBlocking(object):
         return self._iter_blocks()
     def iter_indice_blocks(self):
-        """ Iterator over the different possible blocks.
+        """Iterator over the different possible blocks.
         Returns
         -------
@@ -119,7 +117,7 @@ class BaseBlocking(object):
             yield [r[0] for r in block1], [r[0] for r in block2]
     def iter_id_blocks(self):
-        """ Iterator over the different possible blocks.
+        """Iterator over the different possible blocks.
         Returns
         -------
@@ -133,7 +131,7 @@ class BaseBlocking(object):
             yield [r[1] for r in block1], [r[1] for r in block2]
     def iter_pairs(self):
-        """ Iterator over the different possible pairs.
+        """Iterator over the different possible pairs.
         Returns
         -------
@@ -149,7 +147,7 @@ class BaseBlocking(object):
             yield val1, val2
     def iter_indice_pairs(self):
-        """ Iterator over the different possible pairs.
+        """Iterator over the different possible pairs.
         Returns
         -------
@@ -164,7 +162,7 @@ class BaseBlocking(object):
             yield val1, val2
     def iter_id_pairs(self):
-        """ Iterator over the different possible pairs.
+        """Iterator over the different possible pairs.
         Returns
         -------
@@ -179,8 +177,7 @@ class BaseBlocking(object):
             yield val1, val2
     def cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
+        """Cleanup blocking for further use (e.g. in pipeline)"""
         self.is_fitted = True
         self._cleanup()
@@ -189,7 +186,7 @@ class BaseBlocking(object):
 # KEY BLOCKING ################################################################
 ###############################################################################
 class KeyBlocking(BaseBlocking):
-    """ This blocking technique is based on a a blocking criteria
+    """This blocking technique is based on a a blocking criteria
     (or blocking key), that will be used to divide the datasets.
     The main idea here is:
@@ -210,8 +207,7 @@ class KeyBlocking(BaseBlocking):
         self.target_index = {}
     def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
+        """Fit a dataset in an index using the callback"""
         for ind, rec in enumerate(refset):
             key = self.callback(rec[self.ref_attr_index])
             if not key and self.ignore_none:
@@ -224,7 +220,7 @@ class KeyBlocking(BaseBlocking):
             self.target_index.setdefault(key, []).append((ind, rec[0]))
     def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
+        """Iterator over the different possible blocks.
         Returns
         -------
@@ -239,15 +235,17 @@
             yield (block1, block2)
     def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
+        """Cleanup blocking for further use (e.g. in pipeline)"""
         self.reference_index = {}
         self.target_index = {}
 class SoundexBlocking(KeyBlocking):
     def __init__(
-        self, ref_attr_index, target_attr_index, language="french",
+        self,
+        ref_attr_index,
+        target_attr_index,
+        language="french",
     ):
         super(SoundexBlocking, self).__init__(
             ref_attr_index, target_attr_index, partial(soundexcode, language=language)
@@ -258,8 +256,7 @@
 # BIGRAM BLOCKING #############################################################
 ###############################################################################
 class NGramBlocking(BaseBlocking):
-    """ This blocking technique is based on a a n-gram key.
-    """
+    """This blocking technique is based on a a n-gram key."""
     def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
         super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
@@ -269,8 +266,7 @@ class NGramBlocking(BaseBlocking):
         self.target_index = {}
     def _fit_dataset(self, dataset, cur_index, attr_index):
-        """ Fit a dataset
-        """
+        """Fit a dataset"""
         for ind, r in enumerate(dataset):
             cur_dict = cur_index
             text = r[attr_index]
@@ -281,14 +277,12 @@ class NGramBlocking(BaseBlocking):
                 cur_dict.setdefault(ngram, []).append((ind, r[0]))
     def _fit(self, refset, targetset):
-        """ Fit the two sets (reference set and target set)
-        """
+        """Fit the two sets (reference set and target set)"""
         self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
         self._fit_dataset(targetset, self.target_index, self.target_attr_index)
     def _iter_dict(self, ref_cur_dict, target_cur_dict):
-        """ Iterative function used to create blocks from dicts
-        """
+        """Iterative function used to create blocks from dicts"""
         for key, sub_dict in ref_cur_dict.items():
             if key in target_cur_dict:
                 if isinstance(sub_dict, dict):
@@ -300,7 +294,7 @@ class NGramBlocking(BaseBlocking):
                     yield sub_dict, target_cur_dict[key]
     def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
+        """Iterator over the different possible blocks.
         Returns
         -------
@@ -314,8 +308,7 @@ class NGramBlocking(BaseBlocking):
             yield block1, block2
     def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
+        """Cleanup blocking for further use (e.g. in pipeline)"""
         self.reference_index = {}
         self.target_index = {}
@@ -324,7 +317,7 @@ class NGramBlocking(BaseBlocking):
 # SORTKEY BLOCKING ############################################################
 ###############################################################################
 class SortedNeighborhoodBlocking(BaseBlocking):
-    """ This blocking technique is based on a a sorting blocking criteria
+    """This blocking technique is based on a a sorting blocking criteria
     (or blocking key), that will be used to divide the datasets.
     """
@@ -335,8 +328,7 @@ class SortedNeighborhoodBlocking(BaseBlocking):
         self.sorted_dataset = None
     def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
+        """Fit a dataset in an index using the callback"""
         self.sorted_dataset = [
             ((ind, r[0]), r[self.ref_attr_index], 0) for ind, r in enumerate(refset)
         ]
@@ -346,8 +338,7 @@ class SortedNeighborhoodBlocking(BaseBlocking):
         self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
     def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-        """
+        """Iterator over the different possible blocks."""
         for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
             # Only keep reference set record
             if dset == 1:
@@ -363,8 +354,7 @@ class SortedNeighborhoodBlocking(BaseBlocking):
             yield (block1, block2)
     def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
+        """Cleanup blocking for further use (e.g. in pipeline)"""
         self.sorted_dataset = None
@@ -372,7 +362,7 @@ class SortedNeighborhoodBlocking(BaseBlocking):
 # MERGE BLOCKING ##############################################################
 ###############################################################################