Newer
Older
# -*- coding: utf-8 -*-
# copyright 2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from elasticsearch_dsl import Q, query as dsl_query
from logilab.mtconverter import xml_escape
def compose_search(search, query=None, fields=(),
                   fuzzy=False, phrase=True, common=True):
    '''
    Compose an elasticsearch-dsl query from a free-text query supporting:

    * simple term
    * simple terms (OR)
    * negation (add - in front of a term)
    * explicit OR (a standalone ``or`` word, in any letter case)
    * quoted terms (AND)

    search:
        search object (used to set doc_type and index_name outside of
        compose_search)
    query:
        text of the query to be composed (can contain quotes); when it is
        None, search is returned untouched
    fields:
        restrict and boost search on certain fields eg. ('title^2', '_all')
    fuzzy:
        add a fuzzy search element to part of the query generated
        https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-fuzzy-query.html
    phrase:
        add a phrase query (slop=50) on the whole text, used as a proximity
        booster
    common:
        add a common terms query on ``_all`` for the whole text

    phrase and common cannot both be disabled: in that case both of them
    are switched back on.

    When the text contains exactly two occurrences of the same quote
    character, the value of ``search.query('bool', ...)`` is returned (a
    phrase match on the quoted part only). Otherwise the composed bool query
    is assigned to ``search.query`` and ``search`` itself is returned.
    '''
    if query is None:
        # None is the declared default: there is no text to search for and
        # every step below needs a string
        return search
    # FIXME TODO - restructure entire code base, have a proper lexer
    for char in ('"', "'", xml_escape('"'), xml_escape("'")):
        # TODO - implement phrase + term
        parts = query.split(char)
        if len(parts) == 3:
            # TODO add this to most important queries, instead of single query ?
            return search.query('bool',
                                must=Q('multi_match',
                                       query=parts[1],
                                       type="phrase",
                                       fields=fields))
    must = []
    must_not = []
    should = []
    cutoff_frequency = .001
    # https://www.elastic.co/guide/en/elasticsearch/reference/2.4/query-dsl-minimum-should-match.html
    minimum_should_match = "1"
    if not phrase and not common:  # invalid combination
        phrase = common = True
    # proximity booster - phrase with slop=50
    # TODO find a better way to do this
    if phrase:
        should.append(Q('multi_match',
                        query=query,
                        type="phrase",
                        slop=50,
                        fields=fields))
    # highfrequency/lowfrequency query
    # https://www.elastic.co/blog/stop-stopping-stop-words-a-look-at-common-terms-query
    if common:
        should.append(dsl_query.Common(_all={
            "query": query,
            "cutoff_frequency": cutoff_frequency,
            "low_freq_operator": "and",
            "minimum_should_match": {
                "high_freq": "70%"
            }}))
    elements = query.split()
    if len(elements) >= 3 and any(e.lower() == 'or' for e in elements):
        # explicit OR: one clause per alternative; split on whole words so
        # that "OR" is honoured and words merely containing "or" stay intact
        should.extend(Q('multi_match', query=alternative, fields=fields)
                      for alternative in _split_on_or(elements))
        # the alternatives replace the per-term handling below
        elements = []
    for element in elements:
        if element.startswith('-'):
            must_not.append(Q('multi_match',
                              query=element[1:],
                              fields=fields))
        elif fuzzy:
            should.append(dsl_query.Fuzzy(_all=element))
    search.query = dsl_query.Bool(must=must,
                                  must_not=must_not,
                                  should=should,
                                  minimum_should_match=minimum_should_match)
    return search


def _split_on_or(words):
    '''
    Group words around each standalone ``or`` word (any letter case) and
    return every non-empty group joined with single spaces.
    '''
    groups = [[]]
    for word in words:
        if word.lower() == 'or':
            groups.append([])
        else:
            groups[-1].append(word)
    return [' '.join(group) for group in groups if group]