Skip to content
Snippets Groups Projects

chore: replace compose_search function by simple_search_query

Merged Elodie Thiéblin requested to merge topic/default/simple-query-string into branch/default
1 file
+ 77
50
Compare changes
  • Side-by-side
  • Inline
@@ -15,9 +15,49 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
from elasticsearch_dsl import Q, query as dsl_query
from logilab.mtconverter import xml_escape
SIMPLE_QUERY_OPERATORS = "|+-\"()*~"
SIMPLE_QUERY_OPERATORS_RE = r"[\|\+\-\"\(\)\*\~]"

# Operators whose mere presence marks a query as simple_query_string syntax.
# "-" is left out because it is also an ordinary hyphen inside words.
_UNAMBIGUOUS_SIMPLE_QUERY_OPERATORS = frozenset(SIMPLE_QUERY_OPERATORS) - {"-"}
# A "-" counts as a negation only at the very start of the query or right
# after a whitespace character (space, tab, newline, ...).
_NEGATION_OPERATOR_RE = re.compile(r"(?:^|\s)-")


def is_simple_query_string(query):
    """Tell whether ``query`` uses any ``simple_query_string`` operator.

    :param query: text of the query to be composed (can contain quotes);
        ``None`` or an empty string is never a simple query string.
    :return: ``True`` if at least one operator is detected, else ``False``.

    In ES the simple_query_string query supports the following operators:

    - ``+`` signifies AND operation
    - ``|`` signifies OR operation
    - ``-`` negates a single token
    - ``"`` wraps a number of tokens to signify a phrase for searching
    - ``*`` at the end of a term signifies a prefix query
    - ``(`` and ``)`` signify precedence
    - ``~N`` after a word signifies edit distance (fuzziness)
    - ``~N`` after a phrase signifies slop amount

    https://www.elastic.co/guide/en/elasticsearch/reference/7.9/query-dsl-simple-query-string-query.html
    """
    if not query:
        return False
    # For every operator except "-", a single occurrence is enough.
    if any(char in _UNAMBIGUOUS_SIMPLE_QUERY_OPERATORS for char in query):
        return True
    # "-" is accepted only at the beginning of the query or after whitespace.
    # A "-" following another operator is already covered by the check above.
    # Queries like "mont-saint-michel" must not be regarded as
    # simple_query_string.
    return _NEGATION_OPERATOR_RE.search(query) is not None
def compose_search(
@@ -43,67 +83,54 @@ def compose_search(
add a fuzzy search element to part of the query generated
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-fuzzy-query.html
"""
# FIXME TODO - restructure entire code base, have a proper lexer
for char in ('"', "'", xml_escape('"'), xml_escape("'")):
# TODO - implement phrase + term
if len(query.split(char)) == 3:
# TODO add this to most important queries, instead of single query ?
return search.query(
"bool",
must=Q(
"multi_match",
query=query.split(char)[1],
type="phrase",
fields=fields,
),
)
must = []
must_not = []
should = []
cutoff_frequency = 0.001
# https://www.elastic.co/guide/en/elasticsearch/reference/2.4/query-dsl-minimum-should-match.html
minimum_should_match = "1"
# proximity booster - phrase with slop=50
# TODO find a better way to do this
if not phrase and not common: # invalid combination
phrase = common = True
if phrase:
phrase_query = Q(
"multi_match", query=query, type="phrase", slop=50, fields=fields
)
should.append(phrase_query)
# highfrequency/lowfrequency query
# https://www.elastic.co/blog/stop-stopping-stop-words-a-look-at-common-terms-query
if common:
common_query = dsl_query.Common(
alltext={
"query": query,
"cutoff_frequency": cutoff_frequency,
"low_freq_operator": "and",
"minimum_should_match": {"high_freq": "70%"},
}
if is_simple_query_string(query):
query_string = Q(
"simple_query_string",
query=query,
fields=fields,
default_operator="and"
)
should.append(common_query)
elements = query.split()
elements_lowercase = [e.lower() for e in elements]
if "or" in elements_lowercase and len(elements) >= 3:
for element in query.split("or"):
should.append(Q("multi_match", query=element.strip(), fields=fields))
elements = []
for element in elements:
if element.startswith("-"):
must_not.append(Q("multi_match", query=element[1:], fields=fields))
else:
should.append(query_string)
else:
if not phrase and not common: # invalid combination
phrase = common = True
if phrase:
phrase_query = Q(
"multi_match",
query=query,
type="phrase",
slop=50,
fields=fields
)
should.append(phrase_query)
# highfrequency/lowfrequency query
# https://www.elastic.co/blog/stop-stopping-stop-words-a-look-at-common-terms-query
if common:
common_query = dsl_query.Common(
alltext={
"query": query,
"cutoff_frequency": 0.001,
"low_freq_operator": "and",
"minimum_should_match": {"high_freq": "70%"},
}
)
should.append(common_query)
if fuzzy:
elements = re.sub(SIMPLE_QUERY_OPERATORS_RE, " ", query).split()
for element in elements:
if fuzzy:
should.append(dsl_query.Fuzzy(alltext=element))
should.append(dsl_query.Fuzzy(alltext=element.replace('"', '')))
bool_query = dsl_query.Bool(
must=must,
must_not=must_not,
should=should,
minimum_should_match=minimum_should_match,
minimum_should_match=1,
)
search.query = bool_query
return search
Loading