import re

SIMPLE_QUERY_OPERATORS = "|+-\"()*~"
SIMPLE_QUERY_OPERATORS_RE = r"[\|\+\-\"\(\)\*\~]"

# A "-" is only an operator at the very start of the query or right after
# whitespace; a dash inside a word ("mont-saint-michel") is plain text.
_NEGATION_OPERATOR_RE = re.compile(r"(?:^|\s)-")


def is_simple_query_string(query):
    """Tell whether ``query`` uses any operator of ``simple_query_string``.

    In ES the simple_query_string query supports the following operators:

    * ``+`` signifies AND operation
    * ``|`` signifies OR operation
    * ``-`` negates a single token
    * ``"`` wraps a number of tokens to signify a phrase for searching
    * ``*`` at the end of a term signifies a prefix query
    * ``(`` and ``)`` signify precedence
    * ``~N`` after a word signifies edit distance (fuzziness)
    * ``~N`` after a phrase signifies slop amount

    https://www.elastic.co/guide/en/elasticsearch/reference/7.9/query-dsl-simple-query-string-query.html

    :param query: text of the query to be composed (can contain quotes)
    :return: ``True`` if at least one operator is detected, else ``False``;
        an empty or ``None`` query contains no operator, hence ``False``
    """
    if not query:
        return False

    # Every operator except "-" marks a simple query wherever it appears.
    if any(
        operator in query
        for operator in SIMPLE_QUERY_OPERATORS.replace("-", "")
    ):
        return True

    # "-" counts only at the beginning of the query or after whitespace
    # (space, tab, newline, ...). A dash following another operator is
    # already covered by the check above.
    return _NEGATION_OPERATOR_RE.search(query) is not None
= [] - for element in elements: - if element.startswith("-"): - must_not.append(Q("multi_match", query=element[1:], fields=fields)) + else: + if not phrase and not common: # invalid combination + phrase = common = True + if phrase: + phrase_query = Q( + "multi_match", + query=query, + type="phrase", + slop=50, + fields=fields + ) + should.append(phrase_query) + # highfrequency/lowfrequency query + # https://www.elastic.co/blog/stop-stopping-stop-words-a-look-at-common-terms-query + if common: + common_query = dsl_query.Common( + alltext={ + "query": query, + "cutoff_frequency": 0.001, + "low_freq_operator": "and", + "minimum_should_match": {"high_freq": "70%"}, + } + ) + should.append(common_query) @@ -98,3 +123,5 @@ - else: + if fuzzy: + elements = re.sub(SIMPLE_QUERY_OPERATORS_RE, " ", query).split() + for element in elements: if fuzzy: @@ -100,6 +127,6 @@ if fuzzy: - should.append(dsl_query.Fuzzy(alltext=element)) + should.append(dsl_query.Fuzzy(alltext=element.replace('"', ''))) bool_query = dsl_query.Bool( must=must, must_not=must_not, should=should, @@ -102,8 +129,8 @@ bool_query = dsl_query.Bool( must=must, must_not=must_not, should=should, - minimum_should_match=minimum_should_match, + minimum_should_match=1, ) search.query = bool_query return search