Commit 2194e65f authored by Sylvain Thénault's avatar Sylvain Thénault
Browse files

[fti] support for fti ranking: has_text query results sorted by relevance, and...

[fti] support for fti ranking: has_text query results sorted by relevance, and a way to control weight per entity / per entity attribute
parent c397819f2482
......@@ -52,7 +52,7 @@ __depends__ = {
'Twisted': '',
# XXX graphviz
# server dependencies
'logilab-database': '',
'logilab-database': '1.1.0',
'pysqlite': '>= 2.5.5', # XXX install pysqlite2
}
......
......@@ -1107,6 +1107,7 @@ _EXT_REGISTERED = False
def register_stored_procedures():
from logilab.database import FunctionDescr
from rql.utils import register_function, iter_funcnode_variables
from rql.nodes import SortTerm, Constant, VariableRef
global _EXT_REGISTERED
if _EXT_REGISTERED:
......@@ -1152,6 +1153,34 @@ def register_stored_procedures():
register_function(TEXT_LIMIT_SIZE)
class FTIRANK(FunctionDescr):
    """Ranking of a variable which must be the subject of some has_text
    relation in the query's restriction. Typically used to sort full-text
    search results by ranking.
    """
    supported_backends = ('postgres',)
    rtype = 'Float'

    def st_check_backend(self, backend, funcnode):
        """Overridden so that, on a backend which does not support fti
        ranking, the function is dropped when it appears in an orderby
        clause, or replaced by a 1.0 constant otherwise.
        """
        if self.supports(backend):
            return
        # climb up the tree looking for an enclosing sort term, if any
        sortterm = funcnode.parent
        while not (sortterm is None or isinstance(sortterm, SortTerm)):
            sortterm = sortterm.parent
        if sortterm is None:
            # not used for sorting: substitute a neutral constant rank
            funcnode.parent.replace(funcnode, Constant(1.0, 'Float'))
            detached = funcnode
        else:
            # used for sorting: drop the whole sort term
            sortterm.parent.remove(sortterm)
            detached = sortterm
        # variable references below the detached node are no longer used
        for vref in detached.iget_nodes(VariableRef):
            vref.unregister_reference()
register_function(FTIRANK)
class FSPATH(FunctionDescr):
"""return path of some bytes attribute stored using the Bytes
File-System Storage (bfss)
......
......@@ -33,7 +33,7 @@ XB-Python-Version: ${python:Versions}
Conflicts: cubicweb-multisources
Replaces: cubicweb-multisources
Provides: cubicweb-multisources
Depends: ${python:Depends}, cubicweb-common (= ${source:Version}), cubicweb-ctl (= ${source:Version}), python-logilab-database (>= 1.0.2), cubicweb-postgresql-support | cubicweb-mysql-support | python-pysqlite2
Depends: ${python:Depends}, cubicweb-common (= ${source:Version}), cubicweb-ctl (= ${source:Version}), python-logilab-database (>= 1.1.0), cubicweb-postgresql-support | cubicweb-mysql-support | python-pysqlite2
Recommends: pyro, cubicweb-documentation (= ${source:Version})
Description: server part of the CubicWeb framework
CubicWeb is a semantic web application framework.
......
......@@ -16,8 +16,8 @@
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
"""Fake objects to ease testing of cubicweb without a fully working environment
"""
__docformat__ = "restructuredtext en"
from logilab.database import get_db_helper
......@@ -46,7 +46,7 @@ class FakeConfig(dict, BaseApptestConfiguration):
return self._cubes
def sources(self):
return {}
return {'system': {'db-driver': 'sqlite'}}
class FakeRequest(CubicWebRequestBase):
......
......@@ -18,8 +18,8 @@
"""some utilities to ease repository testing
This module contains functions to initialize a new repository.
"""
__docformat__ = "restructuredtext en"
from pprint import pprint
......@@ -134,24 +134,32 @@ def restore_schema_eids_idx(schema, schema_eids):
schema._eid_index[rdef.eid] = rdef
from logilab.common.testlib import TestCase
from logilab.common.testlib import TestCase, mock_object
from logilab.database import get_db_helper
from rql import RQLHelper
from cubicweb.devtools.fake import FakeRepo, FakeSession
from cubicweb.server import set_debug
from cubicweb.server.querier import QuerierHelper
from cubicweb.server.session import Session
from cubicweb.server.sources.rql2sql import remove_unused_solutions
from cubicweb.server.sources.rql2sql import SQLGenerator, remove_unused_solutions
class RQLGeneratorTC(TestCase):
schema = None # set this in concret test
schema = backend = None # set this in concret test
def setUp(self):
self.repo = FakeRepo(self.schema)
self.repo.system_source = mock_object(dbdriver=self.backend)
self.rqlhelper = RQLHelper(self.schema, special_relations={'eid': 'uid',
'has_text': 'fti'})
'has_text': 'fti'},
backend=self.backend)
self.qhelper = QuerierHelper(self.repo, self.schema)
ExecutionPlan._check_permissions = _dummy_check_permissions
rqlannotation._select_principal = _select_principal
if self.backend is not None:
dbhelper = get_db_helper(self.backend)
self.o = SQLGenerator(self.schema, dbhelper)
def tearDown(self):
ExecutionPlan._check_permissions = _orig_check_permissions
......@@ -270,6 +278,7 @@ class BasePlannerTC(BaseQuerierTC):
self.system = self.sources[-1]
do_monkey_patch()
self._dumb_sessions = [] # by hi-jacked parent setup
self.repo.vreg.rqlhelper.backend = 'postgres' # so FTIRANK is considered
def add_source(self, sourcecls, uri):
self.sources.append(sourcecls(self.repo, self.o.schema,
......
......@@ -108,6 +108,10 @@ class IFTIndexableAdapter(EntityAdapter):
else:
yield entity
# weight in ABCD
entity_weight = 1.0
attr_weight = {}
def get_words(self):
"""used by the full text indexer to get words to index
......@@ -121,10 +125,11 @@ class IFTIndexableAdapter(EntityAdapter):
# take care to cases where we're modyfying the schema
entity = self.entity
pending = self._cw.transaction_data.setdefault('pendingrdefs', set())
words = []
words = {}
for rschema in entity.e_schema.indexable_attributes():
if (entity.e_schema, rschema) in pending:
continue
weight = self.attr_weight.get(rschema, 'C')
try:
value = entity.printable_value(rschema, format='text/plain')
except TransformError:
......@@ -134,16 +139,19 @@ class IFTIndexableAdapter(EntityAdapter):
rschema, entity.eid)
continue
if value:
words += tokenize(value)
words.setdefault(weight, []).extend(tokenize(value))
for rschema, role in entity.e_schema.fulltext_relations():
if role == 'subject':
for entity_ in getattr(entity, rschema.type):
words += entity_.cw_adapt_to('IFTIndexable').get_words()
merge_weight_dict(words, entity_.cw_adapt_to('IFTIndexable').get_words())
else: # if role == 'object':
for entity_ in getattr(entity, 'reverse_%s' % rschema.type):
words += entity_.cw_adapt_to('IFTIndexable').get_words()
merge_weight_dict(words, entity_.cw_adapt_to('IFTIndexable').get_words())
return words
def merge_weight_dict(maindict, newdict):
    """Merge the words of `newdict` into `maindict`, in place.

    Both arguments map a weight to a list of words. For each weight of
    `newdict`, its words are appended to the list stored under the same
    weight in `maindict` (a new list is created when the weight is not there
    yet, so lists of `newdict` are never shared with `maindict`).

    Returns None.
    """
    # `items()` rather than the Python 2 only `iteritems()`: the iteration is
    # the same and the helper keeps working on any Python version
    for weight, words in newdict.items():
        maindict.setdefault(weight, []).extend(words)
class IDownloadableAdapter(EntityAdapter):
"""interface for downloadable entities"""
......
if repo.system_source.dbdriver == 'postgres':
sql('ALTER TABLE appears ADD COLUMN weight float')
sql('UPDATE appears SET weight=1.0 ')
......@@ -577,7 +577,6 @@ class CubicWebSchema(Schema):
except BadSchemaDefinition:
reversed_etype_map = dict( (v, k) for k, v in ETYPE_NAME_MAP.iteritems() )
if rdef.subject in reversed_etype_map or rdef.object in reversed_etype_map:
self.warning('huuuu')
return
raise
if rdefs:
......
......@@ -96,7 +96,7 @@ from logilab.common.decorators import cached
from rql.stmts import Union, Select
from rql.nodes import (VariableRef, Comparison, Relation, Constant, Variable,
Not, Exists)
Not, Exists, SortTerm, Function)
from cubicweb import server
from cubicweb.utils import make_uid
......@@ -1330,6 +1330,12 @@ class TermsFiltererVisitor(object):
orderby.append)
if orderby:
newroot.set_orderby(orderby)
elif rqlst.orderby:
for sortterm in rqlst.orderby:
if any(f for f in sortterm.iget_nodes(Function) if f.name == 'FTIRANK'):
newnode, oldnode = sortterm.accept(self, newroot, terms)
if newnode is not None:
newroot.add_sort_term(newnode)
self.process_selection(newroot, terms, rqlst)
elif not newroot.where:
# no restrictions have been copied, just select terms and add
......@@ -1530,12 +1536,38 @@ class TermsFiltererVisitor(object):
copy.operator = '='
return copy, node
def visit_function(self, node, newroot, terms):
    """Visit a function node, with dedicated handling of FTIRANK.

    Any function other than FTIRANK is handed over to `visit_default`.

    Return a 2-uple whose second item is `node`; the first item is the
    node produced for `newroot`, or None when nothing is produced.
    """
    if node.name == 'FTIRANK':
        # FTIRANK is somewhat special... Rank function should be included in
        # the same query as the has_text relation, potentially added to
        # selection for later usage
        if not self.hasaggrstep and self.final and node not in self.skip:
            # final step without aggregation and no selection position
            # recorded for this node: process it as any other node
            return self.visit_default(node, newroot, terms)
        elif any(s for s in self.sources if s.uri != 'system'):
            # at least one involved source is not the system source: nothing
            # is produced for this node
            # NOTE(review): presumably because ranking is only available from
            # the system source -- confirm
            return None, node
        # NOTE(review): disabled code below, kept as found
        # p = node.parent
        # while p is not None and not isinstance(p, SortTerm):
        # p = p.parent
        # if isinstance(p, SortTerm):
        if not self.hasaggrstep and self.final and node in self.skip:
            # a position was recorded for this node (see below): refer to it
            # through an integer constant
            return Constant(self.skip[node], 'Int'), node
        # XXX only if not yet selected
        newroot.append_selected(node.copy(newroot))
        # remember the size of the selection right after appending the copy,
        # i.e. the 1-based position of the copied rank function
        self.skip[node] = len(newroot.selection)
        return None, node
    return self.visit_default(node, newroot, terms)
def visit_default(self, node, newroot, terms):
    """Default visit: visit the children of `node`, then copy it into
    `newroot` with the visited children.

    Return a 2-uple (copied node, node as given back by the children visit).
    """
    children, origin = self._visit_children(node, newroot, terms)
    copied = copy_node(newroot, origin, children)
    return copied, origin
visit_mathexpression = visit_constant = visit_function = visit_default
visit_sort = visit_sortterm = visit_default
visit_mathexpression = visit_constant = visit_default
def visit_sortterm(self, node, newroot, terms):
    """Visit a sort term: copy it into `newroot` only when at least one of
    its children has been kept by the children visit.

    Return a 2-uple (copied sort term or None, node as given back by the
    children visit).
    """
    children, origin = self._visit_children(node, newroot, terms)
    if children:
        return copy_node(newroot, origin, children), origin
    # nothing left under the sort term: do not copy it
    return None, origin
def _visit_children(self, node, newroot, terms):
subparts = []
......
......@@ -140,13 +140,6 @@ class AggrStep(LimitOffsetMixIn, Step):
def mytest_repr(self):
"""return a representation of this step suitable for test"""
sel = self.select.selection
restr = self.select.where
self.select.selection = self.selection
self.select.where = None
rql = self.select.as_string(kwargs=self.plan.args)
self.select.selection = sel
self.select.where = restr
try:
# rely on a monkey patch (cf unittest_querier)
table = self.plan.tablesinorder[self.table]
......@@ -155,12 +148,19 @@ class AggrStep(LimitOffsetMixIn, Step):
# not monkey patched
table = self.table
outputtable = self.outputtable
return (self.__class__.__name__, rql, self.limit, self.offset, table,
outputtable)
sql = self.get_sql().replace(self.table, table)
return (self.__class__.__name__, sql, outputtable)
def execute(self):
    """Execute this step.

    Children steps are executed first, then the SQL given by `get_sql()`
    is run through the plan. When the step has an output table, that
    temporary table is created and the query is turned into an insertion
    into it.

    Return whatever the plan's `sqlexec` returns.
    """
    self.execute_children()
    query = self.get_sql()
    target = self.outputtable
    if target:
        # results are expected in a temporary table rather than returned
        self.plan.create_temp_table(target)
        query = 'INSERT INTO %s %s' % (target, query)
    return self.plan.sqlexec(query, self.plan.args)
def get_sql(self):
self.inputmap = inputmap = self.children[-1].outputmap
# get the select clause
clause = []
......@@ -223,17 +223,15 @@ class AggrStep(LimitOffsetMixIn, Step):
sql.append('LIMIT %s' % self.limit)
if self.offset:
sql.append('OFFSET %s' % self.offset)
#print 'DATA', plan.sqlexec('SELECT * FROM %s' % self.table, None)
sql = ' '.join(sql)
if self.outputtable:
self.plan.create_temp_table(self.outputtable)
sql = 'INSERT INTO %s %s' % (self.outputtable, sql)
return self.plan.sqlexec(sql, self.plan.args)
return ' '.join(sql)
def visit_function(self, function):
"""generate SQL name for a function"""
return '%s(%s)' % (function.name,
','.join(c.accept(self) for c in function.children))
try:
return self.children[0].outputmap[str(function)]
except KeyError:
return '%s(%s)' % (function.name,
','.join(c.accept(self) for c in function.children))
def visit_variableref(self, variableref):
"""get the sql name for a variable reference"""
......
......@@ -29,7 +29,7 @@ from logilab.common.cache import Cache
from logilab.common.compat import any
from rql import RQLSyntaxError
from rql.stmts import Union, Select
from rql.nodes import Relation, VariableRef, Constant, SubQuery
from rql.nodes import Relation, VariableRef, Constant, SubQuery, Function
from cubicweb import Unauthorized, QueryError, UnknownEid, typed_eid
from cubicweb import server
......@@ -50,7 +50,8 @@ def update_varmap(varmap, selected, table):
key = term.as_string()
value = '%s.C%s' % (table, i)
if varmap.get(key, value) != value:
raise Exception('variable name conflict on %s' % key)
raise Exception('variable name conflict on %s: got %s / %s'
% (key, value, varmap))
varmap[key] = value
# permission utilities ########################################################
......@@ -285,7 +286,26 @@ class ExecutionPlan(object):
for term in origselection:
newselect.append_selected(term.copy(newselect))
if select.orderby:
newselect.set_orderby([s.copy(newselect) for s in select.orderby])
sortterms = []
for sortterm in select.orderby:
sortterms.append(sortterm.copy(newselect))
for fnode in sortterm.get_nodes(Function):
if fnode.name == 'FTIRANK':
# we've to fetch the has_text relation as well
var = fnode.children[0].variable
rel = iter(var.stinfo['ftirels']).next()
assert not rel.ored(), 'unsupported'
newselect.add_restriction(rel.copy(newselect))
# remove relation from the orig select and
# cleanup variable stinfo
rel.parent.remove(rel)
var.stinfo['ftirels'].remove(rel)
var.stinfo['relations'].remove(rel)
# XXX not properly re-annotated after security insertion?
newvar = newselect.get_variable(var.name)
newvar.stinfo.setdefault('ftirels', set()).add(rel)
newvar.stinfo.setdefault('relations', set()).add(rel)
newselect.set_orderby(sortterms)
_expand_selection(select.orderby, selected, aliases, select, newselect)
select.orderby = () # XXX dereference?
if select.groupby:
......@@ -562,6 +582,8 @@ class QuerierHelper(object):
# rql parsing / analysing helper
self.solutions = repo.vreg.solutions
rqlhelper = repo.vreg.rqlhelper
# set backend on the rql helper, will be used for function checking
rqlhelper.backend = repo.config.sources()['system']['db-driver']
self._parse = rqlhelper.parse
self._annotate = rqlhelper.annotate
# rql planner
......
......@@ -22,8 +22,8 @@ Notes:
from which it comes from) are stored in a varchar column encoded as a base64
string. This is because it should actually be Bytes but we want an index on
it for fast querying.
"""
from __future__ import with_statement
__docformat__ = "restructuredtext en"
......
......@@ -568,12 +568,14 @@ class SQLGenerator(object):
sql += '\nHAVING %s' % having
# sort
if sorts:
sql += '\nORDER BY %s' % ','.join(self._sortterm_sql(sortterm,
fselectidx)
for sortterm in sorts)
if fneedwrap:
selection = ['T1.C%s' % i for i in xrange(len(origselection))]
sql = 'SELECT %s FROM (%s) AS T1' % (','.join(selection), sql)
sqlsortterms = [self._sortterm_sql(sortterm, fselectidx)
for sortterm in sorts]
sqlsortterms = [x for x in sqlsortterms if x is not None]
if sqlsortterms:
sql += '\nORDER BY %s' % ','.join(sqlsortterms)
if sorts and fneedwrap:
selection = ['T1.C%s' % i for i in xrange(len(origselection))]
sql = 'SELECT %s FROM (%s) AS T1' % (','.join(selection), sql)
state.finalize_source_cbs()
finally:
select.selection = origselection
......@@ -651,12 +653,14 @@ class SQLGenerator(object):
def _sortterm_sql(self, sortterm, selectidx):
term = sortterm.term
try:
sqlterm = str(selectidx.index(str(term)) + 1)
sqlterm = selectidx.index(str(term)) + 1
except ValueError:
# Constant node or non selected term
sqlterm = str(term.accept(self))
sqlterm = term.accept(self)
if sqlterm is None:
return None
if sortterm.asc:
return sqlterm
return str(sqlterm)
else:
return '%s DESC' % sqlterm
......@@ -1014,7 +1018,8 @@ class SQLGenerator(object):
not_ = True
else:
not_ = False
return self.dbhelper.fti_restriction_sql(alias, const.eval(self._args),
query = const.eval(self._args)
return self.dbhelper.fti_restriction_sql(alias, query,
jointo, not_) + restriction
def visit_comparison(self, cmp):
......@@ -1057,6 +1062,15 @@ class SQLGenerator(object):
def visit_function(self, func):
"""generate SQL name for a function"""
if func.name == 'FTIRANK':
try:
rel = iter(func.children[0].variable.stinfo['ftirels']).next()
except KeyError:
raise BadRQLQuery("can't use FTIRANK on variable not used in an"
" 'has_text' relation (eg full-text search)")
const = rel.get_parts()[1].children[0]
return self.dbhelper.fti_rank_order(self._fti_table(rel),
const.eval(self._args))
args = [c.accept(self) for c in func.children]
if func in self._state.source_cb_funcs:
# function executed as a callback on the source
......
......@@ -15,9 +15,8 @@
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
"""SQL utilities functions and classes.
"""SQL utilities functions and classes."""
"""
__docformat__ = "restructuredtext en"
import os
......
......@@ -15,9 +15,6 @@
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
"""
"""
from logilab.database import FunctionDescr
from logilab.database.sqlite import register_sqlite_pyfunc
......@@ -25,7 +22,7 @@ from rql.utils import register_function
try:
class DUMB_SORT(FunctionDescr):
supported_backends = ('sqlite',)
pass
register_function(DUMB_SORT)
def dumb_sort(something):
......
[system]
db-driver = postgres
db-host = localhost
db-port =
adapter = native
db-name = cw_fti_test
db-encoding = UTF-8
db-user = syt
db-password = syt
[admin]
login = admin
password = gingkow
from __future__ import with_statement
from cubicweb.devtools import ApptestConfiguration
from cubicweb.devtools.testlib import CubicWebTC
from cubicweb.selectors import implements
from cubicweb.entities.adapters import IFTIndexableAdapter
class PostgresFTITC(CubicWebTC):
    """Check the ordering of full-text search results sorted with FTIRANK.

    The test configuration uses the 'sources_fti' source file.
    """
    config = ApptestConfiguration('data', sourcefile='sources_fti')

    def test_occurence_count(self):
        """Cards are expected by decreasing number of occurrences of the
        searched word in their content.
        """
        req = self.request()
        thrice = req.create_entity('Card', title=u'c1',
                                   content=u'cubicweb cubicweb cubicweb')
        once = req.create_entity('Card', title=u'c3',
                                 content=u'cubicweb')
        twice = req.create_entity('Card', title=u'c2',
                                  content=u'cubicweb cubicweb')
        self.commit()
        rset = req.execute('Card X ORDERBY FTIRANK(X) DESC WHERE X has_text "cubicweb"')
        found = rset.rows
        expected = [[thrice.eid], [twice.eid], [once.eid]]
        self.assertEquals(found, expected)

    def test_attr_weight(self):
        """With weight 'A' set on the title attribute, the card matching on
        its title is expected before cards matching on their content.
        """
        class CardIFTIndexableAdapter(IFTIndexableAdapter):
            __select__ = implements('Card')
            attr_weight = {'title': 'A'}

        with self.temporary_appobjects(CardIFTIndexableAdapter):
            req = self.request()
            thrice = req.create_entity('Card', title=u'c1',
                                       content=u'cubicweb cubicweb cubicweb')
            twice = req.create_entity('Card', title=u'c2',
                                      content=u'cubicweb cubicweb')
            in_title = req.create_entity('Card', title=u'cubicweb',
                                         content=u'autre chose')
            self.commit()
            rset = req.execute('Card X ORDERBY FTIRANK(X) DESC WHERE X has_text "cubicweb"')
            found = rset.rows
            expected = [[in_title.eid], [thrice.eid], [twice.eid]]
            self.assertEquals(found, expected)

    def test_entity_weight(self):
        """With an entity weight of 2.0 on Personne, the person is expected
        before its comments, which are then ordered by occurrence count.
        """
        class PersonneIFTIndexableAdapter(IFTIndexableAdapter):
            __select__ = implements('Personne')
            entity_weight = 2.0

        with self.temporary_appobjects(PersonneIFTIndexableAdapter):
            req = self.request()
            person = req.create_entity('Personne', nom=u'c1', prenom=u'cubicweb')
            twice = req.create_entity('Comment', content=u'cubicweb cubicweb',
                                      comments=person)
            thrice = req.create_entity('Comment',
                                       content=u'cubicweb cubicweb cubicweb',
                                       comments=person)
            self.commit()
            rset = req.execute('Any X ORDERBY FTIRANK(X) DESC WHERE X has_text "cubicweb"')
            found = rset.rows
            expected = [[person.eid], [thrice.eid], [twice.eid]]
            self.assertEquals(found, expected)
......@@ -413,7 +413,7 @@ class MSPlannerTC(BaseMSPlannerTC):
"""retrieve CWUser X from both sources and return concatenation of results
"""
self._test('CWUser X ORDERBY X LIMIT 10 OFFSET 10',
[('AggrStep', 'Any X ORDERBY X', 10, 10, 'table0', None, [
[('AggrStep', 'SELECT table0.C0 FROM table0 ORDER BY table0.C0 LIMIT 10 OFFSET 10', None, [
('FetchStep', [('Any X WHERE X is CWUser', [{'X': 'CWUser'}])],
[self.ldap, self.system], {}, {'X': 'table0.C0'}, []),
]),
......@@ -423,7 +423,7 @@ class MSPlannerTC(BaseMSPlannerTC):
"""
# COUNT(X) is kept in sub-step and transformed into SUM(X) in the AggrStep
self._test('Any COUNT(X) WHERE X is CWUser',
[('AggrStep', 'Any COUNT(X)', None, None, 'table0', None, [
[('AggrStep', 'SELECT SUM(table0.C0) FROM table0', None, [
('FetchStep', [('Any COUNT(X) WHERE X is CWUser', [{'X': 'CWUser'}])],
[self.ldap, self.system], {}, {'COUNT(X)': 'table0.C0'}, []),
]),
......@@ -498,7 +498,7 @@ class MSPlannerTC(BaseMSPlannerTC):
def test_complex_ordered(self):
self._test('Any L ORDERBY L WHERE X login L',
[('AggrStep', 'Any L ORDERBY L', None, None, 'table0', None,
[('AggrStep', 'SELECT table0.C0 FROM table0 ORDER BY table0.C0', None,
[('FetchStep', [('Any L WHERE X login L, X is CWUser',
[{'X': 'CWUser', 'L': 'String'}])],
[self.ldap, self.system], {}, {'X.login': 'table0.C0', 'L': 'table0.C0'}, []),
......@@ -507,7 +507,7 @@ class MSPlannerTC(BaseMSPlannerTC):
def test_complex_ordered_limit_offset(self):
self._test('Any L ORDERBY L LIMIT 10 OFFSET 10 WHERE X login L',
[('AggrStep', 'Any L ORDERBY L', 10, 10, 'table0', None,
[('AggrStep', 'SELECT table0.C0 FROM table0 ORDER BY table0.C0 LIMIT 10 OFFSET 10', None,
[('FetchStep', [('Any L WHERE X login L, X is CWUser',
[{'X': 'CWUser', 'L': 'String'}])],
[self.ldap, self.system], {}, {'X.login': 'table0.C0', 'L': 'table0.C0'}, []),
......@@ -593,7 +593,7 @@ class MSPlannerTC(BaseMSPlannerTC):
2. return content of the table sorted
"""
self._test('Any X,F ORDERBY F WHERE X firstname F',
[('AggrStep', 'Any X,F ORDERBY F', None, None, 'table0', None,
[('AggrStep', 'SELECT table0.C0, table0.C1 FROM table0 ORDER BY table0.C1', None,
[('FetchStep', [('Any X,F WHERE X firstname F, X is CWUser',
[{'X': 'CWUser', 'F': 'String'}])],
[self.ldap, self.system], {},
......@@ -657,7 +657,7 @@ class MSPlannerTC(BaseMSPlannerTC):
def test_complex_typed_aggregat(self):
self._test('Any MAX(X) WHERE X is Card',
[('AggrStep', 'Any MAX(X)', None, None, 'table0', None,
[('AggrStep', 'SELECT MAX(table0.C0) FROM table0', None,
[('FetchStep',
[('Any MAX(X) WHERE X is Card', [{'X': 'Card'}])],
[self.cards, self.system], {}, {'MAX(X)': 'table0.C0'}, [])
......@@ -1299,9 +1299,66 @@ class MSPlannerTC(BaseMSPlannerTC):