Commit 53f27046 authored by Nicolas Chauvat's avatar Nicolas Chauvat
Browse files

use new cube similarity

parent 33e681ff8a0d
......@@ -43,7 +43,7 @@ for dname in ('entities', 'views', 'sobjects', 'hooks', 'schema', 'data', 'i18n'
# Note: here, you'll need to add subdirectories if you want
# them to be included in the debian package
__depends_cubes__ = {}
__depends_cubes__ = {'similarity': None, }
__depends__ = {'cubicweb': '>= 3.6.0'}
__use__ = ()
__recommend__ = ()
# -*- encoding: utf-8 -*-
"""this contains the cube-specific entities' classes
:organization: Logilab
......@@ -6,136 +7,72 @@
:license: GNU Lesser General Public License, v2.1 -
import re, math
from collections import defaultdict
from itertools import chain
import re
import functools as ft
from cubicweb.entities import AnyEntity
#import gensim
'fr': set('''le la les un une d du de des ou et donc or ni car pas - avec sur
au dans dont ds en ce sa se il'''.split()),
'vlce': 'valence',
'chbs': 'chambres',
'chbrs': 'chambres',
'grd': 'grand',
'ttes': 'toutes',
'niv': 'niveau',
# Word splitter: breaks on apostrophes, whitespace and common punctuation.
# Raw string avoids the invalid escape sequences \( and \! of the original
# pattern (inside a character class these characters need no escaping).
TOKENIZER = re.compile(r"[' \t\r\n,.;/>+():!]")
def tokenize(text):
    """Yield normalized, lowercased tokens of *text*.

    Empty splits and French stop words are dropped; remaining words are
    mapped through WORD_MAP (abbreviation -> canonical form).
    """
    for word in TOKENIZER.split(text.lower()):
        if not word:
            # (fix) the 'continue' statements were lost in extraction,
            # leaving the guards without bodies; restore them
            continue
        if word in STOP_WORDS['fr']:
            continue
        # XXX stemming word would be better
        word = WORD_MAP.get(word, word)
        yield word
def norm(vector):
    """Return the Euclidean (L2) norm of a sparse vector stored as a dict."""
    squared = 0.0
    for value in vector.values():
        squared += value * value
    return math.sqrt(squared)
def dot_product(a, b):
    """Return the dot product of two sparse vectors stored as dicts.

    (fix) The original wrapped the sum in math.sqrt and took abs() of each
    term: a dot product has neither, and the square root made
    cosine_similarity (dot / (|a|*|b|)) mathematically wrong.
    """
    # keys absent from either vector contribute 0, so iterating the
    # smaller dict's keys is sufficient (the original scanned the union)
    small, large = (a, b) if len(a) <= len(b) else (b, a)
    return sum(value * large.get(key, 0) for key, value in small.items())
def cosine_similarity(a, b):
    """Cosine similarity of sparse vectors *a* and *b* (0 when either is null)."""
    denominator = norm(a) * norm(b)
    if not denominator:
        return 0
    return dot_product(a, b) / denominator
def combinations(iterable):
    """Yield every unordered pair of items from *iterable*, in sorted order.

    Equivalent to itertools.combinations(sorted(iterable), 2).
    """
    ordered = sorted(iterable)
    size = len(ordered)
    for i in range(size):
        first = ordered[i]
        for j in range(i + 1, size):
            yield (first, ordered[j])
def mk_key(vec1, vec2):
    """Order-independent cache key for a pair of eids: the sorted pair."""
    lo, hi = sorted((vec1, vec2))
    return (lo, hi)
class VectorSpaceModel(object):
    """In-memory tf-idf vector space over a corpus of texts keyed by eid.

    Sparse vectors are plain dicts mapping word -> weight.  Pairwise cosine
    similarities and per-document tf-idf vectors are cached.
    """

    def __init__(self):
        # (fix) the constructor body was lost in extraction; all the state
        # lives in reset() so that the model can be cleared and rebuilt
        self.reset()

    def reset(self):
        """Drop the whole corpus and every cached computation."""
        self.words = set()                       # vocabulary seen so far
        self.corpus = {}                         # eid -> raw count vector
        self.word_freq = defaultdict(int)        # word -> total occurrences
        self.document_freq = defaultdict(int)    # word -> nb of docs containing it
        self._similarity = {}                    # (eid, eid) -> cosine score
        self._tfidf = {}                         # eid -> cached tf-idf vector

    def add_text(self, eid, text):
        """Tokenize *text* and register its term-count vector under *eid*."""
        vec = defaultdict(int)
        if text:
            for word in tokenize(text):
                vec[word] += 1
                self.word_freq[word] += 1
                # (fix) feed the vocabulary: views read VSM.words but no
                # visible code populated it -- TODO confirm against history
                self.words.add(word)
        for word in vec:
            self.document_freq[word] += 1
        self.corpus[eid] = vec

    def tfidf(self, vector):
        """Return the L2-normalized tf-idf vector for a raw count *vector*."""
        vec = defaultdict(int)
        for word, count in vector.items():
            tf = count * 1. / self.word_freq[word]
            try:
                # NOTE(review): under Python 2 this is a truncating integer
                # division, kept as found
                idf = math.log(len(self.corpus) / self.document_freq[word])
            except (OverflowError, ValueError, ZeroDivisionError):
                # degenerate frequency data: neutralize the word
                idf = 0
            vec[word] = tf * idf
        # normalize result (guard the empty/null vector)
        total = norm(vec)
        if total:
            for word, count in vec.items():
                vec[word] = count / total
        return vec

    def similarity(self, vector):
        """Return [(score, eid), ...] of *vector* against every corpus document.

        NOTE(review): the corpus side is compared using raw count vectors,
        not tf-idf ones -- possibly meant to be tfidf_by_id(eid); kept as found.
        """
        query = self.tfidf(vector)  # hoisted: loop-invariant
        return [(cosine_similarity(query, other), eid)
                for eid, other in self.corpus.items()]

    def tfidf_by_id(self, eid):
        """Memoized tf-idf vector of corpus document *eid*."""
        if eid in self._tfidf:
            return self._tfidf[eid]
        return self._tfidf.setdefault(eid, self.tfidf(self.corpus[eid]))

    def similarity_by_id(self, eid):
        """Return [(score, other_eid), ...] of *eid* against the rest of the corpus."""
        similarity = []
        for other in self.corpus:
            if other == eid:
                # (fix) the 'continue' was lost in extraction: without it the
                # document would be compared with itself
                continue
            key = mk_key(eid, other)
            if key not in self._similarity:
                self._similarity[key] = cosine_similarity(
                    self.tfidf_by_id(eid), self.tfidf_by_id(other))
            similarity.append((self._similarity[key], other))
        return similarity
VSM = VectorSpaceModel()
def reset_proximity_cache():
    """Invalidate the module-level vector space.

    (fix) The body was lost in extraction; the hooks module imports this to
    flush the cache when an ad changes, so restore the obvious delegation.
    """
    VSM.reset()
def update_proximity_cache(_cw):
    """(Re)feed the vector space with every ClassifiedAd in the database.

    The title is repeated three times so its words weigh more than the
    description's.
    """
    rset = _cw.execute('Any X WHERE X is ClassifiedAd')
    for entity in rset.entities():
        # (fix) guard against NULL title/description -- ' '.join crashes on
        # None; consistent with ad_to_text which already uses `or ''`
        text = u' '.join([entity.title or u''] * 3
                         + [entity.description or u''])
        VSM.add_text(entity.eid, text)
from cubes.similarity import register_similarity, reset_similarity, vsm
def replace_filter(table, text):
    """Apply every (src, dst) substitution of *table* to *text*, in order.

    Order matters: a later pair sees the output of the earlier ones.
    """
    result = text
    for source, destination in table:
        result = result.replace(source, destination)
    return result
def const(value, *args):
    """Return *value* unchanged, ignoring any extra positional arguments.

    Used with functools.partial to build translation-table callbacks that
    replace a matched token sequence by a constant string.
    """
    return value
def prefix_join(value, *args):
    """Space-join *args* and prefix the result with "value: "."""
    joined = u' '.join(args)
    return u'%s: %s' % (value, joined)
# Matches a whole string made of digits only (used in the translation table
# to recognize surfaces and prices).  Raw string: '\d' in a plain literal is
# an invalid escape sequence under Python 3.
number = re.compile(r'^\d+$')
(('vlce',), ft.partial(const, 'valence')),
(('chbs',), ft.partial(const, 'chambre')),
(('chbrs',), ft.partial(const, 'chambre')),
(('chambres',), ft.partial(const, 'chambre')),
(('grd',), ft.partial(const, 'grand')),
(('ttes',), ft.partial(const, 'toutes')),
(('niv',), ft.partial(const, 'niveau')),
(('st',), ft.partial(const, 'saint')),
(('min',), ft.partial(const, 'minute')),
(('mn',), ft.partial(const, 'minute')),
(('hab',), ft.partial(const, 'habitable')),
(('sde',), ft.partial(const, 'salle d\'eau')),
(('sdb',), ft.partial(const, 'salle de bain')),
(('salle','bains'), ft.partial(const, 'salle de bain')),
(('salle','eau'), ft.partial(const, 'salle d\'eau')),
((number, u'm²'), ft.partial(prefix_join, 'surface')),
((number, 'm2'), ft.partial(prefix_join, 'surface')),
(('drome',), ft.partial(const, u'drôme')),
(('gare', 'tgv'), ft.partial(prefix_join, 'location')),
(('danton',), ft.partial(prefix_join, 'location')),
(('appartement',), ft.partial(prefix_join, 'property-type')),
(('maison',), ft.partial(prefix_join, 'property-type')),
(('villa',), ft.partial(const, 'property-type: maison')),
(('valence',), ft.partial(const, 'location:')),
(('26000', 'valence',), ft.partial(const, 'location:')),
(('st', 'marcel'), ft.partial(const, 'location:')),
(('saint', 'marcel'), ft.partial(const, 'location:')),
((number, number, '€'), ft.partial(prefix_join, 'price')),
(' ', ' '),
# Text-normalization pipeline, applied in order: literal replacements,
# tokenization, stop-word removal, token translation, then 2-grams.
# (fix) the closing bracket was lost in extraction.
filters = [ft.partial(replace_filter, REPLACE_TABLE),
           ft.partial(vsm.tokenize_filter, vsm.TOKENIZE_PATTERN),
           ft.partial(vsm.exclude_filter, vsm.STOP_WORDS['fr']),
           ft.partial(vsm.transform_filter, TRANSLATE_TABLE),
           ft.partial(vsm.ngrams_filter, 2),
           ]
rql = 'Any X WHERE X is ClassifiedAd'
def ad_to_text(entity):
    """Text representation of an ad for indexing.

    The title is repeated three times so its words weigh more than the
    description's; None attributes are treated as empty strings.
    """
    parts = [entity.title or u''] * 3
    parts.append(entity.description or u'')
    return u' '.join(parts)
register_similarity('ClassifiedAd', rql, ad_to_text, filters)
class ClassifiedAd(AnyEntity):
    """Entity class for classified ads."""
    __regid__ = 'ClassifiedAd'

    def parent(self):
        """Return the first advertised entity, or None when the ad
        advertises nothing (implicit return)."""
        if self.advertise:
            return self.advertise[0]

    def closest_ads(self):
        """Return [(score, eid), ...] of every other ad scored against this one.

        (fix) the extracted version returned the similarity list only when
        the corpus was EMPTY (always an empty list) and None otherwise,
        while callers unconditionally slice the result; populate the cache
        lazily, then always return the scores.
        """
        if not VSM.corpus:
            update_proximity_cache(self._cw)
        return VSM.similarity_by_id(self.eid)
from cubes.classifiedad.entities import reset_proximity_cache
from cubes.similarity import reset_similarity
from cubicweb.server.hook import Hook
from cubicweb.selectors import implements
......@@ -8,5 +8,5 @@ class ClassifiedAdAttributeHook(Hook):
__select__ = Hook.__select__ & implements('ClassifiedAd')
def __call__(self):
......@@ -14,6 +14,8 @@ from cubicweb.web import uicfg, component, box
from cubicweb.web.views import primary, baseviews
from cubicweb.web.facet import RelationFacet, AttributeFacet, RangeFacet, DateRangeFacet
from cubes.similarity import get_vspace
# Hide technical relations from the primary view of a ClassifiedAd:
# they are rendered elsewhere (boxes, dedicated views) or are internal.
uicfg.primaryview_section.tag_subject_of(('ClassifiedAd', 'same_as', '*'), 'hidden')
uicfg.primaryview_section.tag_subject_of(('ClassifiedAd', 'url', '*'), 'hidden')
uicfg.primaryview_section.tag_subject_of(('ClassifiedAd', 'has_image', '*'), 'hidden')
......@@ -107,9 +109,9 @@ class ProximityOfAds(baseviews.EntityView):
self.w(u'<tr class="%s"><td>' % (row % 2 and u'even' or u'odd'))
scores = ad.closest_ads()
for score, eid in scores[:3]:
self.w(u'<li>%s - %s</li>' % (score, self._cw.entity_from_eid(eid).view('outofcontext')))
vspace = get_vspace('ClassifiedAd', self._cw)
for score, eids in vspace.similarity_by_id(ad.eid)[:3]:
self.w(u'<li>%.2f - %s</li>' % (score, ' '.join(self._cw.entity_from_eid(eid).view('outofcontext') for eid in eids)))
......@@ -120,13 +122,15 @@ class ClosestAdsBox(box.EntityBoxTemplate):
def cell_call(self, row, col, **kwargs):
entity = self.cw_rset.get_entity(row, col)
scores = entity.closest_ads()
vspace = get_vspace('ClassifiedAd', self._cw)
scores = vspace.similarity_by_id(entity.eid)
if scores:
self.w(u'<div class="sideBox">')
self.w(u'<div class="sideBoxTitle"><span>%s</span></div>' % _('Similar ads'))
self.w(u'<div class="%s"><div class="sideBoxBody">' % 'sideBox')
for score, eid in scores[:5]:
self.w(u'<span>%s - %s</span><br />' % (score, self._cw.entity_from_eid(eid).view('outofcontext')))
for score, eids in scores[:5]:
for eid in eids:
self.w(u'<span>%.2f - %s</span><br />' % (score, self._cw.entity_from_eid(eid).view('outofcontext')))
......@@ -140,7 +144,7 @@ class ClassifiedAdSimilarView(baseviews.EntityView):
from cubes.classifiedad import entities
for row, ad in enumerate(self.cw_rset.entities()):
vector = entities.VSM.tfidf(entities.VSM.corpus[ad.eid]).items()
vector = entities.vspace.tfidf_by_id(ad.eid)
self.w(xml_escape(unicode(repr(sorted(vector, reverse=True, key=lambda x: x[1])))))
self.w(u'<hr />'+xml_escape(unicode(repr(sorted(entities.VSM.words)))))
self.w(u'<hr />'+xml_escape(unicode(repr(sorted(word for word in entities.VSM.words if len(word) < 3)))))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment