Commit ff0e69da authored by Nicolas Chauvat
Browse files

refactor ads proximity to cache results

parent 8f866fc8494d
......@@ -6,8 +6,58 @@
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
"""
from collections import defaultdict
from itertools import chain
from cubicweb.entities import AnyEntity
# Module-level caches, filled lazily by update_proximity_cache() and emptied
# by reset_proximity_cache().
# ADS_PROXIMITY maps mk_key(eid1, eid2) to the distance between the word
# vectors of the two ads.
ADS_PROXIMITY = {}
# VECTORS maps the eid of each ClassifiedAd to its word-count vector
# (see vectorize).
VECTORS = {}
# find ads similar to a given ad (by comparing descriptions)
def distance(a, b):
    """Return the sum of absolute count differences between two vectors.

    ``a`` and ``b`` are mappings from word to count; a word missing from
    one of them counts as 0 there.
    """
    total = 0
    for word in set(a).union(b):
        total += abs(a.get(word, 0) - b.get(word, 0))
    return total
def vectorize(entity):
    """Return the word-count vector of ``entity.description``.

    The description is split on whitespace and every token is lower-cased
    before being counted. An empty or missing description yields an empty
    vector.
    """
    counts = defaultdict(int)
    text = entity.description
    if not text:
        return counts
    for token in text.split():
        # XXX stemming word would be better
        counts[token.lower()] += 1
    return counts
def mk_key(vec1, vec2):
    """Return the two arguments as a pair ordered smallest first.

    The result does not depend on the order of the arguments, which lets
    it serve as a symmetric dictionary key for a pair of values.
    """
    if vec1 <= vec2:
        return (vec1, vec2)
    return (vec2, vec1)
def get_vectors(_cw):
    """Return a dict mapping each ClassifiedAd eid to its word-count vector.

    ``_cw`` must provide ``execute``; it is used to fetch every
    ClassifiedAd entity with an RQL query.
    """
    rset = _cw.execute('Any X WHERE X is ClassifiedAd')
    vectors = {}
    for ad in rset.entities():
        vectors[ad.eid] = vectorize(ad)
    return vectors
def get_ads_proximity(vectors):
    """Return the distance between every unordered pair of vectors.

    ``vectors`` maps an identifier to a word-count vector. The result maps
    ``mk_key(id1, id2)`` to ``distance(vectors[id1], vectors[id2])`` for
    each pair of distinct identifiers.
    """
    eids = sorted(vectors)
    # pairing each eid only with those that follow it in sorted order
    # visits each unordered pair exactly once
    return dict(
        (mk_key(first, second), distance(vectors[first], vectors[second]))
        for position, first in enumerate(eids)
        for second in eids[position + 1:])
def reset_proximity_cache():
    """Rebind both module-level caches to new empty dicts.

    A trace message is printed to standard output once this is done.
    """
    global ADS_PROXIMITY, VECTORS
    ADS_PROXIMITY, VECTORS = {}, {}
    print('emptying proximity cache *****************')
def update_proximity_cache(_cw):
    """Fill whichever module-level caches are currently empty.

    VECTORS is filled from ``get_vectors(_cw)`` first, then ADS_PROXIMITY
    is filled from the distances between the cached vectors. A cache that
    already holds entries is left untouched. A trace message is printed
    for each cache that gets filled.
    """
    if not VECTORS:
        fresh_vectors = get_vectors(_cw)
        VECTORS.update(fresh_vectors)
        print('computing vectors ****************')
    if not ADS_PROXIMITY:
        fresh_distances = get_ads_proximity(VECTORS)
        ADS_PROXIMITY.update(fresh_distances)
        print('computing proximity ****************')
class ClassifiedAd(AnyEntity):
__regid__ = 'ClassifiedAd'
......@@ -16,3 +66,9 @@ class ClassifiedAd(AnyEntity):
def parent(self):
    """Return the first item of ``self.advertise``, or None when it is empty."""
    advertised = self.advertise
    if not advertised:
        return None
    return advertised[0]
def closest_ads(self):
    """Return ``(distance, eid)`` pairs for every other cached ad, closest first.

    The pairs are sorted by increasing distance, ties being ordered by eid.
    An empty list is returned when this ad is still unknown to the cache
    after a rebuild.
    """
    update_proximity_cache(self._cw)
    if self.eid not in VECTORS:
        # This ad is unknown to the cache (e.g. it was created after the
        # cache was built): rebuild once instead of failing with KeyError.
        reset_proximity_cache()
        update_proximity_cache(self._cw)
        if self.eid not in VECTORS:
            return []
    return sorted((ADS_PROXIMITY[mk_key(eid, self.eid)], eid)
                  for eid in VECTORS if eid != self.eid)
from cubes.classifiedad.entities import reset_proximity_cache
from cubicweb.server.hook import Hook
from cubicweb.selectors import implements
class ClassifiedAdAttributeHook(Hook):
    """Empty the ads proximity cache whenever the set of ads changes.

    The cache holds one vector per ClassifiedAd and one distance per pair
    of ads, so it goes stale when an ad is added, modified or deleted.
    Listening to updates only would leave deleted eids in the cache and
    new ads out of it.
    """
    __regid__ = 'classifiedad_attribute_hook'
    events = ('after_add_entity', 'after_update_entity', 'after_delete_entity')
    __select__ = Hook.__select__ & implements('ClassifiedAd')

    def __call__(self):
        """Drop the cached vectors and distances."""
        reset_proximity_cache()
......@@ -6,9 +6,6 @@
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
"""
from collections import defaultdict
from itertools import chain
from logilab.mtconverter import xml_escape
from cubicweb.selectors import has_related_entities, implements
......@@ -90,45 +87,21 @@ class ClassifiedAdInContextView(baseviews.SameETypeListItemView):
# __select__ = DateRangeFacet.__select__ & implements('ClassifiedAd')
# rtype = 'publication_date'
# find ads similar to a given ad (by comparing descriptions)
def distance(a,b):
    """Return the sum of absolute count differences between two vectors.

    Both arguments map a word to its count; a word absent from one vector
    is treated as having count 0 in it.
    """
    words = set(a)
    words.update(b)
    return sum(abs(a.get(word, 0) - b.get(word, 0)) for word in words)
def vectorize(entity):
    """Return the word-count vector of ``entity.description``.

    The description is split on whitespace and each word is lower-cased
    before being counted, so that ``Car`` and ``car`` are the same word.
    An empty or missing description yields an empty vector.
    """
    vec = defaultdict(int)
    if entity.description:
        for word in entity.description.split():
            # XXX stemming word would be better
            vec[word.lower()] += 1
    return vec
def mk_key(vec1, vec2):
    """Return both arguments as a tuple sorted in increasing order.

    Swapping the arguments gives the same tuple, so it can be used as a
    symmetric key for a pair of values.
    """
    return tuple(sorted((vec1, vec2)))
class ProximityOfAds(baseviews.EntityView):
    """Table view showing, for each ad of the result set, its closest ads."""
    __regid__ = 'proximity-of-ads'
    title = _('proximity of ads')
    __select__ = implements('ClassifiedAd')

    def call(self):
        """Write one table row per ad, with up to three of its closest ads.

        Neighbours come from ``ad.closest_ads()``; the view does not
        compute any vector or distance itself.
        """
        self.w(u'<table><tr><th width="50%%">%s</th><th>%s</th></tr>' %
               (self._cw._('ClassifiedAd'), self._cw._('Similar Ads')))
        for row, ad in enumerate(self.cw_rset.entities()):
            # rows alternate CSS classes, starting with 'odd' for row 0
            self.w(u'<tr class="%s"><td>' % (u'even' if row % 2 else u'odd'))
            self.w(ad.view('sameetypelistitem'))
            self.w(u'</td><td><ul>')
            # closest_ads() is sorted by increasing distance: keep the first 3
            for score, eid in ad.closest_ads()[:3]:
                neighbour = self._cw.entity_from_eid(eid)
                self.w(u'<li>%s - %s</li>' % (score, neighbour.view('outofcontext')))
            self.w(u'</ul></td></tr>')
        self.w(u'</table>')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment