Commit 633afddc authored by Nicolas Chauvat's avatar Nicolas Chauvat
Browse files

[entities] use Vector Space Model to compute similarities

parent 7a123eea7195
......@@ -12,81 +12,130 @@ from itertools import chain
from cubicweb.entities import AnyEntity
# Module-level caches, lazily filled by update_proximity_cache() and
# emptied by reset_proximity_cache().
ADS_PROXIMITY = {}  # (eid, eid) pair -> L1 distance between the two ad vectors
VECTORS = {}  # eid -> normalized bag-of-words vector of the ad description
# find ads similar to a given ad (by comparing descriptions)
#import gensim
# French stop words dropped during tokenization, per language code.
# Bug fix: the dict literal had the 'fr' key twice; the first (list) value
# was silently overwritten by the second.  A set also gives O(1) membership
# tests in tokenize().
STOP_WORDS = {
    'fr': set('''le la les un une d du de des ou et donc or ni car pas - avec sur
               au dans dont ds en ce sa se il'''.split()),
}
# Token separators: apostrophe and ASCII whitespace.
# NOTE(review): this splitter (and the tokenize below) is superseded later
# in the module by a redefinition with richer punctuation handling.
WHITESPACES = re.compile('[\' \t\r\n]')

def tokenize(text):
    """Split *text* on whitespace/apostrophes; may yield empty strings."""
    return WHITESPACES.split(text)
def distance(a, b):
    """L1 (Manhattan) distance between two sparse vectors stored as dicts.

    Missing keys count as 0.
    """
    keys = set(a)
    keys.update(b)
    total = 0
    for key in keys:
        total += abs(a.get(key, 0) - b.get(key, 0))
    return total
def vectorize(entity):
    """Build a bag-of-words count vector from an entity's description.

    Empty tokens and French stop words are skipped; tokens are lowercased.
    Returns a defaultdict(int) mapping word -> occurrence count (empty when
    the entity has no description).
    """
    counts = defaultdict(int)
    text = entity.description
    if text:
        for token in tokenize(text):
            if not token:
                continue
            token = token.lower()  # XXX stemming word would be better
            if token in STOP_WORDS['fr']:
                continue
            counts[token] += 1
    return counts
# Ad-hoc expansion of abbreviations common in French classified ads to
# their full forms; applied to lowercased tokens during tokenization.
WORD_MAP = {
'vlce': 'valence',
'chbs': 'chambres',
'chbrs': 'chambres',
'grd': 'grand',
'ttes': 'toutes',
'niv': 'niveau',
}
def normalize_vectors(vectors):
    """Rescale every count vector in place.

    Each count becomes log(1+count) / log(1+total count of that word over
    the whole collection), damping words that are frequent everywhere.
    *vectors* maps an id to a dict word -> count and is mutated in place.
    """
    totals = defaultdict(int)
    for vec in vectors.values():
        for word, count in vec.items():
            totals[word] += count
    for vec in vectors.values():
        for word in vec:
            vec[word] = math.log(1 + vec[word]) / math.log(1 + totals[word])
# Token separators: apostrophe, whitespace and common punctuation.
# Fix: use a raw string — the previous pattern contained the invalid escape
# sequences '\(' and '\!', which are deprecated (SyntaxWarning) in recent
# Python versions; inside a character class these need no escaping at all.
TOKENIZER = re.compile(r"[' \t\r\n,.;/>+():!]")

def tokenize(text):
    """Yield normalized tokens from *text*.

    Tokens are lowercased, empty strings and French stop words are
    dropped, and known abbreviations are expanded via WORD_MAP.
    """
    for word in TOKENIZER.split(text.lower()):
        if not word:
            continue
        if word in STOP_WORDS['fr']:
            continue
        # XXX stemming word would be better
        yield WORD_MAP.get(word, word)
def norm(vector):
    """Euclidean (L2) norm of a sparse vector stored as a dict."""
    squared = sum(value ** 2 for value in vector.values())
    return math.sqrt(squared)
def dot_product(a, b):
    """Standard dot product of two sparse vectors stored as dicts.

    Bug fix: the previous version returned sqrt(sum(|a_i * b_i|)), which
    is not a dot product — with it cosine_similarity(v, v) was 1/norm(v)
    instead of 1.  Only keys present in *a* can contribute, so iterating
    *a* alone suffices (missing keys in *b* count as 0).
    """
    return sum(a[key] * b.get(key, 0) for key in a)
def cosine_similarity(a, b):
    """Cosine similarity between two sparse vectors.

    Returns 0 when either vector is null (zero denominator).
    """
    denominator = norm(a) * norm(b)
    if not denominator:
        return 0
    return dot_product(a, b) / denominator
def combinations(iterable):
    """Yield every unordered pair of items, pairs in sorted order.

    Each pair (x, y) satisfies x <= y per the items' natural ordering.
    """
    items = sorted(iterable)
    size = len(items)
    for i in range(size):
        first = items[i]
        for j in range(i + 1, size):
            yield (first, items[j])
def mk_key(vec1, vec2):
    """Order-independent cache key for a pair of ids: smaller first."""
    if vec1 <= vec2:
        return (vec1, vec2)
    return (vec2, vec1)
def get_vectors(_cw):
    """Build one normalized bag-of-words vector per ClassifiedAd.

    Returns a dict eid -> vector; vectors are normalized in place by
    normalize_vectors().
    """
    rset = _cw.execute('Any X WHERE X is ClassifiedAd')
    vectors = {}
    for entity in rset.entities():
        vectors[entity.eid] = vectorize(entity)
    normalize_vectors(vectors)
    return vectors
def get_ads_proximity(vectors):
    """Compute the pairwise L1 distance for every pair of vectors.

    Returns a dict keyed by mk_key(eid1, eid2); each unordered pair is
    computed exactly once.
    """
    distances = {}
    for first, second in combinations(vectors):
        distances[mk_key(first, second)] = distance(vectors[first],
                                                    vectors[second])
    return distances
class VectorSpaceModel(object):
    """In-memory tf-idf vector space over texts keyed by eid.

    Texts are registered with add_text(); similarity queries return cosine
    similarities between tf-idf weighted bag-of-words vectors.

    Fixes vs the previous version:
    - idf used python2 integer division (len(corpus)/document_freq), which
      quantized idf values; it is now computed with true division, so the
      OverflowError workaround (py2-only ``print`` statement) is gone.
    - tfidf() no longer divides by zero when every weight is 0 (word present
      in every document) or when a query word is unknown to the corpus.
    - similarity() weights both sides with tf-idf (it used to compare the
      weighted query against raw corpus counts, inconsistently with
      similarity_by_id) and no longer recomputes tfidf(vector) per document.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Drop the corpus and every cached computation."""
        self.words = set()                     # vocabulary
        self.corpus = {}                       # eid -> raw count vector
        self.word_freq = defaultdict(int)      # word -> total occurrences
        self.document_freq = defaultdict(int)  # word -> nb of documents holding it
        self._similarity = {}                  # mk_key(eid, eid) -> cosine similarity
        self._tfidf = {}                       # eid -> cached tf-idf vector

    def add_text(self, eid, text):
        """Tokenize *text* and register its count vector under *eid*.

        A false-ish *text* registers an empty vector.
        """
        vec = defaultdict(int)
        if text:
            for word in tokenize(text):
                vec[word] += 1
                self.words.add(word)
                self.word_freq[word] += 1
            for word in vec:
                self.document_freq[word] += 1
        self.corpus[eid] = vec

    def tfidf(self, vector):
        """Return a normalized tf-idf weighted copy of a raw count vector.

        Words unknown to the corpus are skipped (weight 0).  The result is
        scaled so its weights sum to 1, unless every weight is 0.
        """
        vec = defaultdict(int)
        for word, count in vector.items():
            word_total = self.word_freq.get(word)
            doc_total = self.document_freq.get(word)
            if not word_total or not doc_total:
                # query word never seen in the corpus: no meaningful weight
                continue
            tf = count * 1. / word_total
            # float() forces true division under python2; the ratio is >= 1
            # (doc_total <= len(corpus)), so log() cannot fail here
            idf = math.log(float(len(self.corpus)) / doc_total)
            vec[word] = tf * idf
        # normalize so weights sum to 1; skip when all idf weights are 0
        # (e.g. every word occurs in every document) to avoid 0/0
        total = sum(vec.values())
        if total:
            for word, weight in vec.items():
                vec[word] = weight / total
        return vec

    def similarity(self, vector):
        """Rank every document against a raw count vector.

        Returns a list of (cosine similarity, eid) pairs, best match first.
        """
        weighted = self.tfidf(vector)  # hoisted: was recomputed per document
        scores = [(cosine_similarity(weighted, self.tfidf_by_id(eid)), eid)
                  for eid in self.corpus]
        scores.sort(reverse=True)
        return scores

    def tfidf_by_id(self, eid):
        """Cached tf-idf vector of a document already in the corpus."""
        try:
            return self._tfidf[eid]
        except KeyError:
            return self._tfidf.setdefault(eid, self.tfidf(self.corpus[eid]))

    def similarity_by_id(self, eid):
        """Rank every *other* document against document *eid*.

        Returns a list of (cosine similarity, other eid) pairs, best match
        first; pairwise similarities are cached across calls.
        """
        scores = []
        for other in self.corpus:
            if other == eid:
                continue
            key = mk_key(eid, other)
            if key not in self._similarity:
                self._similarity[key] = cosine_similarity(
                    self.tfidf_by_id(eid), self.tfidf_by_id(other))
            scores.append((self._similarity[key], other))
        scores.sort(reverse=True)
        return scores
# Shared model instance, filled lazily by update_proximity_cache().
VSM = VectorSpaceModel()

def reset_proximity_cache():
    """Forget every cached vector and similarity (call when ads change)."""
    global ADS_PROXIMITY, VECTORS
    ADS_PROXIMITY, VECTORS = {}, {}
    VSM.reset()
def update_proximity_cache(_cw):
    """Lazily fill the module-level caches from the ClassifiedAd entities.

    Safe to call repeatedly: each cache is only (re)computed when empty.
    Bug fix: VSM.add_text() used to run unconditionally on every call,
    re-registering every document and double-counting the model's word and
    document frequencies; it is now guarded like the other caches.
    """
    if not VECTORS:
        VECTORS.update(get_vectors(_cw))
    if not ADS_PROXIMITY:
        ADS_PROXIMITY.update(get_ads_proximity(VECTORS))
    if not VSM.corpus:
        rset = _cw.execute('Any X WHERE X is ClassifiedAd')
        for entity in rset.entities():
            VSM.add_text(entity.eid, entity.description)
class ClassifiedAd(AnyEntity):
__regid__ = 'ClassifiedAd'
......@@ -98,7 +147,6 @@ class ClassifiedAd(AnyEntity):
return self.advertise[0]
def closest_ads(self):
    """Return (similarity, eid) pairs for the other ads, best match first.

    Fix: the merged text left two method bodies in place — the old
    distance-table lookup returned before the new VSM-based code, making
    the latter unreachable dead code.  Keep only the intended VSM path,
    filling the model lazily on first use.
    """
    if not VSM.corpus:
        update_proximity_cache(self._cw)
    return VSM.similarity_by_id(self.eid)
......@@ -122,3 +122,22 @@ class ClosestAdsBox(box.EntityBoxTemplate):
self.w(u'</div>')
self.w(u'</div>')
self.w(u'</div>')
class ClassifiedAdSimilarView(baseviews.EntityView):
    """Debug-oriented view dumping an ad's tf-idf weights, the model's
    vocabulary, and the list of similar ads with their scores."""
    __regid__ = 'similar-ads'
    title = _('similar ads')
    __select__ = implements('ClassifiedAd')

    def call(self):
        from cubes.classifiedad import entities
        entities.update_proximity_cache(self._cw)
        model = entities.VSM
        for row, ad in enumerate(self.cw_rset.entities()):
            # tf-idf weights of this ad, heaviest first
            weights = model.tfidf(model.corpus[ad.eid]).items()
            by_weight = sorted(weights, reverse=True, key=lambda item: item[1])
            self.w(xml_escape(unicode(repr(by_weight))))
            # full vocabulary, then the suspiciously short tokens
            self.w(u'<hr />' + xml_escape(unicode(repr(sorted(model.words)))))
            short = sorted(word for word in model.words if len(word) < 3)
            self.w(u'<hr />' + xml_escape(unicode(repr(short))))
            self.w(u'<ul>')
            for score, eid in ad.closest_ads():
                target = self._cw.entity_from_eid(eid).view('outofcontext')
                self.w(u'<li>%s - %s</li>' % (score, target))
            self.w(u'</ul>')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment