Commit b738f56e authored by Nicolas Chauvat
Browse files

[entities] improve search by normalizing vectors

parent 05483144db18
......@@ -6,6 +6,7 @@
:license: GNU Lesser General Public License, v2.1 -
import re, math
from collections import defaultdict
from itertools import chain
......@@ -16,17 +17,45 @@ VECTORS = {}
# find ads similar to a given ad (by comparing descriptions)
'fr': 'le la les un une d du de des ou et donc or ni car pas - avec sur au'.split(),
# Token separators: apostrophe (so "l'eau" splits into "l" and "eau"),
# space, tab, carriage return and newline.
WHITESPACES = re.compile('[\' \t\r\n]')


def tokenize(text):
    """Split ``text`` into raw tokens on every separator character.

    Each separator is matched individually, so adjacent separators (or a
    leading/trailing one) produce empty strings in the result; callers are
    expected to discard them.

    :param text: the string to split
    :return: list of tokens, possibly containing empty strings
    """
    return WHITESPACES.split(text)
def distance(a, b):
    """Return the Manhattan (L1) distance between two sparse vectors.

    A key missing from one of the vectors counts as 0 for that vector.

    :param a: mapping of key -> numeric weight
    :param b: mapping of key -> numeric weight
    :return: sum of absolute per-key differences over the union of keys
    """
    # Union of the keys of both mappings; each key is visited exactly once.
    keys = set(chain(a, b))
    return sum(abs(a.get(key, 0) - b.get(key, 0)) for key in keys)
def vectorize(entity):
    """Build a word-count vector from an entity's description.

    The description is split with ``tokenize()``, each token is lower-cased,
    and every token that is neither empty nor listed in ``STOP_WORDS['fr']``
    is counted.

    :param entity: object exposing a ``description`` attribute (may be empty)
    :return: ``defaultdict(int)`` mapping word -> number of occurrences;
             empty when the entity has no description
    """
    vec = defaultdict(int)
    if entity.description:
        for word in tokenize(entity.description):
            if not word:
                # tokenize() yields '' between adjacent separators
                continue
            word = word.lower() # XXX stemming word would be better
            if word in STOP_WORDS['fr']:
                # stop words carry no meaning for the comparison
                continue
            vec[word] += 1
    return vec
def normalize_vectors(vectors):
    """Rescale every word weight against its total over all vectors, in place.

    Each weight ``count`` becomes ``log(1 + count) / log(1 + total)``, where
    ``total`` is the sum of that word's counts across every vector.  A word
    whose total is 0 gets the weight 0.0 instead of raising
    ``ZeroDivisionError`` (``log(1) == 0``).

    :param vectors: mapping of identifier -> (mapping of word -> count);
                    the inner mappings are modified in place
    :return: None
    """
    word_count = defaultdict(int)
    for vector in vectors.values():
        for word, count in vector.items():
            word_count[word] += count
    for vector in vectors.values():
        # iterate over a snapshot since the values are rewritten in the loop
        for word, count in list(vector.items()):
            denominator = math.log(1 + word_count[word])
            if denominator:
                vector[word] = math.log(1 + count) / denominator
            else:
                # the word never actually occurs anywhere: no weight at all
                vector[word] = 0.0
        # alternative normalization (relative frequency within the vector):
        #total = sum(vector.values())
        #for word, count in vector.items():
        #    vector[word] = count/total
    # debug
    # for vector in vectors.values():
    #     l = sorted(vector.items(), key=lambda x: x[1])
    #     print list(reversed(l[-10:]))
def mk_key(vec1, vec2):
    """Return an order-independent key for a pair of values.

    The smaller value always comes first, so ``mk_key(a, b)`` and
    ``mk_key(b, a)`` yield the same tuple.

    :param vec1: first comparable value
    :param vec2: second comparable value
    :return: ``(smaller, larger)`` tuple
    """
    return (min(vec1, vec2), max(vec1, vec2))
......@@ -34,6 +63,7 @@ def get_vectors(_cw):
rset = _cw.execute('Any X WHERE X is ClassifiedAd')
ads = dict((entity.eid, entity) for entity in rset.entities())
vectors = dict((eid, vectorize(entity)) for eid, entity in ads.items())
return vectors
def get_ads_proximity(vectors):
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment