Commit 40c03a3f authored by Nicolas Chauvat
Browse files

[similar] improve information extraction

parent 04de47d5d435
......@@ -17,16 +17,27 @@ from cubes.similarity import register_similarity, reset_similarity, vsm
def prefix_slice(value, start, end, *args):
    """Return ``"<value>: <tokens>"`` built from a slice of the matched tokens.

    ``value`` is the label placed before the colon; ``args`` are the tokens
    passed by the caller.  Only ``args[start:end]`` are kept, joined with
    single spaces.  An empty slice yields the label followed by ``": "``.
    """
    selected = args[start:end]
    joined = u' '.join(selected)
    return u'%s: %s' % (value, joined)
def rgx_replace(value, *args):
    r"""Expand ``\N`` placeholders in *value* with the N-th positional argument.

    ``value`` is a template such as ``u'year-built: 19\2'``; ``\1`` refers to
    ``args[0]``, ``\2`` to ``args[1]`` and so on.  Placeholders whose number
    exceeds ``len(args)`` are left untouched.

    The substitution is done in a single pass over the template, so text
    coming from an argument is never itself re-scanned for placeholders, and
    ``\10`` refers to the tenth argument when there is one (instead of being
    read as ``\1`` followed by ``0``).  When there is no such argument the
    longest digit prefix that names an existing argument is used and the
    remaining digits are kept literally, e.g. ``\10`` with two arguments gives
    ``args[0] + '0'``.
    """
    count = len(args)

    def expand(match):
        digits = match.group(1)
        # Try the longest placeholder number first, then shorter prefixes, so
        # that trailing literal digits (as in u'19\2' style templates) survive.
        for width in range(len(digits), 0, -1):
            index = int(digits[:width])
            if index <= count:
                return args[index - 1] + digits[width:]
        # No argument with that number: keep the placeholder as written.
        return match.group(0)

    # A placeholder is a backslash followed by a number without leading zero
    # (numbering starts at 1, so u'\0' is never a placeholder).
    return re.sub(u'\\\\([1-9][0-9]*)', expand, value)
# Token-shape patterns used as match keys in TRANSLATE_TABLE.  Each one is
# anchored with ^...$ so it must cover the whole token.
# Raw strings keep ``\d`` from being parsed as an (invalid) string escape.
number2digit_rgx = re.compile(r'^\d\d$')
number4digit_rgx = re.compile(r'^\d\d\d\d$')
number5digit_rgx = re.compile(r'^\d\d\d\d\d$')
number_rgx = re.compile(r'^\d+$')
# Kept as a u'' literal (with an escaped backslash) because the character
# class contains a non-ASCII character: digits glued to "m²" or "m2".
surface_rgx = re.compile(u'^\\d+m[²2]$')
# "année", with or without the accent.
annee_rgx = re.compile(u'^ann[ée]e$')
TRANSLATE_TABLE = [
(('vlce',), ft.partial(vsm.const, 'valence')),
(('s-sol',), ft.partial(vsm.const, 'sous-sol')),
(('dt',), ft.partial(vsm.const, 'dont')),
(('ch',), ft.partial(vsm.const, 'chambre')),
(('chbs',), ft.partial(vsm.const, 'chambre')),
(('chbrs',), ft.partial(vsm.const, 'chambre')),
(('chambres',), ft.partial(vsm.const, 'chambre')),
(('grd',), ft.partial(vsm.const, 'grand')),
(('gde',), ft.partial(vsm.const, 'grande')),
(('ttes',), ft.partial(vsm.const, 'toutes')),
(('niv',), ft.partial(vsm.const, 'niveau')),
(('st',), ft.partial(vsm.const, 'saint')),
......@@ -38,12 +49,16 @@ TRANSLATE_TABLE = [
(('salle','à','manger'), ft.partial(vsm.const, 'salle à manger')),
(('salle','bains'), ft.partial(vsm.const, 'salle de bain')),
(('salle','eau'), ft.partial(vsm.const, 'salle d\'eau')),
(('jacuzi'), ft.partial(vsm.const, 'jacuzzi')),
(('jacuzi',), ft.partial(vsm.const, 'jacuzzi')),
((number5digit_rgx,), ft.partial(vsm.prefix_join, 'postcode')),
((number_rgx, u'm²'), ft.partial(vsm.prefix_join, 'surface')),
((number_rgx, 'm2'), lambda x,y: u'surface: %s m²' % x),
((surface_rgx,), lambda x: u'surface: %s m²' % x[:-2]),
((number_rgx, u'pièces'), ft.partial(prefix_slice, u'rooms', 0, 1)),
(('drome',), ft.partial(vsm.const, u'drôme')),
((annee_rgx, number4digit_rgx), ft.partial(rgx_replace, u'year-built: \\2')),
((annee_rgx, number2digit_rgx), ft.partial(rgx_replace, u'year-built: 19\\2')),
(('drome',), ft.partial(vsm.const, u'location: http://fr.wikipedia.org/wiki/Dr%C3%B4me_%28d%C3%A9partement%29')),
(('drôme',), ft.partial(vsm.const, u'location: http://fr.wikipedia.org/wiki/Dr%C3%B4me_%28d%C3%A9partement%29')),
(('gare', 'tgv'), ft.partial(vsm.prefix_join, 'location')),
(('danton',), ft.partial(vsm.prefix_join, 'location')),
(('grand','charran'), ft.partial(vsm.prefix_join, 'location')),
......@@ -51,6 +66,7 @@ TRANSLATE_TABLE = [
(('appartement',), ft.partial(vsm.prefix_join, 'property-type')),
(('maison',), ft.partial(vsm.prefix_join, 'property-type')),
(('villa',), ft.partial(vsm.const, 'property-type: maison')),
(('vlce',), ft.partial(vsm.const, 'location: http://dbpedia.org/resource/Valence%2C_Dr%C3%B4me')),
(('valence',), ft.partial(vsm.const, 'location: http://dbpedia.org/resource/Valence%2C_Dr%C3%B4me')),
(('26000', 'valence',), ft.partial(vsm.const, 'location: http://dbpedia.org/resource/Valence%2C_Dr%C3%B4me')),
(('st', 'marcel'), ft.partial(vsm.const, 'location: http://dbpedia.org/resource/Saint-Marcel-l%C3%A8s-Valence')),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment