Commit 073889b33c0a authored by Arthur Lutz

[ccplugin] use parallel_bulk when asked for

parent 70db85e13042
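The change replaces per-entity `es.index()` calls with the `parallel_bulk` helper from elasticsearch-py, fed by a generator of bulk action dicts (the `bulk_actions` function added below). As a rough illustration of how that helper is driven, here is a minimal sketch outside CubicWeb; the documents, index name and `_type` value are placeholders, and `_type` is only relevant for the pre-7 Elasticsearch releases this code targets:

from __future__ import print_function

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

# Placeholder data and index name, for illustration only.
DOCS = [{'eid': 1, 'title': u'hello'}, {'eid': 2, 'title': u'world'}]


def actions(index_name):
    # One action dict per document, mirroring what bulk_actions() yields
    # from a CubicWeb result set.
    for doc in DOCS:
        yield {'_op_type': 'index',
               '_index': index_name,
               '_type': 'demo_etype',
               '_id': doc['eid'],
               '_source': doc}


es = Elasticsearch(['localhost:9200'])
# parallel_bulk() returns a lazy generator of (ok, result) tuples;
# iterating it is what actually sends the indexing requests.
for ok, result in parallel_bulk(es, actions('demo_index'),
                                raise_on_error=False,
                                raise_on_exception=False):
    if not ok:
        print(result)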
@@ -10,6 +10,7 @@
 import os.path as osp
 from elasticsearch import Elasticsearch
+from elasticsearch.helpers import parallel_bulk
 from cubicweb.cwctl import CWCTL
 from cubicweb.utils import admincnx
@@ -22,6 +23,20 @@
 HERE = osp.dirname(osp.abspath(__file__))
+# TODO optimisation : json serialize on one side, send to ES on the other
+# TODO progress bar
+def bulk_actions(rset, index_name, etype):
+    for entity in rset.entities():
+        serializer = entity.cw_adapt_to('ISerializable')
+        json = serializer.serialize()
+        yield {'_op_type': 'index',
+               '_index': index_name,
+               '_type': etype,
+               '_id': entity.eid,
+               '_source': json
+               }
 class IndexInES(Command):
     """Index content in ElasticSearch.
@@ -34,6 +49,8 @@
     arguments = '<instance id>'
     options = [('dry-run', {'type': 'yn', 'default': False,
                             'help': 'set to True if you want to skip the insertion in ES'}),
+               ('bulk', {'type': 'yn', 'default': False,
+                         'help': 'set to True if you want to insert in bulk in ES'}),
                ('debug', {'type': 'yn', 'default': False,
                           'help': 'set to True if you want to print out debug info and progress'}),
                ]
@@ -60,27 +77,28 @@
                 for etype in indexable_types(schema):
                     rset = cnx.execute(
                         'Any X WHERE X is %(etype)s' % {'etype': etype})
+                    if len(rset) == 0:
+                        continue
                     if self.config.debug:
                         print(u'indexing {} {}'.format(etype, len(rset)))
-                    for entity in rset.entities():
-                        # TODO add specific IFTIES adapter
-                        serializer = entity.cw_adapt_to('ISerializable')
-                        json = serializer.serialize()
-                        # TODO remove non indexable data or (better) serialize only
-                        if not self.config.dry_run:
-                            es.index(index=index_name,
-                                     id=entity.eid,
-                                     doc_type=etype,
-                                     body=json)
-                        # TODO optimize with elasticsearch.helpers.bulk
-                        # or elasticsearch.helpers.parallel_bulk
-                        # or elasticsearch.helpers.streaming_bulk
-                        # TODO optimisation : json serialize on one side, send to ES on the other
-                        # TODO progress bar
-                        if self.config.debug:
-                            print(u'.', end=u'')
-                    if self.config.debug:
-                        print(u'')
+                    if self.config.bulk:
+                        # success, failed = bulk(es, bulk_actions(rset, index_name, etype))
+                        # if self.config.debug:
+                        #     print(u'ES bulk : {} success {} failed'.format(success, failed))
+                        list(parallel_bulk(es, bulk_actions(rset, index_name, etype),
+                                           raise_on_error=False,
+                                           raise_on_exception=False))
+                    else:
+                        for entity in rset.entities():
+                            # TODO add specific IFTIES adapter
+                            serializer = entity.cw_adapt_to('ISerializable')
+                            json = serializer.serialize()
+                            if not self.config.bulk:
+                                if not self.config.dry_run:
+                                    es.index(index=index_name,
+                                             id=entity.eid,
+                                             doc_type=etype,
+                                             body=json)
             else:
                 if self.config.debug:
                     print(u'no elasticsearch configuration found, skipping')
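Wrapping the call in `list(...)` forces the lazy `parallel_bulk` generator to run but discards the per-action results; the commented-out `bulk()` lines hint at reporting counts instead. A sketch of a small helper (hypothetical, not part of this commit) that consumes the same generator and tallies successes and failures:

from __future__ import print_function

from elasticsearch.helpers import parallel_bulk


def parallel_index(es, actions, debug=False):
    # Drive parallel_bulk and count outcomes instead of discarding them
    # with list(); errors are reported rather than raised.
    success = failed = 0
    for ok, result in parallel_bulk(es, actions,
                                    raise_on_error=False,
                                    raise_on_exception=False):
        if ok:
            success += 1
        else:
            failed += 1
    if debug:
        print(u'ES bulk : {} success {} failed'.format(success, failed))
    return success, failed

Inside the command it could be called as parallel_index(es, bulk_actions(rset, index_name, etype), self.config.debug).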