dataimport.py 6.28 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# coding: utf-8
# copyright 2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""cubicweb-seda data import tools"""

from __future__ import print_function

from itertools import count
from os.path import join, dirname

from six import text_type

from cubicweb.dataimport.importer import SimpleImportLog

from cubes.skos import lcsv, sobjects as skos


# Concept schemes to import, one entry per LCSV data file. Each entry is a 4-tuple
# ``(title, rtypes, etypes, fname)`` consumed by `import_seda_schemes`:
#
# * `title`: title of the ConceptScheme to create,
# * `rtypes`: name of a relation type, or tuple of names, set as `scheme_relation_type`,
# * `etypes`: name of an entity type, or (possibly empty) tuple of names, set as
#   `scheme_entity_type`,
# * `fname`: name of the data file, relative to the 'migration/data' directory.
LCSV_FILES = (
    # schemes extracted from SEDA 2 XSD
    (u'SEDA 2 : Actions',
     'seda_final_action', 'SEDAStorageRule',
     'final_action_storage_code_type.csv'),
    (u'SEDA 2 : Unités de mesure',
     'seda_unit', ('SEDAWidth', 'SEDAHeight', 'SEDADepth',
                   'SEDADiameter', 'SEDALength', 'SEDAThickness'),
     'measurement_units_type.csv'),
    (u'SEDA 2 : Unités de poids',
     'seda_unit', 'SEDAWeight',
     'measurement_weight_units_type.csv'),
    (u'SEDA 2 : Types de mot-clé',
     'seda_keyword_type_to', (),
     'code_keyword_type.csv'),
    (u'SEDA : Niveaux de description',
     'seda_description_level', (),
     'level_type.csv'),
    # schemes extracted from SEDA 2 XSD, completed to support earlier SEDA versions
    (u'SEDA : Sort final',
     'seda_final_action', 'SEDAAppraisalRule',
     'final_action_appraisal_code_type.csv'),
    # schemes extracted from earlier SEDA versions
    (u"SEDA : Durée d'utilité administrative",
     'seda_rule', 'SEDASeqAppraisalRuleRule',
     'dua.csv'),
    (u"SEDA : Codes de restriction d'accès",
     'seda_rule', 'SEDASeqAccessRuleRule',
     'access_control.csv'),
    (u"SEDA : Types d'objets-données",
     'seda_type_to', (),
     'document_type_code.csv'),
    # other schemes
    (u'Types MIME',
     'seda_mime_type_to', (),
     'mime_types.csv'),
    (u"Types d'évènement",
     'seda_event_type_to', (),
     'event_types.csv'),
    (u'Encodages (extraits du schéma UN/CEFACT)',
     'seda_encoding_to', (),
     'encodings.csv'),
    (u'Formats de fichier (PRONOM)',
     'seda_format_id_to', (),
     'file_formats.csv'),
    (u'Niveau de classification (IGI 1300)',
     'seda_classification_level', (),
     'classification_levels.csv'),
    (u'Langues (ISO-639-3)',
     ('seda_language_to', 'seda_description_language_to'), (),
     'languages.csv'),
)


def lcsv_import(cnx, store, fname, scheme_uri):
    """Import the content of the LCSV data file `fname` using `store`.

    The file is read from the 'migration/data' directory located next to this module, parsed
    with ';' as delimiter and 'utf-8' as encoding, and its entities are handed to the skos
    import machinery together with `scheme_uri`. Errors are raised rather than logged
    (`raise_on_error=True`).
    """
    data_path = join(dirname(__file__), 'migration', 'data', fname)
    with open(data_path) as lcsv_file:
        ext_entities = skos.lcsv_extentities(lcsv_file, scheme_uri, ';', 'utf-8')
        skos.store_skos_extentities(
            cnx, store, ext_entities, SimpleImportLog(fname),
            raise_on_error=True, extid_as_cwuri=False)


def lcsv_check(cnx, store, fname, scheme_uri):
    """Check the LCSV data file `fname` by converting it entirely, without importing anything.

    The file is read from the 'migration/data' directory located next to this module and every
    triple is generated then discarded; presumably an inconsistent file makes the conversion
    raise. `cnx`, `store` and `scheme_uri` are unused: the signature mirrors `lcsv_import`.
    """
    sequence = count()

    def _numbered_uri(val):
        # prefix the value with the next number of the sequence
        prefix = text_type(next(sequence))
        return prefix + val

    data_path = join(dirname(__file__), 'migration', 'data', fname)
    with open(data_path) as lcsv_file:
        converter = lcsv.LCSV2RDF(lcsv_file, ';', 'utf-8',
                                  # XXX drop once skos is released
                                  uri_generator=_numbered_uri, uri_cls=text_type)
        # consume every triple so the whole file gets processed
        for _triple in converter.triples():
            pass


108
def init_seda_scheme(cnx, title):
    """Create a scheme to hold SEDA concepts with the given title.

    Schemes whose title starts with 'SEDA :' are given the u'edition 2009' description, other
    schemes have no description. Return the created ConceptScheme entity.

    Separated function to be monkey-patched if one needs to customize the scheme creation
    (eg saem).
    """
    description = u'edition 2009' if title.startswith('SEDA :') else None
    return cnx.create_entity('ConceptScheme', title=title, description=description)


117
118
119
120
121
def get_store(cnx):
    """Return the store to be used to import LCSV data files.

    A `MassiveObjectStore` is returned when the system source database driver is 'postgres',
    a `NoHookRQLObjectStore` otherwise.

    Separated function to be monkey-patched if one needs to customize the store (eg saem).
    """
    if cnx.repo.system_source.dbdriver == 'postgres':
        from cubicweb.dataimport.massive_store import MassiveObjectStore
        return MassiveObjectStore(cnx, eids_seq_range=1000)
    else:
        from cubicweb.dataimport.stores import NoHookRQLObjectStore
        return NoHookRQLObjectStore(cnx)


def import_seda_schemes(cnx, lcsv_import=lcsv_import):
    """Import all LCSV data files defined in LCSV_FILES.

    For each entry whose scheme title is not found yet, a scheme is created, the data file is
    processed with `lcsv_import`, then the scheme is linked to its relation and entity types
    and the store is flushed. Schemes that already exist are left untouched.

    `lcsv_import` is the callable used to process each file; it is called with
    ``(cnx, store, fname, scheme_cwuri)``.
    """
    feed_extid2eid_cache(cnx)
    store = get_store(cnx)
    for title, rtypes, etypes, fname in LCSV_FILES:
        if cnx.find('ConceptScheme', title=title):
            # a scheme with this title already exists, don't import it again
            continue
        print('importing', title.encode('utf-8'))
        scheme = init_seda_scheme(cnx, title)
        lcsv_import(cnx, store, fname, scheme.cwuri)
        # LCSV_FILES allows a single name instead of a tuple of names
        if not isinstance(rtypes, tuple):
            rtypes = (rtypes,)
        for rtype in rtypes:
            rtype_e = cnx.find('CWRType', name=rtype).one()
            scheme.cw_set(scheme_relation_type=rtype_e)
        if not isinstance(etypes, tuple):
            etypes = (etypes,)
        for etype in etypes:
            etype_e = cnx.find('CWEType', name=etype).one()
            scheme.cw_set(scheme_entity_type=etype_e)
        store.flush()
    store.commit()
    store.finish()
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172


# Hack: avoid recomputing the extid2eid mapping for each LCSV file. Recomputing it is costly
# with the massive store, since indexes may have been removed.
from logilab.common.decorators import monkeypatch  # noqa
from cubicweb.dataimport.importer import cwuri2eid as orig_cwuri2eid  # noqa
from cubes.skos import post321_import  # noqa

EXTID2EID_CACHE = None


def feed_extid2eid_cache(cnx):
    """Compute the cwuri to eid mapping once and store it in the global EXTID2EID_CACHE.

    The mapping covers 'ConceptScheme', 'Label', 'Concept' and 'ExternalUri' entities.
    """
    global EXTID2EID_CACHE
    EXTID2EID_CACHE = orig_cwuri2eid(cnx, ('ConceptScheme', 'Label'))
    # Concepts and external URIs may come from any source. The original function must be used
    # here: the module-level `cwuri2eid` is the monkey-patched replacement which only returns
    # EXTID2EID_CACHE, so calling it would update the cache with itself (a no-op).
    EXTID2EID_CACHE.update(orig_cwuri2eid(cnx, ('Concept', 'ExternalUri')))


@monkeypatch(post321_import)
def cwuri2eid(cnx, etypes, source_eid=None):
    """Return the global EXTID2EID_CACHE, ignoring every argument.

    Installed on `post321_import` through `monkeypatch`, so that the mapping is not recomputed
    for each imported file. The cache is None until `feed_extid2eid_cache` has been called.
    """
    return EXTID2EID_CACHE