dataimport.py 7.58 KB
Newer Older
1
# coding: utf-8
2
# copyright 2016-2021 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""cubicweb-seda data import tools"""

from __future__ import print_function

21
import io
22
from itertools import count
23
from os.path import abspath, dirname, join
24

25
26
from cubicweb.server.checkintegrity import reindex_entities
from cubicweb.dataimport.stores import NoHookRQLObjectStore
27
28
29
30
from cubicweb.dataimport.importer import (
    SimpleImportLog,
    cwuri2eid,
)
31

32
from cubicweb_skos import lcsv, sobjects as skos
33

34
35
36
37
# If you want to add a vocabulary here, append it to the end of the list when
# it is to be created in a migration script by simply calling
# import_seda_schemes: this is needed for proper interaction with saem, which
# attempts to allocate reproducible ARK identifiers to them.
38
39
40
# Vocabularies shipped with the cube, as (title, rtype, etype, path) tuples:
# * title: title of the concept scheme,
# * rtype: relation type name, or tuple of names, attached to the scheme,
# * etype: entity type name, or tuple of names, attached to the scheme,
# * path: absolute path of the LCSV file, located in `migration/data` next to
#   this module.
LCSV_FILES = [
    definition[:-1] + (join(abspath(dirname(__file__)),
                            'migration', 'data', definition[-1]),)
    for definition in (
        # schemes extracted from SEDA 2 XSD
        (u'SEDA 2 : Actions',
         'seda_final_action', 'SEDAStorageRule',
         'final_action_storage_code_type.csv'),
        (u'SEDA 2 : Unités de mesure',
         'seda_unit', ('SEDAWidth', 'SEDAHeight', 'SEDADepth',
                       'SEDADiameter', 'SEDALength', 'SEDAThickness'),
         'measurement_units_type.csv'),
        (u'SEDA 2 : Unités de poids',
         'seda_unit', 'SEDAWeight',
         'measurement_weight_units_type.csv'),
        (u'SEDA : Types de mot-clé',
         'seda_keyword_type_to', (),
         'code_keyword_type.csv'),
        (u'SEDA 2 : Status légaux',
         'seda_legal_status_to', (),
         'legal_status.csv'),
        (u'SEDA : Niveaux de description',
         'seda_description_level_to', (),
         'level_type.csv'),
        # schemes extracted from SEDA 2 XSD, completed to support earlier SEDA
        # versions
        (u'SEDA : Sort final',
         'seda_final_action', 'SEDAAppraisalRule',
         'final_action_appraisal_code_type.csv'),
        # schemes extracted from earlier SEDA versions
        (u"SEDA : Durée d'utilité administrative",
         'seda_rule', 'SEDASeqAppraisalRuleRule',
         'dua.csv'),
        (u"SEDA : Codes de restriction d'accès",
         'seda_rule', 'SEDASeqAccessRuleRule',
         'access_control.csv'),
        (u"SEDA : Règles de diffusion",
         'seda_rule', 'SEDASeqDisseminationRuleRule',
         'dissemination.csv'),
        (u"SEDA : Types d'objets-données",
         'seda_type_to', (),
         'document_type_code.csv'),
        # other schemes
        (u'Types MIME',
         'seda_mime_type_to', (),
         'mime_types.csv'),
        (u"Types d'évènement",
         'seda_event_type_to', (),
         'event_types.csv'),
        (u'Encodages (extraits du schéma UN/CEFACT)',
         'seda_encoding_to', (),
         'encodings.csv'),
        (u'Formats de fichier (PRONOM)',
         'seda_format_id_to', (),
         'file_formats.csv'),
        (u'Niveau de classification (IGI 1300)',
         'seda_classification_level', (),
         'classification_levels.csv'),
        (u'Langues (ISO-639-3)',
         (), (),
         'languages.csv'),
        (u"Algorithmes d'empreinte",
         'seda_algorithm', 'SEDABinaryDataObject',
         'digest_algorithms.csv'),
        (u'Catégories de fichier',
         'file_category', (),
         'file_categories.csv'),
        (u'Langues restreintes (compatible SEDA 0.2)',
         ('seda_language_to', 'seda_description_language_to'), (),
         'languages-seda02.csv'),
    )
]
108
109


110
def lcsv_import(cnx, store, fname, scheme_uri, **kwargs):
    """Actually import the LCSV data file `fname`.

    The file is read as binary and handed to `skos.lcsv_extentities` together
    with `scheme_uri`, using ';' as separator and 'utf-8' as encoding. The
    result is then given to `skos.store_skos_extentities` along with `cnx`,
    `store` and an import log named after the file.

    Extra keyword arguments are forwarded to `skos.store_skos_extentities`.
    """
    with io.open(fname, 'rb') as stream:
        extentities = skos.lcsv_extentities(stream, scheme_uri, ';', 'utf-8')
        skos.store_skos_extentities(
            cnx, store, extentities, SimpleImportLog(fname),
            raise_on_error=True, extid_as_cwuri=False, **kwargs)
117
118


119
def lcsv_check(cnx, store, fname, scheme_uri, separator=';', **kwargs):
    """Simply check data file consistency.

    `fname` is resolved against the `migration/data` directory next to this
    module. The file is first run through `lcsv.LCSV2RDF` and all its triples
    are consumed. Every line after the first is then checked to contain as many
    `separator` as the first one.

    Raise AssertionError on the first line whose number of separators differs.
    `cnx`, `store`, `scheme_uri` and extra keyword arguments are not used here.
    """
    sequence = count()

    def uri_generator(val):
        # prefix each value with a new number from the shared counter
        return str(next(sequence)) + val

    data_path = join(dirname(__file__), 'migration', 'data', fname)
    with io.open(data_path, 'rb') as stream:
        converter = lcsv.LCSV2RDF(stream, separator, 'utf-8',
                                  # XXX drop once skos is released
                                  uri_generator=uri_generator, uri_cls=str)
        # exhaust the triples: only the parsing matters, not the result
        for _ in converter.triples():
            pass

        # also check there are the expected number of separator for each line
        stream.seek(0)
        header = stream.readline().decode('utf-8')
        expected_separators = header.count(separator)
        # remaining lines are numbered from 2 since the first one has been read
        for linenum, raw_line in enumerate(stream, 2):
            found = raw_line.decode('utf-8').count(separator)
            if found == expected_separators:
                continue
            raise AssertionError('Got %s %s on line %s of %s, %s where expected'
                                 % (found, separator, linenum,
                                    fname, expected_separators))

143

144
def init_seda_scheme(cnx, title):
    """Create and return the ConceptScheme holding SEDA concepts for `title`.

    The scheme's description is u'edition 2009' when the title starts with
    'SEDA :', else None.

    Kept as a separate function so that it can be monkey-patched if one needs
    to customize the scheme creation (eg saem).
    """
    attributes = {'title': title, 'description': None}
    if title.startswith('SEDA :'):
        attributes['description'] = u'edition 2009'
    return cnx.create_entity('ConceptScheme', **attributes)


153
154
155
156
157
def get_store(cnx):
    """Return the store to be used to import LCSV data files.

    A MassiveObjectStore is returned when the system source's database driver
    is 'postgres', a NoHookRQLObjectStore otherwise.

    Separated function to be monkey-patched if one needs to customize the store
    (eg saem).
    """
    if cnx.repo.system_source.dbdriver != 'postgres':
        return NoHookRQLObjectStore(cnx)
    # imported only when the massive store is actually used
    from cubicweb.dataimport.massive_store import MassiveObjectStore
    return MassiveObjectStore(cnx, eids_seq_range=1000)


165
def import_seda_schemes(cnx, lcsv_import=lcsv_import, lcsv_files=LCSV_FILES):
    """Import all LCSV data files defined in LCSV_FILES.

    A scheme is created and populated only when no ConceptScheme with the same
    title exists yet; entries whose scheme is already there are skipped.

    :param cnx: connection used to look up and create entities
    :param lcsv_import: callable importing a single file, called as
      ``lcsv_import(cnx, store, fname, scheme_uri, extid2eid=extid2eid)``
    :param lcsv_files: iterable of ``(title, rtypes, etypes, fname)`` tuples,
      where ``rtypes`` and ``etypes`` are each either a single name or a tuple
      of names
    """
    # mapping from cwuri to eid of entities already in the database, shared by
    # every file import
    extid2eid = cwuri2eid(cnx, ('ConceptScheme', 'Label'))
    # concepts and external URIs may come from any source
    extid2eid.update(cwuri2eid(cnx, ('Concept', 'ExternalUri')))
    store = get_store(cnx)
    for title, rtypes, etypes, fname in lcsv_files:
        if cnx.find('ConceptScheme', title=title):
            # a scheme with this title already exists, nothing to import
            continue
        # print the text itself: encoding it first would display the bytes
        # repr (b'...' with escaped accents) on Python 3
        print('importing', title)
        scheme = init_seda_scheme(cnx, title)
        extid2eid[scheme.cwuri] = scheme.eid
        lcsv_import(cnx, store, fname, scheme.cwuri, extid2eid=extid2eid)
        # attach the scheme to each of its relation types...
        if not isinstance(rtypes, tuple):
            rtypes = (rtypes,)
        for rtype in rtypes:
            rtype_e = cnx.find('CWRType', name=rtype).one()
            scheme.cw_set(scheme_relation_type=rtype_e)
        # ... and to each of its entity types
        if not isinstance(etypes, tuple):
            etypes = (etypes,)
        for etype in etypes:
            etype_e = cnx.find('CWEType', name=etype).one()
            scheme.cw_set(scheme_entity_type=etype_e)
        store.flush()
    store.commit()
    store.finish()
    if not isinstance(store, NoHookRQLObjectStore):
        # when using the massive store, we need explicit reindexation
        reindex_entities(cnx.repo.schema, cnx, etypes=['Concept', 'ConceptScheme'])