# copyright 2006-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-mtconverter.
#
# logilab-mtconverter is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 2.1 of the License, or (at your
# option) any later version.
#
# logilab-mtconverter is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
# for more details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
"""Mime type conversion package.

  2006-2012 `LOGILAB S.A. <http://www.logilab.fr>`_ (Paris, FRANCE),
  all rights reserved.

  http://www.logilab.org/project/logilab-mtconverter --
  mailto:python-projects@logilab.org

  `Lesser General Public License version 2`
"""
Rémi Cardona's avatar
Rémi Cardona committed
28

Nsukami Patrick's avatar
Nsukami Patrick committed
29
from _io import BytesIO
"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
30

31
32
import locale
import mimetypes
"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
33
import re
Laurent Peuch's avatar
Laurent Peuch committed
34

35
import codecs
Nsukami Patrick's avatar
Nsukami Patrick committed
36
37
38
from io import BytesIO  # noqa: F811
from typing import Optional, Callable, Any, Tuple
from types import ModuleType
"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
39

40
41
from logilab.common import deprecation

Philippe Pepiot's avatar
Philippe Pepiot committed
42
from html.entities import name2codepoint
43

44
import pkg_resources
Laurent Peuch's avatar
Laurent Peuch committed
45

Nsukami Patrick's avatar
Nsukami Patrick committed
46
47
48
49
50
# Module-level alias of the builtin ``bytes.maketrans``.
maketrans: Callable[[bytes, bytes], bytes] = bytes.maketrans

# Version of the installed "logilab-mtconverter" distribution, looked up
# through pkg_resources when this module is imported.
__version__: str = pkg_resources.get_distribution("logilab-mtconverter").version

# Markup format used by the docstrings of this module.
__docformat__: str = "restructuredtext en"
51

52
53
54
55
# chardet is an optional dependency: when it cannot be imported the name is
# bound to None, and code using it must test ``chardet is not None`` first.
try:
    import chardet
except ImportError:
    # chardet unavailable
    # Name 'chardet' already defined (by an import)
    chardet: Optional[ModuleType] = None  # type: ignore[no-redef]
58

59
60
61
62
# Encoding used as the last resort when guessing; taken from the locale.
# If the plain call raises ``locale.Error``, ask again with
# ``do_setlocale=False``.
try:
    DEFAULT_ENCODING = locale.getpreferredencoding()
except locale.Error:
    DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)
"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
63

Laurent Peuch's avatar
Laurent Peuch committed
64
# Encodings that mark a payload as binary (compressed or base64 armoured).
BINARY_ENCODINGS = {"gzip", "bzip2", "base64"}

# Mime types outside of ``text/*`` that are nevertheless treated as text.
TEXT_MIMETYPES = {"application/xml", "application/xhtml+xml"}

# ``errors`` argument given to ``bytes.decode`` when decoding data.
UNICODE_POLICY = "strict"

# Matches a ``charset=...``, ``coding: ...`` or ``encoding="..."`` declaration
# in a bytes buffer; group 1 is the declared encoding name.
_CHARSET_DECL_RGX = rb"""(?:charset|(?:(?:en)?coding))[=:\s"']*([^\s"']*)"""
CHARSET_DECL_RGX = re.compile(_CHARSET_DECL_RGX, re.I | re.S)
# Only this many leading bytes are searched for a charset declaration.
CHARSET_DECL_SEARCH_SIZE = 512

# chardet is only consulted for buffers longer than this many bytes ...
CHARDET_MIN_SIZE = 20
# ... and its answer is only trusted at or above this confidence.
CHARDET_CONFIDENCE_THRESHOLD = 0.75

Laurent Peuch's avatar
Laurent Peuch committed
79

Nsukami Patrick's avatar
Nsukami Patrick committed
80
def need_guess(mimetype: str, encoding: str) -> bool:
    """Return True if the given mimetype / encoding information can be completed.

    That is the case when no mime type is known at all, or when the mime
    type is a text one but no encoding is known yet.

    :param mimetype: known mime type; any false value counts as unknown
    :param encoding: known encoding; any false value counts as unknown
    """
    if not mimetype:
        return True
    # An encoding is only worth guessing for text content.
    return bool(not encoding and is_text_mimetype(mimetype))

Laurent Peuch's avatar
Laurent Peuch committed
88

Nsukami Patrick's avatar
Nsukami Patrick committed
89
def is_text_mimetype(mimetype: str) -> bool:
    """Return True if `mimetype` denotes textual content.

    Every ``text/*`` type qualifies, as does any type listed in
    ``TEXT_MIMETYPES``.
    """
    return mimetype.startswith("text/") or mimetype in TEXT_MIMETYPES

92

Nsukami Patrick's avatar
Nsukami Patrick committed
93
def guess_encoding(buffer: BytesIO, fallbackencoding: Optional[Any] = None) -> str:
    """Try to guess the encoding of a buffer.

    The following sources are tried in order:

    1. a charset declaration found in the first
       ``CHARSET_DECL_SEARCH_SIZE`` bytes, provided python knows that codec;
    2. ``UTF-8`` when the content starts with an XML prolog;
    3. chardet's detection, when chardet is importable, the buffer is longer
       than ``CHARDET_MIN_SIZE`` and the confidence reaches
       ``CHARDET_CONFIDENCE_THRESHOLD``;
    4. `fallbackencoding`, or ``DEFAULT_ENCODING`` when it is not given.

    :param buffer: bytes, or an object exposing ``getvalue()`` returning them
    :param fallbackencoding: encoding returned when nothing could be guessed
    :return: the name of the guessed encoding
    """
    if hasattr(buffer, "getvalue"):  # file-like object such as BytesIO
        buffer = buffer.getvalue()
    # try to get a character set declaration
    m = CHARSET_DECL_RGX.search(buffer[:CHARSET_DECL_SEARCH_SIZE])
    if m is not None:
        try:
            # The regex accepts arbitrary bytes, so decoding may fail with
            # UnicodeDecodeError (a ValueError); a bogus declaration must
            # not abort the guess, just be ignored.
            guessed = m.group(1).decode("ascii")
            # ensure encoding is known by python
            codecs.lookup(guessed)
        except (LookupError, ValueError):
            pass
        else:
            return guessed
    if buffer.lstrip().startswith(b"<?xml"):
        # xml files with no encoding declaration default to UTF-8
        return "UTF-8"
    # use text analysis if enough data
    if chardet is not None and len(buffer) > CHARDET_MIN_SIZE:
        detected = chardet.detect(buffer)
        encoding = detected.get("encoding")
        # Treat a missing or None confidence as "no confidence".
        confidence = detected.get("confidence") or 0.0
        if encoding and confidence >= CHARDET_CONFIDENCE_THRESHOLD:
            return encoding
    return fallbackencoding or DEFAULT_ENCODING

Laurent Peuch's avatar
Laurent Peuch committed
117
118

def guess_mimetype_and_encoding(
    format: Optional[str] = None,
    encoding: Optional[Any] = None,
    data: Optional[bytes] = None,
    filename: Optional[str] = None,
    fallbackencoding: Optional[Any] = None,
    fallbackmimetype: str = "application/octet-stream",
) -> Tuple[Optional[str], Optional[str]]:
    """Complete mime type / encoding information from what is available.

    :param format: known mime type, if any; a type whose subtype is one of
      ``BINARY_ENCODINGS`` is discarded so that a better one can be guessed
    :param encoding: known encoding, if any
    :param data: content, used to guess the encoding of text mime types
    :param filename: file name, used to guess the mime type when `format`
      is unknown
    :param fallbackencoding: passed on to ``guess_encoding``
    :param fallbackmimetype: mime type used when `filename` is given but
      nothing can be derived from it
    :return: a ``(mimetype, encoding)`` tuple; the mime type stays ``None``
      when neither a usable `format` nor a `filename` is given
    """
    if format and format.split("/")[-1] in BINARY_ENCODINGS:
        format = None  # try to do better
    if filename and not format:
        format, enc = mimetypes.guess_type(filename)
        if format:
            # never override an encoding explicitly given by the caller
            if not encoding:
                encoding = enc
        elif enc:
            # only the encoding is known (e.g. "gzip"): expose it as a type
            format = "application/%s" % enc
        else:
            format = fallbackmimetype
    if not encoding and data and format and is_text_mimetype(format):
        encoding = guess_encoding(data, fallbackencoding)
    return format, encoding
140

141

Philippe Pepiot's avatar
Philippe Pepiot committed
142
# Translation tables used by xml_escape(): the 32 ASCII control characters
# are replaced by a space, except "\n", "\r" and "\t" which are kept, and
# form feed / vertical tab which become a newline.
CONTROL_CHARS = [bytes((ci,)) for ci in range(32)]
_TR_CONTROL_CHARS = [" "] * len(CONTROL_CHARS)
for c in ("\n", "\r", "\t"):
    _TR_CONTROL_CHARS[ord(c)] = c
_TR_CONTROL_CHARS[ord("\f")] = "\n"
_TR_CONTROL_CHARS[ord("\v")] = "\n"
TR_CONTROL_CHARS = [c.encode("ascii") for c in _TR_CONTROL_CHARS]
# 256-byte table for ``bytes.translate`` ...
ESC_CAR_TABLE = bytes.maketrans(b"".join(CONTROL_CHARS), b"".join(TR_CONTROL_CHARS))
# ... and its 256-character twin for ``str.translate``; characters beyond the
# table are left untouched by ``str.translate``.
ESC_UCAR_TABLE = ESC_CAR_TABLE.decode("latin1")


def xml_escape(data: str) -> str:
    """Escape XML forbidden characters in attributes and PCDATA.

    Control characters are first neutralised through the tables above, then
    ``&``, ``<``, ``>``, ``"`` and ``'`` are replaced by entity references.

    :param data: text to escape; a bytes value is accepted as well and
      yields bytes
    :return: the escaped value, of the same type as `data`
    """
    if isinstance(data, str):
        data = data.translate(ESC_UCAR_TABLE)
        # "&" must be handled first so the entities added below stay intact
        return (
            data.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&#39;")
        )
    # bytes input: bytes.replace() rejects str arguments, so use bytes
    # literals all the way through
    data = data.translate(ESC_CAR_TABLE)
    return (
        data.replace(b"&", b"&amp;")
        .replace(b"<", b"&lt;")
        .replace(b">", b"&gt;")
        .replace(b'"', b"&quot;")
        .replace(b"'", b"&#39;")
    )

169

170
171
172
# Former name of xml_escape, kept available through logilab.common's
# ``deprecation.renamed`` helper (presumably warns on use -- confirm against
# logilab.common).
html_escape = deprecation.renamed("html_escape", xml_escape)


Nsukami Patrick's avatar
Nsukami Patrick committed
173
# A named entity reference (``&name;``) or the ``&#39;`` numeric reference.
_ENTITY_REF_RGX = re.compile(r"&(#39|[A-Za-z][A-Za-z0-9]*);")


def html_unescape(data: str) -> str:
    """Unescape XML/HTML entities.

    Named entities known to ``html.entities.name2codepoint`` and ``&#39;``
    are replaced by the character they stand for; anything else is left
    untouched.  The text is processed in a single pass so that the result of
    one substitution is never unescaped again (``&amp;lt;`` gives ``&lt;``,
    not ``<``).

    :param data: text possibly containing entity references
    :return: the text with known entity references replaced
    """

    def _substitute(match):
        ref = match.group(1)
        if ref == "#39":
            return "'"
        codepoint = name2codepoint.get(ref)
        if codepoint is None:
            # unknown entity name: keep the reference as written
            return match.group(0)
        return chr(codepoint)

    return _ENTITY_REF_RGX.sub(_substitute, data)

"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
179

180
class TransformData:
    """Wrapper around transformed data, adding extra information such as the
    MIME type and, when it applies, the encoding.
    """

    def __init__(
        self, data: str, mimetype: str, encoding: Optional[str] = None, **kwargs: Any
    ) -> None:
        """Store `data` along with its `mimetype` and `encoding`.

        Extra keyword arguments become instance attributes; the explicit
        attributes set afterwards win on a name clash.  When no encoding is
        given for non-binary, non-``str`` data, it is guessed from the data.
        """
        self.__dict__.update(kwargs)
        self.data = data
        self.mimetype = mimetype
        self.encoding = encoding
        if not self.is_binary() and not encoding and not isinstance(self.data, str):
            self.encoding = guess_encoding(data)

    def get(self, attr: str, default: Optional[Any] = None) -> Optional[Any]:
        """Return the optional data attribute `attr`, or `default` if unset."""
        return getattr(self, attr, default)

    def decode(self, force: bool = False) -> str:
        """Return the data as a unicode string.

        :param force: when true, data whose encoding is one of
          ``BINARY_ENCODINGS`` is first unpacked with ``binary_decode`` and
          no binary check is done; otherwise binary data is refused
        :raise Exception: if the data is binary and `force` is false
        """
        if isinstance(self.data, str):
            return self.data
        if force:
            if self.encoding in BINARY_ENCODINGS:
                self.binary_decode()
        elif self.is_binary():
            raise Exception(
                "can't decode binary stream (mime type: %s, encoding: %s)"
                % (self.mimetype, self.encoding)
            )
        encoding = self.encoding or guess_encoding(self.data)
        return self.data.decode(encoding, UNICODE_POLICY)

    def encode(self, encoding: Optional[Any] = None) -> bytes:
        """Return the data as an encoded (bytes) string.

        Bytes data is returned as is when no encoding is requested or when
        it already is in the requested one.  Otherwise the data is decoded
        and encoded again using `encoding`, the data's own encoding or
        ``utf8``, in that order of preference.
        """
        if (encoding is None or self.encoding == encoding) and isinstance(
            self.data, bytes
        ):
            return self.data
        encoding = encoding or self.encoding or "utf8"
        return self.decode().encode(encoding)

    def is_binary(self) -> bool:
        """Return True if the mime type is not a text one or if the encoding
        is one of ``BINARY_ENCODINGS``.
        """
        return not is_text_mimetype(self.mimetype) or self.encoding in BINARY_ENCODINGS

    def check_encoding(self) -> None:
        """Raise ``TransformError`` if a text mime type comes with a binary
        encoding.
        """
        if is_text_mimetype(self.mimetype) and self.is_binary():
            raise TransformError()

    @staticmethod
    def _decompress(encoding: str, data: bytes) -> bytes:
        """Return `data` unpacked according to the binary `encoding`.

        :raise ValueError: if `encoding` is not gzip, bzip2 or base64
        """
        if encoding == "gzip":
            import gzip

            # the context manager closes the stream once it has been read
            with gzip.GzipFile(fileobj=BytesIO(data)) as stream:
                return stream.read()
        if encoding == "bzip2":
            import bz2

            # bz2.decompress() wants the bytes themselves, not a file object
            return bz2.decompress(data)
        if encoding == "base64":
            import base64

            # base64.decodestring() was removed in Python 3.9
            return base64.decodebytes(data)
        raise ValueError("unsupported binary encoding: %r" % (encoding,))

    def binary_decode(self) -> None:
        """Unpack gzip, bzip2 or base64 encoded data in place.

        ``self.data`` is expected to hold bytes.  It is replaced by the
        unpacked payload and ``self.encoding`` by the encoding guessed from
        that payload.  Nothing happens for any other encoding.
        """
        if self.encoding not in ("gzip", "bzip2", "base64"):
            return
        self.data = self._decompress(self.encoding, self.data)
        self.encoding = guess_encoding(self.data)

251

"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
252
253
class MtConverterError(Exception):
    """Root of the exception hierarchy defined by this package."""
254

Laurent Peuch's avatar
Laurent Peuch committed
255

256
class MissingBinary(MtConverterError):
    """Raised when a system binary on which a transform relies has not been
    found.
    """
Laurent Peuch's avatar
Laurent Peuch committed
259
260


261
class TransformError(MtConverterError):
    """Raised when something can't be transformed due to missing necessary
    transforms.
    """


Nsukami Patrick's avatar
Nsukami Patrick committed
267
268
269
def register_pil_transforms(
    engine: Any, verb: bool = True
) -> bool:  # FIXME: engine: TransformEngine
    """Register the PIL based transforms on `engine`.

    :param engine: object whose ``add_transform`` method receives one
      instance of each class in ``piltransforms.transform_classes``
    :param verb: when true, print a notice if the transforms can't be
      imported
    :return: True if the transforms were registered, False if importing
      them failed
    """
    try:
        from logilab.mtconverter.transforms import piltransforms
    except ImportError:
        # pil not available, do nothing
        if verb:
            print("PIL isn't available, image transforms won't be available")
        return False
    for trclass in piltransforms.transform_classes:
        engine.add_transform(trclass())
    return True
"Sylvain ext:(%22)'s avatar
"Sylvain ext:(%22) committed
281

282

Nsukami Patrick's avatar
Nsukami Patrick committed
283
284
285
def register_pygments_transforms(
    engine: Any, verb: bool = True
) -> bool:  # FIXME: engine: TransformEngine
    """Register the pygments based transforms on `engine`.

    :param engine: object whose ``add_transform`` method receives one
      instance of each class in ``pygmentstransforms.transform_classes``
    :param verb: when true, print a notice if the transforms can't be
      imported
    :return: True if the transforms were registered, False if importing
      them failed
    """
    try:
        from logilab.mtconverter.transforms import pygmentstransforms
    except ImportError:
        # pygments not available, do nothing
        if verb:
            print("PYGMENTS isn't available, transforms won't be available")
        return False
    for trclass in pygmentstransforms.transform_classes:
        engine.add_transform(trclass())
    return True
297
298


Nsukami Patrick's avatar
Nsukami Patrick committed
299
300
301
def register_base_transforms(
    engine: Any, verb: bool = True
) -> bool:  # FIXME: engine: TransformEngine
    """Register the transforms shipped with this package on `engine`.

    The pure python transforms are always added.  Each class in
    ``cmdtransforms.transform_classes`` is then tried in turn; one raising
    ``MissingBinary`` on instantiation or registration is skipped.

    :param engine: object whose ``add_transform`` method receives the
      transform instances
    :param verb: when true, print the ``MissingBinary`` errors met
    :return: always True
    """
    from logilab.mtconverter.transforms import (
        cmdtransforms,
        text_to_text,
        xml_to_text,
        text_to_html,
        xlog_to_html,
    )
    from logilab.mtconverter.transforms.python import python_to_html
    from logilab.mtconverter.transforms.htmltransform import html_to_formatted_text
    from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text
    from logilab.mtconverter.transforms.pgpsignature import pgpsignature_to_text

    engine.add_transform(text_to_text())
    engine.add_transform(xml_to_text())
    engine.add_transform(text_to_html())
    engine.add_transform(xlog_to_html())
    engine.add_transform(python_to_html())
    engine.add_transform(html_to_formatted_text())
    engine.add_transform(odt_to_unformatted_text())
    engine.add_transform(pgpsignature_to_text())
    for trclass in cmdtransforms.transform_classes:
        try:
            engine.add_transform(trclass())
        except MissingBinary as ex:
            # the external program is not installed: skip this transform
            if verb:
                print(ex)
    return True