Commit 0dbb1834 authored by Julien Jehannet's avatar Julien Jehannet
Browse files

(merge)

......@@ -11,3 +11,7 @@ acbfe6847e91104f3594b66693d447c99c351011 logilab-mtconverter-debian-version-0_4_
ad86d72f21d50104f511b0f2ed513a5912e15c31 logilab-mtconverter-version-0_6_0
ca07a7d744c52d37cb1873f5ae37c952061f4559 logilab-mtconverter-debian-version-0_6_0-1
e98b542dea5a975b08149667b9695c6abbe6c9f4 logilab-mtconverter-debian-version-0_6_0-2
7b93cad2780451bf0282b1af3c05e7180e0fe746 logilab-mtconverter-version-0_6_2
6d4f3b56b8646b2d7161f375242948e07359d3b2 logilab-mtconverter-debian-version-0_6_2-1
5b83d92d67d6076009fceb3448c762139071b22e logilab-mtconverter-version-0_6_3
2d45e594838d07d7e3854b69d8800cb0d91f289a logilab-mtconverter-debian-version-0_6_3-1
Change log for mtconverter
==========================
--
* fix for xml_escape called with unicode characters
2009-07-06 -- 0.6.2
* xml_escape / html_escape now escape all control characters (ascii code < 32)
2009-06-29 -- 0.6.1
* fix potential crash with wrong local setting
2009-02-12 -- 0.6.0
* xml_escape
* application/vnd.oasis.opendocument.text -> text/plain transformation
* html_unescape now unescapes any HTML entity
2008-08-06 -- 0.5.0
* application/xml -> text/plain transformation
* new fallbackencoding argument to guess_mimetype_and_encoding, given to
guess_encoding when necessary
2008-06-30 -- 0.4.0
* use a new pure python transformation to transform html into
formatted text. The code is based on http://www.aaronsw.com/2002/html2text/
developed by Aaron Swartz.
2008-01-14 -- 0.3.0
* unicode policy handling
* enhance text mimetype detection
* fix pygment's html transform to return only a html snippet, not a full
document, and avoid returning encoded string
* register_* function return True or False telling if optional transforms
are available
* restore python 2.3 compatibility
2007-12-11 -- 0.2.0
* guess_encoding test and fixes
* new guess_format_and_encoding utility method
* deal with binary encoding (eg raise TransformError when necessary)
* renamed ._transforms attribute of the engine into transforms since
it's useful to access it to check transform availability
* pygments based transforms
2007-10-23 -- 0.1.4
* various html fixes
2007-02-09 -- 0.1.3
* fix lynx transform encoding problem
* strip output of popen transforms
* fix text to html transform to escape html special chars
2006-10-27 -- 0.1.2
* POpenTransform encoding fixes
2006-10-13 -- 0.1.1
* fixed some name errors...
2006-10-10 -- 0.1.0
* initial revision, backported from PortalTransforms with some API changes
......@@ -19,6 +19,7 @@ from logilab.mtconverter.__pkginfo__ import version as __version__
import locale
import mimetypes
import re
import string
from StringIO import StringIO
import htmlentitydefs
......@@ -29,8 +30,11 @@ except ImportError:
chardet = None
mimetypes.encodings_map['.bz2'] = 'bzip2' # register bzip2 encoding
# Encoding used when nothing better can be guessed. The plain call may raise
# locale.Error when the locale settings are wrong; in that case ask again
# with do_setlocale=False instead of crashing at import time.
try:
    DEFAULT_ENCODING = locale.getpreferredencoding()
except locale.Error:
    DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)
BINARY_ENCODINGS = set(('gzip', 'bzip2', 'base64'))
TEXT_MIMETYPES = set(('application/xml', 'application/xhtml+xml'))
......@@ -44,8 +48,16 @@ CHARSET_DECL_SEARCH_SIZE = 1024
CHARDET_MIN_SIZE = 20
CHARDET_CONFIDENCE_THRESHOLD = 0.75
def need_guess(mimetype, encoding):
    """tell whether the given mimetype / encoding information is incomplete

    Return True when the mimetype is missing, or when the mimetype is a
    text one and no encoding is given; return False otherwise.
    """
    if mimetype and (encoding or not is_text_mimetype(mimetype)):
        return False
    return True
def is_text_mimetype(mimetype):
    """return True if `mimetype` starts with 'text/' or is one of
    TEXT_MIMETYPES
    """
    if mimetype.startswith('text/'):
        return True
    return mimetype in TEXT_MIMETYPES
def guess_encoding(buffer, fallbackencoding=None):
"""try to guess encoding from a buffer"""
......@@ -66,29 +78,47 @@ def guess_encoding(buffer, fallbackencoding=None):
return fallbackencoding or DEFAULT_ENCODING
def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
                                filename=None, fallbackencoding=None,
                                fallbackmimetype=u'application/octet-stream'):
    """complete the given mimetype / encoding information as far as possible

    :param format: mimetype if known; ignored when its subtype is one of
      BINARY_ENCODINGS so that a better value can be guessed
    :param encoding: encoding if known; never overridden when given
    :param data: content, handed to `guess_encoding` when the mimetype is a
      text one and the encoding is still unknown
    :param filename: file name, handed to `mimetypes.guess_type` when no
      usable mimetype was given
    :param fallbackencoding: passed through to `guess_encoding`
    :param fallbackmimetype: mimetype used when the file name tells nothing

    :return: a `(format, encoding)` tuple
    """
    if format and format.split('/')[-1] in BINARY_ENCODINGS:
        format = None # try to do better
    if filename and not format:
        format, enc = mimetypes.guess_type(filename)
        if format:
            # only trust the file name when the caller gave no encoding
            if not encoding:
                encoding = enc
        elif enc:
            # no mimetype but a known encoding (e.g. gzip): use it as subtype
            format = u'application/%s' % enc
        else:
            format = fallbackmimetype
    if not encoding and data and format and is_text_mimetype(format):
        encoding = guess_encoding(data, fallbackencoding)
    return format, encoding
# One entry per ASCII control character (codes 0 to 31).
CONTROL_CHARS = [chr(ci) for ci in range(32)]
# Replacement for each control character, indexed by character code;
# a space unless overridden below.
TR_CONTROL_CHARS = [' '] * len(CONTROL_CHARS)
# Newline, carriage return and tab are mapped to themselves.
for c in ('\n', '\r', '\t'):
    TR_CONTROL_CHARS[ord(c)] = c
# Form feed and vertical tab are both turned into a newline.
TR_CONTROL_CHARS[ord('\f')] = '\n'
TR_CONTROL_CHARS[ord('\v')] = '\n'
# Translation table applied by xml_escape to byte strings.
ESC_CAR_TABLE = string.maketrans(''.join(CONTROL_CHARS),
                                 ''.join(TR_CONTROL_CHARS))
# The same table as a unicode string (decoded as latin1), applied by
# xml_escape to unicode input.
ESC_UCAR_TABLE = unicode(ESC_CAR_TABLE, 'latin1')
# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
def html_escape(data):
    """escapes XML/HTML forbidden characters in attributes and PCDATA

    Kept for existing callers; simply delegates to `xml_escape`.
    """
    return xml_escape(data)

def xml_escape(data):
    """escapes XML forbidden characters in attributes and PCDATA

    Control characters (ascii code < 32) are first translated using
    ESC_UCAR_TABLE for unicode input and ESC_CAR_TABLE for byte strings,
    then the characters & < > " ' are replaced by entities.
    """
    if isinstance(data, unicode):
        table = ESC_UCAR_TABLE
    else:
        table = ESC_CAR_TABLE
    data = data.translate(table)
    # '&' has to go first, otherwise the ampersands introduced by the other
    # entities would be escaped a second time
    return (data.replace('&', '&amp;').replace('<', '&lt;')
            .replace('>', '&gt;').replace('"', '&quot;')
            .replace("'", '&#39;'))
def html_unescape(data):
"""unescapes XML/HTML entities"""
......@@ -111,7 +141,7 @@ class TransformData(object):
def get(self, attr, default=None):
"""get an optional data attribute"""
return getattr(self, attr, default)
def decode(self, force=False):
"""return the data as an unicode string"""
if isinstance(self.data, unicode):
......@@ -159,10 +189,10 @@ class TransformData(object):
self.data = base64.decodestring(self.data)
self.encoding = guess_encoding(self.data)
class MtConverterError(Exception):
    """common ancestor of the exceptions defined by this package"""
class MissingBinary(MtConverterError):
"""raised when a system binary on which a transform relies has not been found
"""
......
"""
Copyright (c) 2006-2008 LOGILAB S.A. (Paris, FRANCE).
Copyright (c) 2006-2009 LOGILAB S.A. (Paris, FRANCE).
http://www.logilab.fr/ -- mailto:contact@logilab.fr
mtconverter packaging information
......@@ -8,7 +8,7 @@ mtconverter packaging information
modname = "mtconverter"
distname = "logilab-mtconverter"
subpackage_of = 'logilab'
numversion = (0, 6, 0)
numversion = (0, 6, 3)
version = '.'.join([str(num) for num in numversion])
license = 'GPL'
......
logilab-mtconverter (0.6.3-1) unstable; urgency=low
* new upstream release
-- Sylvain Thénault <sylvain.thenault@logilab.fr> Mon, 06 Jul 2009 17:56:44 +0200
logilab-mtconverter (0.6.2-1) unstable; urgency=low
* new upstream release
-- Sylvain Thénault <sylvain.thenault@logilab.fr> Mon, 06 Jul 2009 12:07:01 +0200
logilab-mtconverter (0.6.1-1) unstable; urgency=low
* new upstream release
-- Sylvain Thénault <sylvain.thenault@logilab.fr> Mon, 29 Jun 2009 13:34:22 +0200
logilab-mtconverter (0.6.0-2) unstable; urgency=low
* switch to python-support
* update Standard-Version to 3.8.1
* patch setup.py to make sure test utils are not used when invoked from debian/rules
-- Sylvain Thénault <sylvain.thenault@logilab.fr> Wed, 25 Mar 2009 09:55:21 +0100
logilab-mtconverter (0.6.0-1) DISTRIBUTION; urgency=low
......
......@@ -5,9 +5,18 @@ import locale
from StringIO import StringIO
from logilab.mtconverter import *
SPECIAL_CHARS = {
'\f' : '\n',
'\b' : ' ',
'\n' : '\n',
'\r' : '\r',
'\r\n' : '\r\n',
'\t' : '\t',
'\v' : '\n',
}
class HtmlEscapeTC(TestCase):
def test_escape(self):
for data, expected in [('toto', 'toto'),
('r&d', 'r&amp;d'),
......@@ -15,7 +24,26 @@ class HtmlEscapeTC(TestCase):
('d"h"', 'd&quot;h&quot;'),
("h'", 'h&#39;'),
]:
self.assertEquals(html_escape(data), expected)
yield self.assertEquals, xml_escape(data), expected
def test_escape_special_chars(self):
for car, trcar in SPECIAL_CHARS.items():
yield self.assertEquals, xml_escape(car), trcar
for carnum in xrange(32):
car = chr(carnum)
if car in SPECIAL_CHARS:
continue
yield self.assertEquals, xml_escape(car), ' '
yield self.assertEquals, xml_escape(u'é'), u'é'
def test_escape_special_chars_unicode(self):
for car, trcar in SPECIAL_CHARS.items():
yield self.assertEquals, xml_escape(unicode(car)), trcar
for carnum in xrange(32):
car = chr(carnum)
if car in SPECIAL_CHARS:
continue
yield self.assertEquals, xml_escape(unicode(car)), ' '
def test_html_unescape(self):
for data, expected in [('toto', 'toto'),
......@@ -25,24 +53,24 @@ class HtmlEscapeTC(TestCase):
('h&#39;', "h'"),
('x &equiv; y', u"x \u2261 y"),
]:
self.assertEquals(html_unescape(data), expected)
yield self.assertEquals, html_unescape(data), expected
class GuessEncodingTC(TestCase):
def test_emacs_style_declaration(self):
data = '''# -*- coding: latin1 -*-'''
self.assertEquals(guess_encoding(data), 'latin1')
def test_emacs_style_declaration_stringIO(self):
data = '''# -*- coding: latin1 -*-'''
self.assertEquals(guess_encoding(StringIO(data)), 'latin1')
def test_xml_style_declaration(self):
data = '''<?xml version="1.0" encoding="latin1"?>
<root/>'''
self.assertEquals(guess_encoding(data), 'latin1')
def test_html_style_declaration(self):
data = '''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:erudi="http://www.logilab.fr/" xml:lang="fr" lang="fr">
<head>
......@@ -89,7 +117,7 @@ class GuessMimetymeAndEncodingTC(TestCase):
self.assertEquals(format, u'application/octet-stream')
self.assertEquals(encoding, None)
class TransformDataTC(TestCase):
def test_autodetect_encoding_if_necessary(self):
data = TransformData('''<?xml version="1.0" encoding="latin1"?>
......
......@@ -13,14 +13,14 @@
"""some basic transformations (pure python)
:organization: Logilab
:copyright: 2006-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
:copyright: 2006-2009 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
"""
__docformat__ = "restructuredtext en"
import re
from logilab.mtconverter import html_escape
from logilab.mtconverter import xml_escape
from logilab.mtconverter.transform import Transform
class IdentityTransform(Transform):
......@@ -37,7 +37,7 @@ class text_to_text(IdentityTransform):
class rest_to_text(Transform):
inputs = ('text/rest', 'text/x-rst')
output = 'text/plain'
def _convert(self, trdata):
res = []
for line in trdata.data.splitlines():
......@@ -45,7 +45,7 @@ class rest_to_text(Transform):
if sline.startswith('.. '):
continue
res.append(line)
return '\n'.join(res)
return '\n'.join(res)
_TAG_PROG = re.compile(r'</?.*?>', re.U)
......@@ -56,7 +56,7 @@ class xml_to_text(Transform):
def _convert(self, trdata):
return _TAG_PROG.sub(' ', trdata.data)
class text_to_html(Transform):
inputs = ('text/plain',)
output = 'text/html'
......@@ -69,10 +69,10 @@ class text_to_html(Transform):
if not res[-1].endswith('<p>'):
res.append('</p><p>')
else:
res.append(html_escape(line))
res.append(xml_escape(line))
res.append('</p>')
return '\n'.join(res)
return '\n'.join(res)
class text_to_html_pre(Transform):
"""variant for text 2 html transformation : simply wrap text into pre tags
......@@ -82,7 +82,7 @@ class text_to_html_pre(Transform):
def _convert(self, trdata):
    """return the escaped input text on its own line(s) between a <pre>
    line and a </pre> line
    """
    # the text is escaped so that markup characters it contains are shown
    # literally instead of being interpreted inside the <pre> element
    return '\n'.join(['<pre>', xml_escape(trdata.data), '</pre>'])
......@@ -92,4 +92,4 @@ class xlog_to_html(Transform):
output = 'text/html'
def _convert(self, trdata):
    """escape each line of the input and terminate it with a <BR/> tag"""
    return '\n'.join([xml_escape(line) + '<BR/>'
                      for line in trdata.data.splitlines()])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment