Commit e1ceca58 authored by Rémi Cardona
Browse files

[py3k] deal with unicode/str mess

Related to #268148.
parent e3c63740aca5
......@@ -34,11 +34,14 @@ from logilab.mtconverter.__pkginfo__ import version as __version__
import locale
import mimetypes
import re
import string
try:
maketrans = bytes.maketrans
except AttributeError:
from string import maketrans
import codecs
from io import BytesIO
from six import text_type, binary_type
from six import text_type, binary_type, int2byte, unichr
from six.moves.html_entities import name2codepoint
try:
......@@ -59,8 +62,8 @@ TEXT_MIMETYPES = set(('application/xml', 'application/xhtml+xml'))
UNICODE_POLICY = 'strict'
CHARSET_DECL_RGX = re.compile('(?:charset|(?:(?:en)?coding))[=:\s"\']*([^\s"\']*)',
re.I | re.S | re.U)
_CHARSET_DECL_RGX = '(?:charset|(?:(?:en)?coding))[=:\s"\']*([^\s"\']*)'.encode('ascii')
CHARSET_DECL_RGX = re.compile(_CHARSET_DECL_RGX, re.I | re.S)
CHARSET_DECL_SEARCH_SIZE = 512
CHARDET_MIN_SIZE = 20
......@@ -84,14 +87,14 @@ def guess_encoding(buffer, fallbackencoding=None):
# try to get a character set declaration
m = CHARSET_DECL_RGX.search(buffer[:CHARSET_DECL_SEARCH_SIZE])
if m is not None:
guessed = m.group(1)
guessed = m.group(1).decode('ascii')
try:
# ensure encoding is known by python
codecs.lookup(guessed)
return guessed
except LookupError:
pass
if buffer.lstrip().startswith('<?xml'):
if buffer.lstrip().startswith('<?xml'.encode('ascii')):
# xml files with no encoding declaration default to UTF-8
return 'UTF-8'
# use text analysis if enough data
......@@ -120,15 +123,16 @@ def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
return format, encoding
CONTROL_CHARS = [chr(ci) for ci in range(32)]
CONTROL_CHARS = [int2byte(ci) for ci in range(32)]
TR_CONTROL_CHARS = [' '] * len(CONTROL_CHARS)
for c in ('\n', '\r', '\t'):
TR_CONTROL_CHARS[ord(c)] = c
TR_CONTROL_CHARS[ord('\f')] = '\n'
TR_CONTROL_CHARS[ord('\v')] = '\n'
ESC_CAR_TABLE = string.maketrans(''.join(CONTROL_CHARS),
''.join(TR_CONTROL_CHARS))
ESC_UCAR_TABLE = unicode(ESC_CAR_TABLE, 'latin1')
TR_CONTROL_CHARS = [c.encode('ascii') for c in TR_CONTROL_CHARS]
ESC_CAR_TABLE = maketrans(''.encode('ascii').join(CONTROL_CHARS),
''.encode('ascii').join(TR_CONTROL_CHARS))
ESC_UCAR_TABLE = ESC_CAR_TABLE.decode('latin1')
# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
......
......@@ -17,6 +17,7 @@
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
from logilab.common.testlib import TestCase, unittest_main
from six import u
from six.moves import range
import locale
......@@ -56,12 +57,12 @@ class HtmlEscapeTC(TestCase):
def test_escape_special_chars_unicode(self):
for car, trcar in SPECIAL_CHARS.items():
yield self.assertEqual, xml_escape(unicode(car)), trcar
yield self.assertEqual, xml_escape(u(car)), trcar
for carnum in range(32):
car = chr(carnum)
if car in SPECIAL_CHARS:
continue
yield self.assertEqual, xml_escape(unicode(car)), ' '
yield self.assertEqual, xml_escape(u(car)), ' '
def test_html_unescape(self):
for data, expected in [('toto', 'toto'),
......@@ -113,43 +114,43 @@ class GuessEncodingTC(TestCase):
class GuessMimetymeAndEncodingTC(TestCase):
def test_base(self):
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt", data="xxx")
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt", data=b"xxx")
self.assertEqual(format, u'text/plain')
self.assertEqual(encoding, locale.getpreferredencoding())
def test_set_mime_and_encoding_gz_file(self):
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.gz", data="xxx")
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.gz", data=b"xxx")
self.assertEqual(format, u'text/plain')
self.assertEqual(encoding, u'gzip')
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.gz", data="xxx",
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.gz", data=b"xxx",
format='application/gzip')
self.assertEqual(format, u'text/plain')
self.assertEqual(encoding, u'gzip')
format, encoding = guess_mimetype_and_encoding(filename=u"foo.gz", data="xxx")
format, encoding = guess_mimetype_and_encoding(filename=u"foo.gz", data=b"xxx")
self.assertEqual(format, u'application/gzip')
self.assertEqual(encoding, None)
def test_set_mime_and_encoding_bz2_file(self):
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.bz2", data="xxx")
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.bz2", data=b"xxx")
self.assertEqual(format, u'text/plain')
self.assertEqual(encoding, u'bzip2')
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.bz2", data="xxx",
format, encoding = guess_mimetype_and_encoding(filename=u"foo.txt.bz2", data=b"xxx",
format='application/bzip2')
self.assertEqual(format, u'text/plain')
self.assertEqual(encoding, u'bzip2')
format, encoding = guess_mimetype_and_encoding(filename=u"foo.bz2", data="xxx")
format, encoding = guess_mimetype_and_encoding(filename=u"foo.bz2", data=b"xxx")
self.assertEqual(format, u'application/bzip2')
self.assertEqual(encoding, None)
def test_set_mime_and_encoding_unknwon_ext(self):
format, encoding = guess_mimetype_and_encoding(filename=u"foo.789", data="xxx")
format, encoding = guess_mimetype_and_encoding(filename=u"foo.789", data=b"xxx")
self.assertEqual(format, u'application/octet-stream')
self.assertEqual(encoding, None)
class TransformDataTC(TestCase):
def test_autodetect_encoding_if_necessary(self):
data = TransformData('''<?xml version="1.0" encoding="latin1"?>
data = TransformData(b'''<?xml version="1.0" encoding="latin1"?>
<root/>''', 'text/xml')
self.assertEqual(data.encoding, 'latin1')
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment