Commit 921e768f authored by Sylvain Thénault
Browse files

When an encoding is found by the regexp, check that it is known to Python, to avoid bad matches

parent 296b57e55475
Change log for mtconverter
==========================
2009-07-21 -- 0.7.0
2009-07-21 -- 0.7.0
* new need_guess function
* new fallbackmimetype argument to guess_mimetype_and_encoding
......
......@@ -20,8 +20,9 @@ import locale
import mimetypes
import re
import string
from StringIO import StringIO
import htmlentitydefs
import codecs
from StringIO import StringIO
try:
import chardet
......@@ -66,7 +67,13 @@ def guess_encoding(buffer, fallbackencoding=None):
# try to get a character set declaration
m = CHARSET_DECL_RGX.search(buffer[:CHARSET_DECL_SEARCH_SIZE])
if m is not None:
return m.group(1)
guessed = m.group(1)
try:
# ensure encoding is known by python
codecs.lookup(guessed)
return guessed
except LookupError:
pass
if buffer.lstrip().startswith('<?xml'):
# xml files with no encoding declaration default to UTF-8
return 'UTF-8'
......
......@@ -81,6 +81,17 @@ class GuessEncodingTC(TestCase):
</html>'''
self.assertEquals(guess_encoding(data), 'latin1')
def test_bad_detection(self):
    """Check that a bogus charset match does not decide the encoding.

    The sample is Python source text rather than a document carrying a
    real charset declaration.  NOTE(review): presumably the
    ``encoding=None`` keyword argument is what the charset-declaration
    regexp picks up -- confirm against CHARSET_DECL_RGX.

    The expected result depends on whether the optional ``chardet``
    package can be imported: 'ascii' when it can, DEFAULT_ENCODING
    otherwise.
    """
    data = '''class SchemaViewer(object):
"""return an ureport layout for some part of a schema"""
def __init__(self, req=None, encoding=None):
'''
    # Only the import can raise ImportError, so keep the try body down to
    # the availability probe and make a single assertion afterwards.
    try:
        import chardet  # imported only to probe availability
    except ImportError:
        expected = DEFAULT_ENCODING
    else:
        # ascii detected by chardet
        expected = 'ascii'
    # assertEqual rather than the deprecated assertEquals alias
    self.assertEqual(guess_encoding(data), expected)
class GuessMimetymeAndEncodingTC(TestCase):
def test_base(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment