Commit 921e768f authored by Sylvain Thénault
Browse files

When an encoding is found by the regexp, check that it is known to Python, to avoid bad matches.

parent 296b57e55475
Change log for mtconverter
2009-07-21 -- 0.7.0
2009-07-21 -- 0.7.0
* new need_guess function
* new fallbackmimetype argument to guess_mimetype_and_encoding
......@@ -20,8 +20,9 @@ import locale
import mimetypes
import re
import string
from StringIO import StringIO
import htmlentitydefs
import codecs
from StringIO import StringIO
import chardet
......@@ -66,7 +67,13 @@ def guess_encoding(buffer, fallbackencoding=None):
# try to get a character set declaration
if m is not None:
guessed =
# ensure encoding is known by python
return guessed
except LookupError:
if buffer.lstrip().startswith('<?xml'):
# xml files with no encoding declaration default to UTF-8
return 'UTF-8'
......@@ -81,6 +81,17 @@ class GuessEncodingTC(TestCase):
self.assertEquals(guess_encoding(data), 'latin1')
def test_bad_detection(self):
data = '''class SchemaViewer(object):
"""return an ureport layout for some part of a schema"""
def __init__(self, req=None, encoding=None):
# ascii detected by chardet
import chardet
self.assertEquals(guess_encoding(data), 'ascii')
except ImportError:
self.assertEquals(guess_encoding(data), DEFAULT_ENCODING)
class GuessMimetymeAndEncodingTC(TestCase):
def test_base(self):
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment