Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
open-source
logilab-mtconverter
Commits
56e26715c09b
Commit
e1ceca58
authored
Aug 02, 2014
by
Rémi Cardona
Browse files
[py3k] deal with unicode/str mess
Related to #268148.
parent
e3c63740aca5
Changes
2
Hide whitespace changes
Inline
Side-by-side
__init__.py
View file @
56e26715
...
...
@@ -34,11 +34,14 @@ from logilab.mtconverter.__pkginfo__ import version as __version__
import
locale
import
mimetypes
import
re
import
string
try
:
maketrans
=
bytes
.
maketrans
except
AttributeError
:
from
string
import
maketrans
import
codecs
from
io
import
BytesIO
from
six
import
text_type
,
binary_type
from
six
import
text_type
,
binary_type
,
int2byte
,
unichr
from
six.moves.html_entities
import
name2codepoint
try
:
...
...
@@ -59,8 +62,8 @@ TEXT_MIMETYPES = set(('application/xml', 'application/xhtml+xml'))
UNICODE_POLICY
=
'strict'
CHARSET_DECL_RGX
=
re
.
compile
(
'(?:charset|(?:(?:en)?coding))[=:\s"
\'
]*([^\s"
\'
]*)'
,
re
.
I
|
re
.
S
|
re
.
U
)
_
CHARSET_DECL_RGX
=
'(?:charset|(?:(?:en)?coding))[=:\s"
\'
]*([^\s"
\'
]*)'
.
encode
(
'ascii'
)
CHARSET_DECL_RGX
=
re
.
compile
(
_CHARSET_DECL_RGX
,
re
.
I
|
re
.
S
)
CHARSET_DECL_SEARCH_SIZE
=
512
CHARDET_MIN_SIZE
=
20
...
...
@@ -84,14 +87,14 @@ def guess_encoding(buffer, fallbackencoding=None):
# try to get a character set declaration
m
=
CHARSET_DECL_RGX
.
search
(
buffer
[:
CHARSET_DECL_SEARCH_SIZE
])
if
m
is
not
None
:
guessed
=
m
.
group
(
1
)
guessed
=
m
.
group
(
1
)
.
decode
(
'ascii'
)
try
:
# ensure encoding is known by python
codecs
.
lookup
(
guessed
)
return
guessed
except
LookupError
:
pass
if
buffer
.
lstrip
().
startswith
(
'<?xml'
):
if
buffer
.
lstrip
().
startswith
(
'<?xml'
.
encode
(
'ascii'
)
):
# xml files with no encoding declaration default to UTF-8
return
'UTF-8'
# use text analysis if enough data
...
...
@@ -120,15 +123,16 @@ def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
return
format
,
encoding
CONTROL_CHARS
=
[
chr
(
ci
)
for
ci
in
range
(
32
)]
CONTROL_CHARS
=
[
int2byte
(
ci
)
for
ci
in
range
(
32
)]
TR_CONTROL_CHARS
=
[
' '
]
*
len
(
CONTROL_CHARS
)
for
c
in
(
'
\n
'
,
'
\r
'
,
'
\t
'
):
TR_CONTROL_CHARS
[
ord
(
c
)]
=
c
TR_CONTROL_CHARS
[
ord
(
'
\f
'
)]
=
'
\n
'
TR_CONTROL_CHARS
[
ord
(
'
\v
'
)]
=
'
\n
'
ESC_CAR_TABLE
=
string
.
maketrans
(
''
.
join
(
CONTROL_CHARS
),
''
.
join
(
TR_CONTROL_CHARS
))
ESC_UCAR_TABLE
=
unicode
(
ESC_CAR_TABLE
,
'latin1'
)
TR_CONTROL_CHARS
=
[
c
.
encode
(
'ascii'
)
for
c
in
TR_CONTROL_CHARS
]
ESC_CAR_TABLE
=
maketrans
(
''
.
encode
(
'ascii'
).
join
(
CONTROL_CHARS
),
''
.
encode
(
'ascii'
).
join
(
TR_CONTROL_CHARS
))
ESC_UCAR_TABLE
=
ESC_CAR_TABLE
.
decode
(
'latin1'
)
# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
...
...
test/unittest_utils.py
View file @
56e26715
...
...
@@ -17,6 +17,7 @@
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
from
logilab.common.testlib
import
TestCase
,
unittest_main
from
six
import
u
from
six.moves
import
range
import
locale
...
...
@@ -56,12 +57,12 @@ class HtmlEscapeTC(TestCase):
def
test_escape_special_chars_unicode
(
self
):
for
car
,
trcar
in
SPECIAL_CHARS
.
items
():
yield
self
.
assertEqual
,
xml_escape
(
u
nicode
(
car
)),
trcar
yield
self
.
assertEqual
,
xml_escape
(
u
(
car
)),
trcar
for
carnum
in
range
(
32
):
car
=
chr
(
carnum
)
if
car
in
SPECIAL_CHARS
:
continue
yield
self
.
assertEqual
,
xml_escape
(
u
nicode
(
car
)),
' '
yield
self
.
assertEqual
,
xml_escape
(
u
(
car
)),
' '
def
test_html_unescape
(
self
):
for
data
,
expected
in
[(
'toto'
,
'toto'
),
...
...
@@ -113,43 +114,43 @@ class GuessEncodingTC(TestCase):
class
GuessMimetymeAndEncodingTC
(
TestCase
):
def
test_base
(
self
):
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt"
,
data
=
"xxx"
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt"
,
data
=
b
"xxx"
)
self
.
assertEqual
(
format
,
u
'text/plain'
)
self
.
assertEqual
(
encoding
,
locale
.
getpreferredencoding
())
def
test_set_mime_and_encoding_gz_file
(
self
):
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.gz"
,
data
=
"xxx"
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.gz"
,
data
=
b
"xxx"
)
self
.
assertEqual
(
format
,
u
'text/plain'
)
self
.
assertEqual
(
encoding
,
u
'gzip'
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.gz"
,
data
=
"xxx"
,
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.gz"
,
data
=
b
"xxx"
,
format
=
'application/gzip'
)
self
.
assertEqual
(
format
,
u
'text/plain'
)
self
.
assertEqual
(
encoding
,
u
'gzip'
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.gz"
,
data
=
"xxx"
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.gz"
,
data
=
b
"xxx"
)
self
.
assertEqual
(
format
,
u
'application/gzip'
)
self
.
assertEqual
(
encoding
,
None
)
def
test_set_mime_and_encoding_bz2_file
(
self
):
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.bz2"
,
data
=
"xxx"
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.bz2"
,
data
=
b
"xxx"
)
self
.
assertEqual
(
format
,
u
'text/plain'
)
self
.
assertEqual
(
encoding
,
u
'bzip2'
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.bz2"
,
data
=
"xxx"
,
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.txt.bz2"
,
data
=
b
"xxx"
,
format
=
'application/bzip2'
)
self
.
assertEqual
(
format
,
u
'text/plain'
)
self
.
assertEqual
(
encoding
,
u
'bzip2'
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.bz2"
,
data
=
"xxx"
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.bz2"
,
data
=
b
"xxx"
)
self
.
assertEqual
(
format
,
u
'application/bzip2'
)
self
.
assertEqual
(
encoding
,
None
)
def
test_set_mime_and_encoding_unknwon_ext
(
self
):
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.789"
,
data
=
"xxx"
)
format
,
encoding
=
guess_mimetype_and_encoding
(
filename
=
u
"foo.789"
,
data
=
b
"xxx"
)
self
.
assertEqual
(
format
,
u
'application/octet-stream'
)
self
.
assertEqual
(
encoding
,
None
)
class
TransformDataTC
(
TestCase
):
def
test_autodetect_encoding_if_necessary
(
self
):
data
=
TransformData
(
'''<?xml version="1.0" encoding="latin1"?>
data
=
TransformData
(
b
'''<?xml version="1.0" encoding="latin1"?>
<root/>'''
,
'text/xml'
)
self
.
assertEqual
(
data
.
encoding
,
'latin1'
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment