Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
open-source
logilab-mtconverter
Commits
0756a18e6ac2
Commit
3e10bcd2
authored
Jul 06, 2009
by
Sylvain Thénault
Browse files
escape more control characters
parent
b900389e1eee
Changes
3
Hide whitespace changes
Inline
Side-by-side
ChangeLog
View file @
0756a18e
Change log for mtconverter
==========================
--
* xml_escape / html_escape now escape all control characters (ascii code < 32)
2009-06-29 -- 0.6.1
* fix potential crash with wrong locale setting
2009-02-12 -- 0.6.0
* xml_escape
* application/vnd.oasis.opendocument.text -> text/plain transformation
...
...
__init__.py
View file @
0756a18e
...
...
@@ -19,6 +19,7 @@ from logilab.mtconverter.__pkginfo__ import version as __version__
import
locale
import
mimetypes
import
re
import
string
from
StringIO
import
StringIO
import
htmlentitydefs
...
...
@@ -85,14 +86,26 @@ def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
encoding
=
guess_encoding
(
data
,
fallbackencoding
)
return
format
,
encoding
# Translation table mapping every ASCII control character (code < 32) to a
# replacement character.
CONTROL_CHARS = [chr(ci) for ci in range(32)]
# by default a control character is replaced with a single space
TR_CONTROL_CHARS = [' '] * len(CONTROL_CHARS)
# newline, carriage return and tab are kept unchanged
for c in ('\n', '\r', '\t'):
    TR_CONTROL_CHARS[ord(c)] = c
# form feed and vertical tab are turned into plain newlines
TR_CONTROL_CHARS[ord('\f')] = '\n'
TR_CONTROL_CHARS[ord('\v')] = '\n'
try:
    _maketrans = string.maketrans
except AttributeError:
    # the string-module function is gone in Python 3; str.maketrans builds an
    # equivalent table for str.translate
    _maketrans = str.maketrans
ESC_CAR_TABLE = _maketrans(''.join(CONTROL_CHARS),
                           ''.join(TR_CONTROL_CHARS))
# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
def html_escape(data):
    """escapes XML/HTML forbidden characters in attributes and PCDATA

    Backward-compatible alias: the work is entirely delegated to
    `xml_escape`, whose result is returned unchanged.
    """
    return xml_escape(data)
def xml_escape(data):
    """escapes XML forbidden characters in attributes and PCDATA

    Control characters are first translated through ESC_CAR_TABLE, then the
    characters & < > " ' are replaced by the matching entity / character
    reference.
    """
    data = data.translate(ESC_CAR_TABLE)
    # '&' has to be replaced first, otherwise the ampersands introduced by
    # the following replacements would be escaped a second time
    return (data.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            .replace('"', '&quot;').replace("'", '&#39;'))
def
html_unescape
(
data
):
"""unescapes XML/HTML entities"""
...
...
test/unittest_utils.py
View file @
0756a18e
...
...
@@ -5,9 +5,18 @@ import locale
from
StringIO
import
StringIO
from
logilab.mtconverter
import
*
# input string -> output expected from xml_escape, for the inputs that are
# not simply replaced with a space
SPECIAL_CHARS = {
    '\f': '\n',
    '\b': ' ',
    '\n': '\n',
    '\r': '\r',
    '\r\n': '\r\n',
    '\t': '\t',
    '\v': '\n',
}
class
HtmlEscapeTC
(
TestCase
):
def
test_escape
(
self
):
for
data
,
expected
in
[(
'toto'
,
'toto'
),
(
'r&d'
,
'r&d'
),
...
...
@@ -15,7 +24,16 @@ class HtmlEscapeTC(TestCase):
(
'd"h"'
,
'd"h"'
),
(
"h'"
,
'h''
),
]:
self
.
assertEquals
(
html_escape
(
data
),
expected
)
yield
self
.
assertEquals
,
xml_escape
(
data
),
expected
def test_escape_special_chars(self):
    """check how xml_escape handles characters whose code is below 32"""
    # NOTE(review): the yielded (callable, args...) tuples are presumably
    # collected and run by the TestCase base class -- confirm against the
    # test framework imported at the top of this file
    for car, trcar in SPECIAL_CHARS.items():
        yield self.assertEquals, xml_escape(car), trcar
    # every control character not listed in SPECIAL_CHARS must come out as
    # a single space
    for carnum in xrange(32):
        car = chr(carnum)
        if car in SPECIAL_CHARS:
            continue
        yield self.assertEquals, xml_escape(car), ' '
def
test_html_unescape
(
self
):
for
data
,
expected
in
[(
'toto'
,
'toto'
),
...
...
@@ -25,24 +43,24 @@ class HtmlEscapeTC(TestCase):
(
'h''
,
"h'"
),
(
'x ≡ y'
,
u
"x
\u2261
y"
),
]:
self
.
assertEquals
(
html_unescape
(
data
),
expected
)
yield
self
.
assertEquals
,
html_unescape
(
data
),
expected
class
GuessEncodingTC
(
TestCase
):
def
test_emacs_style_declaration
(
self
):
data
=
'''# -*- coding: latin1 -*-'''
self
.
assertEquals
(
guess_encoding
(
data
),
'latin1'
)
def
test_emacs_style_declaration_stringIO
(
self
):
data
=
'''# -*- coding: latin1 -*-'''
self
.
assertEquals
(
guess_encoding
(
StringIO
(
data
)),
'latin1'
)
def
test_xml_style_declaration
(
self
):
data
=
'''<?xml version="1.0" encoding="latin1"?>
<root/>'''
self
.
assertEquals
(
guess_encoding
(
data
),
'latin1'
)
def
test_html_style_declaration
(
self
):
data
=
'''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:erudi="http://www.logilab.fr/" xml:lang="fr" lang="fr">
<head>
...
...
@@ -89,7 +107,7 @@ class GuessMimetymeAndEncodingTC(TestCase):
self
.
assertEquals
(
format
,
u
'application/octet-stream'
)
self
.
assertEquals
(
encoding
,
None
)
class
TransformDataTC
(
TestCase
):
def
test_autodetect_encoding_if_necessary
(
self
):
data
=
TransformData
(
'''<?xml version="1.0" encoding="latin1"?>
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment