Commit 3e10bcd2 authored by Sylvain Thénault
Browse files

escape more control characters

parent b900389e1eee
Change log for mtconverter
==========================
--
* xml_escape / html_escape now neutralise all control characters (ASCII code < 32): \n, \r and \t are kept, \f and \v become \n, all others become a space
2009-06-29 -- 0.6.1
* fix potential crash with wrong local setting
2009-02-12 -- 0.6.0
* xml_escape
* application/vnd.oasis.opendocument.text -> text/plain transformation
......
......@@ -19,6 +19,7 @@ from logilab.mtconverter.__pkginfo__ import version as __version__
import locale
import mimetypes
import re
import string
from StringIO import StringIO
import htmlentitydefs
......@@ -85,14 +86,26 @@ def guess_mimetype_and_encoding(format=None, encoding=None, data=None,
encoding = guess_encoding(data, fallbackencoding)
return format, encoding
# Translation table neutralising the ASCII control characters (code < 32):
# '\n', '\r' and '\t' are kept as is, '\f' and '\v' become '\n', and every
# other control character becomes a space.
CONTROL_CHARS = [chr(ci) for ci in range(32)]
TR_CONTROL_CHARS = [' '] * len(CONTROL_CHARS)
for c in ('\n', '\r', '\t'):
    TR_CONTROL_CHARS[ord(c)] = c
TR_CONTROL_CHARS[ord('\f')] = '\n'
TR_CONTROL_CHARS[ord('\v')] = '\n'
# 256-entry table, i.e. the layout string.maketrans() produces on Python 2.
# It is assembled by hand because string.maketrans() no longer exists on
# Python 3; codes 32..255 map to themselves.
ESC_CAR_TABLE = ''.join(TR_CONTROL_CHARS + [chr(ci) for ci in range(32, 256)])
# XXX deprecate at some point (once less used :)
#@obsolete('use xml_escape')
def html_escape(data):
    """escapes XML/HTML forbidden characters in attributes and PCDATA

    Kept for backward compatibility: this simply delegates to `xml_escape`,
    so both functions treat control characters the same way.

    :param data: the string to escape
    :return: the escaped string
    """
    return xml_escape(data)


def xml_escape(data):
    """escapes XML forbidden characters in attributes and PCDATA

    Control characters are first mapped through the module-level
    ESC_CAR_TABLE, then the five markup-significant characters are replaced
    by entity / character references.

    :param data: the string to escape
    :return: the escaped string
    """
    # NOTE(review): on Python 2, unicode.translate() may reject a byte-string
    # table such as ESC_CAR_TABLE -- confirm the behaviour for unicode input.
    data = data.translate(ESC_CAR_TABLE)
    # '&' has to be replaced first, otherwise the '&' of the references added
    # by the later replacements would be escaped a second time
    return (data.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            .replace('"', '&quot;').replace("'", '&#39;'))
def html_unescape(data):
"""unescapes XML/HTML entities"""
......
......@@ -5,9 +5,18 @@ import locale
from StringIO import StringIO
from logilab.mtconverter import *
# Inputs with an explicitly listed expected xml_escape() output
# (input -> expected output); see test_escape_special_chars.
SPECIAL_CHARS = {
    '\f': '\n',
    '\b': ' ',
    '\n': '\n',
    '\r': '\r',
    '\r\n': '\r\n',
    '\t': '\t',
    '\v': '\n',
}
class HtmlEscapeTC(TestCase):
def test_escape(self):
for data, expected in [('toto', 'toto'),
('r&d', 'r&amp;d'),
......@@ -15,7 +24,16 @@ class HtmlEscapeTC(TestCase):
('d"h"', 'd&quot;h&quot;'),
("h'", 'h&#39;'),
]:
self.assertEquals(html_escape(data), expected)
yield self.assertEquals, xml_escape(data), expected
def test_escape_special_chars(self):
    """check the xml_escape output for every ASCII control character"""
    # characters which have an explicitly listed translation
    for source, wanted in SPECIAL_CHARS.items():
        yield self.assertEquals, xml_escape(source), wanted
    # every remaining control character is expected to become a space
    for code in xrange(32):
        char = chr(code)
        if char not in SPECIAL_CHARS:
            yield self.assertEquals, xml_escape(char), ' '
def test_html_unescape(self):
for data, expected in [('toto', 'toto'),
......@@ -25,24 +43,24 @@ class HtmlEscapeTC(TestCase):
('h&#39;', "h'"),
('x &equiv; y', u"x \u2261 y"),
]:
self.assertEquals(html_unescape(data), expected)
yield self.assertEquals, html_unescape(data), expected
class GuessEncodingTC(TestCase):
def test_emacs_style_declaration(self):
    """an emacs-style coding declaration gives the encoding"""
    detected = guess_encoding('''# -*- coding: latin1 -*-''')
    self.assertEquals(detected, 'latin1')
def test_emacs_style_declaration_stringIO(self):
    """same check as above, with the text wrapped in a StringIO"""
    stream = StringIO('''# -*- coding: latin1 -*-''')
    detected = guess_encoding(stream)
    self.assertEquals(detected, 'latin1')
def test_xml_style_declaration(self):
    """the encoding pseudo-attribute of an XML declaration is used"""
    document = '''<?xml version="1.0" encoding="latin1"?>
<root/>'''
    detected = guess_encoding(document)
    self.assertEquals(detected, 'latin1')
def test_html_style_declaration(self):
data = '''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:erudi="http://www.logilab.fr/" xml:lang="fr" lang="fr">
<head>
......@@ -89,7 +107,7 @@ class GuessMimetymeAndEncodingTC(TestCase):
self.assertEquals(format, u'application/octet-stream')
self.assertEquals(encoding, None)
class TransformDataTC(TestCase):
def test_autodetect_encoding_if_necessary(self):
data = TransformData('''<?xml version="1.0" encoding="latin1"?>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment