Commit 36710be3 authored by Adrien Di Mascio's avatar Adrien Di Mascio
Browse files

provide a more robust implementation of html_unescape, considering any HTML entities

parent bca8695c467f
......@@ -20,6 +20,7 @@ import locale
import mimetypes
import re
from StringIO import StringIO
import htmlentitydefs
import chardet
......@@ -86,10 +87,10 @@ def html_escape(data):
# Matches one character reference: decimal (&#39;), hexadecimal (&#x27;)
# or named (&amp;). The terminating semicolon is required.
_ENTITY_RE = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')


def html_unescape(data):
    """Replace XML/HTML character references in `data` by their characters.

    Named entities (``&amp;``, ``&equiv;``, ...), decimal references
    (``&#39;``) and hexadecimal references (``&#x27;``) are all resolved.
    The string is scanned exactly once, so already-unescaped output is
    never unescaped a second time: ``&amp;lt;`` yields ``&lt;``, not ``<``.

    :param data: the string to unescape
    :return: the unescaped string; references that are unknown or that
      designate an invalid code point are left untouched
    """
    # Function-scope imports with a fallback keep the block usable under
    # both the Python 2 and Python 3 names of the entity table.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def _resolve(match):
        """Return the character for one matched reference."""
        ref = match.group(1)
        try:
            if ref[0] == '#':
                if ref[1] in 'xX':
                    codepoint = int(ref[2:], 16)
                else:
                    codepoint = int(ref[1:])
            elif ref == 'apos':
                # XML predefined entity, absent from the HTML 4 table
                codepoint = 39
            else:
                codepoint = name2codepoint[ref]
            return to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            # unknown name or out-of-range code point: keep the text as is
            return match.group(0)

    return _ENTITY_RE.sub(_resolve, data)
class TransformData(object):
"""wrapper around transformed data to add extra information such as MIME
# -*- coding: utf-8 -*-
from logilab.common.testlib import TestCase, unittest_main
import locale
......@@ -16,6 +17,16 @@ class HtmlEscapeTC(TestCase):
self.assertEquals(html_escape(data), expected)
def test_html_unescape(self):
    """Check html_unescape on plain text, the basic XML entities,
    a numeric reference and a named HTML entity."""
    for data, expected in [('toto', 'toto'),
                           ('r&amp;d', 'r&d'),
                           ('23&lt;12 &amp;&amp; 3&gt;2', '23<12 && 3>2'),
                           ('d&quot;h&quot;', 'd"h"'),
                           ('h&#39;', "h'"),
                           ('x &equiv; y', u"x \u2261 y"),
                           ]:
        self.assertEquals(html_unescape(data), expected)
class GuessEncodingTC(TestCase):
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment