Commit 67e69da5 authored by "Sylvain ext:(%22)'s avatar "Sylvain ext:(%22)
Browse files

new mtconverter library

parents
"""mime type conversion package"""
import re
DEFAULT_ENCODING = 'ISO-8859-1'
# raw strings keep the regexp escapes away from the string literal parser, the
# patterns themselves are unchanged
XML_ENCODING_RGX = re.compile(r'<\?xml version=[^\s]*\s*encoding=([^\s]*)\s*\?>', re.I | re.S | re.U)
CHARSET_RGX = re.compile(r'charset=([^\s"]*)', re.I | re.S | re.U)


def guess_encoding(buffer):
    """try to guess encoding from a buffer

    The following sources are tried in turn:

    1. a ``charset=...`` declaration found anywhere in the buffer
    2. the encoding given by an xml declaration starting the buffer (leading
       white spaces are ignored), 'UTF-8' if the declaration doesn't specify
       any
    3. `DEFAULT_ENCODING`

    :param buffer: the string to inspect
    :return: the name of the guessed encoding
    """
    # try to get charset declaration
    # FIXME: should we check it's html before ?
    m = CHARSET_RGX.search(buffer)
    if m is not None:
        return m.group(1)
    # check for xml encoding declaration
    stripped = buffer.lstrip()
    if stripped.startswith('<?xml'):
        # match the stripped buffer: `match` is anchored at the beginning of
        # the string and so would miss a declaration preceded by white spaces
        m = XML_ENCODING_RGX.match(stripped)
        if m is not None:
            # remove the quotes surrounding the encoding name
            return m.group(1)[1:-1]
        # xml files with no encoding declaration default to UTF-8
        return 'UTF-8'
    return DEFAULT_ENCODING
def html_escape(data):
    """return `data` with the characters forbidden in XML/HTML attributes
    and PCDATA replaced by the matching entity
    """
    # '&' has to come first, else the ampersand of the entities inserted by
    # the following replacements would be escaped as well
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                         ('"', '&quot;'), ("'", '&apos;')):
        data = data.replace(char, entity)
    return data
class TransformData(object):
    """container for data going through transforms, carrying the MIME type
    of the data, its encoding when it makes sense, and any extra information
    given as keyword arguments
    """

    def __init__(self, data, mimetype, encoding=None, **kwargs):
        # extra infos are set first, hence the three regular attributes below
        # always win over a same-named extra info
        vars(self).update(kwargs)
        self.data = data
        self.mimetype = mimetype
        self.encoding = encoding

    def get(self, attr, default=None):
        """return the value of the attribute `attr`, or `default` when this
        object has no such attribute
        """
        return getattr(self, attr, default)

    def decode(self):
        """return the data as an unicode string, using the declared encoding
        or a guessed one when none has been declared

        An `Exception` is raised for non unicode data with a non text MIME
        type.
        """
        data = self.data
        if isinstance(data, unicode):
            return data
        if self.is_binary():
            raise Exception("can't decode binary stream")
        encoding = self.encoding or guess_encoding(data)
        return data.decode(encoding)

    def encode(self, encoding='UTF8'):
        """return the data as a string encoded using `encoding`"""
        if self.encoding != encoding or not isinstance(self.data, str):
            # not in the expected form yet: go through unicode
            return self.decode().encode(encoding)
        return self.data

    def is_binary(self):
        """return true unless the MIME type is a 'text/' one"""
        is_text = self.mimetype.startswith('text/')
        return not is_text
class MtConverterError(Exception):
    """base class for this package's errors"""


# the two exceptions below derive from the package's base class so that
# catching `MtConverterError` actually catches every error of the package;
# they are still `Exception` subclasses, existing handlers are not affected

class MissingBinary(MtConverterError):
    """raised when a system binary on which a transform relies has not been
    found
    """


class TransformError(MtConverterError):
    """raised when something can't be transformed due to missing necessary
    transforms
    """
def register_pil_transforms(engine, verb=True):
    """register the image transforms of the `piltransforms` module on the
    given engine

    Nothing is registered when that module can't be imported.

    :param engine: object whose `add_transform` method is called with an
      instance of each transform class
    :param verb: when true, print a notice if the transforms can't be
      imported
    """
    try:
        from logilab.mtconverter.transforms import piltransforms
    except ImportError:
        # pil not available, do nothing
        if verb:
            # a single parenthesized argument prints the same text whether
            # `print` is a statement or a function
            print("PIL isn't available, image transforms won't be available")
    else:
        for trclass in piltransforms.transform_classes:
            engine.add_transform(trclass())
def register_base_transforms(engine, verb=True):
from logilab.mtconverter.transforms import cmdtransforms, text_to_text, text_to_html
from logilab.mtconverter.transforms.python import python_to_html
engine.add_transform(text_to_text())
engine.add_transform(text_to_html())
engine.add_transform(python_to_html())
for trclass in cmdtransforms.transform_classes:
try:
engine.add_transform(trclass())
except MissingBinary, ex:
if verb:
print ex
"""
Copyright (c) 2006 LOGILAB S.A. (Paris, FRANCE).
http://www.logilab.fr/ -- mailto:contact@logilab.fr
mtconverter packaging information
"""
# name and version of the distribution
modname = "mtconverter"
numversion = (0, 1, 0)
# dotted form of `numversion`
version = '.'.join(map(str, numversion))

# legal information
license = 'GPL'
copyright = '''Copyright (c) 2003-2006 LOGILAB S.A. (Paris, FRANCE).
http://www.logilab.fr/ -- mailto:contact@logilab.fr'''

# authoring information
author = "Sylvain Thenault"
author_email = "devel@logilab.fr"

# descriptions and project home page
short_desc = "a library to convert from a MIME type to another"
long_desc = """This package originally a backport of Zope's PortalTransforms tool with
all Zope's internal removed (e.g. most of the code).
"""
web = "http://www.logilab.org/projects/mtconverter"
Source: mtconverter
Section: python
Priority: optional
Maintainer: Sylvain Thenault <sylvain.thenault@logilab.fr>
Build-Depends: debhelper (>= 5.0.37.1), python-dev (>=2.3.5-7), python (>=2.3.5-7), python-central
XS-Python-Version: all
Standards-Version: 3.7.2
Package: python-mtconverter
Architecture: all
Depends: ${python:Depends}
Suggests: python-imaging, pdftotext, lynx
XB-Python-Version: ${python:Versions}
Description: a library to convert from a MIME type to another
This package was originally a backport of Zope's PortalTransforms tool with
all Zope's internals removed (e.g. most of the code).
.
homepage:o
"""the transformation engine"""
from mtconverter import TransformError
def split_mimetype(mimetype):
    """return the (main type, sub type) tuple for a 'main/sub' mime type

    A `TransformError` is raised unless the mime type is made of exactly two
    non empty parts separated by a '/'.
    """
    parts = mimetype.split('/')
    if len(parts) != 2 or not (parts[0] and parts[1]):
        raise TransformError('bad mime type %s' % mimetype)
    return parts[0], parts[1]
class TransformEngine(object):
    """mimetype oriented conversions engine

    Registered transforms are indexed by input mime type, then by output mime
    type:

    * `_mtmap` for fully specified input mime types (e.g. 'text/html')
    * `_mtmainmap` for wildcard input mime types (e.g. 'text/*'), indexed by
      main type
    * `_transforms` gives a transform from its name
    """

    def __init__(self):
        self._mtmap = {}
        self._mtmainmap = {}
        self._transforms = {}

    def add_transform(self, transform):
        """register a new transform

        :raise TransformError:
          if the transform has no input or no output, or if one of its mime
          types is invalid
        """
        self._map_transform(transform)

    def remove_transform(self, name):
        """unregister a transform

        name is the name of a registered transform

        :raise KeyError: if there is no transform registered with this name
        """
        self._unmap_transform(self._transforms[name])

    def has_input(self, mimetype):
        """return True if the engine has a transformation taking the given
        mimetype as input
        """
        if mimetype in self._mtmap:
            return True
        # no exact match, look for a wildcard transform on the main type
        return split_mimetype(mimetype)[0] in self._mtmainmap

    def convert(self, trdata, targetmimetype):
        """convert the given data structure into the given mime type

        :param trdata: `TransformData`
        :param targetmimetype: the mime type to convert the data to
        :rtype: `TransformData`
        :raise TransformError:
          if there is no way to go from the data's mime type to the target
          mime type
        """
        # get a path to output mime type
        #
        # even if target mime type is the same as input mime type, try
        # to find a path in case an identity transform is available
        path = self._find_path(trdata.mimetype, targetmimetype)
        if not path:
            if trdata.mimetype == targetmimetype:
                return trdata
            raise TransformError('no transformation path from %s to %s'
                                 % (trdata.mimetype, targetmimetype))
        # apply the transforms of the path one after the other, each one
        # being given the result of the previous one (this relies on `convert`
        # of a transform returning a `TransformData`, as this method does)
        for transform in path:
            trdata = transform.convert(trdata)
        return trdata

    def _map_transform(self, transform):
        """map transform to internal structures"""
        if not (transform.inputs and transform.output):
            raise TransformError('transform is missing input or output')
        if split_mimetype(transform.output)[1] == '*':
            raise TransformError('bad output mime type, wildcard only allowed in inputs')
        # check every input before touching the maps, so that a bad mime type
        # can't leave the transform half registered
        inputs = [(mt, split_mimetype(mt)) for mt in transform.inputs]
        for mt, (main, sub) in inputs:
            if sub == '*':
                inmap = self._mtmainmap.setdefault(main, {})
            else:
                inmap = self._mtmap.setdefault(mt, {})
            inmap.setdefault(transform.output, []).append(transform)
        self._transforms[transform.name] = transform

    def _unmap_transform(self, transform):
        """unmap transform from internal structures"""
        for mt in transform.inputs:
            main, sub = split_mimetype(mt)
            if sub == '*':
                registry, key = self._mtmainmap, main
            else:
                registry, key = self._mtmap, mt
            inmap = registry[key]
            candidates = inmap[transform.output]
            candidates.remove(transform)
            # drop containers becoming empty, else `has_input` would still
            # return True for a mime type no transform takes as input anymore
            if not candidates:
                del inmap[transform.output]
            if not inmap:
                del registry[key]
        del self._transforms[transform.name]

    def _find_path(self, orig, target, required_transforms=()):
        """return the shortest path for transformation from orig mimetype to
        target mimetype, or None if there is no path shorter than 100
        transforms

        :param required_transforms:
          names of the transforms which have to be part of the path
        """
        # naive algorithm :
        #  find all possible paths with required transforms
        #  take the shortest
        #
        # it should be enough since we should not have so much possible paths
        # and I wouldn't like to get a 1000 transformations path
        shortest, winner = 100, None
        for path in self._get_paths(orig, target, required_transforms):
            if len(path) < shortest:
                winner = path
                shortest = len(path)
        return winner

    def _get_paths(self, orig, target, requirements, path=None, result=None):
        """return all the paths for transformation from orig mimetype to
        target mimetype

        `path` and `result` are only given by recursive calls: they hold the
        path being explored and the list collecting the complete paths.
        """
        if path is None:
            result = []
            path = []
            # work on a copy, the list is modified during the search
            requirements = list(requirements)
        # get main type, and check mime type at the same time
        main = split_mimetype(orig)[0]
        # search most specific first
        outputs = self._mtmap.get(orig)
        if outputs is not None:
            self._search_outputs(outputs, target, requirements, path, result)
        # then search generic wildcard transforms
        outputs = self._mtmainmap.get(main)
        if outputs is not None:
            self._search_outputs(outputs, target, requirements, path, result)
        # we are done
        return result

    def _search_outputs(self, outputs, target, requirements, path, result):
        """try each transform of `outputs` as the next step of `path`, adding
        a copy of every path reaching `target` with no requirement left to
        `result`
        """
        # slot for the transform tried at this depth
        path.append(None)
        for outputmimetype, transforms in outputs.items():
            for transform in transforms:
                if transform in path:
                    # avoid infinite loop...
                    # (checked before touching `requirements` so that nothing
                    # has to be restored when skipping the transform)
                    continue
                name = transform.name
                required = name in requirements
                if required:
                    requirements.remove(name)
                path[-1] = transform
                if outputmimetype == target:
                    if not requirements:
                        result.append(path[:])
                else:
                    self._get_paths(outputmimetype, target, requirements, path, result)
                if required:
                    # other branches still have to satisfy this requirement
                    requirements.append(name)
        path.pop()
from logilab.common.testlib import TestCase, unittest_main
import urllib
import re
from logilab.mtconverter import TransformData, TransformError
from logilab.mtconverter.transforms import text_to_text
from logilab.mtconverter.transform import Transform, TransformsChain
from logilab.mtconverter.engine import TransformEngine
class HtmlToText(Transform):
    """test transform from 'text/html' to 'text/plain' removing the tags"""
    inputs = ('text/html',)
    output = 'text/plain'

    def __call__(self, orig):
        # remove everything looking like a tag
        without_tags = re.sub('<[^>]*>(?i)(?m)', '', orig)
        # replace each run of newlines by a single one
        squeezed = re.sub('\n+', '\n', without_tags)
        return urllib.unquote(squeezed).strip()

    def _convert(self, data):
        # the string to convert is the `data` attribute of the given object
        return self(data.data)
class HtmlToTextWithEncoding(HtmlToText):
    """same as `HtmlToText`, but declaring 'utf8' as `output_encoding`"""
    output_encoding = 'utf8'
class FooToBar(Transform):
    """test transform from any 'text/*' mime type to 'text/bar', replacing
    'foo' by 'bar'
    """
    inputs = ('text/*',)
    output = 'text/bar'

    def __call__(self, orig):
        replaced = re.sub('foo', 'bar', orig)
        # replace each run of newlines by a single one
        squeezed = re.sub('\n+', '\n', replaced)
        return urllib.unquote(squeezed).strip()

    def _convert(self, data):
        # the string to convert is the `data` attribute of the given object
        return self(data.data)
class HtmlIdTransform(Transform):
    """test transform from 'text/html' to 'text/html', marking the data it
    has converted by appending ' transformed' to it
    """
    inputs = ('text/html',)
    output = 'text/html'

    def _convert(self, data):
        # the string to convert is the `data` attribute of the given object
        return data.data + ' transformed'
class TransformNoIO(Transform):
    """transform declaring neither inputs nor output by itself;
    `EngineTC.test_register_fail` expects its registration to be refused
    """
    pass
class BadTransformNoInput(Transform):
    """invalid transform: empty inputs"""
    inputs = ()
    output = 'text/plain'
class BadTransformBadInput1(Transform):
    """invalid transform: input mime type made of three parts"""
    inputs = ('text/bla/bla',)
    output = 'text/plain'
class BadTransformBadInput2(Transform):
    """invalid transform: input mime type with an empty sub type"""
    inputs = ('text/',)
    output = 'text/plain'
class BadTransformBadOutput1(Transform):
    """invalid transform: output mime type made of three parts"""
    inputs = ('text/plain',)
    output = 'text/bla/bla'
class BadTransformBadOutput2(Transform):
    """invalid transform: output mime type with an empty sub type"""
    inputs = ('text/plain',)
    output = 'text/'
class BadTransformWildcardOutput(Transform):
    """invalid transform: wildcard used as output sub type"""
    inputs = ('text/plain',)
    output = 'text/*'
def html_data():
    """return a new `TransformData` for the '<b>foo</b>' string, with
    'text/html' as mime type and 'ascii' as encoding
    """
    return TransformData('<b>foo</b>', 'text/html', 'ascii')
class EngineTC(TestCase):
    """tests for `TransformEngine`: transforms registration, inputs lookup
    and conversions
    """

    def setUp(self):
        # each test starts with a new engine, without any transform
        self.engine = TransformEngine()

    def register(self):
        #A default set of transforms to prove the interfaces work
        self.engine.add_transform(HtmlToText())
        self.engine.add_transform(FooToBar())

    def test_register_fail(self):
        # registering a transform with missing or malformed inputs / output
        # must raise TransformError
        register = self.engine.add_transform
        self.assertRaises(TransformError, register, TransformNoIO())
        self.assertRaises(TransformError, register, BadTransformNoInput())
        self.assertRaises(TransformError, register, BadTransformBadInput1())
        self.assertRaises(TransformError, register, BadTransformBadInput2())
        self.assertRaises(TransformError, register, BadTransformWildcardOutput())
        self.assertRaises(TransformError, register, BadTransformBadOutput1())
        self.assertRaises(TransformError, register, BadTransformBadOutput2())

    def test_has_input(self):
        self.register()
        # exact input of HtmlToText
        self.failUnless(self.engine.has_input('text/html'))
        # only covered by the 'text/*' wildcard input of FooToBar
        self.failUnless(self.engine.has_input('text/plain'))
        self.failUnless(self.engine.has_input('text/whatever'))
        # no registered transform takes an 'application' mime type
        self.failIf(self.engine.has_input('application/octet-stream'))

    def test_convert(self):
        self.register()
        self.engine.add_transform(text_to_text())
        # data and encoding are expected to be left untouched when going
        # from 'text/x-diff' to 'text/plain'
        data = TransformData("This is a test", 'text/x-diff', 'ascii')
        out = self.engine.convert(data, 'text/plain')
        self.failUnlessEqual(out.data, "This is a test")
        self.failUnlessEqual(out.mimetype, 'text/plain')
        self.failUnlessEqual(out.encoding, 'ascii')
        # html_to_text transform should take priority over text_to_text
        data = self.engine.convert(html_data(), "text/plain")
        self.failUnlessEqual(data.data, "foo")
        self.failUnlessEqual(data.mimetype, 'text/plain')
        self.failUnlessEqual(data.encoding, 'ascii')
        # replace HtmlToText by the variant declaring an output encoding: the
        # converted data is expected to have this encoding
        self.engine.remove_transform('HtmlToText')
        self.engine.remove_transform('FooToBar')
        self.engine.add_transform(HtmlToTextWithEncoding())
        data = self.engine.convert(html_data(), "text/plain")
        self.failUnlessEqual(data.mimetype, 'text/plain')
        self.failUnlessEqual(data.encoding, 'utf8')
        # tags are still there in the expected result: only FooToBar has
        # been applied, not HtmlToTextWithEncoding followed by FooToBar
        self.engine.add_transform(FooToBar())
        data = self.engine.convert(html_data(), 'text/bar')
        self.failUnlessEqual(data.data, "<b>bar</b>")

    def test_chain(self):
        # the only registered transform is an explicit chain made of
        # HtmlToText then FooToBar
        #self.register()
        hb = TransformsChain('hbar')
        hb.append(HtmlToText())
        hb.append(FooToBar())
        self.engine.add_transform(hb)
        cache = self.engine.convert(html_data(), 'text/bar')
        self.failUnlessEqual(cache.data, "bar")

    def test_same(self):
        # no transform registered: converting to the mime type the data
        # already has gives back the same values
        data = TransformData("This is a test", 'text/plain', 'ascii')
        out = self.engine.convert(data, 'text/plain')
        self.failUnlessEqual(out.data, "This is a test")
        self.failUnlessEqual(out.mimetype, 'text/plain')
        self.failUnlessEqual(out.encoding, 'ascii')
        # once a transform from a mime type to itself is registered, it has
        # to be applied
        self.engine.add_transform(HtmlIdTransform())
        out = self.engine.convert(html_data(), 'text/html')
        self.failUnlessEqual(out.data, "<b>foo</b> transformed")
        self.failUnlessEqual(out.mimetype, 'text/html')
        self.failUnlessEqual(out.encoding, 'ascii')
if __name__ == '__main__':
unittest_main()
from logilab.common.testlib import TestCase, unittest_main
from utils import input_file_path, output_file_path, normalize_html,\
load, matching_inputs
from logilab.mtconverter import MissingBinary
from logilab.mtconverter.transforms.piltransforms import image_to_gif
from logilab.mtconverter.transforms.piltransforms import image_to_png
from logilab.mtconverter.transforms.piltransforms import image_to_jpeg
from logilab.mtconverter.transforms.piltransforms import image_to_bmp
from logilab.mtconverter.transforms.piltransforms import image_to_tiff
from logilab.mtconverter.transforms.piltransforms import image_to_ppm
from logilab.mtconverter.transforms.piltransforms import image_to_pcx
from os.path import exists
import sys
# we have to set locale because lynx output is locale sensitive !
# (the imports above only bring `os.path.exists` into scope, not the `os`
# module itself, hence the import here)
import os
os.environ['LC_ALL'] = 'C'
class TransformTest(TestCase):
    """base test converting an input file and comparing the result with the
    content of a reference output file

    The methods read the `input`, `output`, `transform`, `normalize` and
    `subobjects` attributes, none of them being set by this class.
    """

    def do_convert(self, filename=None):
        """convert the content of `self.input` using `self.transform` and
        check the result against the reference file

        :param filename:
          given as is to the transform; when it is None and a file named
          `self.output` + '.nofilename' exists, that file is used as
          reference instead of `self.output`
        """
        if filename is None and exists(self.output + '.nofilename'):
            output = self.output + '.nofilename'
        else:
            output = self.output
        input = open(self.input)
        orig = input.read()
        input.close()
        # NOTE(review): `datastream` and `idatastream` are neither defined
        # nor imported in the visible part of this module -- to be confirmed
        data = datastream(self.transform.name())
        res_data = self.transform.convert(orig, data, filename=filename)
        self.assert_(idatastream.isImplementedBy(res_data))
        got = res_data.getData()
        try:
            output = open(output)
        except IOError:
            # no reference file: create it from the result so it can be
            # reviewed, then make the test fail
            # NOTE(review): the message gives `self.output`, while the
            # created file is the '.nofilename' variant when that one has
            # been selected above
            import sys
            print >>sys.stderr, 'No output file found.'
            print >>sys.stderr, 'File %s created, check it !' % self.output
            output = open(output, 'w')
            output.write(got)
            output.close()
            self.assert_(0)
        expected = output.read()
        if self.normalize is not None:
            # apply the same normalization on both sides of the comparison
            expected = self.normalize(expected)
            got = self.normalize(got)
        output.close()
        self.assertEquals(got, expected,
                          '[%s]\n\n!=\n\n[%s]\n\nIN %s(%s)' % (
            got, expected, self.transform.name(), self.input))
        # the result must hold the expected number of sub objects
        self.assertEquals(self.subobjects, len(res_data.getSubObjects()),
                          '%s\n\n!=\n\n%s\n\nIN %s(%s)' % (
            self.subobjects, len(res_data.getSubObjects()), self.transform.name(), self.input))

    def testSame(self):
        # conversion with the input file name given to the transform
        self.do_convert(filename=self.input)

    def testSameNoFilename(self):
        # conversion without any file name given to the transform
        self.do_convert()

    def __repr__(self):
        return self.transform.name()
class PILTransformsTest(TestCase):
    """check the mime type of the data produced by each image transform of
    the `piltransforms` module
    """

    def afterSetUp(self):
        # NOTE(review): `ATSiteTestCase` is neither defined nor imported in
        # the visible part of this module, and nothing here sets
        # `self.portal` -- to be confirmed
        ATSiteTestCase.afterSetUp(self)
        self.pt = self.portal.portal_transforms

    def _check_conversion(self, transform, filename, inputmimetype, targetmimetype):
        """register `transform`, check the content of the `filename` input
        file is classified as `inputmimetype`, then convert it to
        `targetmimetype` and check the mime type of the result
        """
        self.pt.registerTransform(transform)
        imgFile = open(input_file_path(filename), 'rb')
        try:
            data = imgFile.read()
        finally:
            # don't leave the file open, even if reading fails
            imgFile.close()
        self.failUnlessEqual(self.portal.mimetypes_registry.classify(data), inputmimetype)
        data = self.pt.convertTo(target_mimetype=targetmimetype, orig=data)
        self.failUnlessEqual(data.getMetadata()['mimetype'], targetmimetype)

    def test_image_to_bmp(self):
        self._check_conversion(image_to_bmp(), 'logo.jpg',
                               'image/jpeg', 'image/x-ms-bmp')

    def test_image_to_gif(self):
        self._check_conversion(image_to_gif(), 'logo.png',
                               'image/png', 'image/gif')

    def test_image_to_jpeg(self):
        self._check_conversion(image_to_jpeg(), 'logo.gif',
                               'image/gif', 'image/jpeg')

    def test_image_to_png(self):
        self._check_conversion(image_to_png(), 'logo.jpg',
                               'image/jpeg', 'image/png')

    def test_image_to_pcx(self):
        self._check_conversion(image_to_pcx(), 'logo.gif',
                               'image/gif', 'image/pcx')

    def test_image_to_ppm(self):
        self._check_conversion(image_to_ppm(), 'logo.png',
                               'image/png', 'image/x-portable-pixmap')

    def test_image_to_tiff(self):
        self._check_conversion(image_to_tiff(), 'logo.jpg',
                               'image/jpeg', 'image/tiff')
TRANSFORMS_TESTINFO = (
('Products.PortalTransforms.transforms.pdf_to_html',
"demo1.pdf", "demo1.html", None, 0
),
('Products.PortalTransforms.transforms.word_to_html',
"test.doc", "test_word.html", normalize_html, 0
),
('Products.PortalTransforms.transforms.lynx_dump',
"test_lynx.html", "test_lynx.txt", None, 0
),
('Products.PortalTransforms.transforms.html_to_text',
"test_lynx.html", "test_html_to_text.txt", None, 0
),
('Products.PortalTransforms.transforms.identity',
"rest1.rst", "rest1.rst", None, 0
),
('Products.PortalTransforms.transforms.text_to_html',
"rest1.rst", "rest1.html", None, 0
),
('Products.PortalTransforms.transforms.safe_html',
"test_safehtml.html", "test_safe.html", None, 0
),
('Products.PortalTransforms.transforms.image_to_bmp',
"logo.jpg", "logo.bmp", None, 0