Commit 355c5462 authored by Aurelien Campeas's avatar Aurelien Campeas
Browse files

odt_to_text transform

parent 992655600bc7
......@@ -2,17 +2,20 @@ Change log for mtconverter
==========================
--
* application/vnd.oasis.opendocument.text -> text/plain transformation
* html_unescape now unescapes any HTML entity
2008-08-06 -- 0.5.0
* application/xml -> text/plain transformation
* new fallbackencoding argument to guess_mimetype_and_encoding, given to
guess_encoding when necessary
* html_unescape now unescapes any HTML entity
2008-06-30 -- 0.4.0
* use a new pure python transformation to transform html into
formatted text. The code is based on http://www.aaronsw.com/2002/html2text/
developed by Aaron Swartz.
developed by Aaron Swartz.
2008-01-14 -- 0.3.0
* unicode policy handling
* enhance text mimetype detection
......@@ -21,7 +24,7 @@ Change log for mtconverter
* register_* functions return True or False, telling whether optional
transforms are available
* restore python 2.3 compatibility
2007-12-11 -- 0.2.0
* guess_encoding test and fixes
* new guess_format_and_encoding utility method
......@@ -30,10 +33,10 @@ Change log for mtconverter
it's useful to access it to check transform availability
* pygments based transforms
2007-10-23 -- 0.1.4
* various html fixes
2007-02-09 -- 0.1.3
* fix lynx transform encoding problem
* strip output of popen transforms
......
......@@ -201,12 +201,14 @@ def register_base_transforms(engine, verb=True):
xml_to_text, text_to_html, xlog_to_html
from logilab.mtconverter.transforms.python import python_to_html
from logilab.mtconverter.transforms.html2text import html_to_formatted_text
from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text
engine.add_transform(text_to_text())
engine.add_transform(xml_to_text())
engine.add_transform(text_to_html())
engine.add_transform(xlog_to_html())
engine.add_transform(python_to_html())
engine.add_transform(html_to_formatted_text())
engine.add_transform(odt_to_unformatted_text())
for trclass in cmdtransforms.transform_classes:
try:
engine.add_transform(trclass())
......
File added
......@@ -9,6 +9,9 @@ ENGINE = TransformEngine()
register_base_transforms(ENGINE)
register_pil_transforms(ENGINE)
import logilab.mtconverter as mtc
import os.path as osp
# 'test' directory sitting next to the logilab.mtconverter package module
DATAPATH = osp.join(osp.dirname(mtc.__file__), 'test')
class Html2TextTC(TestCase):
def test_html_to_text(self):
......@@ -21,7 +24,20 @@ class Html2TextTC(TestCase):
converted = ENGINE.convert(data, 'text/plain').decode().strip()
self.assertEquals(converted, u'yo (zou ;) a b')
class Odt2TextTC(TestCase):
    """Check the odt -> text/plain conversion on the hello.odt fixture."""

    def test_odt_to_text(self):
        """Convert hello.odt given first as an open file, then as a str."""
        expected = u'Hello ! OpenOffice.org/2.4$Unix OpenOffice.org_project/680m17$Build-9310 Hello quoi de neuf doc ? bonjour 2008-07-08T16:19:35 2009-01-09T14:44:54 mot-clef 1 PT37S'
        path = osp.join(DATAPATH, 'hello.odt')
        # the fixture is binary data: open it in binary mode and make sure
        # the handle is released even if the conversion fails
        stream = open(path, 'rb')
        try:
            data = TransformData(stream,
                                 'application/vnd.oasis.opendocument.text', 'utf8')
            converted = ENGINE.convert(data, 'text/plain').decode().strip()
        finally:
            stream.close()
        self.assertEquals(converted, expected)
        # ZipFile will complain that
        # TypeError: file() argument 1 must be (encoded string without NULL bytes), not str
        # if given a plain str ... the transform is expected to shield
        # callers from that, so exercise the str input as well.
        stream = open(path, 'rb')
        try:
            content = stream.read()
        finally:
            stream.close()
        data = TransformData(content,
                             'application/vnd.oasis.opendocument.text', 'utf8')
        converted = ENGINE.convert(data, 'text/plain').decode().strip()
        self.assertEquals(converted, expected)
if __name__ == '__main__':
unittest_main()
......
"""odt2text: Turn odt file into equivalent plain text file.
Copyright (C) 2009 Logilab S.A.
"""
from zipfile import ZipFile
from lxml import etree
from tempfile import TemporaryFile as tmpfile
from logilab.mtconverter.transform import Transform
class odt_to_unformatted_text(Transform):
    """transforms odt content to unformatted plain text"""

    name = "odt_to_text"
    inputs = ("application/vnd.oasis.opendocument.text",)
    output = "text/plain"

    def _convert(self, trdata):
        """Return the text of the document as one space-separated unicode string.

        `trdata.data` is either a file-like object or a str holding the raw
        odt (zip) content. Every non-blank `text` and `tail` found in the
        archive's 'content.xml' then 'meta.xml' members is stripped and
        collected, in document order.
        """
        data = trdata.data
        tmp = None
        if isinstance(data, str):
            # ZipFile takes a str argument as a file name, not as the archive
            # content, so spill the content to a temporary file first
            tmp = tmpfile(mode='w+b')
        alltext = []
        try:
            if tmp is not None:
                tmp.write(data)
                tmp.seek(0)
                data = tmp
            archive = ZipFile(data, 'r')
            try:
                for subelt in ('content.xml', 'meta.xml'):
                    root = etree.fromstring(archive.read(subelt))
                    for node in root.iter():
                        for attr in ('text', 'tail'):
                            text = getattr(node, attr)
                            if text:
                                text = text.strip()
                                if text:
                                    alltext.append(text)
            finally:
                # closing the archive leaves a caller-supplied file object
                # open; only our own temporary file is closed below
                archive.close()
        finally:
            if tmp is not None:
                tmp.close()
        return u' '.join(alltext)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment