Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
open-source
logilab-mtconverter
Commits
aa0dc153f655
Commit
355c5462
authored
Jan 09, 2009
by
Aurelien Campeas
Browse files
odt_to_text transform
parent
992655600bc7
Changes
5
Hide whitespace changes
Inline
Side-by-side
ChangeLog
View file @
aa0dc153
...
...
@@ -2,17 +2,20 @@ Change log for mtconverter
==========================
--
* application/vnd.oasis.opendocument.text -> text/plain transformation
* html_unescape now unescapes any HTML entity
2008-08-06 -- 0.5.0
* application/xml -> text/plain transformation
* new fallbackencoding argument to guess_mimetype_and_encoding, given to
guess_encoding when necessary
* html_unescape now unescapes any HTML entity
2008-06-30 -- 0.4.0
* use a new pure python transformation to transform html into
formatted text. The code is based on http://www.aaronsw.com/2002/html2text/
developed by Aaron Swartz.
developed by Aaron Swartz.
2008-01-14 -- 0.3.0
* unicode policy handling
* enhance text mimetype detection
...
...
@@ -21,7 +24,7 @@ Change log for mtconverter
* register_* function return True or False telling if optional transforms
are available
* restore python 2.3 compatibility
2007-12-11 -- 0.2.0
* guess_encoding test and fixes
* new guess_format_and_encoding utility method
...
...
@@ -30,10 +33,10 @@ Change log for mtconverter
it's useful to access it to check transform availability
* pygments based transforms
2007-10-23 -- 0.1.4
* various html fixes
2007-02-09 -- 0.1.3
* fix lynx transform encoding problem
* strip output of popen transforms
...
...
__init__.py
View file @
aa0dc153
...
...
@@ -201,12 +201,14 @@ def register_base_transforms(engine, verb=True):
xml_to_text
,
text_to_html
,
xlog_to_html
from
logilab.mtconverter.transforms.python
import
python_to_html
from
logilab.mtconverter.transforms.html2text
import
html_to_formatted_text
from
logilab.mtconverter.transforms.odt2text
import
odt_to_unformatted_text
engine
.
add_transform
(
text_to_text
())
engine
.
add_transform
(
xml_to_text
())
engine
.
add_transform
(
text_to_html
())
engine
.
add_transform
(
xlog_to_html
())
engine
.
add_transform
(
python_to_html
())
engine
.
add_transform
(
html_to_formatted_text
())
engine
.
add_transform
(
odt_to_unformatted_text
())
for
trclass
in
cmdtransforms
.
transform_classes
:
try
:
engine
.
add_transform
(
trclass
())
...
...
test/hello.odt
0 → 100644
View file @
aa0dc153
File added
test/unittest_transforms.py
View file @
aa0dc153
...
...
@@ -9,6 +9,9 @@ ENGINE = TransformEngine()
register_base_transforms
(
ENGINE
)
register_pil_transforms
(
ENGINE
)
import
logilab.mtconverter
as
mtc
import
os.path
as
osp
DATAPATH
=
osp
.
join
(
osp
.
split
(
mtc
.
__file__
)[
0
],
'test'
)
class
Html2TextTC
(
TestCase
):
def
test_html_to_text
(
self
):
...
...
@@ -21,7 +24,20 @@ class Html2TextTC(TestCase):
converted
=
ENGINE
.
convert
(
data
,
'text/plain'
).
decode
().
strip
()
self
.
assertEquals
(
converted
,
u
'yo (zou ;) a b'
)
class Odt2TextTC(TestCase):
    """Checks the odt -> text/plain transform on the ``hello.odt`` fixture."""

    def test_odt_to_text(self):
        """Convert the fixture given as a file object, then as raw content.

        Both input forms must produce the same extracted text.
        """
        expected = u'Hello ! OpenOffice.org/2.4$Unix OpenOffice.org_project/680m17$Build-9310 Hello quoi de neuf doc ? bonjour 2008-07-08T16:19:35 2009-01-09T14:44:54 mot-clef 1 PT37S'
        path = osp.join(DATAPATH, 'hello.odt')
        mimetype = 'application/vnd.oasis.opendocument.text'

        # Case 1: the data is an open file object.  An odt file is a zip
        # archive, so it is opened in binary mode, and the handle is closed
        # once the conversion is done.
        stream = open(path, 'rb')
        try:
            data = TransformData(stream, mimetype, 'utf8')
            converted = ENGINE.convert(data, 'text/plain').decode().strip()
        finally:
            stream.close()
        self.assertEquals(converted, expected)

        # Case 2: the data is the raw archive content as a plain str.
        # ZipFile would take such a string for a file name and fail with
        #   TypeError: file() argument 1 must be (encoded string without
        #   NULL bytes), not str
        # so the transform is expected to shield callers from that.
        stream = open(path, 'rb')
        try:
            content = stream.read()
        finally:
            stream.close()
        data = TransformData(content, mimetype, 'utf8')
        converted = ENGINE.convert(data, 'text/plain').decode().strip()
        self.assertEquals(converted, expected)
if
__name__
==
'__main__'
:
unittest_main
()
...
...
transforms/odt2text.py
0 → 100644
View file @
aa0dc153
"""odt2text: Turn odt file into equivalent plain text file.
Copyright (C) 2009 Logilab S.A.
"""
from
zipfile
import
ZipFile
from
lxml
import
etree
from
tempfile
import
TemporaryFile
as
tmpfile
from
logilab.mtconverter.transform
import
Transform
class odt_to_unformatted_text(Transform):
    """Transform OpenDocument Text (odt) content into unformatted plain text.

    The text is collected from the ``content.xml`` and ``meta.xml`` members
    of the odt zip archive, in that order.
    """

    name = "odt_to_text"
    inputs = ("application/vnd.oasis.opendocument.text",)
    output = "text/plain"

    def _convert(self, trdata):
        """Return the text found in the odt archive held by ``trdata.data``.

        ``trdata.data`` may be either a file-like object or a plain ``str``
        holding the raw archive content.

        For every node of ``content.xml`` then ``meta.xml``, the ``text`` and
        ``tail`` values are stripped, and the non-empty ones are joined with
        a single space into one unicode string.
        """
        raw = trdata.data
        tmp = None
        try:
            # ZipFile treats a string argument as a file *path*, not as the
            # archive content, so raw content is first spooled into a
            # temporary file which is then handed over as a file object.
            if isinstance(raw, str):
                tmp = tmpfile(mode='w+b')
                tmp.write(raw)
                tmp.seek(0)
                source = tmp
            else:
                source = raw
            archive = ZipFile(source, 'r')
            try:
                alltext = []
                for subelt in ('content.xml', 'meta.xml'):
                    root = etree.fromstring(archive.read(subelt))
                    for node in root.iter():
                        for attr in ('text', 'tail'):
                            text = getattr(node, attr)
                            if text:
                                text = text.strip()
                                if text:
                                    alltext.append(text)
            finally:
                # Release the archive even when a member is missing or the
                # XML cannot be parsed.
                archive.close()
        finally:
            # Only the temporary file created here is closed; a file object
            # supplied by the caller is left for the caller to manage.
            if tmp is not None:
                tmp.close()
        return u' '.join(alltext)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment