Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
blog
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
cubicweb
cubes
blog
Commits
fe739ada925c
Commit
fe739ada925c
authored
14 years ago
by
Nicolas Chauvat
Browse files
Options
Downloads
Patches
Plain Diff
[import] improve data extraction from atom and rss feeds
parent
e8340fe485c9
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
sobjects.py
+50
-17
50 additions, 17 deletions
sobjects.py
with
50 additions
and
17 deletions
sobjects.py
+
50
−
17
View file @
fe739ada
...
@@ -3,9 +3,10 @@
...
@@ -3,9 +3,10 @@
import
sys
import
sys
import
re
import
re
from
datetime
import
datetime
from
datetime
import
datetime
from
lxml.html
import
fromstring
,
tostring
from
lxml.html
import
fromstring
,
tostring
import
feedparser
import
feedparser
import
rdflib
import
rdflib
from
cubes.datafeed.sobjects
import
DataFeedParser
from
cubes.datafeed.sobjects
import
DataFeedParser
...
@@ -6,12 +7,12 @@
...
@@ -6,12 +7,12 @@
from
lxml.html
import
fromstring
,
tostring
from
lxml.html
import
fromstring
,
tostring
import
feedparser
import
feedparser
import
rdflib
import
rdflib
from
cubes.datafeed.sobjects
import
DataFeedParser
from
cubes.datafeed.sobjects
import
DataFeedParser
SIOC
=
'
http://rdfs.org/sioc/
ns#
'
RDF
=
rdflib
.
Namespace
(
'
http://www.w3.org/1999/02/22-rdf-syntax-
ns#
'
)
RDF
=
'
http://www.w3.org/1999/02/22-rdf-syntax-
ns#
'
SIOC
=
rdflib
.
Namespace
(
'
http://rdfs.org/sioc/
ns#
'
)
DCTERMS
=
'
http://purl.org/dc/terms/
'
DCTERMS
=
rdflib
.
Namespace
(
'
http://purl.org/dc/terms/
'
)
def
get_subject
(
g
,
pred
,
obj
):
def
get_subject
(
g
,
pred
,
obj
):
subjects
=
list
(
g
.
subjects
(
pred
,
obj
))
subjects
=
list
(
g
.
subjects
(
pred
,
obj
))
...
@@ -26,9 +27,5 @@
...
@@ -26,9 +27,5 @@
def
parse_blogpost_sioc
(
url
):
def
parse_blogpost_sioc
(
url
):
g
=
rdflib
.
ConjunctiveGraph
()
g
=
rdflib
.
ConjunctiveGraph
()
g
.
parse
(
url
)
g
.
parse
(
url
)
rdf_type
=
rdflib
.
URIRef
(
RDF
+
'
type
'
)
for
post
,
type_
,
blogpost_
in
g
.
triples
((
None
,
RDF
.
type
,
SIOC
.
BlogPost
)):
sioc_blogpost
=
rdflib
.
URIRef
(
SIOC
+
'
BlogPost
'
)
dcterms_title
=
rdflib
.
URIRef
(
DCTERMS
+
'
title
'
)
sioc_content
=
rdflib
.
URIRef
(
SIOC
+
'
content
'
)
for
post
,
type_
,
blogpost_
in
g
.
triples
((
None
,
rdf_type
,
sioc_blogpost
)):
item
=
{
'
uri
'
:
unicode
(
post
)}
item
=
{
'
uri
'
:
unicode
(
post
)}
...
@@ -34,6 +31,6 @@
...
@@ -34,6 +31,6 @@
item
=
{
'
uri
'
:
unicode
(
post
)}
item
=
{
'
uri
'
:
unicode
(
post
)}
item
[
'
title
'
]
=
unicode
(
get_object
(
g
,
post
,
dcterms_
title
))
item
[
'
title
'
]
=
unicode
(
get_object
(
g
,
post
,
DCTERMS
.
title
))
item
[
'
content
'
]
=
unicode
(
get_object
(
g
,
post
,
sioc_
content
))
item
[
'
content
'
]
=
unicode
(
get_object
(
g
,
post
,
SIOC
.
content
))
yield
item
yield
item
format_map
=
{
'
application/xhtml+xml
'
:
u
'
text/html
'
,
format_map
=
{
'
application/xhtml+xml
'
:
u
'
text/html
'
,
...
@@ -41,5 +38,25 @@
...
@@ -41,5 +38,25 @@
'
text/plain
'
:
u
'
text/plain
'
,
'
text/plain
'
:
u
'
text/plain
'
,
}
}
IMG_SPIES
=
[
'
http://feeds.feedburner.com
'
,
'
http://creatives.commindo-media
'
,
'
http://imp.constantcontact.com
'
,
'
https://blogger.googleusercontent.com/tracker
'
,
]
def
is_img_spy
(
node
):
if
node
.
tag
!=
'
img
'
:
return
False
for
url
in
IMG_SPIES
:
if
node
.
get
(
'
src
'
).
startswith
(
url
):
return
True
return
False
def
is_tweetmeme_spy
(
node
):
href
=
node
.
get
(
'
href
'
)
if
href
and
href
.
startswith
(
'
http://api.tweetmeme.com/share
'
):
return
True
return
False
def
remove_content_spies
(
content
):
def
remove_content_spies
(
content
):
root
=
fromstring
(
content
)
root
=
fromstring
(
content
)
...
@@ -44,3 +61,5 @@
...
@@ -44,3 +61,5 @@
def
remove_content_spies
(
content
):
def
remove_content_spies
(
content
):
root
=
fromstring
(
content
)
root
=
fromstring
(
content
)
if
is_img_spy
(
root
):
return
u
''
for
img
in
root
.
findall
(
'
.//img
'
):
for
img
in
root
.
findall
(
'
.//img
'
):
...
@@ -46,3 +65,3 @@
...
@@ -46,3 +65,3 @@
for
img
in
root
.
findall
(
'
.//img
'
):
for
img
in
root
.
findall
(
'
.//img
'
):
if
i
mg
.
get
(
'
src
'
).
startswith
(
'
http://feeds.feedburner.com
'
):
if
i
s_img_spy
(
img
):
img
.
drop_tree
()
img
.
drop_tree
()
...
@@ -48,2 +67,4 @@
...
@@ -48,2 +67,4 @@
img
.
drop_tree
()
img
.
drop_tree
()
elif
img
.
get
(
'
height
'
)
==
'
1
'
and
img
.
get
(
'
width
'
)
==
'
1
'
:
print
tostring
(
img
),
'
is probably a spy
'
for
anchor
in
root
.
findall
(
'
.//a
'
):
for
anchor
in
root
.
findall
(
'
.//a
'
):
...
@@ -49,7 +70,6 @@
...
@@ -49,7 +70,6 @@
for
anchor
in
root
.
findall
(
'
.//a
'
):
for
anchor
in
root
.
findall
(
'
.//a
'
):
href
=
anchor
.
get
(
'
href
'
)
if
is_tweetmeme_spy
(
anchor
):
if
href
and
href
.
startswith
(
'
http://api.tweetmeme.com/share
'
):
anchor
.
drop_tree
()
anchor
.
drop_tree
()
return
unicode
(
tostring
(
root
))
return
unicode
(
tostring
(
root
))
def
parse_blogpost_rss
(
url
):
def
parse_blogpost_rss
(
url
):
...
@@ -52,7 +72,8 @@
...
@@ -52,7 +72,8 @@
anchor
.
drop_tree
()
anchor
.
drop_tree
()
return
unicode
(
tostring
(
root
))
return
unicode
(
tostring
(
root
))
def
parse_blogpost_rss
(
url
):
def
parse_blogpost_rss
(
url
):
feed
=
feedparser
.
parse
(
url
)
data
=
feedparser
.
parse
(
url
)
for
entry
in
feed
.
entries
:
feed
=
data
.
feed
for
entry
in
data
.
entries
:
item
=
{}
item
=
{}
...
@@ -58,6 +79,6 @@
...
@@ -58,6 +79,6 @@
item
=
{}
item
=
{}
if
'
id
'
in
entry
:
if
'
feedburner_origlink
'
in
entry
:
item
[
'
uri
'
]
=
entry
.
id
item
[
'
uri
'
]
=
entry
.
feedburner_origlink
else
:
else
:
item
[
'
uri
'
]
=
entry
.
link
item
[
'
uri
'
]
=
entry
.
link
item
[
'
title
'
]
=
entry
.
title
item
[
'
title
'
]
=
entry
.
title
...
@@ -76,6 +97,17 @@
...
@@ -76,6 +97,17 @@
item
[
'
content_format
'
]
=
format_map
.
get
(
mimetype
,
u
'
text/plain
'
)
item
[
'
content_format
'
]
=
format_map
.
get
(
mimetype
,
u
'
text/plain
'
)
if
hasattr
(
entry
,
'
date_parsed
'
):
if
hasattr
(
entry
,
'
date_parsed
'
):
item
[
'
creation_date
'
]
=
datetime
(
*
entry
.
date_parsed
[:
6
])
item
[
'
creation_date
'
]
=
datetime
(
*
entry
.
date_parsed
[:
6
])
if
hasattr
(
entry
,
'
author_detail
'
)
and
hasattr
(
entry
.
author_detail
,
'
href
'
):
item
[
'
author
'
]
=
entry
.
author_detail
.
href
elif
hasattr
(
feed
,
'
author_detail
'
)
and
hasattr
(
feed
.
author_detail
,
'
href
'
):
item
[
'
author
'
]
=
feed
.
author_detail
.
href
elif
hasattr
(
feed
,
'
author
'
):
item
[
'
author
'
]
=
feed
.
author
elif
hasattr
(
feed
,
'
image
'
)
and
hasattr
(
feed
.
image
,
'
link
'
):
item
[
'
author
'
]
=
feed
.
image
.
link
else
:
item
[
'
author
'
]
=
url
item
[
'
cwuri
'
]
=
feed
.
link
yield
item
yield
item
def
parse_microblogpost_rss
(
url
):
def
parse_microblogpost_rss
(
url
):
...
@@ -87,6 +119,7 @@
...
@@ -87,6 +119,7 @@
item
[
'
creation_date
'
]
=
datetime
(
*
entry
.
date_parsed
[:
6
])
item
[
'
creation_date
'
]
=
datetime
(
*
entry
.
date_parsed
[:
6
])
item
[
'
modification_date
'
]
=
datetime
(
*
entry
.
date_parsed
[:
6
])
item
[
'
modification_date
'
]
=
datetime
(
*
entry
.
date_parsed
[:
6
])
item
[
'
author
'
]
=
feed
.
channel
.
link
# true for twitter
item
[
'
author
'
]
=
feed
.
channel
.
link
# true for twitter
item
[
'
cwuri
'
]
=
feed
.
channel
.
link
screen_name
=
feed
.
channel
.
link
.
split
(
'
/
'
)[
-
1
]
screen_name
=
feed
.
channel
.
link
.
split
(
'
/
'
)[
-
1
]
item
[
'
avatar
'
]
=
get_twitter_avatar
(
screen_name
)
item
[
'
avatar
'
]
=
get_twitter_avatar
(
screen_name
)
yield
item
yield
item
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment