Commit fe739ada925c authored by Nicolas Chauvat

[import] improve data extraction from atom and rss feeds

parent e8340fe485c9
@@ -3,9 +3,10 @@
 import sys
 import re
 from datetime import datetime
 from lxml.html import fromstring, tostring
 import feedparser
 import rdflib
 from cubes.datafeed.sobjects import DataFeedParser
@@ -6,12 +7,12 @@
 from lxml.html import fromstring, tostring
 import feedparser
 import rdflib
 from cubes.datafeed.sobjects import DataFeedParser
 
-SIOC = 'http://rdfs.org/sioc/ns#'
-RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
-DCTERMS = 'http://purl.org/dc/terms/'
+RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
+SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')
+DCTERMS = rdflib.Namespace('http://purl.org/dc/terms/')
 
 def get_subject(g, pred, obj):
     subjects = list(g.subjects(pred, obj))
@@ -26,9 +27,5 @@
 def parse_blogpost_sioc(url):
     g = rdflib.ConjunctiveGraph()
     g.parse(url)
-    rdf_type = rdflib.URIRef(RDF+'type')
-    sioc_blogpost = rdflib.URIRef(SIOC+'BlogPost')
-    dcterms_title = rdflib.URIRef(DCTERMS+'title')
-    sioc_content = rdflib.URIRef(SIOC+'content')
-    for post, type_, blogpost_ in g.triples((None, rdf_type, sioc_blogpost)):
+    for post, type_, blogpost_ in g.triples((None, RDF.type, SIOC.BlogPost)):
         item = {'uri': unicode(post)}
@@ -34,6 +31,6 @@
         item = {'uri': unicode(post)}
-        item['title'] = unicode(get_object(g, post, dcterms_title))
-        item['content'] = unicode(get_object(g, post, sioc_content))
+        item['title'] = unicode(get_object(g, post, DCTERMS.title))
+        item['content'] = unicode(get_object(g, post, SIOC.content))
         yield item
 
 format_map = {'application/xhtml+xml':u'text/html',
@@ -41,5 +38,25 @@
               'text/plain':u'text/plain',
               }
 
+IMG_SPIES = ['http://feeds.feedburner.com',
+             'http://creatives.commindo-media',
+             'http://imp.constantcontact.com',
+             'https://blogger.googleusercontent.com/tracker',
+             ]
+
+def is_img_spy(node):
+    if node.tag != 'img':
+        return False
+    for url in IMG_SPIES:
+        if node.get('src').startswith(url):
+            return True
+    return False
+
+def is_tweetmeme_spy(node):
+    href = node.get('href')
+    if href and href.startswith('http://api.tweetmeme.com/share'):
+        return True
+    return False
+
 def remove_content_spies(content):
     root = fromstring(content)
@@ -44,3 +61,5 @@
 def remove_content_spies(content):
     root = fromstring(content)
+    if is_img_spy(root):
+        return u''
     for img in root.findall('.//img'):
@@ -46,3 +65,3 @@
     for img in root.findall('.//img'):
-        if img.get('src').startswith('http://feeds.feedburner.com'):
+        if is_img_spy(img):
             img.drop_tree()
@@ -48,2 +67,4 @@
             img.drop_tree()
+        elif img.get('height') == '1' and img.get('width') == '1':
+            print tostring(img), 'is probably a spy'
     for anchor in root.findall('.//a'):
@@ -49,7 +70,6 @@
     for anchor in root.findall('.//a'):
-        href = anchor.get('href')
-        if href and href.startswith('http://api.tweetmeme.com/share'):
+        if is_tweetmeme_spy(anchor):
             anchor.drop_tree()
     return unicode(tostring(root))
 
 def parse_blogpost_rss(url):
@@ -52,7 +72,8 @@
             anchor.drop_tree()
     return unicode(tostring(root))
 
 def parse_blogpost_rss(url):
-    feed = feedparser.parse(url)
-    for entry in feed.entries:
+    data = feedparser.parse(url)
+    feed = data.feed
+    for entry in data.entries:
         item = {}
@@ -58,6 +79,6 @@
         item = {}
-        if 'id' in entry:
-            item['uri'] = entry.id
+        if 'feedburner_origlink' in entry:
+            item['uri'] = entry.feedburner_origlink
         else:
            item['uri'] = entry.link
         item['title'] = entry.title
@@ -76,6 +97,17 @@
         item['content_format'] = format_map.get(mimetype, u'text/plain')
         if hasattr(entry, 'date_parsed'):
             item['creation_date'] = datetime(*entry.date_parsed[:6])
+        if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
+            item['author'] = entry.author_detail.href
+        elif hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
+            item['author'] = feed.author_detail.href
+        elif hasattr(feed, 'author'):
+            item['author'] = feed.author
+        elif hasattr(feed, 'image') and hasattr(feed.image, 'link'):
+            item['author'] = feed.image.link
+        else:
+            item['author'] = url
+        item['cwuri'] = feed.link
         yield item
@@ -87,6 +119,7 @@
         item['creation_date'] = datetime(*entry.date_parsed[:6])
         item['modification_date'] = datetime(*entry.date_parsed[:6])
         item['author'] = feed.channel.link # true for twitter
+        item['cwuri'] = feed.channel.link
         screen_name = feed.channel.link.split('/')[-1]
         item['avatar'] = get_twitter_avatar(screen_name)
         yield item
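
For context on the namespace change: rdflib.Namespace supports attribute access that returns ready-made URIRef objects, which is what lets this commit drop the four manual URIRef(...) concatenations in parse_blogpost_sioc. A minimal sketch (not part of the commit) of the equivalence:

import rdflib

SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')

# Attribute access on a Namespace concatenates and wraps in a URIRef,
# so SIOC.BlogPost is the same term the old code built by hand:
assert SIOC.BlogPost == rdflib.URIRef('http://rdfs.org/sioc/ns#' + 'BlogPost')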
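A rough usage sketch for the new spy filters, assuming is_img_spy and IMG_SPIES from the diff above are in scope; the sample HTML is invented:

from lxml.html import fromstring, tostring

html = ('<div><p>Real content</p>'
        '<img src="http://feeds.feedburner.com/~r/foo/~4/abc"/></div>')
root = fromstring(html)
for img in root.findall('.//img'):
    if is_img_spy(img):  # src matches the feedburner prefix in IMG_SPIES
        img.drop_tree()
print tostring(root)  # the tracking pixel is gone, the paragraph remains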
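The author lookup added to parse_blogpost_rss prefers entry-level metadata, then feed-level author details, then the feed image link, and finally falls back to the feed URL. The same cascade factored into a helper, as a sketch only (the name guess_author is invented, not in the commit):

def guess_author(entry, feed, default):
    # entry-level author URI beats feed-level metadata
    if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
        return entry.author_detail.href
    if hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
        return feed.author_detail.href
    if hasattr(feed, 'author'):
        return feed.author
    if hasattr(feed, 'image') and hasattr(feed.image, 'link'):
        return feed.image.link
    return default  # e.g. the feed URL passed by the caller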