Skip to content
Snippets Groups Projects
Commit fe739ada925c authored by Nicolas Chauvat
Browse files

[import] improve data extraction from atom and rss feeds

parent e8340fe485c9
No related branches found
No related tags found
No related merge requests found
......@@ -3,9 +3,10 @@
import sys
import re
from datetime import datetime
from lxml.html import fromstring, tostring
import feedparser
import rdflib
from cubes.datafeed.sobjects import DataFeedParser
......@@ -6,12 +7,12 @@
from lxml.html import fromstring, tostring
import feedparser
import rdflib
from cubes.datafeed.sobjects import DataFeedParser
SIOC = 'http://rdfs.org/sioc/ns#'
RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
DCTERMS = 'http://purl.org/dc/terms/'
RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')
DCTERMS = rdflib.Namespace('http://purl.org/dc/terms/')
def get_subject(g, pred, obj):
subjects = list(g.subjects(pred, obj))
......@@ -26,9 +27,5 @@
def parse_blogpost_sioc(url):
g = rdflib.ConjunctiveGraph()
g.parse(url)
rdf_type = rdflib.URIRef(RDF+'type')
sioc_blogpost = rdflib.URIRef(SIOC+'BlogPost')
dcterms_title = rdflib.URIRef(DCTERMS+'title')
sioc_content = rdflib.URIRef(SIOC+'content')
for post, type_, blogpost_ in g.triples((None, rdf_type, sioc_blogpost)):
for post, type_, blogpost_ in g.triples((None, RDF.type, SIOC.BlogPost)):
item = {'uri': unicode(post)}
......@@ -34,6 +31,6 @@
item = {'uri': unicode(post)}
item['title'] = unicode(get_object(g, post, dcterms_title))
item['content'] = unicode(get_object(g, post, sioc_content))
item['title'] = unicode(get_object(g, post, DCTERMS.title))
item['content'] = unicode(get_object(g, post, SIOC.content))
yield item
format_map = {'application/xhtml+xml':u'text/html',
......@@ -41,5 +38,25 @@
'text/plain':u'text/plain',
}
# URL prefixes identifying tracking ("spy") images embedded in feed content.
# Entries are matched as plain string prefixes of an <img> src attribute, so
# an entry need not be a complete host name.
IMG_SPIES = ['http://feeds.feedburner.com',
             'http://creatives.commindo-media',
             'http://imp.constantcontact.com',
             'https://blogger.googleusercontent.com/tracker',
             ]


def is_img_spy(node):
    """Return True if `node` is an <img> whose src starts with a spy prefix.

    `node` is any element-like object exposing a `tag` attribute and a
    `get(name)` method for attributes. Non-<img> nodes and <img> nodes with
    no (or an empty) src attribute are never considered spies.
    """
    if node.tag != 'img':
        return False
    src = node.get('src')
    if not src:
        # An <img> without src has nothing to match; calling startswith on
        # the None returned by get() would raise AttributeError.
        return False
    # str.startswith accepts a tuple of prefixes: one call checks them all.
    return src.startswith(tuple(IMG_SPIES))
def is_tweetmeme_spy(node):
    """Tell whether `node` carries an href pointing at the tweetmeme share API.

    `node` is any element-like object with a `get(name)` attribute accessor.
    Returns False when the href attribute is missing or empty.
    """
    target = node.get('href')
    if not target:
        return False
    return target.startswith('http://api.tweetmeme.com/share')
def remove_content_spies(content):
root = fromstring(content)
......@@ -44,3 +61,5 @@
def remove_content_spies(content):
root = fromstring(content)
if is_img_spy(root):
return u''
for img in root.findall('.//img'):
......@@ -46,3 +65,3 @@
for img in root.findall('.//img'):
if img.get('src').startswith('http://feeds.feedburner.com'):
if is_img_spy(img):
img.drop_tree()
......@@ -48,2 +67,4 @@
img.drop_tree()
elif img.get('height') == '1' and img.get('width') == '1':
print tostring(img), 'is probably a spy'
for anchor in root.findall('.//a'):
......@@ -49,7 +70,6 @@
for anchor in root.findall('.//a'):
href = anchor.get('href')
if href and href.startswith('http://api.tweetmeme.com/share'):
if is_tweetmeme_spy(anchor):
anchor.drop_tree()
return unicode(tostring(root))
def parse_blogpost_rss(url):
......@@ -52,7 +72,8 @@
anchor.drop_tree()
return unicode(tostring(root))
def parse_blogpost_rss(url):
feed = feedparser.parse(url)
for entry in feed.entries:
data = feedparser.parse(url)
feed = data.feed
for entry in data.entries:
item = {}
......@@ -58,6 +79,6 @@
item = {}
if 'id' in entry:
item['uri'] = entry.id
if 'feedburner_origlink' in entry:
item['uri'] = entry.feedburner_origlink
else:
item['uri'] = entry.link
item['title'] = entry.title
......@@ -76,6 +97,17 @@
item['content_format'] = format_map.get(mimetype, u'text/plain')
if hasattr(entry, 'date_parsed'):
item['creation_date'] = datetime(*entry.date_parsed[:6])
if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
item['author'] = entry.author_detail.href
elif hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
item['author'] = feed.author_detail.href
elif hasattr(feed, 'author'):
item['author'] = feed.author
elif hasattr(feed, 'image') and hasattr(feed.image, 'link'):
item['author'] = feed.image.link
else:
item['author'] = url
item['cwuri'] = feed.link
yield item
def parse_microblogpost_rss(url):
......@@ -87,6 +119,7 @@
item['creation_date'] = datetime(*entry.date_parsed[:6])
item['modification_date'] = datetime(*entry.date_parsed[:6])
item['author'] = feed.channel.link # true for twitter
item['cwuri'] = feed.channel.link
screen_name = feed.channel.link.split('/')[-1]
item['avatar'] = get_twitter_avatar(screen_name)
yield item
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment