# HG changeset patch
# User Nicolas Chauvat <nicolas.chauvat@logilab.fr>
# Date 1281029451 -7200
# Thu Aug 05 19:30:51 2010 +0200
# Node ID fe739ada925c21fae59f18304c11d3a044a0d5c5
# Parent e8340fe485c92437f0bec4dffbf5acff3bd69f77
[import] improve data extraction from atom and rss feeds

diff --git a/sobjects.py b/sobjects.py
--- a/sobjects.py
+++ b/sobjects.py
@@ -3,15 +3,16 @@
 import sys
 import re
 from datetime import datetime
+
 from lxml.html import fromstring, tostring
 import feedparser
 import rdflib
 
 from cubes.datafeed.sobjects import DataFeedParser
 
-SIOC = 'http://rdfs.org/sioc/ns#'
-RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
-DCTERMS = 'http://purl.org/dc/terms/'
+RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
+SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')
+DCTERMS = rdflib.Namespace('http://purl.org/dc/terms/')
 
 def get_subject(g, pred, obj):
     subjects = list(g.subjects(pred, obj))
@@ -26,14 +27,10 @@
 def parse_blogpost_sioc(url):
     g = rdflib.ConjunctiveGraph()
     g.parse(url)
-    rdf_type = rdflib.URIRef(RDF+'type')
-    sioc_blogpost = rdflib.URIRef(SIOC+'BlogPost')
-    dcterms_title = rdflib.URIRef(DCTERMS+'title')
-    sioc_content = rdflib.URIRef(SIOC+'content')
-    for post, type_, blogpost_ in g.triples((None, rdf_type, sioc_blogpost)):
+    for post, type_, blogpost_ in g.triples((None, RDF.type, SIOC.BlogPost)):
         item = {'uri': unicode(post)}
-        item['title'] = unicode(get_object(g, post, dcterms_title))
-        item['content'] = unicode(get_object(g, post, sioc_content))
+        item['title'] = unicode(get_object(g, post, DCTERMS.title))
+        item['content'] = unicode(get_object(g, post, SIOC.content))
         yield item
 
 format_map = {'application/xhtml+xml':u'text/html',
@@ -41,23 +38,47 @@
               'text/plain':u'text/plain',
               }
 
+IMG_SPIES = ['http://feeds.feedburner.com',
+             'http://creatives.commindo-media',
+             'http://imp.constantcontact.com',
+             'https://blogger.googleusercontent.com/tracker',
+             ]
+
+def is_img_spy(node):
+    if node.tag != 'img':
+        return False
+    for url in IMG_SPIES:
+        if node.get('src').startswith(url):
+            return True
+    return False
+
+def is_tweetmeme_spy(node):
+    href = node.get('href')
+    if href and href.startswith('http://api.tweetmeme.com/share'):
+        return True
+    return False
+
 def remove_content_spies(content):
     root = fromstring(content)
+    if is_img_spy(root):
+        return u''
     for img in root.findall('.//img'):
-        if img.get('src').startswith('http://feeds.feedburner.com'):
+        if is_img_spy(img):
             img.drop_tree()
+        elif img.get('height') == '1' and img.get('width') == '1':
+            print tostring(img), 'is probably a spy'
     for anchor in root.findall('.//a'):
-        href = anchor.get('href')
-        if href and href.startswith('http://api.tweetmeme.com/share'):
+        if is_tweetmeme_spy(anchor):
             anchor.drop_tree()
     return unicode(tostring(root))
 
 def parse_blogpost_rss(url):
-    feed = feedparser.parse(url)
-    for entry in feed.entries:
+    data = feedparser.parse(url)
+    feed = data.feed
+    for entry in data.entries:
         item = {}
-        if 'id' in entry:
-            item['uri'] = entry.id
+        if 'feedburner_origlink' in entry:
+            item['uri'] = entry.feedburner_origlink
         else:
             item['uri'] = entry.link
         item['title'] = entry.title
@@ -76,6 +97,17 @@
         item['content_format'] = format_map.get(mimetype, u'text/plain')
         if hasattr(entry, 'date_parsed'):
             item['creation_date'] = datetime(*entry.date_parsed[:6])
+        if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
+            item['author'] = entry.author_detail.href
+        elif hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
+            item['author'] = feed.author_detail.href
+        elif hasattr(feed, 'author'):
+            item['author'] = feed.author
+        elif hasattr(feed, 'image') and hasattr(feed.image, 'link'):
+            item['author'] = feed.image.link
+        else:
+            item['author'] = url
+        item['cwuri'] = feed.link
         yield item
 
 def parse_microblogpost_rss(url):
@@ -87,6 +119,7 @@
         item['creation_date'] = datetime(*entry.date_parsed[:6])
         item['modification_date'] = datetime(*entry.date_parsed[:6])
         item['author'] = feed.channel.link # true for twitter
+        item['cwuri'] = feed.channel.link
         screen_name = feed.channel.link.split('/')[-1]
         item['avatar'] = get_twitter_avatar(screen_name)
         yield item