Commit fe739ada925c authored by Nicolas Chauvat

[import] improve data extraction from atom and rss feeds

parent e8340fe485c9
@@ -3,9 +3,10 @@
 import sys
 import re
 from datetime import datetime
 from lxml.html import fromstring, tostring
 import feedparser
 import rdflib
 from cubes.datafeed.sobjects import DataFeedParser
@@ -6,12 +7,12 @@
 from lxml.html import fromstring, tostring
 import feedparser
 import rdflib
 from cubes.datafeed.sobjects import DataFeedParser
 
-SIOC = 'http://rdfs.org/sioc/ns#'
-RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
-DCTERMS = 'http://purl.org/dc/terms/'
+RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
+SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')
+DCTERMS = rdflib.Namespace('http://purl.org/dc/terms/')
 
 def get_subject(g, pred, obj):
     subjects = list(g.subjects(pred, obj))
@@ -26,9 +27,5 @@
 def parse_blogpost_sioc(url):
     g = rdflib.ConjunctiveGraph()
     g.parse(url)
-    rdf_type = rdflib.URIRef(RDF+'type')
-    sioc_blogpost = rdflib.URIRef(SIOC+'BlogPost')
-    dcterms_title = rdflib.URIRef(DCTERMS+'title')
-    sioc_content = rdflib.URIRef(SIOC+'content')
-    for post, type_, blogpost_ in g.triples((None, rdf_type, sioc_blogpost)):
+    for post, type_, blogpost_ in g.triples((None, RDF.type, SIOC.BlogPost)):
         item = {'uri': unicode(post)}
@@ -34,6 +31,6 @@
         item = {'uri': unicode(post)}
-        item['title'] = unicode(get_object(g, post, dcterms_title))
-        item['content'] = unicode(get_object(g, post, sioc_content))
+        item['title'] = unicode(get_object(g, post, DCTERMS.title))
+        item['content'] = unicode(get_object(g, post, SIOC.content))
         yield item
 
 format_map = {'application/xhtml+xml':u'text/html',
@@ -41,5 +38,25 @@
               'text/plain':u'text/plain',
               }
 
+IMG_SPIES = ['http://feeds.feedburner.com',
+             'http://creatives.commindo-media',
+             'http://imp.constantcontact.com',
+             'https://blogger.googleusercontent.com/tracker',
+             ]
+
+def is_img_spy(node):
+    if node.tag != 'img':
+        return False
+    for url in IMG_SPIES:
+        if node.get('src').startswith(url):
+            return True
+    return False
+
+def is_tweetmeme_spy(node):
+    href = node.get('href')
+    if href and href.startswith('http://api.tweetmeme.com/share'):
+        return True
+    return False
+
 def remove_content_spies(content):
     root = fromstring(content)
@@ -44,3 +61,5 @@
 def remove_content_spies(content):
     root = fromstring(content)
+    if is_img_spy(root):
+        return u''
     for img in root.findall('.//img'):
@@ -46,3 +65,3 @@
     for img in root.findall('.//img'):
-        if img.get('src').startswith('http://feeds.feedburner.com'):
+        if is_img_spy(img):
             img.drop_tree()
@@ -48,2 +67,4 @@
             img.drop_tree()
+        elif img.get('height') == '1' and img.get('width') == '1':
+            print tostring(img), 'is probably a spy'
     for anchor in root.findall('.//a'):
@@ -49,7 +70,6 @@
     for anchor in root.findall('.//a'):
-        href = anchor.get('href')
-        if href and href.startswith('http://api.tweetmeme.com/share'):
+        if is_tweetmeme_spy(anchor):
             anchor.drop_tree()
     return unicode(tostring(root))
 
 def parse_blogpost_rss(url):
@@ -52,7 +72,8 @@
             anchor.drop_tree()
     return unicode(tostring(root))
 
 def parse_blogpost_rss(url):
-    feed = feedparser.parse(url)
-    for entry in feed.entries:
+    data = feedparser.parse(url)
+    feed = data.feed
+    for entry in data.entries:
         item = {}
@@ -58,6 +79,6 @@
         item = {}
-        if 'id' in entry:
-            item['uri'] = entry.id
+        if 'feedburner_origlink' in entry:
+            item['uri'] = entry.feedburner_origlink
         else:
            item['uri'] = entry.link
         item['title'] = entry.title
@@ -76,6 +97,17 @@
         item['content_format'] = format_map.get(mimetype, u'text/plain')
         if hasattr(entry, 'date_parsed'):
             item['creation_date'] = datetime(*entry.date_parsed[:6])
+        if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
+            item['author'] = entry.author_detail.href
+        elif hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
+            item['author'] = feed.author_detail.href
+        elif hasattr(feed, 'author'):
+            item['author'] = feed.author
+        elif hasattr(feed, 'image') and hasattr(feed.image, 'link'):
+            item['author'] = feed.image.link
+        else:
+            item['author'] = url
+        item['cwuri'] = feed.link
         yield item
@@ -87,6 +119,7 @@
         item['creation_date'] = datetime(*entry.date_parsed[:6])
         item['modification_date'] = datetime(*entry.date_parsed[:6])
         item['author'] = feed.channel.link # true for twitter
+        item['cwuri'] = feed.channel.link
         screen_name = feed.channel.link.split('/')[-1]
         item['avatar'] = get_twitter_avatar(screen_name)
         yield item
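
For context on the namespace change: rdflib.Namespace supports attribute access that returns ready-made URIRef objects, which is what lets this commit drop the four manual URIRef(...) concatenations in parse_blogpost_sioc. A minimal sketch (not part of the commit) of the equivalence:

import rdflib

SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')

# Attribute access on a Namespace concatenates and wraps in a URIRef,
# so SIOC.BlogPost is the same term the old code built by hand:
assert SIOC.BlogPost == rdflib.URIRef('http://rdfs.org/sioc/ns#' + 'BlogPost')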
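A rough usage sketch for the new spy filters, assuming is_img_spy and IMG_SPIES from the diff above are in scope; the sample HTML is invented:

from lxml.html import fromstring, tostring

html = ('<div><p>Real content</p>'
        '<img src="http://feeds.feedburner.com/~r/foo/~4/abc"/></div>')
root = fromstring(html)
for img in root.findall('.//img'):
    if is_img_spy(img):  # src matches the feedburner prefix in IMG_SPIES
        img.drop_tree()
print tostring(root)  # the tracking pixel is gone, the paragraph remains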
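The author lookup added to parse_blogpost_rss prefers entry-level metadata, then feed-level author details, then the feed image link, and finally falls back to the feed URL. The same cascade factored into a helper, as a sketch only (the name guess_author is invented, not in the commit):

def guess_author(entry, feed, default):
    # entry-level author URI beats feed-level metadata
    if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
        return entry.author_detail.href
    if hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
        return feed.author_detail.href
    if hasattr(feed, 'author'):
        return feed.author
    if hasattr(feed, 'image') and hasattr(feed.image, 'link'):
        return feed.image.link
    return default  # e.g. the feed URL passed by the caller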