# -*- coding: utf-8 -*-
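"""Feed parsers turning blog and microblog sources into item dictionaries.

parse_blogpost_sioc() reads SIOC data through rdflib, while the RSS parsers
rely on feedparser; when the cubicweb_datafeed cube is importable, the
DataFeedParser subclasses below create or update BlogEntry / MicroBlogEntry
entities from the parsed items.
"""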
from __future__ import print_function

from datetime import datetime
from six import text_type as unicode

from lxml.html import fromstring, tostring

try:
    import feedparser
except ImportError:
    feedparser = None

try:
    import rdflib
except ImportError:
    rdflib = None
else:
    RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
    SIOC = rdflib.Namespace('http://rdfs.org/sioc/ns#')
    DCTERMS = rdflib.Namespace('http://purl.org/dc/terms/')

try:
    from cubicweb_datafeed.sobjects import DataFeedParser
except ImportError:
    DataFeedParser = None


def get_subject(g, pred, obj):
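    """Return the single subject s such that (s, pred, obj) is in graph g."""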
    subjects = list(g.subjects(pred, obj))
    assert len(subjects) == 1
    return subjects[0]


def get_object(g, subj, pred):
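    """Return the single object o such that (subj, pred, o) is in graph g."""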
    objects = list(g.objects(subj, pred))
    assert len(objects) == 1
    return objects[0]


def parse_blogpost_sioc(url):
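    """Yield {'uri', 'title', 'content'} dicts for each sioc:BlogPost at url (requires rdflib)."""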
    g = rdflib.ConjunctiveGraph()
    g.parse(url)
    for post, type_, blogpost_ in g.triples((None, RDF.type, SIOC.BlogPost)):
        item = {'uri': unicode(post)}
        item['title'] = unicode(get_object(g, post, DCTERMS.title))
        item['content'] = unicode(get_object(g, post, SIOC.content))
        yield item


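# feed entry MIME types kept as-is; anything else falls back to text/plain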
format_map = {'application/xhtml+xml': u'text/html',
              'text/html': u'text/html',
              'text/plain': u'text/plain',
              }

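# URL prefixes of known tracking images ("web bugs") stripped from entry content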
IMG_SPIES = ['http://feeds.feedburner.com',
             'http://creatives.commindo-media',
             'http://imp.constantcontact.com',
             'https://blogger.googleusercontent.com/tracker',
             'http://stats.wordpress.com/',
             ]


def is_img_spy(node):
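    """Return True if node is an <img> whose src points at a known tracker."""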
    if node.tag != 'img':
        return False
    src = node.get('src') or ''  # guard against <img> tags without a src
    for url in IMG_SPIES:
        if src.startswith(url):
            return True
    return False


def is_tweetmeme_spy(node):
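    """Return True if node is a link to the tweetmeme share API."""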
    href = node.get('href')
    if href and href.startswith('http://api.tweetmeme.com/share'):
        return True
    return False


def remove_content_spies(content):
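    """Return the HTML snippet content with known tracking images and links removed."""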
    root = fromstring(content)
    if is_img_spy(root):
        return u''
    for img in root.findall('.//img'):
        if is_img_spy(img):
            img.drop_tree()
        elif img.get('height') == '1' and img.get('width') == '1':
            print(tostring(img), 'is probably a spy')
    for anchor in root.findall('.//a'):
        if is_tweetmeme_spy(anchor):
            anchor.drop_tree()
    # encoding='unicode' makes lxml return text rather than bytes
    return tostring(root, encoding='unicode')


def parse_blogpost_rss(url):
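    """Yield one item dict per entry of the RSS/Atom feed at url (requires feedparser)."""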
    data = feedparser.parse(url)
    feed = data.feed
    for entry in data.entries:
        item = {}
        if 'feedburner_origlink' in entry:
            item['uri'] = entry.feedburner_origlink
        else:
            item['uri'] = entry.link
        item['title'] = entry.title
        if hasattr(entry, 'content'):
            content = entry.content[0].value
            mimetype = entry.content[0].type
        elif hasattr(entry, 'summary_detail'):
            content = entry.summary_detail.value
            mimetype = entry.summary_detail.type
        else:
            content = u''  # XXX entry.description?
            mimetype = u'text/plain'
        if mimetype == u'text/html':
            content = remove_content_spies(content)
        item['content'] = content
        item['content_format'] = format_map.get(mimetype, u'text/plain')
        if hasattr(entry, 'date_parsed'):
            item['creation_date'] = datetime(*entry.date_parsed[:6])
        if hasattr(entry, 'author_detail') and hasattr(entry.author_detail, 'href'):
            item['author'] = entry.author_detail.href
        elif hasattr(feed, 'author_detail') and hasattr(feed.author_detail, 'href'):
            item['author'] = feed.author_detail.href
        elif hasattr(feed, 'author'):
            item['author'] = feed.author
        elif hasattr(feed, 'image') and hasattr(feed.image, 'link'):
            item['author'] = feed.image.link
        else:
            item['author'] = url
        item['cwuri'] = feed.link
        yield item


def parse_microblogpost_rss(url):
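    """Yield one item dict per entry of a microblog RSS feed (tailored to twitter)."""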
    feed = feedparser.parse(url)
    for entry in feed.entries:
        item = {}
        item['uri'] = entry.id
        # fix weird parsing: twitter serializes entries as "username: message"
        if hasattr(entry, 'content'):
            content = entry.content[0].value
            mimetype = entry.content[0].type
        else:
            content = entry.description
            mimetype = u'text/plain'
        if ': ' in content:
            author, text = content.split(': ', 1)
            if ' ' not in author:
                content = text
        item['content'] = content
        item['content_format'] = format_map.get(mimetype, u'text/plain')
        item['creation_date'] = datetime(*entry.date_parsed[:6])
        item['modification_date'] = datetime(*entry.date_parsed[:6])
        item['author'] = feed.channel.link  # true for twitter
        item['cwuri'] = feed.channel.link
        for link in entry.links:
            if link.type.startswith('image/') and link.rel == 'image':
                item['avatar'] = link.href
                break
        else:
            screen_name = feed.channel.link.split('/')[-1]
            item['avatar'] = get_twitter_avatar(screen_name)
        yield item


def search_twitter(word):
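    """Search tweets matching word through the legacy search.twitter.com JSON API."""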
    from six.moves.urllib.request import urlopen
    from simplejson import loads
    data = urlopen(
        'http://search.twitter.com/search.json?q=%s&rpp=100' % word).read()
    # the legacy search API wrapped matching tweets in a 'results' list
    return loads(data).get('results', [])


AVATAR_CACHE = {}


def get_twitter_avatar(screen_name):
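    """Return the profile image URL of a twitter screen_name, with caching."""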
    if screen_name not in AVATAR_CACHE:
        from six.moves.urllib.request import urlopen
        import simplejson
        data = urlopen(
            'http://api.twitter.com/1/users/show.json?screen_name=%s' % screen_name).read()
        user = simplejson.loads(data)
        AVATAR_CACHE[screen_name] = user['profile_image_url']
    return AVATAR_CACHE[screen_name]


if DataFeedParser is not None:
    class BlogPostParser(DataFeedParser):
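        """Abstract datafeed parser creating or updating blog entries from feed items."""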
        __abstract__ = True
        entity_type = 'BlogEntry'

        def process(self, url):
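            """Parse url and create or update one entity per item.

            Returns a {'creation': n, 'update': m} stats dict.
            """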
            stats = {'update': 0, 'creation': 0}
            for item in self.parse(url):
                author = item.pop('author', None)
                avatar = item.pop('avatar', None)
                euri = self.sget_entity('ExternalUri', uri=item.pop('uri'))
                if euri.same_as:
                    # sys.stdout.write('.')
                    stats['update'] += 1
                    post = self.update_blogpost(euri.same_as[0], item)
                else:
                    # sys.stdout.write('+')
                    stats['creation'] += 1
                    post = self.create_blogpost(item, euri)
                if author:
                    account = self.sget_entity('UserAccount', name=author)
                    self.sget_relation(post.eid, 'has_creator', account.eid)
                    if avatar:
                        auri = self.sget_entity('ExternalUri', uri=avatar)
                        self.sget_relation(account.eid, 'has_avatar', auri.eid)
                # sys.stdout.flush()
            return stats

        def create_blogpost(self, item, uri):
            entity = self._cw.create_entity(self.entity_type, **item)
            entity.set_relations(same_as=uri)
            return entity

        def update_blogpost(self, entity, item):
            entity.cw_set(**item)
            return entity

    if rdflib is not None:
        class BlogPostSiocParser(BlogPostParser):
            __regid__ = 'blogpost-sioc'
            parse = staticmethod(parse_blogpost_sioc)

    if feedparser is not None:
        class BlogPostRSSParser(BlogPostParser):
            __regid__ = 'blogpost-rss'
            parse = staticmethod(parse_blogpost_rss)

        class MicroBlogPostRSSParser(BlogPostParser):
            __regid__ = 'microblogpost-rss'
            entity_type = 'MicroBlogEntry'
            parse = staticmethod(parse_microblogpost_rss)


if __name__ == '__main__':
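    # quick manual test: python sobjects.py <parser name> <feed url>,
    # e.g. parser name parse_blogpost_rss or parse_microblogpost_rss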
    import sys
    from pprint import pprint

    name = sys.argv[1]
    url = sys.argv[2]

    parser = globals()[name]
    pprint(list(parser(url)))