diff --git a/entities.py b/entities.py
index 4acad691ab732c5cedca2e5e4da6951f69bf1719_ZW50aXRpZXMucHk=..4c9a9b321087f7d978c42085e496a715c36a4f36_ZW50aXRpZXMucHk= 100644
--- a/entities.py
+++ b/entities.py
@@ -27,7 +27,7 @@
                         vtitle=self.entity.dc_title())
 
 class BlogISiocContainerAdapter(EntityAdapter):
-    __regid__ = 'ISiocContainer'
+    __regid__ = 'ISIOCContainer'
     __select__ = EntityAdapter.__select__ & is_instance('Blog')
 
     def isioc_type(self):
@@ -84,7 +84,7 @@
 
 
 class BlogEntryISiocItemAdapter(EntityAdapter):
-    __regid__ = 'ISiocItem'
+    __regid__ = 'ISIOCItem'
     __select__ = EntityAdapter.__select__ & is_instance('BlogEntry')
 
     def isioc_content(self):
diff --git a/site_cubicweb.py b/site_cubicweb.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c9a9b321087f7d978c42085e496a715c36a4f36_c2l0ZV9jdWJpY3dlYi5weQ==
--- /dev/null
+++ b/site_cubicweb.py
@@ -0,0 +1,6 @@
+# XML <-> yams equivalence
+from cubicweb.xy import xy
+xy.add_equivalence('Blog', 'sioc:Weblog')
+xy.add_equivalence('BlogEntry', 'sioc:BlogPost')
+xy.add_equivalence('BlogEntry title', 'dcterms:title')
+xy.add_equivalence('BlogEntry content', 'sioc:content')
diff --git a/sobjects.py b/sobjects.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c9a9b321087f7d978c42085e496a715c36a4f36_c29iamVjdHMucHk=
--- /dev/null
+++ b/sobjects.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+"""Datafeed parsers importing blog posts from SIOC (RDF) or RSS feeds."""
+
+import sys
+from datetime import datetime
+from lxml import etree
+import feedparser
+import rdflib
+
+from cubes.datafeed.sobjects import DataFeedParser
+
+SIOC = 'http://rdfs.org/sioc/ns#'
+RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+DCTERMS = 'http://purl.org/dc/terms/'
+
+def _single(values, what):
+    """Return the only item of `values` or raise ValueError."""
+    values = list(values)
+    if len(values) != 1:
+        raise ValueError('expected exactly one %s, got %d'
+                         % (what, len(values)))
+    return values[0]
+
+def get_subject(g, pred, obj):
+    """Return the single subject of graph `g` matching (?, pred, obj).
+
+    Raise ValueError if there is no such subject or more than one.
+    """
+    return _single(g.subjects(pred, obj), 'subject')
+
+def get_object(g, subj, pred):
+    """Return the single object of graph `g` matching (subj, pred, ?).
+
+    Raise ValueError if there is no such object or more than one.
+    """
+    return _single(g.objects(subj, pred), 'object')
+
+def parse_blogpost_sioc(url):
+    """Yield a dict for each sioc:BlogPost found in the RDF parsed from `url`.
+
+    Each dict holds the unicode 'uri', 'title' (dcterms:title) and 'content'
+    (sioc:content) of a post.
+    """
+    g = rdflib.ConjunctiveGraph()
+    g.parse(url)
+    rdf_type = rdflib.URIRef(RDF+'type')
+    sioc_blogpost = rdflib.URIRef(SIOC+'BlogPost')
+    dcterms_title = rdflib.URIRef(DCTERMS+'title')
+    sioc_content = rdflib.URIRef(SIOC+'content')
+    for post in g.subjects(rdf_type, sioc_blogpost):
+        item = {'uri': unicode(post)}
+        item['title'] = unicode(get_object(g, post, dcterms_title))
+        item['content'] = unicode(get_object(g, post, sioc_content))
+        yield item
+
+def parse_blogpost_rss(url):
+    """Yield a dict for each entry of the feed that feedparser reads from `url`.
+
+    Each dict holds the 'uri', 'title' and 'content' of an entry, plus its
+    'creation_date' when the entry carries a date feedparser could parse.
+    """
+    feed = feedparser.parse(url)
+    for entry in feed.entries:
+        item = {}
+        item['uri'] = entry.id
+        item['title'] = entry.title
+        item['content'] = entry.description
+        # an entry with a missing or unparseable date has no usable
+        # date_parsed: leave creation_date out instead of crashing
+        parsed_date = entry.get('date_parsed')
+        if parsed_date:
+            item['creation_date'] = datetime(*parsed_date[:6])
+        yield item
+
+class BlogPostParser(DataFeedParser):
+    """Abstract datafeed parser storing parsed blog posts as BlogEntry entities.
+
+    Concrete classes provide `parse(url)`, yielding dicts with a 'uri' key plus
+    BlogEntry attribute values.
+    """
+    __abstract__ = True
+
+    def process(self, url):
+        """Create or update a BlogEntry for each item parsed from `url`.
+
+        Write '.' to stdout for an updated entry, '+' for a created one.
+        """
+        for item in self.parse(url):
+            euri = self.sget_externaluri(item.pop('uri'))
+            if euri.same_as:
+                sys.stdout.write('.')
+                self.update_blogpost(euri.same_as[0], item)
+            else:
+                sys.stdout.write('+')
+                self.create_blogpost(item, euri)
+            sys.stdout.flush()
+
+    def create_blogpost(self, item, uri):
+        """Create a BlogEntry from `item`, link it to `uri` and return it."""
+        entity = self._cw.create_entity('BlogEntry', **item)
+        entity.set_relations(same_as=uri)
+        return self.update_blogpost(entity, None)
+
+    def update_blogpost(self, entity, item):
+        """Set `item` values, if any, as attributes of `entity`; return it."""
+        if item:
+            entity.set_attributes(**item)
+        return entity
+
+class BlogPostSiocParser(BlogPostParser):
+    """Blog post parser reading SIOC RDF data."""
+    __regid__ = 'blogpost-sioc'
+    parse = staticmethod(parse_blogpost_sioc)
+
+class BlogPostRSSParser(BlogPostParser):
+    """Blog post parser reading feeds through feedparser."""
+    __regid__ = 'blogpost-rss'
+    parse = staticmethod(parse_blogpost_rss)
+
+if __name__ == '__main__':
+    # debugging helper: print the items a parse function extracts from a url
+    from pprint import pprint
+
+    parsers = {'parse_blogpost_sioc': parse_blogpost_sioc,
+               'parse_blogpost_rss': parse_blogpost_rss}
+    if len(sys.argv) != 3 or sys.argv[1] not in parsers:
+        sys.exit('usage: %s (%s) <url>'
+                 % (sys.argv[0], '|'.join(sorted(parsers))))
+    name, url = sys.argv[1:]
+
+    parser = parsers[name]
+    pprint(list(parser(url)))
+
diff --git a/update-feeds.py b/update-feeds.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c9a9b321087f7d978c42085e496a715c36a4f36_dXBkYXRlLWZlZWRzLnB5
--- /dev/null
+++ b/update-feeds.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+# Pull data for every DataFeed entity, or clear their latest_retrieval
+# when 'reset' is given on the command line.
+# NOTE(review): rql() and commit() are not defined in this file; presumably
+# the script is meant to be run by the cubicweb shell -- confirm.
+import sys
+
+feeds = rql('Any A WHERE A is DataFeed').entities()
+for feed in feeds:
+    if 'reset' in sys.argv:
+        feed.set_attributes(latest_retrieval=None)
+    else:
+        print '----- processing %r with %s' % (feed.title, feed.parser)
+        feed.pull_data()
+        print
+commit()
+