# -*- coding: utf-8 -*-
"""update-webstats cubicweb plugin

Usage:
    cubicweb-ctl update-webstats [options] <instance> [startdate [stopdate]]

This command will generate webstats objects for all linked document types.
"""

from datetime import datetime, timedelta

from logilab.common.date import first_day, last_day, date_range
from logilab.common.shellutils import ProgressBar

from cubicweb import cwconfig, UnknownEid
from cubicweb import AuthenticationError
from cubicweb.dbapi import in_memory_repo_cnx
from cubicweb.toolsutils import Command
from cubicweb.cwctl import CWCTL
# interactive prompt for admin credentials when they are not in sources
from cubicweb.server.utils import manager_userpasswd

from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
     get_or_create_statperiod, compress_old_hits


def url_count_from_stats(stats_dict):
    '''parse the most visited urls in stats_dict generated from an awstats
    text file

    Return two dictionaries keyed on eid, each value being a sequence of
    (hits, url) tuples: one for normal navigation, the other for rdf
    navigation.
    '''
    if 'SIDER' not in stats_dict:
        return {}, {}
    visit_count_dict = {}
    visit_count_rdf_dict = {}
    for item in stats_dict['SIDER'].values():
        url = item[SECTIONSPEC['SIDER'][0]]
        hits = int(item[SECTIONSPEC['SIDER'][1]])
        eid = eid_from_url(url)
        if not eid:
            continue
        if 'rdf' in url:
            visit_count_rdf_dict.setdefault(eid, []).append((hits, url))
        else:
            visit_count_dict.setdefault(eid, []).append((hits, url))
    return visit_count_dict, visit_count_rdf_dict


class UpdateWebstatsCommand(Command):
    """Update cubicweb web stats from awstats processed files.

    If startdate is not given, the update is done for the current day or
    month. If only startdate is given, that day or month is processed. If
    both dates are given, all dates between them are processed.

    Depending on the periodicity setting, the expected date format differs:

    * month 05/2011
    * day 15/05/2011
    * hour 15/05/2011-13h (not implemented yet)
    """
    arguments = '<instance> [startdate [stopdate]]'
    name = 'update-webstats'
    min_args = 1
    max_args = 3
    options = [
        ('skip-compress',
         {'action': 'store_true',
          'help': u'Skip the compression of old daily hits into month stats'}),
        ]

    def get_current_stats_period(self, session, chosendate):
        """return the statperiod covering chosendate, creating it if it
        does not exist yet"""
        start, stop = self.choose_period(session, chosendate)
        return get_or_create_statperiod(session, start, stop)

    def choose_period(self, session, chosendate):
        """return the (start, end) datetimes of the period containing
        chosendate, according to the configured periodicity"""
        periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
        if periodicity == 'month':
            start = first_day(chosendate)
            end = last_day(start)
        elif periodicity == 'day':
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
            end = datetime(chosendate.year, chosendate.month, chosendate.day,
                           23, 59, 59)
        elif periodicity == 'hour':
            start = datetime(chosendate.year, chosendate.month, chosendate.day,
                             chosendate.hour)
            end = datetime(chosendate.year, chosendate.month, chosendate.day,
                           chosendate.hour, 59, 59)
        return start, end

    def choose_dateformat(self, periodicity):
        """return the date format used in awstats file names for the given
        periodicity"""
        return {'hour': '%m%Y%d%H',
                'day': '%m%Y%d',
                'month': '%m%Y'}[periodicity]

    def update_stats(self, session, args):
        '''parse awstats files and create or update the corresponding data
        in the cubicweb instance'''
        periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
        assert periodicity in ('hour', 'day', 'month')
        start = stop = None
        if args:
            # FIXME - adapt according to periodicity
            input_format = {'month': '%m/%Y',
                            'day': '%d/%m/%Y',
                            'hour': '%d/%m/%Y-%Hh'}[periodicity]
            try:
                start = datetime.strptime(args[0], input_format)
            except ValueError:
                print 'Error : %s is not a proper date' % args[0]
                return
            if len(args) > 1:
                try:
                    stop = datetime.strptime(args[1], input_format)
                except ValueError:
                    print 'Error : %s is not a proper date' % args[1]
                    return
        else:
            start = stop = datetime.now()
        if stop is None:
            stop = start
        update_stats = {'updated': 0,
                        'created': 0,
                        'exists no change': 0,
                        'skipped': 0,
                        'periods': 0,
                        'compressed': 0}
        pb = ProgressBar(((stop + timedelta(days=1)) - start).days, 70,
                         title='Import')
        for chosendate in date_range(start, stop + timedelta(days=1)):
            self.update_stats_for_date(session, chosendate, update_stats)
            pb.update()
        pb.finish()
        if not self.config.skip_compress:
            compress_old_hits(session, update_stats)
        print '''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
''' % update_stats

    def update_stats_for_date(self, session, chosendate, update_stats):
        stats_period = self.get_current_stats_period(session, chosendate)
        periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
        dateformat_in_file = self.choose_dateformat(periodicity)
        domain = session.vreg.config.get('awstats-domain', '')
        filename = 'awstats%s%s.txt' % (chosendate.strftime(dateformat_in_file),
                                        domain and '.%s' % domain)
        awstatsdir = session.vreg.config.get('awstats-dir', '/var/lib/awstats')
        stats_dict = extract_stats_dict(awstatsdir, filename)
        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
        # only entity types targeted by the "stats_about" relation may get hits
        rql = 'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N'
        rset = session.execute(rql)
        allowed_types = [item[0] for item in rset]
        for count_dict, is_rdf in ((normal_dict, False), (rdf_dict, True)):
            for eid, values in count_dict.items():
                self.update_hits_for_eid(eid, values, session, update_stats,
                                         allowed_types, stats_period, is_rdf)

    def update_hits_for_eid(self, eid, values, session, update_stats,
                            allowed_types, stats_period, is_rdf):
        total_hits = sum(item[0] for item in values)
        try:
            entity = session.entity_from_eid(eid)
        except UnknownEid:
            update_stats['skipped'] += 1
            return
        if entity.__regid__ not in allowed_types:
            update_stats['skipped'] += 1
            return
        rql = ('Any X,V WHERE X is Hits, X count V, X hit_type "%(hit_type)s",'
               'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s')
        rset = session.execute(rql % {'e': eid,
                                      'sp': stats_period.eid,
                                      'hit_type': is_rdf and 'rdf' or 'normal'})
        if rset:
            if rset[0][1] != total_hits:
                update_stats['updated'] += 1
                session.execute('SET X count %(hits)s WHERE X eid %(e)s'
                                % {'e': rset[0][0], 'hits': total_hits})
            else:
                update_stats['exists no change'] += 1
        else:
            update_stats['created'] += 1
            session.create_entity('Hits', count=total_hits,
                                  period=stats_period,
                                  stats_about=entity,
                                  hit_type=is_rdf and u'rdf' or u'normal')

    def _init_cw_connection(self, appid):
        config = cwconfig.instance_configuration(appid)
        sourcescfg = config.sources()
        config.set_sources_mode(('system',))
        cnx = repo = None
        while cnx is None:
            try:
                login = sourcescfg['admin']['login']
                pwd = sourcescfg['admin']['password']
            except KeyError:
                login, pwd = manager_userpasswd()
            try:
                repo, cnx = in_memory_repo_cnx(config, login=login, password=pwd)
            except AuthenticationError:
                print 'wrong user/password'
            else:
                break
        session = repo._get_session(cnx.sessionid)
        # XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
        # (file a ticket)
        return cnx, session

    def main_run(self, args, rcfile=None):
        """Run the command and return status 0 if everything went fine.

        If :exc:`CommandError` is raised by the underlying command, simply
        log the error and return status 2.

        Any other exception, including :exc:`BadCommandUsage`, will be
        propagated.
        """
        # XXX (adim): rcfile handling is spectacularly messy but I can't
        # get it right without refactoring pivotdoc for now
        if rcfile is None:
            if '-c' in args:
                rcfile = args[args.index('-c') + 1]
            elif '--config' in args:
                rcfile = args[args.index('--config') + 1]
            else:
                rcfile = None  # self.config.config
        return Command.main_run(self, args, rcfile)

    def run(self, args):
        appid = args.pop(0)
        cw_cnx, session = self._init_cw_connection(appid)
        session.set_cnxset()
        self.update_stats(session, args)
        session.commit()


CWCTL.register(UpdateWebstatsCommand)