Commit 16e94b6c authored by Arthur Lutz
Browse files

added options and a bit of refactoring

parent 835221f718ce
# -*- coding: utf-8 -*-
"""update-webstats cubicweb plugin
Usage: cubicweb-ctl update-webstats [options] <instance-name>
Usage: cubicweb-ctl update-webstats [options] <instance-name> startdate [stopdate]
This command will generate webstats objects for all linked document types.
"""
from datetime import datetime
from logilab.common.date import first_day, last_day
from datetime import datetime, timedelta
from logilab.common.date import first_day, last_day, date_range
from logilab.common.shellutils import ProgressBar
from cubicweb import cwconfig, UnknownEid
from cubicweb import AuthenticationError
......@@ -16,7 +17,8 @@ from cubicweb.dbapi import in_memory_repo_cnx
from cubicweb.toolsutils import Command
from cubicweb.cwctl import CWCTL
from utils import SECTIONSPEC, extract_stats_dict, eid_from_url
from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
get_or_create_statperiod, compress_old_hits
def url_count_from_stats(stats_dict):
'''
......@@ -43,32 +45,40 @@ def url_count_from_stats(stats_dict):
class UpdateWebstatsCommand(Command):
""" Update web stats
""" Update cubicweb web stats from awstats processed files.
according to periodicity setting the input format for the date is different :
If startdate is not entered, the update will be done on the current
day or current month. If only startdate is entered, that day or
month will be processed. If both dates are entered, all the dates
between these two dates will be processed.
According to periodicity setting the input format for the date is
different :
* month 05/2011
* day 15/05/2011
* hour 15/05/2011-13h
* hour 15/05/2011-13h (not implemented yet)
"""
arguments = '<instance>'
arguments = '<instance> [startdate [stopdate]]'
name = 'update-webstats'
min_args = 1
def get_current_stats_period(self, session, chosendate, periodicity):
max_args = 3
options = [
("skip-compress", {"action": 'store_true',
'help' : u'Skip the compression of old daily hits into month stats'}),
# ('dry-run', {'action': 'store_true',
# 'help': "do not actually insert data in the database",
# }),
]
def get_current_stats_period(self, session, chosendate):
""" return the statperiod covering chosendate; if it doesn't exist, create it """
start, end = self.choose_period(chosendate, periodicity)
rql = 'Any P WHERE P is StatPeriod, P start "%(start_date)s", P stop "%(end_date)s"'
rset = session.execute(rql %
{'start_date':start,
'end_date':end})
if rset:
return rset.get_entity(0, 0)
else:
return session.create_entity('StatPeriod', start=start, stop=end)
start, stop = self.choose_period(session, chosendate)
return get_or_create_statperiod(session, start, stop)
def choose_period(self, chosendate, periodicity):
def choose_period(self, session, chosendate):
periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
if periodicity == 'month':
start = first_day(chosendate)
end = last_day(start)
......@@ -88,76 +98,98 @@ class UpdateWebstatsCommand(Command):
def update_stats(self, session, args):
''' parses awstats and creates or updates the corresponding
data in the cubicweb instance'''
awstatsdir = session.vreg.config.get('awstats-dir', '/var/lib/awstats')
domain = session.vreg.config.get('awstats-domain', '')
periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
assert periodicity in ('hour', 'day', 'month')
start = stop = None
if args:
# FIXME - adapt according to periodicity
input_format = {'month':'%m/%Y',
'day': '%d/%m/%Y',
'hour': '%d/%m/%Y-%Hh'}[periodicity]
chosendate = datetime.strptime(args[0], input_format)
# TODO - probably need a command to update stats from day X to day Y...
start = datetime.strptime(args[0], input_format)
if len(args) > 1:
stop = datetime.strptime(args[1], input_format)
else:
chosendate = datetime.now()
stats_period = self.get_current_stats_period(session, chosendate, periodicity)
dateformat_in_file = self.choose_dateformat(periodicity)
filename = 'awstats%s%s.txt' % (chosendate.strftime(dateformat_in_file), domain and '.%s' % domain)
stats_dict = extract_stats_dict(awstatsdir, filename)
normal_dict, rdf_dict = url_count_from_stats(stats_dict)
is_rdf = False
rql = 'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N'
rset = session.execute(rql)
allowed_types = [item[0] for item in rset]
start = stop = datetime.now()
if stop is None:
stop = start
update_stats = {'updated':0,
'created':0,
'exists no change':0,
'skipped':0,
'periods':0,
'compressed':0
}
for count_dict, is_rdf in ((normal_dict, False),
(rdf_dict, True)):
for eid, values in count_dict.items():
visit_count = visit_count_rdf = 0
total_hits = sum([item[0] for item in values])
try:
entity = session.entity_from_eid(eid)
except UnknownEid:
update_stats['skipped'] += 1
continue
if not entity.__regid__ in allowed_types:
update_stats['skipped'] += 1
continue
rql = 'Any X,V WHERE X is Hits, X count V, X hit_type "%(hit_type)s",' \
'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s'
rset = session.execute(rql % {'e':eid,
'sp':stats_period.eid,
'hit_type': is_rdf and 'rdf' or 'normal'})
if rset:
if rset[0][1] != total_hits:
update_stats['updated'] += 1
session.execute('SET X count %(hits)s WHERE X eid %(e)s' %
{'e':rset[0][0],
'hits':total_hits})
else:
update_stats['exists no change'] += 1
else:
update_stats['created'] += 1
session.create_entity('Hits', count = total_hits,
period=stats_period,
stats_about = entity,
hit_type=is_rdf and u'rdf' or u'normal')
pb = ProgressBar(((stop+timedelta(days=1))-start).days, 70, title='Import')
for chosendate in date_range(start, stop+timedelta(days=1)):
#self.update_stats_for_date(session, chosendate, update_stats)
pb.update()
pb.finish()
if not self.config.skip_compress:
compress_old_hits(session, update_stats)
print '''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
''' % update_stats
def update_stats_for_date(self, session, chosendate, update_stats):
    """Import the awstats figures of ``chosendate`` into the instance.

    Fetches (or creates) the stat period for ``chosendate``, bumps the
    'periods' counter of ``update_stats``, reads the awstats file whose
    name is built from the date and the optional configured domain, and
    forwards every (eid, values) pair of both the normal and the rdf
    counts to ``update_hits_for_eid``.
    """
    stats_period = self.get_current_stats_period(session, chosendate)
    update_stats['periods'] += 1

    settings = session.vreg.config
    periodicity = settings.get('awstats-periodicity', 'day') #FIXME s/day/month/
    file_dateformat = self.choose_dateformat(periodicity)
    domain = settings.get('awstats-domain', '')
    date_stamp = chosendate.strftime(file_dateformat)
    # an empty domain contributes no suffix to the file name
    domain_suffix = domain and '.%s' % domain
    filename = 'awstats%s%s.txt' % (date_stamp, domain_suffix)

    awstatsdir = settings.get('awstats-dir', '/var/lib/awstats')
    normal_dict, rdf_dict = url_count_from_stats(
        extract_stats_dict(awstatsdir, filename))

    # names of the entity types that may be the target of stats_about
    type_rows = session.execute(
        'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N')
    allowed_types = [row[0] for row in type_rows]

    hit_sources = ((False, normal_dict), (True, rdf_dict))
    for is_rdf, count_dict in hit_sources:
        for eid, values in count_dict.items():
            self.update_hits_for_eid(eid, values, session, update_stats,
                                     allowed_types, stats_period, is_rdf)
def update_hits_for_eid(self, eid, values, session, update_stats,
                        allowed_types, stats_period, is_rdf):
    """Create or refresh the Hits entity of one entity for one period.

    ``values`` is an iterable of sequences whose first item is a hit
    count; those counts are summed to get the total for ``eid``.

    Exactly one counter of ``update_stats`` is incremented in place:

    * 'skipped' when ``eid`` is unknown or the entity type is not listed
      in ``allowed_types``;
    * 'updated' when a Hits entity exists with a different count;
    * 'exists no change' when a Hits entity exists with the same count;
    * 'created' when no Hits entity existed and one is created.

    ``is_rdf`` selects the 'rdf' hit type instead of 'normal'.
    Nothing is returned.
    """
    total_hits = sum(item[0] for item in values)
    try:
        entity = session.entity_from_eid(eid)
    except UnknownEid:
        update_stats['skipped'] += 1
        return
    if entity.__regid__ not in allowed_types:
        update_stats['skipped'] += 1
        return
    hit_type = u'rdf' if is_rdf else u'normal'
    # Values are handed to execute() as query arguments rather than being
    # interpolated into the RQL text, so no value can alter the query.
    # The eid of the fetched entity is used for the lookup.
    rset = session.execute(
        'Any X,V WHERE X is Hits, X count V, X hit_type %(hit_type)s,'
        'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s',
        {'e': entity.eid, 'sp': stats_period.eid, 'hit_type': hit_type})
    if not rset:
        update_stats['created'] += 1
        session.create_entity('Hits', count=total_hits,
                              period=stats_period,
                              stats_about=entity,
                              hit_type=hit_type)
        return
    hits_eid, stored_count = rset[0][0], rset[0][1]
    if stored_count == total_hits:
        update_stats['exists no change'] += 1
        return
    update_stats['updated'] += 1
    session.execute('SET X count %(hits)s WHERE X eid %(e)s',
                    {'e': hits_eid, 'hits': total_hits})
def _init_cw_connection(self, appid):
self.config = config = cwconfig.instance_configuration(appid)
config = cwconfig.instance_configuration(appid)
sourcescfg = config.sources()
config.set_sources_mode(('system',))
cnx = repo = None
......@@ -205,6 +237,5 @@ Number of stat objects skipped : %(skipped)s
self.update_stats(session, args)
session.commit()
CWCTL.register(UpdateWebstatsCommand)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment