Commit 8821f180 authored by Arthur Lutz's avatar Arthur Lutz
Browse files

imported patch proposed_refactoring.diff

parent a90975d04742
......@@ -6,8 +6,10 @@ Usage: cubicweb-ctl update-webstats [options] <instance-name> startdate [stopdat
This command will generate webstats objects for all linked document types.
"""
import os.path as osp
import sys

from datetime import datetime, timedelta

# NOTE: the name is ONEDAY (the commit's ONDEDAY is a typo; the code
# below uses `stop += ONEDAY`)
from logilab.common.date import first_day, last_day, date_range, ONEDAY
from logilab.common.shellutils import ProgressBar

from cubicweb import cwconfig, UnknownEid
......@@ -20,6 +22,7 @@ from cubicweb.cwctl import CWCTL
from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
get_or_create_statperiod, compress_old_hits
def url_count_from_stats(stats_dict):
'''
parse most visited urls in stats_dict generated from awstats txt file
......@@ -27,7 +30,7 @@ def url_count_from_stats(stats_dict):
returns two dictionnaries with eid as key and sequence of values as value
one for normal navigation, the other for rdf navigation
'''
if 'SIDER' not in stats_dict.keys():
if 'SIDER' not in stats_dict:
return {}, {}
visit_count_dict = {}
visit_count_rdf_dict = {}
......@@ -46,6 +49,127 @@ def url_count_from_stats(stats_dict):
return visit_count_dict, visit_count_rdf_dict
def parse_input_date(date, periodicity):
    """Parse a user-supplied date string according to `periodicity`.

    Expected formats: month -> mm/YYYY, day -> dd/mm/YYYY,
    hour -> dd/mm/YYYY-HHh.

    Returns a datetime, or None (after printing an error message) when
    the string does not match the expected format.
    """
    fmt = {'month': '%m/%Y',
           'day': '%d/%m/%Y',
           'hour': '%d/%m/%Y-%Hh'}[periodicity]
    try:
        return datetime.strptime(date, fmt)
    except ValueError:
        # parenthesized print works identically under python 2 and 3 here
        print('Error : %s not a proper date' % date)
        return None
def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
    """Yield every item of `iterable`, updating a console progress bar.

    `nb_ops` must be given when `iterable` does not support the length
    protocol (e.g. a generator).
    """
    nb_ops = nb_ops or len(iterable)
    progress = ProgressBar(nb_ops, size=pb_size, title=pb_title)
    for entry in iterable:
        yield entry
        progress.update()
    progress.finish()
class StatsUpdater(object):
    """Create or update ``Hits`` statistics entities in a CubicWeb
    instance from awstats text files.

    Entry point is :meth:`update_stats`; the other methods are internal.
    """

    def __init__(self, session):
        self.session = session
        self.config = session.vreg.config
        # entity types that may be the object of the stats_about relation
        self.allowed_etypes = frozenset(eschema.type for eschema in
                                        session.vreg.schema.rschema('stats_about').objects())
        # cache of every existing Hits entity, keyed by
        # (stats_about eid, period eid, hit type) so updates avoid one
        # query per (entity, period) pair
        self.all_hits = {}
        hits_rset = session.execute('Any H,HC,HT,E,P,PSA,PSO WHERE '
                                    'H is Hits, H count HC, H hit_type HT, '
                                    'H stats_about E, H period P, P start PSA, P stop PSO')
        for hit in hits_rset.entities():
            hit_key = (hit.stats_about[0].eid, hit.period[0].eid, hit.hit_type)
            self.all_hits[hit_key] = hit

    ## internal utilities #####################################################
    def awstats_filepath(self, date):
        """Return the path of the awstats file for `date`, depending on the
        configured periodicity and optional domain suffix."""
        config = self.config
        date_formats = {'month': '%m%Y',
                        'day': '%m%Y%d',
                        'hour': '%m%Y%d%H'}
        if config['awstats-domain']:
            domain_ext = '.' + config['awstats-domain']
        else:
            domain_ext = ''
        filename = 'awstats%s%s.txt' % (
            date.strftime(date_formats[config['awstats-periodicity']]),
            domain_ext)
        return osp.join(config['awstats-dir'], filename)

    def stats_period_for_date(self, chosendate):
        """Return the StatPeriod entity enclosing `chosendate` (month, day
        or hour depending on configuration), creating it if necessary."""
        periodicity = self.config['awstats-periodicity']
        if periodicity == 'month':
            start = first_day(chosendate)
            stop = last_day(start)
        elif periodicity == 'day':
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, 23, 59, 59)
        elif periodicity == 'hour':
            start = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour, 59, 59)
        return get_or_create_statperiod(self.session, start, stop)

    ## update API #############################################################
    def update_stats(self, start, stop, skip_compress=False):
        '''parses awstats and creates or updates the corresponding
        data in the cubicweb instance

        :param start: period start (included)
        :param stop: period stop (excluded)
        :param skip_compress: when True, do not compress old daily hits
        :return: dict of counters describing what was done
        '''
        # BUGFIX: was `session.set_cnxset()`, a NameError -- the session
        # is only reachable through self here
        self.session.set_cnxset()
        stats_report = dict.fromkeys(('updated', 'created', 'exists no change',
                                      'skipped', 'periods', 'compressed'), 0)
        for chosendate in track_progress(date_range(start, stop), (stop-start).days,
                                         pb_size=70, pb_title='Import'):
            self._update_stats_for_date(chosendate, stats_report)
        if not skip_compress:
            compress_old_hits(self.session, stats_report)
        self.session.commit()
        return stats_report

    def _update_stats_for_date(self, chosendate, stats_report):
        """Import one awstats file (the one covering `chosendate`) and
        accumulate per-status counters into `stats_report`."""
        stats_dict = extract_stats_dict(self.awstats_filepath(chosendate))
        stats_period = self.stats_period_for_date(chosendate)
        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
        for count_dict, hit_type in ((normal_dict, u'normal'),
                                     (rdf_dict, u'rdf')):
            for eid, values in count_dict.items():
                status = self._update_hits_for_eid(eid, values,
                                                   stats_period, hit_type)
                stats_report[status] += 1

    def _update_hits_for_eid(self, eid, values, stats_period, hit_type):
        """Create or update the Hits entity for (`eid`, `stats_period`,
        `hit_type`); return the resulting status keyword."""
        total_hits = sum(item[0] for item in values)
        try:
            entity = self.session.entity_from_eid(eid)
        except UnknownEid:
            return 'skipped'
        if entity.__regid__ not in self.allowed_etypes:
            return 'skipped'
        try:
            hit = self.all_hits[(eid, stats_period.eid, hit_type)]
        except KeyError: # no hit yet, create one
            status = 'created'
            hit = self.session.create_entity('Hits', count=total_hits, hit_type=hit_type,
                                             period=stats_period, stats_about=entity)
            # append it to the cache
            self.all_hits[(eid, stats_period.eid, hit_type)] = hit
        else:
            if hit.count != total_hits:
                status = 'updated'
                hit.set_attributes(count=total_hits)
            else:
                status = 'exists no change'
        return status
class UpdateWebstatsCommand(Command):
""" Update cubicweb web stats from awstats processed files.
......@@ -68,132 +192,10 @@ class UpdateWebstatsCommand(Command):
max_args = 3
options = [
("skip-compress", {"action": 'store_true',
'help' : u'Skip the compression of old daily hits into month stats'}),
'help' : u'Skip the compression of old daily hits into month stats'}),
]
def get_current_stats_period(self, session, chosendate):
    """Return the StatPeriod entity enclosing `chosendate`, creating it
    on demand."""
    period_start, period_stop = self.choose_period(session, chosendate)
    return get_or_create_statperiod(session, period_start, period_stop)
def choose_period(self, session, chosendate):
    """Compute the (start, end) datetime pair bounding `chosendate` for
    the configured awstats periodicity (month, day or hour)."""
    periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
    day = (chosendate.year, chosendate.month, chosendate.day)
    if periodicity == 'month':
        start = first_day(chosendate)
        end = last_day(start)
    elif periodicity == 'day':
        start = datetime(*day)
        end = datetime(*(day + (23, 59, 59)))
    elif periodicity == 'hour':
        start = datetime(*(day + (chosendate.hour,)))
        end = datetime(*(day + (chosendate.hour, 59, 59)))
    return start, end
def choose_dateformat(self, periodicity):
    """Return the strftime pattern used in awstats file names for the
    given periodicity."""
    formats = {'hour': '%m%Y%d%H',
               'day': '%m%Y%d',
               'month': '%m%Y'}
    return formats[periodicity]
# NOTE(review): pre-refactoring implementation, removed by this commit in
# favour of StatsUpdater.update_stats(); kept here as the diff's "-" side.
def update_stats(self, session, args):
''' parses awstats and creates or updates the corresponding
data in the cubicweb instance'''
periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
assert periodicity in ('hour', 'day', 'month')
start = stop = None
if args:
# FIXME - adapt according to periodicity
# command-line date format depends on the configured periodicity
input_format = {'month':'%m/%Y',
'day': '%d/%m/%Y',
'hour': '%d/%m/%Y-%Hh'}[periodicity]
try:
start = datetime.strptime(args[0], input_format)
except ValueError:
print 'Error : %s not a proper date' % args[0]
return
# optional second argument is the end of the period to import
if len(args) > 1:
try:
stop = datetime.strptime(args[1], input_format)
except ValueError:
print 'Error : %s not a proper date' % args[1]
return
else:
# no dates given: import today's stats only
start = stop = datetime.now()
if stop is None:
stop = start
# per-status counters, filled by update_stats_for_date and
# compress_old_hits, then reported below
update_stats = {'updated':0,
'created':0,
'exists no change':0,
'skipped':0,
'periods':0,
'compressed':0
}
# stop day is included, hence the +1 day on both the bar size and range
pb = ProgressBar(((stop+timedelta(days=1))-start).days, 70, title='Import')
for chosendate in date_range(start, stop+timedelta(days=1)):
self.update_stats_for_date(session, chosendate, update_stats)
pb.update()
pb.finish()
if not self.config.skip_compress:
compress_old_hits(session, update_stats)
print '''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
''' % update_stats
# NOTE(review): pre-refactoring implementation, superseded by
# StatsUpdater._update_stats_for_date() in this commit.
def update_stats_for_date(self, session, chosendate, update_stats):
stats_period = self.get_current_stats_period(session, chosendate)
periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
dateformat_in_file = self.choose_dateformat(periodicity)
domain = session.vreg.config.get('awstats-domain', '')
# awstats file name embeds the date and, when configured, the domain
filename = 'awstats%s%s.txt' % (chosendate.strftime(dateformat_in_file), domain and '.%s' % domain)
awstatsdir = session.vreg.config.get('awstats-dir', '/var/lib/awstats')
stats_dict = extract_stats_dict(awstatsdir, filename)
normal_dict, rdf_dict = url_count_from_stats(stats_dict)
is_rdf = False
# entity types that may be linked through the stats_about relation
rql = 'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N'
rset = session.execute(rql)
allowed_types = [item[0] for item in rset]
# process plain navigation hits, then rdf navigation hits
for count_dict, is_rdf in ((normal_dict, False),
(rdf_dict, True)):
for eid, values in count_dict.items():
self.update_hits_for_eid(eid, values, session, update_stats,
allowed_types, stats_period, is_rdf)
# NOTE(review): pre-refactoring implementation, superseded by
# StatsUpdater._update_hits_for_eid() in this commit.
def update_hits_for_eid(self, eid, values, session, update_stats,
allowed_types, stats_period, is_rdf):
# NOTE(review): visit_count / visit_count_rdf are never used below
visit_count = visit_count_rdf = 0
total_hits = sum([item[0] for item in values])
try:
entity = session.entity_from_eid(eid)
except UnknownEid:
update_stats['skipped'] += 1
return
if not entity.__regid__ in allowed_types:
update_stats['skipped'] += 1
return
# NOTE(review): RQL built by %-interpolation rather than query
# arguments; values come from parsed awstats files, not user input,
# but parameterized queries would still be safer
rql = 'Any X,V WHERE X is Hits, X count V, X hit_type "%(hit_type)s",' \
'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s'
rset = session.execute(rql % {'e':eid,
'sp':stats_period.eid,
'hit_type': is_rdf and 'rdf' or 'normal'})
if rset:
# a Hits entity already exists for this (entity, period, type)
if rset[0][1] != total_hits:
update_stats['updated'] += 1
session.execute('SET X count %(hits)s WHERE X eid %(e)s' %
{'e':rset[0][0],
'hits':total_hits})
else:
update_stats['exists no change'] += 1
else:
update_stats['created'] += 1
session.create_entity('Hits', count = total_hits,
period=stats_period,
stats_about = entity,
hit_type=is_rdf and u'rdf' or u'normal')
## command / initial setup API ############################################
def _init_cw_connection(self, appid):
config = cwconfig.instance_configuration(appid)
sourcescfg = config.sources()
......@@ -213,35 +215,33 @@ Number of stat objects compressed : %(compressed)s
break
session = repo._get_session(cnx.sessionid)
# XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
# (file a ticket)
return cnx, session
def main_run(self, args, rcfile=None):
    """Run the command and return status 0 if everything went fine.

    If :exc:`CommandError` is raised by the underlying command, simply log
    the error and return status 2.

    Any other exceptions, including :exc:`BadCommandUsage` will be
    propagated.
    """
    # XXX (adim): rcfile handling is spectacularly messy but I can't
    # get it right without refactoring pivotdoc for now
    if rcfile is None:
        # pick the configuration file from -c/--config when present;
        # otherwise rcfile simply stays None
        for option in ('-c', '--config'):
            if option in args:
                rcfile = args[args.index(option) + 1]
                break
    return Command.main_run(self, args, rcfile)
def run(self, args):
# NOTE(review): this span interleaves the diff's removed ("-") and added
# ("+") bodies of run(); as rendered it is not coherent code -- `start`
# and `stop` are used below without being assigned (the new argument
# parsing lines are outside this hunk's context). Verify against the
# actual post-commit file.
# args = (appid, start[, stop])
appid = args.pop(0)
cw_cnx, session = self._init_cw_connection(appid)
session.set_cnxset()
self.update_stats(session, args)
session.commit()
periodicity = session.vreg.config['awstats-periodicity']
if start is None:
start = datetime.now()
else:
start = parse_input_date(start, periodicity)
if stop is None:
stop = start
else:
stop = parse_input_date(stop, periodicity)
# parse_input_date returns None on bad input; abort in that case
# NOTE(review): requires `import sys` and ONEDAY at module level
if start is None or stop is None:
sys.exit(1) # parse_input_date failed to parse date
stop += ONEDAY # date_range() excludes stop boundary
stats_updater = StatsUpdater(session)
stats_report = stats_updater.update_stats(start, stop)
print '''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
''' % stats_report
CWCTL.register(UpdateWebstatsCommand)
......@@ -11,4 +11,11 @@ options = (
'help': 'domain of the website (eg. example.org). ',
'group': 'awstats', 'level': 0,
}),
('awstats-periodicity',
{'type' : 'choice',
'choices' : ('hour', 'day', 'month'),
'default': 'day',
'help': 'stats periodicity',
'group': 'awstats', 'level': 0,
}),
)
......@@ -17,9 +17,12 @@
import re
import os.path as osp
from datetime import datetime
from logilab.common.date import previous_month, first_day
from logilab.common.shellutils import ProgressBar
from cubicweb.req import FindEntityError
SECTIONSPEC = {
# commented sections are not usefull to view
# 'MAP' : ['section', 'offset'],
......@@ -91,7 +94,7 @@ ORIGIN_LABELS = {
}
def extract_stats_dict(awstats_dir, filename):
def extract_stats_dict(filepath):
''' from an awstats file extract structured data into a dict
returns a dictionnary like this :
......@@ -108,12 +111,12 @@ def extract_stats_dict(awstats_dir, filename):
}
}
'''
if not osp.isfile(osp.join(awstats_dir, filename)):
if not osp.isfile(filepath):
return {}
section_name = None
parsed_countdown = 0
stats_dict = {}
for line in file(osp.join(awstats_dir, filename)).readlines():
for line in file(filepath):
if line.startswith('BEGIN_'):
section_name, nb_of_lines = line.split('_', 1)[1].split()
if section_name in SECTIONSPEC:
......@@ -147,13 +150,9 @@ def eid_from_url(value):
pass
def get_or_create_statperiod(session, start, stop):
    """Return the StatPeriod entity spanning [`start`, `stop`], creating
    it when it does not exist yet.

    :param session: CubicWeb session used to query/create entities
    :param start: period start datetime
    :param stop: period stop datetime
    """
    # DEFECT fixed: the diff left the removed string-interpolated RQL
    # implementation interleaved with the new one (dangling else / orphan
    # lines); this is the coherent post-refactoring version. find_one_entity
    # also avoids formatting datetime values into an RQL string.
    try:
        return session.find_one_entity('StatPeriod', start=start, stop=stop)
    except FindEntityError:
        return session.create_entity('StatPeriod', start=start, stop=stop)
def compress_old_hits(req, update_stats={}):
......
......@@ -18,6 +18,7 @@
import os
import os.path as osp
import re
from datetime import datetime, timedelta
import urllib
......@@ -128,11 +129,11 @@ class AwstatsView(StartupView):
filename = 'awstats%s%s.txt' % (month, domain and '.%s' % domain)
awstats_dir = self._cw.vreg.config['awstats-dir']
try:
stats_dict = extract_stats_dict(awstats_dir, filename)
stats_dict = extract_stats_dict(osp.join(awstats_dir, filename))
except IOError:
filename = 'awstats%s%s.txt' % (extract_available_months(form)[0],
domain and '.%s' % domain)
stats_dict = extract_stats_dict(awstats_dir, filename)
stats_dict = extract_stats_dict(osp.join(awstats_dir, filename))
self.w(u'<div id="awstats">')
self.w(u'<h1>%s : %s</h1>' % (_('Domain'), domain or 'default'))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment