Commit 8821f180 authored by Arthur Lutz's avatar Arthur Lutz
Browse files

imported patch proposed_refactoring.diff

parent a90975d04742
...@@ -6,8 +6,10 @@ Usage: cubicweb-ctl update-webstats [options] <instance-name> startdate [stopdat ...@@ -6,8 +6,10 @@ Usage: cubicweb-ctl update-webstats [options] <instance-name> startdate [stopdat
This command will generate webstats objects for all linked document types. This command will generate webstats objects for all linked document types.
""" """
from datetime import datetime, timedelta import os.path as osp
from logilab.common.date import first_day, last_day, date_range from datetime import datetime
from logilab.common.date import first_day, last_day, date_range, ONEDAY
from logilab.common.shellutils import ProgressBar from logilab.common.shellutils import ProgressBar
from cubicweb import cwconfig, UnknownEid from cubicweb import cwconfig, UnknownEid
...@@ -20,6 +22,7 @@ from cubicweb.cwctl import CWCTL ...@@ -20,6 +22,7 @@ from cubicweb.cwctl import CWCTL
from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \ from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
get_or_create_statperiod, compress_old_hits get_or_create_statperiod, compress_old_hits
def url_count_from_stats(stats_dict): def url_count_from_stats(stats_dict):
''' '''
parse most visited urls in stats_dict generated from awstats txt file parse most visited urls in stats_dict generated from awstats txt file
...@@ -27,7 +30,7 @@ def url_count_from_stats(stats_dict): ...@@ -27,7 +30,7 @@ def url_count_from_stats(stats_dict):
returns two dictionnaries with eid as key and sequence of values as value returns two dictionnaries with eid as key and sequence of values as value
one for normal navigation, the other for rdf navigation one for normal navigation, the other for rdf navigation
''' '''
if 'SIDER' not in stats_dict.keys(): if 'SIDER' not in stats_dict:
return {}, {} return {}, {}
visit_count_dict = {} visit_count_dict = {}
visit_count_rdf_dict = {} visit_count_rdf_dict = {}
...@@ -46,6 +49,127 @@ def url_count_from_stats(stats_dict): ...@@ -46,6 +49,127 @@ def url_count_from_stats(stats_dict):
return visit_count_dict, visit_count_rdf_dict return visit_count_dict, visit_count_rdf_dict
def parse_input_date(date, periodicity):
    """Parse *date* with the input format matching *periodicity*.

    Returns a datetime, or None (after printing an error message) when
    the string does not match the expected format.
    """
    fmt_by_periodicity = {
        'month': '%m/%Y',
        'day': '%d/%m/%Y',
        'hour': '%d/%m/%Y-%Hh',
    }
    expected_format = fmt_by_periodicity[periodicity]
    try:
        parsed = datetime.strptime(date, expected_format)
    except ValueError:
        print('Error : %s not a proper date' % date)
        return None
    return parsed
def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
    """Yield every item of *iterable* while drawing a progress bar.

    nb_ops must be set if *iterable* doesn't support the length protocol.
    """
    progress = ProgressBar(nb_ops or len(iterable),
                           size=pb_size, title=pb_title)
    for item in iterable:
        yield item
        progress.update()
    progress.finish()
class StatsUpdater(object):
    """Import awstats text exports as Hits entities in a cubicweb instance.

    Parses the awstats files for a range of dates and creates or updates
    the corresponding Hits / StatPeriod entities.
    """

    def __init__(self, session):
        self.session = session
        self.config = session.vreg.config
        # entity types that may be the target of the stats_about relation
        self.allowed_etypes = frozenset(eschema.type for eschema in
                    session.vreg.schema.rschema('stats_about').objects())
        # cache of existing Hits, keyed by (stats_about eid, period eid, hit type)
        self.all_hits = {}
        hits_rset = session.execute('Any H,HC,HT,E,P,PSA,PSO WHERE '
                                    'H is Hits, H count HC, H hit_type HT, '
                                    'H stats_about E, H period P, P start PSA, P stop PSO')
        for hit in hits_rset.entities():
            hit_key = (hit.stats_about[0].eid, hit.period[0].eid, hit.hit_type)
            self.all_hits[hit_key] = hit

    ## internal utilities #####################################################
    def awstats_filepath(self, date):
        """Return the full path of the awstats export file for *date*."""
        config = self.config
        date_formats = {'month': '%m%Y',
                        'day': '%m%Y%d',
                        'hour': '%m%Y%d%H'}
        if config['awstats-domain']:
            domain_ext = '.' + config['awstats-domain']
        else:
            domain_ext = ''
        filename = 'awstats%s%s.txt' % (
            date.strftime(date_formats[config['awstats-periodicity']]),
            domain_ext)
        return osp.join(config['awstats-dir'], filename)

    def stats_period_for_date(self, chosendate):
        """Return the StatPeriod covering *chosendate*, creating it if needed."""
        periodicity = self.config['awstats-periodicity']
        if periodicity == 'month':
            start = first_day(chosendate)
            stop = last_day(start)
        elif periodicity == 'day':
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, 23, 59, 59)
        elif periodicity == 'hour':
            start = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour, 59, 59)
        return get_or_create_statperiod(self.session, start, stop)

    ## update API #############################################################
    def update_stats(self, start, stop, skip_compress=False):
        ''' parses awstats and creates or updates the corresponding
        data in the cubicweb instance

        :param start: period start (included)
        :param stop: period stop (excluded)
        '''
        # fixed: the original called set_cnxset() on an undefined global
        # `session` instead of self.session (NameError at runtime)
        self.session.set_cnxset()
        stats_report = dict.fromkeys(('updated', 'created', 'exists no change',
                                      'skipped', 'periods', 'compressed'), 0)
        for chosendate in track_progress(date_range(start, stop), (stop-start).days,
                                         pb_size=70, pb_title='Import'):
            self._update_stats_for_date(chosendate, stats_report)
        if not skip_compress:
            compress_old_hits(self.session, stats_report)
        self.session.commit()
        return stats_report

    def _update_stats_for_date(self, chosendate, stats_report):
        """Import both normal and rdf hits found for *chosendate*,
        updating *stats_report* counters in place."""
        stats_dict = extract_stats_dict(self.awstats_filepath(chosendate))
        stats_period = self.stats_period_for_date(chosendate)
        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
        for count_dict, hit_type in ((normal_dict, u'normal'),
                                     (rdf_dict, u'rdf')):
            for eid, values in count_dict.items():
                status = self._update_hits_for_eid(eid, values,
                                                   stats_period, hit_type)
                stats_report[status] += 1

    def _update_hits_for_eid(self, eid, values, stats_period, hit_type):
        """Create or update a single Hits entity; return the report status
        ('created', 'updated', 'exists no change' or 'skipped')."""
        # each item of *values* starts with a hit count
        total_hits = sum(item[0] for item in values)
        try:
            entity = self.session.entity_from_eid(eid)
        except UnknownEid:
            # eid extracted from an url that no longer resolves
            return 'skipped'
        if entity.__regid__ not in self.allowed_etypes:
            return 'skipped'
        try:
            hit = self.all_hits[(eid, stats_period.eid, hit_type)]
        except KeyError: # no hit yet, create one
            status = 'created'
            hit = self.session.create_entity('Hits', count=total_hits, hit_type=hit_type,
                                             period=stats_period, stats_about=entity)
            # append it to the cache
            self.all_hits[(eid, stats_period.eid, hit_type)] = hit
        else:
            if hit.count != total_hits:
                status = 'updated'
                hit.set_attributes(count=total_hits)
            else:
                status = 'exists no change'
        return status
class UpdateWebstatsCommand(Command): class UpdateWebstatsCommand(Command):
""" Update cubicweb web stats from awstats processed files. """ Update cubicweb web stats from awstats processed files.
...@@ -68,132 +192,10 @@ class UpdateWebstatsCommand(Command): ...@@ -68,132 +192,10 @@ class UpdateWebstatsCommand(Command):
max_args = 3 max_args = 3
options = [ options = [
("skip-compress", {"action": 'store_true', ("skip-compress", {"action": 'store_true',
'help' : u'Skip the compression of old daily hits into month stats'}), 'help' : u'Skip the compression of old daily hits into month stats'}),
] ]
def get_current_stats_period(self, session, chosendate):
    """Return the StatPeriod covering *chosendate*, creating it if needed."""
    period_bounds = self.choose_period(session, chosendate)
    return get_or_create_statperiod(session, *period_bounds)
def choose_period(self, session, chosendate):
    """Return the (start, end) datetimes bounding the stat period that
    contains *chosendate*, according to the configured periodicity."""
    periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
    if periodicity == 'month':
        start = first_day(chosendate)
        end = last_day(start)
    else:
        day_floor = datetime(chosendate.year, chosendate.month, chosendate.day)
        if periodicity == 'day':
            start = day_floor
            end = day_floor.replace(hour=23, minute=59, second=59)
        elif periodicity == 'hour':
            start = day_floor.replace(hour=chosendate.hour)
            end = start.replace(minute=59, second=59)
    return start, end
def choose_dateformat(self, periodicity):
    """Return the strftime format used in awstats file names for *periodicity*."""
    formats = {
        'month': '%m%Y',
        'day': '%m%Y%d',
        'hour': '%m%Y%d%H',
    }
    return formats[periodicity]
def update_stats(self, session, args):
    ''' parses awstats and creates or updates the corresponding
    data in the cubicweb instance'''
    # periodicity drives both the awstats file name format and the
    # format expected for the command line dates
    periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
    assert periodicity in ('hour', 'day', 'month')
    start = stop = None
    if args:
        # FIXME - adapt according to periodicity
        input_format = {'month':'%m/%Y',
                        'day': '%d/%m/%Y',
                        'hour': '%d/%m/%Y-%Hh'}[periodicity]
        try:
            start = datetime.strptime(args[0], input_format)
        except ValueError:
            print 'Error : %s not a proper date' % args[0]
            return
        if len(args) > 1:
            try:
                stop = datetime.strptime(args[1], input_format)
            except ValueError:
                print 'Error : %s not a proper date' % args[1]
                return
    else:
        # no date argument: import stats for today only
        start = stop = datetime.now()
    if stop is None:
        # single date given: import that day only
        stop = start
    # report counters, filled in by update_stats_for_date and
    # compress_old_hits
    update_stats = {'updated':0,
                    'created':0,
                    'exists no change':0,
                    'skipped':0,
                    'periods':0,
                    'compressed':0
                    }
    # the stop day is included, hence the one-day shift
    pb = ProgressBar(((stop+timedelta(days=1))-start).days, 70, title='Import')
    for chosendate in date_range(start, stop+timedelta(days=1)):
        self.update_stats_for_date(session, chosendate, update_stats)
        pb.update()
    pb.finish()
    if not self.config.skip_compress:
        compress_old_hits(session, update_stats)
    print '''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
''' % update_stats
def update_stats_for_date(self, session, chosendate, update_stats):
    """Import hits from the awstats file matching *chosendate*,
    updating the *update_stats* report counters in place."""
    config = session.vreg.config
    stats_period = self.get_current_stats_period(session, chosendate)
    periodicity = config.get('awstats-periodicity', 'day') #FIXME s/day/month/
    dateformat_in_file = self.choose_dateformat(periodicity)
    domain = config.get('awstats-domain', '')
    filename = 'awstats%s%s.txt' % (chosendate.strftime(dateformat_in_file),
                                    domain and '.%s' % domain)
    awstatsdir = config.get('awstats-dir', '/var/lib/awstats')
    stats_dict = extract_stats_dict(awstatsdir, filename)
    normal_dict, rdf_dict = url_count_from_stats(stats_dict)
    # entity types that may legitimately be linked through stats_about
    rql = 'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N'
    allowed_types = [row[0] for row in session.execute(rql)]
    for count_dict, is_rdf in ((normal_dict, False),
                               (rdf_dict, True)):
        for eid, values in count_dict.items():
            self.update_hits_for_eid(eid, values, session, update_stats,
                                     allowed_types, stats_period, is_rdf)
def update_hits_for_eid(self, eid, values, session, update_stats,
                        allowed_types, stats_period, is_rdf):
    """Create or update the Hits entity for *eid* over *stats_period*.

    *values* is a sequence whose items start with a hit count; those
    counts are summed to get the total number of hits. The
    *update_stats* report counters are updated in place.
    """
    # each item of *values* starts with a hit count
    total_hits = sum(item[0] for item in values)
    try:
        entity = session.entity_from_eid(eid)
    except UnknownEid:
        # eid extracted from an url that no longer resolves
        update_stats['skipped'] += 1
        return
    if entity.__regid__ not in allowed_types:
        update_stats['skipped'] += 1
        return
    rql = 'Any X,V WHERE X is Hits, X count V, X hit_type "%(hit_type)s",' \
          'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s'
    rset = session.execute(rql % {'e': eid,
                                  'sp': stats_period.eid,
                                  'hit_type': is_rdf and 'rdf' or 'normal'})
    if rset:
        if rset[0][1] != total_hits:
            update_stats['updated'] += 1
            session.execute('SET X count %(hits)s WHERE X eid %(e)s' %
                            {'e': rset[0][0],
                             'hits': total_hits})
        else:
            update_stats['exists no change'] += 1
    else:
        update_stats['created'] += 1
        session.create_entity('Hits', count=total_hits,
                              period=stats_period,
                              stats_about=entity,
                              hit_type=is_rdf and u'rdf' or u'normal')
## command / initial setup API ############################################
def _init_cw_connection(self, appid): def _init_cw_connection(self, appid):
config = cwconfig.instance_configuration(appid) config = cwconfig.instance_configuration(appid)
sourcescfg = config.sources() sourcescfg = config.sources()
...@@ -213,35 +215,33 @@ Number of stat objects compressed : %(compressed)s ...@@ -213,35 +215,33 @@ Number of stat objects compressed : %(compressed)s
break break
session = repo._get_session(cnx.sessionid) session = repo._get_session(cnx.sessionid)
# XXX keep reference on cnx otherwise cnx.__del__ will cause trouble # XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
# (file a ticket)
return cnx, session return cnx, session
def main_run(self, args, rcfile=None):
    """Run the command and return status 0 if everything went fine.

    If :exc:`CommandError` is raised by the underlying command, simply log
    the error and return status 2.

    Any other exceptions, including :exc:`BadCommandUsage` will be
    propagated.
    """
    # XXX (adim): rcfile handling is spectacularly messy but I can't
    # get it right without refactoring pivotdoc for now
    if rcfile is None:
        for option in ('-c', '--config'):
            if option in args:
                rcfile = args[args.index(option) + 1]
                break
        else:
            rcfile = None  #self.config.config
    return Command.main_run(self, args, rcfile)
def run(self, args): def run(self, args):
# args = (appid, start[, stop])
appid = args.pop(0) appid = args.pop(0)
cw_cnx, session = self._init_cw_connection(appid) cw_cnx, session = self._init_cw_connection(appid)
session.set_cnxset() periodicity = session.vreg.config['awstats-periodicity']
self.update_stats(session, args) if start is None:
session.commit() start = datetime.now()
else:
start = parse_input_date(start, periodicity)
if stop is None:
stop = start
else:
stop = parse_input_date(stop, periodicity)
if start is None or stop is None:
sys.exit(1) # parse_input_date failed to parse date
stop += ONEDAY # date_range() excludes stop boundary
stats_updater = StatsUpdater(session)
stats_report = stats_updater.update_stats(start, stop)
print '''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
''' % stats_report
CWCTL.register(UpdateWebstatsCommand) CWCTL.register(UpdateWebstatsCommand)
...@@ -11,4 +11,11 @@ options = ( ...@@ -11,4 +11,11 @@ options = (
'help': 'domain of the website (eg. example.org). ', 'help': 'domain of the website (eg. example.org). ',
'group': 'awstats', 'level': 0, 'group': 'awstats', 'level': 0,
}), }),
('awstats-periodicity',
{'type' : 'choice',
'choices' : ('hour', 'day', 'month'),
'default': 'day',
'help': 'stats periodicity',
'group': 'awstats', 'level': 0,
}),
) )
...@@ -17,9 +17,12 @@ ...@@ -17,9 +17,12 @@
import re import re
import os.path as osp import os.path as osp
from datetime import datetime from datetime import datetime
from logilab.common.date import previous_month, first_day from logilab.common.date import previous_month, first_day
from logilab.common.shellutils import ProgressBar from logilab.common.shellutils import ProgressBar
from cubicweb.req import FindEntityError
SECTIONSPEC = { SECTIONSPEC = {
# commented sections are not usefull to view # commented sections are not usefull to view
# 'MAP' : ['section', 'offset'], # 'MAP' : ['section', 'offset'],
...@@ -91,7 +94,7 @@ ORIGIN_LABELS = { ...@@ -91,7 +94,7 @@ ORIGIN_LABELS = {
} }
def extract_stats_dict(awstats_dir, filename): def extract_stats_dict(filepath):
''' from an awstats file extract structured data into a dict ''' from an awstats file extract structured data into a dict
returns a dictionnary like this : returns a dictionnary like this :
...@@ -108,12 +111,12 @@ def extract_stats_dict(awstats_dir, filename): ...@@ -108,12 +111,12 @@ def extract_stats_dict(awstats_dir, filename):
} }
} }
''' '''
if not osp.isfile(osp.join(awstats_dir, filename)): if not osp.isfile(filepath):
return {} return {}
section_name = None section_name = None
parsed_countdown = 0 parsed_countdown = 0
stats_dict = {} stats_dict = {}
for line in file(osp.join(awstats_dir, filename)).readlines(): for line in file(filepath):
if line.startswith('BEGIN_'): if line.startswith('BEGIN_'):
section_name, nb_of_lines = line.split('_', 1)[1].split() section_name, nb_of_lines = line.split('_', 1)[1].split()
if section_name in SECTIONSPEC: if section_name in SECTIONSPEC:
...@@ -147,13 +150,9 @@ def eid_from_url(value): ...@@ -147,13 +150,9 @@ def eid_from_url(value):
pass pass
def get_or_create_statperiod(session, start, stop): def get_or_create_statperiod(session, start, stop):
rql = 'Any P WHERE P is StatPeriod, P start "%(start_date)s", P stop "%(end_date)s"' try:
rset = session.execute(rql % return session.find_one_entity('StatPeriod', start=start, stop=stop)
{'start_date':start, except FindEntityError:
'end_date':stop})
if rset:
return rset.get_entity(0, 0)
else:
return session.create_entity('StatPeriod', start=start, stop=stop) return session.create_entity('StatPeriod', start=start, stop=stop)
def compress_old_hits(req, update_stats={}): def compress_old_hits(req, update_stats={}):
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
import os import os
import os.path as osp
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
import urllib import urllib
...@@ -128,11 +129,11 @@ class AwstatsView(StartupView): ...@@ -128,11 +129,11 @@ class AwstatsView(StartupView):
filename = 'awstats%s%s.txt' % (month, domain and '.%s' % domain) filename = 'awstats%s%s.txt' % (month, domain and '.%s' % domain)
awstats_dir = self._cw.vreg.config['awstats-dir'] awstats_dir = self._cw.vreg.config['awstats-dir']
try: try:
stats_dict = extract_stats_dict(awstats_dir, filename) stats_dict = extract_stats_dict(osp.join(awstats_dir, filename))
except IOError: except IOError:
filename = 'awstats%s%s.txt' % (extract_available_months(form)[0], filename = 'awstats%s%s.txt' % (extract_available_months(form)[0],
domain and '.%s' % domain) domain and '.%s' % domain)
stats_dict = extract_stats_dict(awstats_dir, filename) stats_dict = extract_stats_dict(osp.join(awstats_dir, filename))
self.w(u'<div id="awstats">') self.w(u'<div id="awstats">')
self.w(u'<h1>%s : %s</h1>' % (_('Domain'), domain or 'default')) self.w(u'<h1>%s : %s</h1>' % (_('Domain'), domain or 'default'))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment