Commit bad0f99b authored by Arthur Lutz

Corrected functionality, refactoring, now working

parent 74238ebac23a
@@ -7,9 +7,9 @@ This command will generate webstats objects for all linked document types.
 """
 import os.path as osp
-from datetime import datetime
+from datetime import datetime, timedelta
-from logilab.common.date import first_day, last_day, date_range, ONDEDAY
+from logilab.common.date import first_day, last_day, date_range, ONEDAY
 from logilab.common.shellutils import ProgressBar
 from cubicweb import cwconfig, UnknownEid
@@ -23,7 +23,7 @@ from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
     get_or_create_statperiod, compress_old_hits

-def url_count_from_stats(stats_dict):
+def url_count_from_stats(session, stats_dict):
     '''
     parse most visited urls in stats_dict generated from awstats txt file
@@ -37,7 +37,7 @@ def url_count_from_stats(stats_dict):
     for item in stats_dict['SIDER'].values():
         url = item[SECTIONSPEC['SIDER'][0]]
         hits = int(item[SECTIONSPEC['SIDER'][1]])
-        eid = eid_from_url(url)
+        eid = eid_from_url(session, url)
         if not eid:
             continue
         if 'rdf' in url:
@@ -62,25 +62,32 @@ def parse_input_date(date, periodicity):

 def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
     # nb_ops must be set if iterable doesn't support the length protocol
-    nb_ops = nb_ops or len(iterable)
+    if nb_ops is None:
+        nb_ops = len(iterable)
     pb = ProgressBar(nb_ops, size=pb_size, title=pb_title)
     for item in iterable:
-        yield item
         pb.update()
+        yield item
+    pb.finish()
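With this fix the bar ticks before each item is yielded and is explicitly finished when iteration ends. A database-free sketch of the fixed generator, with a hypothetical stand-in for logilab's ProgressBar:

```python
# Stand-in for logilab.common.shellutils.ProgressBar (hypothetical).
class FakeProgressBar(object):
    def __init__(self, nb_ops, size=20, title=''):
        self.done = 0
    def update(self):
        self.done += 1
    def finish(self):
        print 'done:', self.done

def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
    if nb_ops is None:
        nb_ops = len(iterable)
    pb = FakeProgressBar(nb_ops, size=pb_size, title=pb_title)
    for item in iterable:
        pb.update()   # tick first, so the bar matches items consumed
        yield item
    pb.finish()       # close the bar once the iterable is exhausted

assert list(track_progress(range(3))) == [0, 1, 2]   # prints 'done: 3'
```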

 class StatsUpdater(object):

-    def __init__(self, session):
+    def __init__(self, session, start, stop):
         self.session = session
         self.config = session.vreg.config
+        self.start = start
+        self.stop = stop
         self.allowed_etypes = frozenset(eschema.type for eschema in
                                         session.vreg.schema.rschema('stats_about').objects())
         self.all_hits = {}
         hits_rset = session.execute('Any H,HC,HT,E,P,PSA,PSO WHERE '
                                     'H is Hits, H count HC, H hit_type HT, '
-                                    'H stats_about E, H period P, P start PSA, P stop PSO')
-        for hit in hits_rset.entities():
+                                    'H stats_about E, H period P, P start PSA, P stop PSO '
+                                    'HAVING (PSA >= %(start)s, PSO <= %(stop)s) ',
+                                    {'start': start,
+                                     'stop': stop})
+        for hit in track_progress(hits_rset.entities(), nb_ops=len(hits_rset),
+                                  pb_size=62, pb_title='Building cache'):
             hit_key = (hit.stats_about[0].eid, hit.period[0].eid, hit.hit_type)
             self.all_hits[hit_key] = hit
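The cache built here lets the import loop find an existing Hits row in memory instead of querying once per URL; the key is the (stats_about eid, period eid, hit_type) triple. A database-free sketch of the keying, with a namedtuple standing in for the Hits entity (all values hypothetical):

```python
from collections import namedtuple

# Hypothetical stand-in rows for the Hits entities fetched above.
Hit = namedtuple('Hit', 'eid stats_eid period_eid hit_type count')
rows = [Hit(1, 42, 7, u'normal', 10),
        Hit(2, 42, 7, u'rdf', 3)]

all_hits = {}
for hit in rows:
    all_hits[(hit.stats_eid, hit.period_eid, hit.hit_type)] = hit

print all_hits[(42, 7, u'normal')].count   # 10 -- O(1) lookup during import
```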
@@ -100,7 +107,7 @@ class StatsUpdater(object):
                                              domain_ext)
         return osp.join(config['awstats-dir'], filename)

-    def stats_period_for_date(self, chosendate):
+    def stats_period_for_date(self, chosendate, stats_report):
         """ return a statperiod for the current month; if it doesn't exist, create it """
         periodicity = self.config['awstats-periodicity']
         if periodicity == 'month':
@@ -112,20 +119,20 @@ class StatsUpdater(object):
         elif periodicity == 'hour':
             start = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour)
             stop = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour, 59, 59)
-        return get_or_create_statperiod(self.session, start, stop)
+        return get_or_create_statperiod(self.session, start, stop, stats_report)

     ## update API #############################################################

-    def update_stats(self, start, stop, skip_compress=False):
+    def update_stats(self, skip_compress=False):
         ''' parses awstats and creates or updates the corresponding
         data in the cubicweb instance
-        :param start: period start (included)
-        :param stop: period stop (excluded)
         '''
+        self.session.set_cnxset()
         stats_report = dict.fromkeys(('updated', 'created', 'exists no change',
                                       'skipped', 'periods', 'compressed'), 0)
-        for chosendate in track_progress(date_range(start, stop), (stop-start).days,
+        for chosendate in track_progress(date_range(self.start, self.stop),
+                                         (self.stop-self.start).days,
                                          pb_size=70, pb_title='Import'):
             self._update_stats_for_date(chosendate, stats_report)
         if not skip_compress:
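Note the boundary convention this loop relies on: date_range(start, stop) excludes stop, so (stop - start).days is exactly the number of iterations, and the caller pads stop with ONEDAY (see the run() hunk below). A quick check of the arithmetic with plain datetime:

```python
from datetime import datetime, timedelta

start = datetime(2012, 1, 1)
stop = datetime(2012, 1, 3)    # exclusive upper bound
days = (stop - start).days     # 2 -> nb_ops for the progress bar
print [start + timedelta(n) for n in range(days)]   # Jan 1, Jan 2 -- never Jan 3
```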
@@ -135,8 +142,8 @@ class StatsUpdater(object):

     def _update_stats_for_date(self, chosendate, stats_report):
         stats_dict = extract_stats_dict(self.awstats_filepath(chosendate))
-        stats_period = self.stats_period_for_date(chosendate)
-        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
+        stats_period = self.stats_period_for_date(chosendate, stats_report)
+        normal_dict, rdf_dict = url_count_from_stats(self.session, stats_dict)
         for count_dict, hit_type in ((normal_dict, u'normal'),
                                      (rdf_dict, u'rdf')):
             for eid, values in count_dict.items():
@@ -173,8 +180,8 @@ class StatsUpdater(object):

 class UpdateWebstatsCommand(Command):
     """ Update cubicweb web stats from awstats processed files.
-    If startdate is not entered, the update will be done on the current
-    day or the current month. If only startdate is entered, the day or
+    If startdate is not entered, the update will be done on the previous
+    day or the previous month. If only startdate is entered, the day or
     month will be processed. If both dates are entered, all the dates
     between these two dates will be processed.
@@ -193,6 +200,8 @@ class UpdateWebstatsCommand(Command):
     options = [
         ("skip-compress", {"action": 'store_true',
                            'help': u'Skip the compression of old daily hits into month stats'}),
+        ("today", {"action": 'store_true',
+                   'help': u'Process stats for the current day (for testing)'}),
         ]

     ## command / initial setup API ############################################
@@ -221,20 +230,26 @@ class UpdateWebstatsCommand(Command):
         # args = (appid, start[, stop])
         appid = args.pop(0)
         cw_cnx, session = self._init_cw_connection(appid)
+        session.set_cnxset()
         periodicity = session.vreg.config['awstats-periodicity']
         start = stop = None
         if len(args) > 0:
             start = parse_input_date(args[0], periodicity)
         if start is None:
-            start = datetime.now()
-        else:
-            start = parse_input_date(start, periodicity)
+            if self.config.today:
+                chosendate = datetime.now()
+            else:
+                chosendate = datetime.now() - timedelta(1)
+            start = datetime(chosendate.year, chosendate.month, chosendate.day)
         if len(args) > 1:
             stop = parse_input_date(args[1], periodicity)
         if stop is None:
             stop = start
-        else:
-            stop = parse_input_date(stop, periodicity)
+        if start is None or stop is None:
+            sys.exit(1) # parse_input_date failed to parse date
         stop += ONEDAY # date_range() excludes stop boundary
-        stats_updater = StatsUpdater(session)
-        stats_report = stats_updater.update_stats(start, stop)
+        stats_updater = StatsUpdater(session, start, stop)
+        stats_report = stats_updater.update_stats(self.config.skip_compress)
         print '''=== Update Report ===
 Number of periods imported : %(periods)s
 Number of stat objects created : %(created)s
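Typical invocations after this change (hedged: the command name and the yyyymmdd date format are assumptions inferred from the class above, neither is shown in this diff):

```
cubicweb-ctl update-webstats <appid>                     # previous day (new default)
cubicweb-ctl update-webstats --today <appid>             # current day, for testing
cubicweb-ctl update-webstats <appid> 20120101            # a single day/month
cubicweb-ctl update-webstats <appid> 20120101 20120131   # a range of dates
```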
--- a/utils.py
+++ b/utils.py
@@ -16,9 +16,9 @@
 import re
 import os.path as osp
-from datetime import datetime
+from datetime import datetime, date
-from logilab.common.date import previous_month, first_day
+from logilab.common.date import previous_month, first_day, date_range, last_day
 from logilab.common.shellutils import ProgressBar

 from cubicweb.req import FindEntityError
@@ -57,32 +57,32 @@ SECTIONSPEC = {

 SECTIONLABELS = {
-    'TIME': "Visits by hour",
-    'VISITOR': 'Top visitors (by host)',
-    'DAY': 'Visits by days of the month',
-    'DOMAIN': 'Visitors domains/countries',
-    'LOGIN': 'logged in users',
-    'ROBOT': 'Robots/Spiders visitors',
-    'WORMS': 'Worm visits',
-    'EMAILSENDER': 'email sender',
-    'EMAILRECEIVER': 'email receiver',
-    'SESSION': 'Visits duration',
-    'SIDER': 'Most visited URLs',
-    'FILETYPES': 'Visited file types',
-    'OS': 'Visiting operating systems',
-    'BROWSER': 'Visiting browsers',
-    'SCREENSIZE': 'Hits by Screen size',
-    'UNKNOWNREFERER': 'Unknown referer os',
-    'UNKNOWNREFERERBROWSER': 'Unknown referer browser',
-    'ORIGIN': 'Origin of hits',
-    'SEREFERRALS': 'Search engine referers hits',
-    'PAGEREFS': 'Main external page referers',
-    'SEARCHWORDS': 'Hits from search keyphrases',
-    'KEYWORDS': 'Hits from search keywords',
-    #'MISC': ['misc id', 'pages', 'hits', 'bandwidth'],
-    'ERRORS': 'HTTP Status codes',
-    'CLUSTER': 'Visits by cluster id',
-    'SIDER_404': 'Hits with 404 errors',
+    'TIME': _('Visits by hour'),
+    'VISITOR': _('Top visitors (by host)'),
+    'DAY': _('Visits by days of the month'),
+    'DOMAIN': _('Visitors domains/countries'),
+    'LOGIN': _('logged in users'),
+    'ROBOT': _('Robots/Spiders visitors'),
+    'WORMS': _('Worm visits'),
+    'EMAILSENDER': _('email sender'),
+    'EMAILRECEIVER': _('email receiver'),
+    'SESSION': _('Visits duration'),
+    'SIDER': _('Most visited URLs'),
+    'FILETYPES': _('Visited file types'),
+    'OS': _('Visiting operating systems'),
+    'BROWSER': _('Visiting browsers'),
+    'SCREENSIZE': _('Hits by Screen size'),
+    'UNKNOWNREFERER': _('Unknown referer os'),
+    'UNKNOWNREFERERBROWSER': _('Unknown referer browser'),
+    'ORIGIN': _('Origin of hits'),
+    'SEREFERRALS': _('Search engine referers hits'),
+    'PAGEREFS': _('Main external page referers'),
+    'SEARCHWORDS': _('Hits from search keyphrases'),
+    'KEYWORDS': _('Hits from search keywords'),
+    #'MISC': ['misc id', 'pages', 'hits', 'bandwidth'],
+    'ERRORS': _('HTTP Status codes'),
+    'CLUSTER': _('Visits by cluster id'),
+    'SIDER_404': _('Hits with 404 errors'),
 }

 ORIGIN_LABELS = {
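Wrapping each label in _() marks it for message-catalog extraction, so the section titles become translatable; the dict still stores the untranslated string, and translation happens when a label is rendered against a request. A minimal sketch of the marker convention, assuming the module declares the usual no-op _ (commonly `_ = unicode` in Python 2 cubicweb code):

```python
_ = unicode   # no-op marker: extraction tools find the strings, runtime keeps them

SECTIONLABELS = {
    'TIME': _('Visits by hour'),   # stored as-is, untranslated
}
# at render time (sketch): req._(SECTIONLABELS['TIME']) returns the translation
```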
@@ -135,10 +135,16 @@ def extract_stats_dict(filepath):
             section_name = None
     return stats_dict

-def eid_from_url(value):
+def eid_from_url(session, value):
     ''' return an eid from a url '''
     # FIXME - should use url_resolver for a more serious guess
     # FIXME - BNF specific right now
+    #url_resolver = session.vreg['components'].select('urlpublisher',
+    #                                                 vreg=session.vreg)
+    #req = session
+    #req.form = {}
+    #pmid, rset = url_resolver.process(session, value)
+    #print value, pmid, rset
     for pattern in ['/(\d+)/(.*?)/',
                     '/(.*?)/(.*?)/(fr|en|es).html',
                     '/(.*?)/(.*?)/rdf.(xml|n3|nt)']:
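The URL patterns guess an eid from path shapes; the first one treats a leading numeric segment as the eid itself. A self-contained illustration of that first pattern (the URL is hypothetical):

```python
import re

# '/1234/some-document/' -> eid candidate 1234
match = re.search(r'/(\d+)/(.*?)/', '/1234/some-document/')
if match:
    print int(match.group(1))   # 1234
```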
@@ -149,53 +155,60 @@ def eid_from_url(value):
         except:
             pass

-def get_or_create_statperiod(session, start, stop):
-    try:
-        return session.find_one_entity('StatPeriod', start=start, stop=stop)
-    except FindEntityError:
+def get_or_create_statperiod(session, start, stop, stats_report={}):
+    rql = 'Any P WHERE P is StatPeriod, P start "%(start_date)s", P stop "%(end_date)s"'
+    rset = session.execute(rql % {'start_date': start,
+                                  'end_date': stop})
+    if rset:
+        return rset.get_entity(0, 0)
+    else:
+        stats_report.setdefault('periods', 0)
+        stats_report['periods'] += 1
         return session.create_entity('StatPeriod', start=start, stop=stop)
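The rewritten helper is a get-or-create: an existing StatPeriod with the same bounds is reused, and only a real creation bumps the caller's 'periods' counter. The contract, sketched without a database (a plain dict stands in for the StatPeriod table):

```python
periods = {}                     # stand-in for StatPeriod rows
stats_report = {'periods': 0}

def get_or_create(start, stop):
    key = (start, stop)
    if key not in periods:       # only creations are counted
        stats_report['periods'] += 1
        periods[key] = {'start': start, 'stop': stop}
    return periods[key]

get_or_create('2012-01-01', '2012-01-31')
get_or_create('2012-01-01', '2012-01-31')   # second call reuses the first row
print stats_report['periods']                # 1
```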

-def compress_old_hits(req, update_stats={}):
-    # TODO roll complete 12 months into a year
-    rql = 'Any X WHERE X is StatPeriod, X start START, X stop STOP, X stop <= %(date)s HAVING STOP-START <= 20'
-    rset = req.execute(rql, {'date': previous_month(datetime.now(), 4)})
-    if not rset:
-        return
-    pb = ProgressBar(16, 55, title='Compressing old stats')
-    for monthsbefore in range(4, 20):
-        pb.update()
-        stop = previous_month(datetime.now(), monthsbefore)
-        start = first_day(stop)
-        rql = 'Any E, SUM(C) GROUPBY E WHERE X is Hits, X count C, X hit_type %(hit_type)s,'\
-              'X period P, P start >= %(start)s, P stop <= %(stop)s, X stats_about E,'\
-              'S start START, S stop STOP HAVING STOP-START <= 20'
+def time_params(req):
+    params = []
+    rset = req.execute('Any START ORDERBY START LIMIT 1 WHERE P is StatPeriod, P start START, P stop STOP HAVING STOP-START <= 2')
+    for (item,) in rset:
+        for first_day in date_range(previous_month(item), previous_month(datetime.now(), 5), incmonth=True):
+            delta = 2
+            params.append((first_day, last_day(first_day), delta))
+    # TODO - roll complete 12 months into a year
+    return params
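time_params() returns (month_start, month_end, max_days) tuples covering whole months, from the month before the oldest short (daily) StatPeriod up to five months before now; compress_old_hits then processes one month per tuple. The shape of the result, assuming logilab-common is installed (the oldest date below is hypothetical):

```python
from datetime import datetime
from logilab.common.date import previous_month, last_day, date_range

oldest = datetime(2011, 11, 3)          # hypothetical oldest daily period
for month_start in date_range(previous_month(oldest),
                              previous_month(datetime.now(), 5),
                              incmonth=True):
    print (month_start, last_day(month_start), 2)   # delta is always 2 here
```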
+
+def compress_old_hits(req, update_stats={}, progressbar=True):
+    tp = time_params(req)
+    if progressbar:
+        pb = ProgressBar(len(tp), 55, title='Compressing old stats')
+    for start, stop, delta in tp:
+        if progressbar:
+            pb.update()
+        rql = 'DISTINCT Any E,SUM(C) GROUPBY E WHERE H is Hits, H count C, H hit_type %(hit_type)s,'\
+              'H period P, P start >= %(start)s, P stop <= %(stop)s, H stats_about E,'\
+              'P start START, P stop STOP HAVING STOP-START <= %(timedelta)s'
         results = {}
-        type_rset = req.execute('Any C GROUPBY C WHERE X is Hits, X hit_type C')
-        for hit_type in type_rset:
-            results[hit_type[0]] = req.execute(rql, {'start': start,
-                                                     'stop': stop,
-                                                     'hit_type': hit_type[0]})
+        type_rset = req.execute('DISTINCT Any C WHERE X is Hits, X hit_type C')
+        for (hit_type,) in type_rset:
+            results[hit_type] = req.execute(rql,
+                                            {'start': start,
+                                             'stop': stop,
+                                             'hit_type': hit_type,
+                                             'timedelta': delta})
         if not any(results.values()):
             continue
-        req.execute('DELETE StatPeriod P WHERE P start >= %(start)s, P stop <= %(stop)s',
-                    {'start': start,
-                     'stop': stop})
+        # deleting statperiod deletes all associated hits
+        drset = req.execute('DELETE StatPeriod P WHERE P start >= %(start)s, P stop <= %(stop)s',
+                            {'start': start,
+                             'stop': stop})
+        update_stats['compressed'] += len(drset)
         stp = get_or_create_statperiod(req, start, stop)
-        created_entities = []
         for hit_type, rset in results.items():
             for eid, count in rset:
                 content_entity = req.entity_from_eid(eid)
-                try:
-                    created_entities.append(req.create_entity('Hits', hit_type=hit_type, period=stp, count=count,
-                                                              stats_about=content_entity))
-                except Exception:
-                    pass
-        delete_rql = 'DISTINCT Hits X WHERE X period P, P start >= %(start)s, P stop <= %(stop)s, S start START, S stop STOP HAVING STOP-START <= 20'
-        rset = req.execute(delete_rql, {'start': start,
-                                        'stop': stop})
-        created_eids = [x.eid for x in created_entities]
-        for e in rset.entities():
-            if e.eid not in created_eids:
-                e.cw_delete()
-        update_stats['compressed'] += 1
-    pb.finish()
+                # FIXME if Hits for period and content exist, update it ?
+                req.create_entity('Hits', hit_type=hit_type, period=stp, count=count,
+                                  stats_about=content_entity)
+    if progressbar:
+        pb.finish()
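Net effect of the rewritten compression: for each old month, the daily Hits rows are summed per (entity, hit_type) and replaced by one monthly row attached to a month-long StatPeriod (the DELETE on the StatPeriod cascades to its daily Hits). The aggregation step, sketched without a database:

```python
daily = [                      # (entity eid, hit_type, daily count) rows
    (42, u'normal', 10),
    (42, u'normal', 7),
    (42, u'rdf', 3),
]
monthly = {}
for eid, hit_type, count in daily:
    key = (eid, hit_type)
    monthly[key] = monthly.get(key, 0) + count
print monthly   # {(42, u'normal'): 17, (42, u'rdf'): 3}
```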