Commit 0c9ea9a7 authored by Arthur Lutz
Browse files

Initial functionality and simple views

parent ce2fbaa1d24e
# -*- coding: utf-8 -*-
"""update-webstats cubicweb plugin
Usage: cubicweb-ctl update-webstats [options] <instance-name>
This command will generate webstats objects for all linked document types.
"""
from datetime import datetime
from logilab.common.date import first_day, last_day
from cubicweb import cwconfig
from cubicweb import AuthenticationError
from cubicweb.dbapi import in_memory_repo_cnx
from cubicweb.toolsutils import Command
from cubicweb.cwctl import CWCTL
from utils import SECTIONSPEC, extract_stats_dict, eid_from_url
def url_count_from_stats(stats_dict):
    """Group the hits of the 'SIDER' section by entity eid.

    Returns a ``(normal, rdf)`` pair of dictionaries. Each one maps an eid to
    the list of ``(hits, url)`` tuples found for it; URLs containing 'rdf' go
    to the second dictionary, every other URL to the first one. URLs for which
    ``eid_from_url`` yields a false value are ignored.
    """
    plain_hits = {}
    rdf_hits = {}
    for record in stats_dict['SIDER'].values():
        url = record[SECTIONSPEC['SIDER'][0]]
        hits = int(record[SECTIONSPEC['SIDER'][1]])
        eid = eid_from_url(url)
        if eid:
            # pick the bucket first, then append to the eid's list
            bucket = rdf_hits if 'rdf' in url else plain_hits
            bucket.setdefault(eid, []).append((hits, url))
    return plain_hits, rdf_hits
class UpdateWebstatsCommand(Command):
    """ Update web stats """
    # NOTE: the docstring above is left untouched on purpose, the command
    # framework may display it as the command's help text.
    #
    # The command reads the awstats data file of a month (first optional
    # argument after the instance name, formatted MM/YYYY; current month when
    # omitted) and creates or updates one Hits entity per
    # (entity, hit type, period).
    arguments = '<instance>'
    name = 'update-webstats'
    min_args = 1

    def get_current_stats_period(self, session, chosendate):
        """Return the StatPeriod entity covering the month of `chosendate`.

        The period is looked up by its start / stop dates and created when no
        such period exists yet.
        """
        start = first_day(chosendate)
        end = last_day(start)
        # values are given as query arguments instead of being interpolated
        # into the RQL string
        rset = session.execute(
            'Any P WHERE P is StatPeriod, P start %(start_date)s, P stop %(end_date)s',
            {'start_date': start, 'end_date': end})
        if rset:
            return rset.get_entity(0, 0)
        return session.create_entity('StatPeriod', start=start, stop=end)

    def update_stats(self, session, args):
        """Create or update the Hits entities from the awstats data file.

        :param session: repository session used to read and write entities
        :param args: remaining command line arguments; when not empty, the
          first one is the month to process, formatted ``MM/YYYY``

        A summary dictionary (updated / created / unchanged / skipped counts)
        is printed at the end.
        """
        if args:
            chosendate = datetime.strptime(args[0], '%m/%Y')
        else:
            chosendate = datetime.now()
        awstatsdir = session.vreg.config.get('awstats-dir', '/var/lib/awstats')
        domain = session.vreg.config.get('awstats-domain', '')
        filename = 'awstats%s%s.txt' % (chosendate.strftime('%m%Y'),
                                        domain and '.%s' % domain)
        stats_dict = extract_stats_dict(awstatsdir, filename)
        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
        # names of the entity types that may be the object of "stats_about"
        rset = session.execute('Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N')
        allowed_types = set(item[0] for item in rset)
        update_stats = {'updated': 0,
                        'created': 0,
                        'exists no change': 0,
                        'skipped': 0,
                        }
        lookup_rql = ('Any X,V WHERE X is Hits, X count V, X hit_type %(hit_type)s, '
                      'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s')
        # the period is the same for every entity: fetch (or create) it only
        # once, and only if there is at least one entity to process
        stats_period = None
        for count_dict, is_rdf in ((normal_dict, False),
                                   (rdf_dict, True)):
            hit_type = u'rdf' if is_rdf else u'normal'
            for eid, values in count_dict.items():
                total_hits = sum(item[0] for item in values)
                if stats_period is None:
                    stats_period = self.get_current_stats_period(session, chosendate)
                # NOTE(review): eid is guessed from a URL, this lookup
                # presumably fails for an eid unknown to the repository
                entity = session.entity_from_eid(eid)
                if entity.__regid__ not in allowed_types:
                    update_stats['skipped'] += 1
                    continue
                rset = session.execute(lookup_rql, {'e': eid,
                                                    'sp': stats_period.eid,
                                                    'hit_type': hit_type})
                if rset:
                    if rset[0][1] != total_hits:
                        print('update %s' % (entity,))
                        update_stats['updated'] += 1
                        session.execute('SET X count %(hits)s WHERE X eid %(e)s',
                                        {'e': rset[0][0],
                                         'hits': total_hits})
                    else:
                        print('no change %s' % (entity,))
                        update_stats['exists no change'] += 1
                else:
                    print('create %s' % (entity,))
                    update_stats['created'] += 1
                    session.create_entity('Hits', count=total_hits,
                                          period=stats_period,
                                          stats_about=entity,
                                          hit_type=hit_type)
        print(update_stats)

    def _init_cw_connection(self, appid):
        """Open an in-memory repository connection on instance `appid`.

        The admin credentials of the sources configuration are tried first;
        when they are missing or rejected, the user is prompted for a
        login / password until authentication succeeds.

        :return: a ``(cnx, session)`` pair
        """
        # imported here since this module does not import it at top level
        from cubicweb.server.utils import manager_userpasswd
        self.config = config = cwconfig.instance_configuration(appid)
        sourcescfg = config.sources()
        config.set_sources_mode(('system',))
        try:
            login = sourcescfg['admin']['login']
            pwd = sourcescfg['admin']['password']
        except KeyError:
            login, pwd = manager_userpasswd()
        while True:
            try:
                repo, cnx = in_memory_repo_cnx(config, login=login, password=pwd)
            except AuthenticationError:
                print('wrong user/password')
                # ask for other credentials: retrying the same ones would
                # loop forever
                login, pwd = manager_userpasswd()
            else:
                break
        session = repo._get_session(cnx.sessionid)
        # XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
        # (file a ticket)
        return cnx, session

    def main_run(self, args, rcfile=None):
        """Run the command and return status 0 if everything went fine.

        If :exc:`CommandError` is raised by the underlying command, simply log
        the error and return status 2.
        Any other exceptions, including :exc:`BadCommandUsage` will be
        propagated.
        """
        # XXX (adim): rcfile handling is spectacularly messy but I can't
        # get it right without refactoring pivotdoc for now
        if rcfile is None:
            # '-c' takes precedence over '--config'
            for flag in ('-c', '--config'):
                if flag in args:
                    position = args.index(flag) + 1
                    # a flag given without value is left to the option parser
                    if position < len(args):
                        rcfile = args[position]
                    break
        return Command.main_run(self, args, rcfile)

    def run(self, args):
        """Connect to the instance named by the first argument and update its stats."""
        appid = args.pop(0)
        # cw_cnx is unused but must stay referenced while the session is used
        cw_cnx, session = self._init_cw_connection(appid)
        session.set_pool()
        self.update_stats(session, args)
        session.commit()
# register the command on the CWCTL command line handler
CWCTL.register(UpdateWebstatsCommand)
......@@ -15,3 +15,20 @@
# with this program. If not, see <http://www.gnu.org/licenses/>.
"""cubicweb-awstats schema"""
from yams.buildobjs import (EntityType, String, Int, Date, Boolean,
SubjectRelation, RelationDefinition, RelationType)
_ = unicode
# A period of time for which hits are recorded; Hits entities point to it
# through their "period" relation.
class StatPeriod(EntityType):
    # first day of the period
    start = Date()
    # last day of the period
    stop = Date()
# A number of hits of a given type, recorded for one StatPeriod.
class Hits(EntityType):
    name = String(maxsize=128)
    # kind of hits counted
    hit_type = String(maxsize=128)
    # number of hits
    count = Int()
    # the period the count applies to
    period = SubjectRelation('StatPeriod', cardinality='?*')
    # To establish a link with the entity you want to build stats for,
    # you need to create a relationship Hits "stats_about" X.
import re
import os.path as osp
# Maps a section name to the ordered list of column names used for the
# whitespace separated values of each record of that section.
SECTIONSPEC = {
    # commented sections are not useful to view
    # 'MAP' : ['section', 'offset'],
    # 'GENERAL': ['key', None],
    'TIME': ['hour', 'pages', 'hits', 'bandwidth', 'not viewed pages', 'not viewed hits', 'not viewed bandwidth'],
    'VISITOR': ['host', 'pages', 'hits', 'bandwidth', 'last visit date', 'start date of last visit', 'last page of last visit'],
    'DAY': ['date', 'pages', 'hits', 'bandwidth', 'visits'],
    'DOMAIN': ['domain', 'pages', 'hits', 'bandwidth'],
    'LOGIN': ['cluster id', 'pages', 'hits', 'bandwidth', 'last visit date'],
    'ROBOT': ['most visiting robots', 'hits', 'bandwidth', 'last visit', 'hits on robots.txt'],
    'WORMS': ['worm id', 'hits', 'bandwidth', 'last visit'],
    'EMAILSENDER': ['email', 'hits', 'bandwidth', 'last visit'],
    'EMAILRECEIVER': ['email', 'hits', 'bandwidth', 'last visit'],
    'SESSION': ['session range', 'hits'],
    'SIDER': ['most visited URLs', 'hits', 'bandwidth', 'entry', 'exit'],
    'FILETYPES': ['served files type', 'hits', 'bandwidth', 'bandwidth without compression', 'bandwidth after compression'],
    'OS': ['operating systems', 'hits'],
    'BROWSER': ['browser id', 'hits'],
    'SCREENSIZE': ['screen size', 'hits'],
    'UNKNOWNREFERER': ['unknown referer os', 'last visit date'],
    'UNKNOWNREFERERBROWSER': ['unknown referer browser', 'last visit date'],
    'ORIGIN': ['origin', 'pages', 'hits'],
    'SEREFERRALS': ['search engine referers id', 'pages', 'hits'],
    'PAGEREFS': ['external page referers', 'pages', 'hits'],
    'SEARCHWORDS': ['main search keyphrases', 'hits'],
    'KEYWORDS': ['main search keyword', 'hits'],
    #'MISC': ['misc id', 'pages', 'hits', 'bandwidth'],
    'ERRORS': ['errors', 'hits', 'bandwidth'],
    'CLUSTER': ['cluster id', 'pages', 'hits', 'bandwidth'],
    'SIDER_404': ['urls with 404 errors', 'hits', 'last url referer'],
    }
def extract_stats_dict(awstats_dir, filename):
    '''Extract structured data from an awstats data file.

    :param awstats_dir: directory holding the data file
    :param filename: name of the data file inside `awstats_dir`
    :return: a dictionary mapping each section listed in ``SECTIONSPEC`` and
      found in the file to its records, themselves keyed by the first value
      of their line::

        {'SIDER': {
            '/someurl': {
                'most visited URLs': '/someurl',
                'hits': '1234',
                'bandwidth': '4321',
                'entry': '12',
                'exit': '8'
                },
            ...
            }
        }

    :raise IOError: when the file cannot be opened

    A section starts with a ``BEGIN_<NAME> <number of lines>`` line. Its
    records are read until the declared number of lines has been consumed or
    an ``END_`` line is met, whichever comes first. All values are kept as
    strings.
    '''
    stats_dict = {}
    section_name = None   # section being read, None outside of known sections
    remaining = 0         # lines of the current section still to be read
    # the context manager closes the file, even when parsing fails
    with open(osp.join(awstats_dir, filename)) as stream:
        for line in stream:
            if line.startswith('BEGIN_'):
                name, nb_of_lines = line.split('_', 1)[1].split()
                if name in SECTIONSPEC:
                    section_name = name
                    # every declared line is read: starting the countdown at
                    # nb_of_lines - 1 made the last record of each section
                    # unreachable
                    remaining = int(nb_of_lines)
                    stats_dict.setdefault(section_name, {})
                else:
                    # sections without specification are skipped
                    section_name = None
                    remaining = 0
            elif line.startswith('END_'):
                section_name = None
                remaining = 0
            elif section_name and remaining > 0:
                values = line.split()
                if values:
                    columns = SECTIONSPEC[section_name]
                    record = stats_dict[section_name].setdefault(values[0], {})
                    for index, value in enumerate(values):
                        try:
                            record[columns[index]] = value
                        except IndexError:
                            # more values than specified columns: report them
                            print('%s %s %s' % (index, value, line))
                remaining -= 1
            elif section_name:
                # declared number of lines exhausted
                section_name = None
    return stats_dict
def eid_from_url(value):
    '''Return the eid found in a URL, or None.

    Each known URL pattern is searched in turn in `value`; the first one
    whose first group is an integer gives the eid.
    '''
    # FIXME - should use url_resolver for a more serious guess
    # FIXME - BNF specific right now
    for pattern in (r'/(\d+)/(.*?)/',
                    r'/(.*?)/(.*?)/(fr|en|es).html',
                    r'/(.*?)/(.*?)/rdf.(xml|n3|nt)'):
        match = re.search(pattern, value)
        if match and match.group(1):
            try:
                return int(match.group(1))
            except ValueError:
                # the first path segment is not a number: try the next pattern
                continue
    return None
......@@ -15,3 +15,22 @@
# with this program. If not, see <http://www.gnu.org/licenses/>.
"""cubicweb-awstats views/forms/actions/components for web ui"""
from cubicweb.web.views import primary
from cubicweb.selectors import is_instance
class StatPeriodPrimaryView(primary.PrimaryView):
    """Primary view of a StatPeriod: a title, then a table of its hits."""
    __select__ = is_instance('StatPeriod')

    def cell_call(self, row, col):
        """Write the period title followed by the hits recorded for it.

        The table lists the target of each hit, the hit type and the count,
        ordered by decreasing count.
        """
        period = self.cw_rset.get_entity(row, col)
        title = u'<h1>%s %s - %s</h1>' % (_('Stats for period :'), period.start, period.stop)
        self.w(title)
        # TODO - could loop over hit_type and make tabs
        hits_rql = 'Any X, T, C ORDERBY C DESC WHERE H is Hits, H stats_about X, H hit_type T, H count C, H period P, P eid %(e)s'
        hits = self._cw.execute(hits_rql, {'e': period.eid})
        table = self._cw.view('table', hits)
        self.w(table)
def registration_callback(vreg):
    """Register the view defined in this module on `vreg`."""
    for appobject in (StatPeriodPrimaryView,):
        vreg.register(appobject)
# copyright 2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
"""cubicweb-awstats views/forms/actions/components for web ui"""
from cubicweb.web.views import primary
from cubicweb.selectors import is_instance
class StatPeriodPrimaryView(primary.PrimaryView):
    """Primary view of a StatPeriod: a title, then a table of its hits."""
    __select__ = is_instance('StatPeriod')

    def cell_call(self, row, col):
        """Write the period title followed by the hits recorded for it.

        The table lists the target of each hit, the hit type and the count,
        ordered by decreasing count.
        """
        period = self.cw_rset.get_entity(row, col)
        title = u'<h1>%s %s - %s</h1>' % (_('Stats for period :'), period.start, period.stop)
        self.w(title)
        # TODO - could loop over hit_type and make tabs
        # TODO - facets ?
        hits_rql = 'Any X, T, C ORDERBY C DESC WHERE H is Hits, H stats_about X, H hit_type T, H count C, H period P, P eid %(e)s'
        hits = self._cw.execute(hits_rql, {'e': period.eid})
        table = self._cw.view('table', hits, 'null')
        self.w(table)
def registration_callback(vreg):
    """Register the view defined in this module on `vreg`."""
    for appobject in (StatPeriodPrimaryView,):
        vreg.register(appobject)
# copyright 2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
"""cubicweb-awstats startup views """
import os
import os.path as osp
import re
from datetime import datetime
from logilab.mtconverter import xml_escape
from logilab.common.textutils import BYTE_UNITS
from cubicweb.view import StartupView
from cubicweb.web.views import forms
from cubicweb.web.formfields import StringField
from cubicweb.web import formwidgets as fwdgs
from cubicweb import tags, Unauthorized
from cubes.awstats.utils import SECTIONSPEC, extract_stats_dict
# Directory listed to find the available awstats data files and from which
# they are read.
# FIXME - find a clean way to attach or pass this to the form so that it is
# available when the form choices are computed
AWSTATS_DIR = '/var/lib/awstats'
#AWSTATS_DIR = '/home/arthur/local/awstats'
#AWSTATS_DIR = self._cw.vreg.config['awstats-dir']
def extract_available_months(form, **attrs):
    """Return the months having a data file for the selected domain.

    The domain is read from the 'domain' parameter of the form's request
    (empty string when absent). Months are the 6 digits found in the file
    names (``awstats<MMYYYY>[.<domain>].txt``, as built by the 'awstats'
    view) and are returned in chronological order.
    """
    selected_domain = form.req.form.get('domain', '')
    # the domain comes from the request: it is escaped to be matched literally
    # and the pattern is anchored so that only whole file names match
    pattern = re.compile(r'^awstats(\d{6})\.?%s\.txt$' % re.escape(selected_domain))
    months = []
    for filename in os.listdir(AWSTATS_DIR):
        match = pattern.search(filename)
        if match:
            months.append(match.group(1))
    # MMYYYY strings do not sort chronologically as is: compare the year
    # first, then the month
    months.sort(key=lambda month: (month[2:], month[:2]))
    return months
def extract_available_domains(form, **attrs):
    """Return the sorted list of domains having a data file in AWSTATS_DIR.

    The domain is whatever stands between the date and the extension in the
    file name; it is the empty string for files without domain.
    """
    found = set()
    for filename in os.listdir(AWSTATS_DIR):
        match = re.search('awstats(\d{2})(\d{4})\.?(.*).txt', filename)
        if match:
            found.add(match.group(3))
    return sorted(found)
def use_as_sort_key(value):
    """Return `value` converted to an integer when possible, `value` itself otherwise."""
    try:
        as_number = int(value)
    except ValueError:
        # not a number: sort on the raw value
        return value
    return as_number
def specific_format(header, value):
    """Return the HTML representation of a value read from a data file.

    :param header: name of the column the value belongs to
    :param value: the raw value, as a string, or None
    :return: None when `value` is None, else a string which can be inserted
      in a page:

      * a 'bandwidth' value is converted to a human readable size
      * an http(s) URL becomes a link
      * a 14 digits timestamp is displayed as ``dd/mm/YYYY HH:MM`` and a
        8 digits date as ``dd/mm/YYYY``
      * anything else, including digits which are not a valid date, is
        XML escaped
    """
    if value is None:
        return None
    if header == 'bandwidth':
        try:
            return convert_to_bytes(int(value))
        except ValueError:
            # not a number: display the raw value
            return xml_escape(value)
    if value.startswith(('http://', 'https://')):
        # values come from the web server logs: never emit them unescaped
        escaped = xml_escape(value)
        return '<a href="%s">%s</a>' % (escaped, escaped)
    if re.search(r'^\d{14}$', value):
        try:
            return datetime.strptime(value, '%Y%m%d%H%M%S').strftime('%d/%m/%Y %H:%M')
        except ValueError:
            pass
    elif re.search(r'^\d{8}$', value):
        try:
            return datetime.strptime(value, '%Y%m%d').strftime('%d/%m/%Y')
        except ValueError:
            pass
    return xml_escape(value)
def convert_to_bytes(value):
    """Return a byte count as text, using the biggest fitting unit of BYTE_UNITS.

    :param value: the number of bytes, as an integer
    :return: a ``'<quantity> <unit label>'`` string where the quantity is
      the integer part of the count expressed in that unit. A count smaller
      than every unit (0 for instance) is displayed as 0 of the smallest
      unit instead of returning None.
    """
    units = sorted(((size, label) for label, size in BYTE_UNITS.items()),
                   reverse=True)
    for size, label in units:
        quantity = value // size
        if quantity != 0:
            return '%s %s' % (quantity, label)
    if not units:
        return '%s' % value
    return '0 %s' % units[-1][1]
class AwstatsRefreshForm(forms.FieldsForm):
    """Form selecting what is displayed by the 'awstats' view.

    Its action targets the 'awstats' view, and each select widget submits
    the form as soon as its value changes.
    """
    __regid__ = 'select-awstats'
    action = '/?vid=awstats'
    # domains are computed from the files found in AWSTATS_DIR
    domain = StringField(widget=fwdgs.Select(attrs={'onchange':'this.form.submit()'}),
                         label=_('Domain:'),
                         choices=extract_available_domains)
    # months available for the selected domain
    month = StringField(widget=fwdgs.Select(attrs={'onchange':'this.form.submit()'}),
                        label=_('Period:'),
                        choices=extract_available_months)
    limit = StringField(widget=fwdgs.Select(attrs={'onchange':'this.form.submit()'}),
                        label=_('Number of results :'),
                        choices=[10,25,50,100])
    # the empty choice means no particular section
    section = StringField(widget=fwdgs.Select(attrs={'onchange':'this.form.submit()'}),
                          label=_('Show section :'),
                          choices=['',]+SECTIONSPEC.keys())
    form_buttons = [fwdgs.SubmitButton(label=_('Apply'))]
class AwstatsView(StartupView):
    """Startup view displaying the content of an awstats data file as tables.

    The request parameters 'domain', 'month', 'limit' and 'section' (see the
    'select-awstats' form) select the file to read and what is displayed.
    """
    __regid__ = 'awstats'

    def call(self):
        """Render the selection form, then the requested statistics."""
        req = self._cw
        form = req.vreg['forms'].select('select-awstats', req)
        form.render(w=self.w)
        domain = req.form.get('domain', '')
        try:
            limit = max(int(req.form.get('limit', 10)), 0)
        except (TypeError, ValueError):
            # not a number: use the default
            limit = 10
        months = extract_available_months(form)
        month = req.form.get('month') or (months[0] if months else None)
        stats_dict = None
        if month:
            stats_dict = self._load_stats(month, domain)
        if stats_dict is None and months and month != months[0]:
            # no file for the requested month: use the first available one
            month = months[0]
            stats_dict = self._load_stats(month, domain)
        if stats_dict is None:
            self.w(u'<p>%s</p>' % _('No statistics available'))
            return
        # domain and month come from the request: escape them
        self.w(u'<h1>%s : %s</h1>' % (_('Domain'), xml_escape(domain or 'default')))
        period = '%s/%s' % (month[:2], month[2:])
        self.w(u'<h2>%s : %s</h2>' % (_('Time period'), xml_escape(period)))
        section = req.form.get('section')
        if section:
            self.generic_table(section, stats_dict, limit)
        else:
            self.render_navigation(stats_dict)
            for key in SECTIONSPEC:
                self.generic_table(key, stats_dict, limit)
                #if value[1] == 'hits':
                #    self.simple_hits_display(key, stats_dict, limit)

    def _load_stats(self, month, domain):
        """Return the data of the file of `month` and `domain`, None if it can't be read."""
        filename = 'awstats%s%s.txt' % (month, domain and '.%s' % domain)
        # month and domain come from the request: refuse anything leading
        # out of AWSTATS_DIR
        if osp.basename(filename) != filename:
            return None
        try:
            return extract_stats_dict(AWSTATS_DIR, filename)
        except IOError:
            return None

    def render_navigation(self, stats_dict):
        """Write a list of links to the anchors of the non empty sections."""
        self.w(u'<div>')
        # FIXME - have inline list using css (better : in cubicweb)
        self.w(u'<ul style="list-style-type: none;">')
        for key in SECTIONSPEC:
            # sections missing from the file are not in stats_dict
            if stats_dict.get(key):
                self.w(u'<li><a href="#%s">%s</a></li>' % (key, key))
        self.w(u'</ul>')
        self.w(u'</div>')

    @staticmethod
    def _sort_weight(item, order_key):
        """Return the integer value of `item[order_key]`, 0 if missing or not a number."""
        try:
            return int(item[order_key])
        except (KeyError, ValueError):
            return 0

    def generic_table(self, section_name, stats_dict, limit):
        """Write the table of at most `limit` records of a section.

        Records are ordered by decreasing 'hits', or by decreasing second
        column for sections without hits. Nothing is written for an unknown
        or empty section.
        """
        columns = SECTIONSPEC.get(section_name)
        records = stats_dict.get(section_name)
        if not columns or not records:
            return
        self.w(u'<a name="%s"/>' % section_name)
        self.w(u'<div><table class="listing">')
        self.w(u'<tr class="header">')
        for header in columns:
            self.w(u'<th>%s</th>' % header)
        self.w(u'</tr><tbody>')
        order_key = 'hits' if 'hits' in columns else columns[1]
        ordered_values = sorted(
            records.values(),
            key=lambda item: self._sort_weight(item, order_key),
            reverse=True)
        # slicing gives exactly `limit` rows
        for item in ordered_values[:limit]:
            self.w(u'<tr>')
            for header in columns:
                cell = specific_format(header, item.get(header))
                # a missing value gives an empty cell
                self.w(u'<td>%s</td>' % (u'' if cell is None else cell))
            self.w(u'</tr>')
        self.w(u'</tbody></table></div><br/>')

    def simple_hits_display(self, section_name, stats_dict, limit):
        """Write the `limit` most hit records of a section as a nested list."""
        key = SECTIONSPEC[section_name][0]
        value_list = [(use_as_sort_key(item['hits']), item[key])
                      for item in stats_dict.get(section_name, {}).values()
                      if 'hits' in item]
        if value_list:
            value_list.sort(reverse=True)
            itemlist = ''.join(['<li>%s (%s)</li>' % (xml_escape(item[1]), item[0])
                                for item in value_list[:limit]])
            self.w(u'<ul><li><h2>%s (%s)</h2></li><ul> %s</ul></ul>' % (key, len(value_list), itemlist))
def registration_callback(vreg):
    """Register the view and the form defined in this module on `vreg`."""
    for appobject in (AwstatsView, AwstatsRefreshForm):
        vreg.register(appobject)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment