ccplugin.py 10.9 KB
Newer Older
1
2
3
# -*- coding: utf-8 -*-
"""update-webstats cubicweb plugin

Usage: cubicweb-ctl update-webstats [options] <instance-name> [startdate [stopdate]]

This command will generate webstats objects for all linked document types.
"""

9
import os.path as osp
import sys
from datetime import datetime, timedelta

from logilab.common.date import first_day, last_day, date_range, ONEDAY
from logilab.common.shellutils import ProgressBar

from cubicweb import cwconfig, UnknownEid
from cubicweb import AuthenticationError
from cubicweb.dbapi import in_memory_repo_cnx

from cubicweb.toolsutils import Command
from cubicweb.cwctl import CWCTL

from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
     get_or_create_statperiod, compress_old_hits
24

25

Arthur Lutz's avatar
Arthur Lutz committed
26
def url_count_from_stats(cnx, stats_dict):
    '''Group the visited urls of an awstats report by target entity.

    Reads the 'SIDER' section of `stats_dict` (as generated from an awstats
    txt file), resolves each url to an eid with `eid_from_url` and collects
    the `(hits, url)` pairs per eid.

    :param cnx: connection used to build the request given to `eid_from_url`
    :param stats_dict: dictionary of awstats sections
    :return: two dictionaries with eid as key and a list of `(hits, url)`
      tuples as value: the first for normal navigation, the second for rdf
      navigation (urls containing the substring 'rdf'). Both are empty when
      `stats_dict` has no 'SIDER' section.
    '''
    if 'SIDER' not in stats_dict:
        return {}, {}
    visit_count_dict = {}
    visit_count_rdf_dict = {}
    for item in stats_dict['SIDER'].values():
        url = item[SECTIONSPEC['SIDER'][0]]
        hits = int(item[SECTIONSPEC['SIDER'][1]])
        # a fresh request is built for every url, as the original code did
        req = cnx.request()
        eid = eid_from_url(req, url)
        if not eid:
            # url could not be resolved to an entity: ignore it
            continue
        if 'rdf' in url:
            target = visit_count_rdf_dict
        else:
            target = visit_count_dict
        target.setdefault(eid, []).append((hits, url))
    return visit_count_dict, visit_count_rdf_dict


53
54
55
56
57
58
59
60
61
62
63
64
65
def parse_input_date(date, periodicity):
    """Parse a command line date according to the configured periodicity.

    Expected formats are:

    * 'month': ``%m/%Y`` (e.g. 05/2011)
    * 'day': ``%d/%m/%Y`` (e.g. 15/05/2011)
    * 'hour': ``%d/%m/%Y-%Hh`` (e.g. 15/05/2011-13h)

    :param date: the date string to parse
    :param periodicity: one of 'month', 'day' or 'hour'
    :return: the parsed `datetime`, or None (after printing an error
      message) when `date` doesn't match the expected format
    :raise KeyError: if `periodicity` is not one of the known values
    """
    input_formats = {'month': '%m/%Y',
                     'day': '%d/%m/%Y',
                     'hour': '%d/%m/%Y-%Hh'}
    try:
        return datetime.strptime(date, input_formats[periodicity])
    except ValueError:
        # parenthesised single-argument form: same output with the python 2
        # print statement and the python 3 print function
        print('Error : %s not a proper date' % date)
        return None


def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
    """Yield the items of `iterable` while updating a `ProgressBar`.

    :param iterable: the items to iterate on
    :param nb_ops: number of operations given to the progress bar; defaults
      to `len(iterable)`
    :param pb_size: `size` argument given to the progress bar
    :param pb_title: `title` argument given to the progress bar

    The bar is updated once before each item is yielded and finished once
    the iterable is exhausted.
    """
    # nb_ops must be set if iterable doesn't support the length protocol
    if nb_ops is None:
        nb_ops = len(iterable)
    pb = ProgressBar(nb_ops, size=pb_size, title=pb_title)
    for item in iterable:
        pb.update()
        yield item
    pb.finish()


class StatsUpdater(object):
    """Create or update `Hits` entities from awstats files for a date range.

    At construction time, the `Hits` entities whose period lies between
    `start` and `stop` are loaded into the `all_hits` cache, keyed by
    `(target eid, period eid, hit type)`.
    """

    def __init__(self, session, cnx, start, stop):
        """
        :param session: repository session used to query and commit
        :param cnx: connection used to build requests
        :param start: first date to process
        :param stop: date at which processing stops
        """
        self.session = session
        self.cnx = cnx
        self.config = session.vreg.config
        self.start = start
        self.stop = stop
        # entity types that may be the object of the `stats_about` relation
        self.allowed_etypes = frozenset(eschema.type for eschema in
                                        session.vreg.schema.rschema('stats_about').objects())
        self.all_hits = {}
        hits_rset = session.execute('Any H,HC,HT,E,P,PSA,PSO WHERE '
                                    'H is Hits, H count HC, H hit_type HT, '
                                    'H stats_about E, H period P, P start PSA, P stop PSO '
                                    'HAVING (PSA >= %(start)s, PSO <= %(stop)s) ',
                                    {'start':start,
                                     'stop':stop})
        for hit in track_progress(hits_rset.entities(), nb_ops=len(hits_rset),
                                  pb_size=62, pb_title='Building cache'):
            hit_key = (hit.stats_about[0].eid, hit.period[0].eid, hit.hit_type)
            self.all_hits[hit_key] = hit

    ## internal utilities #####################################################
    def awstats_filepath(self, date):
        """Return the path of the awstats txt file holding stats for `date`.

        The file name is `awstats<formatted date>[.<domain>].txt`, located in
        the 'awstats-dir' directory; the date format depends on the
        'awstats-periodicity' configuration value and the domain part is
        omitted when 'awstats-domain' is empty.
        """
        config = self.config
        date_formats = {'month': '%m%Y',
                        'day': '%m%Y%d',
                        'hour':'%m%Y%d%H'}
        domain = config['awstats-domain']
        if domain:
            domain_ext = '.' + domain
        else:
            domain_ext = ''
        filename = 'awstats%s%s.txt' % (
            date.strftime(date_formats[config['awstats-periodicity']]),
            domain_ext)
        return osp.join(config['awstats-dir'], filename)

    def stats_period_for_date(self, chosendate, stats_report):
        """Return the stat period containing `chosendate`.

        The period boundaries (month, day or hour) are computed according to
        the 'awstats-periodicity' configuration value, then handed to
        `get_or_create_statperiod` along with `stats_report`.

        :raise ValueError: if the configured periodicity is not one of
          'month', 'day' or 'hour'
        """
        periodicity = self.config['awstats-periodicity']
        if periodicity == 'month':
            start = first_day(chosendate)
            stop = last_day(start)
        elif periodicity == 'day':
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, 23, 59, 59)
        elif periodicity == 'hour':
            start = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour, 59, 59)
        else:
            # fail with an explicit message rather than an UnboundLocalError
            raise ValueError('unsupported awstats-periodicity: %r' % (periodicity,))
        return get_or_create_statperiod(self.session, start, stop, stats_report)

    ## update API #############################################################
    def update_stats(self, skip_compress=False):
        ''' parses awstats and creates or updates the corresponding
        data in the cubicweb instance

        Every date of `date_range(self.start, self.stop)` is processed.

        :param skip_compress: when true, `compress_old_hits` is not called
        :return: the report dictionary, with the 'updated', 'created',
          'exists no change', 'skipped', 'periods' and 'compressed' counters
        '''
        stats_report = dict.fromkeys(('updated', 'created', 'exists no change',
                                      'skipped', 'periods', 'compressed'), 0)
        for chosendate in track_progress(date_range(self.start, self.stop),
                                         (self.stop-self.start).days,
                                         pb_size=70, pb_title='Import'):
            self._update_stats_for_date(chosendate, stats_report)
        if not skip_compress:
            compress_old_hits(self.session, stats_report)
        self.session.commit()
        self.session.set_cnxset()
        return stats_report

    def _update_stats_for_date(self, chosendate, stats_report):
        """Import the awstats file of `chosendate` and update `stats_report`
        counters with the status of each processed entity.
        """
        stats_dict = extract_stats_dict(self.awstats_filepath(chosendate))
        stats_period = self.stats_period_for_date(chosendate, stats_report)
        normal_dict, rdf_dict = url_count_from_stats(self.cnx, stats_dict)
        for count_dict, hit_type in ((normal_dict, u'normal'),
                                     (rdf_dict, u'rdf')):
            for eid, values in count_dict.items():
                status = self._update_hits_for_eid(eid, values,
                                                   stats_period, hit_type)
                stats_report[status] += 1

    def _update_hits_for_eid(self, eid, values, stats_period, hit_type):
        """Create or update the `Hits` entity of `eid` for `stats_period`.

        :param eid: eid of the entity the hits are about
        :param values: sequence of `(hits, url)` tuples
        :param stats_period: the period entity the hits belong to
        :param hit_type: u'normal' or u'rdf'
        :return: the resulting status, one of 'skipped', 'created',
          'updated' or 'exists no change'
        """
        # NOTE(review): a commit is issued before each entity is processed;
        # presumably to keep transactions short -- confirm
        self.session.commit()
        self.session.set_cnxset()
        total_hits = sum(item[0] for item in values)
        try:
            entity = self.session.entity_from_eid(eid)
        except UnknownEid:
            return 'skipped'
        if entity.__regid__ not in self.allowed_etypes:
            return 'skipped'
        hit_key = (eid, stats_period.eid, hit_type)
        try:
            hit = self.all_hits[hit_key]
        except KeyError: # no hit yet, create one
            status = 'created'
            req = self.cnx.request()
            hit = req.create_entity('Hits', count=total_hits, hit_type=hit_type,
                                    period=stats_period, stats_about=entity)
            # append it to the cache
            self.all_hits[hit_key] = hit
        else:
            if hit.count != total_hits:
                status = 'updated'
                hit.set_attributes(count=total_hits)
            else:
                status = 'exists no change'
        return status


186
class UpdateWebstatsCommand(Command):
    """ Update cubicweb web stats from awstats processed files.

    If startdate is not entered, the update will be done on the previous
    day or the previous month. If only startdate is entered, the day or
    month will be processed. If both dates are entered, all the dates
    between these two dates will be processed.

    According to periodicity setting the input format for the date is
    different :

      * month 05/2011
      * day   15/05/2011
      * hour  15/05/2011-13h (not implemented yet)
    """

    arguments = '<instance> [startdate [stopdate]]'
    name = 'update-webstats'
    min_args = 1
    max_args = 3
    options = [
        ("skip-compress", {"action": 'store_true',
                           'help' : u'Skip the compression of old daily hits into month stats'}),
        ("today", {"action": 'store_true',
                   'help' : u'Process stats for the current day (for testing)'}),
        ]

    ## command / initial setup API ############################################
    def _init_cw_connection(self, appid):
        """Open an in-memory repository connection on instance `appid`.

        The admin login and password are read from the instance sources
        configuration; when they are missing, or rejected by the repository,
        they are asked for with `manager_userpasswd`.

        :return: a `(cnx, session)` tuple
        """
        # imported here since it is only needed when credentials have to be
        # asked for; the original code used this name without importing it
        from cubicweb.server.utils import manager_userpasswd
        config = cwconfig.instance_configuration(appid)
        sourcescfg = config.sources()
        config.set_sources_mode(('system',))
        try:
            login = sourcescfg['admin']['login']
            pwd = sourcescfg['admin']['password']
        except KeyError:
            login, pwd = manager_userpasswd()
        while True:
            try:
                repo, cnx = in_memory_repo_cnx(config, login=login, password=pwd)
            except AuthenticationError:
                print('wrong user/password')
                # ask for new credentials instead of retrying forever with
                # the ones which have just been rejected
                login, pwd = manager_userpasswd()
            else:
                break
        session = repo._get_session(cnx.sessionid)
        # XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
        cnx.use_web_compatible_requests(session.vreg.config['base-url'])
        return cnx, session

    def run(self, args):
        """Import stats for the dates given on the command line and print a
        report.

        :param args: `[appid]`, `[appid, start]` or `[appid, start, stop]`;
          the list is modified in place (appid is popped)

        Without start date, yesterday is processed (or the current day with
        the `today` option). Exits with status 1 when a given date can't be
        parsed.
        """
        # args = (appid, start[, stop])
        appid = args.pop(0)
        cw_cnx, session = self._init_cw_connection(appid)
        session.set_cnxset()
        periodicity = session.vreg.config['awstats-periodicity']
        if args:
            start = parse_input_date(args[0], periodicity)
            if start is None:
                sys.exit(1) # parse_input_date failed to parse date
        else:
            if self.config.today:
                chosendate = datetime.now()
            else:
                chosendate = datetime.now()-timedelta(1)
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
        if len(args) > 1:
            stop = parse_input_date(args[1], periodicity)
            if stop is None:
                sys.exit(1) # parse_input_date failed to parse date
        else:
            stop = start
        stop += ONEDAY # date_range() excludes stop boundary
        stats_updater = StatsUpdater(session, cw_cnx, start, stop)
        stats_report = stats_updater.update_stats(self.config.skip_compress)
        print('''=== Update Report ===
Number of periods imported :             %(periods)s
Number of stat objects created :         %(created)s
Number of stat objects updated :         %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped :         %(skipped)s
Number of stat objects compressed :      %(compressed)s
        ''' % stats_report)
268
269

# register the command on the cubicweb-ctl command line handler, under its
# `name` ('update-webstats')
CWCTL.register(UpdateWebstatsCommand)