# -*- coding: utf-8 -*-
"""update-webstats cubicweb plugin

Usage: cubicweb-ctl update-webstats [options] <instance-name> [startdate [stopdate]]

This command will generate webstats objects for all linked document types.
"""

import os.path as osp
import sys
from datetime import datetime, timedelta

from logilab.common.date import first_day, last_day, date_range, ONEDAY
from logilab.common.shellutils import ProgressBar

from cubicweb import cwconfig, UnknownEid
from cubicweb import AuthenticationError
from cubicweb.dbapi import in_memory_repo_cnx
from cubicweb.toolsutils import Command
from cubicweb.cwctl import CWCTL

from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
     get_or_create_statperiod, compress_old_hits
25

26
def url_count_from_stats(session, stats_dict):
    """Group the hit counts of the awstats ``SIDER`` section by entity eid.

    :param session: session handed to ``eid_from_url`` to resolve each url
    :param stats_dict: dictionary generated from an awstats txt file; only
                       its ``'SIDER'`` entry is read here

    :return: a ``(normal, rdf)`` pair of dictionaries, each mapping an eid to
             a list of ``(hits, url)`` tuples. Urls containing the substring
             ``'rdf'`` go to the second dictionary, every other url to the
             first one. Both are empty when ``stats_dict`` has no ``'SIDER'``
             section.
    """
    if 'SIDER' not in stats_dict:
        return {}, {}
    # positions of the url and hits columns, as declared by SECTIONSPEC
    url_key = SECTIONSPEC['SIDER'][0]
    hits_key = SECTIONSPEC['SIDER'][1]
    visit_count_dict = {}
    visit_count_rdf_dict = {}
    for item in stats_dict['SIDER'].values():
        url = item[url_key]
        hits = int(item[hits_key])
        eid = eid_from_url(session, url)
        if not eid:
            # url could not be mapped to an entity: ignore it
            continue
        if 'rdf' in url:
            target = visit_count_rdf_dict
        else:
            target = visit_count_dict
        target.setdefault(eid, []).append((hits, url))
    return visit_count_dict, visit_count_rdf_dict


52
53
54
55
56
57
58
59
60
61
62
63
64
def parse_input_date(date, periodicity):
    """Parse a date given on the command line.

    :param date: the date string to parse
    :param periodicity: one of ``'month'`` (``05/2011``), ``'day'``
                        (``15/05/2011``) or ``'hour'`` (``15/05/2011-13h``);
                        it selects the expected input format

    :return: the corresponding ``datetime``, or ``None`` (after printing an
             error message) when ``date`` does not match the expected format
    :raise KeyError: if ``periodicity`` is not one of the three values above
    """
    input_formats = {'month': '%m/%Y',
                     'day': '%d/%m/%Y',
                     'hour': '%d/%m/%Y-%Hh'}
    try:
        return datetime.strptime(date, input_formats[periodicity])
    except ValueError:
        # single-argument parenthesised form: same output on python 2 and 3
        print('Error : %s not a proper date' % date)
        return None


def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
    """Yield the items of ``iterable`` while driving a ``ProgressBar``.

    :param iterable: the items to iterate on
    :param nb_ops: expected number of items; it must be set if ``iterable``
                   doesn't support the length protocol, since it defaults to
                   ``len(iterable)``
    :param pb_size: ``size`` argument given to the ``ProgressBar``
    :param pb_title: ``title`` argument given to the ``ProgressBar``

    The bar is updated once before each item is yielded and finished once the
    iterable is exhausted.
    """
    if nb_ops is None:
        nb_ops = len(iterable)
    pb = ProgressBar(nb_ops, size=pb_size, title=pb_title)
    for item in iterable:
        pb.update()
        yield item
    pb.finish()


class StatsUpdater(object):
    """Create or update ``Hits`` entities from awstats txt files.

    The updater walks the dates between ``start`` and ``stop``, reads the
    awstats file matching each date and stores one ``Hits`` entity per
    (entity, period, hit type) triple. Already stored hits of the requested
    range are loaded once at construction time and kept in ``all_hits``.
    """

    def __init__(self, session, start, stop):
        """Store the settings and build the cache of existing ``Hits``.

        :param session: session used for every query and entity creation;
                        its ``vreg.config`` provides the ``awstats-*`` options
        :param start: lower bound of the date range to import
        :param stop: upper bound of the date range to import
        """
        self.session = session
        self.config = session.vreg.config
        self.start = start
        self.stop = stop
        # entity types that may be the object of the `stats_about` relation
        self.allowed_etypes = frozenset(
            eschema.type for eschema in
            session.vreg.schema.rschema('stats_about').objects())
        # cache: (target eid, period eid, hit type) -> Hits entity
        self.all_hits = {}
        hits_rset = session.execute('Any H,HC,HT,E,P,PSA,PSO WHERE '
                                    'H is Hits, H count HC, H hit_type HT, '
                                    'H stats_about E, H period P, P start PSA, P stop PSO '
                                    'HAVING (PSA >= %(start)s, PSO <= %(stop)s) ',
                                    {'start':start,
                                     'stop':stop})
        for hit in track_progress(hits_rset.entities(), nb_ops=len(hits_rset),
                                  pb_size=62, pb_title='Building cache'):
            hit_key = (hit.stats_about[0].eid, hit.period[0].eid, hit.hit_type)
            self.all_hits[hit_key] = hit

    ## internal utilities #####################################################
    def awstats_filepath(self, date):
        """Return the path of the awstats txt file holding stats for ``date``.

        The file name is ``awstats<date>[.<domain>].txt`` where ``<date>`` is
        formatted according to the ``awstats-periodicity`` option and the
        ``.<domain>`` part is only present when ``awstats-domain`` is set.
        The file is looked up in the ``awstats-dir`` directory.
        """
        config = self.config
        date_formats = {'month': '%m%Y',
                        'day': '%m%Y%d',
                        'hour':'%m%Y%d%H'}
        domain = config['awstats-domain']
        if domain:
            domain_ext = '.' + domain
        else:
            domain_ext = ''
        filename = 'awstats%s%s.txt' % (
            date.strftime(date_formats[config['awstats-periodicity']]),
            domain_ext)
        return osp.join(config['awstats-dir'], filename)

    def stats_period_for_date(self, chosendate, stats_report):
        """Return the stat period containing ``chosendate``.

        The period boundaries depend on the ``awstats-periodicity`` option:
        the whole month, the whole day (00:00:00 to 23:59:59) or the whole
        hour (xx:00:00 to xx:59:59) of ``chosendate``. The period itself is
        obtained from ``get_or_create_statperiod``, which also receives
        ``stats_report``.

        :raise ValueError: if the configured periodicity is not one of
                           ``'month'``, ``'day'`` or ``'hour'``
        """
        periodicity = self.config['awstats-periodicity']
        if periodicity == 'month':
            start = first_day(chosendate)
            stop = last_day(start)
        elif periodicity == 'day':
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, 23, 59, 59)
        elif periodicity == 'hour':
            start = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour)
            stop = datetime(chosendate.year, chosendate.month, chosendate.day, chosendate.hour, 59, 59)
        else:
            # fail with an explicit message rather than with an unbound
            # local variable error on `start` below
            raise ValueError('unsupported awstats-periodicity: %r'
                             % (periodicity,))
        return get_or_create_statperiod(self.session, start, stop, stats_report)

    ## update API #############################################################
    def update_stats(self, skip_compress=False):
        """Parse awstats files and create or update the corresponding
        data in the cubicweb instance.

        Every date of ``date_range(self.start, self.stop)`` is imported, then
        the session is committed.

        :param skip_compress: when true, ``compress_old_hits`` is not called
                              once the import is done

        :return: the report dictionary, whose counters (``'updated'``,
                 ``'created'``, ``'exists no change'``, ``'skipped'``,
                 ``'periods'``, ``'compressed'``) all start at 0
        """
        stats_report = dict.fromkeys(('updated', 'created', 'exists no change',
                                      'skipped', 'periods', 'compressed'), 0)
        for chosendate in track_progress(date_range(self.start, self.stop),
                                         (self.stop-self.start).days,
                                         pb_size=70, pb_title='Import'):
            self._update_stats_for_date(chosendate, stats_report)
        if not skip_compress:
            compress_old_hits(self.session, stats_report)
        self.session.commit()
        return stats_report

    def _update_stats_for_date(self, chosendate, stats_report):
        """Import the awstats file of ``chosendate`` and update the counters
        of ``stats_report`` with the outcome of each processed entity.
        """
        stats_dict = extract_stats_dict(self.awstats_filepath(chosendate))
        stats_period = self.stats_period_for_date(chosendate, stats_report)
        normal_dict, rdf_dict = url_count_from_stats(self.session, stats_dict)
        for count_dict, hit_type in ((normal_dict, u'normal'),
                                     (rdf_dict, u'rdf')):
            for eid, values in count_dict.items():
                status = self._update_hits_for_eid(eid, values,
                                                   stats_period, hit_type)
                stats_report[status] += 1

    def _update_hits_for_eid(self, eid, values, stats_period, hit_type):
        """Store the total hit count of entity ``eid`` for ``stats_period``.

        :param eid: eid of the entity the hits are about
        :param values: sequence of items whose first element is a hit count
        :param stats_period: the period entity the hits belong to
        :param hit_type: value of the ``hit_type`` attribute of the ``Hits``

        :return: the report key describing what happened: ``'skipped'``
                 (unknown eid or entity type not allowed), ``'created'``,
                 ``'updated'`` or ``'exists no change'``
        """
        total_hits = sum(item[0] for item in values)
        try:
            entity = self.session.entity_from_eid(eid)
        except UnknownEid:
            return 'skipped'
        if entity.__regid__ not in self.allowed_etypes:
            return 'skipped'
        hit_key = (eid, stats_period.eid, hit_type)
        try:
            hit = self.all_hits[hit_key]
        except KeyError: # no hit yet, create one
            status = 'created'
            hit = self.session.create_entity('Hits', count=total_hits, hit_type=hit_type,
                                             period=stats_period, stats_about=entity)
            # append it to the cache
            self.all_hits[hit_key] = hit
        else:
            if hit.count != total_hits:
                status = 'updated'
                hit.set_attributes(count=total_hits)
            else:
                status = 'exists no change'
        return status


180
class UpdateWebstatsCommand(Command):
    """ Update cubicweb web stats from awstats processed files.

    If startdate is not entered, the update will be done on the previous
    day or the previous month. If only startdate is entered, the day or
    month will be processed. If both dates are entered, all the dates
    between these two dates will be processed.

    According to periodicity setting the input format for the date is
    different :

      * month 05/2011
      * day   15/05/2011
      * hour  15/05/2011-13h (not implemented yet)
    """

    arguments = '<instance> [startdate [stopdate]]'
    name = 'update-webstats'
    min_args = 1
    max_args = 3
    options = [
        ("skip-compress", {"action": 'store_true',
                           'help' : u'Skip the compression of old daily hits into month stats'}),
        ("today", {"action": 'store_true',
                   'help' : u'Process stats for the current day (for testing)'}),
        ]

    ## command / initial setup API ############################################
    def _init_cw_connection(self, appid):
        """Open an in-memory repository connection on instance ``appid``.

        The admin login / password found in the instance sources
        configuration are tried first. When they are missing, or when they
        are rejected, credentials are asked interactively until the
        authentication succeeds.

        :return: a ``(cnx, session)`` pair; the caller must keep a reference
                 on ``cnx`` for as long as ``session`` is used
        """
        # NOTE(review): `manager_userpasswd` was used here without being
        # imported anywhere in the module; cubicweb is expected to provide it
        # in `cubicweb.server.utils` -- confirm against the installed version.
        # Imported lazily since it is only needed for the interactive fallback.
        from cubicweb.server.utils import manager_userpasswd
        config = cwconfig.instance_configuration(appid)
        sourcescfg = config.sources()
        config.set_sources_mode(('system',))
        try:
            login = sourcescfg['admin']['login']
            pwd = sourcescfg['admin']['password']
        except KeyError:
            login = pwd = None
        while True:
            if login is None:
                login, pwd = manager_userpasswd()
            try:
                repo, cnx = in_memory_repo_cnx(config, login=login, password=pwd)
            except AuthenticationError:
                print('wrong user/password')
                # do not retry the same rejected credentials forever:
                # ask for new ones on the next iteration
                login = pwd = None
            else:
                break
        session = repo._get_session(cnx.sessionid)
        # XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
        return cnx, session

    def run(self, args):
        """Import the awstats data of the requested dates and print a report.

        :param args: ``[appid]``, ``[appid, start]`` or
                     ``[appid, start, stop]``; the instance id is popped from
                     the list

        Without a start date, the current day (``--today`` option) or the
        previous day is processed. Without a stop date, only the start date
        is processed. The process exits with status 1 when a given date
        cannot be parsed.
        """
        # args = (appid, start[, stop])
        appid = args.pop(0)
        # cw_cnx is unused but must stay referenced while session is in use
        cw_cnx, session = self._init_cw_connection(appid)
        session.set_cnxset()
        periodicity = session.vreg.config['awstats-periodicity']
        if len(args) > 0:
            start = parse_input_date(args[0], periodicity)
            if start is None:
                sys.exit(1) # parse_input_date failed to parse date
        else:
            if self.config.today:
                chosendate = datetime.now()
            else:
                chosendate = datetime.now()-timedelta(1)
            start = datetime(chosendate.year, chosendate.month, chosendate.day)
        if len(args) > 1:
            stop = parse_input_date(args[1], periodicity)
            if stop is None:
                sys.exit(1) # parse_input_date failed to parse date
        else:
            stop = start
        stop += ONEDAY # date_range() excludes stop boundary
        stats_updater = StatsUpdater(session, start, stop)
        stats_report = stats_updater.update_stats(self.config.skip_compress)
        print('''=== Update Report ===
Number of periods imported :             %(periods)s
Number of stat objects created :         %(created)s
Number of stat objects updated :         %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped :         %(skipped)s
Number of stat objects compressed :      %(compressed)s
        ''' % stats_report)
261
262

# Register the `update-webstats` command class on the CWCTL command handler.
CWCTL.register(UpdateWebstatsCommand)