ccplugin.py 10.1 KB
Newer Older
1
2
3
# -*- coding: utf-8 -*-
"""update-webstats cubicweb plugin

4
Usage: cubicweb-ctl update-webstats [options] <instance-name> startdate [stopdate]
5
6
7
8

This command will generate webstats objects for all linked document types.
"""

9
10
11
from datetime import datetime, timedelta
from logilab.common.date import first_day, last_day, date_range
from logilab.common.shellutils import ProgressBar
12

13
from cubicweb import cwconfig, UnknownEid
14
15
16
17
18
19
from cubicweb import AuthenticationError
from cubicweb.dbapi import in_memory_repo_cnx

from cubicweb.toolsutils import Command
from cubicweb.cwctl import CWCTL

20
21
from utils import SECTIONSPEC, extract_stats_dict, eid_from_url, \
     get_or_create_statperiod, compress_old_hits
22
23

def url_count_from_stats(stats_dict):
    """Group the hits of the most visited urls by entity eid.

    ``stats_dict`` is the dictionary generated from an awstats txt file;
    only its 'SIDER' section is read.

    Return two dictionaries keyed by eid whose values are lists of
    ``(hits, url)`` tuples: the first one for normal navigation, the
    second one for rdf navigation (urls containing 'rdf').  Both are
    empty when ``stats_dict`` has no 'SIDER' section.
    """
    normal_hits, rdf_hits = {}, {}
    if 'SIDER' in stats_dict:
        for record in stats_dict['SIDER'].values():
            spec = SECTIONSPEC['SIDER']
            url = record[spec[0]]
            hits = int(record[spec[1]])
            eid = eid_from_url(url)
            if eid:
                # urls for which no eid can be extracted are ignored
                bucket = rdf_hits if 'rdf' in url else normal_hits
                bucket.setdefault(eid, []).append((hits, url))
    return normal_hits, rdf_hits


class UpdateWebstatsCommand(Command):
    """Update cubicweb web stats from awstats processed files.

    If startdate is not entered, the update will be done on the current
    day or current month. If only startdate is entered, that day or
    month will be processed. If both dates are entered, all the dates
    between these two dates will be processed.

    According to the periodicity setting, the input format for the date
    is different:

      * month 05/2011
      * day   15/05/2011
      * hour  15/05/2011-13h (not implemented yet)
    """

    arguments = '<instance> [startdate [stopdate]]'
    name = 'update-webstats'
    min_args = 1
    max_args = 3
    options = [
        ("skip-compress", {"action": 'store_true',
                           'help' : u'Skip the compression of old daily hits into month stats'}),
        ]

    def _get_periodicity(self, session):
        """Return the validated 'awstats-periodicity' setting of the instance.

        Defaults to 'day' when the option is not set.

        :raises ValueError: if the configured value is not one of
          'hour', 'day' or 'month'.
        """
        periodicity = session.vreg.config.get('awstats-periodicity', 'day') #FIXME s/day/month/
        if periodicity not in ('hour', 'day', 'month'):
            raise ValueError('unsupported awstats-periodicity: %r' % (periodicity,))
        return periodicity

    def get_current_stats_period(self, session, chosendate):
        """Return the stat period containing `chosendate`, creating it if
        it doesn't exist yet.

        The bounds of the period depend on the configured periodicity
        (see :meth:`choose_period`).
        """
        start, stop = self.choose_period(session, chosendate)
        return get_or_create_statperiod(session, start, stop)

    def choose_period(self, session, chosendate):
        """Return the `(start, end)` bounds of the period containing
        `chosendate`, according to the configured periodicity.

        :raises ValueError: if the configured periodicity is unsupported
          (previously this ended in an UnboundLocalError).
        """
        periodicity = self._get_periodicity(session)
        if periodicity == 'month':
            start = first_day(chosendate)
            return start, last_day(start)
        year, month, day = chosendate.year, chosendate.month, chosendate.day
        if periodicity == 'day':
            return (datetime(year, month, day),
                    datetime(year, month, day, 23, 59, 59))
        # only 'hour' is left: the periodicity has been validated above
        hour = chosendate.hour
        return (datetime(year, month, day, hour),
                datetime(year, month, day, hour, 59, 59))

    def choose_dateformat(self, periodicity):
        """Return the strftime format used for the date part of the awstats
        file name, for the given periodicity ('hour', 'day' or 'month').

        :raises KeyError: if `periodicity` is none of those values.
        """
        return {'hour': '%m%Y%d%H',
                'day': '%m%Y%d',
                'month': '%m%Y'}[periodicity]

    def update_stats(self, session, args):
        """Parse awstats files and create or update the corresponding
        data in the cubicweb instance, then print a report.

        `args` holds the optional start and stop dates as strings, in the
        format matching the configured periodicity. Without any date, the
        current date is used; with only a start date, the stop date
        defaults to the start date. An unparsable date prints an error
        and aborts the update.

        :raises ValueError: if the configured periodicity is unsupported.
        """
        periodicity = self._get_periodicity(session)
        # FIXME - adapt according to periodicity
        input_format = {'month': '%m/%Y',
                        'day': '%d/%m/%Y',
                        'hour': '%d/%m/%Y-%Hh'}[periodicity]
        dates = []
        for raw_date in args[:2]:
            try:
                dates.append(datetime.strptime(raw_date, input_format))
            except ValueError:
                print('Error : %s not a proper date' % raw_date)
                return
        start = dates[0] if dates else datetime.now()
        stop = dates[1] if len(dates) > 1 else start
        # NOTE(review): 'periods' is never incremented in this class, so
        # the report may always show 0 unless a helper updates it -- confirm.
        update_stats = {'updated': 0,
                        'created': 0,
                        'exists no change': 0,
                        'skipped': 0,
                        'periods': 0,
                        'compressed': 0,
                        }
        # the range is walked day by day, whatever the periodicity is
        range_end = stop + timedelta(days=1)
        pb = ProgressBar((range_end - start).days, 70, title='Import')
        for chosendate in date_range(start, range_end):
            self.update_stats_for_date(session, chosendate, update_stats)
            pb.update()
        pb.finish()
        if not self.config.skip_compress:
            compress_old_hits(session, update_stats)
        # single-argument print(...) behaves the same on python 2 and 3
        print('''=== Update Report ===
Number of periods imported :             %(periods)s
Number of stat objects created :         %(created)s
Number of stat objects updated :         %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped :         %(skipped)s
Number of stat objects compressed :      %(compressed)s
        ''' % update_stats)

    def update_stats_for_date(self, session, chosendate, update_stats):
        """Import the awstats file matching `chosendate`.

        The file name is built from the 'awstats-domain' setting and the
        date, and looked up in the 'awstats-dir' directory (default
        /var/lib/awstats). Hits are recorded for every entity whose type
        is a target of the `stats_about` relation; the counters of the
        `update_stats` dictionary are updated in place.
        """
        stats_period = self.get_current_stats_period(session, chosendate)
        config = session.vreg.config
        dateformat_in_file = self.choose_dateformat(self._get_periodicity(session))
        domain = config.get('awstats-domain', '')
        # an unset domain must not end up as 'None' in the file name
        domain_suffix = '.%s' % domain if domain else ''
        filename = 'awstats%s%s.txt' % (chosendate.strftime(dateformat_in_file),
                                        domain_suffix)
        awstatsdir = config.get('awstats-dir', '/var/lib/awstats')
        stats_dict = extract_stats_dict(awstatsdir, filename)
        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
        rql = 'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N'
        allowed_types = [row[0] for row in session.execute(rql)]
        for count_dict, is_rdf in ((normal_dict, False),
                                   (rdf_dict, True)):
            for eid, values in count_dict.items():
                self.update_hits_for_eid(eid, values, session, update_stats,
                                         allowed_types, stats_period, is_rdf)

    def update_hits_for_eid(self, eid, values, session,  update_stats,
                            allowed_types, stats_period, is_rdf):
        """Create or update the Hits entity of entity `eid` for `stats_period`.

        `values` is a sequence of `(hits, url)` tuples whose hits are
        summed. The entity is skipped when `eid` is unknown or when its
        type is not in `allowed_types`. The matching counter of
        `update_stats` ('skipped', 'updated', 'exists no change' or
        'created') is incremented.
        """
        try:
            entity = session.entity_from_eid(eid)
        except UnknownEid:
            update_stats['skipped'] += 1
            return
        if entity.__regid__ not in allowed_types:
            update_stats['skipped'] += 1
            return
        total_hits = sum(item[0] for item in values)
        hit_type = u'rdf' if is_rdf else u'normal'
        # values are given as query arguments rather than interpolated into
        # the RQL string: eids come from urls found in the web server logs
        rset = session.execute(
            'Any X,V WHERE X is Hits, X count V, X hit_type %(hit_type)s,'
            'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s',
            {'e': eid, 'sp': stats_period.eid, 'hit_type': hit_type})
        if not rset:
            update_stats['created'] += 1
            session.create_entity('Hits', count=total_hits,
                                  period=stats_period,
                                  stats_about=entity,
                                  hit_type=hit_type)
        elif rset[0][1] != total_hits:
            update_stats['updated'] += 1
            session.execute('SET X count %(hits)s WHERE X eid %(e)s',
                            {'e': rset[0][0], 'hits': total_hits})
        else:
            update_stats['exists no change'] += 1

    def _init_cw_connection(self, appid):
        """Open an in-memory repository connection on instance `appid`.

        The admin credentials of the sources configuration are tried
        first; when they are missing or rejected, they are asked
        interactively until the authentication succeeds.

        Return a `(cnx, session)` tuple.
        """
        def ask_credentials():
            # imported lazily: only needed for the interactive fallback
            # (the name was used without ever being imported in this file)
            from cubicweb.server.utils import manager_userpasswd
            return manager_userpasswd()

        config = cwconfig.instance_configuration(appid)
        sourcescfg = config.sources()
        config.set_sources_mode(('system',))
        try:
            login = sourcescfg['admin']['login']
            pwd = sourcescfg['admin']['password']
        except KeyError:
            login, pwd = ask_credentials()
        while True:
            try:
                repo, cnx = in_memory_repo_cnx(config, login=login, password=pwd)
            except AuthenticationError:
                print('wrong user/password')
                # retrying with the very same credentials would loop forever
                login, pwd = ask_credentials()
            else:
                break
        session = repo._get_session(cnx.sessionid)
        # XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
        #     (file a ticket)
        return cnx, session

    def main_run(self, args, rcfile=None):
        """Run the command and return status 0 if everything went fine.

        If :exc:`CommandError` is raised by the underlying command, simply log
        the error and return status 2.

        Any other exceptions, including :exc:`BadCommandUsage` will be
        propagated.
        """
        # XXX (adim): rcfile handling is spectacularly messy but I can't
        #             get it right without refactoring pivotdoc for now
        if rcfile is None:
            # '-c' takes precedence over '--config'
            for flag in ('-c', '--config'):
                if flag in args:
                    value_index = args.index(flag) + 1
                    # a trailing flag without value is left to the option
                    # parser instead of failing here with an IndexError
                    if value_index < len(args):
                        rcfile = args[value_index]
                    break
        return Command.main_run(self, args, rcfile)

    def run(self, args):
        """Update the web stats of the instance given as first argument.

        The remaining arguments are the optional start and stop dates
        (see :meth:`update_stats`). The session is committed once the
        update is done.
        """
        appid = args.pop(0)
        # cw_cnx must stay referenced while the session is in use
        cw_cnx, session = self._init_cw_connection(appid)
        session.set_cnxset()
        self.update_stats(session, args)
        session.commit()

# Register the command on the cubicweb-ctl command line handler, under the
# name declared by the class ('update-webstats').
CWCTL.register(UpdateWebstatsCommand)