utils.py 5.83 KB
Newer Older
Arthur Lutz's avatar
Arthur Lutz committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# copyright 2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
import os.path as osp

# Column names of each awstats data-file section that is parsed, listed in the
# order the values appear on a record line (the first column identifies the
# record).
SECTIONSPEC = {
    # sections left commented out are not useful to view
    # 'MAP': ['section', 'offset'],
    # 'GENERAL': ['key', None],
    'TIME': [
        'hour', 'pages', 'hits', 'bandwidth',
        'not viewed pages', 'not viewed hits', 'not viewed bandwidth',
    ],
    'VISITOR': [
        'host', 'pages', 'hits', 'bandwidth',
        'last visit date', 'start date of last visit',
        'last page of last visit',
    ],
    'DAY': ['date', 'pages', 'hits', 'bandwidth', 'visits'],
    'DOMAIN': ['domain', 'pages', 'hits', 'bandwidth'],
    'LOGIN': ['cluster id', 'pages', 'hits', 'bandwidth', 'last visit date'],
    'ROBOT': [
        'most visiting robots', 'hits', 'bandwidth',
        'last visit', 'hits on robots.txt',
    ],
    'WORMS': ['worm id', 'hits', 'bandwidth', 'last visit'],
    'EMAILSENDER': ['email', 'hits', 'bandwidth', 'last visit'],
    'EMAILRECEIVER': ['email', 'hits', 'bandwidth', 'last visit'],
    'SESSION': ['session range', 'hits'],
    'SIDER': ['most visited URLs', 'hits', 'bandwidth', 'entry', 'exit'],
    'FILETYPES': [
        'served files type', 'hits', 'bandwidth',
        'bandwidth without compression', 'bandwidth after compression',
    ],
    'OS': ['operating systems', 'hits'],
    'BROWSER': ['browser id', 'hits'],
    'SCREENSIZE': ['screen size', 'hits'],
    'UNKNOWNREFERER': ['unknown referer os', 'last visit date'],
    'UNKNOWNREFERERBROWSER': ['unknown referer browser', 'last visit date'],
    'ORIGIN': ['origin', 'pages', 'hits'],
    'SEREFERRALS': ['search engine referers id', 'pages', 'hits'],
    'PAGEREFS': ['external page referers', 'pages', 'hits'],
    'SEARCHWORDS': ['main search keyphrases', 'hits'],
    'KEYWORDS': ['main search keyword', 'hits'],
    # 'MISC': ['misc id', 'pages', 'hits', 'bandwidth'],
    'ERRORS': ['errors', 'hits', 'bandwidth'],
    'CLUSTER': ['cluster id', 'pages', 'hits', 'bandwidth'],
    'SIDER_404': ['urls with 404 errors', 'hits', 'last url referer'],
}

Arthur Lutz's avatar
Arthur Lutz committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

# Display label for each awstats section name.
SECTIONLABELS = {
    'TIME': 'Visits by hour',
    'VISITOR': 'Top visitors (by host)',
    'DAY': 'Visits by days of the month',
    'DOMAIN': 'Visitors domains/countries',
    'LOGIN': 'logged in users',
    'ROBOT': 'Robots/Spiders visitors',
    'WORMS': 'Worm visits',
    'EMAILSENDER': 'email sender',
    'EMAILRECEIVER': 'email receiver',
    'SESSION': 'Visits duration',
    'SIDER': 'Most visited URLs',
    'FILETYPES': 'Visited file types',
    'OS': 'Visiting operating systems',
    'BROWSER': 'Visiting browsers',
    'SCREENSIZE': 'Hits by Screen size',
    'UNKNOWNREFERER': 'Unknown referer os',
    'UNKNOWNREFERERBROWSER': 'Unknown referer browser',
    'ORIGIN': 'Origin of hits',
    'SEREFERRALS': 'Search engine referers hits',
    'PAGEREFS': 'Main external page referers',
    'SEARCHWORDS': 'Hits from search keyphrases',
    'KEYWORDS': 'Hits from search keywords',
    # no label for the 'MISC' section
    'ERRORS': 'HTTP Status codes',
    'CLUSTER': 'Visits by cluster id',
    'SIDER_404': 'Hits with 404 errors',
}

# Display label for each 'FromN' origin code.
# NOTE(review): presumably these are the record keys of the ORIGIN section;
# confirm against the callers.
ORIGIN_LABELS = {
    'From0': 'Direct address / Bookmark / Link in email...',
    'From1': 'Unknown Origin',
    'From2': 'Links from an Internet Search Engine',
    'From3': 'Links from an external page '
             '(other web sites except search engines)',
    'From4': 'Internal Link',
}


91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
def extract_stats_dict(awstats_dir, filename):
    '''Extract structured data from an awstats data file into a dict.

    Only the sections listed in SECTIONSPEC are extracted. Each record line
    of such a section is split on whitespace; its first field is the record
    key and every field is stored under the matching column name of
    SECTIONSPEC[section].

    :param awstats_dir: directory holding the awstats data file
    :param filename: name of the data file inside `awstats_dir`
    :return: a dictionary like this, or an empty dict when the file does not
             exist::

        {'SIDER':  {
            '/someurl': {
                'most visited URLs': '/someurl',
                'hits' : '1234',
                'bandwidth' : '4321',
                'entry' : '12',
                'exit' : '8'
                }
            ...
            }
        }

    :raise ValueError: if a ``BEGIN_`` line is not of the form
                       ``BEGIN_<section> <count>``
    '''
    filepath = osp.join(awstats_dir, filename)
    if not osp.isfile(filepath):
        return {}
    section_name = None
    parsed_countdown = 0
    stats_dict = {}
    # the context manager guarantees the file is closed, even on error
    with open(filepath) as stream:
        for line in stream:
            if line.startswith('BEGIN_'):
                section_name, nb_of_lines = line.split('_', 1)[1].split()
                if section_name in SECTIONSPEC:
                    stats_dict.setdefault(section_name, {})
                    nb_of_lines = int(nb_of_lines)
                    # NOTE(review): only `count - 1` lines are read, so the
                    # last line of each section is skipped if the count is
                    # the number of records - confirm this is intended
                    parsed_countdown = nb_of_lines - 1 if nb_of_lines else 0
                else:
                    # never reuse the countdown left over by a truncated
                    # section for a section that is not extracted
                    parsed_countdown = 0
            elif section_name and parsed_countdown:
                fields = line.split()
                if fields:
                    columns = SECTIONSPEC[section_name]
                    record = stats_dict[section_name].setdefault(fields[0], {})
                    for index, value in enumerate(fields):
                        try:
                            record[columns[index]] = value
                        except IndexError:
                            # more fields than declared columns: report the
                            # extra value and carry on
                            print('%s %s %s' % (index, value, line))
                parsed_countdown -= 1
            else:
                # the expected lines of the section have been consumed
                section_name = None
    return stats_dict

def eid_from_url(value):
    '''Return the eid found in an url as an integer, or None.

    The url is searched with each known pattern in turn; the first one whose
    first captured group converts to an integer gives the eid.

    :param value: the url (or url path) to inspect
    :return: the eid as an int, or None when no pattern yields a number
    '''
    # FIXME - should use url_resolver for a more serious guess
    # FIXME - BNF specific right now
    # NOTE(review): the '.' before the extensions is not escaped, so it
    # matches any character - confirm whether a literal dot was intended
    for pattern in (r'/(\d+)/(.*?)/',
                    r'/(.*?)/(.*?)/(fr|en|es).html',
                    r'/(.*?)/(.*?)/rdf.(xml|n3|nt)'):
        match = re.search(pattern, value)
        if match and match.group(1):
            try:
                return int(match.group(1))
            except ValueError:
                # the captured segment is not a number, try the next pattern
                continue
    return None