Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
cubicweb
cubes
awstats
Commits
48baca6b979e
Commit
8821f180
authored
Jun 07, 2011
by
Arthur Lutz
Browse files
imported patch proposed_refactoring.diff
parent
a90975d04742
Changes
4
Hide whitespace changes
Inline
Side-by-side
ccplugin.py
View file @
48baca6b
...
...
@@ -6,8 +6,10 @@ Usage: cubicweb-ctl update-webstats [options] <instance-name> startdate [stopdat
This command will generate webstats objects for all linked document types.
"""
from
datetime
import
datetime
,
timedelta
from
logilab.common.date
import
first_day
,
last_day
,
date_range
import
os.path
as
osp
from
datetime
import
datetime
from
logilab.common.date
import
first_day
,
last_day
,
date_range
,
ONDEDAY
from
logilab.common.shellutils
import
ProgressBar
from
cubicweb
import
cwconfig
,
UnknownEid
...
...
@@ -20,6 +22,7 @@ from cubicweb.cwctl import CWCTL
from
utils
import
SECTIONSPEC
,
extract_stats_dict
,
eid_from_url
,
\
get_or_create_statperiod
,
compress_old_hits
def
url_count_from_stats
(
stats_dict
):
'''
parse most visited urls in stats_dict generated from awstats txt file
...
...
@@ -27,7 +30,7 @@ def url_count_from_stats(stats_dict):
returns two dictionaries with eid as key and sequence of values as value
one for normal navigation, the other for rdf navigation
'''
if
'SIDER'
not
in
stats_dict
.
keys
()
:
if
'SIDER'
not
in
stats_dict
:
return
{},
{}
visit_count_dict
=
{}
visit_count_rdf_dict
=
{}
...
...
@@ -46,6 +49,127 @@ def url_count_from_stats(stats_dict):
return
visit_count_dict
,
visit_count_rdf_dict
def parse_input_date(date, periodicity):
    """Parse a command-line date string according to the stats periodicity.

    :param date: date string; the expected layout depends on `periodicity`:
      'month' -> ``MM/YYYY``, 'day' -> ``DD/MM/YYYY``,
      'hour' -> ``DD/MM/YYYY-HHh``
    :param periodicity: one of 'month', 'day' or 'hour'
    :return: the parsed `datetime`, or None (after printing an error
      message) when `date` does not match the expected layout
    :raise KeyError: if `periodicity` is not one of the known values
    """
    input_formats = {
        'month': '%m/%Y',
        'day': '%d/%m/%Y',
        'hour': '%d/%m/%Y-%Hh',
    }
    # looked up outside the try block: only a malformed date is reported
    # as a user error, an unknown periodicity is a programming error
    date_format = input_formats[periodicity]
    try:
        return datetime.strptime(date, date_format)
    except ValueError:
        # parenthesised single-argument form gives the same output whether
        # print is a statement or a function
        print('Error : %s not a proper date' % date)
        return None
def track_progress(iterable, nb_ops=None, pb_size=20, pb_title=''):
    """Yield every item of `iterable` while driving a `ProgressBar`.

    :param iterable: the items to iterate over
    :param nb_ops: number of operations given to the progress bar; it must
      be set if `iterable` doesn't support the length protocol, since
      `len(iterable)` is used when it is missing (or falsy)
    :param pb_size: forwarded to `ProgressBar` as `size`
    :param pb_title: forwarded to `ProgressBar` as `title`

    The bar is updated once after each item has been handed to (and
    processed by) the consumer, and finished once `iterable` is exhausted.
    """
    total = nb_ops or len(iterable)
    progress = ProgressBar(total, size=pb_size, title=pb_title)
    for element in iterable:
        yield element
        progress.update()
    progress.finish()
class StatsUpdater(object):
    """Import awstats hit counts into `Hits` entities through a session.

    On construction every existing `Hits` entity is loaded into the
    `all_hits` cache, keyed by
    ``(stats_about entity eid, period eid, hit_type)``, so that updating
    one entity's hits does not need a query per entity.
    """

    def __init__(self, session):
        """
        :param session: session used for every query, entity creation and
          the final commit; its `vreg.config` provides the awstats options
        """
        self.session = session
        self.config = session.vreg.config
        # entity types that may be the object of the `stats_about` relation
        stats_about_rschema = session.vreg.schema.rschema('stats_about')
        self.allowed_etypes = frozenset(
            eschema.type for eschema in stats_about_rschema.objects())
        # cache of existing hits: (entity eid, period eid, hit_type) -> Hits
        self.all_hits = {}
        hits_rset = session.execute(
            'Any H,HC,HT,E,P,PSA,PSO WHERE '
            'H is Hits, H count HC, H hit_type HT, '
            'H stats_about E, H period P, P start PSA, P stop PSO')
        for hit in hits_rset.entities():
            hit_key = (hit.stats_about[0].eid, hit.period[0].eid,
                       hit.hit_type)
            self.all_hits[hit_key] = hit

    ## internal utilities #####################################################

    def awstats_filepath(self, date):
        """Return the path of the awstats text file holding stats for `date`.

        The file name is ``awstats<date><.domain>.txt`` where the date
        layout depends on the 'awstats-periodicity' option and the domain
        suffix is only present when 'awstats-domain' is set. The file is
        looked for in the 'awstats-dir' directory.
        """
        config = self.config
        date_formats = {
            'month': '%m%Y',
            'day': '%m%Y%d',
            'hour': '%m%Y%d%H',
        }
        domain = config['awstats-domain']
        if domain:
            domain_ext = '.' + domain
        else:
            domain_ext = ''
        date_part = date.strftime(date_formats[config['awstats-periodicity']])
        filename = 'awstats%s%s.txt' % (date_part, domain_ext)
        return osp.join(config['awstats-dir'], filename)

    def stats_period_for_date(self, chosendate):
        """Return the stat period containing `chosendate`, creating it if
        it doesn't exist yet.

        The period bounds depend on the 'awstats-periodicity' option:
        the whole month, the whole day (00:00:00 to 23:59:59) or the whole
        hour (HH:00:00 to HH:59:59).

        :raise ValueError: if the configured periodicity is not one of
          'month', 'day' or 'hour'
        """
        periodicity = self.config['awstats-periodicity']
        if periodicity == 'month':
            start = first_day(chosendate)
            stop = last_day(start)
        elif periodicity == 'day':
            start = datetime(chosendate.year, chosendate.month,
                             chosendate.day)
            stop = datetime(chosendate.year, chosendate.month,
                            chosendate.day, 23, 59, 59)
        elif periodicity == 'hour':
            start = datetime(chosendate.year, chosendate.month,
                             chosendate.day, chosendate.hour)
            stop = datetime(chosendate.year, chosendate.month,
                            chosendate.day, chosendate.hour, 59, 59)
        else:
            # fail with an explicit message rather than with an unbound
            # local variable error on `start` below
            raise ValueError('unknown awstats periodicity %r' % periodicity)
        return get_or_create_statperiod(self.session, start, stop)

    ## update API #############################################################

    def update_stats(self, start, stop, skip_compress=False):
        """Parse awstats files and create or update the corresponding
        data in the cubicweb instance.

        :param start: period start (included)
        :param stop: period stop (excluded)
        :param skip_compress: when true, `compress_old_hits` is not called
        :return: the report dictionary, counting operations under the keys
          'updated', 'created', 'exists no change', 'skipped', 'periods'
          and 'compressed'
        """
        # the session is an attribute of the updater, there is no `session`
        # name in this scope
        self.session.set_cnxset()
        stats_report = dict.fromkeys(
            ('updated', 'created', 'exists no change',
             'skipped', 'periods', 'compressed'), 0)
        # the number of days is given explicitly as the number of operations
        # (track_progress falls back on len() otherwise)
        dates = track_progress(date_range(start, stop), (stop - start).days,
                               pb_size=70, pb_title='Import')
        for chosendate in dates:
            self._update_stats_for_date(chosendate, stats_report)
        if not skip_compress:
            compress_old_hits(self.session, stats_report)
        self.session.commit()
        return stats_report

    def _update_stats_for_date(self, chosendate, stats_report):
        """Import the awstats file matching `chosendate`.

        Each entity found in the file (normal and rdf navigation being
        counted separately) increments the entry of `stats_report` named
        by the status `_update_hits_for_eid` returned for it.
        """
        stats_dict = extract_stats_dict(self.awstats_filepath(chosendate))
        stats_period = self.stats_period_for_date(chosendate)
        normal_dict, rdf_dict = url_count_from_stats(stats_dict)
        for count_dict, hit_type in ((normal_dict, u'normal'),
                                     (rdf_dict, u'rdf')):
            for eid, values in count_dict.items():
                status = self._update_hits_for_eid(eid, values, stats_period,
                                                   hit_type)
                stats_report[status] += 1

    def _update_hits_for_eid(self, eid, values, stats_period, hit_type):
        """Create or update the `Hits` entity of one entity for one period.

        :param eid: eid of the entity the hits are about
        :param values: sequence of items whose first element is a hit count;
          those counts are summed
        :param stats_period: period entity the hits belong to
        :param hit_type: u'normal' or u'rdf'
        :return: one of 'skipped' (unknown eid or entity type not allowed
          as `stats_about` object), 'created', 'updated' or
          'exists no change'
        """
        total_hits = sum(item[0] for item in values)
        try:
            entity = self.session.entity_from_eid(eid)
        except UnknownEid:
            return 'skipped'
        if entity.__regid__ not in self.allowed_etypes:
            return 'skipped'
        hit_key = (eid, stats_period.eid, hit_type)
        try:
            hit = self.all_hits[hit_key]
        except KeyError:
            # no hit yet, create one
            status = 'created'
            hit = self.session.create_entity('Hits', count=total_hits,
                                             hit_type=hit_type,
                                             period=stats_period,
                                             stats_about=entity)
            # append it to the cache
            self.all_hits[hit_key] = hit
        else:
            if hit.count != total_hits:
                status = 'updated'
                hit.set_attributes(count=total_hits)
            else:
                status = 'exists no change'
        return status
class
UpdateWebstatsCommand
(
Command
):
""" Update cubicweb web stats from awstats processed files.
...
...
@@ -68,132 +192,10 @@ class UpdateWebstatsCommand(Command):
max_args
=
3
options
=
[
(
"skip-compress"
,
{
"action"
:
'store_true'
,
'help'
:
u
'Skip the compression of old daily hits into month stats'
}),
'help'
:
u
'Skip the compression of old daily hits into month stats'
}),
]
def
get_current_stats_period
(
self
,
session
,
chosendate
):
""" return a statperiod for the current month, if it doesn't exist, create it """
start
,
stop
=
self
.
choose_period
(
session
,
chosendate
)
return
get_or_create_statperiod
(
session
,
start
,
stop
)
def
choose_period
(
self
,
session
,
chosendate
):
periodicity
=
session
.
vreg
.
config
.
get
(
'awstats-periodicity'
,
'day'
)
#FIXME s/day/month/
if
periodicity
==
'month'
:
start
=
first_day
(
chosendate
)
end
=
last_day
(
start
)
elif
periodicity
==
'day'
:
start
=
datetime
(
chosendate
.
year
,
chosendate
.
month
,
chosendate
.
day
)
end
=
datetime
(
chosendate
.
year
,
chosendate
.
month
,
chosendate
.
day
,
23
,
59
,
59
)
elif
periodicity
==
'hour'
:
start
=
datetime
(
chosendate
.
year
,
chosendate
.
month
,
chosendate
.
day
,
chosendate
.
hour
)
end
=
datetime
(
chosendate
.
year
,
chosendate
.
month
,
chosendate
.
day
,
chosendate
.
hour
,
59
,
59
)
return
start
,
end
def
choose_dateformat
(
self
,
periodicity
):
return
{
'hour'
:
'%m%Y%d%H'
,
'day'
:
'%m%Y%d'
,
'month'
:
'%m%Y'
}[
periodicity
]
def
update_stats
(
self
,
session
,
args
):
''' parses awstats and creates or updates the corresponding
data in the cubicweb instance'''
periodicity
=
session
.
vreg
.
config
.
get
(
'awstats-periodicity'
,
'day'
)
#FIXME s/day/month/
assert
periodicity
in
(
'hour'
,
'day'
,
'month'
)
start
=
stop
=
None
if
args
:
# FIXME - adapt according to periodicity
input_format
=
{
'month'
:
'%m/%Y'
,
'day'
:
'%d/%m/%Y'
,
'hour'
:
'%d/%m/%Y-%Hh'
}[
periodicity
]
try
:
start
=
datetime
.
strptime
(
args
[
0
],
input_format
)
except
ValueError
:
print
'Error : %s not a proper date'
%
args
[
0
]
return
if
len
(
args
)
>
1
:
try
:
stop
=
datetime
.
strptime
(
args
[
1
],
input_format
)
except
ValueError
:
print
'Error : %s not a proper date'
%
args
[
1
]
return
else
:
start
=
stop
=
datetime
.
now
()
if
stop
is
None
:
stop
=
start
update_stats
=
{
'updated'
:
0
,
'created'
:
0
,
'exists no change'
:
0
,
'skipped'
:
0
,
'periods'
:
0
,
'compressed'
:
0
}
pb
=
ProgressBar
(((
stop
+
timedelta
(
days
=
1
))
-
start
).
days
,
70
,
title
=
'Import'
)
for
chosendate
in
date_range
(
start
,
stop
+
timedelta
(
days
=
1
)):
self
.
update_stats_for_date
(
session
,
chosendate
,
update_stats
)
pb
.
update
()
pb
.
finish
()
if
not
self
.
config
.
skip_compress
:
compress_old_hits
(
session
,
update_stats
)
print
'''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
'''
%
update_stats
def
update_stats_for_date
(
self
,
session
,
chosendate
,
update_stats
):
stats_period
=
self
.
get_current_stats_period
(
session
,
chosendate
)
periodicity
=
session
.
vreg
.
config
.
get
(
'awstats-periodicity'
,
'day'
)
#FIXME s/day/month/
dateformat_in_file
=
self
.
choose_dateformat
(
periodicity
)
domain
=
session
.
vreg
.
config
.
get
(
'awstats-domain'
,
''
)
filename
=
'awstats%s%s.txt'
%
(
chosendate
.
strftime
(
dateformat_in_file
),
domain
and
'.%s'
%
domain
)
awstatsdir
=
session
.
vreg
.
config
.
get
(
'awstats-dir'
,
'/var/lib/awstats'
)
stats_dict
=
extract_stats_dict
(
awstatsdir
,
filename
)
normal_dict
,
rdf_dict
=
url_count_from_stats
(
stats_dict
)
is_rdf
=
False
rql
=
'Any N WHERE X relation_type R, R name "stats_about", X to_entity Y, Y name N'
rset
=
session
.
execute
(
rql
)
allowed_types
=
[
item
[
0
]
for
item
in
rset
]
for
count_dict
,
is_rdf
in
((
normal_dict
,
False
),
(
rdf_dict
,
True
)):
for
eid
,
values
in
count_dict
.
items
():
self
.
update_hits_for_eid
(
eid
,
values
,
session
,
update_stats
,
allowed_types
,
stats_period
,
is_rdf
)
def
update_hits_for_eid
(
self
,
eid
,
values
,
session
,
update_stats
,
allowed_types
,
stats_period
,
is_rdf
):
visit_count
=
visit_count_rdf
=
0
total_hits
=
sum
([
item
[
0
]
for
item
in
values
])
try
:
entity
=
session
.
entity_from_eid
(
eid
)
except
UnknownEid
:
update_stats
[
'skipped'
]
+=
1
return
if
not
entity
.
__regid__
in
allowed_types
:
update_stats
[
'skipped'
]
+=
1
return
rql
=
'Any X,V WHERE X is Hits, X count V, X hit_type "%(hit_type)s",'
\
'X stats_about E, E eid %(e)s, X period P, P eid %(sp)s'
rset
=
session
.
execute
(
rql
%
{
'e'
:
eid
,
'sp'
:
stats_period
.
eid
,
'hit_type'
:
is_rdf
and
'rdf'
or
'normal'
})
if
rset
:
if
rset
[
0
][
1
]
!=
total_hits
:
update_stats
[
'updated'
]
+=
1
session
.
execute
(
'SET X count %(hits)s WHERE X eid %(e)s'
%
{
'e'
:
rset
[
0
][
0
],
'hits'
:
total_hits
})
else
:
update_stats
[
'exists no change'
]
+=
1
else
:
update_stats
[
'created'
]
+=
1
session
.
create_entity
(
'Hits'
,
count
=
total_hits
,
period
=
stats_period
,
stats_about
=
entity
,
hit_type
=
is_rdf
and
u
'rdf'
or
u
'normal'
)
## command / initial setup API ############################################
def
_init_cw_connection
(
self
,
appid
):
config
=
cwconfig
.
instance_configuration
(
appid
)
sourcescfg
=
config
.
sources
()
...
...
@@ -213,35 +215,33 @@ Number of stat objects compressed : %(compressed)s
break
session
=
repo
.
_get_session
(
cnx
.
sessionid
)
# XXX keep reference on cnx otherwise cnx.__del__ will cause trouble
# (file a ticket)
return
cnx
,
session
def
main_run
(
self
,
args
,
rcfile
=
None
):
"""Run the command and return status 0 if everything went fine.
If :exc:`CommandError` is raised by the underlying command, simply log
the error and return status 2.
Any other exceptions, including :exc:`BadCommandUsage` will be
propagated.
"""
# XXX (adim): rcfile handling is spectacularly messy but I can't
# get it right without refactoring pivotdoc for now
if
rcfile
is
None
:
if
'-c'
in
args
:
rcfile
=
args
[
args
.
index
(
'-c'
)
+
1
]
elif
'--config'
in
args
:
rcfile
=
args
[
args
.
index
(
'--config'
)
+
1
]
else
:
rcfile
=
None
#self.config.config
return
Command
.
main_run
(
self
,
args
,
rcfile
)
def
run
(
self
,
args
):
# args = (appid, start[, stop])
appid
=
args
.
pop
(
0
)
cw_cnx
,
session
=
self
.
_init_cw_connection
(
appid
)
session
.
set_cnxset
()
self
.
update_stats
(
session
,
args
)
session
.
commit
()
periodicity
=
session
.
vreg
.
config
[
'awstats-periodicity'
]
if
start
is
None
:
start
=
datetime
.
now
()
else
:
start
=
parse_input_date
(
start
,
periodicity
)
if
stop
is
None
:
stop
=
start
else
:
stop
=
parse_input_date
(
stop
,
periodicity
)
if
start
is
None
or
stop
is
None
:
sys
.
exit
(
1
)
# parse_input_date failed to parse date
stop
+=
ONEDAY
# date_range() excludes stop boundary
stats_updater
=
StatsUpdater
(
session
)
stats_report
=
stats_updater
.
update_stats
(
start
,
stop
)
print
'''=== Update Report ===
Number of periods imported : %(periods)s
Number of stat objects created : %(created)s
Number of stat objects updated : %(updated)s
Number of stat objects already existed : %(exists no change)s
Number of stat objects skipped : %(skipped)s
Number of stat objects compressed : %(compressed)s
'''
%
stats_report
CWCTL
.
register
(
UpdateWebstatsCommand
)
site_cubicweb.py
View file @
48baca6b
...
...
@@ -11,4 +11,11 @@ options = (
'help'
:
'domain of the website (eg. example.org). '
,
'group'
:
'awstats'
,
'level'
:
0
,
}),
(
'awstats-periodicity'
,
{
'type'
:
'choice'
,
'choices'
:
(
'hour'
,
'day'
,
'month'
),
'default'
:
'day'
,
'help'
:
'stats periodicity'
,
'group'
:
'awstats'
,
'level'
:
0
,
}),
)
utils.py
View file @
48baca6b
...
...
@@ -17,9 +17,12 @@
import
re
import
os.path
as
osp
from
datetime
import
datetime
from
logilab.common.date
import
previous_month
,
first_day
from
logilab.common.shellutils
import
ProgressBar
from
cubicweb.req
import
FindEntityError
SECTIONSPEC
=
{
# commented sections are not useful to view
# 'MAP' : ['section', 'offset'],
...
...
@@ -91,7 +94,7 @@ ORIGIN_LABELS = {
}
def
extract_stats_dict
(
awstats_dir
,
filename
):
def
extract_stats_dict
(
filepath
):
''' from an awstats file extract structured data into a dict
returns a dictionary like this :
...
...
@@ -108,12 +111,12 @@ def extract_stats_dict(awstats_dir, filename):
}
}
'''
if
not
osp
.
isfile
(
osp
.
join
(
awstats_dir
,
filename
)
):
if
not
osp
.
isfile
(
filepath
):
return
{}
section_name
=
None
parsed_countdown
=
0
stats_dict
=
{}
for
line
in
file
(
osp
.
join
(
awstats_dir
,
filename
)).
readlines
(
):
for
line
in
file
(
filepath
):
if
line
.
startswith
(
'BEGIN_'
):
section_name
,
nb_of_lines
=
line
.
split
(
'_'
,
1
)[
1
].
split
()
if
section_name
in
SECTIONSPEC
:
...
...
@@ -147,13 +150,9 @@ def eid_from_url(value):
pass
def
get_or_create_statperiod
(
session
,
start
,
stop
):
rql
=
'Any P WHERE P is StatPeriod, P start "%(start_date)s", P stop "%(end_date)s"'
rset
=
session
.
execute
(
rql
%
{
'start_date'
:
start
,
'end_date'
:
stop
})
if
rset
:
return
rset
.
get_entity
(
0
,
0
)
else
:
try
:
return
session
.
find_one_entity
(
'StatPeriod'
,
start
=
start
,
stop
=
stop
)
except
FindEntityError
:
return
session
.
create_entity
(
'StatPeriod'
,
start
=
start
,
stop
=
stop
)
def
compress_old_hits
(
req
,
update_stats
=
{}):
...
...
views/startup.py
View file @
48baca6b
...
...
@@ -18,6 +18,7 @@
import
os
import
os.path
as
osp
import
re
from
datetime
import
datetime
,
timedelta
import
urllib
...
...
@@ -128,11 +129,11 @@ class AwstatsView(StartupView):
filename
=
'awstats%s%s.txt'
%
(
month
,
domain
and
'.%s'
%
domain
)
awstats_dir
=
self
.
_cw
.
vreg
.
config
[
'awstats-dir'
]
try
:
stats_dict
=
extract_stats_dict
(
awstats_dir
,
filename
)
stats_dict
=
extract_stats_dict
(
osp
.
join
(
awstats_dir
,
filename
)
)
except
IOError
:
filename
=
'awstats%s%s.txt'
%
(
extract_available_months
(
form
)[
0
],
domain
and
'.%s'
%
domain
)
stats_dict
=
extract_stats_dict
(
awstats_dir
,
filename
)
stats_dict
=
extract_stats_dict
(
osp
.
join
(
awstats_dir
,
filename
)
)
self
.
w
(
u
'<div id="awstats">'
)
self
.
w
(
u
'<h1>%s : %s</h1>'
%
(
_
(
'Domain'
),
domain
or
'default'
))
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment