diff options
author | Gonzalo Odiard <godiard@gmail.com> | 2013-04-12 02:13:08 (GMT) |
---|---|---|
committer | Gonzalo Odiard <godiard@gmail.com> | 2013-04-12 02:13:08 (GMT) |
commit | 566ee052c69ca02427f2874b5bc19e8577dd6b0f (patch) | |
tree | 7dcc76c28e865d4e19f480ed61c836a94ce22a09 | |
parent | 8f2a9b96ee10e1c84ddc2587c1d3e4395c642bb4 (diff) |
activities_aslo_stats now saves a file with JSON data
This way we can process the data many times without needing to spider
the site every time.
Signed-off-by: Gonzalo Odiard <gonzalo@laptop.org>
-rw-r--r-- | statistics/activities_aslo_stats.py | 123 |
1 files changed, 78 insertions, 45 deletions
diff --git a/statistics/activities_aslo_stats.py b/statistics/activities_aslo_stats.py index 60fa9c5..fe2556c 100644 --- a/statistics/activities_aslo_stats.py +++ b/statistics/activities_aslo_stats.py @@ -1,58 +1,91 @@ from urllib2 import urlopen from BeautifulSoup import BeautifulSoup as bs - -html_act_index = urlopen('http://download.sugarlabs.org/activities/') -sopa_index = bs(html_act_index) -pre = sopa_index.find('pre') - -for link in pre.findAll('a'): - activity_code = link.text.replace('/', '') - - if activity_code != link.text: - - date = link.nextSibling.strip() - date = date[:date.find(' ')] - # search the activity uploader - act_url = 'http://activities.sugarlabs.org/es-ES/sugar/addons/versions/%s' % activity_code - - uploader = '' - activity_name = '' - compat_from = '' - compat_to = '' - try: - sopa_act = bs(urlopen(act_url)) +import json +import os + +data = [] + +if not os.path.exists('activities_aslo_stats.json'): + + html_act_index = urlopen('http://download.sugarlabs.org/activities/') + sopa_index = bs(html_act_index) + pre = sopa_index.find('pre') + + + for link in pre.findAll('a'): + activity_code = link.text.replace('/', '') + + if activity_code != link.text: + + date = link.nextSibling.strip() + date = date[:date.find(' ')] + # search the activity uploader + act_url = 'http://activities.sugarlabs.org/es-ES/sugar/addons/versions/%s' % activity_code + + uploader = '' + activity_name = '' + compat_from = '' + compat_to = '' try: - uploader_div = sopa_act.find('div', {'class': 'uploader'}) - uploader = uploader_div.find('a').text - except: - pass - if uploader in ('nickname', ''): - # try reading from http://activities.sugarlabs.org/es-ES/sugar/addon/%s + sopa_act = bs(urlopen(act_url)) try: - act_main_url = 'http://activities.sugarlabs.org/es-ES/sugar/addon/%s' % activity_code - sopa_act_main = bs(urlopen(act_main_url)) - uploader = sopa_act_main.find('a', {'class': 'profileLink'}).text + uploader_div = sopa_act.find('div', {'class': 'uploader'}) 
+ uploader = uploader_div.find('a').text except: pass + if uploader in ('nickname', ''): + # try reading from http://activities.sugarlabs.org/es-ES/sugar/addon/%s + try: + act_main_url = 'http://activities.sugarlabs.org/es-ES/sugar/addon/%s' % activity_code + sopa_act_main = bs(urlopen(act_main_url)) + uploader = sopa_act_main.find('a', {'class': 'profileLink'}).text + except: + pass - try: - main_div = sopa_act.find('div', role='main') - activity_name = main_div.find('a', href='/es-ES/sugar/addon/%s' % activity_code).text - except: - pass - try: - compat_div = sopa_act.find('div', {'class': 'app_compat'}) - compat = compat_div.find('li').text.replace('–', '-') - compat = compat.replace('Sugar:', '') - compat_from = compat[:compat.find('-')].strip().replace('.',',') - compat_to = compat[compat.find('-') + 1:].strip().replace('.',',') + try: + main_div = sopa_act.find('div', role='main') + activity_name = main_div.find('a', href='/es-ES/sugar/addon/%s' % activity_code).text + except: + pass + try: + compat_div = sopa_act.find('div', {'class': 'app_compat'}) + compat = compat_div.find('li').text.replace('–', '-') + compat = compat.replace('Sugar:', '') + compat_from = compat[:compat.find('-')].strip().replace('.',',') + compat_to = compat[compat.find('-') + 1:].strip().replace('.',',') + except: + pass except: pass - except: - pass - if activity_name != '' and uploader != '': - print "%s;%s;%s;%s;%s;%s" % (activity_code, date, activity_name, uploader, compat_from, compat_to) + if activity_name != '' and uploader != '': + print "%s;%s;%s;%s;%s;%s" % (activity_code, date, activity_name, uploader, compat_from, compat_to) + + activity_data = {'code': activity_code, 'date': date, + 'name': activity_name, 'uploader': uploader, + 'compat_from': compat_from, 'compat_to': compat_to} + data.append(activity_data) + + f = open('activities_aslo_stats.json', 'w') + f.write(json.dumps(data)) + f.close() + +else: + + f = open('activities_aslo_stats.json') + data = 
json.loads(f.read()) + f.close() + + # count activities by uploader + uploaders = {} + for activity_data in data: + uploader = activity_data['uploader'] + if uploader in uploaders.keys(): + uploaders[uploader] = uploaders[uploader] + 1 + else: + uploaders[uploader] = 1 + for uploader in uploaders: + print '%s;%s' % (uploader, uploaders[uploader]) |