1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
import re
import server
import md5
import urllib
import collections
import os
import subprocess
# Namespace keyword that is always accepted in an image link
# ("[[Image:...]]"), next to the localized keyword given to ImageFinder.
BASEWORD = "Image"

# Root of the upload tree; the hashed directories and the file name are
# appended to it to build a download URL.
BASE_URL = "http://upload.wikimedia.org/wikipedia/commons"
def get_source_url(filename):
    """Return the full download URL for *filename* under BASE_URL."""
    relative_path = get_endpath(filename)
    return "%s/%s" % (BASE_URL, relative_path)
def get_dirs(filename):
    """Return the two hashed directory names under which *filename* is stored.

    Both names are prefixes of the hexadecimal MD5 digest of the file name:
    the first hex digit, and the first two hex digits.

    Text (unicode) names are encoded as UTF-8 before hashing, so non-ASCII
    file names no longer fail with an implicit ASCII encoding error.

    Returns:
        A ``(first_digit, first_two_digits)`` tuple of strings.
    """
    # hashlib supersedes the deprecated ``md5`` module and produces the same
    # digest for the same bytes.
    import hashlib

    if not isinstance(filename, bytes):
        filename = filename.encode("utf-8")
    digest = hashlib.md5(filename).hexdigest()
    return (digest[0], digest[:2])
def get_endpath(filename):
    """Return the path of *filename* relative to the upload root.

    The result has the form ``<dir1>/<dir2>/<filename>``, where the two
    directory names are the ones computed by get_dirs().
    """
    outer, inner = get_dirs(filename)
    return "%s/%s/%s" % (outer, inner, filename)
def canonicalize_filename(wikiname):
    """Convert an image name as written in wiki markup to its stored form.

    Spaces become underscores and the first character is upper-cased; the
    rest of the name is left untouched. An empty name is returned unchanged
    instead of raising IndexError.
    """
    wikiname = wikiname.replace(' ', '_')
    # Slicing (rather than indexing) keeps the empty string safe.
    return wikiname[:1].upper() + wikiname[1:]
class WorkaroundURLopener(urllib.FancyURLopener):
    """URL opener that identifies itself with a custom user-agent string."""

    # urllib openers report this value as their user agent.
    version = "OLPC_wikislicer/0.1"
# Install the opener module-wide: urllib.urlretrieve() and urllib.urlopen()
# use urllib._urlopener when it has been set.
urllib._urlopener = WorkaroundURLopener()
def download_image(filename, base_dir):
    """Download *filename* from the upload server into *base_dir*.

    The file is stored as ``<base_dir>/<dir1>/<dir2>/<filename>``, using the
    hashed directory names from get_dirs(); missing directories are created.

    Returns:
        The destination path on success, or ``False`` when the destination
        directory could not be created or the download failed.
    """
    source = get_source_url(filename)
    dirs = get_dirs(filename)
    destdir = "%s/%s/%s" % (base_dir, dirs[0], dirs[1])
    try:
        os.makedirs(destdir)
    except OSError:
        # An already-existing directory is the expected case; anything else
        # means there is nowhere to write the file.
        if not os.path.isdir(destdir):
            print("Failed to create directory " + destdir)
            return False
    dest = "%s/%s" % (destdir, filename)
    try:
        urllib.urlretrieve(source, dest)
    except Exception:
        # Best effort: report and let the caller move on to the next image.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate so a long batch can still be aborted.
        print("Failed to download " + source)
        return False
    return dest
def _largest_requested_size(props_list, thumb_width):
    """Return the largest ``(width, height)`` requested by any use of an image.

    Either element is ``None`` when no entry specifies it. An entry flagged
    as a thumbnail without an explicit width counts as *thumb_width* wide.
    """
    width = None
    height = None
    for props in props_list:
        if props.width is not None:
            requested = props.width
        elif props.thumbnail:
            requested = thumb_width
        else:
            requested = None
        if requested is not None and (width is None or requested > width):
            width = requested
        if props.height is not None and (height is None or props.height > height):
            height = props.height
    return width, height


def download_and_process(imgdict, base_dir, thumb_width):
    """Download every image named in *imgdict* and shrink it to its largest use.

    Args:
        imgdict: mapping of image name (as written in wiki markup) to a list
            of objects exposing ``width``, ``height`` and ``thumbnail``.
        base_dir: directory under which the images are stored.
        thumb_width: width assumed for thumbnails that give no explicit width.

    Images that could not be downloaded are skipped. Images for which no
    width is requested are left exactly as downloaded. All others are
    rewritten in place by the external ``convert`` command as a flattened,
    quality-20 JPEG resized with a ``WIDTH>`` or ``WIDTHxHEIGHT>`` geometry.
    """
    for wikiname in imgdict:
        filename = canonicalize_filename(wikiname)
        path = download_image(filename, base_dir)
        if not path:
            # download_image() already reported the failure. Carrying on with
            # path == False used to raise TypeError while building the error
            # message further down.
            continue
        if filename[-3:].upper() == 'SVG':
            print("Downloaded vector image " + path)
        else:
            print("Downloaded raster image " + path)

        width, height = _largest_requested_size(imgdict[wikiname], thumb_width)
        if width is None:
            continue
        if height is None:
            newsize = "%i>" % width
        else:
            newsize = "%ix%i>" % (width, height)

        command = ['convert', path, "-flatten", "-resize", newsize,
                   "-quality", "20", "JPEG:%s" % path]
        try:
            subprocess.check_call(command)
        except (subprocess.CalledProcessError, OSError):
            # OSError covers a missing or non-executable ``convert`` binary.
            print("Error: convert failed on " + wikiname + " " + path)
            # NOTE(review): the source's indentation was lost; deleting the
            # file only after a failed convert is the reading that keeps
            # successfully resized images - confirm against the original.
            try:
                os.remove(path)
            except OSError:
                print("Error: failed to remove " + path)
        else:
            print("Succesfully resized " + path)
class ImageProps:
    """Display options attached to a single image link."""

    # Class-level defaults; an instance only overrides the options it was given.
    thumbnail = False
    upright = False
    width = None
    height = None

    def __repr__(self):
        """Return ``thumbnail (width, height) upright`` for debugging output."""
        fields = (self.thumbnail, self.width, self.height, self.upright)
        return "%s (%s, %s) %s" % fields
class ImageFinder:
    """Find image links, and their display options, in wiki article text."""

    # Size option: "<width>px", "x<height>px" or "<width>x<height>px".
    _SIZE_RE = re.compile(r"^(\d*)(?:x(\d+))?px$")

    def __init__(self, image_word):
        """Store the localized image keyword and create the article database.

        Args:
            image_word: namespace keyword accepted in addition to BASEWORD;
                it is inserted into a regular expression as-is.
        """
        self.word = image_word
        self.db = server.WPWikiDB()

    def find_images(self, text):
        """Return a ``(filename, ImageProps)`` pair for each image link in *text*.

        A link looks like ``[[<keyword>:<filename>|<option>|...]]``; the
        option part may contain one level of nested ``[...]`` or ``[[...]]``.
        Recognized options are ``thumb``/``thumbnail``, ``upright`` (also
        ``upright=<factor>``) and pixel sizes. Anything else, such as a
        caption, is ignored. Each pair is also printed.
        """
        pattern = r"\[\[(?:%s|%s):\s*(?P<filename>[^\|\]]+?)\s*(?:\|(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]" % (BASEWORD, self.word)
        found = []
        for match in re.finditer(pattern, text):
            filename = match.group('filename')
            props = ImageProps()
            # The options group is None for a bare link such as
            # [[Image:Foo.jpg]].
            options = match.group('options') or ''
            for option in options.split('|'):
                option = option.strip()
                if option == 'thumb' or option == 'thumbnail':
                    props.thumbnail = True
                elif option == 'upright' or option.startswith('upright='):
                    props.upright = True
                else:
                    # Only well-formed sizes are parsed, so a caption that
                    # merely ends in "px" no longer raises ValueError.
                    size = self._SIZE_RE.match(option)
                    if size:
                        if size.group(1):
                            props.width = int(size.group(1))
                        if size.group(2):
                            props.height = int(size.group(2))
            print((filename, props))
            found.append((filename, props))
        return found

    def get_images_info(self, title):
        """Return find_images() applied to the expanded text of article *title*."""
        text = self.db.getExpandedArticle(title)
        return self.find_images(text)

    def list_images(self, title):
        """Return only the file names of the images used by article *title*."""
        return [name for name, _ in self.get_images_info(title)]

    def get_metadata_all(self, titles):
        """Return a mapping of file name to every ImageProps seen for it.

        The result is a ``defaultdict(list)`` accumulated over all *titles*;
        an image used several times has one entry per use.
        """
        by_name = collections.defaultdict(list)
        for title in titles:
            for fname, props in self.get_images_info(title):
                by_name[fname].append(props)
        return by_name
def read_links(index):
    """Return the article titles linked from the HTML file *index*.

    A title is whatever follows ``/wiki/`` inside a quoted ``href``
    attribute, up to the next quote character. Titles are returned in
    document order, duplicates included.
    """
    # The context manager closes the file even if reading fails.
    with open(index) as f:
        text = f.read()
    link_pattern = r'href\s*=\s*[\'"]/wiki/([^\'"]+)[\'"]'
    return [m.group(1) for m in re.finditer(link_pattern, text)]
def main_task(db_path, indexfile, image_word, base_dir, thumb_width):
    """Collect the images used by the indexed articles, then fetch them.

    Reads the article titles linked from *indexfile*, loads the database at
    *db_path*, gathers the image metadata of those articles with an
    ImageFinder for *image_word*, and passes it to download_and_process()
    together with *base_dir* and *thumb_width*. The titles and the metadata
    are printed along the way.
    """
    article_titles = read_links(indexfile)
    print(article_titles)

    server.load_db(db_path)
    finder = ImageFinder(image_word)
    metadata = finder.get_metadata_all(article_titles)
    print(metadata)

    download_and_process(metadata, base_dir, thumb_width)
# Runs the whole pipeline as soon as this module is executed, using fixed
# paths and the "Imagen" image keyword.
main_task(
    db_path="/home/olpc/40ormore.xml.bz2",
    indexfile="../static/index.html",
    image_word="Imagen",
    base_dir="/home/olpc/images",
    thumb_width=180,
)
|