Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorWade Brainerd <wadetb@gmail.com>2008-05-23 23:01:10 (GMT)
committer Wade Brainerd <wadetb@gmail.com>2008-05-23 23:01:10 (GMT)
commitc52d16375b2f2a8ce33a5e058085fcaf36d584dd (patch)
tree766b02240ba9527641baa87ab8737710920a7cae /tools
parent9878512ab181ef56e82d91ed3e69ddbaa50520d0 (diff)
Reorganization step 3.
Diffstat (limited to 'tools')
-rw-r--r--tools/get_images.py252
1 files changed, 252 insertions, 0 deletions
diff --git a/tools/get_images.py b/tools/get_images.py
new file mode 100644
index 0000000..91da39e
--- /dev/null
+++ b/tools/get_images.py
@@ -0,0 +1,252 @@
+from __future__ import with_statement
+import re
+#import server
+import md5
+import urllib
+import collections
+import os
+import subprocess
+
# Image-link prefix that the link patterns in this file always accept, in
# addition to the caller-supplied (localized) word.
BASEWORD = r"Image"

# Root URL that hashed image paths are appended to when downloading.
BASE_URL = "http://upload.wikimedia.org/wikipedia/commons"
+
def get_source_url(filename):
    """Return the download URL of *filename*: BASE_URL plus its hashed path."""
    relative_path = get_endpath(filename)
    return "%s/%s" % (BASE_URL, relative_path)
+
def get_dirs(filename):
    """Return the two hashed directory names derived from *filename*.

    The names are the first hex digit and the first two hex digits of the
    MD5 digest of the file name, returned as a ``(one_digit, two_digits)``
    tuple.

    Text (unicode) names are hashed as their UTF-8 bytes, so non-ASCII names
    no longer raise while being hashed.
    """
    # Local import: the old ``md5`` module is deprecated and absent on
    # Python 3, while ``hashlib`` provides the same digest.
    import hashlib

    if not isinstance(filename, bytes):
        filename = filename.encode("utf-8")
    digest = hashlib.md5(filename).hexdigest()
    return (digest[0], digest[:2])
+
def get_endpath(filename):
    """Return ``<h>/<hh>/<filename>``, the hashed relative path of *filename*.

    ``<h>`` and ``<hh>`` are the two directory names given by get_dirs().
    """
    outer_dir, inner_dir = get_dirs(filename)
    return "%s/%s/%s" % (outer_dir, inner_dir, filename)
+
def canonicalize_filename(wikiname):
    """Turn an image name as written in wiki text into its stored file name.

    Spaces become underscores and the first character is upper-cased; the
    rest of the name is left untouched.  An empty name is returned unchanged
    instead of raising IndexError.
    """
    underscored = wikiname.replace(' ', '_')
    # Slicing (rather than indexing) keeps this safe for the empty string.
    return underscored[:1].upper() + underscored[1:]
+
+class WorkaroundURLopener(urllib.FancyURLopener):
+ version = "OLPC_wikislicer/0.1"
+
+urllib._urlopener = WorkaroundURLopener()
+
def download_image(filename, base_dir):
    """Download *filename* into its hashed sub-directory of *base_dir*.

    The file is stored as ``<base_dir>/<h>/<hh>/<filename>``, where the two
    directory names come from get_dirs().

    Returns the destination path on success, or False when the destination
    directory cannot be created or the download fails.
    """
    source = get_source_url(filename)
    dirs = get_dirs(filename)
    destdir = "%s/%s/%s" % (base_dir, dirs[0], dirs[1])
    try:
        os.makedirs(destdir)
    except OSError:
        # The usual cause is that destdir already exists, which is fine.
        # Anything else (permissions, a file in the way, ...) is reported
        # here instead of surfacing later as a confusing download error.
        if not os.path.isdir(destdir):
            print("Failed to create directory " + destdir)
            return False
    dest = "%s/%s" % (destdir, filename)
    try:
        urllib.urlretrieve(source, dest)
    except Exception:
        # Still best-effort, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit stop a long batch run.
        print("Failed to download " + source)
        return False
    return dest
+
def make_svg_wrapper(name, width, height):
    """Return the text of a minimal SVG document that embeds image *name*.

    The document is *width* x *height* units and contains one ``<image>``
    element, referencing *name*, stretched over the whole canvas.

    *name* is XML-escaped so that characters such as ``&`` or ``"`` in a file
    name cannot produce a malformed document.
    """
    from xml.sax.saxutils import escape

    href = escape(name, {'"': "&quot;"})
    # SVG attribute names are case-sensitive: it must be "viewBox"; the
    # all-lowercase spelling is not recognized.
    template = (
        '<svg xmlns="http://www.w3.org/2000/svg" version="1.2" '
        'xmlns:xlink="http://www.w3.org/1999/xlink" '
        'width="%(width)i" height="%(height)i" '
        'viewBox="0 0 %(width)i %(height)i">'
        '<image xlink:href="%(name)s" width="100%%" height="100%%" x="0" y="0"/>'
        '</svg>'
    )
    return template % {'name': href, 'width': width, 'height': height}
+
def get_dims(path):
    """Return ``(width, height)`` of the image at *path*, or False on failure.

    The size is obtained by running the external ``identify`` program with
    the format ``%wx%h`` and parsing its standard output.
    """
    try:
        proc = subprocess.Popen(['identify', '-format', '%wx%h', path],
                                stdout=subprocess.PIPE)
        # communicate() drains the pipe while waiting; calling wait() first
        # can block forever if the child fills the pipe buffer.
        output, _ = proc.communicate()
        if not isinstance(output, str):
            # Python 3 returns bytes from the pipe.
            output = output.decode('ascii')
        parts = output.split('x')
        return (int(parts[0]), int(parts[1]))
    except Exception:
        # Covers a missing ``identify`` binary as well as unparsable output.
        print("Failed to get dims")
        return False
+
def _larger(current, candidate):
    """Return the larger of two sizes, treating a None *current* as unset."""
    if current is None:
        return candidate
    return max(current, candidate)


def download_and_process(imgdict, base_dir, thumb_width):
    """Download every image named in *imgdict* and shrink it to the largest
    size any of its uses asks for.

    imgdict     -- maps a wiki image name to an iterable of property objects
                   exposing ``width``, ``height`` and ``thumbnail``.
    base_dir    -- directory the images are downloaded into.
    thumb_width -- width assumed for a thumbnail use with no explicit width.

    Images that fail to download are skipped.
    """
    for wikiname in imgdict:
        filename = canonicalize_filename(wikiname)
        d = download_image(filename, base_dir)
        if not d:
            continue
        width = None
        height = None
        for p in imgdict[wikiname]:
            if p.width is not None:
                width = _larger(width, p.width)
            elif p.thumbnail:
                width = _larger(width, thumb_width)
            if p.height is not None:
                height = _larger(height, p.height)
        # Process the downloaded file (d), not the bare file name: the file
        # lives under base_dir, not in the current directory.
        process_image(d, width, height)
+
# Bounding box used when the caller gives no explicit width / height.
MAXWIDTH = 800
MAXHEIGHT = 800


def _remove_if_present(path):
    """Delete *path* if it exists, reporting (not raising) on failure."""
    if not os.path.exists(path):
        return
    try:
        os.remove(path)
    except OSError:
        print("Error: failed to remove " + path)


def _process_vector(d, newsize):
    """Rasterize the SVG at *d* and keep whichever form is cheaper to store.

    Returns the number of bytes kept on disk, or 0 if processing failed (in
    which case both the SVG and any JPEG produced are removed).
    """
    jpg_name = d + '.jpg'
    try:
        subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                               "-quality", "20", "JPEG:%s" % jpg_name])
        dims = get_dims(jpg_name)
        if not dims:
            raise ValueError("could not read dimensions of " + jpg_name)
        (width, height) = dims

        svg_factor = 0.3  # favorability of SVG
        print("Processing vector image " + d)
        jpg_size = os.stat(jpg_name).st_size
        svg_size = svg_factor * os.stat(d).st_size
        if svg_size > jpg_size:
            print("Replacing svg by a raster wrapper")
            endname = jpg_name.split('/')[-1]
            # Overwrite the SVG with a thin wrapper that references the JPEG.
            with open(d, 'w') as f:
                f.write(make_svg_wrapper(endname, width, height))
            return jpg_size + os.stat(d).st_size
        print("Preserving svg as vector")
        os.remove(jpg_name)
        return os.stat(d).st_size
    except Exception:
        print("Error: convert failed on " + d)
        # Clean up each file independently so one failure does not leave the
        # other behind.
        _remove_if_present(d)
        _remove_if_present(jpg_name)
        return 0


def _process_raster(d, newsize):
    """Re-encode the raster image at *d* in place as a low-quality JPEG.

    Returns the resulting file size in bytes, or 0 if conversion failed (in
    which case the file is removed).
    """
    print("Processing raster image " + d)
    try:
        subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                               "-quality", "20", "JPEG:%s" % d])
        print("Succesfully resized " + d)
        return os.stat(d).st_size
    except Exception:
        print("Error: convert failed on " + d)
        _remove_if_present(d)
        return 0


def process_image(d, width=None, height=None):
    """Shrink the image file at path *d* to fit within *width* x *height*.

    Missing dimensions default to MAXWIDTH / MAXHEIGHT.  The geometry passed
    to ``convert`` ends in ``>``, so images are only ever shrunk.  Files whose
    name ends in "svg" (case-insensitive) are handled as vector images; all
    others as raster images.

    Returns the number of bytes the processed image occupies on disk, or 0
    when processing failed and the image was removed.
    """
    if width is None:
        width = MAXWIDTH
    if height is None:
        height = MAXHEIGHT
    newsize = "%ix%i>" % (width, height)
    if d[-3:].upper() == 'SVG':
        return _process_vector(d, newsize)
    return _process_raster(d, newsize)
+
def process_imagelist(list_filename, base_dir, imgword, maxsize=float('inf')):
    """Download and process the images listed in *list_filename*.

    Each non-blank line must look like ``[[<word>:<name>]] <hits> <width>
    <height>``, where ``<word>`` is BASEWORD or *imgword* and width / height
    are integers or the literal ``None``.

    list_filename -- path of the list file.
    base_dir      -- directory images are downloaded into.
    imgword       -- additional link prefix accepted besides BASEWORD.
    maxsize       -- stop once the running total of bytes exceeds this.

    Raises AssertionError for a non-blank line that does not match.
    """
    with open(list_filename) as f:
        print("opened " + list_filename)
        totalsize = 0  # bytes
        searcher = r"\[\[(?:%s|%s):(.*?)\]\]\s+(\d+)\s+(.*?)\s+(.*?)$" % (BASEWORD, imgword)
        print(searcher)
        line_re = re.compile(searcher)
        # Iterate lazily instead of loading the whole list with readlines().
        for line in f:
            if not line.strip():
                # A stray blank line (e.g. at end of file) should not abort
                # the whole run.
                continue
            m = line_re.search(line)
            if m is None:
                raise AssertionError("Match didn't work: %r" % (line,))
            wikiname, hits, width, height = m.groups()
            print("%s %s %s %s" % (wikiname, hits, width, height))

            width = None if width == 'None' else int(width)
            height = None if height == 'None' else int(height)

            filename = canonicalize_filename(wikiname)
            d = download_image(filename, base_dir)
            if d:
                s = process_image(d, width, height)
                totalsize += s
                print(d + " occupies " + str(s) + " bytes; running total is " + str(totalsize))
            if totalsize > maxsize:
                break
+
class ImageProps:
    """Display options of one image link: thumbnail, size and upright flags."""

    # Class-level defaults; a link that sets an option overrides the
    # corresponding attribute on its own instance.
    thumbnail = False
    upright = False
    width = None
    height = None

    def __repr__(self):
        """Render as ``<thumbnail> (<width>, <height>) <upright>``."""
        fields = (self.thumbnail, self.width, self.height, self.upright)
        return "%s (%s, %s) %s" % fields
+
class ImageFinder:
    """Find image links in wiki articles and collect their display options."""

    def __init__(self, image_word):
        """Remember the localized image prefix and open the article database.

        image_word -- link prefix accepted in addition to BASEWORD.
        """
        self.word = image_word
        # NOTE(review): this needs a module-level name ``server``; the file's
        # ``import server`` line is currently commented out -- confirm before
        # using this class.
        self.db = server.WPWikiDB()

    def find_images(self, text):
        """Return ``(filename, ImageProps)`` for every image link in *text*.

        A link is ``[[<word>:<filename>|<option>|...]]`` where ``<word>`` is
        BASEWORD or this finder's word.  Recognized options are ``thumb`` /
        ``thumbnail``, ``upright`` (optionally ``upright=<factor>``) and a
        size ending in ``px`` (``<w>px``, ``x<h>px`` or ``<w>x<h>px``);
        everything else, such as the caption, is ignored.
        """
        found = []

        # filename: up to the first '|' or ']'; options: the remainder, which
        # may itself contain [..] and [[..]] links (one nesting level).
        pattern = r"\[\[(?:%s|%s):\s*(?P<filename>[^\|\]]+?)\s*(?:\|(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]" % (BASEWORD, self.word)
        for match in re.finditer(pattern, text):
            f = match.group('filename')
            p = ImageProps()
            # The options group is optional and is None for a bare link.
            options = match.group('options') or ''
            for s in options.split('|'):
                s = s.strip()
                if s == 'thumb' or s == 'thumbnail':
                    p.thumbnail = True
                elif s == 'upright' or s.startswith('upright='):
                    p.upright = True
                elif s.endswith('px'):
                    # Either side of the 'x' may be missing ("200px",
                    # "x100px"); captions that merely end in "px" are
                    # skipped because neither side is numeric.
                    w, _sep, h = s[:-2].partition('x')
                    if w.isdigit():
                        p.width = int(w)
                    if h.isdigit():
                        p.height = int(h)
            print((f, p))
            found.append((f, p))
        return found

    def get_images_info(self, title):
        """Return find_images() applied to the expanded article *title*."""
        text = self.db.getExpandedArticle(title)
        return self.find_images(text)

    def list_images(self, title):
        """Return just the file names of the images used by *title*."""
        return [fname for (fname, _props) in self.get_images_info(title)]

    def get_metadata_all(self, titles):
        """Return a dict mapping each image file name to the list of
        ImageProps of all its uses across *titles*."""
        d = collections.defaultdict(list)
        for t in titles:
            for (fname, props) in self.get_images_info(t):
                d[fname].append(props)
        return d
+
def read_links(index):
    """Return the article titles linked from the HTML file at path *index*.

    A title is whatever follows ``/wiki/`` inside a quoted ``href``
    attribute; titles are returned in document order, exactly as written in
    the file.
    """
    # ``with`` closes the file even if reading fails.
    with open(index) as f:
        text = f.read()
    # Raw string so the regex escapes are not treated as string escapes.
    link_re = r"""href\s*=\s*['"]/wiki/([^'"]+)['"]"""
    return [match.group(1) for match in re.finditer(link_re, text)]
+
def main_task(db_path, indexfile, image_word, base_dir, thumb_width):
    """Fetch and process every image used by the articles linked from
    *indexfile*.

    db_path     -- passed to server.load_db().
    indexfile   -- HTML file whose /wiki/ links name the articles.
    image_word  -- image-link prefix handed to ImageFinder.
    base_dir    -- directory the images are downloaded into.
    thumb_width -- width assumed for thumbnails without an explicit size.
    """
    article_titles = read_links(indexfile)
    print(article_titles)
    server.load_db(db_path)
    finder = ImageFinder(image_word)
    metadata = finder.get_metadata_all(article_titles)
    print(metadata)
    download_and_process(metadata, base_dir, thumb_width)
+
# Alternative mode (disabled): collect images from the articles linked by a
# static index page.
#main_task("/home/olpc/40ormore.xml.bz2", "../static/index.html", "Imagen", "/home/olpc/images", 180)

# Active mode: runs as soon as this file is executed.  Processes the image
# list and stops once the running total exceeds 15000000 bytes.
process_imagelist(
    "/home/olpc/top70k_images2",
    "/home/olpc/images",
    "Imagen",
    15000000,
)