from __future__ import with_statement
import re
#import server   # only needed by ImageFinder/main_task, which are disabled below
import md5
import urllib
import collections
import os
import subprocess

BASEWORD = r"Image"
BASE_URL = "http://upload.wikimedia.org/wikipedia/commons"

def get_source_url(filename):
    return "%s/%s" % (BASE_URL, get_endpath(filename))

def get_dirs(filename):
    # Wikimedia Commons shards uploads into directories named after the
    # first one and first two hex digits of the MD5 of the filename.
    m = md5.new()
    m.update(filename)
    h = m.hexdigest()
    return (h[0], h[:2])

def get_endpath(filename):
    d = get_dirs(filename)
    p = "%s/%s/%s" % (d[0], d[1], filename)
    return p

def canonicalize_filename(wikiname):
    wikiname = wikiname.replace(' ', '_')
    wikiname = wikiname[0].upper() + wikiname[1:]
    return wikiname

class WorkaroundURLopener(urllib.FancyURLopener):
    version = "OLPC_wikislicer/0.1"

urllib._urlopener = WorkaroundURLopener()

def download_image(filename, base_dir):
    source = get_source_url(filename)
    dirs = get_dirs(filename)
    destdir = "%s/%s/%s" % (base_dir, dirs[0], dirs[1])
    try:
        os.makedirs(destdir)
    except OSError:
        pass  # This just means that destdir already exists
    dest = "%s/%s" % (destdir, filename)
    try:
        urllib.urlretrieve(source, dest)
    except:
        print "Failed to download " + source
        return False
    return dest

def make_svg_wrapper(name, width, height):
    # Wrap a raster image in a minimal SVG document so callers expecting an
    # .svg file still get one. (The exact markup is an assumed reconstruction;
    # the original string literal was lost.)
    s = ('<svg xmlns="http://www.w3.org/2000/svg" '
         'xmlns:xlink="http://www.w3.org/1999/xlink" '
         'width="%(width)i" height="%(height)i">'
         '<image xlink:href="%(name)s" width="%(width)i" height="%(height)i"/>'
         '</svg>') % {'name': name, 'width': width, 'height': height}
    return s

def get_dims(path):
    try:
        p = subprocess.Popen(['identify', '-format', '%wx%h', path],
                             stdout=subprocess.PIPE)
        s = p.communicate()[0]
        l = s.split('x')
        return (int(l[0]), int(l[1]))
    except:
        print "Failed to get dims"
        return False

def download_and_process(imgdict, base_dir, thumb_width):
    for wikiname in imgdict:
        filename = canonicalize_filename(wikiname)
        d = download_image(filename, base_dir)
        if d:
            width = None
            height = None
            # Take the largest size requested by any article using this image.
            for p in imgdict[wikiname]:
                if p.width is not None:
                    width = max(width, p.width)
                elif p.thumbnail:
                    width = max(width, thumb_width)
                if p.height is not None:
                    height = max(height, p.height)
            process_image(d, width, height)

MAXWIDTH = 800
MAXHEIGHT = 800

def process_image(d, width=None, height=None):
    vector = d[-3:].upper() == 'SVG'
    if vector:
        try:
            jpg_name = d + '.jpg'
            rsvg_command = ['rsvg-convert', '--keep-aspect-ratio',
                            '--format=png', '--output', jpg_name]
            if width is not None:
                rsvg_command.append('--width=%i' % width)
            if height is not None:
                rsvg_command.append('--height=%i' % height)
            rsvg_command.append(d)
            subprocess.check_call(rsvg_command)
            # jpg_name now contains a png image; we want jpg to save space
            subprocess.check_call(['convert', "PNG:%s" % jpg_name,
                                   "-quality", "20", "JPEG:%s" % jpg_name])
            (width, height) = get_dims(jpg_name)
            svg_factor = 0.3  # favorability of SVG
            print "Processing vector image " + d
            jpg_size = os.stat(jpg_name).st_size
            svg_size = svg_factor * os.stat(d).st_size
            if svg_size > jpg_size:
                print "Replacing svg by a raster wrapper"
                endname = jpg_name.split('/')[-1]
                s = make_svg_wrapper(endname, width, height)
                f = open(d, 'w')
                f.write(s)
                f.truncate()
                f.close()
                return jpg_size + os.stat(d).st_size
            else:
                print "Preserving svg as vector"
                os.remove(jpg_name)
                return os.stat(d).st_size
        except:
            print "Error: convert failed on " + d
            try:
                os.remove(d)
                os.remove(jpg_name)
            except:
                print "Error: failed to remove " + d
            return 0
    else:
        print "Processing raster image " + d
        try:
            if width is None:
                width = MAXWIDTH
            if height is None:
                height = MAXHEIGHT
            newsize = "%ix%i>" % (width, height)  # '>' = shrink only, never enlarge
            subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                                   "-quality", "20", "JPEG:%s" % d])
            print "Successfully resized " + d
            return os.stat(d).st_size
        except:
            print "Error: convert failed on " + d
            try:
                os.remove(d)
            except:
                print "Error: failed to remove " + d
            return 0
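# A sketch of the list-file format process_imagelist below expects, inferred
# from its `searcher` regex (the entries here are hypothetical examples):
# one image link per line, followed by a hit count and the requested width
# and height, with the literal string "None" when a dimension is unspecified:
#
#   [[Imagen:Machu Picchu.jpg]] 42 180 None
#   [[Image:Flag of Peru.svg]] 7 None None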
def process_imagelist(list_filename, base_dir, imgword, maxsize=float('inf')):
    with open(list_filename) as f:
        print "opened " + list_filename
        totalsize = 0  # bytes
        searcher = r"\[\[(?:%s|%s):(.+?)\]\]\s+(\d+)\s+(.*?)\s+(.*?)$" % (BASEWORD, imgword)
        print searcher
        for line in f.readlines():
            m = re.search(searcher, line)
            if m is None:
                print "WARNING: Match didn't work on " + line
                continue
            wikiname = m.group(1)
            hits = m.group(2)
            width = m.group(3)
            height = m.group(4)
            print wikiname, hits, width, height
            if width == 'None':
                width = None
            else:
                width = int(width)
            if height == 'None':
                height = None
            else:
                height = int(height)
            filename = canonicalize_filename(wikiname)
            d = download_image(filename, base_dir)
            if d:
                s = process_image(d, width, height)
                totalsize += s
                print d + " occupies " + str(s) + " bytes; running total is " + str(totalsize)
            if totalsize > maxsize:
                break

class ImageProps:
    thumbnail = False
    width = None
    height = None
    upright = False

    def __repr__(self):
        return "%s (%s, %s) %s" % (self.thumbnail, self.width, self.height, self.upright)

class ImageFinder:
    def __init__(self, image_word):
        self.word = image_word
        self.db = server.WPWikiDB()  # requires re-enabling `import server` at the top

    def find_images(self, text):
        L = []
        #pattern = r"\[\[(?:%s|%s):(?P<filename>[^\|\]]+)(?:\|(?P<thumb>thumb|thumbnail)|(?P<width>\d+)(?:x(?P<height>\d+))?px|(?P<upright>upright)|(?:[^\|\[\]]|\[[^\|\[\]]*\]|\[\[[^\|\[\]]*\]\])*)*\]\]" % (BASEWORD, self.word)
        #pattern = r"\[\[(?:%s|%s):(?P<filename>[^\|\]]+)(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*)\]\]" % (BASEWORD, self.word)
        pattern = r"\[\[(?:%s|%s):\s*(?P<filename>[^\|\]]+?)\s*(?:\|(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]" % (BASEWORD, self.word)
        for match in re.finditer(pattern, text):
            if match:
                #d = match.groupdict(None)
                f = match.group('filename')
                p = ImageProps()
                options = match.group('options') or ''  # group is None when the link has no options
                for s in options.split('|'):
                    if s == 'thumb' or s == 'thumbnail':
                        p.thumbnail = True
                    elif s == 'upright':
                        p.upright = True
                    elif s[-2:] == 'px':
                        dims = s[:-2].split('x')
                        if len(dims) > 0 and dims[0]:
                            p.width = int(dims[0])
                        if len(dims) > 1:
                            p.height = int(dims[1])
                print (f, p)
                L.append((f, p))
        return L

    def get_images_info(self, title):
        text = self.db.getExpandedArticle(title)
        return self.find_images(text)

    def list_images(self, title):
        props = self.get_images_info(title)
        filenames = [t[0] for t in props]
        return filenames

    def get_metadata_all(self, titles):
        d = collections.defaultdict(list)
        for t in titles:
            L = self.get_images_info(t)
            for (fname, props) in L:
                d[fname].append(props)
        return d

def read_links(index):
    f = open(index)
    text = f.read()
    f.close()
    titles = []
    for match in re.finditer(r'href\s*=\s*[\'\"]/wiki/([^\'\"]+)[\'\"]', text):
        if match:
            titles.append(match.group(1))
    return titles

def main_task(db_path, indexfile, image_word, base_dir, thumb_width):
    titles = read_links(indexfile)
    print titles
    server.load_db(db_path)
    p = ImageFinder(image_word)
    m = p.get_metadata_all(titles)
    print m
    download_and_process(m, base_dir, thumb_width)

#main_task("/home/olpc/40ormore.xml.bz2", "../static/index.html", "Imagen", "/home/olpc/images", 180)
process_imagelist("top70k_images", "../es_PE/images", "Imagen", 23000000)
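# Illustration (hypothetical wikitext, not taken from any article) of what
# ImageFinder.find_images extracts: given
#
#   [[Image:Foo.jpg|thumb|180x120px|upright|A caption]]
#
# the pattern captures filename "Foo.jpg" and options
# "thumb|180x120px|upright|A caption", which the option loop turns into an
# ImageProps with thumbnail=True, width=180, height=120, upright=True;
# the caption matches no option form and is ignored.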
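# Usage sketch for the index-driven path: instead of a precomputed image list,
# main_task extracts article titles from a rendered index page and walks their
# wikitext. It needs `import server` re-enabled at the top; the paths here are
# hypothetical:
#
#   main_task("/path/to/wiki.xml.bz2", "../static/index.html", "Imagen",
#             "/path/to/images", 180)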