From c52d16375b2f2a8ce33a5e058085fcaf36d584dd Mon Sep 17 00:00:00 2001 From: Wade Brainerd Date: Fri, 23 May 2008 23:01:10 +0000 Subject: Reorganization stpe 3. --- (limited to 'woip') diff --git a/woip/py/get_images.py b/woip/py/get_images.py deleted file mode 100644 index 91da39e..0000000 --- a/woip/py/get_images.py +++ /dev/null @@ -1,252 +0,0 @@ -from __future__ import with_statement -import re -#import server -import md5 -import urllib -import collections -import os -import subprocess - -BASEWORD = r"Image" - -BASE_URL="http://upload.wikimedia.org/wikipedia/commons" - -def get_source_url(filename): - return "%s/%s" % (BASE_URL, get_endpath(filename)) - -def get_dirs(filename): - m = md5.new() - m.update(filename) - h = m.hexdigest() - return (h[0], h[:2]) - -def get_endpath(filename): - d = get_dirs(filename) - p = "%s/%s/%s" % (d[0], d[1], filename) - return p - -def canonicalize_filename(wikiname): - wikiname = wikiname.replace(' ', '_') - wikiname = wikiname[0].upper() + wikiname[1:] - return wikiname - -class WorkaroundURLopener(urllib.FancyURLopener): - version = "OLPC_wikislicer/0.1" - -urllib._urlopener = WorkaroundURLopener() - -def download_image(filename, base_dir): - source = get_source_url(filename) - dirs = get_dirs(filename) - destdir = "%s/%s/%s" % (base_dir, dirs[0], dirs[1]) - try: - os.makedirs(destdir) - except: - pass #This just means that destdir already exists - dest = "%s/%s" % (destdir, filename) - try: - urllib.urlretrieve(source,dest) - except: - print "Failed to download " + source - return False - return dest - -def make_svg_wrapper(name, width, height): - s = '' % {'name':name, 'width':width, 'height':height } - return s - -def get_dims(path): - try: - p = subprocess.Popen(['identify','-format','%wx%h',path],stdout=subprocess.PIPE) - p.wait() - s = p.stdout.read() - l = s.split('x') - return (int(l[0]), int(l[1])) - except: - print "Failed to get dims" - return False - -def download_and_process(imgdict, base_dir, thumb_width): - for wikiname in imgdict: - filename = canonicalize_filename(wikiname) - d = download_image(filename, base_dir) - if d: - width = None - height= None - for p in imgdict[wikiname]: - if p.width is not None: - width = max(width, p.width) - elif p.thumbnail: - width = max(width, thumb_width) - if p.height is not None: - height = max(height, p.height) - process_image(filename, width, height) - -MAXWIDTH=800 -MAXHEIGHT=800 -def process_image(d, width=None, height=None): - if width is None: - width = MAXWIDTH - if height is None: - height = MAXHEIGHT - newsize = "%ix%i>" % (width, height) - vector = d[-3:].upper() == 'SVG' - if vector: - try: - jpg_name = d + '.jpg' - subprocess.check_call(['convert', d,"-flatten", "-resize", newsize, "-quality", "20", "JPEG:%s" % jpg_name]) - (width, height) = get_dims(jpg_name) - - svg_factor = 0.3 #favorability of SVG - print "Processing vector image " + d - jpg_size = os.stat(jpg_name).st_size - svg_size = svg_factor * os.stat(d).st_size - if svg_size > jpg_size: - print "Replacing svg by a raster wrapper" - endname = jpg_name.split('/')[-1] - s = make_svg_wrapper(endname, width, height) - f = open(d,'w') - f.write(s) - f.truncate() - f.close() - return jpg_size + os.stat(d).st_size - else: - print "Preserving svg as vector" - os.remove(jpg_name) - return os.stat(d).st_size - except: - print "Error: convert failed on " + d - try: - os.remove(d) - os.remove(jpg_name) - except: - print "Error: failed to remove " + d - return 0 - - else: - print "Processing raster image " + d - try: - subprocess.check_call(['convert', d,"-flatten", "-resize", newsize, "-quality", "20", "JPEG:%s" % d]) - print "Succesfully resized " + d - return os.stat(d).st_size - except: - print "Error: convert failed on " + d - try: - os.remove(d) - except: - print "Error: failed to remove " + d - return 0 - -def process_imagelist(list_filename, base_dir, imgword, maxsize=float('inf')): - with open(list_filename) as f: - print "opened " + list_filename - totalsize = 0 #bytes - searcher = r"\[\[(?:%s|%s):(.*?)\]\]\s+(\d+)\s+(.*?)\s+(.*?)$" % (BASEWORD, imgword) - print searcher - for line in f.readlines(): - m = re.search(searcher, line) - if m is None: - raise AssertionError("Match didn't work") - wikiname = m.group(1) - hits = m.group(2) - width = m.group(3) - height = m.group(4) - print wikiname, hits, width, height - - if width == 'None': - width = None - else: - width = int(width) - - if height == 'None': - height = None - else: - height = int(height) - - filename = canonicalize_filename(wikiname) - d = download_image(filename, base_dir) - if d: - s = process_image(d, width, height) - totalsize += s - print d + " occupies " + str(s) + " bytes; running total is " + str(totalsize) - if totalsize > maxsize: - break - -class ImageProps: - thumbnail = False - width = None - height = None - upright = False - - def __repr__(self): - return "%s (%s, %s) %s" % (self.thumbnail, self.width, self.height, self.upright) - -class ImageFinder: - def __init__(self, image_word): - self.word = image_word - self.db = server.WPWikiDB() - - def find_images(self, text): - L = [] - - #pattern = r"\[\[(?:%s|%s):(?P[^\|\]]+)(?:\|(?Pthumb|thumbnail)|(?P\d+)(?:x(?P\d+))?px|(?Pupright)|(?:[^\|\[\]]|\[[^\|\[\]]*\]|\[\[[^\|\[\]]*\]\])*)*\]\]" % (BASEWORD, self.word) - #pattern = r"\[\[(?:%s|%s):(?P[^\|\]]+)(?P(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*)\]\]" % (BASEWORD, self.word) - pattern = r"\[\[(?:%s|%s):\s*(?P[^\|\]]+?)\s*(?:\|(?P(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]" % (BASEWORD, self.word) - for match in re.finditer(pattern, text): - if match: - #d = match.groupdict(None) - f = match.group('filename') - p = ImageProps() - for s in match.group('options').split('|'): - if s == 'thumb' or s == 'thumbnail': - p.thumbnail = True - elif s == 'upright': - p.upright = False - elif s[-2:] == 'px': - dims = s[:-2].split('x') - if len(dims) > 0: - p.width = int(dims[0]) - if len(dims) > 1: - p.height = int(dims[1]) - print (f,p) - L.append((f,p)) - return L - - def get_images_info(self, title): - text = self.db.getExpandedArticle(title) - return self.find_images(text) - - def list_images(self, title): - props = self.get_images_info(title) - filenames = [t[0] for t in props] - return filenames - - def get_metadata_all(self, titles): - d = collections.defaultdict(list) - for t in titles: - L = self.get_images_info(t) - for (fname, props) in L: - d[fname].append(props) - return d - -def read_links(index): - f = open(index) - text = f.read() - f.close() - titles = [] - for match in re.finditer('href\s*=\s*[\'\"]/wiki/([^\'\"]+)[\'\"]', text): - if match: - titles.append(match.group(1)) - return titles - -def main_task(db_path, indexfile, image_word, base_dir, thumb_width): - titles = read_links(indexfile) - print titles - server.load_db(db_path) - p = ImageFinder(image_word) - m = p.get_metadata_all(titles) - print m - download_and_process(m, base_dir, thumb_width) - -#main_task("/home/olpc/40ormore.xml.bz2", "../static/index.html", "Imagen", "/home/olpc/images", 180) -process_imagelist("/home/olpc/top70k_images2", "/home/olpc/images", "Imagen", 15000000) diff --git a/woip/py/wp/setup.py b/woip/py/setup.py index 4d43048..4d43048 100644 --- a/woip/py/wp/setup.py +++ b/woip/py/setup.py diff --git a/woip/py/wp/wp.i b/woip/py/wp.i index 1bba516..1bba516 100644 --- a/woip/py/wp/wp.i +++ b/woip/py/wp.i -- cgit v0.9.1