diff options
author | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:59:37 (GMT) |
---|---|---|
committer | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:59:37 (GMT) |
commit | 9878512ab181ef56e82d91ed3e69ddbaa50520d0 (patch) | |
tree | 879e52bebdea44daa32afaaa8802c183fd9484ed /woip/py | |
parent | dd58bf72d6799438d8033cf7de6bc26a711734c3 (diff) |
Reorganization step 2.
Diffstat (limited to 'woip/py')
-rw-r--r-- | woip/py/get_images.py | 252 | ||||
-rw-r--r-- | woip/py/wp/setup.py | 22 | ||||
-rw-r--r-- | woip/py/wp/wp.i | 106 |
3 files changed, 380 insertions, 0 deletions
# woip/py/get_images.py -- download and resize Wikipedia images for wikislices.
"""Helpers for fetching images from Wikimedia Commons.

Filenames map to Commons URLs through the MD5-based directory sharding
scheme used by upload.wikimedia.org (first hex digit / first two digits).
"""
import collections
import hashlib  # replaces the removed Python 2 `md5` module
import os
import re
import subprocess
import urllib.request

#import server  # NOTE(review): still commented out; ImageFinder/main_task need it

# Canonical image-namespace word; wikis may also use a localized word
# (e.g. "Imagen") passed in as `imgword` by the callers below.
BASEWORD = r"Image"

BASE_URL = "http://upload.wikimedia.org/wikipedia/commons"


def get_source_url(filename):
    """Return the full Commons URL for an already-canonicalized filename."""
    return "%s/%s" % (BASE_URL, get_endpath(filename))


def get_dirs(filename):
    """Return the two hash directory components ``(h[0], h[:2])``.

    Commons shards uploads into directories named after the first hex
    digits of the MD5 of the filename.
    """
    h = hashlib.md5(filename.encode("utf-8")).hexdigest()
    return (h[0], h[:2])


def get_endpath(filename):
    """Return the hashed relative path ``d/dd/filename`` for *filename*."""
    first, second = get_dirs(filename)
    return "%s/%s/%s" % (first, second, filename)


def canonicalize_filename(wikiname):
    """Convert a wiki image name to canonical form.

    MediaWiki stores titles with spaces as underscores and an uppercase
    first letter.  An empty name is returned unchanged instead of raising
    IndexError (robustness fix).
    """
    if not wikiname:
        return wikiname
    wikiname = wikiname.replace(' ', '_')
    return wikiname[0].upper() + wikiname[1:]


class WorkaroundURLopener:
    # Kept for backward compatibility with the original module, which
    # subclassed urllib.FancyURLopener purely to change the user agent --
    # presumably because the server rejects Python's default one (TODO confirm).
    version = "OLPC_wikislicer/0.1"


# Install a global opener so urlretrieve() sends our user agent.
_opener = urllib.request.build_opener()
_opener.addheaders = [('User-Agent', WorkaroundURLopener.version)]
urllib.request.install_opener(_opener)


def download_image(filename, base_dir):
    """Download one Commons image into the hashed directory layout.

    Returns the local destination path on success, or False when the
    download failed (matching the original contract).
    """
    source = get_source_url(filename)
    first, second = get_dirs(filename)
    destdir = "%s/%s/%s" % (base_dir, first, second)
    os.makedirs(destdir, exist_ok=True)  # was try/except pass around makedirs
    dest = "%s/%s" % (destdir, filename)
    try:
        urllib.request.urlretrieve(source, dest)
    except OSError:  # URLError/HTTPError are OSError subclasses; was bare except
        print("Failed to download " + source)
        return False
    return dest


def make_svg_wrapper(name, width, height):
    """Return a minimal SVG document that just displays the raster image *name*."""
    s = '<svg xmlns="http://www.w3.org/2000/svg" version="1.2" xmlns:xlink="http://www.w3.org/1999/xlink" width="%(width)i" height="%(height)i" viewbox="0 0 %(width)i %(height)i"><image xlink:href="%(name)s" width="100%%" height="100%%" x="0" y="0"/></svg>' % {'name': name, 'width': width, 'height': height}
    return s


def get_dims(path):
    """Return ``(width, height)`` of an image via ImageMagick's ``identify``.

    Returns False if identify is unavailable, fails, or produces
    unparseable output.
    """
    try:
        out = subprocess.check_output(['identify', '-format', '%wx%h', path])
        w, h = out.decode('ascii').split('x')
        return (int(w), int(h))
    except (OSError, subprocess.CalledProcessError, ValueError):
        print("Failed to get dims")
        return False
def download_and_process(imgdict, base_dir, thumb_width):
    """Download every image in *imgdict* and resize it to its largest requested size.

    *imgdict* maps wiki image names to lists of ImageProps describing each
    place the image is used; the maximum requested width/height wins.
    Thumbnails with no explicit width count as *thumb_width*.
    """
    for wikiname in imgdict:
        filename = canonicalize_filename(wikiname)
        local = download_image(filename, base_dir)
        if not local:
            continue
        width = None
        height = None
        for props in imgdict[wikiname]:
            # max(None, x) raises TypeError on Python 3; seed explicitly.
            if props.width is not None:
                width = props.width if width is None else max(width, props.width)
            elif props.thumbnail:
                width = thumb_width if width is None else max(width, thumb_width)
            if props.height is not None:
                height = props.height if height is None else max(height, props.height)
        # Bug fix: the original passed `filename` (the bare wiki name), but
        # process_image expects the downloaded path -- compare the call in
        # process_imagelist, which correctly passes the path.
        process_image(local, width, height)


MAXWIDTH = 800
MAXHEIGHT = 800


def process_image(d, width=None, height=None):
    """Resize the image at path *d* in place (JPEG quality 20); return its size in bytes.

    width/height default to MAXWIDTH/MAXHEIGHT.  SVG images are rasterized
    to ``d + '.jpg'``; if the SVG (weighted by svg_factor) is bigger than
    the JPEG, the SVG file is replaced by a small wrapper referencing the
    JPEG and the combined size is returned.  Returns 0 on failure, removing
    any broken output files.
    """
    if width is None:
        width = MAXWIDTH
    if height is None:
        height = MAXHEIGHT
    newsize = "%ix%i>" % (width, height)  # '>' = only shrink, never enlarge
    if d[-3:].upper() == 'SVG':
        jpg_name = d + '.jpg'
        try:
            subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                                   "-quality", "20", "JPEG:%s" % jpg_name])
            # get_dims may return False; unpacking then raises TypeError,
            # which the except below treats as a conversion failure.
            (width, height) = get_dims(jpg_name)

            svg_factor = 0.3  # favorability of SVG
            print("Processing vector image " + d)
            jpg_size = os.stat(jpg_name).st_size
            svg_size = svg_factor * os.stat(d).st_size
            if svg_size > jpg_size:
                print("Replacing svg by a raster wrapper")
                endname = jpg_name.split('/')[-1]
                with open(d, 'w') as f:  # was open/close without `with`
                    f.write(make_svg_wrapper(endname, width, height))
                    f.truncate()
                return jpg_size + os.stat(d).st_size
            else:
                print("Preserving svg as vector")
                os.remove(jpg_name)
                return os.stat(d).st_size
        except (OSError, subprocess.CalledProcessError, TypeError):  # was bare except
            print("Error: convert failed on " + d)
            try:
                os.remove(d)
                os.remove(jpg_name)
            except OSError:
                print("Error: failed to remove " + d)
            return 0
    else:
        print("Processing raster image " + d)
        try:
            subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                                   "-quality", "20", "JPEG:%s" % d])
            print("Succesfully resized " + d)
            return os.stat(d).st_size
        except (OSError, subprocess.CalledProcessError):  # was bare except
            print("Error: convert failed on " + d)
            try:
                os.remove(d)
            except OSError:
                print("Error: failed to remove " + d)
            return 0
def process_imagelist(list_filename, base_dir, imgword, maxsize=float('inf')):
    """Download and resize every image named in a pre-built image list file.

    Each line of *list_filename* must match
    ``[[Image:<name>]] <hits> <width> <height>`` (widths/heights may be the
    literal string ``None``).  Stops once *maxsize* bytes have been written.
    Raises AssertionError on a malformed line (original strict behavior kept).
    """
    with open(list_filename) as f:
        print("opened " + list_filename)
        totalsize = 0  # bytes written so far
        searcher = r"\[\[(?:%s|%s):(.*?)\]\]\s+(\d+)\s+(.*?)\s+(.*?)$" % (BASEWORD, imgword)
        print(searcher)
        for line in f:  # was f.readlines(); iterate lazily
            m = re.search(searcher, line)
            if m is None:
                raise AssertionError("Match didn't work")
            wikiname, hits, width, height = m.groups()
            print(wikiname, hits, width, height)

            width = None if width == 'None' else int(width)
            height = None if height == 'None' else int(height)

            filename = canonicalize_filename(wikiname)
            d = download_image(filename, base_dir)
            if d:
                s = process_image(d, width, height)
                totalsize += s
                print(d + " occupies " + str(s) + " bytes; running total is " + str(totalsize))
                if totalsize > maxsize:
                    break


class ImageProps:
    """Display properties for one use of an image in an article."""
    thumbnail = False  # rendered as a thumbnail
    width = None       # requested width in px, if any
    height = None      # requested height in px, if any
    upright = False    # 'upright' option present

    def __repr__(self):
        return "%s (%s, %s) %s" % (self.thumbnail, self.width, self.height, self.upright)


class ImageFinder:
    """Finds ``[[Image:...]]`` references in wikitext and collects their options."""

    def __init__(self, image_word):
        self.word = image_word
        # NOTE(review): `server` is never imported (the import at the top of
        # the file is commented out); constructing an ImageFinder raises
        # NameError until it is restored.
        self.db = server.WPWikiDB()

    def find_images(self, text):
        """Return ``[(filename, ImageProps), ...]`` for every image link in *text*."""
        found = []

        pattern = (r"\[\[(?:%s|%s):\s*(?P<filename>[^\|\]]+?)\s*"
                   r"(?:\|(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]"
                   % (BASEWORD, self.word))
        for match in re.finditer(pattern, text):
            fname = match.group('filename')
            props = ImageProps()
            # Bug fix: the options group is optional and may be None;
            # the original called .split('|') on it unconditionally.
            options = match.group('options') or ''
            for opt in options.split('|'):
                if opt in ('thumb', 'thumbnail'):
                    props.thumbnail = True
                elif opt == 'upright':
                    props.upright = True  # bug fix: was assigned False (a no-op)
                elif opt[-2:] == 'px':
                    dims = opt[:-2].split('x')
                    if len(dims) > 0:
                        props.width = int(dims[0])
                    if len(dims) > 1:
                        props.height = int(dims[1])
            print((fname, props))
            found.append((fname, props))
        return found

    def get_images_info(self, title):
        """Return image (filename, props) pairs for the expanded article *title*."""
        text = self.db.getExpandedArticle(title)
        return self.find_images(text)

    def list_images(self, title):
        """Return just the filenames of every image referenced by *title*."""
        return [fname for (fname, _props) in self.get_images_info(title)]

    def get_metadata_all(self, titles):
        """Aggregate image properties across many articles.

        Returns a dict mapping each image filename to the list of ImageProps
        for every place it is used.
        """
        merged = collections.defaultdict(list)
        for title in titles:
            for (fname, props) in self.get_images_info(title):
                merged[fname].append(props)
        return merged
def read_links(index):
    """Extract wiki article titles from ``href="/wiki/Title"`` links in an HTML file."""
    with open(index) as f:  # was open/read/close without `with`
        text = f.read()
    titles = []
    # raw string fix: the original non-raw pattern contains invalid escapes
    # on Python 3; the always-true `if match:` inside finditer is dropped.
    for match in re.finditer(r'href\s*=\s*[\'\"]/wiki/([^\'\"]+)[\'\"]', text):
        titles.append(match.group(1))
    return titles


def main_task(db_path, indexfile, image_word, base_dir, thumb_width):
    """Top-level driver: find every image used by the linked articles and fetch them.

    Requires the `server` module (its import at the top of this file is
    commented out) -- TODO confirm before running.
    """
    titles = read_links(indexfile)
    print(titles)
    server.load_db(db_path)
    finder = ImageFinder(image_word)
    metadata = finder.get_metadata_all(titles)
    print(metadata)
    download_and_process(metadata, base_dir, thumb_width)


if __name__ == "__main__":
    # These ran unconditionally at import time in the original; guarded so
    # the module can be imported without downloading ~15 MB of images.
    #main_task("/home/olpc/40ormore.xml.bz2", "../static/index.html", "Imagen", "/home/olpc/images", 180)
    process_imagelist("/home/olpc/top70k_images2", "/home/olpc/images", "Imagen", 15000000)


# ---------------------------------------------------------------------------
# NOTE(review): the same commit also added woip/py/wp/setup.py, a separate
# distutils build script for the SWIG `wp` extension.  It belongs in its own
# file, not here; preserved verbatim for reference:
#
#   from distutils.core import setup, Extension
#
#   wp_sources = [
#       'wp.i',
#       '../../c/bzipreader.c',
#       '../../c/wp.c',
#       '../../c/lsearcher.c',
#       '../../c/safe.c',
#       '../../c/blocks.c'
#   ]
#
#   wp_module = Extension(
#       '_wp',
#       sources=wp_sources,
#       include_dirs=['../../c'],
#       define_macros=[('DEBUG', 1)],
#       libraries=['bz2'])
#
#   setup(name='wp', version='0.1',
#         author='Wade Brainerd',
#         description="""Offline Wikipedia Interface.""",
#         ext_modules=[wp_module], py_modules=['wp'])
# ---------------------------------------------------------------------------
/dev/null +++ b/woip/py/wp/wp.i @@ -0,0 +1,106 @@ +// +// Offline Wikipedia database interface for Python +// (or any other SWIG supported language) +// +// Module functions are prefixed with 'wp_' to avoid clashes with functions +// defined in wp.h. + +// Note that the Ruby Inline implementation used a '__' prefix, but that won't +// work in Python as the underscores cause the methods to be treated as private. +// +%module wp + +%{ +#include "../../c/wp.h" + +#define MAXRES 40 +#define MAXSTR 1024 + +wp_dump d = {0}; +wp_article a = {0}; + +char results[MAXRES][MAXSTR]; +int nresults; + +char *__exact_match; +int __got_match; + +bool __handle_result(char *s) { + strncpy(results[nresults], s, MAXSTR); + results[nresults][MAXSTR - 1] = '\0'; + char *end = strrchr(results[nresults], ' '); + + if(end) { + *(end - 1) = '\0'; + nresults++; + } + + return nresults < MAXRES; +} + +bool __handle_exact_match(char *s) { + char buf[MAXSTR], *end; + strncpy(buf, s, MAXSTR); + + debug("handle_exact_match(%s)", s); + + end = strrchr(buf, ' ') - 1; + *end = '\0'; + + if(strcasecmp(buf, __exact_match)) return true; + else { + __got_match = 1; + return false; + } +} + +void wp_load_dump(char *dump, char *loc, char *ploc, char *blocks) { + load_dump(&d, dump, loc, ploc, blocks); + init_article(&a); +} + +char *wp_load_article(char *name) { + a.block = 0; + a.text[0] = '\0'; + load_article(&d, name, &a); + return a.text; +} + +int wp_article_block() { + return a.block; +} + +int wp_article_size() { + return strlen(a.text); +} + +int wp_search(char *needle) { + nresults = 0; + search(&d.index, needle, __handle_result, NULL, true, true); + return nresults; +} + +char *wp_result(int n) { + return results[n]; +} + +int wp_article_exists(char *name) { + __exact_match = name; + __got_match = 0; + debug("wp_article_exists(%s)", name); + search(&d.index, name, __handle_exact_match, NULL, true, true); + return __got_match; +} + +%} + +void wp_load_dump(char *dump, char *loc, char *ploc, char 
*blocks); + +char *wp_load_article(char *name); +int wp_article_block(); +int wp_article_size(); + +int wp_search(char *needle); +char *wp_result(int n); + +int wp_article_exists(char *name); |