diff options
author | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:59:37 (GMT) |
---|---|---|
committer | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:59:37 (GMT) |
commit | 9878512ab181ef56e82d91ed3e69ddbaa50520d0 (patch) | |
tree | 879e52bebdea44daa32afaaa8802c183fd9484ed /woip/py | |
parent | dd58bf72d6799438d8033cf7de6bc26a711734c3 (diff) |
Reorganization step 2.
Diffstat (limited to 'woip/py')
-rw-r--r-- | woip/py/get_images.py | 252 | ||||
-rw-r--r-- | woip/py/wp/setup.py | 22 | ||||
-rw-r--r-- | woip/py/wp/wp.i | 106 |
3 files changed, 380 insertions, 0 deletions
# woip/py/get_images.py -- download and resize Wikipedia images for wikislices.
"""Helpers for fetching images from Wikimedia Commons.

Filenames map to Commons URLs through the MD5-based directory sharding
scheme used by upload.wikimedia.org (first hex digit / first two digits).
"""
import collections
import hashlib  # replaces the removed Python 2 `md5` module
import os
import re
import subprocess
import urllib.request

#import server  # NOTE(review): still commented out; ImageFinder/main_task need it

# Canonical image-namespace word; wikis may also use a localized word
# (e.g. "Imagen") passed in as `imgword` by the callers below.
BASEWORD = r"Image"

BASE_URL = "http://upload.wikimedia.org/wikipedia/commons"


def get_source_url(filename):
    """Return the full Commons URL for an already-canonicalized filename."""
    return "%s/%s" % (BASE_URL, get_endpath(filename))


def get_dirs(filename):
    """Return the two hash directory components ``(h[0], h[:2])``.

    Commons shards uploads into directories named after the first hex
    digits of the MD5 of the filename.
    """
    h = hashlib.md5(filename.encode("utf-8")).hexdigest()
    return (h[0], h[:2])


def get_endpath(filename):
    """Return the hashed relative path ``d/dd/filename`` for *filename*."""
    first, second = get_dirs(filename)
    return "%s/%s/%s" % (first, second, filename)


def canonicalize_filename(wikiname):
    """Convert a wiki image name to canonical form.

    MediaWiki stores titles with spaces as underscores and an uppercase
    first letter.  An empty name is returned unchanged instead of raising
    IndexError (robustness fix).
    """
    if not wikiname:
        return wikiname
    wikiname = wikiname.replace(' ', '_')
    return wikiname[0].upper() + wikiname[1:]


class WorkaroundURLopener:
    # Kept for backward compatibility with the original module, which
    # subclassed urllib.FancyURLopener purely to change the user agent --
    # presumably because the server rejects Python's default one (TODO confirm).
    version = "OLPC_wikislicer/0.1"


# Install a global opener so urlretrieve() sends our user agent.
_opener = urllib.request.build_opener()
_opener.addheaders = [('User-Agent', WorkaroundURLopener.version)]
urllib.request.install_opener(_opener)


def download_image(filename, base_dir):
    """Download one Commons image into the hashed directory layout.

    Returns the local destination path on success, or False when the
    download failed (matching the original contract).
    """
    source = get_source_url(filename)
    first, second = get_dirs(filename)
    destdir = "%s/%s/%s" % (base_dir, first, second)
    os.makedirs(destdir, exist_ok=True)  # was try/except pass around makedirs
    dest = "%s/%s" % (destdir, filename)
    try:
        urllib.request.urlretrieve(source, dest)
    except OSError:  # URLError/HTTPError are OSError subclasses; was bare except
        print("Failed to download " + source)
        return False
    return dest


def make_svg_wrapper(name, width, height):
    """Return a minimal SVG document that just displays the raster image *name*."""
    s = '<svg xmlns="http://www.w3.org/2000/svg" version="1.2" xmlns:xlink="http://www.w3.org/1999/xlink" width="%(width)i" height="%(height)i" viewbox="0 0 %(width)i %(height)i"><image xlink:href="%(name)s" width="100%%" height="100%%" x="0" y="0"/></svg>' % {'name': name, 'width': width, 'height': height}
    return s


def get_dims(path):
    """Return ``(width, height)`` of an image via ImageMagick's ``identify``.

    Returns False if identify is unavailable, fails, or produces
    unparseable output.
    """
    try:
        out = subprocess.check_output(['identify', '-format', '%wx%h', path])
        w, h = out.decode('ascii').split('x')
        return (int(w), int(h))
    except (OSError, subprocess.CalledProcessError, ValueError):
        print("Failed to get dims")
        return False
def download_and_process(imgdict, base_dir, thumb_width):
    """Download every image in *imgdict* and resize it to its largest requested size.

    *imgdict* maps wiki image names to lists of ImageProps describing each
    place the image is used; the maximum requested width/height wins.
    Thumbnails with no explicit width count as *thumb_width*.
    """
    for wikiname in imgdict:
        filename = canonicalize_filename(wikiname)
        local = download_image(filename, base_dir)
        if not local:
            continue
        width = None
        height = None
        for props in imgdict[wikiname]:
            # max(None, x) raises TypeError on Python 3; seed explicitly.
            if props.width is not None:
                width = props.width if width is None else max(width, props.width)
            elif props.thumbnail:
                width = thumb_width if width is None else max(width, thumb_width)
            if props.height is not None:
                height = props.height if height is None else max(height, props.height)
        # Bug fix: the original passed `filename` (the bare wiki name), but
        # process_image expects the downloaded path -- compare the call in
        # process_imagelist, which correctly passes the path.
        process_image(local, width, height)


MAXWIDTH = 800
MAXHEIGHT = 800


def process_image(d, width=None, height=None):
    """Resize the image at path *d* in place (JPEG quality 20); return its size in bytes.

    width/height default to MAXWIDTH/MAXHEIGHT.  SVG images are rasterized
    to ``d + '.jpg'``; if the SVG (weighted by svg_factor) is bigger than
    the JPEG, the SVG file is replaced by a small wrapper referencing the
    JPEG and the combined size is returned.  Returns 0 on failure, removing
    any broken output files.
    """
    if width is None:
        width = MAXWIDTH
    if height is None:
        height = MAXHEIGHT
    newsize = "%ix%i>" % (width, height)  # '>' = only shrink, never enlarge
    if d[-3:].upper() == 'SVG':
        jpg_name = d + '.jpg'
        try:
            subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                                   "-quality", "20", "JPEG:%s" % jpg_name])
            # get_dims may return False; unpacking then raises TypeError,
            # which the except below treats as a conversion failure.
            (width, height) = get_dims(jpg_name)

            svg_factor = 0.3  # favorability of SVG
            print("Processing vector image " + d)
            jpg_size = os.stat(jpg_name).st_size
            svg_size = svg_factor * os.stat(d).st_size
            if svg_size > jpg_size:
                print("Replacing svg by a raster wrapper")
                endname = jpg_name.split('/')[-1]
                with open(d, 'w') as f:  # was open/close without `with`
                    f.write(make_svg_wrapper(endname, width, height))
                    f.truncate()
                return jpg_size + os.stat(d).st_size
            else:
                print("Preserving svg as vector")
                os.remove(jpg_name)
                return os.stat(d).st_size
        except (OSError, subprocess.CalledProcessError, TypeError):  # was bare except
            print("Error: convert failed on " + d)
            try:
                os.remove(d)
                os.remove(jpg_name)
            except OSError:
                print("Error: failed to remove " + d)
            return 0
    else:
        print("Processing raster image " + d)
        try:
            subprocess.check_call(['convert', d, "-flatten", "-resize", newsize,
                                   "-quality", "20", "JPEG:%s" % d])
            print("Succesfully resized " + d)
            return os.stat(d).st_size
        except (OSError, subprocess.CalledProcessError):  # was bare except
            print("Error: convert failed on " + d)
            try:
                os.remove(d)
            except OSError:
                print("Error: failed to remove " + d)
            return 0
def process_imagelist(list_filename, base_dir, imgword, maxsize=float('inf')):
    """Download and resize every image named in a pre-built image list file.

    Each line of *list_filename* must match
    ``[[Image:<name>]] <hits> <width> <height>`` (widths/heights may be the
    literal string ``None``).  Stops once *maxsize* bytes have been written.
    Raises AssertionError on a malformed line (original strict behavior kept).
    """
    with open(list_filename) as f:
        print("opened " + list_filename)
        totalsize = 0  # bytes written so far
        searcher = r"\[\[(?:%s|%s):(.*?)\]\]\s+(\d+)\s+(.*?)\s+(.*?)$" % (BASEWORD, imgword)
        print(searcher)
        for line in f:  # was f.readlines(); iterate lazily
            m = re.search(searcher, line)
            if m is None:
                raise AssertionError("Match didn't work")
            wikiname, hits, width, height = m.groups()
            print(wikiname, hits, width, height)

            width = None if width == 'None' else int(width)
            height = None if height == 'None' else int(height)

            filename = canonicalize_filename(wikiname)
            d = download_image(filename, base_dir)
            if d:
                s = process_image(d, width, height)
                totalsize += s
                print(d + " occupies " + str(s) + " bytes; running total is " + str(totalsize))
                if totalsize > maxsize:
                    break


class ImageProps:
    """Display properties for one use of an image in an article."""
    thumbnail = False  # rendered as a thumbnail
    width = None       # requested width in px, if any
    height = None      # requested height in px, if any
    upright = False    # 'upright' option present

    def __repr__(self):
        return "%s (%s, %s) %s" % (self.thumbnail, self.width, self.height, self.upright)


class ImageFinder:
    """Finds ``[[Image:...]]`` references in wikitext and collects their options."""

    def __init__(self, image_word):
        self.word = image_word
        # NOTE(review): `server` is never imported (the import at the top of
        # the file is commented out); constructing an ImageFinder raises
        # NameError until it is restored.
        self.db = server.WPWikiDB()

    def find_images(self, text):
        """Return ``[(filename, ImageProps), ...]`` for every image link in *text*."""
        found = []

        pattern = (r"\[\[(?:%s|%s):\s*(?P<filename>[^\|\]]+?)\s*"
                   r"(?:\|(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]"
                   % (BASEWORD, self.word))
        for match in re.finditer(pattern, text):
            fname = match.group('filename')
            props = ImageProps()
            # Bug fix: the options group is optional and may be None;
            # the original called .split('|') on it unconditionally.
            options = match.group('options') or ''
            for opt in options.split('|'):
                if opt in ('thumb', 'thumbnail'):
                    props.thumbnail = True
                elif opt == 'upright':
                    props.upright = True  # bug fix: was assigned False (a no-op)
                elif opt[-2:] == 'px':
                    dims = opt[:-2].split('x')
                    if len(dims) > 0:
                        props.width = int(dims[0])
                    if len(dims) > 1:
                        props.height = int(dims[1])
            print((fname, props))
            found.append((fname, props))
        return found

    def get_images_info(self, title):
        """Return image (filename, props) pairs for the expanded article *title*."""
        text = self.db.getExpandedArticle(title)
        return self.find_images(text)

    def list_images(self, title):
        """Return just the filenames of every image referenced by *title*."""
        return [fname for (fname, _props) in self.get_images_info(title)]

    def get_metadata_all(self, titles):
        """Aggregate image properties across many articles.

        Returns a dict mapping each image filename to the list of ImageProps
        for every place it is used.
        """
        merged = collections.defaultdict(list)
        for title in titles:
            for (fname, props) in self.get_images_info(title):
                merged[fname].append(props)
        return merged
def read_links(index):
    """Extract wiki article titles from ``href="/wiki/Title"`` links in an HTML file."""
    with open(index) as f:  # was open/read/close without `with`
        text = f.read()
    titles = []
    # raw string fix: the original non-raw pattern contains invalid escapes
    # on Python 3; the always-true `if match:` inside finditer is dropped.
    for match in re.finditer(r'href\s*=\s*[\'\"]/wiki/([^\'\"]+)[\'\"]', text):
        titles.append(match.group(1))
    return titles


def main_task(db_path, indexfile, image_word, base_dir, thumb_width):
    """Top-level driver: find every image used by the linked articles and fetch them.

    Requires the `server` module (its import at the top of this file is
    commented out) -- TODO confirm before running.
    """
    titles = read_links(indexfile)
    print(titles)
    server.load_db(db_path)
    finder = ImageFinder(image_word)
    metadata = finder.get_metadata_all(titles)
    print(metadata)
    download_and_process(metadata, base_dir, thumb_width)


if __name__ == "__main__":
    # These ran unconditionally at import time in the original; guarded so
    # the module can be imported without downloading ~15 MB of images.
    #main_task("/home/olpc/40ormore.xml.bz2", "../static/index.html", "Imagen", "/home/olpc/images", 180)
    process_imagelist("/home/olpc/top70k_images2", "/home/olpc/images", "Imagen", 15000000)


# ---------------------------------------------------------------------------
# NOTE(review): the same commit also added woip/py/wp/setup.py, a separate
# distutils build script for the SWIG `wp` extension.  It belongs in its own
# file, not here; preserved verbatim for reference:
#
#   from distutils.core import setup, Extension
#
#   wp_sources = [
#       'wp.i',
#       '../../c/bzipreader.c',
#       '../../c/wp.c',
#       '../../c/lsearcher.c',
#       '../../c/safe.c',
#       '../../c/blocks.c'
#   ]
#
#   wp_module = Extension(
#       '_wp',
#       sources=wp_sources,
#       include_dirs=['../../c'],
#       define_macros=[('DEBUG', 1)],
#       libraries=['bz2'])
#
#   setup(name='wp', version='0.1',
#         author='Wade Brainerd',
#         description="""Offline Wikipedia Interface.""",
#         ext_modules=[wp_module], py_modules=['wp'])
# ---------------------------------------------------------------------------
/dev/null +++ b/woip/py/wp/wp.i @@ -0,0 +1,106 @@ +// +// Offline Wikipedia database interface for Python +// (or any other SWIG supported language) +// +// Module functions are prefixed with 'wp_' to avoid clashes with functions +// defined in wp.h. + +// Note that the Ruby Inline implementation used a '__' prefix, but that won't +// work in Python as the underscores cause the methods to be treated as private. +// +%module wp + +%{ +#include "../../c/wp.h" + +#define MAXRES 40 +#define MAXSTR 1024 + +wp_dump d = {0}; +wp_article a = {0}; + +char results[MAXRES][MAXSTR]; +int nresults; + +char *__exact_match; +int __got_match; + +bool __handle_result(char *s) { + strncpy(results[nresults], s, MAXSTR); + results[nresults][MAXSTR - 1] = '\0'; + char *end = strrchr(results[nresults], ' '); + + if(end) { + *(end - 1) = '\0'; + nresults++; + } + + return nresults < MAXRES; +} + +bool __handle_exact_match(char *s) { + char buf[MAXSTR], *end; + strncpy(buf, s, MAXSTR); + + debug("handle_exact_match(%s)", s); + + end = strrchr(buf, ' ') - 1; + *end = '\0'; + + if(strcasecmp(buf, __exact_match)) return true; + else { + __got_match = 1; + return false; + } +} + +void wp_load_dump(char *dump, char *loc, char *ploc, char *blocks) { + load_dump(&d, dump, loc, ploc, blocks); + init_article(&a); +} + +char *wp_load_article(char *name) { + a.block = 0; + a.text[0] = '\0'; + load_article(&d, name, &a); + return a.text; +} + +int wp_article_block() { + return a.block; +} + +int wp_article_size() { + return strlen(a.text); +} + +int wp_search(char *needle) { + nresults = 0; + search(&d.index, needle, __handle_result, NULL, true, true); + return nresults; +} + +char *wp_result(int n) { + return results[n]; +} + +int wp_article_exists(char *name) { + __exact_match = name; + __got_match = 0; + debug("wp_article_exists(%s)", name); + search(&d.index, name, __handle_exact_match, NULL, true, true); + return __got_match; +} + +%} + +void wp_load_dump(char *dump, char *loc, char *ploc, char 
*blocks); + +char *wp_load_article(char *name); +int wp_article_block(); +int wp_article_size(); + +int wp_search(char *needle); +char *wp_result(int n); + +int wp_article_exists(char *name); |