# py/get_images.py

import collections
import hashlib
import md5
import os
import re
import subprocess
import urllib

import server

# Namespace word that is always accepted in front of an image link, next to
# the localized word handed to ImageFinder.
BASEWORD = "Image"

# Root URL below which image files are fetched.
BASE_URL = "http://upload.wikimedia.org/wikipedia/commons"

def get_source_url(filename):
    """Return the download URL of *filename* below BASE_URL."""
    endpath = get_endpath(filename)
    return "%s/%s" % (BASE_URL, endpath)

def get_dirs(filename):
    """Return the two hash-derived directory names for *filename*.

    The names are the first one and the first two hex digits of the MD5
    digest of the file name, e.g. ("a", "a9") for "Example.jpg".

    Text (unicode) names are hashed as UTF-8, so names with non-ASCII
    characters no longer fail on an implicit ASCII encode; byte strings
    are hashed unchanged.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(filename, type(u"")):
        filename = filename.encode("utf-8")
    # hashlib replaces the deprecated ``md5`` module and gives the same digest.
    digest = hashlib.md5(filename).hexdigest()
    return (digest[0], digest[:2])

def get_endpath(filename):
    """Return the relative path ``<h>/<hh>/<filename>`` for *filename*.

    The two leading directory names are the ones computed by get_dirs().
    """
    first, second = get_dirs(filename)
    return "%s/%s/%s" % (first, second, filename)

def canonicalize_filename(wikiname):
    """Return *wikiname* with spaces as underscores and its first letter uppercased.

    An empty name is returned unchanged instead of raising IndexError.
    """
    wikiname = wikiname.replace(' ', '_')
    # Slicing (rather than indexing) keeps the empty string safe.
    return wikiname[:1].upper() + wikiname[1:]

class WorkaroundURLopener(urllib.FancyURLopener):
    """URL opener that identifies itself with a custom version string.

    NOTE(review): presumably the "workaround" is for servers that reject
    urllib's default user agent -- confirm.
    """
    # urllib's opener classes send this class attribute as the User-Agent.
    version = "OLPC_wikislicer/0.1"

# Install the opener as urllib's module-wide default, so that
# urllib.urlretrieve() (used by download_image) goes through it.
urllib._urlopener = WorkaroundURLopener()

def download_image(filename, base_dir):
    """Fetch *filename* into a hash-named directory below *base_dir*.

    The file is stored as ``<base_dir>/<h>/<hh>/<filename>``, where the two
    directory names come from get_dirs(), and is downloaded from the URL
    built by get_source_url().

    Returns the destination path on success, or False when the destination
    directory cannot be created or the download fails.
    """
    source = get_source_url(filename)
    dirs = get_dirs(filename)
    destdir = "%s/%s/%s" % (base_dir, dirs[0], dirs[1])
    try:
        os.makedirs(destdir)
    except OSError:
        # Usually this just means destdir already exists. Anything else
        # (permissions, a plain file in the way) makes the download pointless.
        if not os.path.isdir(destdir):
            print("Failed to create directory " + destdir)
            return False
    dest = "%s/%s" % (destdir, filename)
    try:
        urllib.urlretrieve(source, dest)
    except Exception:
        # Best effort: report and let the caller continue with the next image.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        print("Failed to download " + source)
        return False
    return dest

def _larger_size(current, candidate):
    """Return the larger of two pixel sizes; None means "not specified"."""
    if candidate is None:
        return current
    if current is None:
        return candidate
    return max(current, candidate)


def _requested_size(props_list, thumb_width):
    """Return the largest (width, height) asked for by any use of an image.

    A use without an explicit width that is marked as a thumbnail counts as
    asking for *thumb_width*. An element is None when no use specified it.
    """
    width = None
    height = None
    for props in props_list:
        if props.width is not None:
            width = _larger_size(width, props.width)
        elif props.thumbnail:
            width = _larger_size(width, thumb_width)
        if props.height is not None:
            height = _larger_size(height, props.height)
    return width, height


def download_and_process(imgdict, base_dir, thumb_width):
    """Download every image in *imgdict* and shrink the raster ones.

    imgdict     -- maps a wiki image name to the list of ImageProps-like
                   objects (width, height, thumbnail) describing its uses.
    base_dir    -- root directory handed to download_image().
    thumb_width -- width assumed for thumbnail uses without explicit width.

    Files whose name ends in "svg" (any case) are only downloaded. Other
    files are re-encoded in place as low-quality JPEG by ImageMagick's
    ``convert``, limited to the largest requested size; a file is deleted
    when the conversion fails. Images for which no width was requested are
    left as downloaded. Returns None.
    """
    for wikiname in imgdict:
        filename = canonicalize_filename(wikiname)
        path = download_image(filename, base_dir)
        if not path:
            continue
        if filename[-3:].upper() == 'SVG':
            print("Downloaded vector image " + path)
            continue
        print("Downloaded raster image " + path)

        width, height = _requested_size(imgdict[wikiname], thumb_width)
        if width is None:
            continue
        # The trailing ">" is passed to convert's -resize as a geometry flag.
        if height is None:
            newsize = "%i>" % width
        else:
            newsize = "%ix%i>" % (width, height)
        try:
            subprocess.check_call(['convert', path, "-flatten", "-resize",
                                   newsize, "-quality", "20",
                                   "JPEG:%s" % path])
        except (OSError, subprocess.CalledProcessError):
            # OSError: convert could not be started; CalledProcessError:
            # convert exited with a non-zero status.
            print("Error: convert failed on " + wikiname + " " + path)
            try:
                os.remove(path)
            except OSError:
                print("Error: failed to remove " + path)
        else:
            print("Succesfully resized " + path)
class ImageProps:
    """Display options of one image link.

    Every option is a class-level default that callers override on an
    instance by plain attribute assignment.
    """
    # Whether the link asked for a thumbnail.
    thumbnail = False
    # Requested pixel width and height; None when not given.
    width = height = None
    # Default for the "upright" link option.
    upright = False

    def __repr__(self):
        fields = (self.thumbnail, self.width, self.height, self.upright)
        return "%s (%s, %s) %s" % fields

class ImageFinder:
    """Find image links in wiki articles and collect their display options.

    Article text is obtained from a ``server.WPWikiDB`` instance; image
    links are located with a regular expression.
    """

    def __init__(self, image_word):
        """Create a finder.

        image_word -- additional (e.g. localized) word that introduces an
                      image link; the module-level BASEWORD is always
                      accepted as well. It is inserted into a regular
                      expression unescaped.
        """
        self.word = image_word
        self.db = server.WPWikiDB()

    def find_images(self, text):
        """Return ``[(filename, ImageProps), ...]`` for the image links in *text*.

        Links are reported in order of appearance and each pair is also
        printed. Recognized options are ``thumb``/``thumbnail``,
        ``upright`` and pixel sizes of the form ``200px``, ``200x100px``
        or ``x100px``; every other option (captions, alignment, ...) is
        ignored.
        """
        pattern = r"\[\[(?:%s|%s):\s*(?P<filename>[^\|\]]+?)\s*(?:\|(?P<options>(?:[^\[\]]|\[[^\[\]]*\]|\[\[[^\[\]]*\]\])*))?\]\]" % (BASEWORD, self.word)
        # Optional width, optional "x<height>", then the literal "px".
        size_re = re.compile(r'(\d*)(?:x(\d+))?px$')
        found = []
        for match in re.finditer(pattern, text):
            filename = match.group('filename')
            props = ImageProps()
            # The options group is absent (None) for a bare [[Image:Name]].
            options = match.group('options') or ''
            for option in options.split('|'):
                option = option.strip()
                if option in ('thumb', 'thumbnail'):
                    props.thumbnail = True
                elif option == 'upright':
                    props.upright = True
                else:
                    size = size_re.match(option)
                    if size is None:
                        continue
                    # Empty/None groups (e.g. a caption that is just "px")
                    # are skipped instead of being fed to int().
                    if size.group(1):
                        props.width = int(size.group(1))
                    if size.group(2):
                        props.height = int(size.group(2))
            print((filename, props))
            found.append((filename, props))
        return found

    def get_images_info(self, title):
        """Return find_images() applied to the expanded article *title*.

        The text comes from ``self.db.getExpandedArticle(title)``.
        """
        text = self.db.getExpandedArticle(title)
        return self.find_images(text)

    def list_images(self, title):
        """Return only the image file names used by article *title*."""
        return [filename for (filename, _props) in self.get_images_info(title)]

    def get_metadata_all(self, titles):
        """Return a defaultdict mapping image name -> list of ImageProps.

        Each use of an image in any of the articles in *titles* contributes
        one entry to that image's list.
        """
        by_name = collections.defaultdict(list)
        for title in titles:
            for (filename, props) in self.get_images_info(title):
                by_name[filename].append(props)
        return by_name

def read_links(index):
    """Return the article titles linked from the HTML file *index*.

    A title is whatever follows ``/wiki/`` inside a quoted ``href``
    attribute. Titles are returned in document order, duplicates included,
    exactly as written in the file.
    """
    f = open(index)
    try:
        text = f.read()
    finally:
        # Close the file even when reading fails.
        f.close()
    # Raw string: "\s" in a normal string literal is an invalid escape.
    return re.findall(r'href\s*=\s*[\'"]/wiki/([^\'"]+)[\'"]', text)

def main_task(db_path, indexfile, image_word, base_dir, thumb_width):
    """Download and resize the images used by the articles linked from an index.

    db_path     -- database file passed to server.load_db().
    indexfile   -- HTML file whose /wiki/ links name the articles to scan.
    image_word  -- extra image-link word passed to ImageFinder.
    base_dir    -- directory below which the images are stored.
    thumb_width -- width used for thumbnails without an explicit width.

    The list of titles and the collected image metadata are printed.
    """
    article_titles = read_links(indexfile)
    print(article_titles)
    server.load_db(db_path)
    finder = ImageFinder(image_word)
    images = finder.get_metadata_all(article_titles)
    print(images)
    download_and_process(images, base_dir, thumb_width)

# Script entry with hard-coded paths.
# NOTE(review): this call runs whenever the module is loaded, including on
# import; consider an ``if __name__ == "__main__":`` guard.
main_task(
    db_path="/home/olpc/40ormore.xml.bz2",
    indexfile="../static/index.html",
    image_word="Imagen",
    base_dir="/home/olpc/images",
    thumb_width=180
)