Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools2/download_images.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/download_images.py')
-rwxr-xr-xtools2/download_images.py140
1 files changed, 140 insertions, 0 deletions
diff --git a/tools2/download_images.py b/tools2/download_images.py
new file mode 100755
index 0000000..532f6e5
--- /dev/null
+++ b/tools2/download_images.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Create a list of pages with a nuber of how many links are directed to them.
+
+import codecs
+import re
+import md5
+from urllib import FancyURLopener
+import os
+import sys
+import shutil
+import magic
+
+import config
+
+
+class FileListReader():
+
+ def __init__(self, file_name):
+ _file = open(file_name, mode='r')
+ self.list = []
+ line = _file.readline()
+ while line:
+ self.list.append(line.strip())
+ line = _file.readline()
+
+
+class CustomUrlOpener(FancyURLopener):
+
+ version = 'Mozilla/5.0 (X11; Linux x86_64; rv:9.0) Gecko/20100101 ' + \
+ 'Firefox/9.0'
+
+
+class ImagesDownloader:
+
+ def __init__(self, file_name, pages_selected, base_dir, cache_dir, lang):
+ self.base_dir = base_dir
+ self.cache_dir = cache_dir
+ self.mime_checker = magic.open(magic.MAGIC_MIME)
+ self.mime_checker.load()
+ input_links = open('%s.page_images' % file_name, mode='r')
+ line = input_links.readline()
+ while line:
+ words = line.split()
+ page = words[0]
+ if pages_selected is None or (page in pages_selected):
+ print "Processing page %s \r" % page,
+ for n in range(1, len(words)):
+ image_url = words[n]
+ self.download_image(image_url, lang)
+
+ line = input_links.readline()
+ input_links.close()
+
+ def download_image(self, url, lang, dest=None):
+ # avoid downloading .ogg files
+ if url.lower().endswith('.ogg'):
+ return
+ overwrite = True
+ if dest is None:
+ overwrite = False
+ sliced_url = url.split('thumb/')
+ image_part = sliced_url[1]
+ dirs = image_part.split('/')
+ destdir = "%s/%s/%s" % (self.base_dir, dirs[0], dirs[1])
+ image_name = dirs[len(dirs) - 1]
+ try:
+ os.makedirs(destdir)
+ except:
+ pass # This just means that destdir already exists
+ dest = "%s/%s" % (destdir, image_name)
+ if not os.path.exists(dest) or overwrite:
+ if self.cache_dir is not None and not overwrite:
+ # Verify if the file is in the cahce_dir
+ cache_file = "%s/%s/%s/%s" % (self.cache_dir, dirs[0], dirs[1],
+ image_name)
+ if os.path.exists(cache_file):
+ shutil.copyfile(cache_file, dest)
+ return
+ print "Downloading %s" % url
+ opener = CustomUrlOpener()
+ opener.retrieve(url, dest)
+ # Verify the mime type
+ # wikipedia return a html file with a error, if the size requested
+ # is small than the real image
+ # then if the file is a html we need request the unescaled image
+ if url.find('/thumb/')> -1:
+ mime_type = str(self.mime_checker.file(dest))
+ if mime_type.find('text') > -1:
+ url_ori = url
+ url = url[0:url.rfind('/')]
+ url = url.replace('thumb/', '')
+ print 'Wrong mime type, redownloading %s to %s' % (url, dest)
+ self.download_image(url, lang, dest)
+ mime_type = str(self.mime_checker.file(dest))
+ if mime_type.find('text') > -1:
+ # try downloading from the lang instead of commons
+ if url_ori.find('commons') > -1:
+ url_lang = url_ori.replace('commons', lang)
+ self.download_image(url_lang, lang, dest)
+
+ mime_type = str(self.mime_checker.file(dest))
+ if mime_type.find('text') > -1:
+ # if the file downloaded is html/text remove it
+ os.remove(dest)
+
+
+downlad_all = False
+cache_dir = None
+if len(sys.argv) > 1:
+ for argn in range(1, len(sys.argv)):
+ arg = sys.argv[argn]
+ if arg == '--all':
+ downlad_all = True
+ print "Downloading all images"
+ if arg.startswith('--cache_dir='):
+ cache_dir = arg[arg.find('=') + 1:]
+ print "Using cache directory", cache_dir
+
+input_xml_file_name = config.input_xml_file_name
+
+# TODO: take the lang from the first two letters
+# in the xml file, but this is not the best, because does not works
+# ever (example simplewiki)
+lang = input_xml_file_name
+if lang.find('/'):
+ lang = lang[lang.find('/') + 1:]
+lang = lang[:2]
+
+print 'Lang: %s' % lang
+
+selected_pages = None
+if not downlad_all:
+ print "Loading selected pages"
+ favorites_reader = FileListReader(config.favorites_file_name)
+ selected_pages = favorites_reader.list
+
+print "Downloading images"
+templates_counter = ImagesDownloader(input_xml_file_name,
+ selected_pages, "./images", cache_dir, lang)