Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Gonzalo Odiard <godiard@sugarlabs.org> 2011-05-26 14:11:01 (GMT)
committer Gonzalo Odiard <godiard@sugarlabs.org> 2011-05-26 14:11:01 (GMT)
commit 8c1c64409ee6997f3fba9c818a478c3773be0a79 (patch)
tree 1f39017928e4dae5180624d97fa183ec57988396
parent 8eba08663961fef858b38aa83752d54acbe7a69e (diff)
Encode the title of the page before downloading it - SL #2855
If the searched word contained a non-ASCII character, such as an accented letter, the page could not be downloaded
-rw-r--r-- infoslicer/processing/MediaWiki_Helper.py 9
1 files changed, 8 insertions, 1 deletions
diff --git a/infoslicer/processing/MediaWiki_Helper.py b/infoslicer/processing/MediaWiki_Helper.py
index a20c838..2dcc1ed 100644
--- a/infoslicer/processing/MediaWiki_Helper.py
+++ b/infoslicer/processing/MediaWiki_Helper.py
@@ -6,6 +6,8 @@ import logging
import net
+import re
+
logger = logging.getLogger('infoslicer')
"""
@@ -187,12 +189,17 @@ class MediaWiki_Helper:
urllib._urlopener = NewURLopener()
logger.debug("opening " + path)
logger.debug("proxies: " + str(self.proxies))
- doc = urllib.urlopen(path, proxies=self.proxies)
+ pathencoded = self.urlEncodeNonAscii(path)
+ logger.debug("pathencoded " + pathencoded)
+ doc = urllib.urlopen(pathencoded, proxies=self.proxies)
output = doc.read()
doc.close()
logger.debug("url opened successfully")
return output
+ def urlEncodeNonAscii(self, b):
+ return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)
+
def stripTags(self, input, tag):
"""removes specified tag