diff options
author | Gonzalo Odiard <godiard@sugarlabs.org> | 2011-05-26 14:11:01 (GMT) |
---|---|---|
committer | Gonzalo Odiard <godiard@sugarlabs.org> | 2011-05-26 14:11:01 (GMT) |
commit | 8c1c64409ee6997f3fba9c818a478c3773be0a79 (patch) | |
tree | 1f39017928e4dae5180624d97fa183ec57988396 | |
parent | 8eba08663961fef858b38aa83752d54acbe7a69e (diff) |
Encode the title of the page before downloading it - SL #2855
If the searched word has a non-ASCII character, such as an accented letter,
the page could not be downloaded
-rw-r--r-- | infoslicer/processing/MediaWiki_Helper.py | 9 |
1 files changed, 8 insertions, 1 deletions
diff --git a/infoslicer/processing/MediaWiki_Helper.py b/infoslicer/processing/MediaWiki_Helper.py index a20c838..2dcc1ed 100644 --- a/infoslicer/processing/MediaWiki_Helper.py +++ b/infoslicer/processing/MediaWiki_Helper.py @@ -6,6 +6,8 @@ import logging import net
+import re
+
logger = logging.getLogger('infoslicer')
"""
@@ -187,12 +189,17 @@ class MediaWiki_Helper: urllib._urlopener = NewURLopener()
logger.debug("opening " + path)
logger.debug("proxies: " + str(self.proxies))
- doc = urllib.urlopen(path, proxies=self.proxies)
+ pathencoded = self.urlEncodeNonAscii(path)
+ logger.debug("pathencoded " + pathencoded)
+ doc = urllib.urlopen(pathencoded, proxies=self.proxies)
output = doc.read()
doc.close()
logger.debug("url opened successfully")
return output
+ def urlEncodeNonAscii(self, b):
+ return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)
+
def stripTags(self, input, tag):
"""removes specified tag
|