Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
diff options
context:
space:
mode:
author Aleksey Lim <alsroot@member.fsf.org> 2009-02-24 03:14:13 (GMT)
committer Aleksey Lim <alsroot@member.fsf.org> 2009-02-24 03:14:13 (GMT)
commit af41294bbcc29aae0f73101f9a34671c5b75f24f (patch)
tree 0028ff1d3a703f1fa3770b6887428103a52d9bc6
parent 354d415e2f835104964c8a5c3f923d55df3987dd (diff)
Safe parsing of HTML header
-rw-r--r-- Processing/MediaWiki_Parser.py 16
1 files changed, 12 insertions, 4 deletions
diff --git a/Processing/MediaWiki_Parser.py b/Processing/MediaWiki_Parser.py
index cf57b40..913f03e 100644
--- a/Processing/MediaWiki_Parser.py
+++ b/Processing/MediaWiki_Parser.py
@@ -2,6 +2,9 @@
from HTML_Parser import HTML_Parser
import re
+import logging
+
+logger = logging.getLogger('infoslicer')
class MediaWiki_Parser(HTML_Parser):
@@ -11,10 +14,15 @@ class MediaWiki_Parser(HTML_Parser):
def __init__(self, document_to_parse, title, source_url):
if input == None:
raise NoDocException("No content to parse - supply document to __init__")
+
+ logger.debug('MediaWiki_Parser: %s' % source_url)
+
+ header, input_content = document_to_parse.split("<text>")
+
#find the revision id in the xml the wiki API returns
- revid = re.findall(re.compile('\<parse revid\=\"(?P<rid>[0-9]*)\">'), document_to_parse)
- #remove the xml padding to parse html inside
- input_content = document_to_parse.split("<text>")[1]
+ revid = re.findall(re.compile('\<parse.*revid\=\"(?P<rid>[0-9]*)\"'),
+ header)
+
input_content = input_content.split("</text>")[0]
#call the normal constructor
HTML_Parser.__init__(self, "<body>" + input_content + "</body>", title, source_url)
@@ -76,4 +84,4 @@ class MediaWiki_Parser(HTML_Parser):
#add the infobox to the output
self.output_soup.refbody.append(infobox_tag)
#remove the first table to avoid parsing twice
- first_table.extract()
\ No newline at end of file
+ first_table.extract()