Safe parsing of the XML header of the MediaWiki API response

author: Aleksey Lim <alsroot@member.fsf.org> 2009-02-24 03:14:13 (GMT)
committer: Aleksey Lim <alsroot@member.fsf.org> 2009-02-24 03:14:13 (GMT)
commit: af41294bbcc29aae0f73101f9a34671c5b75f24f (patch)
tree: 0028ff1d3a703f1fa3770b6887428103a52d9bc6
parent: 354d415e2f835104964c8a5c3f923d55df3987dd (diff)
1 files changed, 12 insertions, 4 deletions
diff --git a/Processing/MediaWiki_Parser.py b/Processing/MediaWiki_Parser.py
index cf57b40..913f03e 100644
--- a/Processing/MediaWiki_Parser.py
+++ b/Processing/MediaWiki_Parser.py
@@ -2,6 +2,9 @@
 
 from HTML_Parser import HTML_Parser
 import re
+import logging
+
+logger = logging.getLogger('infoslicer')
 
 class MediaWiki_Parser(HTML_Parser):
     
@@ -11,10 +14,15 @@ class MediaWiki_Parser(HTML_Parser):
     def __init__(self, document_to_parse, title, source_url):
         if input == None:
             raise NoDocException("No content to parse - supply document to __init__")
+
+        logger.debug('MediaWiki_Parser: %s' % source_url)
+
+        header, input_content = document_to_parse.split("<text>")
+
         #find the revision id in the xml the wiki API returns
-        revid = re.findall(re.compile('\<parse revid\=\"(?P<rid>[0-9]*)\">'), document_to_parse)
-        #remove the xml padding to parse html inside
-        input_content = document_to_parse.split("<text>")[1]
+        revid = re.findall(re.compile('\<parse.*revid\=\"(?P<rid>[0-9]*)\"'),
+                header)
+
         input_content = input_content.split("</text>")[0]
         #call the normal constructor
         HTML_Parser.__init__(self, "<body>" + input_content + "</body>", title, source_url)
@@ -76,4 +84,4 @@ class MediaWiki_Parser(HTML_Parser):
             #add the infobox to the output
             self.output_soup.refbody.append(infobox_tag)
             #remove the first table to avoid parsing twice
-            first_table.extract()
-\ No newline at end of file
+            first_table.extract()
author	Aleksey Lim <alsroot@member.fsf.org>	2009-02-24 03:14:13 (GMT)
committer	Aleksey Lim <alsroot@member.fsf.org>	2009-02-24 03:14:13 (GMT)
commit	af41294bbcc29aae0f73101f9a34671c5b75f24f (patch)
tree	0028ff1d3a703f1fa3770b6887428103a52d9bc6
parent	354d415e2f835104964c8a5c3f923d55df3987dd (diff)