diff options
author | Aleksey Lim <alsroot@member.fsf.org> | 2009-02-24 03:14:13 (GMT) |
---|---|---|
committer | Aleksey Lim <alsroot@member.fsf.org> | 2009-02-24 03:14:13 (GMT) |
commit | af41294bbcc29aae0f73101f9a34671c5b75f24f (patch) | |
tree | 0028ff1d3a703f1fa3770b6887428103a52d9bc6 | |
parent | 354d415e2f835104964c8a5c3f923d55df3987dd (diff) |
Safe parsing of HTML header
-rw-r--r-- | Processing/MediaWiki_Parser.py | 16 |
1 file changed, 12 insertions, 4 deletions
diff --git a/Processing/MediaWiki_Parser.py b/Processing/MediaWiki_Parser.py
index cf57b40..913f03e 100644
--- a/Processing/MediaWiki_Parser.py
+++ b/Processing/MediaWiki_Parser.py
@@ -2,6 +2,9 @@ from HTML_Parser import HTML_Parser
import re
+import logging
+
+logger = logging.getLogger('infoslicer')
class MediaWiki_Parser(HTML_Parser):
@@ -11,10 +14,15 @@ class MediaWiki_Parser(HTML_Parser):
def __init__(self, document_to_parse, title, source_url):
if input == None:
raise NoDocException("No content to parse - supply document to __init__")
+
+ logger.debug('MediaWiki_Parser: %s' % source_url)
+
+ header, input_content = document_to_parse.split("<text>")
+
#find the revision id in the xml the wiki API returns
- revid = re.findall(re.compile('\<parse revid\=\"(?P<rid>[0-9]*)\">'), document_to_parse)
- #remove the xml padding to parse html inside
- input_content = document_to_parse.split("<text>")[1]
+ revid = re.findall(re.compile('\<parse.*revid\=\"(?P<rid>[0-9]*)\"'),
+ header)
+
input_content = input_content.split("</text>")[0]
#call the normal constructor
HTML_Parser.__init__(self, "<body>" + input_content + "</body>", title, source_url)
@@ -76,4 +84,4 @@ class MediaWiki_Parser(HTML_Parser):
#add the infobox to the output
self.output_soup.refbody.append(infobox_tag)
#remove the first table to avoid parsing twice
- first_table.extract()
\ No newline at end of file
+ first_table.extract()
|