diff options
author | Walter Bender <walter@walter-laptop.(none)> | 2009-10-29 17:04:45 (GMT) |
---|---|---|
committer | Walter Bender <walter@walter-laptop.(none)> | 2009-10-29 17:04:45 (GMT) |
commit | d66b8e69388aef06f155c14c9518e7f69b8a9afd (patch) | |
tree | 6f83b9af7003f80cf59104c0dde1e55dc3b26cd6 | |
parent | 750d861c9d6560643f95e1cb57c37f0139cc3568 (diff) |
jpichon patch to fix parsing error
-rw-r--r-- | infoslicer/processing/HTML_Parser.py | 4 | ||||
-rw-r--r-- | infoslicer/processing/MediaWiki_Parser.py | 2 |
2 files changed, 3 insertions, 3 deletions
diff --git a/infoslicer/processing/HTML_Parser.py b/infoslicer/processing/HTML_Parser.py index b99e754..adb6eb0 100644 --- a/infoslicer/processing/HTML_Parser.py +++ b/infoslicer/processing/HTML_Parser.py @@ -28,8 +28,8 @@ class HTML_Parser: #=======================================================================
# These lists are used at the parsing stage
root_node = "body"
- section_separators = ["h3", "h4", "h5"]
- reference_separators = ["h1", "h2"]
+ section_separators = ["h2", "h3", "h4", "h5"]
+ reference_separators = ["h1"]
block_elements = ["img", "table", "ol", "ul"]
#=======================================================================
diff --git a/infoslicer/processing/MediaWiki_Parser.py b/infoslicer/processing/MediaWiki_Parser.py index 913f03e..dcf559a 100644 --- a/infoslicer/processing/MediaWiki_Parser.py +++ b/infoslicer/processing/MediaWiki_Parser.py @@ -17,7 +17,7 @@ class MediaWiki_Parser(HTML_Parser): logger.debug('MediaWiki_Parser: %s' % source_url)
- header, input_content = document_to_parse.split("<text>")
+ header, input_content = document_to_parse.split("<text xml:space=\"preserve\">")
#find the revision id in the xml the wiki API returns
revid = re.findall(re.compile('\<parse.*revid\=\"(?P<rid>[0-9]*)\"'),
|