Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary   ·   refs   ·   log   ·   tree   ·   commit   ·   diff   ·   stats
diff options
context:
space:
mode:
author Walter Bender <walter@walter-laptop.(none)> 2009-10-29 17:04:45 (GMT)
committer Walter Bender <walter@walter-laptop.(none)> 2009-10-29 17:04:45 (GMT)
commit d66b8e69388aef06f155c14c9518e7f69b8a9afd (patch)
tree 6f83b9af7003f80cf59104c0dde1e55dc3b26cd6
parent 750d861c9d6560643f95e1cb57c37f0139cc3568 (diff)
jpichon patch to fix parsing error
-rw-r--r-- infoslicer/processing/HTML_Parser.py 4
-rw-r--r-- infoslicer/processing/MediaWiki_Parser.py 2
2 files changed, 3 insertions, 3 deletions
diff --git a/infoslicer/processing/HTML_Parser.py b/infoslicer/processing/HTML_Parser.py
index b99e754..adb6eb0 100644
--- a/infoslicer/processing/HTML_Parser.py
+++ b/infoslicer/processing/HTML_Parser.py
@@ -28,8 +28,8 @@ class HTML_Parser:
#=======================================================================
# These lists are used at the parsing stage
root_node = "body"
- section_separators = ["h3", "h4", "h5"]
- reference_separators = ["h1", "h2"]
+ section_separators = ["h2", "h3", "h4", "h5"]
+ reference_separators = ["h1"]
block_elements = ["img", "table", "ol", "ul"]
#=======================================================================
diff --git a/infoslicer/processing/MediaWiki_Parser.py b/infoslicer/processing/MediaWiki_Parser.py
index 913f03e..dcf559a 100644
--- a/infoslicer/processing/MediaWiki_Parser.py
+++ b/infoslicer/processing/MediaWiki_Parser.py
@@ -17,7 +17,7 @@ class MediaWiki_Parser(HTML_Parser):
logger.debug('MediaWiki_Parser: %s' % source_url)
- header, input_content = document_to_parse.split("<text>")
+ header, input_content = document_to_parse.split("<text xml:space=\"preserve\">")
#find the revision id in the xml the wiki API returns
revid = re.findall(re.compile('\<parse.*revid\=\"(?P<rid>[0-9]*)\"'),