Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
diff options
context:
space:
mode:
author Kalpa Welivitigoda <callkalpa@gmail.com> 2013-06-21 16:34:26 (GMT)
committer Kalpa Welivitigoda <callkalpa@gmail.com> 2013-06-21 16:34:26 (GMT)
commit 5567d5f69c5d0402bc4525f973a9bb77a1b02dc5 (patch)
tree 77c3f9d6eacea423a0d73bb22bfe942bcc777b2a
init commit
-rw-r--r-- html2mallard.py 119
1 files changed, 119 insertions, 0 deletions
diff --git a/html2mallard.py b/html2mallard.py
new file mode 100644
index 0000000..1c39987
--- /dev/null
+++ b/html2mallard.py
@@ -0,0 +1,119 @@
+from HTMLParser import HTMLParser
+import sys
+
# Converted fragments in document order; every entry is later written to the
# .page file on a line of its own.
output = list()

# Conversion rules, keyed by HTML tag name. Each rule may carry:
#   'name'       - Mallard element emitted in place of the HTML tag
#   'next_tag'   - element the text following the start tag is wrapped in
#   'ignore_end' - when True the HTML end tag produces no output; the Mallard
#                  element is closed later instead
tag_conversion = dict(
    h1=dict(name='section', next_tag='title', ignore_end=True),
)

# Element the next piece of text has to be wrapped in; for example, after a
# 'section' start tag the following text belongs within '<title>' tags.
# Empty while no wrapping is pending.
next_tag = str()

# Stack of HTML tags whose Mallard element (such as page, section) stays open
# until the respective block is over.
open_tags = list()
+
# HTMLParser subclass whose handlers feed the module-level converters
class MyHTMLParser(HTMLParser):
    """Record the converted form of each start tag, end tag and text run.

    Only these three handlers are overridden; every other parser event is
    left to the HTMLParser base-class implementation. Each handler appends
    exactly one string to the module-level ``output`` list.
    """

    def handle_starttag(self, tag, attrs):
        """Convert an opening tag and append the result to ``output``."""
        converted = interpret_starttag(tag, attrs)
        output.append(converted)

    def handle_endtag(self, tag):
        """Convert a closing tag and append the result to ``output``."""
        converted = interpret_endtag(tag)
        output.append(converted)

    def handle_data(self, data):
        """Convert a run of text and append the result to ``output``."""
        converted = parse_data(data)
        output.append(converted)
+
# converts html end tags to mallard tags
def interpret_endtag(tag):
    """Return the Mallard closing tag for the HTML end tag ``tag``.

    The tag name is matched case-insensitively against ``tag_conversion``.

    Returns:
        ''          when the matching rule has a truthy 'ignore_end'
                    (the element is closed elsewhere),
        '</name>'   using the rule's 'name' for any other mapped tag,
        '</tag>'    with ``tag`` unchanged when no rule exists.
    """
    # Normalise once so the membership test and the lookup use the same key;
    # testing tag.lower() but indexing with the raw tag raised KeyError for
    # input such as 'H1'.
    rule = tag_conversion.get(tag.lower())
    if rule is None:
        return '</' + tag + '>'

    # .get(): a rule without an 'ignore_end' entry simply closes normally
    # instead of raising KeyError.
    if rule.get('ignore_end'):
        return ''
    return '</' + rule['name'] + '>'
+
# converts html start tags to mallard tags
def interpret_starttag(tag, attrs):
    """Return the Mallard markup for the HTML start tag ``tag``.

    The tag name is matched case-insensitively against ``tag_conversion``.
    ``attrs`` is accepted for the parser callback signature but is not used,
    so attributes are not carried over to the output.

    Side effects:
        * sets the module-level ``next_tag`` when the rule has a 'next_tag';
        * pushes the tag onto ``open_tags`` when the rule's end tag is
          ignored, so the element can be closed later;
        * pops ``open_tags`` when the same tag is already on top of it.

    Returns:
        The opening tag, preceded by a closing tag for the previous element
        when the same tag was still open.
    """
    global next_tag

    # One normalised key for the stack comparison, the rule lookup and the
    # push; mixing tag.lower() with the raw tag raised KeyError for 'H1'.
    key = tag.lower()
    pieces = []

    # if the current tag is the same as the most recently opened one, close
    # the previous element first
    if open_tags and open_tags[-1] == key:
        pieces.append('</' + tag_conversion[key]['name'] + '>')
        open_tags.pop()

    rule = tag_conversion.get(key)
    if rule is None:
        # unmapped tags pass through unchanged
        pieces.append('<' + tag + '>')
    else:
        pieces.append('<' + rule['name'] + '>')
        if 'next_tag' in rule:
            next_tag = rule['next_tag']
        # Track the element only when its end tag really is suppressed; a
        # rule with 'ignore_end': False gets its closing tag from the end-tag
        # handler and would otherwise be closed a second time.
        if rule.get('ignore_end'):
            open_tags.append(key)
    return ''.join(pieces)
+
# parse data according to the previous tag
def parse_data(data):
    """Return ``data``, wrapped in the pending ``next_tag`` element if any.

    When the module-level ``next_tag`` is set and ``data`` contains more than
    whitespace, the text is returned as '<next_tag>data</next_tag>' and
    ``next_tag`` is reset to ''. Otherwise ``data`` is returned unchanged.

    NOTE(review): the text is emitted without XML escaping; confirm whether
    '&' and '<' in the source text need escaping for the output format.
    """
    global next_tag

    # Whitespace-only text (e.g. the line break between a heading's start tag
    # and a nested element) must not use up the pending wrapper, otherwise
    # the wrapper ends up empty and the real text is left unwrapped.
    if not next_tag or not data.strip():
        return data

    wrapped = '<' + next_tag + '>' + data + '</' + next_tag + '>'
    next_tag = ''
    return wrapped
+
# writes the header of the mallard file
def write_header(page_id='index'):
    """Return the opening ``<page>`` tag of a Mallard document.

    Args:
        page_id: value of the page's ``id`` attribute. Defaults to 'index',
            which reproduces the previously hard-coded header.
            NOTE(review): Mallard page ids are presumably expected to match
            the page's file name; confirm against the Mallard spec before
            relying on the default for files not named index.page.

    Returns:
        The '<page ...>' start tag as a string.
    """
    # Local import keeps this function self-contained; quoteattr adds the
    # surrounding quotes and escapes characters that are unsafe in an XML
    # attribute value.
    from xml.sax.saxutils import quoteattr

    return ('<page xmlns="http://projectmallard.org/1.0/" id='
            + quoteattr(page_id) + '>')
+
# writes the footer of the mallard file
def write_footer():
    """Return the closing markup of the Mallard document.

    Every tag still recorded in ``open_tags`` is popped (most recently opened
    first) and closed using its 'name' from ``tag_conversion``; the result
    always ends with '</page>'. ``open_tags`` is empty afterwards.
    """
    closers = []

    # close the currently open tags, innermost first
    while open_tags:
        html_tag = open_tags.pop()
        closers.append('</' + tag_conversion[html_tag]['name'] + '>')

    closers.append('</page>')
    return ''.join(closers)
+
# reads the html source file
def read_file():
    """Return the full contents of the file named by ``sys.argv[1]``.

    Raises:
        IndexError: if no file name was given on the command line.
        IOError/OSError: if the file cannot be opened or read.
    """
    # 'with' guarantees the handle is closed, even if reading fails; the
    # handle was previously left open.
    with open(sys.argv[1]) as source:
        return source.read()
+
# writes to page file
def write_file():
    """Write every entry of ``output`` to the .page file, one per line.

    The target path is ``sys.argv[1]`` with its final extension replaced by
    '.page' (or '.page' appended when there is no extension).

    NOTE(review): an input that already ends in '.page' is still overwritten
    in place.

    Raises:
        IndexError: if no file name was given on the command line.
        IOError/OSError: if the target cannot be opened or written.
    """
    # Local import: only this function needs os.path.
    import os.path

    # Swap only the trailing extension. str.replace('.html', '.page') also
    # rewrote '.html' inside directory names, and for names without '.html'
    # it returned the input path unchanged, overwriting the source file.
    stem, _extension = os.path.splitext(sys.argv[1])

    # 'with' flushes and closes the file even if a write fails
    with open(stem + '.page', 'w') as target:
        target.writelines(line + '\n' for line in output)
+
def main():
    """Convert the HTML file named on the command line to a Mallard page.

    Reads the source, appends header, converted body and footer to the
    module-level ``output`` list, then writes the list to the .page file.

    Raises:
        SystemExit: with a usage message when no input file is given.
    """
    # Fail with a readable message rather than a bare IndexError traceback.
    if len(sys.argv) < 2:
        raise SystemExit('usage: html2mallard.py FILE.html')

    html_source = read_file()
    output.append(write_header())

    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(html_source)
    # close() forces processing of anything feed() is still buffering (for
    # example an unterminated construct at the end of the input); without it
    # that trailing content never reaches the handlers.
    parser.close()

    output.append(write_footer())
    write_file()


if __name__ == '__main__':
    main()
+