init commit

author: Kalpa Welivitigoda <callkalpa@gmail.com> 2013-06-21 16:34:26 (GMT)
committer: Kalpa Welivitigoda <callkalpa@gmail.com> 2013-06-21 16:34:26 (GMT)
commit: 5567d5f69c5d0402bc4525f973a9bb77a1b02dc5 (patch)
tree: 77c3f9d6eacea423a0d73bb22bfe942bcc777b2a
1 files changed, 119 insertions, 0 deletions
diff --git a/html2mallard.py b/html2mallard.py
new file mode 100644
index 0000000..1c39987
--- /dev/null
+++ b/html2mallard.py
@@ -0,0 +1,119 @@
+from HTMLParser import HTMLParser
+import sys
+
+output = []
+
+tag_conversion = {	#'h1': {'name': 'page' },
+			'h1': {'name': 'section', 'next_tag': 'title', 'ignore_end': True}
+	}
+
+# tag that must wrap the next piece of data; for example, after a 'section' start tag the next data should be within '<title>' tags
+next_tag = ''
+
+# holds the html tags whose converted mallard tags (such as section) are still open, until the respective block is over
+open_tags = []
+
+# create a subclass and override the handler methods
+class MyHTMLParser(HTMLParser):
+    
+    def handle_starttag(self, tag, attrs):
+        output.append(interpret_starttag(tag, attrs))
+        
+    def handle_endtag(self, tag):
+        output.append(interpret_endtag(tag))
+        
+    def handle_data(self, data):
+    	output.append(parse_data(data))
+
+# converts html end tags to mallard tags (returns '' for tags marked 'ignore_end')
+def interpret_endtag(tag):
+	temp = '</'
+	if tag.lower() in tag_conversion.keys():
+		if tag_conversion[tag]['ignore_end']:
+			return ''
+
+		temp += tag_conversion[tag]['name']
+	else:
+		temp += tag
+	temp += '>'
+	return temp
+
+# converts html start tags to mallard tags
+def interpret_starttag(tag, attrs):
+	global next_tag
+	global open_tags
+
+	temp = ''
+
+	# if the current tag is the same as the most recently opened tag, first close that previous tag
+	if open_tags and open_tags[len(open_tags)-1] == tag:
+		temp += '</' + tag_conversion[tag]['name'] + '>'
+		open_tags.pop()
+
+	temp += '<'
+	if tag.lower() in tag_conversion.keys():
+		temp += tag_conversion[tag]['name']
+
+		if 'next_tag' in tag_conversion[tag].keys():
+			next_tag = tag_conversion[tag]['next_tag']
+		if 'ignore_end' in tag_conversion[tag].keys():
+			open_tags.append(tag)
+	else:
+		temp += tag
+	temp += '>'
+	return temp
+
+# parse data according to the previous tag
+def parse_data(data):
+	global next_tag
+	if(next_tag):
+		data = '<' + next_tag + '>' + data + '</' + next_tag + '>'
+		next_tag = ''
+	return data
+
+# writes the header of the mallard file
+def write_header():
+	return '<page xmlns="http://projectmallard.org/1.0/" id="index">'
+
+# writes the footer of the mallard file
+def write_footer():
+	global open_tags
+
+	temp = ''
+
+	# close the currently open tags
+	while len(open_tags):
+		temp += '</' + tag_conversion[open_tags.pop()]['name'] + '>'
+
+	temp += '</page>'
+	return temp
+
+# reads the html source file named by the first command line argument
+def read_file():
+	fi = open(sys.argv[1])
+	temp = ''
+	for line in fi.readlines():
+		temp += line
+	return temp
+
+# writes the collected output lines to the page file (input path with '.html' replaced by '.page')
+def write_file():
+	fi = open(sys.argv[1].replace('.html','.page'), 'w')
+	for line in output:
+		fi.write(line + '\n')
+	fi.flush()
+	fi.close()
+
+def main():
+	input_md = read_file()
+	output.append(write_header())
+
+	# instantiate the parser and feed it the HTML read from the input file
+	parser = MyHTMLParser()
+	parser.feed(input_md)
+	output.append(write_footer())
+	write_file()
+
+if __name__ == '__main__':
+	main()
+
author	Kalpa Welivitigoda <callkalpa@gmail.com>	2013-06-21 16:34:26 (GMT)
committer	Kalpa Welivitigoda <callkalpa@gmail.com>	2013-06-21 16:34:26 (GMT)
commit	5567d5f69c5d0402bc4525f973a9bb77a1b02dc5 (patch)
tree	77c3f9d6eacea423a0d73bb22bfe942bcc777b2a