html2mallard.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

from HTMLParser import HTMLParser
import sys

# Converted fragments in document order; write_file() emits one per line.
output = []

# Maps an HTML tag name to the settings of the Mallard element replacing it:
#   'name'       - Mallard element written in place of the HTML tag
#   'next_tag'   - element the next chunk of text gets wrapped in
#   'ignore_end' - the HTML end tag emits nothing; the element is remembered
#                  in open_tags and closed later
tag_conversion = {
    'h1': {
        'name': 'section',
        'next_tag': 'title',
        'ignore_end': True,
    },
}

# Element the next text chunk must be wrapped in; for example, after a
# 'section' is opened the following text belongs inside '<title>' tags.
# Empty when no wrapping is pending.
next_tag = ''

# Stack of HTML tags whose Mallard element (such as section) is still open;
# entries stay here until the respective block is over.
open_tags = []

# HTMLParser subclass whose handlers forward each event to the converters
class MyHTMLParser(HTMLParser):
    """Collect the Mallard rendering of every parse event in ``output``.

    Each handler passes its event to the matching module-level converter
    and appends whatever string comes back to the ``output`` list.
    """

    def handle_starttag(self, tag, attrs):
        """Append the converted form of an opening tag."""
        fragment = interpret_starttag(tag, attrs)
        output.append(fragment)

    def handle_endtag(self, tag):
        """Append the converted form of a closing tag."""
        fragment = interpret_endtag(tag)
        output.append(fragment)

    def handle_data(self, data):
        """Append a chunk of text after parse_data() has processed it."""
        fragment = parse_data(data)
        output.append(fragment)

# converts html end tags to mallard tags
def interpret_endtag(tag):
    """Return the Mallard closing tag for the HTML end tag ``tag``.

    A tag listed in ``tag_conversion`` is closed under its Mallard element
    name; any other tag is closed under its own name.  An empty string is
    returned for a converted tag flagged ``ignore_end``, because its element
    is closed later by interpret_starttag() or write_footer().
    """
    # Index the table with the same lowercased key used to test membership,
    # so a mixed-case tag cannot pass the test and then raise KeyError.
    conversion = tag_conversion.get(tag.lower())
    if conversion is None:
        return '</' + tag + '>'

    # 'ignore_end' is optional in a conversion entry (interpret_starttag()
    # treats it that way); when absent the end tag is emitted normally.
    if conversion.get('ignore_end'):
        return ''
    return '</' + conversion['name'] + '>'

# converts html start tags to mallard tags
def interpret_starttag(tag, attrs):
    """Return the Mallard markup for the HTML start tag ``tag``.

    A tag listed in ``tag_conversion`` is opened under its Mallard element
    name; any other tag is opened under its own name.  When the same
    converted tag is already on top of ``open_tags``, the returned string
    first closes that previous element.

    ``attrs`` is accepted to match the parser callback but is not used, so
    attributes do not appear in the result.

    Side effects: may set the module-level ``next_tag`` and may pop from or
    push onto ``open_tags``.
    """
    global next_tag

    # One normalised key for the stack comparison, the membership test and
    # the table lookup, so a mixed-case tag cannot raise KeyError.
    key = tag.lower()
    pieces = []

    # if the current tag is similar to the currently open tag, first close
    # the previous tag
    if open_tags and open_tags[-1] == key:
        pieces.append('</' + tag_conversion[key]['name'] + '>')
        open_tags.pop()

    conversion = tag_conversion.get(key)
    if conversion is None:
        pieces.append('<' + tag + '>')
    else:
        pieces.append('<' + conversion['name'] + '>')
        if 'next_tag' in conversion:
            next_tag = conversion['next_tag']
        # Test the flag's value, as interpret_endtag() does: an element is
        # tracked as open exactly when its end tag is going to be dropped.
        if conversion.get('ignore_end'):
            open_tags.append(key)

    return ''.join(pieces)

# wraps text in the element requested by the previous start tag, if any
def parse_data(data):
    """Return ``data``, wrapped in the pending ``next_tag`` element if set.

    When the module-level ``next_tag`` is empty the text is returned
    unchanged.  Otherwise the text is enclosed in that element and
    ``next_tag`` is reset, so only one chunk of text gets wrapped.
    """
    global next_tag

    if not next_tag:
        return data

    wrapper = next_tag
    next_tag = ''
    return ''.join(['<', wrapper, '>', data, '</', wrapper, '>'])

# builds the header of the mallard file
def write_header():
    """Return the opening ``<page>`` tag of the Mallard document."""
    page_open = '<page xmlns="http://projectmallard.org/1.0/" id="index">'
    return page_open

# builds the footer of the mallard file
def write_footer():
    """Return the markup that ends the Mallard document.

    Every tag still on ``open_tags`` is popped, most recent first, and its
    Mallard element is closed; the closing ``</page>`` tag comes last.
    ``open_tags`` is empty afterwards.
    """
    closers = []

    # close the currently open tags
    while open_tags:
        html_tag = open_tags.pop()
        closers.append('</' + tag_conversion[html_tag]['name'] + '>')

    closers.append('</page>')
    return ''.join(closers)

# reads the html source file
def read_file():
    """Return the full text of the file named by ``sys.argv[1]``.

    Raises IndexError when no path was given on the command line, and
    whatever ``open`` raises when the file cannot be read.
    """
    # The with-block closes the handle even if reading fails; read() returns
    # the same text as joining every line, without repeated concatenation.
    with open(sys.argv[1]) as source:
        return source.read()

# writes to page file
def write_file():
    """Write every entry of ``output`` to the page file, one per line.

    The destination is the path in ``sys.argv[1]`` with a trailing
    ``.html`` swapped for ``.page``.  A path without that suffix gets
    ``.page`` appended instead.
    """
    source_path = sys.argv[1]
    html_suffix = '.html'

    # Swap only a trailing '.html'.  str.replace() would also rewrite '.html'
    # inside directory names and, for a path that does not contain '.html',
    # return the input path unchanged so the source file would be overwritten.
    if source_path.endswith(html_suffix):
        page_path = source_path[:-len(html_suffix)] + '.page'
    else:
        page_path = source_path + '.page'

    # The with-block flushes and closes the file even if a write fails.
    with open(page_path, 'w') as page_file:
        page_file.writelines(line + '\n' for line in output)

def main():
    """Convert the HTML file named on the command line into a Mallard page.

    Reads the input, appends the page header, the converted body and the
    footer to ``output``, then writes the result with write_file().
    Exits with a usage message when no input path was given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.html' % sys.argv[0])

    html_source = read_file()
    output.append(write_header())

    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(html_source)
    # close() makes the parser process any input it is still buffering, so
    # text at the very end of the file is not lost.
    parser.close()

    output.append(write_footer())
    write_file()


if __name__ == '__main__':
    main()