html2mallard.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

#!/bin/python

from HTMLParser import HTMLParser
import sys

# converted markup fragments in document order; write_file() writes one per line
output = []

# Conversion rules, keyed by html tag name. A rule may contain:
# name - name of the mallard tag emitted in place of the html tag
# next_tag - tag that wraps the next chunk of text data (ex: title)
# ignore_end - when true, nothing is emitted for the html closing tag (ex: section);
#              the mallard tag is closed later, either when the same html tag
#              is opened again or when the footer is written
# attrib - fixed attributes always written on the mallard tag
# get_attrib - names of the attributes to copy over from the html tag

tag_conversion = {	'h1': {'name': 'page', 'attrib': {'xmlns': 'http://projectmallard.org/1.0/', 'id': 'index'}, 'next_tag': 'title', 'ignore_end': True },
			'h2': {'name': 'section', 'next_tag': 'title', 'ignore_end': True},
			'strong': {'name': 'em'},
			'ul' : {'name': 'list'},
			'ol': {'name': 'list', 'attrib': {'type': 'numbered'}},
			'li' : {'name': 'item', 'next_tag': 'p'},
			'blockquote': {'name': 'quote'},
			'a': {'name': 'link', 'get_attrib': ['href']}
	}

# the next text data should be wrapped in this tag; for example, after a
# 'section' start tag the next data should be within '<title>' tags.
# Empty string means no wrapping is pending.
next_tag = ''

# html tags whose mallard tags (such as page, section) are still open; they
# stay here until the respective block is over
open_tags = []

# HTMLParser subclass that funnels every parse event into the output list
class MyHTMLParser(HTMLParser):
    """Convert each HTML parse event to Mallard text and record it in ``output``."""

    def handle_starttag(self, tag, attrs):
        """Record the converted form of an opening tag."""
        converted = interpret_starttag(tag, attrs)
        output.append(converted)

    def handle_endtag(self, tag):
        """Record the converted form of a closing tag."""
        converted = interpret_endtag(tag)
        output.append(converted)

    def handle_data(self, data):
        """Record a chunk of text after ``parse_data`` has processed it."""
        converted = parse_data(data)
        output.append(converted)

# converts html end tags to mallard tags
def interpret_endtag(tag):
    """Return the Mallard closing tag for the HTML end tag ``tag``.

    The tag name is matched against ``tag_conversion`` case-insensitively.

    * A tag with a rule whose ``ignore_end`` is true yields ``''``: its
      Mallard element is closed later (when the same tag is opened again,
      or when the footer is written).
    * Any other tag with a rule yields ``</name>`` using the rule's name.
    * A tag without a rule is passed through unchanged as ``</tag>``.
    """
    # Fetch the rule with the same lower-cased key that decides whether a
    # rule exists; indexing with the raw tag would raise KeyError for
    # upper- or mixed-case names such as 'H1'.
    rule = tag_conversion.get(tag.lower())
    if rule is None:
        return '</' + tag + '>'
    if rule.get('ignore_end'):
        return ''
    return '</' + rule['name'] + '>'

# returns attributes in a dictionary as a text
def attrib2text(attrib):
    """Serialize a dict of attributes as XML attribute text.

    Every pair becomes `` key="value"`` (with a leading space), so the
    result can be appended directly after an element name.  An empty dict
    gives ``''``.

    Values are XML-escaped (``&``, ``<``, ``>`` and ``"``) so that, for
    example, a URL containing ``&`` still produces well-formed markup.  A
    value of ``None`` (an attribute given without a value) is written as
    an empty string instead of raising TypeError.
    """
    from xml.sax.saxutils import escape

    parts = []
    for key in attrib:
        value = attrib[key]
        if value is None:
            value = ''
        # the value is always wrapped in double quotes, so those need
        # escaping in addition to the default &, < and >
        parts.append(' ' + key + '="' + escape(value, {'"': '&quot;'}) + '"')
    return ''.join(parts)

# converts html start tags to mallard tags
def interpret_starttag(tag, attrs):
    """Return the Mallard opening tag for the HTML start tag ``tag``.

    ``attrs`` is the list of ``(name, value)`` pairs of the HTML tag.  The
    tag name is matched against ``tag_conversion`` case-insensitively.

    For a tag with a conversion rule the result is ``<name ...>`` where the
    attributes are the rule's fixed ``attrib`` entries followed by the HTML
    attributes named in ``get_attrib`` (first occurrence of each).  If the
    most recently opened deferred-close tag is this same tag, its closing
    tag is emitted first, e.g. ``</section><section>``.  A tag without a
    rule is passed through as ``<tag>``; its attributes are not copied.

    Side effects: sets the global ``next_tag`` when the rule has a
    ``next_tag`` entry, and pushes the tag onto ``open_tags`` when the
    rule's ``ignore_end`` is true.
    """
    global next_tag

    # Use one lower-cased key for the membership test, the rule lookup and
    # the open_tags bookkeeping; mixing it with the raw tag would raise
    # KeyError for upper- or mixed-case names such as 'H2'.
    key = tag.lower()
    pieces = []

    # if the current tag is the same as the currently open tag, first close
    # the previous one (its own end tag was suppressed by 'ignore_end')
    if open_tags and open_tags[-1] == key:
        pieces.append('</' + tag_conversion[key]['name'] + '>')
        open_tags.pop()

    rule = tag_conversion.get(key)
    if rule is None:
        pieces.append('<' + tag + '>')
        return ''.join(pieces)

    pieces.append('<' + rule['name'])
    if 'attrib' in rule:
        pieces.append(attrib2text(rule['attrib']))
    for wanted in rule.get('get_attrib', ()):
        for name, value in attrs:
            if name == wanted:
                pieces.append(attrib2text({wanted: value}))
                break  # only the first occurrence is retained
    if 'next_tag' in rule:
        next_tag = rule['next_tag']
    # Test the flag's value, not just its presence, so this agrees with
    # interpret_endtag: a tag is tracked as open only when its end tag is
    # really suppressed.
    if rule.get('ignore_end'):
        open_tags.append(key)
    pieces.append('>')
    return ''.join(pieces)

# wrap text data in the tag requested by the previous start tag, if any
def parse_data(data):
    """Return ``data``, wrapped in the pending ``next_tag`` element if one is set.

    When the global ``next_tag`` is non-empty the text is returned as
    ``<next_tag>data</next_tag>`` and ``next_tag`` is reset to ``''``, so
    only one chunk of text gets wrapped.  Otherwise ``data`` is returned
    unchanged.
    """
    global next_tag
    if not next_tag:
        return data
    wrapper = next_tag
    next_tag = ''
    return '<' + wrapper + '>' + data + '</' + wrapper + '>'

# builds the header of the mallard file
def write_header():
    """Return the opening ``<page>`` tag of a Mallard page with id ``index``."""
    header = '<page xmlns="http://projectmallard.org/1.0/" id="index">'
    return header

# builds the footer of the mallard file
def write_footer():
    """Return the closing tags for everything still listed in ``open_tags``.

    Entries are popped from the end of ``open_tags``, so the most recently
    opened element is closed first; the list is empty afterwards.  Returns
    ``''`` when nothing is open.  No separate ``</page>`` is appended.
    """
    closing = []
    # close the currently open tags, innermost first
    while open_tags:
        html_tag = open_tags.pop()
        closing.append('</' + tag_conversion[html_tag]['name'] + '>')
    return ''.join(closing)

# reads the html source file
def read_file():
    """Return the whole content of the input file named by ``sys.argv[1]``.

    Raises IndexError when no path was given on the command line, and the
    usual IOError/OSError when the file cannot be opened or read.
    """
    # 'with' guarantees the handle is closed, even if reading fails
    with open(sys.argv[1]) as source:
        return source.read()

# writes to page file
def write_file():
    """Write every entry of ``output`` on its own line to the ``.page`` file.

    The destination is the input path (``sys.argv[1]``) with its extension
    replaced by ``.page``; a path without an extension gets ``.page``
    appended.  Only the final extension is touched, so an input that does
    not end in ``.html`` is no longer overwritten, and ``.html`` occurring
    in a directory name is left alone.

    Raises IndexError when no path was given on the command line, and the
    usual IOError/OSError when the file cannot be written.
    """
    import os

    base, _ext = os.path.splitext(sys.argv[1])
    target = base + '.page'
    # 'with' flushes and closes the file even if a write fails
    with open(target, 'w') as page:
        page.writelines(line + '\n' for line in output)

def main():
    """Convert the HTML file named on the command line into a Mallard page.

    Reads the input, runs it through ``MyHTMLParser`` (which fills the
    global ``output`` list), adds a section-links element when the page has
    sections, appends the closing tags for elements that are still open and
    writes the result to the ``.page`` file.
    """
    html_source = read_file()

    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(html_source)
    # force processing of anything the parser is still buffering, so text
    # at the very end of the input is not lost
    parser.close()

    # add section links: look for the emitted '<section>' start tag (the
    # bare word 'section' only ever matched a text node), and place a
    # well-formed <links> element before the first section so it sits in
    # the page body rather than inside the last section
    if '<section>' in output:
        output.insert(output.index('<section>'), '<links type="section"/>')

    output.append(write_footer())
    write_file()

# run the conversion only when executed as a script, not when imported
if __name__ == '__main__':
	main()