#!/bin/python
# Convert an HTML file (path given as the first command-line argument) into a
# Mallard page: known HTML tags are renamed to their Mallard equivalents and
# the result is written next to the input with a .page extension.
from HTMLParser import HTMLParser
import sys
# Converted output fragments, in document order; each entry is written on its
# own line by write_file().
output = []

# Conversion rules, keyed by HTML tag name. Keys understood in each rule:
#   name       - name of the Mallard tag
#   next_tag   - inline tag that wraps the next piece of text (ex: title)
#   ignore_end - ignore the HTML closing tag (ex: section)
#   attrib     - fixed attributes written on the Mallard tag
#   get_attrib - attributes to retain from the HTML tag
tag_conversion = {
    'h1': {
        'name': 'page',
        'attrib': {'xmlns': 'http://projectmallard.org/1.0/', 'id': 'index'},
        'next_tag': 'title',
        'ignore_end': True,
    },
    'h2': {
        'name': 'section',
        'next_tag': 'title',
        'ignore_end': True,
    },
    'strong': {'name': 'em'},
    'ul': {'name': 'list'},
    'ol': {
        'name': 'list',
        'attrib': {'type': 'numbered'},
    },
    'li': {
        'name': 'item',
        'next_tag': 'p',
    },
    'blockquote': {'name': 'quote'},
    'a': {
        'name': 'link',
        'get_attrib': ['href'],
    },
}

# The next text data must be wrapped in this tag; for example, after a
# 'section' start tag the next data belongs within '<title>' tags.
next_tag = ''

# HTML tags whose Mallard block (such as page, section) is still open; they
# stay here until the respective block is over.
open_tags = []
# HTMLParser subclass whose handlers feed the converter functions
class MyHTMLParser(HTMLParser):
    """Translate each parsed HTML event and collect the result in ``output``."""

    def handle_starttag(self, tag, attrs):
        """Record the Mallard form of an opening tag."""
        converted = interpret_starttag(tag, attrs)
        output.append(converted)

    def handle_endtag(self, tag):
        """Record the Mallard form of a closing tag."""
        converted = interpret_endtag(tag)
        output.append(converted)

    def handle_data(self, data):
        """Record text content, wrapped by parse_data() when required."""
        converted = parse_data(data)
        output.append(converted)
# converts html end tags to mallard tags
def interpret_endtag(tag):
    """Return the Mallard closing tag for the HTML closing tag *tag*.

    A tag with a rule in ``tag_conversion`` is renamed to the rule's
    ``name``. If the rule sets a true ``ignore_end`` an empty string is
    returned instead, because that block is closed elsewhere (by a repeated
    start tag in interpret_starttag() or by write_footer()). A tag without a
    rule is passed through unchanged.
    """
    # Look the rule up with the same lower-cased key that decides whether a
    # rule exists, so a mixed-case tag cannot raise KeyError.
    rule = tag_conversion.get(tag.lower())
    if rule is None:
        return '</' + tag + '>'
    if rule.get('ignore_end'):
        return ''
    return '</' + rule['name'] + '>'
# returns attributes in a dictionary as a text
def attrib2text(attrib):
    """Serialise the mapping *attrib* as XML attribute text.

    Every entry becomes `` key="value"`` (note the leading space), in the
    mapping's iteration order. Values are XML-escaped so that ``&``, ``<``,
    ``>`` and ``"`` cannot break the generated markup. A value of ``None``
    (an HTML attribute given without a value) is written as an empty string.
    An empty mapping yields ``''``.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    parts = []
    for key in attrib:
        value = attrib[key]
        if value is None:
            value = ''
        # The value sits inside double quotes, so quotes need escaping too.
        parts.append(' ' + key + '="' + escape(value, {'"': '&quot;'}) + '"')
    return ''.join(parts)
# converts html start tags to mallard tags
def interpret_starttag(tag, attrs):
    """Return the Mallard opening tag for the HTML start tag *tag*.

    *attrs* is a sequence of ``(name, value)`` pairs from the HTML tag.

    For a tag with a rule in ``tag_conversion`` the result is built from the
    rule: the Mallard ``name``, the fixed ``attrib`` attributes, then the
    first occurrence of each ``get_attrib`` attribute found in *attrs*. A tag
    without a rule is emitted as ``<tag>`` and its attributes are dropped.

    Side effects: a rule's ``next_tag`` is stored in the global ``next_tag``
    so that parse_data() wraps the following text, and a rule with a true
    ``ignore_end`` pushes the tag onto ``open_tags``. If the tag on top of
    ``open_tags`` is this same tag, that block is closed first and its
    closing tag is prepended to the result.
    """
    global next_tag
    # One normalised key serves every table lookup and the open_tags stack,
    # so a mixed-case tag cannot raise KeyError here or in write_footer().
    key = tag.lower()
    pieces = []

    # if the current tag is the same as the currently open tag, first close
    # the previous block
    if open_tags and open_tags[-1] == key:
        pieces.append('</' + tag_conversion[key]['name'] + '>')
        open_tags.pop()

    rule = tag_conversion.get(key)
    if rule is None:
        pieces.append('<' + tag + '>')
        return ''.join(pieces)

    pieces.append('<' + rule['name'])
    if 'attrib' in rule:
        pieces.append(attrib2text(rule['attrib']))
    for wanted in rule.get('get_attrib', ()):
        for attr_name, attr_value in attrs:
            if attr_name == wanted:
                pieces.append(attrib2text({wanted: attr_value}))
                break
    if 'next_tag' in rule:
        next_tag = rule['next_tag']
    # Truthiness check, matching how interpret_endtag() reads ignore_end.
    if rule.get('ignore_end'):
        open_tags.append(key)
    pieces.append('>')
    return ''.join(pieces)
# wraps text data according to the preceding start tag
def parse_data(data):
    """Return *data*, wrapped in the pending ``next_tag`` element if one is set.

    When the global ``next_tag`` is non-empty the text is enclosed in that
    tag and ``next_tag`` is reset to ``''``; otherwise *data* is returned
    unchanged.
    """
    global next_tag
    if not next_tag:
        return data
    wrapper = next_tag
    next_tag = ''
    return '<' + wrapper + '>' + data + '</' + wrapper + '>'
# builds the header of the mallard file
def write_header():
    """Return the opening ``<page>`` tag of a Mallard document."""
    namespace = 'http://projectmallard.org/1.0/'
    page_id = 'index'
    return '<page xmlns="' + namespace + '" id="' + page_id + '">'
# builds the footer of the mallard file
def write_footer():
    """Return the closing tags for every block still listed in ``open_tags``.

    Tags are popped from the end of ``open_tags``, so the most recently
    opened block is closed first; the list is empty afterwards.
    """
    closing = []
    # close the currently open tags, innermost first
    while open_tags:
        html_tag = open_tags.pop()
        closing.append('</' + tag_conversion[html_tag]['name'] + '>')
    return ''.join(closing)
# reads the source file named on the command line
def read_file():
    """Return the full text of the file named by ``sys.argv[1]``.

    The file is opened in text mode and closed again before returning, even
    if reading fails.
    """
    with open(sys.argv[1]) as fi:
        return fi.read()
# writes to page file
def write_file():
    """Write every entry of ``output`` to the page file, one entry per line.

    The target path is ``sys.argv[1]`` with a trailing ``.html`` replaced by
    ``.page``. Only the suffix is rewritten, so a ``.html`` elsewhere in the
    path (a directory name, for instance) is left alone. If the input path
    does not end in ``.html``, ``.page`` is appended, so the input file is
    never overwritten.
    """
    source = sys.argv[1]
    html_suffix = '.html'
    if source.endswith(html_suffix):
        target = source[:-len(html_suffix)] + '.page'
    else:
        target = source + '.page'
    # The with-block flushes and closes the file even if a write fails.
    with open(target, 'w') as fi:
        for line in output:
            fi.write(line + '\n')
def main():
    """Convert the HTML file named on the command line into a Mallard page.

    Reads the input, runs it through MyHTMLParser (which fills ``output``),
    adds a section-links element when the page has sections, appends the
    closing tags for blocks that are still open and writes the page file.
    write_header() is not called here: the opening ``<page>`` tag comes from
    the 'h1' rule in ``tag_conversion``.
    """
    html_source = read_file()
    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(html_source)
    # Force processing of any text the parser is still buffering, so trailing
    # content is not lost.
    parser.close()
    # add section links: entries of ``output`` are whole tags, so the first
    # section shows up as the exact entry '<section>'. The links element is
    # placed just before it, i.e. outside any section.
    # NOTE(review): placement assumes Mallard expects page-level block content
    # ahead of the sections -- confirm against the Mallard specification.
    if '<section>' in output:
        output.insert(output.index('<section>'), '<links type="section"/>')
    output.append(write_footer())
    write_file()


if __name__ == '__main__':
    main()