from HTMLParser import HTMLParser
import sys
# Converted output fragments, in document order; write_file() emits one per line.
output = list()

# Conversion rules keyed by HTML tag name. Each rule may contain:
#   'name'       -- tag name emitted in place of the HTML tag name
#   'next_tag'   -- tag that parse_data() wraps around the next text data
#   'ignore_end' -- the HTML end tag emits nothing; the block stays open until
#                   the same tag starts again or write_footer() closes it
# (An alternative rule mapping 'h1' to 'page' was left disabled here.)
tag_conversion = {
    'h1': {
        'name': 'section',
        'next_tag': 'title',
        'ignore_end': True,
    },
}
# The next text data should be wrapped in this tag; for example, after a
# 'section' start tag the next data should be within '<title>' tags.
next_tag = ""  # an empty string means no wrapper tag is pending
# Stack of tags (such as page, section) that stay open until their block is
# over; the last element is the most recently opened one.
open_tags = list()
# HTMLParser subclass whose handlers forward every event to the converters
class MyHTMLParser(HTMLParser):
    """Parser that records the converted form of each start tag, end tag and
    text chunk in the module-level ``output`` list, in the order seen."""

    def handle_starttag(self, tag, attrs):
        """Append the conversion of the start tag *tag* to ``output``."""
        converted = interpret_starttag(tag, attrs)
        output.append(converted)

    def handle_endtag(self, tag):
        """Append the conversion of the end tag *tag* to ``output``."""
        converted = interpret_endtag(tag)
        output.append(converted)

    def handle_data(self, data):
        """Append the (possibly wrapped) text *data* to ``output``."""
        converted = parse_data(data)
        output.append(converted)
# converts html end tags to mallard tags
def interpret_endtag(tag):
    """Return the closing tag to emit for the HTML end tag *tag*.

    The rule for *tag* is looked up case-insensitively in ``tag_conversion``:

    * no rule: the tag is passed through as ``</tag>``;
    * rule with a true ``'ignore_end'``: an empty string is returned, because
      such blocks are closed later (by the next identical start tag or by
      ``write_footer``);
    * otherwise: ``</name>`` using the rule's ``'name'``.
    """
    # Use the same lower-cased key for the membership test and the lookup, so
    # a mixed-case tag cannot pass the test and then fail the lookup.
    rule = tag_conversion.get(tag.lower())
    if rule is None:
        return '</' + tag + '>'
    # .get(): a rule without 'ignore_end' means "emit the end tag normally".
    if rule.get('ignore_end'):
        return ''
    return '</' + rule['name'] + '>'
# converts html start tags to mallard tags
def interpret_starttag(tag, attrs):
    """Return the text to emit for the HTML start tag *tag*.

    *attrs* is accepted to match the parser handler signature but is not
    used; attributes are not copied to the output.

    If *tag* repeats the innermost entry of ``open_tags``, the previous block
    is closed first. A tag with a rule in ``tag_conversion`` (looked up
    case-insensitively) is emitted under the rule's ``'name'``; any other tag
    is passed through unchanged.

    Side effects: sets the global ``next_tag`` when the rule has a
    ``'next_tag'``, and pushes the tag on ``open_tags`` when the rule has a
    true ``'ignore_end'`` (its end tag will be suppressed, so the block must
    be closed later).
    """
    global next_tag
    key = tag.lower()
    rule = tag_conversion.get(key)
    pieces = []
    # if the current tag is the same as the innermost open tag, first close
    # the previous block
    if open_tags and open_tags[-1] == key:
        pieces.append('</' + tag_conversion[key]['name'] + '>')
        open_tags.pop()
    if rule is None:
        pieces.append('<' + tag + '>')
        return ''.join(pieces)
    pieces.append('<' + rule['name'] + '>')
    if 'next_tag' in rule:
        next_tag = rule['next_tag']
    # Track the block only when its end tag is really suppressed; testing the
    # value (not mere presence) keeps this in step with interpret_endtag and
    # avoids closing the block twice.
    if rule.get('ignore_end'):
        open_tags.append(key)
    return ''.join(pieces)
# parse data according to the previous tag
def parse_data(data):
    """Return *data*, wrapped in the pending ``next_tag`` element if one is set.

    When the global ``next_tag`` is non-empty the result is
    ``<next_tag>data</next_tag>`` and ``next_tag`` is reset to ``''`` so that
    only one text chunk gets wrapped. Otherwise *data* is returned unchanged.
    """
    global next_tag
    if next_tag:
        data = '<' + next_tag + '>' + data + '</' + next_tag + '>'
        # the wrapper applies to a single chunk of data only
        next_tag = ''
    return data
# writes the header of the mallard file
def write_header():
    """Return the text placed at the very start of the generated output.

    NOTE(review): this is an empty string, so no header markup is emitted;
    presumably a root element was intended here -- confirm.
    """
    header = ''
    return header
# writes the footer of the mallard file
def write_footer():
    """Return the closing tags for every block still listed in ``open_tags``.

    The tags are closed innermost first, each as ``</name>`` using the
    ``'name'`` of its ``tag_conversion`` rule, and ``open_tags`` is left
    empty. Returns ``''`` when nothing is open.

    NOTE(review): no root closing tag is emitted, mirroring write_header(),
    which emits no root opening tag -- confirm this is intended.
    """
    closing = []
    # close the currently open tags
    while open_tags:
        closing.append('</' + tag_conversion[open_tags.pop()]['name'] + '>')
    return ''.join(closing)
# reads the source file named on the command line
def read_file():
    """Return the whole content of the file named by ``sys.argv[1]``.

    The text is what main() feeds to the HTML parser.

    Raises:
        IndexError: no file name was given on the command line.
        IOError/OSError: the file cannot be opened or read.
    """
    # 'with' closes the handle even if reading fails; read() replaces the
    # line-by-line string concatenation with a single call.
    with open(sys.argv[1]) as source:
        return source.read()
# writes to page file
def write_file():
    """Write every entry of ``output`` to the page file, one entry per line.

    The page file name is derived from ``sys.argv[1]``: a trailing ``.html``
    (any case) is replaced by ``.page``; for any other name ``.page`` is
    appended. Only the trailing suffix is touched, so a ``.html`` occurring
    elsewhere in the path is left alone.

    Raises:
        IndexError: no file name was given on the command line.
        IOError/OSError: the page file cannot be opened or written.
    """
    source_path = sys.argv[1]
    suffix = '.html'
    if source_path.lower().endswith(suffix):
        page_path = source_path[:-len(suffix)] + '.page'
    else:
        # Without this branch the output name would equal the input name and
        # the source file would be overwritten.
        page_path = source_path + '.page'
    # 'with' flushes and closes the file even if a write fails
    with open(page_path, 'w') as page:
        page.writelines(line + '\n' for line in output)
def main():
    """Convert the HTML file named on the command line into a page file.

    Reads the source, collects header, converted body and footer in the
    module-level ``output`` list, then writes the list to the page file.
    """
    source_text = read_file()
    output.append(write_header())
    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(source_text)
    # feed() may hold back incomplete trailing input; close() forces it to be
    # processed so the end of the document is not silently dropped.
    parser.close()
    output.append(write_footer())
    write_file()
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()