#!/bin/python
from HTMLParser import HTMLParser
import sys
# Converted chunks collected by the parser callbacks, in document order.
output = []

# Conversion table: HTML tag name -> description of its mallard counterpart.
# Keys understood in each entry:
#   name       - name of the mallard tag
#   next_tag   - next inline tag the following text is wrapped in (ex: title)
#   ignore_end - ignore the closing tag (ex: section)
#   attrib     - fixed attributes written on the mallard tag
#   get_attrib - attributes retained from the original html tag
tag_conversion = {
    'h1': {
        'name': 'page',
        'attrib': {
            'xmlns': 'http://projectmallard.org/1.0/',
            'id': 'index',
        },
        'next_tag': 'title',
        'ignore_end': True,
    },
    'h2': {'name': 'section', 'next_tag': 'title', 'ignore_end': True},
    'strong': {'name': 'em'},
    'ul': {'name': 'list'},
    'ol': {'name': 'list', 'attrib': {'type': 'numbered'}},
    'li': {'name': 'item', 'next_tag': 'p'},
    'blockquote': {'name': 'quote'},
    'a': {'name': 'link', 'get_attrib': ['href']},
}
# Name of the tag the next chunk of text data must be wrapped in; for example,
# if the starting tag is 'section' the next data should be within '<title>'
# tags. Empty when no wrapping is pending.
next_tag = ''
# HTML tags (such as h1, h2) whose mallard elements (page, section) stay open
# until the respective block is over; the most recently opened one is last.
open_tags = []
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that records the converted form of every event.

    Each handler converts what it receives with the matching module-level
    function and appends the resulting text to the global ``output`` list.
    """

    def handle_starttag(self, tag, attrs):
        """Record the mallard start tag for an html start tag."""
        converted = interpret_starttag(tag, attrs)
        output.append(converted)

    def handle_endtag(self, tag):
        """Record the mallard end tag for an html end tag."""
        converted = interpret_endtag(tag)
        output.append(converted)

    def handle_data(self, data):
        """Record a run of text, wrapped as the previous tag requires."""
        converted = parse_data(data)
        output.append(converted)
#converts html end tags to mallard tags
def interpret_endtag(tag):
    """Return the mallard closing tag for the html closing tag ``tag``.

    Tags listed in ``tag_conversion`` are renamed to their mallard name;
    unknown tags are passed through unchanged. An empty string is returned
    for tags flagged ``ignore_end``, because those elements are closed later
    (when the next element of the same kind starts, or by the footer).
    """
    # Look up with the same lower-cased key that is used for the membership
    # test, so a mixed-case tag cannot raise KeyError.
    tag = tag.lower()
    rule = tag_conversion.get(tag)
    if rule is None:
        return '</' + tag + '>'
    if rule.get('ignore_end'):
        return ''
    return '</' + rule['name'] + '>'
#returns attributes in a dictionary as a text
def attrib2text(attrib):
    """Render a dictionary of attributes as text for use inside a start tag.

    Each entry becomes `` key="value"`` (with a leading space), in the
    dictionary's iteration order. Returns '' for an empty dictionary.

    A value of ``None`` (which HTMLParser reports for an attribute written
    without a value) is rendered as an empty string instead of raising
    TypeError. Values are XML-escaped, because HTMLParser hands attribute
    values over already unescaped and a raw '&', '<' or '"' would produce
    malformed markup.
    """
    from xml.sax.saxutils import escape

    rendered = []
    for key in attrib:
        value = attrib[key]
        if value is None:
            value = ''
        rendered.append(' ' + key + '="' + escape(value, {'"': '&quot;'}) + '"')
    return ''.join(rendered)
# converts html start tags to mallard tags
def interpret_starttag(tag, attrs):
    """Return the mallard start tag for the html start tag ``tag``.

    ``attrs`` is the list of (name, value) pairs supplied by HTMLParser.

    Tags listed in ``tag_conversion`` are renamed and given their fixed
    ``attrib`` attributes plus any ``get_attrib`` attributes copied from
    ``attrs``; unknown tags are passed through by name only.

    Side effects on module state:
      * ``next_tag`` is set when the rule asks for the following text to be
        wrapped in an inline tag;
      * ``open_tags`` gets ``tag`` pushed when its closing tag is ignored,
        and popped when the same tag starts again, in which case the closing
        tag of the previous element is emitted in front of the new start tag.
    """
    global next_tag
    # Use one normalised key for the membership test and every lookup.
    tag = tag.lower()
    rule = tag_conversion.get(tag)
    temp = ''
    # if the current tag is similar to the currently open tag, first close
    # the previous tag
    if open_tags and open_tags[-1] == tag:
        temp += '</' + tag_conversion[tag]['name'] + '>'
        open_tags.pop()
    temp += '<'
    if rule is None:
        temp += tag
    else:
        temp += rule['name']
        if 'attrib' in rule:
            temp += attrib2text(rule['attrib'])
        for wanted in rule.get('get_attrib', ()):
            for name, value in attrs:
                if name == wanted:
                    # HTMLParser reports None for an attribute without value
                    temp += attrib2text({wanted: '' if value is None else value})
                    break
        if 'next_tag' in rule:
            next_tag = rule['next_tag']
        # Same truthiness test as interpret_endtag: only elements whose end
        # tag is really skipped have to be closed later.
        if rule.get('ignore_end'):
            open_tags.append(tag)
    temp += '>'
    return temp
# parse data according to the previous tag
def parse_data(data):
    """Return ``data``, wrapped in the pending inline tag if there is one.

    When the global ``next_tag`` is non-empty, the text is enclosed in
    ``<next_tag>...</next_tag>`` and ``next_tag`` is reset so that only the
    first run of text after the start tag is wrapped. Otherwise ``data`` is
    returned unchanged.
    """
    global next_tag
    if next_tag:
        data = '<' + next_tag + '>' + data + '</' + next_tag + '>'
        next_tag = ''
    return data
# writes the header of the mallard file
def write_header():
    """Return the header text of the mallard file (currently empty)."""
    header = ''
    return header
# writes the footer of the mallard file
def write_footer():
    """Return the closing tags for every element that is still open.

    Empties the global ``open_tags`` list, innermost element first, and
    returns the concatenated mallard closing tags ('' when nothing is open).
    """
    closing = []
    # close the currently open tags, most recently opened first
    while open_tags:
        closing.append('</' + tag_conversion[open_tags.pop()]['name'] + '>')
    return ''.join(closing)
# reads the markdown source file
def read_file():
    """Return the whole content of the file named by ``sys.argv[1]``.

    The file is closed before returning, also when reading fails.
    """
    with open(sys.argv[1]) as fi:
        return fi.read()
# writes to page file
def write_file():
    """Write the collected ``output`` chunks, one per line, to the page file.

    The page file name is the input name (``sys.argv[1]``) with a trailing
    '.html' replaced by '.page'. When the input name does not end in
    '.html', '.page' is appended instead, so the input file is never
    overwritten and '.html' elsewhere in the path is left alone.
    """
    source = sys.argv[1]
    suffix = '.html'
    if source.endswith(suffix):
        target = source[:-len(suffix)] + '.page'
    else:
        target = source + '.page'
    # The with-block flushes and closes the file even if a write fails.
    with open(target, 'w') as fi:
        fi.writelines(line + '\n' for line in output)
def main():
    """Convert the html file named on the command line to a mallard page.

    Reads the input, feeds it through MyHTMLParser (which fills ``output``),
    adds a section links element when the page has sections, appends the
    closing tags of the elements still open and writes the page file.
    """
    html_source = read_file()
    #output.append(write_header())
    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(html_source)
    # close() makes the parser process any input it is still buffering
    parser.close()
    # add section links; chunks hold complete tags such as '<section>', so
    # look inside each chunk rather than for a chunk equal to 'section'
    if any('<section' in chunk for chunk in output):
        # NOTE(review): this lands after the last section's content, before
        # the closing tags from write_footer() - confirm that placement is
        # the intended one.
        output.append('<links type="section"/>')
    output.append(write_footer())
    write_file()


if __name__ == '__main__':
    main()