"""Convert an HTML file into a Project Mallard ``.page`` file.

Usage: run this script with the path of the HTML file as its only
argument.  The result is written next to the input with a ``.page``
extension.
"""
import os
import sys
from xml.sax.saxutils import escape

try:
    from html.parser import HTMLParser  # Python 3
except ImportError:  # Python 2 kept the class in a top-level module
    from HTMLParser import HTMLParser

# Output fragments in document order; write_file() emits one per line.
output = []

# Maps an HTML tag name (lower case) to its Mallard replacement.
# Keys of each entry:
#   name       - Mallard tag emitted instead of the HTML tag (required)
#   next_tag   - tag wrapped around the next piece of text data (optional)
#   ignore_end - when true, the HTML end tag is dropped and the Mallard
#                tag stays open until the same HTML tag starts again or
#                the footer is written (optional)
tag_conversion = {
    #'h1': {'name': 'page' },
    'h1': {'name': 'section', 'next_tag': 'title', 'ignore_end': True}
}

# next data should be in this tag, for example if the starting tag is
# 'section' the next data should be within '<title>' tags
next_tag = ''

# hold open tags (such as page, section) until the respective block is over
open_tags = []


# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Parser that appends the Mallard form of every event to ``output``."""

    def handle_starttag(self, tag, attrs):
        output.append(interpret_starttag(tag, attrs))

    def handle_endtag(self, tag):
        output.append(interpret_endtag(tag))

    def handle_data(self, data):
        # The parser may hand over decoded text ('&amp;' -> '&'); escape it
        # again so the generated page stays well-formed XML.
        output.append(parse_data(escape(data)))


def interpret_endtag(tag):
    """Convert an HTML end tag to its Mallard end tag.

    Returns an empty string when the conversion entry asks for the end tag
    to be ignored; tags without a conversion entry are passed through.
    """
    conversion = tag_conversion.get(tag.lower())
    if conversion is None:
        return '</' + tag + '>'
    # .get(): 'ignore_end' is optional in a conversion entry.
    if conversion.get('ignore_end'):
        return ''
    return '</' + conversion['name'] + '>'


def interpret_starttag(tag, attrs):
    """Convert an HTML start tag to its Mallard start tag.

    ``attrs`` is accepted for parser compatibility and is not used.
    Updates ``next_tag`` and ``open_tags`` according to the conversion
    entry.  Tags without a conversion entry are passed through.
    """
    global next_tag
    key = tag.lower()
    parts = []
    # if the current tag is the same as the currently open tag, first close
    # the previous one
    if open_tags and open_tags[-1] == key:
        parts.append('</' + tag_conversion[key]['name'] + '>')
        open_tags.pop()
    conversion = tag_conversion.get(key)
    if conversion is None:
        parts.append('<' + tag + '>')
    else:
        parts.append('<' + conversion['name'] + '>')
        if 'next_tag' in conversion:
            next_tag = conversion['next_tag']
        # Only tags whose end tag is really dropped must be closed later;
        # otherwise the element would be closed twice.
        if conversion.get('ignore_end'):
            open_tags.append(key)
    return ''.join(parts)


def parse_data(data):
    """Wrap ``data`` in the pending ``next_tag`` (if any) and clear it."""
    global next_tag
    if next_tag:
        data = '<' + next_tag + '>' + data + '</' + next_tag + '>'
        next_tag = ''
    return data


def write_header():
    """Return the opening ``<page>`` element of the Mallard file."""
    return '<page xmlns="http://projectmallard.org/1.0/" id="index">'


def write_footer():
    """Return the closing markup: every still-open tag, then ``</page>``.

    Empties ``open_tags`` as a side effect.
    """
    parts = []
    # close the currently open tags, innermost first
    while open_tags:
        parts.append('</' + tag_conversion[open_tags.pop()]['name'] + '>')
    parts.append('</page>')
    return ''.join(parts)


def _page_path(source_path):
    """Return the output path for ``source_path``.

    A ``.html`` extension (any case) is replaced by ``.page``; for any other
    name ``.page`` is appended, so the result never equals the input path
    and the source file cannot be overwritten.
    """
    root, ext = os.path.splitext(source_path)
    if ext.lower() == '.html':
        return root + '.page'
    return source_path + '.page'


def read_file(path=None):
    """Return the contents of the HTML source file.

    ``path`` defaults to the first command-line argument.
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as source:
        return source.read()


def write_file(path=None):
    """Write the collected ``output`` fragments, one per line.

    ``path`` is the path of the HTML *source* file (default: the first
    command-line argument); the page file name is derived from it.
    """
    if path is None:
        path = sys.argv[1]
    with open(_page_path(path), 'w') as page:
        page.writelines(fragment + '\n' for fragment in output)


def main():
    """Convert the HTML file named on the command line."""
    if len(sys.argv) < 2:
        sys.exit('usage: %s INPUT.html' % sys.argv[0])
    input_html = read_file()
    output.append(write_header())
    # instantiate the parser and feed it the HTML
    parser = MyHTMLParser()
    parser.feed(input_html)
    # flush any text the parser is still buffering
    parser.close()
    output.append(write_footer())
    write_file()


if __name__ == '__main__':
    main()