# html2mallard.py
#
# Provenance: commit 5567d5f69c5d0402bc4525f973a9bb77a1b02dc5 ("init commit"),
# Kalpa Welivitigoda, Fri, 21 Jun 2013 16:34:26 +0000.
"""Convert an HTML file into a Project Mallard ``.page`` file.

Usage::

    python html2mallard.py INPUT.html

The input is fed through ``HTMLParser``.  Every start tag, end tag and run of
text is translated into one string fragment and appended to the module-level
``output`` list; the fragments are then written next to the input file, one
per line, wrapped in a Mallard ``<page>`` element.

Limitations visible in this code: HTML attributes are discarded, text is
passed through without XML escaping, and only the tags listed in
``tag_conversion`` are translated (all others are copied through by name).
"""

import os
import sys

try:  # Python 2 module name, which is what this script was written against
    from HTMLParser import HTMLParser
except ImportError:  # Python 3 moved the class to html.parser
    from html.parser import HTMLParser

# Converted fragments in document order; write_file() emits one per line.
output = []

# Maps a lower-case HTML tag name to its Mallard conversion rule:
#   'name'       - Mallard element emitted in place of the HTML tag (required)
#   'next_tag'   - element the next run of text gets wrapped in (optional)
#   'ignore_end' - when true, the HTML end tag emits nothing and the Mallard
#                  element stays open until the same HTML tag starts again or
#                  the footer is written (optional)
tag_conversion = {
    # 'h1': {'name': 'page'},
    'h1': {'name': 'section', 'next_tag': 'title', 'ignore_end': True},
}

# Element the next run of text must be wrapped in.  For example, after an
# 'h1' start tag (converted to 'section') the following text is wrapped in
# '<title>' tags.  Empty string means "no wrapping pending".
next_tag = ''

# Stack of HTML tag names (lower-case keys of tag_conversion) whose Mallard
# element is still open, innermost last.
open_tags = []


class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that appends each converted fragment to ``output``."""

    def handle_starttag(self, tag, attrs):
        output.append(interpret_starttag(tag, attrs))

    def handle_endtag(self, tag):
        output.append(interpret_endtag(tag))

    def handle_data(self, data):
        output.append(parse_data(data))


def interpret_endtag(tag):
    """Convert an HTML end tag into the corresponding Mallard end tag.

    Returns '' when the tag's rule has a true 'ignore_end' (the element is
    closed later by interpret_starttag() or write_footer()), the mapped
    closing tag for other known tags, and '</tag>' unchanged for tags that
    have no rule.  The rule lookup is case-insensitive.
    """
    rule = tag_conversion.get(tag.lower())
    if rule is None:
        return '</' + tag + '>'
    # .get(): 'ignore_end' is optional, a rule without it must not raise.
    if rule.get('ignore_end'):
        return ''
    return '</' + rule['name'] + '>'


def interpret_starttag(tag, attrs):
    """Convert an HTML start tag into the corresponding Mallard start tag.

    ``attrs`` is accepted for the HTMLParser handler signature but is not
    used.  If the innermost open element was produced by this same HTML tag,
    its closing tag is emitted first.  Side effects: may set the module-level
    ``next_tag`` and push onto / pop from ``open_tags``.
    """
    global next_tag

    key = tag.lower()
    rule = tag_conversion.get(key)
    parts = []

    # A repeated tag (e.g. a second <h1>) ends the block opened by the
    # previous one, so close that block before opening the new one.
    if rule is not None and open_tags and open_tags[-1] == key:
        parts.append('</' + rule['name'] + '>')
        open_tags.pop()

    if rule is None:
        parts.append('<' + tag + '>')
    else:
        parts.append('<' + rule['name'] + '>')
        if 'next_tag' in rule:
            next_tag = rule['next_tag']
        # Track the element only when its end tag is really suppressed;
        # otherwise interpret_endtag() closes it and it would be closed twice.
        if rule.get('ignore_end'):
            open_tags.append(key)
    return ''.join(parts)


def parse_data(data):
    """Return ``data``, wrapped in the pending ``next_tag`` element if any.

    The pending element is consumed: ``next_tag`` is reset to '' after use.
    The text itself is returned as received (no escaping is applied).
    """
    global next_tag
    if next_tag:
        data = '<' + next_tag + '>' + data + '</' + next_tag + '>'
        next_tag = ''
    return data


def write_header():
    """Return the opening ``<page>`` tag of the Mallard document."""
    return '<page xmlns="http://projectmallard.org/1.0/" id="index">'


def write_footer():
    """Return closing tags for all still-open elements, then ``</page>``.

    Empties ``open_tags``; the innermost element is closed first.
    """
    parts = []
    while open_tags:
        parts.append('</' + tag_conversion[open_tags.pop()]['name'] + '>')
    parts.append('</page>')
    return ''.join(parts)


def read_file():
    """Return the full contents of the HTML file named by ``sys.argv[1]``."""
    with open(sys.argv[1]) as source:
        return source.read()


def _output_path(source_path):
    """Return the ``.page`` path to write for ``source_path``.

    Only the final extension is swapped, so directory names are untouched and
    inputs such as ``doc.htm`` no longer map back onto themselves.
    """
    root, _ext = os.path.splitext(source_path)
    target = root + '.page'
    if target == source_path:
        # Input is already called *.page: never overwrite what we just read.
        target = source_path + '.page'
    return target


def write_file():
    """Write every fragment in ``output``, one per line, to the page file."""
    with open(_output_path(sys.argv[1]), 'w') as page:
        page.writelines(fragment + '\n' for fragment in output)


def main():
    """Convert the HTML file given on the command line to a Mallard page."""
    if len(sys.argv) < 2:
        sys.exit('usage: html2mallard.py INPUT.html')

    html_source = read_file()
    output.append(write_header())

    # Instantiate the parser and feed it the HTML.
    parser = MyHTMLParser()
    parser.feed(html_source)
    # close() flushes text the parser may still be buffering after the last
    # tag; without it that trailing text can be dropped.
    parser.close()

    output.append(write_footer())
    write_file()


if __name__ == '__main__':
    main()