#!/usr/bin/python
# Clean up tags in the HTML conversion of a Siyavula module and write the
# cleaned-up version as source.txt in the module's source folder.
#
# Positional arguments: SUBJECT COURSE MODULE
#   reads   ../SUBJECT/COURSE/MODULE/MODULE.html
#   writes  ../SUBJECT/COURSE/MODULE/source.txt
import os
import sys
import subprocess
from optparse import OptionParser

from path import path
from BeautifulSoup import BeautifulSoup

from cvtFile import Cvt

# Root folder under which the SUBJECT/COURSE/MODULE folders are looked up.
SOURCE = path('../')

# Text inserted between two consecutive cleaned pages in the output.
# NOTE(review): in the original source this literal was split by a raw line
# break between the two '\n\n' halves, which may be the remnant of markup
# lost from the file. It is reconstructed here from the visible characters
# only -- confirm the intended separator.
PAGE_SEPARATOR = '\n\n\n\n\n'


def makesoup(txtin):
    """Parse *txtin* with BeautifulSoup, declaring the input as UTF-8."""
    return BeautifulSoup(txtin, fromEncoding="utf-8")


def applyFix(soup):
    """Use BeautifulSoup to clean up the tags of one parsed page.

    Detaches the <head> element when the page has one, runs every
    conversion named in ``Cvt(soup).processlist`` (looked up in
    ``cvt.methods``), and returns ``cvt.soup.prettify()``.
    """
    # Only a chunk that actually contains a <head> has one to remove.
    # BeautifulSoup yields None for a missing tag, so test for it explicitly
    # rather than swallowing every exception.
    head = soup.head
    if head is not None:
        head.extract()

    # perform conversions, in the order given by the converter itself
    cvt = Cvt(soup)
    for method in cvt.processlist:
        cvt.methods[method]()

    return cvt.soup.prettify()


parser = OptionParser(usage="Usage: %prog [options] file")
(options, args) = parser.parse_args()

# Three positional arguments are read below; with fewer, show the help
# instead of failing with an IndexError.
if len(args) < 3:
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement.
    print('Specify a folder and module (e.g. Z4 z4m1 arguments.')
    parser.print_help()
    sys.exit(1)

SUBJECT = args[0]
COURSE = args[1]
MODULE = args[2]

# Marker that ends each page of the converted HTML (tail of a style attribute).
tag = 'page-break-before: always">'
basepath = SOURCE / SUBJECT / COURSE / MODULE

with open(basepath / (MODULE + '.html'), 'r') as fin:
    txt = fin.read()

# Flatten the document to one line before searching for the page markers.
txt = txt.replace('\n', ' ')

# Cut the text after every marker and clean each page separately.
# Whatever follows the last marker stays in txt and is not written out.
txtout = ''
tag_len = len(tag)
pos = txt.find(tag)
while pos > -1:
    cut = pos + tag_len
    txtin = txt[:cut]
    txt = txt[cut:]
    if txtout:
        txtout += PAGE_SEPARATOR
    # set up soup
    soup = makesoup(txtin)
    txtout += applyFix(soup)
    pos = txt.find(tag)

# The with-block guarantees the output is flushed and closed; the original
# referenced fout.close without calling it.
with open(basepath / 'source.txt', 'w') as fout:
    fout.write(txtout)