#!/usr/bin/python
# Recovered from commit b9a2719691b4c6cf83f31eb0b6c3e7e878524c0e
# ("initial commit", Tony Anderson, Mon, 25 Apr 2011 10:33:11 +0000).
#
# Clean up tags in the html conversion of a Siyavula module and
# write the cleaned up version as source.txt in the source folder.
#
# Usage: cleanUp.py SUBJECT COURSE MODULE
# reads  ../SUBJECT/COURSE/MODULE/MODULE.html
# writes ../SUBJECT/COURSE/MODULE/source.txt
import os
import sys
import subprocess
from optparse import OptionParser

from path import path
from BeautifulSoup import BeautifulSoup
from cvtFile import Cvt

SOURCE = path('../')

# Separator written between the cleaned-up pages in source.txt.
# NOTE(review): in the recovered source this literal was broken across a
# line ('\n\n' <line break> '\n\n'); markup between the two halves may have
# been lost in extraction. Confirm against the repository copy.
PAGE_SEPARATOR = '\n\n\n\n\n'


def makesoup(txtin):
    """Parse the html text *txtin* (read as utf-8) and return the soup."""
    soup = BeautifulSoup(txtin, fromEncoding="utf-8")
    return soup


#use BeautifulSoup to clean up tags
def applyFix(soup):
    """Strip the head element, run the Cvt conversions, return pretty text.

    Every method named in ``cvt.processlist`` is looked up in
    ``cvt.methods`` and called in order; the result is the prettified
    ``cvt.soup``.
    """
    #remove the <head> element; a chunk without one is left untouched
    head = soup.head
    if head is not None:
        head.extract()
    #perform conversions
    cvt = Cvt(soup)
    for method in cvt.processlist:
        cvt.methods[method]()
    #set up txt and return it
    return cvt.soup.prettify()


parser = OptionParser(usage="Usage: %prog [options] subject course module")
(options, args) = parser.parse_args()
# All three positional arguments are indexed below, so require all of them
# instead of failing later with an IndexError.
if len(args) < 3:
    print('Specify subject, course and module arguments.')
    parser.print_help()
    sys.exit(1)

SUBJECT = args[0]
COURSE = args[1]
MODULE = args[2]
# End of the opening tag that marks a page break in the converted html.
tag = 'page-break-before: always">'
basepath = SOURCE / SUBJECT / COURSE / MODULE

with open(basepath / MODULE + '.html', 'r') as fin:
    txt = fin.read()
txt = txt.replace('\n', ' ')

# Cut the text after each occurrence of the page-break tag; every piece
# except the last was followed by the tag, so put the tag back on it.
# NOTE(review): whatever follows the final tag is not processed, exactly
# as in the original loop - confirm that dropping it is intended.
pieces = txt.split(tag)
txtout = ''
for piece in pieces[:-1]:
    txtin = piece + tag
    if txtout:
        txtout += PAGE_SEPARATOR
    #set up soup
    soup = makesoup(txtin)
    txtout += applyFix(soup)

# The with-block closes the file (the original referenced fout.close
# without calling it).
with open(basepath / 'source.txt', 'w') as fout:
    fout.write(txtout)