# cleanUp.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

#!/usr/bin/python
#clean up tags in html conversion of Siyavula module
#write cleaned up version as source.txt in source folder
from path import path
import os, sys, subprocess
from BeautifulSoup import BeautifulSoup
from optparse import OptionParser
from cvtFile import Cvt

SOURCE = path('../')

def makesoup(txtin):
    """Parse the markup string *txtin* and return the BeautifulSoup tree.

    The parser is told, via fromEncoding, that the input is UTF-8.
    """
    return BeautifulSoup(txtin, fromEncoding="utf-8")

#use BeautifulSoup to clean up tags
def applyFix(soup):
    """Drop <head> from *soup*, run the Cvt conversions, return prettified text.

    soup -- parsed document; its <head> element, if any, is removed in place.

    Returns the result of prettify() on the soup held by the Cvt instance
    after every method named in cvt.processlist has been called in order.
    """
    #remove <head>; when the fragment has none, soup.head is None
    #(BeautifulSoup 3 tag lookup) and .extract() raises AttributeError.
    #Only that case is ignored so that real failures are not hidden.
    try:
        soup.head.extract()
    except AttributeError:
        pass
    #perform conversions, in the order listed by cvt.processlist
    cvt = Cvt(soup)
    for method in cvt.processlist:
        cvt.methods[method]()
    #hand back the cleaned-up markup as text
    return cvt.soup.prettify()

#command-line entry point: takes three positional arguments (subject,
#course, module) that locate SOURCE/subject/course/module/module.html
parser = OptionParser(usage="Usage: %prog [options] subject course module")
(options, args) = parser.parse_args()
#all three positions are indexed below, so fewer than three is an error
#(previously one or two arguments slipped through and raised IndexError)
if len(args) < 3:
    #single parenthesised string: prints the same under the print statement
    print('Specify subject, course and module arguments.')
    parser.print_help()
    sys.exit(1)

SUBJECT = args[0]
COURSE = args[1]
MODULE = args[2]
#tail of the opening tag that marks a page break in the converted html
tag = 'page-break-before: always">'
basepath = SOURCE / SUBJECT / COURSE / MODULE
#'with' closes the input file even if reading fails
with open(basepath / MODULE + '.html','r') as fin:
    txt = fin.read()
txt = txt.replace('\n',' ')
txtout = ''
#each piece before a marker, with the marker re-attached, is cleaned on its
#own; this matches the original find/slice loop without rescanning the text.
#NOTE(review): whatever follows the last marker (or the whole file when no
#marker is present) is never converted, exactly as before - confirm that
#this is intended.
for piece in txt.split(tag)[:-1]:
    if txtout:
        txtout += '\n\n<hr />\n\n'
    #set up soup
    soup = makesoup(piece + tag)
    txtout += applyFix(soup)
#the original ended with 'fout.close' (no call), so the file was never
#explicitly closed; 'with' guarantees the close and flush
with open(basepath / 'source.txt','w') as fout:
    fout.write(txtout)