diff options
Diffstat (limited to 'cvtFile.py')
-rwxr-xr-x | cvtFile.py | 185 |
1 files changed, 185 insertions, 0 deletions
# diff --git a/cvtFile.py b/cvtFile.py
# new file mode 100755
# index 0000000..f70881a
# --- /dev/null
# +++ b/cvtFile.py
# @@ -0,0 +1,185 @@
#!/usr/bin/python
"""
This version acts as cgi script, applying selected conversions
to a file supplied in the form

get form parameters
make soup
apply conversions
make txt
return
"""
import os, sys
import logging
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
from path import path
import subprocess
from PIL import Image


class Cvt():
    """Collection of numbered clean-up passes applied to a BeautifulSoup tree.

    Each ``method_N`` mutates ``self.soup`` in place.  ``self.methods`` maps
    the number to the bound method so callers can drive the passes by number.
    """

    def __init__(self, soup):
        """Store *soup* and set up the pass tables.

        Opens (and truncates) a file named ``logfile`` in the current working
        directory; call ``close_logfile()`` when finished with the instance.
        """
        self.soup = soup
        self.logfile = open('logfile', 'w')
        self.kstrongs = 0
        self.kfonts = 0
        self.kspans = 0
        #methods
        #method_1 - insert a stylesheet link tag into the first meta tag
        #method_2 - remove style attributes from table tags
        #method_3 - remove style attributes from td tags
        #method_4 - remove style attributes from p tags
        #method_5 - remove style attributes from tr tags
        #method_6 - remove col tags
        #method_7 - remove attributes from span tags
        #method_8 - replace p tags whose bold text contains 'LO'
        #method_9 - remove fonts retaining content
        #method_10 - remove empty p tags (string == ' ')
        #method_11 - remove spans retaining content
        #method_12 - remove strong tags retaining content
        self.processlist = [2, 3, 4, 5, 6, 7, 9, 10, 11, 12]
        self.iterlist = [9, 11, 12]
        self.methods = {
            1: self.method_1,
            2: self.method_2,
            3: self.method_3,
            4: self.method_4,
            5: self.method_5,
            6: self.method_6,
            7: self.method_7,
            8: self.method_8,
            9: self.method_9,
            10: self.method_10,
            11: self.method_11,
            12: self.method_12,
        }

    def close_logfile(self):
        """Close the log file opened by ``__init__``."""
        self.logfile.close()

    #private helper - delete the given attributes from one tag
    def _drop_attributes(self, tag, keys):
        for key in keys:
            try:
                del tag[key]
            except KeyError:
                # attribute not present on this tag; nothing to remove
                pass

    #private helper - delete the given attributes from every tag_name tag
    def _strip_attributes(self, tag_name, keys):
        for tag in self.soup.findAll(tag_name):
            self._drop_attributes(tag, keys)

    #private helper - replace every tag_name tag by one of its children
    def _unwrap_tags(self, tag_name):
        # NOTE(review): when the tag has several children only contents[1]
        # is kept and the remaining children are dropped together with the
        # tag.  This mirrors the original behaviour -- confirm it is intended.
        tag = self.soup.find(tag_name)
        while tag is not None:
            children = tag.contents
            if len(children) > 1:
                tag.replaceWith(children[1])
            elif len(children) > 0:
                tag.replaceWith(children[0])
            else:
                tag.extract()
            # look the tree up again: the replacement may itself contain
            # (or be) another tag of the same name
            tag = self.soup.find(tag_name)

    #insert a stylesheet link into the first meta tag
    def method_1(self):
        # NOTE(review): the link is inserted as the first child *of* the
        # meta tag, and a document without any meta tag makes
        # ``meta.insert`` fail on None -- verify against callers.
        link = Tag(self.soup, 'link')
        link['rel'] = "StyleSheet"
        link['type'] = "text/css"
        link['href'] = "../../css/activity.css"
        meta = self.soup.find('meta')
        meta.insert(0, link)

    #remove style attributes from table tags
    def method_2(self):
        tblkeys = ['width', 'border', 'bordercolor', 'cellpadding',
                   'cellspacing', 'frame', 'rules', 'dir']
        self._strip_attributes('table', tblkeys)

    #remove style attributes from td tags
    def method_3(self):
        tdkeys = ['width', 'height', 'bgcolor', 'valign']
        self._strip_attributes('td', tdkeys)

    #remove style attributes from p tags
    def method_4(self):
        pkeys = ['lang', 'align', 'style', 'class']
        for p in self.soup.findAll('p'):
            # remember heading-like paragraphs before their class is removed
            try:
                centerflag = 'head' in p['class']
            except KeyError:
                centerflag = False
            self._drop_attributes(p, pkeys)
            if centerflag:
                p['class'] = 'center'

    #remove style attributes from tr tags
    def method_5(self):
        trkeys = ['lang', 'align', 'style', 'class', 'valign']
        self._strip_attributes('tr', trkeys)

    #remove col tags
    def method_6(self):
        for col in self.soup.findAll('col'):
            col.extract()

    #remove attributes from span tags
    def method_7(self):
        spankeys = ['lang', 'align', 'style', 'class']
        self._strip_attributes('span', spankeys)

    #replace p tags (or their enclosing h1) whose bold text contains 'LO'
    def method_8(self):
        for p in self.soup.findAll('p'):
            for b in p.findAll('b'):
                try:
                    if 'LO' in b.string:
                        replacement = '<p class="solid">' + str(b) + '</p>'
                        if p.findParents('h1'):
                            h1 = p.findParent('h1')
                            h1.replaceWith(replacement)
                        else:
                            p.replaceWith(replacement)
                except Exception:
                    # best effort: b.string is None when the b tag does not
                    # hold a single string; report and carry on
                    print('b string not found %s' % b.prettify())

    #remove font tags retaining content
    def method_9(self):
        self._unwrap_tags('font')

    #remove empty paragraphs (used for spacing)
    def method_10(self):
        for br in self.soup.findAll('br'):
            if len(br) == 0:
                br.extract()
        for p in self.soup.findAll('p'):
            if len(p) == 0:
                p.extract()
                continue
            # a paragraph holding only whitespace counts as empty too
            if p.string:
                text = p.string.strip()
                if not text:
                    p.extract()

    #remove spans (replace with p tags) retaining content
    def method_11(self):
        self._unwrap_tags('span')

    #remove strong tags retaining content
    def method_12(self):
        # the original single-child branch addressed ``self.soup.string``
        # instead of ``self.soup.strong``; the shared helper always operates
        # on the strong tag itself
        self._unwrap_tags('strong')


def makesoup(txtin):
    """Parse the markup *txtin* and return the resulting BeautifulSoup tree."""
    soup = BeautifulSoup(txtin)
    return soup