#!/usr/bin/python
"""
This version acts as cgi script, applying selected conversions to a file
supplied in the form

get form parameters
make soup
apply conversions
make txt
return
"""

import logging
import os
import subprocess
import sys

from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
from path import path
from PIL import Image


class Cvt():
    """Apply numbered clean-up conversions to a BeautifulSoup document.

    Each conversion is a ``method_N`` bound method, registered by number in
    ``self.methods``.  ``self.processlist`` and ``self.iterlist`` hold lists
    of those numbers for use by the caller.
    """

    def __init__(self, soup):
        """Store *soup* and build the conversion dispatch table.

        soup -- the parsed document that every conversion mutates in place.

        Side effect: opens (and truncates) a file named 'logfile' in the
        current working directory; release it with close_logfile().
        """
        self.soup = soup
        self.logfile = open('logfile', 'w')
        self.kstrongs = 0
        self.kfonts = 0
        self.kspans = 0
        #methods
        #method_1 - insert a stylesheet link into the first meta tag
        #method_2 - remove style attributes from table tags
        #method_3 - remove style attributes from td tags
        #method_4 - remove style attributes from p tags
        #method_5 - remove style attributes from tr tags
        #method_6 - remove col tags
        #method_7 - remove style attributes from span tags
        #method_8 - replace p (or enclosing h1) holding a bold 'LO' string
        #method_9 - remove fonts retaining content
        #method_10 - remove br tags and empty p tags (string == ' ')
        #method_11 - remove spans retaining content
        #method_12 - remove strong tags retaining content
        self.processlist = [2, 3, 4, 5, 6, 7, 9, 10, 11, 12]
        self.iterlist = [9, 11, 12]
        self.methods = {
            1: self.method_1,
            2: self.method_2,
            3: self.method_3,
            4: self.method_4,
            5: self.method_5,
            6: self.method_6,
            7: self.method_7,
            8: self.method_8,
            9: self.method_9,
            10: self.method_10,
            11: self.method_11,
            12: self.method_12,
        }

    def close_logfile(self):
        """Close the log file opened by the constructor."""
        self.logfile.close()

    def _strip_attributes(self, tag_name, keys):
        """Delete every attribute named in *keys* from all *tag_name* tags."""
        for tag in self.soup.findAll(tag_name):
            for key in keys:
                try:
                    del tag[key]
                except KeyError:
                    # attribute not present on this tag - nothing to remove
                    pass

    def _replace_with_child(self, tag_name):
        """Replace each *tag_name* tag by one of its children until none remain.

        A tag with more than one child is replaced by its second child, a tag
        with exactly one child by that child, and a childless tag is removed.
        """
        # NOTE(review): when a tag has several children only contents[1]
        # survives; the other children are discarded together with the tag.
        # That is the long-standing behaviour and is kept unchanged here -
        # confirm whether a full unwrap was intended.
        tag = self.soup.find(tag_name)
        while tag is not None:
            children = tag.contents
            if len(children) > 1:
                tag.replaceWith(children[1])
            elif len(children) > 0:
                tag.replaceWith(children[0])
            else:
                tag.extract()
            tag = self.soup.find(tag_name)

    def method_1(self):
        """Insert a link to ../../css/activity.css into the first meta tag.

        Raises AttributeError when the document contains no meta tag.
        """
        link = Tag(self.soup, 'link')
        link['rel'] = "StyleSheet"
        link['type'] = "text/css"
        link['href'] = "../../css/activity.css"
        meta = self.soup.find('meta')
        meta.insert(0, link)

    #remove style attributes from table tags
    def method_2(self):
        """Strip presentational attributes from every table tag."""
        tblkeys = ['width', 'border', 'bordercolor', 'cellpadding',
                   'cellspacing', 'frame', 'rules', 'dir']
        self._strip_attributes('table', tblkeys)

    #remove style attributes from td tags
    def method_3(self):
        """Strip presentational attributes from every td tag."""
        tdkeys = ['width', 'height', 'bgcolor', 'valign']
        self._strip_attributes('td', tdkeys)

    #remove style attributes from p tags
    def method_4(self):
        """Strip lang/align/style/class from every p tag.

        A paragraph whose original class contains 'head' ends up with
        class="center"; every other paragraph ends up without a class.
        """
        pkeys = ['lang', 'align', 'style', 'class']
        for p in self.soup.findAll('p'):
            # Decide once, from the ORIGINAL class, before anything is
            # deleted; re-testing per key let the final 'class' deletion
            # wipe out the 'center' value again.
            centerflag = 'head' in (p.get('class') or '')
            for key in pkeys:
                try:
                    del p[key]
                except KeyError:
                    pass
            if centerflag:
                p['class'] = 'center'

    #remove style attributes from tr tags
    def method_5(self):
        """Strip presentational attributes from every tr tag."""
        pkeys = ['lang', 'align', 'style', 'class', 'valign']
        self._strip_attributes('tr', pkeys)

    #remove col tags
    def method_6(self):
        """Remove every col tag from the document."""
        for col in self.soup.findAll('col'):
            col.extract()

    #remove attributes from span tags
    def method_7(self):
        """Strip lang/align/style/class from every span tag."""
        pkeys = ['lang', 'align', 'style', 'class']
        self._strip_attributes('span', pkeys)

    def method_8(self):
        """Replace paragraphs that hold a bold string containing 'LO'.

        For each b tag inside a p tag whose string contains 'LO', the
        enclosing h1 (when the paragraph has one) or else the paragraph
        itself is replaced by the serialized b tag surrounded by newlines.
        Any failure is reported on stdout and processing continues.
        """
        for p in self.soup.findAll('p'):
            for b in p.findAll('b'):
                try:
                    if 'LO' in b.string:
                        # NOTE(review): the two '\n\n' literals are taken
                        # verbatim from the source, where they look like the
                        # remains of stripped markup (perhaps heading tags) -
                        # confirm against the original file.
                        replacement = '\n\n' + str(b) + '\n\n'
                        h1 = p.findParent('h1')
                        if h1 is not None:
                            h1.replaceWith(replacement)
                        else:
                            p.replaceWith(replacement)
                except Exception:
                    # e.g. b.string is None, or p was already replaced by an
                    # earlier b tag in the same paragraph
                    sys.stdout.write(
                        'b string not found ' + b.prettify() + '\n')

    #remove font tags retaining content
    def method_9(self):
        """Replace every font tag by one of its children."""
        self._replace_with_child('font')

    #remove empty paragraphs (used for spacing)
    def method_10(self):
        """Remove childless br tags, and p tags that are empty or blank."""
        # len(tag) counts children; a Tag's own truth value cannot be relied
        # on to detect emptiness, hence the explicit length test.
        for tag in self.soup.findAll('br'):
            if len(tag) == 0:
                tag.extract()
        for tag in self.soup.findAll('p'):
            if len(tag) == 0:
                tag.extract()
                continue
            if tag.string and not tag.string.strip():
                # only whitespace inside the paragraph
                tag.extract()

    #remove spans (replace with p tags) retaining content
    def method_11(self):
        """Replace every span tag by one of its children."""
        self._replace_with_child('span')

    #remove strong tags retaining content
    def method_12(self):
        """Replace every strong tag by one of its children."""
        self._replace_with_child('strong')


def makesoup(txtin):
    """Parse the markup in *txtin* and return the BeautifulSoup document."""
    soup = BeautifulSoup(txtin)
    return soup