Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/cvtFile.py
diff options
context:
space:
mode:
Diffstat (limited to 'cvtFile.py')
-rwxr-xr-xcvtFile.py185
1 files changed, 185 insertions, 0 deletions
diff --git a/cvtFile.py b/cvtFile.py
new file mode 100755
index 0000000..f70881a
--- /dev/null
+++ b/cvtFile.py
@@ -0,0 +1,185 @@
+#!/usr/bin/python
+"""
+This version acts as a CGI script, applying the selected conversions
+to a file supplied in the form.
+
+Steps:
+get form parameters
+make soup
+apply conversions
+make txt
+return
+"""
+import os, sys
+import logging
+from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
+from path import path
+import subprocess
+from PIL import Image
+
class Cvt():
    """Run numbered clean-up passes over a parsed BeautifulSoup document.

    Every pass is a ``method_N`` bound method, registered by number in
    ``self.methods``:

        1  - insert a <link> to ../../css/activity.css into the first <meta>
        2  - remove presentational attributes from <table> tags
        3  - remove presentational attributes from <td> tags
        4  - remove lang/align/style/class from <p> tags; a paragraph whose
             class contained 'head' is re-classed as 'center'
        5  - remove presentational attributes from <tr> tags
        6  - remove <col> tags
        7  - remove lang/align/style/class from <span> tags
        8  - replace paragraphs holding a <b> whose text contains 'LO'
             with '<p class="solid">' markup
        9  - remove <font> tags, keeping their content
        10 - remove childless <br> tags and empty / whitespace-only <p> tags
        11 - remove <span> tags, keeping their content
        12 - remove <strong> tags, keeping their content
    """

    def __init__(self, soup):
        """Store the document and set up the pass registry.

        soup -- the parsed BeautifulSoup tree that the passes modify in place.

        Side effect: opens (and truncates) a file named 'logfile' in the
        current directory; call close_logfile() when finished.
        """
        self.soup = soup
        self.logfile = open('logfile', 'w')
        # Counters; nothing in this class updates them.
        self.kstrongs = 0
        self.kfonts = 0
        self.kspans = 0
        # NOTE(review): presumably the passes a caller runs by default and
        # the subset it repeats -- the consumer of these lists is not
        # visible in this file.
        self.processlist = [2, 3, 4, 5, 6, 7, 9, 10, 11, 12]
        self.iterlist = [9, 11, 12]
        self.methods = {
            1: self.method_1,
            2: self.method_2,
            3: self.method_3,
            4: self.method_4,
            5: self.method_5,
            6: self.method_6,
            7: self.method_7,
            8: self.method_8,
            9: self.method_9,
            10: self.method_10,
            11: self.method_11,
            12: self.method_12,
        }

    def close_logfile(self):
        """Close the log file opened by __init__."""
        self.logfile.close()

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------
    def _drop_attributes(self, tag, keys):
        """Delete each attribute named in keys from tag, ignoring absent ones."""
        for key in keys:
            try:
                del tag[key]
            except KeyError:
                # Attribute not present on this tag: nothing to remove.
                pass

    def _strip_attributes(self, tag_name, keys):
        """Delete the attributes named in keys from every tag_name tag."""
        for tag in self.soup.findAll(tag_name):
            self._drop_attributes(tag, keys)

    def _unwrap_all(self, tag_name):
        """Remove every tag_name tag, splicing all its children into its place.

        Children are re-inserted at the position the removed tag occupied,
        in their original order, so no text or nested markup is lost.  An
        empty tag is simply removed.
        """
        tag = self.soup.find(tag_name)
        while tag is not None:
            parent = tag.parent
            # Locate the tag by identity: equal-looking siblings must not
            # be mistaken for it.
            position = 0
            for position, sibling in enumerate(parent.contents):
                if sibling is tag:
                    break
            # Copy first -- insert() detaches each child from the old tag,
            # which would otherwise mutate the list being iterated.
            children = list(tag.contents)
            tag.extract()
            for offset, child in enumerate(children):
                parent.insert(position + offset, child)
            tag = self.soup.find(tag_name)

    # ------------------------------------------------------------------
    # passes
    # ------------------------------------------------------------------
    def method_1(self):
        """Insert a stylesheet <link> as the first child of the first <meta>.

        Raises AttributeError when the document contains no <meta> tag.
        """
        link = Tag(self.soup, 'link')
        link['rel'] = "StyleSheet"
        link['type'] = "text/css"
        link['href'] = "../../css/activity.css"
        meta = self.soup.find('meta')
        meta.insert(0, link)

    # remove style attributes from table tags
    def method_2(self):
        """Strip layout attributes from every <table>."""
        tblkeys = ['width', 'border', 'bordercolor', 'cellpadding',
                   'cellspacing', 'frame', 'rules', 'dir']
        self._strip_attributes('table', tblkeys)

    # remove style attributes from td tags
    def method_3(self):
        """Strip layout attributes from every <td>."""
        self._strip_attributes('td', ['width', 'height', 'bgcolor', 'valign'])

    # remove style attributes from p tags
    def method_4(self):
        """Strip lang/align/style/class from every <p>.

        A paragraph whose original class contained 'head' ends up with
        class 'center'; all other paragraphs end up with no class.
        """
        pkeys = ['lang', 'align', 'style', 'class']
        for p in self.soup.findAll('p'):
            # Must be decided before 'class' is deleted below.
            was_heading = 'head' in (p.get('class') or '')
            self._drop_attributes(p, pkeys)
            if was_heading:
                p['class'] = 'center'

    # remove style attributes from tr tags
    def method_5(self):
        """Strip lang/align/style/class/valign from every <tr>."""
        self._strip_attributes(
            'tr', ['lang', 'align', 'style', 'class', 'valign'])

    # remove col tags
    def method_6(self):
        """Remove every <col> tag from the document."""
        for col in self.soup.findAll('col'):
            col.extract()

    # remove attributes from span tags
    def method_7(self):
        """Strip lang/align/style/class from every <span>."""
        self._strip_attributes('span', ['lang', 'align', 'style', 'class'])

    def method_8(self):
        """Replace paragraphs that hold a <b> containing 'LO'.

        For each <b> inside a <p> whose string contains 'LO', the enclosing
        <h1> (when there is one) or otherwise the <p> itself is replaced by
        the text '<p class="solid">' + str(b) + '</p>'.  A <b> that cannot
        be processed is reported on stdout and skipped.
        """
        for p in self.soup.findAll('p'):
            for b in p.findAll('b'):
                try:
                    if 'LO' in b.string:
                        replacement = '<p class="solid">' + str(b) + '</p>'
                        h1 = p.findParent('h1')
                        if h1 is not None:
                            h1.replaceWith(replacement)
                        else:
                            p.replaceWith(replacement)
                except (TypeError, AttributeError, ValueError):
                    # b.string is None when <b> has no single string child;
                    # replaceWith fails when p/h1 was already replaced by an
                    # earlier <b> of the same paragraph.
                    print('b string not found ' + b.prettify())

    # remove font tags retaining content
    def method_9(self):
        """Remove every <font> tag, keeping all of its content."""
        self._unwrap_all('font')

    # remove empty paragraphs (used for spacing)
    def method_10(self):
        """Remove childless <br> tags and empty or whitespace-only <p> tags."""
        # len(tag) counts the tag's children; an explicit length test is
        # used rather than truthiness, which is not a child count for tags.
        for br in self.soup.findAll('br'):
            if len(br) == 0:
                br.extract()
        for p in self.soup.findAll('p'):
            if len(p) == 0:
                p.extract()
                continue
            text = p.string
            if text and not text.strip():
                p.extract()

    # remove spans retaining content
    def method_11(self):
        """Remove every <span> tag, keeping all of its content."""
        self._unwrap_all('span')

    # remove strong tags retaining content
    def method_12(self):
        """Remove every <strong> tag, keeping all of its content."""
        self._unwrap_all('strong')
+
def makesoup(txtin):
    """Parse the markup in txtin and return the resulting BeautifulSoup tree."""
    return BeautifulSoup(txtin)