#! /usr/bin/env python
# striphtml.py
from BeautifulSoup import  BeautifulSoup

def _attr_name_whitelisted(attr_name,  attr_value):
    if attr_name.lower() == "align" and attr_value.lower() == "center":
        return True
    elif attr_name.lower() == "class" and attr_value == "blockquote":
        return True
    else:
        return False

# Tags that are removed from the tree together with everything inside them.
blacklist = ["head", "div"]

# Tags that stay in the tree but lose every attribute that
# _attr_name_whitelisted() does not accept.
striplist = ["p", "h1", "h2", "h3"]

# Tags that keep their own name. Any tag that is in none of the three lists
# is renamed to an attribute-less <span> by the clean-up loop.
whitelist = [
    # block-level text and metadata
    "p", "br", "pre", "meta",
    # tables and links
    "table", "tbody", "thead", "tr", "td", "a",
    # quotes and headings
    "blockquote", "h1", "h2", "h3", "h4",
    # lists
    "ul", "li",
    # inline emphasis
    "b", "em", "i", "strong", "u",
]

# Script body: read input.html, print a replacement <head> with embedded CSS,
# a table of contents built from the <h1> headings, a title paragraph, and
# finally the cleaned-up document tree.
#
# NOTE: every print below is given exactly one parenthesised expression, which
# Python 2's print statement treats the same as the bare form. The script
# targets Python 2 (it imports the BeautifulSoup 3 module).


def _tag_text(tag):
    """Return the text contained in *tag* as UTF-8 encoded bytes.

    Returns an empty string when *tag* is None (e.g. the document has no
    <title>). ``tag.string`` is used when it is available; BeautifulSoup 3
    gives None there when the tag does not have exactly one string child
    (for example ``<h1><b>One</b></h1>``), in which case all text nodes
    inside the tag are concatenated instead of printing the word "None".

    The result is encoded explicitly because the generated page declares
    ``charset=UTF-8`` and the document body is rendered as UTF-8 as well.
    """
    if tag is None:
        return ""
    text = tag.string
    if text is None:
        text = u"".join(tag.findAll(text=True))
    return text.encode("utf-8")


# The BeautifulSoup constructor reads and parses the file object right away,
# so the handle can be closed as soon as the soup has been built.
with open("input.html") as source:
    soup = BeautifulSoup(source)

# May be None when the input has no <title>; it is fetched before the
# clean-up loop below extracts <head> from the tree.
title = soup.title

print("<html>\n<head>\n<meta http-equiv=\"CONTENT-TYPE\" content=\"text/html; charset=UTF-8\">")
if title is not None:
    print(title)
print("<style type='text/css'>")
print("@font-face {\n    font-family: Yataghan;\n    src: url('../Fonts/yataghan.ttf');\n }")
print("@font-face {\n    font-family: Akashi;\n    src: url('../Fonts/akashi.ttf');\n }")
print("p {\n    text-align: left;\n      text-indent: 0;\n    margin-bottom: .5em;\n  }\n  h1,h2,h3 {\n    font-family: Yataghan; text-align: center; margin-top: 3em; margin-bottom: .5em;    clear: both;  }")
print("p.blockquote {\n    text-align: left;\n      text-indent: 0;\n    margin-bottom: .5em;\n    margin-left: .5in;\n    margin-right: .5in;  }")
# Close the head element; the original emitted a second opening <head> here.
print("</style>\n</head>\n<body>")

# Table of contents. TOC_0001 is the title page, so the first <h1> links to
# TOC_0002, the second to TOC_0003, and so on.
print("<h1>Contents</h1>")
print("<ul>")
print("<li><a href=\"TOC_0001.xhtml\">Title Page</a></li>")
for number, chapter in enumerate(soup.findAll("h1"), 2):
    print("<li><a href=\"TOC_" + str(number).zfill(4) + ".xhtml\">")
    print(_tag_text(chapter))
    print("</a></li>")

print("</ul>")
print("<hr class=\"sigilChapterBreak\" />")

# Title paragraph (left empty when the input has no <title>).
print("<p style=\"font-family: Akashi; text-align: center; font-size: 3em; font-weight: bold\">")
print(_tag_text(title))
print("</p>")

# Clean the parsed tree in place.
for tag in soup.findAll():
    tag_name = tag.name.lower()
    if tag_name in blacklist:
        # blacklisted tags are removed in their entirety
        tag.extract()
    elif tag_name in striplist:
        # keep the tag, but only the attributes that are explicitly allowed
        tag.attrs = [(name, value) for name, value in tag.attrs
                     if _attr_name_whitelisted(name, value)]
    elif tag_name not in whitelist:
        # not a whitelisted tag. I'd like to remove it from the tree
        # and replace it with its children. But that's hard. It's much
        # easier to just replace it with an empty span tag.
        tag.name = "span"
        tag.attrs = []

print(soup.renderContents("utf-8"))
print("</body></html>")