Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
diff options
context:
space:
mode:
author Aneesh Dogra <lionaneesh@gmail.com> 2012-12-25 18:48:47 (GMT)
committer Aneesh Dogra <lionaneesh@gmail.com> 2012-12-25 18:48:47 (GMT)
commit f9192cde8b4c34435778d45fb52d5c4ad55a2323 (patch)
tree 387f072183cbb093688dc193df46e952ee7e1c4b
parent 4b5c3cf1afc2a447d8302e2d9bbdf592e50f2cc8 (diff)
Show unicode chars too. (HEAD, master)
-rw-r--r-- infoslicer/processing/HTML_strip.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py
index e41ce72..cdd5108 100644
--- a/infoslicer/processing/HTML_strip.py
+++ b/infoslicer/processing/HTML_strip.py
@@ -22,6 +22,9 @@ from infoslicer.processing.Article_Data import Sentence_Data, \
Article_Data
import string
+def filter_non_printable(str):
+ return ''.join([c for c in str if ord(c) > 31 or ord(c) == 9])
+
class HTML_Strip(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
@@ -33,7 +36,7 @@ class HTML_Strip(HTMLParser):
text = sub('[\t\r\n]+', '', text)
# replace multiple spaces with one
text = sub('[ ]+', ' ', text)
- text = filter(lambda x: x in string.printable, text)
+ text = filter_non_printable(text)
self.__text.append(text + '')
def handle_starttag(self, tag, attrs):