diff options
-rw-r--r-- | infoslicer/processing/HTML_strip.py | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py
index e41ce72..cdd5108 100644
--- a/infoslicer/processing/HTML_strip.py
+++ b/infoslicer/processing/HTML_strip.py
@@ -22,6 +22,9 @@ from infoslicer.processing.Article_Data import Sentence_Data, \
         Article_Data
 import string
 
+def filter_non_printable(str):
+    return ''.join([c for c in str if ord(c) > 31 or ord(c) == 9])
+
 class HTML_Strip(HTMLParser):
     def __init__(self):
         HTMLParser.__init__(self)
@@ -33,7 +36,7 @@ class HTML_Strip(HTMLParser):
         text = sub('[\t\r\n]+', '', text)
         # replace multiple spaces with one
         text = sub('[ ]+', ' ', text)
-        text = filter(lambda x: x in string.printable, text)
+        text = filter_non_printable(text)
         self.__text.append(text + '')
 
     def handle_starttag(self, tag, attrs):