diff options
author | Aneesh Dogra <lionaneesh@gmail.com> | 2012-12-25 18:48:47 (GMT) |
---|---|---|
committer | Aneesh Dogra <lionaneesh@gmail.com> | 2012-12-25 18:48:47 (GMT) |
commit | f9192cde8b4c34435778d45fb52d5c4ad55a2323 (patch) | |
tree | 387f072183cbb093688dc193df46e952ee7e1c4b | |
parent | 4b5c3cf1afc2a447d8302e2d9bbdf592e50f2cc8 (diff) |
-rw-r--r-- | infoslicer/processing/HTML_strip.py | 5 |
1 files changed, 4 insertions, 1 deletions
```diff
diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py
index e41ce72..cdd5108 100644
--- a/infoslicer/processing/HTML_strip.py
+++ b/infoslicer/processing/HTML_strip.py
@@ -22,6 +22,9 @@ from infoslicer.processing.Article_Data import Sentence_Data, \
     Article_Data
 import string
 
+def filter_non_printable(str):
+    return ''.join([c for c in str if ord(c) > 31 or ord(c) == 9])
+
 class HTML_Strip(HTMLParser):
     def __init__(self):
         HTMLParser.__init__(self)
@@ -33,7 +36,7 @@ class HTML_Strip(HTMLParser):
         text = sub('[\t\r\n]+', '', text)
         # replace multiple spaces with one
         text = sub('[ ]+', ' ', text)
-        text = filter(lambda x: x in string.printable, text)
+        text = filter_non_printable(text)
         self.__text.append(text + '')
 
     def handle_starttag(self, tag, attrs):
```