diff options
Diffstat (limited to 'translate-toolkit-1.3.0/translate/storage/html.py')
-rw-r--r-- | translate-toolkit-1.3.0/translate/storage/html.py | 261 |
1 files changed, 261 insertions, 0 deletions
diff --git a/translate-toolkit-1.3.0/translate/storage/html.py b/translate-toolkit-1.3.0/translate/storage/html.py new file mode 100644 index 0000000..4515e7c --- /dev/null +++ b/translate-toolkit-1.3.0/translate/storage/html.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright 2004-2006,2008 Zuza Software Foundation +# +# This file is part of translate. +# +# translate is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# translate is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with translate; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +"""module for parsing html files for translation""" + +import re +from translate.storage import base +from HTMLParser import HTMLParser + +class htmlunit(base.TranslationUnit): + """A unit of translatable/localisable HTML content""" + def __init__(self, source=None): + self.locations = [] + self.setsource(source) + + def getsource(self): + #TODO: Rethink how clever we should try to be with html entities. + return self.text.replace("&", "&").replace("<", "<").replace("\r\n", " ").replace("\n", " ").replace("\r", " ") + + def setsource(self, source): + self.text = source.replace("&", "&").replace("<", "<") + source = property(getsource, setsource) + + def addlocation(self, location): + self.locations.append(location) + + def getlocations(self): + return self.locations + + +class htmlfile(HTMLParser, base.TranslationStore): + UnitClass = htmlunit + markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"] + markingattrs = [] + includeattrs = ["alt", "summary", "standby", "abbr", "content"] + + def __init__(self, includeuntaggeddata=None, inputfile=None): + self.units = [] + self.filename = getattr(inputfile, 'name', None) + self.currentblock = "" + self.currentblocknum = 0 + self.currenttag = None + self.includeuntaggeddata = includeuntaggeddata + HTMLParser.__init__(self) + + if inputfile is not None: + htmlsrc = inputfile.read() + inputfile.close() + self.parse(htmlsrc) + + def guess_encoding(self, htmlsrc): + """Returns the encoding of the html text. + + We look for 'charset=' within a meta tag to do this. + """ + + pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']''' + result = re.findall(pattern, htmlsrc) + encoding = None + if result: + encoding = result[0] + return encoding + + def do_encoding(self, htmlsrc): + """Return the html text properly encoded based on a charset.""" + charset = self.guess_encoding(htmlsrc) + if charset: + return htmlsrc.decode(charset) + else: + return htmlsrc + + def phprep(self, text): + """Replaces all instances of PHP with placeholder tags, and returns + the new text and a dictionary of tags. The current implementation + replaces <?foo?> with <?md5(foo)?>. The hash => code conversions + are stored in self.phpdict for later use in restoring the real PHP. + + The purpose of this is to remove all potential "tag-like" code from + inside PHP. The hash looks nothing like an HTML tag, but the following + PHP:: + $a < $b ? $c : ($d > $e ? $f : $g) + looks like it contains an HTML tag:: + < $b ? $c : ($d > + to nearly any regex. Hence, we replace all contents of PHP with simple + strings to help our regexes out. + + """ + + from translate.misc import hash + + self.phpdict = {} + result = re.findall('(?s)<\?(.*?)\?>', text) + for cmd in result: + h = hash.md5_f(cmd).hexdigest() + self.phpdict[h] = cmd + text = text.replace(cmd,h) + return text + + def reintrophp(self, text): + """Replaces the PHP placeholders in text with the real code""" + for hash, code in self.phpdict.items(): + text = text.replace(hash, code) + return text + + def parse(self, htmlsrc): + htmlsrc = self.do_encoding(htmlsrc) + htmlsrc = self.phprep(htmlsrc) #Clear out the PHP before parsing + self.feed(htmlsrc) + + def addhtmlblock(self, text): + text = self.strip_html(text) + text = self.reintrophp(text) #Before adding anything, restore PHP + if self.has_translatable_content(text): + self.currentblocknum += 1 + unit = self.addsourceunit(text) + unit.addlocation("%s:%d" % (self.filename, self.currentblocknum)) + + def strip_html(self, text): + """Strip unnecessary html from the text. + + HTML tags are deemed unnecessary if it fully encloses the translatable + text, eg. '<a href="index.html">Home Page</a>'. + + HTML tags that occurs within the normal flow of text will not be removed, + eg. 'This is a link to the <a href="index.html">Home Page</a>.' + """ + text = text.strip() + + # If all that is left is PHP, return "" + result = re.findall('(?s)^<\?.*?\?>$', text) + if len(result) == 1: + return "" + + # These two patterns are the same; the first one is more concise... + #pattern = '(?s)^<[^?>](?:(?:[^>]|(?:<\?.*?\?>))*[^?>])?>(.*)</.*[^?]>$' + pattern = re.compile(r''' + (?s)^ # We allow newlines, and match start of line + <[^?>] # Match start of tag and the first character (not ? or >) + (?: + (?: + [^>] # Anything that's not a > is valid tag material + | + (?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid + )* # Repeat over valid tag material + [^?>] # If we have > 1 char, the last char can't be ? or > + )? # The repeated chars are optional, so that <a>, <p> work + > # Match ending > of opening tag + + (.*) # Match actual contents of tag + + </.*[^?]> # Match ending tag; can't end with ?> and must be >=1 char + $ # Match end of line + ''', re.VERBOSE) + result = re.findall(pattern, text) + if len(result) == 1: + text = self.strip_html(result[0]) + return text + + def has_translatable_content(self, text): + """Check if the supplied HTML snippet has any content that needs to be translated.""" + + text = text.strip() + result = re.findall('(?i).*(charset.*=.*)', text) + if len(result) == 1: + return False + + # TODO: Get a better way to find untranslatable entities. + if text == ' ': + return False + + pattern = '<\?.*?\?>' # Lazily strip all PHP + result = re.sub(pattern, '', text).strip() + pattern = '<[^>]*>' #Strip all HTML tags + result = re.sub(pattern, '', result).strip() + if result: + return True + else: + return False + +#From here on below, follows the methods of the HTMLParser + + def startblock(self, tag): + self.addhtmlblock(self.currentblock) + self.currentblock = "" + self.currenttag = tag + + def endblock(self): + self.addhtmlblock(self.currentblock) + self.currentblock = "" + self.currenttag = None + + def handle_starttag(self, tag, attrs): + newblock = 0 + if tag in self.markingtags: + newblock = 1 + for attrname, attrvalue in attrs: + if attrname in self.markingattrs: + newblock = 1 + if attrname in self.includeattrs: + self.addhtmlblock(attrvalue) + + if newblock: + self.startblock(tag) + elif self.currenttag is not None: + self.currentblock += self.get_starttag_text() + + def handle_startendtag(self, tag, attrs): + for attrname, attrvalue in attrs: + if attrname in self.includeattrs: + self.addhtmlblock(attrvalue) + if self.currenttag is not None: + self.currentblock += self.get_starttag_text() + + def handle_endtag(self, tag): + if tag == self.currenttag: + self.endblock() + elif self.currenttag is not None: + self.currentblock += '</%s>' % tag + + def handle_data(self, data): + if self.currenttag is not None: + self.currentblock += data + elif self.includeuntaggeddata: + self.startblock(None) + self.currentblock += data + + def handle_charref(self, name): + self.handle_data("&#%s;" % name) + + def handle_entityref(self, name): + self.handle_data("&%s;" % name) + + def handle_comment(self, data): + # we don't do anything with comments + pass + + def handle_pi(self, data): + self.handle_data("<?%s>" % data) + +class POHTMLParser(htmlfile): + pass + |