Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/translate-toolkit-1.3.0/translate/storage/html.py
diff options
context:
space:
mode:
Diffstat (limited to 'translate-toolkit-1.3.0/translate/storage/html.py')
-rw-r--r--translate-toolkit-1.3.0/translate/storage/html.py261
1 files changed, 261 insertions, 0 deletions
diff --git a/translate-toolkit-1.3.0/translate/storage/html.py b/translate-toolkit-1.3.0/translate/storage/html.py
new file mode 100644
index 0000000..4515e7c
--- /dev/null
+++ b/translate-toolkit-1.3.0/translate/storage/html.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2004-2006,2008 Zuza Software Foundation
+#
+# This file is part of translate.
+#
+# translate is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# translate is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with translate; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+"""module for parsing html files for translation"""
+
+import re
+from translate.storage import base
+from HTMLParser import HTMLParser
+
+class htmlunit(base.TranslationUnit):
+ """A unit of translatable/localisable HTML content"""
+ def __init__(self, source=None):
+ self.locations = []
+ self.setsource(source)
+
+ def getsource(self):
+ #TODO: Rethink how clever we should try to be with html entities.
+ return self.text.replace("&amp;", "&").replace("&lt;", "<").replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
+
+ def setsource(self, source):
+ self.text = source.replace("&", "&amp;").replace("<", "&lt;")
+ source = property(getsource, setsource)
+
+ def addlocation(self, location):
+ self.locations.append(location)
+
+ def getlocations(self):
+ return self.locations
+
+
+class htmlfile(HTMLParser, base.TranslationStore):
+ UnitClass = htmlunit
+ markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
+ markingattrs = []
+ includeattrs = ["alt", "summary", "standby", "abbr", "content"]
+
+ def __init__(self, includeuntaggeddata=None, inputfile=None):
+ self.units = []
+ self.filename = getattr(inputfile, 'name', None)
+ self.currentblock = ""
+ self.currentblocknum = 0
+ self.currenttag = None
+ self.includeuntaggeddata = includeuntaggeddata
+ HTMLParser.__init__(self)
+
+ if inputfile is not None:
+ htmlsrc = inputfile.read()
+ inputfile.close()
+ self.parse(htmlsrc)
+
+ def guess_encoding(self, htmlsrc):
+ """Returns the encoding of the html text.
+
+ We look for 'charset=' within a meta tag to do this.
+ """
+
+ pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
+ result = re.findall(pattern, htmlsrc)
+ encoding = None
+ if result:
+ encoding = result[0]
+ return encoding
+
+ def do_encoding(self, htmlsrc):
+ """Return the html text properly encoded based on a charset."""
+ charset = self.guess_encoding(htmlsrc)
+ if charset:
+ return htmlsrc.decode(charset)
+ else:
+ return htmlsrc
+
+ def phprep(self, text):
+ """Replaces all instances of PHP with placeholder tags, and returns
+ the new text and a dictionary of tags. The current implementation
+ replaces <?foo?> with <?md5(foo)?>. The hash => code conversions
+ are stored in self.phpdict for later use in restoring the real PHP.
+
+ The purpose of this is to remove all potential "tag-like" code from
+ inside PHP. The hash looks nothing like an HTML tag, but the following
+ PHP::
+ $a < $b ? $c : ($d > $e ? $f : $g)
+ looks like it contains an HTML tag::
+ < $b ? $c : ($d >
+ to nearly any regex. Hence, we replace all contents of PHP with simple
+ strings to help our regexes out.
+
+ """
+
+ from translate.misc import hash
+
+ self.phpdict = {}
+ result = re.findall('(?s)<\?(.*?)\?>', text)
+ for cmd in result:
+ h = hash.md5_f(cmd).hexdigest()
+ self.phpdict[h] = cmd
+ text = text.replace(cmd,h)
+ return text
+
+ def reintrophp(self, text):
+ """Replaces the PHP placeholders in text with the real code"""
+ for hash, code in self.phpdict.items():
+ text = text.replace(hash, code)
+ return text
+
+ def parse(self, htmlsrc):
+ htmlsrc = self.do_encoding(htmlsrc)
+ htmlsrc = self.phprep(htmlsrc) #Clear out the PHP before parsing
+ self.feed(htmlsrc)
+
+ def addhtmlblock(self, text):
+ text = self.strip_html(text)
+ text = self.reintrophp(text) #Before adding anything, restore PHP
+ if self.has_translatable_content(text):
+ self.currentblocknum += 1
+ unit = self.addsourceunit(text)
+ unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))
+
+ def strip_html(self, text):
+ """Strip unnecessary html from the text.
+
+ HTML tags are deemed unnecessary if it fully encloses the translatable
+ text, eg. '<a href="index.html">Home Page</a>'.
+
+ HTML tags that occurs within the normal flow of text will not be removed,
+ eg. 'This is a link to the <a href="index.html">Home Page</a>.'
+ """
+ text = text.strip()
+
+ # If all that is left is PHP, return ""
+ result = re.findall('(?s)^<\?.*?\?>$', text)
+ if len(result) == 1:
+ return ""
+
+ # These two patterns are the same; the first one is more concise...
+ #pattern = '(?s)^<[^?>](?:(?:[^>]|(?:<\?.*?\?>))*[^?>])?>(.*)</.*[^?]>$'
+ pattern = re.compile(r'''
+ (?s)^ # We allow newlines, and match start of line
+ <[^?>] # Match start of tag and the first character (not ? or >)
+ (?:
+ (?:
+ [^>] # Anything that's not a > is valid tag material
+ |
+ (?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid
+ )* # Repeat over valid tag material
+ [^?>] # If we have > 1 char, the last char can't be ? or >
+ )? # The repeated chars are optional, so that <a>, <p> work
+ > # Match ending > of opening tag
+
+ (.*) # Match actual contents of tag
+
+ </.*[^?]> # Match ending tag; can't end with ?> and must be >=1 char
+ $ # Match end of line
+ ''', re.VERBOSE)
+ result = re.findall(pattern, text)
+ if len(result) == 1:
+ text = self.strip_html(result[0])
+ return text
+
+ def has_translatable_content(self, text):
+ """Check if the supplied HTML snippet has any content that needs to be translated."""
+
+ text = text.strip()
+ result = re.findall('(?i).*(charset.*=.*)', text)
+ if len(result) == 1:
+ return False
+
+ # TODO: Get a better way to find untranslatable entities.
+ if text == '&nbsp;':
+ return False
+
+ pattern = '<\?.*?\?>' # Lazily strip all PHP
+ result = re.sub(pattern, '', text).strip()
+ pattern = '<[^>]*>' #Strip all HTML tags
+ result = re.sub(pattern, '', result).strip()
+ if result:
+ return True
+ else:
+ return False
+
+#From here on below, follows the methods of the HTMLParser
+
+ def startblock(self, tag):
+ self.addhtmlblock(self.currentblock)
+ self.currentblock = ""
+ self.currenttag = tag
+
+ def endblock(self):
+ self.addhtmlblock(self.currentblock)
+ self.currentblock = ""
+ self.currenttag = None
+
+ def handle_starttag(self, tag, attrs):
+ newblock = 0
+ if tag in self.markingtags:
+ newblock = 1
+ for attrname, attrvalue in attrs:
+ if attrname in self.markingattrs:
+ newblock = 1
+ if attrname in self.includeattrs:
+ self.addhtmlblock(attrvalue)
+
+ if newblock:
+ self.startblock(tag)
+ elif self.currenttag is not None:
+ self.currentblock += self.get_starttag_text()
+
+ def handle_startendtag(self, tag, attrs):
+ for attrname, attrvalue in attrs:
+ if attrname in self.includeattrs:
+ self.addhtmlblock(attrvalue)
+ if self.currenttag is not None:
+ self.currentblock += self.get_starttag_text()
+
+ def handle_endtag(self, tag):
+ if tag == self.currenttag:
+ self.endblock()
+ elif self.currenttag is not None:
+ self.currentblock += '</%s>' % tag
+
+ def handle_data(self, data):
+ if self.currenttag is not None:
+ self.currentblock += data
+ elif self.includeuntaggeddata:
+ self.startblock(None)
+ self.currentblock += data
+
+ def handle_charref(self, name):
+ self.handle_data("&#%s;" % name)
+
+ def handle_entityref(self, name):
+ self.handle_data("&%s;" % name)
+
+ def handle_comment(self, data):
+ # we don't do anything with comments
+ pass
+
+ def handle_pi(self, data):
+ self.handle_data("<?%s>" % data)
+
+class POHTMLParser(htmlfile):
+ pass
+