# translate-toolkit-1.5.1/translate/storage/html.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2004-2006,2008 Zuza Software Foundation
# 
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

"""module for parsing html files for translation"""

import re
from translate.storage import base
from HTMLParser import HTMLParser

class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    The text is stored in ``self.text`` with ``&`` and ``<`` escaped as HTML
    entities; the ``source`` property converts on the way in and out.
    """

    def __init__(self, source=None):
        """Create a unit holding *source* (may be None) and no locations."""
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the unescaped text with line breaks folded into spaces.

        Returns None when no source has been set.
        """
        #TODO: Rethink how clever we should try to be with html entities.
        if self.text is None:
            return None
        unescaped = self.text.replace("&amp;", "&").replace("&lt;", "<")
        return unescaped.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def setsource(self, source):
        """Store *source* with ``&`` and ``<`` escaped as entities.

        None is accepted (it is the constructor default) and stored as-is
        rather than crashing on ``None.replace``.
        """
        if source is None:
            self.text = None
            return
        self.text = source.replace("&", "&amp;").replace("<", "&lt;")
    source = property(getsource, setsource)

    def addlocation(self, location):
        """Append *location* to this unit's list of locations."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded for this unit."""
        return self.locations


class htmlfile(HTMLParser, base.TranslationStore):
    """Translation store that extracts translatable blocks from HTML.

    The HTML is fed through HTMLParser; text found inside one of the
    ``markingtags`` is accumulated into a block and turned into a unit when
    the block ends.  Values of the ``includeattrs`` attributes are added as
    units of their own.  PHP sections (``<?...?>``) are swapped for
    placeholders while parsing and restored before a unit is created.
    """
    UnitClass = htmlunit
    # Tags that start a new translatable block.
    markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
    # Attributes whose presence on a tag starts a new block.
    markingattrs = []
    # Attributes whose values are extracted as separate units.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up the parser state and, if given, parse *inputfile*.

        includeuntaggeddata -- when true, text outside any marking tag is
                               also collected into blocks.
        inputfile           -- optional file-like object; it is read in full,
                               closed, and its content parsed.
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currentcomment = ""
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        # Make sure reintrophp() works even if feed() is used without parse().
        self.phpdict = {}
        HTMLParser.__init__(self)

        if inputfile is not None:
            try:
                htmlsrc = inputfile.read()
            finally:
                # Close the file even if reading it fails.
                inputfile.close()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text, or None if not found.

        We look for 'charset=' within a meta tag to do this.
        """
        pattern = r'''(?i)<meta.*content.*=.*charset.*=\s*([^\s]*)\s*["']'''
        matches = re.findall(pattern, htmlsrc)
        if matches:
            return matches[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text properly encoded based on a charset.

        The text is returned unchanged when no charset can be guessed.
        """
        charset = self.guess_encoding(htmlsrc)
        if charset:
            return htmlsrc.decode(charset)
        return htmlsrc

    def phprep(self, text):
        """Replaces all instances of PHP with placeholder tags, and returns
        the new text.  The current implementation replaces <?foo?> with
        <?md5(foo)?>.  The hash => code conversions are stored in
        self.phpdict for later use in restoring the real PHP.

        The purpose of this is to remove all potential "tag-like" code from
        inside PHP.  The hash looks nothing like an HTML tag, but the following
        PHP::
          $a < $b ? $c : ($d > $e ? $f : $g)
        looks like it contains an HTML tag::
          < $b ? $c : ($d >
        to nearly any regex.  Hence, we replace all contents of PHP with simple
        strings to help our regexes out.

        Only the text between the PHP delimiters is replaced; identical text
        elsewhere in the document is left alone.
        """

        from translate.misc import hash

        self.phpdict = {}

        def hide(match):
            # Swap the body of a single PHP section for its md5 hex digest.
            cmd = match.group(1)
            digest = hash.md5_f(cmd).hexdigest()
            self.phpdict[digest] = cmd
            return "<?%s?>" % digest

        # Substituting per match (rather than text.replace(cmd, digest))
        # keeps short or empty PHP bodies from rewriting unrelated text.
        return re.sub(r'(?s)<\?(.*?)\?>', hide, text)

    def reintrophp(self, text):
        """Replaces the PHP placeholders in text with the real code"""
        for digest, code in self.phpdict.items():
            text = text.replace(digest, code)
        return text

    def parse(self, htmlsrc):
        """Decode *htmlsrc*, hide its PHP and feed it to the HTML parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        htmlsrc = self.phprep(htmlsrc) #Clear out the PHP before parsing
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add *text* as a new unit if it has translatable content.

        The unit gets a "filename:blocknumber" location and the comment text
        collected so far as a note.
        """
        text = self.strip_html(text)
        text = self.reintrophp(text) #Before adding anything, restore PHP
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))
            unit.addnote(self.currentcomment)

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if it fully encloses the translatable
        text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occurs within the normal flow of text will not be removed,
        eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        # If all that is left is a single PHP section, return "".  The body
        # may not contain "?>", so that text sitting between two PHP sections
        # ('<?a?> Hello <?b?>') is not mistaken for pure PHP.
        if re.match(r'(?s)^<\?(?:(?!\?>).)*\?>$', text):
            return ""

        pattern = re.compile(r'''
        ^           # Match start of text
        <[^?>]      # Match start of tag and the first character (not ? or >)
        (?:
          (?:
            [^>]    # Anything that's not a > is valid tag material
              |
            (?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid
          )*        # Repeat over valid tag material
          [^?>]     # If we have > 1 char, the last char can't be ? or >
        )?          # The repeated chars are optional, so that <a>, <p> work
        >           # Match ending > of opening tag

        (.*)        # Match actual contents of tag

        </.*[^?]>   # Match ending tag; can't end with ?> and must be >=1 char
        $           # Match end of text
        ''', re.VERBOSE | re.DOTALL)
        enclosed = pattern.match(text)
        if enclosed:
            # Peel off one enclosing tag pair and try again on the contents.
            text = self.strip_html(enclosed.group(1))
        return text

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""

        text = text.strip()
        # Text that looks like a charset declaration is not translatable.
        result = re.findall('(?i).*(charset.*=.*)', text)
        if len(result) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Lazily strip all PHP; DOTALL so multi-line PHP is removed as well.
        remainder = re.sub(r'(?s)<\?.*?\?>', '', text).strip()
        # Strip all HTML tags
        remainder = re.sub('<[^>]*>', '', remainder).strip()
        return bool(remainder)

    # The methods below are the HTMLParser callbacks and their helpers.

    def startblock(self, tag):
        """Flush the current block and start a new one for *tag*."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the current block and leave block-collecting mode."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = None

    def _addattrblocks(self, attrs):
        """Add the values of all ``includeattrs`` attributes as units."""
        for attrname, attrvalue in attrs:
            # Attributes written without a value are reported as None by
            # HTMLParser; there is nothing to translate in those.
            if attrname in self.includeattrs and attrvalue is not None:
                self.addhtmlblock(attrvalue)

    def handle_starttag(self, tag, attrs):
        """Start a new block for marking tags, else append the tag text."""
        newblock = tag in self.markingtags
        for attrname, attrvalue in attrs:
            if attrname in self.markingattrs:
                newblock = True
        self._addattrblocks(attrs)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag: extract attributes, keep the tag text."""
        self._addattrblocks(attrs)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """Close the block if *tag* opened it, else append the end tag."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Append text to the current block.

        Outside of any block the text is only kept when includeuntaggeddata
        is set, in which case it starts a block of its own.
        """
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Keep a numeric character reference verbatim in the block text."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Keep a named entity reference verbatim in the block text."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Collect HTML comments; they become the note of the next unit."""
        # we can place comments above the msgid as translator comments!
        if self.currentcomment == "":
            self.currentcomment = data
        else:
            self.currentcomment += '\n' + data

    def handle_pi(self, data):
        """Keep a processing instruction (e.g. hidden PHP) in the block text."""
        self.handle_data("<?%s>" % data)

class POHTMLParser(htmlfile):
    """htmlfile under another name; adds no behaviour of its own."""