Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/translate-toolkit-1.5.1/translate/storage/placeables/general.py
blob: 8717e5108611babe92c7ca68087f87383a0682cf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""
Contains general placeable implementations. That is, placeables that do not
fit into any other sub-category.
"""

import re

# Public names exported by a star-import of this module.
__all__ = ['AltAttrPlaceable', 'XMLEntityPlaceable', 'XMLTagPlaceable', 'parsers', 'to_general_placeables']

from translate.storage.placeables.base import G, Ph, StringElem


def regex_parse(cls, pstr):
    """Split C{pstr} into placeables and plain text using C{cls.regex}.

    Intended to be bound as the C{parse()} class method of a placeable class
    (C{parse = classmethod(regex_parse)}).

    @param cls: The placeable class; its C{regex} attribute is either C{None}
        or a compiled regular expression.
    @param pstr: The string to search.
    @return: C{None} if C{cls.regex} is C{None} or nothing matched. Otherwise
        a list, in string order, holding a C{cls} instance (constructed with
        a one-item list containing the matched text) for every match and a
        C{StringElem} for every non-empty stretch of text between, before or
        after the matches.
    """
    pattern = cls.regex
    if pattern is None:
        return None

    elements = []
    cursor = 0  # index just past the previously consumed match
    for found in pattern.finditer(pstr):
        begin, finish = found.span()
        if begin != cursor:
            # Unmatched text between the previous match and this one.
            elements.append(StringElem(pstr[cursor:begin]))
        elements.append(cls([pstr[begin:finish]]))
        cursor = finish

    if not elements:
        return None
    if cursor != len(pstr):
        # Trailing text after the last match.
        elements.append(StringElem(pstr[cursor:]))
    return elements


class AltAttrPlaceable(G):
    """Placeable (a L{G} subclass) covering an C{alt="..."} attribute as
    found inside XML tags."""

    # Parsing is delegated to the shared regex-driven parser.
    parse = classmethod(regex_parse)
    # Non-greedy, so the match stops at the first closing double quote.
    regex = re.compile(r'alt=".*?"')


class NewlinePlaceable(Ph):
    """Placeable matching each line-feed character in a string."""

    regex = re.compile(r'\n')
    parse = classmethod(regex_parse)

    # Class-level flags for this placeable type.
    istranslatable = False
    isfragile = True
    iseditable = False


class NumberPlaceable(Ph):
    """Placeable for numbers.

    Matches an optionally signed run of ASCII digits, followed by any number
    of further digit runs that are each introduced by a full stop, a comma or
    a no-break space (U+00A0)."""

    istranslatable = False
    # NOTE: the ur"" prefix is Python 2 only syntax; it lets \u00a0 denote the
    # no-break space inside an otherwise raw string.
    regex = re.compile(ur"[-+]?[0-9]+([\u00a0.,][0-9]+)*")
    parse = classmethod(regex_parse)


class QtFormattingPlaceable(Ph):
    """Placeable for a Qt string formatting variable (C{%1}, C{%L2}, ...).

    Follows the Qt documentation of
    U{QString::arg<http://doc.trolltech.com/4.5/qstring.html#arg>}, where
    these placeables are referred to as 'place markers'.

    Notes:
      - Place markers can be reordered
      - Place markers may be repeated
      - 'L' requests a localised representation, e.g. of a number
      - %% is seen in the wild as an escape for a real %, but it is not
        documented and is not matched by the regex
    """
    regex = re.compile(r"""(?x)
                       %                 # Start of a place marker
                       L?                # The sequence is replaced with a localized representation (optional)
                       [1-9]\d{0,1}      # Place marker numbers must be in the range 1 to 99.
                       (?=([^\d]|$))     # Double check that we aren't matching %100+ (non consuming match)
                       """)
    parse = classmethod(regex_parse)

    # Class-level flags for this placeable type.
    istranslatable = False
    iseditable = False


class PythonFormattingPlaceable(Ph):
    """Placeable representing a Python string formatting variable.

    Implemented following Python documentation on
    U{String Formatting Operations<http://docs.python.org/library/stdtypes.html#string-formatting-operations>}

    The optional mapping key is recognised when it is an ASCII Python
    identifier, so C{%(name)s}, C{%(userName)s} and C{%(item_2)d} are all
    matched. Keys that are not identifiers (e.g. containing spaces or dots)
    are still not recognised."""

    iseditable = False
    istranslatable = False
    # The mapping key used to be limited to [a-z_]+, which rejected keys
    # containing upper-case letters or digits. Every string the old pattern
    # matched is still matched, and the capture groups are unchanged.
    regex = re.compile(r"""(?x)
                       %                     # Start of formatting specifier
                       (%|                   # No argument converted %% creates a %
                       (\([a-zA-Z_][a-zA-Z0-9_]*\)){0,1}    # Mapping key, a Python identifier (optional)
                       [\-\+0\s\#]{0,1}      # Conversion flags (optional)
                       (\d+|\*){0,1}         # Minimum field width (optional)
                       (\.(\d+|\*)){0,1}     # Precision (optional)
                       [hlL]{0,1}            # Length modifier (optional)
                       [diouxXeEfFgGcrs]{1}) # Conversion type""")
    parse = classmethod(regex_parse)


class JavaMessageFormatPlaceable(Ph):
    """Placeable for a Java MessageFormat variable such as C{{0}} or
    C{{1,number,integer}}.

    Implemented according to the Java U{MessageFormat
    documentation<http://java.sun.com/j2se/1.4.2/docs/api/java/text/MessageFormat.html>}.

    Information about custom formats:
      - number - U{DecimalFormat<http://java.sun.com/j2se/1.4.2/docs/api/java/text/DecimalFormat.html>}
      - date/time - U{SimpleDateFormat<http://java.sun.com/j2se/1.4.2/docs/api/java/text/SimpleDateFormat.html>}
      - choice - U{ChoiceFormat<http://java.sun.com/j2se/1.4.2/docs/api/java/text/ChoiceFormat.html>}
    """

    # NOTE(review): the pattern is a plain raw string, so the \u2030 and
    # \u00a4 escapes reach the regex engine verbatim -- confirm that the
    # targeted interpreter's re module resolves them to the intended
    # characters.
    regex = re.compile(r"""(?x)
      {                      # Start of MessageFormat
      [0-9]+                 # Number, positive array reference
      (,\s*                  # FormatType (optional) one of number,date,time,choice
        (number(,\s*(integer|currency|percent|[-0#.,E;%\u2030\u00a4']+)?)?|  # number FormatStyle (optional)
         (date|time)(,\s*(short|medium|long|full|.+?))?|                  # date/time FormatStyle (optional)
         choice,([^{]+({.+})?)+)?                                      # choice with format, format required
      )?                     # END: (optional) FormatType
      }                      # END: MessageFormat""")
    parse = classmethod(regex_parse)

    istranslatable = False
    # TODO: Technically incorrect as you need to change things in a choice
    # entry.
    iseditable = False


class FormattingPlaceable(Ph):
    """Placeable for printf-style string formatting variables.

    See C{man 3 printf} for the full syntax; not every feature described
    there is (or probably needs to be) supported."""

    regex = re.compile(r"""
        %                         # introduction
        (\d+\$)?                  # selection of non-next variable (reordering)
        [\-\+0 \#'I]?             # optional flag
        ((\d+)|[*])?              # field width
        (\.\d+)?                  # precision
        [hlI]?                    # length
        [cCdiouxXeEfgGnpsS]       # conversion specifier
        """, re.VERBOSE)
    parse = classmethod(regex_parse)

    # Class-level flags for this placeable type.
    istranslatable = False
    iseditable = False


class UrlPlaceable(Ph):
    """Placeable handling URI.

    A match starts with either a known scheme followed by C{://} or a
    C{www...}/C{ftp...} host prefix, continues with a hostname or a dotted
    IPv4-style address, and may carry a port and a trailing path. The match
    must be followed by end of string, whitespace or one of a few closing
    punctuation characters."""

    istranslatable = False
    # "nntp" is the real scheme name; the historical misspelling "nttp" is
    # kept in the alternation so that every string matched before is still
    # matched. The number and order of capture groups is unchanged.
    regex = re.compile(r"""
    ((((news|nntp|nttp|file|https?|ftp|irc)://)  # has to start with a protocol
    |((www|ftp)[-A-Za-z0-9]*\.))                 # or www... or ftp... hostname
    ([-A-Za-z0-9]+(\.[-A-Za-z0-9]+)*)            # hostname
    |(\d{1,3}(\.\d{1,3}){3,3}))                  # or IP address
    (:[0-9]{1,5})?                               # optional port
    (/[-A-Za-z0-9_\$\.\+\!\*\(\),;:@&=\?/~\#\%]*)?     # optional trailing path
    (?=$|\s|([]'}>\),\"]))
    """, re.VERBOSE)
    parse = classmethod(regex_parse)


class FilePlaceable(Ph):
    """Placeable for file locations starting with C{~/}, C{/} or C{./}."""

    #TODO: Handle Windows drive letters. Some common Windows paths won't be
    # handled correctly while not allowing spaces, such as
    #     "C:\Documents and Settings"
    #     "C:\Program Files"
    regex = re.compile(r"(~/|/|\./)([-A-Za-z0-9_\$\.\+\!\*\(\),;:@&=\?/~\#\%]|\\){3,}")
    parse = classmethod(regex_parse)

    istranslatable = False


class EmailPlaceable(Ph):
    """Placeable for e-mail addresses, with or without a leading
    C{mailto:}."""

    # TODO: What about internationalised domain names? ;-)
    regex = re.compile(r"((mailto:)|)[A-Za-z0-9]+[-a-zA-Z0-9._%]*@(([-A-Za-z0-9]+)\.)+[a-zA-Z]{2,4}")
    parse = classmethod(regex_parse)

    istranslatable = False


class PunctuationPlaceable(Ph):
    """Placeable handling punctuation.

    A single match covers one or more consecutive characters from the set
    listed in the regex (marks, degree signs, maths symbols, typographic
    quotes, currency signs, ellipsis, dashes and the narrow no-break
    space)."""

    iseditable = False
    istranslatable = False
    # FIXME this should really be a list created as being the inverse of what
    # is available on the translator's keyboard.  Or easily expanded by their
    # configuration.
    # NOTE: the ur''' prefix is Python 2 only syntax.
    regex = re.compile(ur'''([™©®]|          # Marks
                             [℃℉°]|          # Degree related
                             [±πθ×÷−√∞∆Σ′″]| # Maths
                             [‘’ʼ‚‛“”„‟]|    # Quote characters
                             [£¥]|           # Currencies
                             …|              # U2026 - horizontal ellipsis
                             —|              # U2014 - em dash
                             –|              # U2013 - en dash
                             [ ]             # U202F - narrow no-break space
                            )+''', re.VERBOSE)
    parse = classmethod(regex_parse)


class XMLEntityPlaceable(Ph):
    """Placeable for XML entities, i.e. C{&xxxxx;}-style references, both
    named and numeric (decimal or C{x}-prefixed hexadecimal)."""

    regex = re.compile(r'''&(
        ([a-zA-Z][a-zA-Z0-9\.-]*)            #named entity
         |([#](\d{1,5}|x[a-fA-F0-9]{1,5})+)  #numeric entity
        );''', re.VERBOSE)
    parse = classmethod(regex_parse)

    # Class-level flags for this placeable type.
    istranslatable = False
    iseditable = False


class CapsPlaceable(Ph):
    """Placeable for long all-caps strings.

    A match is a capital letter followed by at least two more characters out
    of capitals, digits and C{_ / - : *}, optionally followed by a C{+}."""

    regex = re.compile(r'\b[A-Z][A-Z_/\-:*0-9]{2,}\b[+]?')
    parse = classmethod(regex_parse)

    iseditable = True


class CamelCasePlaceable(Ph):
    """Placeable for camel case words such as C{iPod}, C{OpenTran} or
    C{KBabel}."""

    regex = re.compile(r'''(?x)
            \b(
               [a-z]+[A-Z]|         #Not that strict if we start with lower (iPod)
               [A-Z]+[a-z]+[A-Z]|   #One capital at the start is not enough (OpenTran)
               [A-Z]{2,}[a-z]       #Two capitals at the start is enough    (KBabel)
            )[a-zA-Z0-9]*           #Let's allow any final lower/upper/digit
            \b''')
    parse = classmethod(regex_parse)

    iseditable = True


class SpacesPlaceable(Ph):
    """Placeable for unusual spacing: runs of two or more spaces, and spaces
    at the very start or end of a line."""

    regex = re.compile(r"""(?m)  #Multiline expression
        [ ]{2,}|     #More than two consecutive
        ^[ ]+|       #At start of a line
        [ ]+$        #At end of line""", re.VERBOSE)
    parse = classmethod(regex_parse)

    # Class-level flags for this placeable type.
    istranslatable = False
    iseditable = True


class XMLTagPlaceable(Ph):
    """Placeable for XML tags: opening tags (with optional double-quoted
    attributes), self-closing tags and closing tags."""

    regex = re.compile(r'<(\w+)(\s(\w*=".*?")?)*/?>|</(\w+)>')
    parse = classmethod(regex_parse)

    # Class-level flags for this placeable type.
    istranslatable = False
    iseditable = False


class OptionPlaceable(Ph):
    """Placeable for command line options, e.g. C{-i} or C{--help}."""

    regex = re.compile(r'''(?x)
                      \B(             # Empty string at the start of a non-word, ensures [space]-
                        -[a-zA-Z]|    # Single letter options: -i, -I
                        --[a-z\-]+    # Word options: --help
                      )\b''')
    # Alternative pattern without the leading \B guard (disabled):
    #regex = re.compile(r'''(-[a-zA-Z]|--[-a-z]+)\b''')
    parse = classmethod(regex_parse)

    istranslatable = False


def to_general_placeables(tree, classmap={
        G:      (AltAttrPlaceable,),
        Ph:     (
            NumberPlaceable,
            XMLEntityPlaceable,
            XMLTagPlaceable,
            UrlPlaceable,
            FilePlaceable,
            EmailPlaceable,
            OptionPlaceable,
            PunctuationPlaceable,
                )
        }):
    """Build a copy of C{tree} in which nodes are replaced by instances of
    the more specific placeable classes from this module where possible.

    @param tree: The element to convert. Anything that is not a
        C{StringElem} is returned unchanged.
    @param classmap: Maps a base class to a sequence of candidate classes.
        For every base class that C{tree} is an instance of, the first
        candidate whose C{parse()} gives a true result for the text of
        C{tree} is used for the new node. The default dictionary is only
        read, never modified.
    @return: A new element (of the selected candidate class, or else of the
        same class as C{tree}) carrying the C{id}, C{rid} and C{xid} of
        C{tree}, with every item of C{tree.sub} converted recursively.
    """
    if not isinstance(tree, StringElem):
        return tree

    newtree = None

    for baseclass, gclasslist in classmap.items():
        if isinstance(tree, baseclass):
            text = unicode(tree)
            gclass = [c for c in gclasslist if c.parse(text)]
            if gclass:
                newtree = gclass[0]()

    if newtree is None:
        newtree = tree.__class__()

    newtree.id = tree.id
    newtree.rid = tree.rid
    newtree.xid = tree.xid
    newtree.sub = []

    for subtree in tree.sub:
        # Pass classmap down explicitly; otherwise a caller-supplied map
        # would apply to the root only and the default to everything below.
        newtree.sub.append(to_general_placeables(subtree, classmap))

    return newtree

# The parse methods of the placeable classes defined above.
# The order of these parsers is very important.
parsers = [
    NewlinePlaceable.parse,
    XMLTagPlaceable.parse,
    AltAttrPlaceable.parse,
    XMLEntityPlaceable.parse,
    PythonFormattingPlaceable.parse,
    JavaMessageFormatPlaceable.parse,
    FormattingPlaceable.parse,
    # The Qt variables can consume the %1 in %1$s which will mask a printf
    # placeable, so it has to come later.
    QtFormattingPlaceable.parse,
    UrlPlaceable.parse,
    FilePlaceable.parse,
    EmailPlaceable.parse,
    CapsPlaceable.parse,
    CamelCasePlaceable.parse,
    OptionPlaceable.parse,
    PunctuationPlaceable.parse,
    NumberPlaceable.parse,
]