diff options
Diffstat (limited to 'genshi/filters')
-rw-r--r-- | genshi/filters/__init__.py | 20 | ||||
-rw-r--r-- | genshi/filters/html.py | 453 | ||||
-rw-r--r-- | genshi/filters/i18n.py | 1238 | ||||
-rw-r--r-- | genshi/filters/transform.py | 1310 |
4 files changed, 3021 insertions, 0 deletions
diff --git a/genshi/filters/__init__.py b/genshi/filters/__init__.py new file mode 100644 index 0000000..efc2565 --- /dev/null +++ b/genshi/filters/__init__.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2007-2009 Edgewall Software +# All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://genshi.edgewall.org/wiki/License. +# +# This software consists of voluntary contributions made by many +# individuals. For the exact contribution history, see the revision +# history and logs, available at http://genshi.edgewall.org/log/. + +"""Implementation of a number of stream filters.""" + +from genshi.filters.html import HTMLFormFiller, HTMLSanitizer +from genshi.filters.i18n import Translator +from genshi.filters.transform import Transformer + +__docformat__ = 'restructuredtext en' diff --git a/genshi/filters/html.py b/genshi/filters/html.py new file mode 100644 index 0000000..d554a54 --- /dev/null +++ b/genshi/filters/html.py @@ -0,0 +1,453 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2006-2009 Edgewall Software +# All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://genshi.edgewall.org/wiki/License. +# +# This software consists of voluntary contributions made by many +# individuals. For the exact contribution history, see the revision +# history and logs, available at http://genshi.edgewall.org/log/. 
class HTMLFormFiller(object):
    """A stream filter that can populate HTML forms from a dictionary of values.
    
    >>> from genshi.input import HTML
    >>> html = HTML('''<form>
    ...   <p><input type="text" name="foo" /></p>
    ... </form>''')
    >>> filler = HTMLFormFiller(data={'foo': 'bar'})
    >>> print(html | filler)
    <form>
      <p><input type="text" name="foo" value="bar"/></p>
    </form>
    """
    # TODO: only select the first radio button, and the first select option
    #       (if not in a multiple-select)
    # TODO: only apply to elements in the XHTML namespace (or no namespace)?

    def __init__(self, name=None, id=None, data=None, passwords=False):
        """Create the filter.
        
        :param name: The name of the form that should be populated. If this
                     parameter is given, only forms where the ``name`` attribute
                     value matches the parameter are processed.
        :param id: The ID of the form that should be populated. If this
                   parameter is given, only forms where the ``id`` attribute
                   value matches the parameter are processed.
        :param data: The dictionary of form values, where the keys are the names
                     of the form fields, and the values are the values to fill
                     in.
        :param passwords: Whether password input fields should be populated.
                          This is off by default for security reasons (for
                          example, a password may end up in the browser cache)
        :note: Changed in 0.5.2: added the `passwords` option
        """
        self.name = name
        self.id = id
        if data is None:
            data = {}
        self.data = data
        self.passwords = passwords

    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        Events outside a matching ``<form>`` are passed through untouched.
        Inside a matching form:

        * ``<input>`` elements of type checkbox/radio get their ``checked``
          attribute set or removed, text-like inputs get a ``value``
          attribute;
        * ``<option>`` start tags (and their text) are buffered until the
          matching end tag, because an option without a ``value`` attribute
          uses its text content as its value;
        * the existing text of a ``<textarea>`` that has an entry in the data
          dictionary is dropped and replaced by the supplied value.
        
        :param stream: the markup event stream to filter
        """
        in_form = in_select = in_option = in_textarea = False
        select_value = option_value = textarea_value = None
        option_start = None
        option_text = []
        no_option_value = False

        for kind, data, pos in stream:

            if kind is START:
                tag, attrs = data
                tagname = tag.localname

                # A form matches by name, by id, or unconditionally when the
                # filter was configured with neither
                if tagname == 'form' and (
                        self.name and attrs.get('name') == self.name or
                        self.id and attrs.get('id') == self.id or
                        not (self.id or self.name)):
                    in_form = True

                elif in_form:
                    if tagname == 'input':
                        input_type = attrs.get('type', '').lower()
                        if input_type in ('checkbox', 'radio'):
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                declval = attrs.get('value')
                                checked = False
                                if isinstance(value, (list, tuple)):
                                    if declval:
                                        checked = declval in [unicode(v) for v
                                                              in value]
                                    else:
                                        checked = any(value)
                                else:
                                    if declval:
                                        checked = declval == unicode(value)
                                    elif input_type == 'checkbox':
                                        checked = bool(value)
                                if checked:
                                    attrs |= [(QName('checked'), 'checked')]
                                elif 'checked' in attrs:
                                    attrs -= 'checked'
                        elif input_type in ('', 'hidden', 'text') \
                                or input_type == 'password' and self.passwords:
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                if isinstance(value, (list, tuple)):
                                    value = value[0]
                                if value is not None:
                                    attrs |= [
                                        (QName('value'), unicode(value))
                                    ]
                    elif tagname == 'select':
                        name = attrs.get('name')
                        if name in self.data:
                            select_value = self.data[name]
                            in_select = True
                    elif tagname == 'textarea':
                        name = attrs.get('name')
                        if name in self.data:
                            textarea_value = self.data.get(name)
                            if isinstance(textarea_value, (list, tuple)):
                                textarea_value = textarea_value[0]
                            in_textarea = True
                    elif in_select and tagname == 'option':
                        # Hold the start tag back: whether the option is
                        # selected is only known once its text has been seen
                        option_start = kind, data, pos
                        option_value = attrs.get('value')
                        if option_value is None:
                            no_option_value = True
                            option_value = ''
                        in_option = True
                        continue
                yield kind, (tag, attrs), pos

            elif in_form and kind is TEXT:
                if in_select and in_option:
                    if no_option_value:
                        option_value += data
                    option_text.append((kind, data, pos))
                    continue
                elif in_textarea:
                    # Original textarea content is replaced at the end tag
                    continue
                yield kind, data, pos

            elif in_form and kind is END:
                tagname = data.localname
                if tagname == 'form':
                    in_form = False
                elif tagname == 'select':
                    in_select = False
                    select_value = None
                elif in_select and tagname == 'option':
                    if isinstance(select_value, (tuple, list)):
                        selected = option_value in [unicode(v) for v
                                                    in select_value]
                    else:
                        selected = option_value == unicode(select_value)
                    okind, (tag, attrs), opos = option_start
                    if selected:
                        attrs |= [(QName('selected'), 'selected')]
                    elif 'selected' in attrs:
                        attrs -= 'selected'
                    # Flush the buffered start tag followed by its text
                    yield okind, (tag, attrs), opos
                    for event in option_text:
                        yield event
                    in_option = False
                    no_option_value = False
                    option_start = option_value = None
                    option_text = []
                elif tagname == 'textarea':
                    if textarea_value:
                        yield TEXT, unicode(textarea_value), pos
                    # Forget the value so that it cannot leak into a later
                    # textarea that has no entry in the data dictionary
                    textarea_value = None
                    in_textarea = False
                yield kind, data, pos

            else:
                yield kind, data, pos
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.
    
    >>> from genshi import HTML
    >>> html = HTML('<div><script>alert(document.cookie)</script></div>')
    >>> print(html | HTMLSanitizer())
    <div/>
    
    The default set of safe tags and attributes can be modified when the filter
    is instantiated. For example, to allow inline ``style`` attributes, the
    following instantiation would work:
    
    >>> html = HTML('<div style="background: #000"></div>')
    >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
    >>> print(html | sanitizer)
    <div style="background: #000"/>
    
    Note that even in this case, the filter *does* attempt to remove dangerous
    constructs from style attributes:

    >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>')
    >>> print(html | sanitizer)
    <div style="color: #000"/>
    
    This handles HTML entities, unicode escapes in CSS and Javascript text, as
    well as a lot of other things. However, the style tag is still excluded by
    default because it is very hard for such sanitizing to be completely safe,
    especially considering how much error recovery current web browsers perform.
    
    It also does some basic filtering of CSS properties that may be used for
    typical phishing attacks. For more sophisticated filtering, this class
    provides a couple of hooks that can be overridden in sub-classes.
    
    :warn: Note that this special processing of CSS is currently only applied to
           style attributes, **not** style elements.
    """

    SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
        'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

    SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])

    def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
                 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS):
        """Create the sanitizer.
        
        The exact set of allowed elements and attributes can be configured.
        
        :param safe_tags: a set of tag names that are considered safe
        :param safe_attrs: a set of attribute names that are considered safe
        :param safe_schemes: a set of URI schemes that are considered safe
        :param uri_attrs: a set of names of attributes that contain URIs
        """
        self.safe_tags = safe_tags
        "The set of tag names that are considered safe."
        self.safe_attrs = safe_attrs
        "The set of attribute names that are considered safe."
        self.uri_attrs = uri_attrs
        "The set of names of attributes that may contain URIs."
        self.safe_schemes = safe_schemes
        "The set of URI schemes that are considered safe."

    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        Elements rejected by `is_safe_elem` are dropped together with
        everything up to their matching end tag. On the remaining elements,
        attributes that are not whitelisted, URI attributes with an unsafe
        scheme and ``style`` attributes without any safe declaration are
        removed. Comments are never passed through.
        
        :param stream: the markup event stream to filter
        """
        # Tag name of the unsafe element currently being skipped, if any
        waiting_for = None

        for kind, data, pos in stream:
            if kind is START:
                if waiting_for:
                    continue
                tag, attrs = data
                if not self.is_safe_elem(tag, attrs):
                    waiting_for = tag
                    continue

                new_attrs = []
                for attr, value in attrs:
                    value = stripentities(value)
                    if attr not in self.safe_attrs:
                        continue
                    elif attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if not self.is_safe_uri(value):
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self.sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                tag = data
                if waiting_for:
                    if waiting_for == tag:
                        waiting_for = None
                else:
                    yield kind, data, pos

            elif kind is not COMMENT:
                if not waiting_for:
                    yield kind, data, pos

    def is_safe_css(self, propname, value):
        """Determine whether the given css property declaration is to be
        considered safe for inclusion in the output.
        
        :param propname: the CSS property name
        :param value: the value of the property
        :return: whether the property value should be considered safe
        :rtype: bool
        :since: version 0.6
        """
        if propname == 'position':
            return False
        if propname.startswith('margin') and '-' in value:
            # Negative margins can be used for phishing
            return False
        return True

    def is_safe_elem(self, tag, attrs):
        """Determine whether the given element should be considered safe for
        inclusion in the output.
        
        :param tag: the tag name of the element
        :type tag: QName
        :param attrs: the element attributes
        :type attrs: Attrs
        :return: whether the element should be considered safe
        :rtype: bool
        :since: version 0.6
        """
        if tag not in self.safe_tags:
            return False
        if tag.localname == 'input':
            input_type = attrs.get('type', '').lower()
            if input_type == 'password':
                return False
        return True

    def is_safe_uri(self, uri):
        """Determine whether the given URI is to be considered safe for
        inclusion in the output.
        
        The default implementation checks whether the scheme of the URI is in
        the set of allowed URIs (`safe_schemes`).
        
        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.is_safe_uri('http://example.org/')
        True
        >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)')
        False
        
        :param uri: the URI to check
        :return: `True` if the URI can be considered safe, `False` otherwise
        :rtype: `bool`
        :since: version 0.4.3
        """
        if '#' in uri:
            uri = uri.split('#', 1)[0] # Strip out the fragment identifier
        if ':' not in uri:
            return True # This is a relative URI
        # Non-alphanumeric characters are dropped before the comparison, so
        # that e.g. embedded whitespace or quotes cannot disguise the scheme
        chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower() in self.safe_schemes

    def sanitize_css(self, text):
        """Remove potentially dangerous property declarations from CSS code.
        
        In particular, properties using the CSS ``url()`` function with a scheme
        that is not considered safe are removed:
        
        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.sanitize_css(u'''
        ...   background: url(javascript:alert("foo"));
        ...   color: #000;
        ... ''')
        [u'color: #000']
        
        Also, the proprietary Internet Explorer function ``expression()`` is
        always stripped:
        
        >>> sanitizer.sanitize_css(u'''
        ...   background: #fff;
        ...   color: #000;
        ...   width: e/**/xpression(alert("foo"));
        ... ''')
        [u'background: #fff', u'color: #000']
        
        Both checks ignore the case of the function name, so ``URL(...)`` and
        ``EXPRESSION(...)`` are treated like their lower-case spellings.
        
        :param text: the CSS text; this is expected to be `unicode` and to not
                     contain any character or numeric references
        :return: a list of declarations that are considered safe
        :rtype: `list`
        :since: version 0.4.3
        """
        decls = []
        text = self._strip_css_comments(self._replace_unicode_escapes(text))
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            try:
                propname, value = decl.split(':', 1)
            except ValueError:
                # Not a "name: value" declaration at all
                continue
            if not self.is_safe_css(propname.strip().lower(), value.strip()):
                continue
            # CSS function names are case-insensitive, so the test must not
            # depend on how "expression" happens to be capitalized
            if 'expression' in value.lower():
                continue
            is_evil = False
            for match in self._URL_FINDITER(value):
                if not self.is_safe_uri(match.group(1)):
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return decls

    _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
    _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub
    # Finds url(...) references regardless of the case of "url"
    _URL_FINDITER = re.compile(r'url\s*\(([^)]+)', re.I).finditer

    def _replace_unicode_escapes(self, text):
        """Return `text` with CR/LF pairs normalized to LF and CSS hexadecimal
        escapes (a backslash followed by 1-6 hex digits) replaced by the
        character they denote."""
        def _repl(match):
            return unichr(int(match.group(1), 16))
        return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))

    # DOTALL: a comment may contain newlines and must still be removed
    _CSS_COMMENTS = re.compile(r'/\*.*?\*/', re.S).sub

    def _strip_css_comments(self, text):
        """Return `text` with all ``/* ... */`` comments removed."""
        return self._CSS_COMMENTS('', text)
class I18NDirective(Directive):
    """Common base class of the i18n template directives.

    Calling an instance adds no processing of its own; the stream is simply
    handed on to the remaining directives.
    """

    def __call__(self, stream, directives, ctxt, **vars):
        """Apply the remaining `directives` to `stream`.

        :param stream: the event stream the directive is attached to
        :param directives: the directives still to be applied
        :param ctxt: the template context
        :return: the result of `_apply_directives` for these arguments
        """
        return _apply_directives(stream, directives, ctxt, vars)


class ExtractableI18NDirective(I18NDirective):
    """Base class of the i18n directives that take part in message
    extraction; sub-classes have to implement `extract`."""

    def extract(self, translator, stream, gettext_functions=GETTEXT_FUNCTIONS,
                search_text=True, comment_stack=None):
        """Extract the messages found in `stream`.

        This base implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError()
class CommentDirective(I18NDirective):
    """Implementation of the ``i18n:comment`` template directive which adds
    translation comments.
    
    >>> tmpl = MarkupTemplate('''<html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <p i18n:comment="As in Foo Bar">Foo</p>
    ... </html>''')
    >>> translator = Translator()
    >>> translator.setup(tmpl)
    >>> list(translator.extract(tmpl.stream))
    [(2, None, u'Foo', [u'As in Foo Bar'])]
    """
    __slots__ = ['comment']

    def __init__(self, value, template=None, namespaces=None, lineno=-1,
                 offset=-1):
        """Store the comment text.

        The directive value is kept verbatim in `comment`; the base class is
        initialized with ``None`` in place of the value.
        """
        Directive.__init__(self, None, template, namespaces, lineno, offset)
        self.comment = value
class MsgDirective(ExtractableI18NDirective):
    r"""Implementation of the ``i18n:msg`` directive which marks inner content
    as translatable. Consider the following examples:
    
    >>> tmpl = MarkupTemplate('''<html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <div i18n:msg="">
    ...     <p>Foo</p>
    ...     <p>Bar</p>
    ...   </div>
    ...   <p i18n:msg="">Foo <em>bar</em>!</p>
    ... </html>''')
    
    >>> translator = Translator()
    >>> translator.setup(tmpl)
    >>> list(translator.extract(tmpl.stream))
    [(2, None, u'[1:Foo]\n    [2:Bar]', []), (6, None, u'Foo [1:bar]!', [])]
    >>> print(tmpl.generate().render())
    <html>
      <div><p>Foo</p>
        <p>Bar</p></div>
      <p>Foo <em>bar</em>!</p>
    </html>

    >>> tmpl = MarkupTemplate('''<html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <div i18n:msg="fname, lname">
    ...     <p>First Name: ${fname}</p>
    ...     <p>Last Name: ${lname}</p>
    ...   </div>
    ...   <p i18n:msg="">Foo <em>bar</em>!</p>
    ... </html>''')
    >>> translator.setup(tmpl)
    >>> list(translator.extract(tmpl.stream)) #doctest: +NORMALIZE_WHITESPACE
    [(2, None, u'[1:First Name: %(fname)s]\n    [2:Last Name: %(lname)s]', []),
    (6, None, u'Foo [1:bar]!', [])]

    >>> tmpl = MarkupTemplate('''<html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <div i18n:msg="fname, lname">
    ...     <p>First Name: ${fname}</p>
    ...     <p>Last Name: ${lname}</p>
    ...   </div>
    ...   <p i18n:msg="">Foo <em>bar</em>!</p>
    ... </html>''')
    >>> translator.setup(tmpl)
    >>> print(tmpl.generate(fname='John', lname='Doe').render())
    <html>
      <div><p>First Name: John</p>
        <p>Last Name: Doe</p></div>
      <p>Foo <em>bar</em>!</p>
    </html>

    Starting and ending white-space is stripped off to make it simpler for
    translators. Stripping it is not that important since it's on the html
    source, the rendered output will remain the same.
    """
    __slots__ = ['params', 'lineno']

    def __init__(self, value, template=None, namespaces=None, lineno=-1,
                 offset=-1):
        """Parse the comma-separated list of parameter names.

        :param value: the directive value, e.g. ``"fname, lname"``; entries
                      that are empty or consist only of white-space are
                      ignored
        :param lineno: the line number, also reported by `extract`
        """
        Directive.__init__(self, None, template, namespaces, lineno, offset)
        # Filter on the stripped name so that input such as "fname, " does
        # not produce an empty parameter name
        self.params = [param.strip() for param in value.split(',')
                       if param.strip()]
        self.lineno = lineno

    @classmethod
    def attach(cls, template, stream, value, namespaces, pos):
        """Attach the directive to the template.

        When the directive is used as an element, `value` is the dictionary
        of its attributes and the parameter list is read from ``params``.
        """
        if isinstance(value, dict):
            value = value.get('params', '').strip()
        return super(MsgDirective, cls).attach(template, stream, value.strip(),
                                               namespaces, pos)

    def __call__(self, stream, directives, ctxt, **vars):
        """Translate the content of the element the directive is applied to.

        The translation function is looked up in the context under
        ``_i18n.gettext``; when ``_i18n.domain`` is set, the function stored
        under ``_i18n.dgettext`` is used with that domain instead.
        """
        gettext = ctxt.get('_i18n.gettext')
        if ctxt.get('_i18n.domain'):
            dgettext = ctxt.get('_i18n.dgettext')
            assert hasattr(dgettext, '__call__'), \
                'No domain gettext function passed'
            gettext = lambda msg: dgettext(ctxt.get('_i18n.domain'), msg)

        def _generate():
            msgbuf = MessageBuffer(self)
            previous = stream.next()
            if previous[0] is START:
                # The enclosing start tag is not part of the message
                yield previous
            else:
                msgbuf.append(*previous)
            previous = stream.next()
            # Always stay one event behind, so that the final event can be
            # inspected before deciding whether it belongs to the message
            for kind, data, pos in stream:
                msgbuf.append(*previous)
                previous = kind, data, pos
            if previous[0] is not END:
                msgbuf.append(*previous)
                previous = None
            for event in msgbuf.translate(gettext(msgbuf.format())):
                yield event
            if previous:
                # The enclosing end tag
                yield previous

        return _apply_directives(_generate(), directives, ctxt, vars)

    def extract(self, translator, stream, gettext_functions=GETTEXT_FUNCTIONS,
                search_text=True, comment_stack=None):
        """Yield the messages found in `stream`.

        Messages found in the attributes of start tags (via
        ``translator._extract_attrs``) are yielded first, followed by one
        ``(lineno, None, message, comments)`` tuple for the directive content,
        where `comments` holds at most the last entry of `comment_stack`.

        :param comment_stack: the list of translator comments collected so
                              far; ``None`` is treated as an empty list
        """
        if comment_stack is None:
            # The default must not be sliced as-is below
            comment_stack = []
        msgbuf = MessageBuffer(self)
        strip = False

        stream = iter(stream)
        previous = stream.next()
        if previous[0] is START:
            for message in translator._extract_attrs(previous,
                                                     gettext_functions,
                                                     search_text=search_text):
                yield message
            previous = stream.next()
            strip = True
        for event in stream:
            if event[0] is START:
                for message in translator._extract_attrs(event,
                                                         gettext_functions,
                                                         search_text=search_text):
                    yield message
            msgbuf.append(*previous)
            previous = event
        if not strip:
            msgbuf.append(*previous)

        yield self.lineno, None, msgbuf.format(), comment_stack[-1:]
class ChooseBranchDirective(I18NDirective):
    """Shared implementation of the two branches (``i18n:singular`` and
    ``i18n:plural``) of an ``i18n:choose`` directive.

    The branch content is collected in a `MessageBuffer` which is stored in
    the context under ``_i18n.choose.<tagname>``; the generated stream
    carries a ``MSGBUF`` event as placeholder for the buffer's output.
    """
    __slots__ = ['params']

    def __call__(self, stream, directives, ctxt, **vars):
        # The parameter names come from the enclosing choose directive; they
        # are copied before the message buffer is created for this directive
        self.params = ctxt.get('_i18n.choose.params', [])[:]
        msgbuf = MessageBuffer(self)
        stream = _apply_directives(stream, directives, ctxt, vars)

        pending = stream.next()
        if pending[0] is START:
            yield pending
        else:
            msgbuf.append(*pending)

        try:
            pending = stream.next()
        except StopIteration:
            # For example <i18n:singular> or <i18n:plural> directives
            yield MSGBUF, (), -1 # the place holder for msgbuf output
            ctxt['_i18n.choose.%s' % self.tagname] = msgbuf
            return

        # Buffer everything except the very last event
        for current in stream:
            msgbuf.append(*pending)
            pending = current
        yield MSGBUF, (), -1 # the place holder for msgbuf output

        if pending[0] is END:
            yield pending # the outer end tag
        else:
            msgbuf.append(*pending)
        ctxt['_i18n.choose.%s' % self.tagname] = msgbuf

    def extract(self, translator, stream, gettext_functions=GETTEXT_FUNCTIONS,
                search_text=True, comment_stack=None, msgbuf=None):
        """Append the branch content to `msgbuf`, yielding the messages that
        ``translator._extract_attrs`` finds in the attributes of start tags.

        :param msgbuf: the message buffer the events are appended to
        """
        stream = iter(stream)
        pending = stream.next()

        if pending[0] is START:
            # skip the enclosing element
            for message in translator._extract_attrs(pending,
                                                     gettext_functions,
                                                     search_text=search_text):
                yield message
            pending = stream.next()

        for current in stream:
            if pending[0] is START:
                for message in translator._extract_attrs(pending,
                                                         gettext_functions,
                                                         search_text=search_text):
                    yield message
            msgbuf.append(*pending)
            pending = current

        # A trailing END event is the enclosing element's end tag and is not
        # added to the buffer
        if pending[0] is not END:
            msgbuf.append(*pending)


class SingularDirective(ChooseBranchDirective):
    """Implementation of the ``i18n:singular`` directive to be used with the
    ``i18n:choose`` directive."""


class PluralDirective(ChooseBranchDirective):
    """Implementation of the ``i18n:plural`` directive to be used with the
    ``i18n:choose`` directive."""
class ChooseDirective(ExtractableI18NDirective):
    """Implementation of the ``i18n:choose`` directive which provides plural
    internationalisation of strings.
    
    This directive requires at least one parameter, the one which evaluates to
    an integer which will allow to choose the plural/singular form. If you also
    have expressions inside the singular and plural version of the string you
    also need to pass a name for those parameters. Consider the following
    examples:
    
    >>> tmpl = MarkupTemplate('''\
        <html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <div i18n:choose="num; num">
    ...     <p i18n:singular="">There is $num coin</p>
    ...     <p i18n:plural="">There are $num coins</p>
    ...   </div>
    ... </html>''')
    >>> translator = Translator()
    >>> translator.setup(tmpl)
    >>> list(translator.extract(tmpl.stream)) #doctest: +NORMALIZE_WHITESPACE
    [(2, 'ngettext', (u'There is %(num)s coin',
                      u'There are %(num)s coins'), [])]

    >>> tmpl = MarkupTemplate('''\
        <html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <div i18n:choose="num; num">
    ...     <p i18n:singular="">There is $num coin</p>
    ...     <p i18n:plural="">There are $num coins</p>
    ...   </div>
    ... </html>''')
    >>> translator.setup(tmpl)
    >>> print(tmpl.generate(num=1).render())
    <html>
      <div>
        <p>There is 1 coin</p>
      </div>
    </html>
    >>> print(tmpl.generate(num=2).render())
    <html>
      <div>
        <p>There are 2 coins</p>
      </div>
    </html>

    When used as an element and not as an attribute:

    >>> tmpl = MarkupTemplate('''\
        <html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <i18n:choose numeral="num" params="num">
    ...     <p i18n:singular="">There is $num coin</p>
    ...     <p i18n:plural="">There are $num coins</p>
    ...   </i18n:choose>
    ... </html>''')
    >>> translator.setup(tmpl)
    >>> list(translator.extract(tmpl.stream)) #doctest: +NORMALIZE_WHITESPACE
    [(2, 'ngettext', (u'There is %(num)s coin',
                      u'There are %(num)s coins'), [])]
    """
    __slots__ = ['numeral', 'params', 'lineno']

    def __init__(self, value, template=None, namespaces=None, lineno=-1,
                 offset=-1):
        """Parse the directive value.

        :param value: ``"<numeral expression>; <name>, <name>, ..."``; the
                      part after the semicolon is optional, and names that
                      are empty or consist only of white-space are ignored
        :param lineno: the line number, also reported by `extract`
        """
        Directive.__init__(self, None, template, namespaces, lineno, offset)
        params = [v.strip() for v in value.split(';')]
        self.numeral = self._parse_expr(params.pop(0), template, lineno, offset)
        names = []
        if params:
            names = [name.strip() for name in params[0].split(',')
                     if name.strip()]
        self.params = names
        self.lineno = lineno

    @classmethod
    def attach(cls, template, stream, value, namespaces, pos):
        """Attach the directive to the template.

        When the directive is used as an element, `value` is the dictionary
        of its attributes; ``numeral`` and ``params`` are then combined into
        the ``"numeral; params"`` form understood by `__init__`.
        """
        if isinstance(value, dict):
            numeral = value.get('numeral', '').strip()
            # Compare by value: identity with a string literal is an
            # implementation detail of the interpreter
            assert numeral != '', "at least pass the numeral param"
            params = [v.strip() for v in value.get('params', '').split(',')]
            value = '%s; ' % numeral + ', '.join(params)
        return super(ChooseDirective, cls).attach(template, stream, value,
                                                  namespaces, pos)

    def __call__(self, stream, directives, ctxt, **vars):
        """Render either the singular or the plural branch, translated.

        The branches are rendered first (each leaves its message buffer in
        the context), then the placeholder of the chosen branch is replaced
        by the translation obtained from ``ngettext``, or from ``dngettext``
        when ``_i18n.domain`` is set in the context.
        """
        ctxt.push({'_i18n.choose.params': self.params,
                   '_i18n.choose.singular': None,
                   '_i18n.choose.plural': None})

        ngettext = ctxt.get('_i18n.ngettext')
        assert hasattr(ngettext, '__call__'), 'No ngettext function available'
        dngettext = ctxt.get('_i18n.dngettext')
        if not dngettext:
            # Bind the plain ngettext as a default argument: the name
            # `ngettext` is rebound below when a domain is active, and a
            # late-binding closure would then end up calling itself
            dngettext = lambda d, s, p, n, _ngettext=ngettext: \
                _ngettext(s, p, n)

        new_stream = []
        singular_stream = None
        singular_msgbuf = None
        plural_stream = None
        plural_msgbuf = None

        numeral = self.numeral.evaluate(ctxt)
        is_plural = self._is_plural(numeral, ngettext)

        for event in stream:
            if event[0] is SUB and any(isinstance(d, ChooseBranchDirective)
                                       for d in event[1][0]):
                subdirectives, substream = event[1]

                if isinstance(subdirectives[0], SingularDirective):
                    singular_stream = list(_apply_directives(substream,
                                                             subdirectives,
                                                             ctxt, vars))
                    # Marks where the chosen branch is to be inserted
                    new_stream.append((MSGBUF, None, (None, -1, -1)))

                elif isinstance(subdirectives[0], PluralDirective):
                    if is_plural:
                        plural_stream = list(_apply_directives(substream,
                                                               subdirectives,
                                                               ctxt, vars))

            else:
                new_stream.append(event)

        if ctxt.get('_i18n.domain'):
            ngettext = lambda s, p, n: dngettext(ctxt.get('_i18n.domain'),
                                                 s, p, n)

        singular_msgbuf = ctxt.get('_i18n.choose.singular')
        if is_plural:
            plural_msgbuf = ctxt.get('_i18n.choose.plural')
            msgbuf, choice = plural_msgbuf, plural_stream
        else:
            msgbuf, choice = singular_msgbuf, singular_stream
            # The plural branch was not rendered in this case
            plural_msgbuf = MessageBuffer(self)

        for kind, data, pos in new_stream:
            if kind is MSGBUF:
                for event in choice:
                    if event[0] is MSGBUF:
                        translation = ngettext(singular_msgbuf.format(),
                                               plural_msgbuf.format(),
                                               numeral)
                        for subevent in msgbuf.translate(translation):
                            yield subevent
                    else:
                        yield event
            else:
                yield kind, data, pos

        ctxt.pop()

    def extract(self, translator, stream, gettext_functions=GETTEXT_FUNCTIONS,
                search_text=True, comment_stack=None):
        """Yield the messages found in `stream`.

        Messages found in the attributes of start tags are yielded first,
        followed by one ``(lineno, 'ngettext', (singular, plural), comments)``
        tuple, where `comments` holds at most the last entry of
        `comment_stack`.

        :param comment_stack: the list of translator comments collected so
                              far; ``None`` is treated as an empty list
        """
        if comment_stack is None:
            # The default must not be sliced as-is below
            comment_stack = []
        strip = False
        stream = iter(stream)
        previous = stream.next()

        if previous[0] is START:
            # skip the enclosing element
            for message in translator._extract_attrs(previous,
                                                     gettext_functions,
                                                     search_text=search_text):
                yield message
            previous = stream.next()
            strip = True

        singular_msgbuf = MessageBuffer(self)
        plural_msgbuf = MessageBuffer(self)

        for event in stream:
            if previous[0] is SUB:
                directives, substream = previous[1]
                for directive in directives:
                    if isinstance(directive, SingularDirective):
                        for message in directive.extract(translator,
                                substream, gettext_functions, search_text,
                                comment_stack, msgbuf=singular_msgbuf):
                            yield message
                    elif isinstance(directive, PluralDirective):
                        for message in directive.extract(translator,
                                substream, gettext_functions, search_text,
                                comment_stack, msgbuf=plural_msgbuf):
                            yield message
                    elif not isinstance(directive, StripDirective):
                        singular_msgbuf.append(*previous)
                        plural_msgbuf.append(*previous)
            else:
                if previous[0] is START:
                    for message in translator._extract_attrs(previous,
                                                             gettext_functions,
                                                             search_text):
                        yield message
                singular_msgbuf.append(*previous)
                plural_msgbuf.append(*previous)
            previous = event

        if not strip:
            singular_msgbuf.append(*previous)
            plural_msgbuf.append(*previous)

        yield self.lineno, 'ngettext', \
            (singular_msgbuf.format(), plural_msgbuf.format()), \
            comment_stack[-1:]

    def _is_plural(self, numeral, ngettext):
        """Return whether `ngettext` selects the plural form for `numeral`,
        determined by calling it with a pair of test strings."""
        # XXX: should we test which form was chosen like this!?!?!?
        # There should be no match in any catalogue for these singular and
        # plural test strings
        singular = u'O\x85\xbe\xa9\xa8az\xc3?\xe6\xa1\x02n\x84\x93'
        plural = u'\xcc\xfb+\xd3Pn\x9d\tT\xec\x1d\xda\x1a\x88\x00'
        return ngettext(singular, plural, numeral) == plural
class DomainDirective(I18NDirective):
    """Implementation of the ``i18n:domain`` directive which allows choosing
    another i18n domain(catalog) to translate from.
    
    >>> from genshi.filters.tests.i18n import DummyTranslations
    >>> tmpl = MarkupTemplate('''\
        <html xmlns:i18n="http://genshi.edgewall.org/i18n">
    ...   <p i18n:msg="">Bar</p>
    ...   <div i18n:domain="foo">
    ...     <p i18n:msg="">FooBar</p>
    ...     <p>Bar</p>
    ...     <p i18n:domain="bar" i18n:msg="">Bar</p>
    ...     <p i18n:domain="">Bar</p>
    ...   </div>
    ...   <p>Bar</p>
    ... </html>''')

    >>> translations = DummyTranslations({'Bar': 'Voh'})
    >>> translations.add_domain('foo', {'FooBar': 'BarFoo', 'Bar': 'foo_Bar'})
    >>> translations.add_domain('bar', {'Bar': 'bar_Bar'})
    >>> translator = Translator(translations)
    >>> translator.setup(tmpl)

    >>> print(tmpl.generate().render())
    <html>
      <p>Voh</p>
      <div>
        <p>BarFoo</p>
        <p>foo_Bar</p>
        <p>bar_Bar</p>
        <p>Voh</p>
      </div>
      <p>Voh</p>
    </html>
    """
    __slots__ = ['domain']

    def __init__(self, value, template=None, namespaces=None, lineno=-1,
                 offset=-1):
        """Store the domain name.

        :param value: the domain name; when it is missing, empty or consists
                      only of white-space, ``'__DEFAULT__'`` is stored
        """
        Directive.__init__(self, None, template, namespaces, lineno, offset)
        domain = None
        if value:
            domain = value.strip()
        if not domain:
            domain = '__DEFAULT__'
        self.domain = domain

    @classmethod
    def attach(cls, template, stream, value, namespaces, pos):
        """Attach the directive to the template.

        When the directive is used as an element, `value` is the dictionary
        of its attributes and the domain is read from ``name``.
        """
        if type(value) is dict:
            value = value.get('name')
        parent = super(DomainDirective, cls)
        return parent.attach(template, stream, value, namespaces, pos)

    def __call__(self, stream, directives, ctxt, **vars):
        """Apply the remaining directives with ``_i18n.domain`` set to this
        directive's domain; the context frame is popped again once the
        stream has been consumed completely."""
        ctxt.push({'_i18n.domain': self.domain})
        substream = _apply_directives(stream, directives, ctxt, vars)
        for event in substream:
            yield event
        ctxt.pop()
}[string] + >>> translator = Translator(pseudo_gettext) + + Next, the translator needs to be prepended to any already defined filters + on the template: + + >>> tmpl.filters.insert(0, translator) + + When generating the template output, our hard-coded translations should be + applied as expected: + + >>> print(tmpl.generate(username='Hans', _=pseudo_gettext)) + <html> + <head> + <title>Beispiel</title> + </head> + <body> + <h1>Beispiel</h1> + <p>Hallo, Hans</p> + </body> + </html> + + Note that elements defining ``xml:lang`` attributes that do not contain + variable expressions are ignored by this filter. That can be used to + exclude specific parts of a template from being extracted and translated. + """ + + directives = [ + ('domain', DomainDirective), + ('comment', CommentDirective), + ('msg', MsgDirective), + ('choose', ChooseDirective), + ('singular', SingularDirective), + ('plural', PluralDirective) + ] + + IGNORE_TAGS = frozenset([ + QName('script'), QName('http://www.w3.org/1999/xhtml}script'), + QName('style'), QName('http://www.w3.org/1999/xhtml}style') + ]) + INCLUDE_ATTRS = frozenset([ + 'abbr', 'alt', 'label', 'prompt', 'standby', 'summary', 'title' + ]) + NAMESPACE = I18N_NAMESPACE + + def __init__(self, translate=NullTranslations(), ignore_tags=IGNORE_TAGS, + include_attrs=INCLUDE_ATTRS, extract_text=True): + """Initialize the translator. + + :param translate: the translation function, for example ``gettext`` or + ``ugettext``. 
def __call__(self, stream, ctxt=None, translate_text=True,
             translate_attrs=True):
    """Translate any localizable strings in the given stream.

    This function shouldn't be called directly. Instead, an instance of
    the `Translator` class should be registered as a filter with the
    `Template` or the `TemplateLoader`, or applied as a regular stream
    filter. If used as a template filter, it should be inserted in front of
    all the default filters.

    :param stream: the markup event stream
    :param ctxt: the template context; when given, the translation
                 functions are stored in it under the ``_i18n.*`` keys and
                 its ``_i18n.domain`` entry, if set, selects the domain
                 used for translating
    :param translate_text: whether text nodes should be translated (used
                           internally)
    :param translate_attrs: whether attribute values should be translated
                            (used internally)
    :return: the localized stream
    """
    ignore_tags = self.ignore_tags
    include_attrs = self.include_attrs
    skip = 0
    xml_lang = XML_NAMESPACE['lang']
    if not self.extract_text:
        translate_text = False
        translate_attrs = False

    # ``plain_gettext`` is bound once and never rebound.  The fallback
    # ``dgettext`` below must call it rather than the ``gettext`` name:
    # ``gettext`` may be replaced further down by a wrapper that itself
    # calls ``dgettext``, and a closure over ``gettext`` would then recurse
    # without end.
    if type(self.translate) is FunctionType:
        plain_gettext = self.translate
        # A bare function cannot look up other domains, so the domain
        # argument is ignored; without this definition an active
        # ``_i18n.domain`` would hit an undefined ``dgettext``.
        dgettext = lambda _, msg: plain_gettext(msg)
        if ctxt:
            ctxt['_i18n.gettext'] = plain_gettext
    else:
        plain_gettext = self.translate.ugettext
        ngettext = self.translate.ungettext
        try:
            dgettext = self.translate.dugettext
            dngettext = self.translate.dungettext
        except AttributeError:
            # No domain support on the translations object: ignore the
            # domain and use the default catalog.
            dgettext = lambda _, y: plain_gettext(y)
            dngettext = lambda _, s, p, n: ngettext(s, p, n)
        if ctxt:
            ctxt['_i18n.gettext'] = plain_gettext
            ctxt['_i18n.ngettext'] = ngettext
            ctxt['_i18n.dgettext'] = dgettext
            ctxt['_i18n.dngettext'] = dngettext

    gettext = plain_gettext
    if ctxt and ctxt.get('_i18n.domain'):
        gettext = lambda msg: dgettext(ctxt.get('_i18n.domain'), msg)

    for kind, data, pos in stream:

        # skip chunks that should not be localized
        if skip:
            if kind is START:
                skip += 1
            elif kind is END:
                skip -= 1
            yield kind, data, pos
            continue

        # handle different events that can be localized
        if kind is START:
            tag, attrs = data
            if tag in ignore_tags or \
                    isinstance(attrs.get(xml_lang), basestring):
                skip += 1
                yield kind, data, pos
                continue

            new_attrs = []
            changed = False

            for name, value in attrs:
                newval = value
                if isinstance(value, basestring):
                    if translate_attrs and name in include_attrs:
                        newval = gettext(value)
                else:
                    # The attribute value is itself a stream: run it
                    # through the translator without touching text nodes
                    newval = list(
                        self(_ensure(value), ctxt, translate_text=False)
                    )
                if newval != value:
                    value = newval
                    changed = True
                new_attrs.append((name, value))
            if changed:
                attrs = Attrs(new_attrs)

            yield kind, (tag, attrs), pos

        elif translate_text and kind is TEXT:
            text = data.strip()
            if text:
                # Replace only the stripped part so that the surrounding
                # whitespace of the text node is preserved
                data = data.replace(text, unicode(gettext(text)))
            yield kind, data, pos

        elif kind is SUB:
            directives, substream = data
            current_domain = None
            for idx, directive in enumerate(directives):
                # Organize directives to make everything work
                # FIXME: There's got to be a better way to do this!
                if isinstance(directive, DomainDirective):
                    # Grab current domain and update context
                    current_domain = directive.domain
                    ctxt.push({'_i18n.domain': current_domain})
                    # Put domain directive as the first one in order to
                    # update context before any other directives evaluation
                    directives.insert(0, directives.pop(idx))

            # If this is an i18n directive, no need to translate text
            # nodes here
            is_i18n_directive = any([
                isinstance(d, ExtractableI18NDirective)
                for d in directives
            ])
            substream = list(self(substream, ctxt,
                                  translate_text=not is_i18n_directive,
                                  translate_attrs=translate_attrs))
            yield kind, (directives, substream), pos

            if current_domain:
                ctxt.pop()
        else:
            yield kind, data, pos
</head> + ... <body> + ... <h1>Example</h1> + ... <p>${_("Hello, %(name)s") % dict(name=username)}</p> + ... <p>${ngettext("You have %d item", "You have %d items", num)}</p> + ... </body> + ... </html>''', filename='example.html') + >>> for line, func, msg, comments in Translator().extract(tmpl.stream): + ... print('%d, %r, %r' % (line, func, msg)) + 3, None, u'Example' + 6, None, u'Example' + 7, '_', u'Hello, %(name)s' + 8, 'ngettext', (u'You have %d item', u'You have %d items', None) + + :param stream: the event stream to extract strings from; can be a + regular stream or a template stream + :param gettext_functions: a sequence of function names that should be + treated as gettext-style localization + functions + :param search_text: whether the content of text nodes should be + extracted (used internally) + + :note: Changed in 0.4.1: For a function with multiple string arguments + (such as ``ngettext``), a single item with a tuple of strings is + yielded, instead an item for each string argument. + :note: Changed in 0.6: The returned tuples now include a fourth + element, which is a list of comments for the translator. 
+ """ + if not self.extract_text: + search_text = False + if comment_stack is None: + comment_stack = [] + skip = 0 + + xml_lang = XML_NAMESPACE['lang'] + + for kind, data, pos in stream: + if skip: + if kind is START: + skip += 1 + if kind is END: + skip -= 1 + + if kind is START and not skip: + tag, attrs = data + if tag in self.ignore_tags or \ + isinstance(attrs.get(xml_lang), basestring): + skip += 1 + continue + + for message in self._extract_attrs((kind, data, pos), + gettext_functions, + search_text=search_text): + yield message + + elif not skip and search_text and kind is TEXT: + text = data.strip() + if text and [ch for ch in text if ch.isalpha()]: + yield pos[1], None, text, comment_stack[-1:] + + elif kind is EXPR or kind is EXEC: + for funcname, strings in extract_from_code(data, + gettext_functions): + # XXX: Do we need to grab i18n:comment from comment_stack ??? + yield pos[1], funcname, strings, [] + + elif kind is SUB: + directives, substream = data + in_comment = False + + for idx, directive in enumerate(directives): + # Do a first loop to see if there's a comment directive + # If there is update context and pop it from directives + if isinstance(directive, CommentDirective): + in_comment = True + comment_stack.append(directive.comment) + if len(directives) == 1: + # in case we're in the presence of something like: + # <p i18n:comment="foo">Foo</p> + for message in self.extract( + substream, gettext_functions, + search_text=search_text and not skip, + comment_stack=comment_stack): + yield message + directives.pop(idx) + elif not isinstance(directive, I18NDirective): + # Remove all other non i18n directives from the process + directives.pop(idx) + + if not directives and not in_comment: + # Extract content if there's no directives because + # strip was pop'ed and not because comment was pop'ed. + # Extraction in this case has been taken care of. 
+ for message in self.extract( + substream, gettext_functions, + search_text=search_text and not skip): + yield message + + for directive in directives: + if isinstance(directive, ExtractableI18NDirective): + for message in directive.extract(self, + substream, gettext_functions, + search_text=search_text and not skip, + comment_stack=comment_stack): + yield message + else: + for message in self.extract( + substream, gettext_functions, + search_text=search_text and not skip, + comment_stack=comment_stack): + yield message + + if in_comment: + comment_stack.pop() + + def get_directive_index(self, dir_cls): + total = len(self._dir_order) + if dir_cls in self._dir_order: + return self._dir_order.index(dir_cls) - total + return total + + def setup(self, template): + """Convenience function to register the `Translator` filter and the + related directives with the given template. + + :param template: a `Template` instance + """ + template.filters.insert(0, self) + if hasattr(template, 'add_directives'): + template.add_directives(Translator.NAMESPACE, self) + + def _extract_attrs(self, event, gettext_functions, search_text): + for name, value in event[1][1]: + if search_text and isinstance(value, basestring): + if name in self.include_attrs: + text = value.strip() + if text: + yield event[2][1], None, text, [] + else: + for message in self.extract(_ensure(value), gettext_functions, + search_text=False): + yield message + + +class MessageBuffer(object): + """Helper class for managing internationalized mixed content. + + :since: version 0.5 + """ + + def __init__(self, directive=None): + """Initialize the message buffer. 
+ + :param directive: the directive owning the buffer + :type directive: I18NDirective + """ + # params list needs to be copied so that directives can be evaluated + # more than once + self.orig_params = self.params = directive.params[:] + self.directive = directive + self.string = [] + self.events = {} + self.values = {} + self.depth = 1 + self.order = 1 + self.stack = [0] + self.subdirectives = {} + + def append(self, kind, data, pos): + """Append a stream event to the buffer. + + :param kind: the stream event kind + :param data: the event data + :param pos: the position of the event in the source + """ + if kind is SUB: + # The order needs to be +1 because a new START kind event will + # happen and we we need to wrap those events into our custom kind(s) + order = self.stack[-1] + 1 + subdirectives, substream = data + # Store the directives that should be applied after translation + self.subdirectives.setdefault(order, []).extend(subdirectives) + self.events.setdefault(order, []).append((SUB_START, None, pos)) + for skind, sdata, spos in substream: + self.append(skind, sdata, spos) + self.events.setdefault(order, []).append((SUB_END, None, pos)) + elif kind is TEXT: + if '[' in data or ']' in data: + # Quote [ and ] if it ain't us adding it, ie, if the user is + # using those chars in his templates, escape them + data = data.replace('[', '\[').replace(']', '\]') + self.string.append(data) + self.events.setdefault(self.stack[-1], []).append((kind, data, pos)) + elif kind is EXPR: + if self.params: + param = self.params.pop(0) + else: + params = ', '.join(['"%s"' % p for p in self.orig_params if p]) + if params: + params = "(%s)" % params + raise IndexError("%d parameters%s given to 'i18n:%s' but " + "%d or more expressions used in '%s', line %s" + % (len(self.orig_params), params, + self.directive.tagname, + len(self.orig_params) + 1, + os.path.basename(pos[0] or + 'In-memory Template'), + pos[1])) + self.string.append('%%(%s)s' % param) + 
self.events.setdefault(self.stack[-1], []).append((kind, data, pos)) + self.values[param] = (kind, data, pos) + else: + if kind is START: + self.string.append('[%d:' % self.order) + self.stack.append(self.order) + self.events.setdefault(self.stack[-1], + []).append((kind, data, pos)) + self.depth += 1 + self.order += 1 + elif kind is END: + self.depth -= 1 + if self.depth: + self.events[self.stack[-1]].append((kind, data, pos)) + self.string.append(']') + self.stack.pop() + + def format(self): + """Return a message identifier representing the content in the + buffer. + """ + return ''.join(self.string).strip() + + def translate(self, string, regex=re.compile(r'%\((\w+)\)s')): + """Interpolate the given message translation with the events in the + buffer and return the translated stream. + + :param string: the translated message string + """ + substream = None + + def yield_parts(string): + for idx, part in enumerate(regex.split(string)): + if idx % 2: + yield self.values[part] + elif part: + yield (TEXT, + part.replace('\[', '[').replace('\]', ']'), + (None, -1, -1) + ) + + parts = parse_msg(string) + parts_counter = {} + for order, string in parts: + parts_counter.setdefault(order, []).append(None) + + while parts: + order, string = parts.pop(0) + if len(parts_counter[order]) == 1: + events = self.events[order] + else: + events = [self.events[order].pop(0)] + parts_counter[order].pop() + + for event in events: + if event[0] is SUB_START: + substream = [] + elif event[0] is SUB_END: + # Yield a substream which might have directives to be + # applied to it (after translation events) + yield SUB, (self.subdirectives[order], substream), event[2] + substream = None + elif event[0] is TEXT: + if string: + for part in yield_parts(string): + if substream is not None: + substream.append(part) + else: + yield part + # String handled, reset it + string = None + elif event[0] is START: + if substream is not None: + substream.append(event) + else: + yield event + if string: + 
def parse_msg(string, regex=re.compile(r'(?:\[(\d+)\:)|(?<!\\)\]')):
    """Parse a translated message using Genshi mixed content message
    formatting.

    >>> parse_msg("See [1:Help].")
    [(0, 'See '), (1, 'Help'), (0, '.')]

    >>> parse_msg("See [1:our [2:Help] page] for details.")
    [(0, 'See '), (1, 'our '), (2, 'Help'), (1, ' page'), (0, ' for details.')]

    >>> parse_msg("[2:Details] finden Sie in [1:Hilfe].")
    [(2, 'Details'), (0, ' finden Sie in '), (1, 'Hilfe'), (0, '.')]

    >>> parse_msg("[1:] Bilder pro Seite anzeigen.")
    [(1, ''), (0, ' Bilder pro Seite anzeigen.')]

    A closing bracket without a matching opener ends the parsing; the text
    after it is attributed to the top level (order ``0``):

    >>> parse_msg("Done] now")
    [(0, 'Done'), (0, ' now')]

    :param string: the translated message string
    :param regex: the compiled pattern matching an opening ``[N:`` marker or
                  an unescaped closing ``]``
    :return: a list of ``(order, string)`` tuples
    :rtype: `list`
    """
    parts = []
    # Orders of the groups currently open; 0 is the top level
    stack = [0]
    while True:
        mo = regex.search(string)
        if not mo:
            break

        # Text before the marker belongs to the innermost open group; an
        # empty top-level prefix is not recorded
        if mo.start() or stack[-1]:
            parts.append((stack[-1], string[:mo.start()]))
        string = string[mo.end():]

        orderno = mo.group(1)
        if orderno is not None:
            stack.append(int(orderno))
        else:
            stack.pop()
        if not stack:
            # Unbalanced closing bracket: stop parsing, but keep the top
            # level on the stack so that the remaining text can still be
            # attributed to it below instead of failing on an empty stack.
            stack.append(0)
            break

    if string:
        parts.append((stack[-1], string))

    return parts
def extract(fileobj, keywords, comment_tags, options):
    """Babel extraction method for Genshi templates.

    The recognized entries of ``options`` are ``template_class`` (a class
    or a ``module:ClassName`` string), ``encoding``, ``extract_text``,
    ``ignore_tags`` and ``include_attrs``; the last two may be given as
    whitespace-separated strings.

    :param fileobj: the file-like object the messages should be extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results (not referenced by this function)
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    def _as_qnames(setting):
        # A string setting is a whitespace-separated list of names
        if isinstance(setting, basestring):
            setting = setting.split()
        return [QName(item) for item in setting]

    template_class = options.get('template_class', MarkupTemplate)
    if isinstance(template_class, basestring):
        # "package.module:ClassName" -> import the module, fetch the class
        module, clsname = template_class.split(':', 1)
        imported = __import__(module, {}, {}, [clsname])
        template_class = getattr(imported, clsname)
    encoding = options.get('encoding', None)

    extract_text = options.get('extract_text', True)
    if isinstance(extract_text, basestring):
        truthy_words = ('1', 'on', 'yes', 'true')
        extract_text = extract_text.lower() in truthy_words

    ignore_tags = _as_qnames(
        options.get('ignore_tags', Translator.IGNORE_TAGS))
    include_attrs = _as_qnames(
        options.get('include_attrs', Translator.INCLUDE_ATTRS))

    source_name = getattr(fileobj, 'name', None)
    tmpl = template_class(fileobj, filename=source_name, encoding=encoding)
    tmpl.loader = None

    translator = Translator(None, ignore_tags, include_attrs, extract_text)
    if hasattr(tmpl, 'add_directives'):
        tmpl.add_directives(Translator.NAMESPACE, translator)
    messages = translator.extract(tmpl.stream, gettext_functions=keywords)
    for message in messages:
        yield message
The terms +# are also available at http://genshi.edgewall.org/wiki/License. +# +# This software consists of voluntary contributions made by many +# individuals. For the exact contribution history, see the revision +# history and logs, available at http://genshi.edgewall.org/log/. + +"""A filter for functional-style transformations of markup streams. + +The `Transformer` filter provides a variety of transformations that can be +applied to parts of streams that match given XPath expressions. These +transformations can be chained to achieve results that would be comparatively +tedious to achieve by writing stream filters by hand. The approach of chaining +node selection and transformation has been inspired by the `jQuery`_ Javascript +library. + + .. _`jQuery`: http://jquery.com/ + +For example, the following transformation upper-cases the text of the ``<em>`` +element in the ``<body>`` of the input document and replaces the enclosing +``<em>`` tags with ``<u>``: + +>>> from genshi.builder import tag +>>> html = HTML('''<html> +... <head><title>Some Title</title></head> +... <body> +... Some <em>body</em> text. +... </body> +... </html>''') +>>> print(html | Transformer('body/em').map(unicode.upper, TEXT) +... .unwrap().wrap(tag.u)) +<html> + <head><title>Some Title</title></head> + <body> + Some <u>BODY</u> text. + </body> +</html> + +The ``Transformer`` supports a large number of useful transformations out of the +box, but custom transformations can be added easily. 
class PushBackStream(object):
    """Allows a single event to be pushed back onto the stream and re-consumed.

    Iterating over the object yields the events of the wrapped stream; an
    event handed to `push()` is yielded again before the next event of the
    wrapped stream.
    """
    def __init__(self, stream):
        """Wrap the given stream.

        :param stream: any iterable of events
        """
        self.stream = iter(stream)
        # The pushed-back event, or None when nothing is pending
        self.peek = None

    def push(self, event):
        """Push an event back so that it is the next one yielded.

        Only one event can be pending at any time.

        :param event: the event to re-consume
        """
        assert self.peek is None
        self.peek = event

    def __iter__(self):
        """Yield the pending pushed-back event, if any, otherwise the next
        event of the wrapped stream, until the stream is exhausted.
        """
        while True:
            if self.peek is not None:
                peek = self.peek
                self.peek = None
                yield peek
            else:
                # Pull exactly one event.  A ``for`` loop is used rather
                # than calling the iterator's ``next`` method so that the
                # code does not depend on how that method is spelled, and
                # exhaustion ends the generator with a plain ``return``:
                # letting StopIteration escape from a generator body is an
                # error under PEP 479.
                for event in self.stream:
                    break
                else:
                    return
                yield event
In this example we select the ``<title>`` text, + copy it into a buffer, then select the ``<body>`` element and paste the + copied text into the body as ``<h1>`` enclosed text: + + >>> buffer = StreamBuffer() + >>> print(html | Transformer('head/title/text()').copy(buffer) + ... .end().select('body').prepend(tag.h1(buffer))) + <html><head><title>Some Title</title></head><body><h1>Some Title</h1>Some + <em>body</em> text.</body></html> + + Transformations can also be assigned and reused, although care must be + taken when using buffers, to ensure that buffers are cleared between + transforms: + + >>> emphasis = Transformer('body//em').attr('class', 'emphasis') + >>> print(html | emphasis) + <html><head><title>Some Title</title></head><body>Some <em + class="emphasis">body</em> text.</body></html> + """ + + __slots__ = ['transforms'] + + def __init__(self, path='.'): + """Construct a new transformation filter. + + :param path: an XPath expression (as string) or a `Path` instance + """ + self.transforms = [SelectTransformation(path)] + + def __call__(self, stream, keep_marks=False): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + :param keep_marks: Do not strip transformer selection marks from the + stream. Useful for testing. + :return: the transformed stream + :rtype: `Stream` + """ + transforms = self._mark(stream) + for link in self.transforms: + transforms = link(transforms) + if not keep_marks: + transforms = self._unmark(transforms) + return Stream(transforms, + serializer=getattr(stream, 'serializer', None)) + + def apply(self, function): + """Apply a transformation to the stream. + + Transformations can be chained, similar to stream filters. Any callable + accepting a marked stream can be used as a transform. + + As an example, here is a simple `TEXT` event upper-casing transform: + + >>> def upper(stream): + ... for mark, (kind, data, pos) in stream: + ... if mark and kind is TEXT: + ... 
yield mark, (kind, data.upper(), pos) + ... else: + ... yield mark, (kind, data, pos) + >>> short_stream = HTML('<body>Some <em>test</em> text</body>') + >>> print(short_stream | Transformer('.//em/text()').apply(upper)) + <body>Some <em>TEST</em> text</body> + """ + transformer = Transformer() + transformer.transforms = self.transforms[:] + if isinstance(function, Transformer): + transformer.transforms.extend(function.transforms) + else: + transformer.transforms.append(function) + return transformer + + #{ Selection operations + + def select(self, path): + """Mark events matching the given XPath expression, within the current + selection. + + >>> html = HTML('<body>Some <em>test</em> text</body>') + >>> print(html | Transformer().select('.//em').trace()) + (None, ('START', (QName('body'), Attrs()), (None, 1, 0))) + (None, ('TEXT', u'Some ', (None, 1, 6))) + ('ENTER', ('START', (QName('em'), Attrs()), (None, 1, 11))) + ('INSIDE', ('TEXT', u'test', (None, 1, 15))) + ('EXIT', ('END', QName('em'), (None, 1, 19))) + (None, ('TEXT', u' text', (None, 1, 24))) + (None, ('END', QName('body'), (None, 1, 29))) + <body>Some <em>test</em> text</body> + + :param path: an XPath expression (as string) or a `Path` instance + :return: the stream augmented by transformation marks + :rtype: `Transformer` + """ + return self.apply(SelectTransformation(path)) + + def invert(self): + """Invert selection so that marked events become unmarked, and vice + versa. + + Specificaly, all marks are converted to null marks, and all null marks + are converted to OUTSIDE marks. 
+ + >>> html = HTML('<body>Some <em>test</em> text</body>') + >>> print(html | Transformer('//em').invert().trace()) + ('OUTSIDE', ('START', (QName('body'), Attrs()), (None, 1, 0))) + ('OUTSIDE', ('TEXT', u'Some ', (None, 1, 6))) + (None, ('START', (QName('em'), Attrs()), (None, 1, 11))) + (None, ('TEXT', u'test', (None, 1, 15))) + (None, ('END', QName('em'), (None, 1, 19))) + ('OUTSIDE', ('TEXT', u' text', (None, 1, 24))) + ('OUTSIDE', ('END', QName('body'), (None, 1, 29))) + <body>Some <em>test</em> text</body> + + :rtype: `Transformer` + """ + return self.apply(InvertTransformation()) + + def end(self): + """End current selection, allowing all events to be selected. + + Example: + + >>> html = HTML('<body>Some <em>test</em> text</body>') + >>> print(html | Transformer('//em').end().trace()) + ('OUTSIDE', ('START', (QName('body'), Attrs()), (None, 1, 0))) + ('OUTSIDE', ('TEXT', u'Some ', (None, 1, 6))) + ('OUTSIDE', ('START', (QName('em'), Attrs()), (None, 1, 11))) + ('OUTSIDE', ('TEXT', u'test', (None, 1, 15))) + ('OUTSIDE', ('END', QName('em'), (None, 1, 19))) + ('OUTSIDE', ('TEXT', u' text', (None, 1, 24))) + ('OUTSIDE', ('END', QName('body'), (None, 1, 29))) + <body>Some <em>test</em> text</body> + + :return: the stream augmented by transformation marks + :rtype: `Transformer` + """ + return self.apply(EndTransformation()) + + #{ Deletion operations + + def empty(self): + """Empty selected elements of all content. + + Example: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em').empty()) + <html><head><title>Some Title</title></head><body>Some <em/> + text.</body></html> + + :rtype: `Transformer` + """ + return self.apply(EmptyTransformation()) + + def remove(self): + """Remove selection from the stream. + + Example: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... 
'<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em').remove()) + <html><head><title>Some Title</title></head><body>Some + text.</body></html> + + :rtype: `Transformer` + """ + return self.apply(RemoveTransformation()) + + #{ Direct element operations + + def unwrap(self): + """Remove outermost enclosing elements from selection. + + Example: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em').unwrap()) + <html><head><title>Some Title</title></head><body>Some body + text.</body></html> + + :rtype: `Transformer` + """ + return self.apply(UnwrapTransformation()) + + def wrap(self, element): + """Wrap selection in an element. + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em').wrap('strong')) + <html><head><title>Some Title</title></head><body>Some + <strong><em>body</em></strong> text.</body></html> + + :param element: either a tag name (as string) or an `Element` object + :rtype: `Transformer` + """ + return self.apply(WrapTransformation(element)) + + #{ Content insertion operations + + def replace(self, content): + """Replace selection with content. + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//title/text()').replace('New Title')) + <html><head><title>New Title</title></head><body>Some <em>body</em> + text.</body></html> + + :param content: Either a callable, an iterable of events, or a string + to insert. + :rtype: `Transformer` + """ + return self.apply(ReplaceTransformation(content)) + + def before(self, content): + """Insert content before selection. + + In this example we insert the word 'emphasised' before the <em> opening + tag: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... 
'<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em').before('emphasised ')) + <html><head><title>Some Title</title></head><body>Some emphasised + <em>body</em> text.</body></html> + + :param content: Either a callable, an iterable of events, or a string + to insert. + :rtype: `Transformer` + """ + return self.apply(BeforeTransformation(content)) + + def after(self, content): + """Insert content after selection. + + Here, we insert some text after the </em> closing tag: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em').after(' rock')) + <html><head><title>Some Title</title></head><body>Some <em>body</em> + rock text.</body></html> + + :param content: Either a callable, an iterable of events, or a string + to insert. + :rtype: `Transformer` + """ + return self.apply(AfterTransformation(content)) + + def prepend(self, content): + """Insert content after the ENTER event of the selection. + + Inserting some new text at the start of the <body>: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//body').prepend('Some new body text. ')) + <html><head><title>Some Title</title></head><body>Some new body text. + Some <em>body</em> text.</body></html> + + :param content: Either a callable, an iterable of events, or a string + to insert. + :rtype: `Transformer` + """ + return self.apply(PrependTransformation(content)) + + def append(self, content): + """Insert content before the END event of the selection. + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//body').append(' Some new body text.')) + <html><head><title>Some Title</title></head><body>Some <em>body</em> + text. 
Some new body text.</body></html> + + :param content: Either a callable, an iterable of events, or a string + to insert. + :rtype: `Transformer` + """ + return self.apply(AppendTransformation(content)) + + #{ Attribute manipulation + + def attr(self, name, value): + """Add, replace or delete an attribute on selected elements. + + If `value` evaluates to `None` the attribute will be deleted from the + element: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em class="before">body</em> <em>text</em>.</body>' + ... '</html>') + >>> print(html | Transformer('body/em').attr('class', None)) + <html><head><title>Some Title</title></head><body>Some <em>body</em> + <em>text</em>.</body></html> + + Otherwise the attribute will be set to `value`: + + >>> print(html | Transformer('body/em').attr('class', 'emphasis')) + <html><head><title>Some Title</title></head><body>Some <em + class="emphasis">body</em> <em class="emphasis">text</em>.</body></html> + + If `value` is a callable it will be called with the attribute name and + the `START` event for the matching element. Its return value will then + be used to set the attribute: + + >>> def print_attr(name, event): + ... attrs = event[1][1] + ... print(attrs) + ... return attrs.get(name) + >>> print(html | Transformer('body/em').attr('class', print_attr)) + Attrs([(QName('class'), u'before')]) + Attrs() + <html><head><title>Some Title</title></head><body>Some <em + class="before">body</em> <em>text</em>.</body></html> + + :param name: the name of the attribute + :param value: the value that should be set for the attribute. + :rtype: `Transformer` + """ + return self.apply(AttrTransformation(name, value)) + + #{ Buffer operations + + def copy(self, buffer, accumulate=False): + """Copy selection into buffer. + + The buffer is replaced by each *contiguous* selection before being passed + to the next transformation. 
If accumulate=True, further selections will + be appended to the buffer rather than replacing it. + + >>> from genshi.builder import tag + >>> buffer = StreamBuffer() + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('head/title/text()').copy(buffer) + ... .end().select('body').prepend(tag.h1(buffer))) + <html><head><title>Some Title</title></head><body><h1>Some + Title</h1>Some <em>body</em> text.</body></html> + + This example illustrates that only a single contiguous selection will + be buffered: + + >>> print(html | Transformer('head/title/text()').copy(buffer) + ... .end().select('body/em').copy(buffer).end().select('body') + ... .prepend(tag.h1(buffer))) + <html><head><title>Some Title</title></head><body><h1>Some + Title</h1>Some <em>body</em> text.</body></html> + >>> print(buffer) + <em>body</em> + + Element attributes can also be copied for later use: + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body><em>Some</em> <em class="before">body</em>' + ... '<em>text</em>.</body></html>') + >>> buffer = StreamBuffer() + >>> def apply_attr(name, entry): + ... return list(buffer)[0][1][1].get('class') + >>> print(html | Transformer('body/em[@class]/@class').copy(buffer) + ... .end().buffer().select('body/em[not(@class)]') + ... .attr('class', apply_attr)) + <html><head><title>Some Title</title></head><body><em + class="before">Some</em> <em class="before">body</em><em + class="before">text</em>.</body></html> + + + :param buffer: the `StreamBuffer` in which the selection should be + stored + :rtype: `Transformer` + :note: Copy (and cut) copy each individual selected object into the + buffer before passing to the next transform. For example, the + XPath ``*|text()`` will select all elements and text, each + instance of which will be copied to the buffer individually + before passing to the next transform. 
This has implications for + how ``StreamBuffer`` objects can be used, so some + experimentation may be required. + + """ + return self.apply(CopyTransformation(buffer, accumulate)) + + def cut(self, buffer, accumulate=False): + """Copy selection into buffer and remove the selection from the stream. + + >>> from genshi.builder import tag + >>> buffer = StreamBuffer() + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('.//em/text()').cut(buffer) + ... .end().select('.//em').after(tag.h1(buffer))) + <html><head><title>Some Title</title></head><body>Some + <em/><h1>body</h1> text.</body></html> + + Specifying accumulate=True, appends all selected intervals onto the + buffer. Combining this with the .buffer() operation allows us operate + on all copied events rather than per-segment. See the documentation on + buffer() for more information. + + :param buffer: the `StreamBuffer` in which the selection should be + stored + :rtype: `Transformer` + :note: this transformation will buffer the entire input stream + """ + return self.apply(CutTransformation(buffer, accumulate)) + + def buffer(self): + """Buffer the entire stream (can consume a considerable amount of + memory). + + Useful in conjunction with copy(accumulate=True) and + cut(accumulate=True) to ensure that all marked events in the entire + stream are copied to the buffer before further transformations are + applied. + + For example, to move all <note> elements inside a <notes> tag at the + top of the document: + + >>> doc = HTML('<doc><notes></notes><body>Some <note>one</note> ' + ... 'text <note>two</note>.</body></doc>') + >>> buffer = StreamBuffer() + >>> print(doc | Transformer('body/note').cut(buffer, accumulate=True) + ... 
.end().buffer().select('notes').prepend(buffer)) + <doc><notes><note>one</note><note>two</note></notes><body>Some text + .</body></doc> + + """ + return self.apply(list) + + #{ Miscellaneous operations + + def filter(self, filter): + """Apply a normal stream filter to the selection. The filter is called + once for each contiguous block of marked events. + + >>> from genshi.filters.html import HTMLSanitizer + >>> html = HTML('<html><body>Some text<script>alert(document.cookie)' + ... '</script> and some more text</body></html>') + >>> print(html | Transformer('body/*').filter(HTMLSanitizer())) + <html><body>Some text and some more text</body></html> + + :param filter: The stream filter to apply. + :rtype: `Transformer` + """ + return self.apply(FilterTransformation(filter)) + + def map(self, function, kind): + """Applies a function to the ``data`` element of events of ``kind`` in + the selection. + + >>> html = HTML('<html><head><title>Some Title</title></head>' + ... '<body>Some <em>body</em> text.</body></html>') + >>> print(html | Transformer('head/title').map(unicode.upper, TEXT)) + <html><head><title>SOME TITLE</title></head><body>Some <em>body</em> + text.</body></html> + + :param function: the function to apply + :param kind: the kind of event the function should be applied to + :rtype: `Transformer` + """ + return self.apply(MapTransformation(function, kind)) + + def substitute(self, pattern, replace, count=1): + """Replace text matching a regular expression. + + Refer to the documentation for ``re.sub()`` for details. + + >>> html = HTML('<html><body>Some text, some more text and ' + ... '<b>some bold text</b>\\n' + ... '<i>some italicised text</i></body></html>') + >>> print(html | Transformer('body/b').substitute('(?i)some', 'SOME')) + <html><body>Some text, some more text and <b>SOME bold text</b> + <i>some italicised text</i></body></html> + >>> tags = tag.html(tag.body('Some text, some more text and\\n', + ... 
Markup('<b>some bold text</b>'))) + >>> print(tags.generate() | Transformer('body').substitute( + ... '(?i)some', 'SOME')) + <html><body>SOME text, some more text and + <b>SOME bold text</b></body></html> + + :param pattern: A regular expression object or string. + :param replace: Replacement pattern. + :param count: Number of replacements to make in each text fragment. + :rtype: `Transformer` + """ + return self.apply(SubstituteTransformation(pattern, replace, count)) + + def rename(self, name): + """Rename matching elements. + + >>> html = HTML('<html><body>Some text, some more text and ' + ... '<b>some bold text</b></body></html>') + >>> print(html | Transformer('body/b').rename('strong')) + <html><body>Some text, some more text and <strong>some bold text</strong></body></html> + """ + return self.apply(RenameTransformation(name)) + + def trace(self, prefix='', fileobj=None): + """Print events as they pass through the transform. + + >>> html = HTML('<body>Some <em>test</em> text</body>') + >>> print(html | Transformer('em').trace()) + (None, ('START', (QName('body'), Attrs()), (None, 1, 0))) + (None, ('TEXT', u'Some ', (None, 1, 6))) + ('ENTER', ('START', (QName('em'), Attrs()), (None, 1, 11))) + ('INSIDE', ('TEXT', u'test', (None, 1, 15))) + ('EXIT', ('END', QName('em'), (None, 1, 19))) + (None, ('TEXT', u' text', (None, 1, 24))) + (None, ('END', QName('body'), (None, 1, 29))) + <body>Some <em>test</em> text</body> + + :param prefix: a string to prefix each event with in the output + :param fileobj: the writable file-like object to write to; defaults to + the standard output stream + :rtype: `Transformer` + """ + return self.apply(TraceTransformation(prefix, fileobj=fileobj)) + + # Internal methods + + def _mark(self, stream): + for event in stream: + yield OUTSIDE, event + + def _unmark(self, stream): + for mark, event in stream: + kind = event[0] + if not (kind is None or kind is ATTR or kind is BREAK): + yield event + + +class SelectTransformation(object): + 
"""Select and mark events that match an XPath expression.""" + + def __init__(self, path): + """Create selection. + + :param path: an XPath expression (as string) or a `Path` object + """ + if not isinstance(path, Path): + path = Path(path) + self.path = path + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + """ + namespaces = {} + variables = {} + test = self.path.test() + stream = iter(stream) + next = stream.next + for mark, event in stream: + if mark is None: + yield mark, event + continue + result = test(event, namespaces, variables) + # XXX This is effectively genshi.core._ensure() for transform + # streams. + if result is True: + if event[0] is START: + yield ENTER, event + depth = 1 + while depth > 0: + mark, subevent = next() + if subevent[0] is START: + depth += 1 + elif subevent[0] is END: + depth -= 1 + if depth == 0: + yield EXIT, subevent + else: + yield INSIDE, subevent + test(subevent, namespaces, variables, updateonly=True) + else: + yield OUTSIDE, event + elif isinstance(result, Attrs): + # XXX Selected *attributes* are given a "kind" of None to + # indicate they are not really part of the stream. + yield ATTR, (ATTR, (QName(event[1][0] + '@*'), result), event[2]) + yield None, event + elif isinstance(result, tuple): + yield OUTSIDE, result + elif result: + # XXX Assume everything else is "text"? + yield None, (TEXT, unicode(result), (None, -1, -1)) + else: + yield None, event + + +class InvertTransformation(object): + """Invert selection so that marked events become unmarked, and vice versa. + + Specificaly, all input marks are converted to null marks, and all input + null marks are converted to OUTSIDE marks. + """ + + def __call__(self, stream): + """Apply the transform filter to the marked stream. 
+ + :param stream: the marked event stream to filter + """ + for mark, event in stream: + if mark: + yield None, event + else: + yield OUTSIDE, event + + +class EndTransformation(object): + """End the current selection.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + """ + for mark, event in stream: + yield OUTSIDE, event + + +class EmptyTransformation(object): + """Empty selected elements of all content.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + """ + for mark, event in stream: + yield mark, event + if mark is ENTER: + for mark, event in stream: + if mark is EXIT: + yield mark, event + break + + +class RemoveTransformation(object): + """Remove selection from the stream.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + """ + for mark, event in stream: + if mark is None: + yield mark, event + + +class UnwrapTransformation(object): + """Remove outermost enclosing elements from selection.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. 
+ + :param stream: the marked event stream to filter + """ + for mark, event in stream: + if mark not in (ENTER, EXIT): + yield mark, event + + +class WrapTransformation(object): + """Wrap selection in an element.""" + + def __init__(self, element): + if isinstance(element, Element): + self.element = element + else: + self.element = Element(element) + + def __call__(self, stream): + for mark, event in stream: + if mark: + element = list(self.element.generate()) + for prefix in element[:-1]: + yield None, prefix + yield mark, event + start = mark + stopped = False + for mark, event in stream: + if start is ENTER and mark is EXIT: + yield mark, event + stopped = True + break + if not mark: + break + yield mark, event + else: + stopped = True + yield None, element[-1] + if not stopped: + yield mark, event + else: + yield mark, event + + +class TraceTransformation(object): + """Print events as they pass through the transform.""" + + def __init__(self, prefix='', fileobj=None): + """Trace constructor. + + :param prefix: text to prefix each traced line with. + :param fileobj: the writable file-like object to write to + """ + self.prefix = prefix + self.fileobj = fileobj or sys.stdout + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + """ + for event in stream: + self.fileobj.write('%s%s\n' % (self.prefix, event)) + yield event + + +class FilterTransformation(object): + """Apply a normal stream filter to the selection. The filter is called once + for each selection.""" + + def __init__(self, filter): + """Create the transform. + + :param filter: The stream filter to apply. + """ + self.filter = filter + + def __call__(self, stream): + """Apply the transform filter to the marked stream. 
+ + :param stream: The marked event stream to filter + """ + def flush(queue): + if queue: + for event in self.filter(queue): + yield OUTSIDE, event + del queue[:] + + queue = [] + for mark, event in stream: + if mark is ENTER: + queue.append(event) + for mark, event in stream: + queue.append(event) + if mark is EXIT: + break + for queue_event in flush(queue): + yield queue_event + elif mark is OUTSIDE: + stopped = False + queue.append(event) + for mark, event in stream: + if mark is not OUTSIDE: + break + queue.append(event) + else: + stopped = True + for queue_event in flush(queue): + yield queue_event + if not stopped: + yield mark, event + else: + yield mark, event + for queue_event in flush(queue): + yield queue_event + + +class MapTransformation(object): + """Apply a function to the `data` element of events of ``kind`` in the + selection. + """ + + def __init__(self, function, kind): + """Create the transform. + + :param function: the function to apply; the function must take one + argument, the `data` element of each selected event + :param kind: the stream event ``kind`` to apply the `function` to + """ + self.function = function + self.kind = kind + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + for mark, (kind, data, pos) in stream: + if mark and self.kind in (None, kind): + yield mark, (kind, self.function(data), pos) + else: + yield mark, (kind, data, pos) + + +class SubstituteTransformation(object): + """Replace text matching a regular expression. + + Refer to the documentation for ``re.sub()`` for details. + """ + def __init__(self, pattern, replace, count=0): + """Create the transform. + + :param pattern: A regular expression object, or string. + :param replace: Replacement pattern. + :param count: Number of replacements to make in each text fragment. 
+ """ + if isinstance(pattern, basestring): + self.pattern = re.compile(pattern) + else: + self.pattern = pattern + self.count = count + self.replace = replace + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + for mark, (kind, data, pos) in stream: + if mark is not None and kind is TEXT: + new_data = self.pattern.sub(self.replace, data, self.count) + if isinstance(data, Markup): + data = Markup(new_data) + else: + data = new_data + yield mark, (kind, data, pos) + + +class RenameTransformation(object): + """Rename matching elements.""" + def __init__(self, name): + """Create the transform. + + :param name: New element name. + """ + self.name = QName(name) + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + for mark, (kind, data, pos) in stream: + if mark is ENTER: + data = self.name, data[1] + elif mark is EXIT: + data = self.name + yield mark, (kind, data, pos) + + +class InjectorTransformation(object): + """Abstract base class for transformations that inject content into a + stream. + + >>> class Top(InjectorTransformation): + ... def __call__(self, stream): + ... for event in self._inject(): + ... yield event + ... for event in stream: + ... yield event + >>> html = HTML('<body>Some <em>test</em> text</body>') + >>> print(html | Transformer('.//em').apply(Top('Prefix '))) + Prefix <body>Some <em>test</em> text</body> + """ + def __init__(self, content): + """Create a new injector. + + :param content: An iterable of Genshi stream events, or a string to be + injected. 
+ """ + self.content = content + + def _inject(self): + content = self.content + if hasattr(content, '__call__'): + content = content() + for event in _ensure(content): + yield None, event + + +class ReplaceTransformation(InjectorTransformation): + """Replace selection with content.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + stream = PushBackStream(stream) + for mark, event in stream: + if mark is not None: + start = mark + for subevent in self._inject(): + yield subevent + for mark, event in stream: + if start is ENTER: + if mark is EXIT: + break + elif mark != start: + stream.push((mark, event)) + break + else: + yield mark, event + + +class BeforeTransformation(InjectorTransformation): + """Insert content before selection.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + stream = PushBackStream(stream) + for mark, event in stream: + if mark is not None: + start = mark + for subevent in self._inject(): + yield subevent + yield mark, event + for mark, event in stream: + if mark != start and start is not ENTER: + stream.push((mark, event)) + break + yield mark, event + if start is ENTER and mark is EXIT: + break + else: + yield mark, event + + +class AfterTransformation(InjectorTransformation): + """Insert content after selection.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. 
+ + :param stream: The marked event stream to filter + """ + stream = PushBackStream(stream) + for mark, event in stream: + yield mark, event + if mark: + start = mark + for mark, event in stream: + if start is not ENTER and mark != start: + stream.push((mark, event)) + break + yield mark, event + if start is ENTER and mark is EXIT: + break + for subevent in self._inject(): + yield subevent + + +class PrependTransformation(InjectorTransformation): + """Prepend content to the inside of selected elements.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + for mark, event in stream: + yield mark, event + if mark is ENTER: + for subevent in self._inject(): + yield subevent + + +class AppendTransformation(InjectorTransformation): + """Append content after the content of selected elements.""" + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: The marked event stream to filter + """ + for mark, event in stream: + yield mark, event + if mark is ENTER: + for mark, event in stream: + if mark is EXIT: + break + yield mark, event + for subevent in self._inject(): + yield subevent + yield mark, event + + +class AttrTransformation(object): + """Set an attribute on selected elements.""" + + def __init__(self, name, value): + """Construct transform. + + :param name: name of the attribute that should be set + :param value: the value to set + """ + self.name = name + self.value = value + + def __call__(self, stream): + """Apply the transform filter to the marked stream. 
+ + :param stream: The marked event stream to filter + """ + callable_value = hasattr(self.value, '__call__') + for mark, (kind, data, pos) in stream: + if mark is ENTER: + if callable_value: + value = self.value(self.name, (kind, data, pos)) + else: + value = self.value + if value is None: + attrs = data[1] - [QName(self.name)] + else: + attrs = data[1] | [(QName(self.name), value)] + data = (data[0], attrs) + yield mark, (kind, data, pos) + + + +class StreamBuffer(Stream): + """Stream event buffer used for cut and copy transformations.""" + + def __init__(self): + """Create the buffer.""" + Stream.__init__(self, []) + + def append(self, event): + """Add an event to the buffer. + + :param event: the markup event to add + """ + self.events.append(event) + + def reset(self): + """Empty the buffer of events.""" + del self.events[:] + + +class CopyTransformation(object): + """Copy selected events into a buffer for later insertion.""" + + def __init__(self, buffer, accumulate=False): + """Create the copy transformation. + + :param buffer: the `StreamBuffer` in which the selection should be + stored + """ + if not accumulate: + buffer.reset() + self.buffer = buffer + self.accumulate = accumulate + + def __call__(self, stream): + """Apply the transformation to the marked stream. + + :param stream: the marked event stream to filter + """ + stream = PushBackStream(stream) + + for mark, event in stream: + if mark: + if not self.accumulate: + self.buffer.reset() + events = [(mark, event)] + self.buffer.append(event) + start = mark + for mark, event in stream: + if start is not ENTER and mark != start: + stream.push((mark, event)) + break + events.append((mark, event)) + self.buffer.append(event) + if start is ENTER and mark is EXIT: + break + for i in events: + yield i + else: + yield mark, event + + +class CutTransformation(object): + """Cut selected events into a buffer for later insertion and remove the + selection. 
+ """ + + def __init__(self, buffer, accumulate=False): + """Create the cut transformation. + + :param buffer: the `StreamBuffer` in which the selection should be + stored + """ + self.buffer = buffer + self.accumulate = accumulate + + + def __call__(self, stream): + """Apply the transform filter to the marked stream. + + :param stream: the marked event stream to filter + """ + attributes = [] + stream = PushBackStream(stream) + broken = False + if not self.accumulate: + self.buffer.reset() + for mark, event in stream: + if mark: + # Send a BREAK event if there was no other event sent between + if not self.accumulate: + if not broken and self.buffer: + yield BREAK, (BREAK, None, None) + self.buffer.reset() + self.buffer.append(event) + start = mark + if mark is ATTR: + attributes.extend([name for name, _ in event[1][1]]) + for mark, event in stream: + if start is mark is ATTR: + attributes.extend([name for name, _ in event[1][1]]) + # Handle non-element contiguous selection + if start is not ENTER and mark != start: + # Operating on the attributes of a START event + if start is ATTR: + kind, data, pos = event + assert kind is START + data = (data[0], data[1] - attributes) + attributes = None + stream.push((mark, (kind, data, pos))) + else: + stream.push((mark, event)) + break + self.buffer.append(event) + if start is ENTER and mark is EXIT: + break + broken = False + else: + broken = True + yield mark, event + if not broken and self.buffer: + yield BREAK, (BREAK, None, None) |