def encode(iterator, method='xml', encoding='utf-8', out=None):
    """Encode serializer output into a string.

    :param iterator: the iterator returned from serializing a stream (basically
                     any iterator that yields unicode objects)
    :param method: the serialization method; determines how characters not
                   representable in the specified encoding are treated. This
                   may be the string ``"text"``, a `TextSerializer` instance,
                   or the `TextSerializer` class (or a subclass of it); all of
                   these select plain ``replace`` error handling, anything
                   else selects XML character references
    :param encoding: how the output string should be encoded; if set to `None`,
                     this method returns a `unicode` object
    :param out: a file-like object that the output should be written to
                instead of being returned as one big string; note that if
                this is a file or socket (or similar), the `encoding` must
                not be `None` (that is, the output must be encoded)
    :return: a `str` or `unicode` object (depending on the `encoding`
             parameter), or `None` if the `out` parameter is provided

    :since: version 0.4.1
    :note: Changed in 0.5: added the `out` parameter
    """
    if encoding is not None:
        # ``method`` may be a serializer *class* rather than an instance, in
        # which case a plain isinstance() check would miss it and plain-text
        # output would end up containing XML character references.
        text_output = (
            method == 'text'
            or isinstance(method, TextSerializer)
            or (isinstance(method, type)
                and issubclass(method, TextSerializer))
        )
        if text_output:
            errors = 'replace'
        else:
            errors = 'xmlcharrefreplace'

        def _encode(string):
            return string.encode(encoding, errors)
    else:
        def _encode(string):
            return string

    if out is None:
        # join() consumes any iterable directly; no intermediate list needed.
        return _encode(''.join(iterator))
    for chunk in iterator:
        out.write(_encode(chunk))
class DocType(object):
    """Defines a number of commonly used DOCTYPE declarations as constants.

    Every constant is a ``(name, pubid, sysid)`` tuple.
    """

    HTML_STRICT = (
        'html', '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd'
    )
    HTML_TRANSITIONAL = (
        'html', '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd'
    )
    HTML_FRAMESET = (
        'html', '-//W3C//DTD HTML 4.01 Frameset//EN',
        'http://www.w3.org/TR/html4/frameset.dtd'
    )
    HTML = HTML_STRICT

    HTML5 = ('html', None, None)

    XHTML_STRICT = (
        'html', '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
    )
    XHTML_TRANSITIONAL = (
        'html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
    )
    XHTML_FRAMESET = (
        'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
    )
    XHTML = XHTML_STRICT

    XHTML11 = (
        'html', '-//W3C//DTD XHTML 1.1//EN',
        'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
    )

    SVG_FULL = (
        'svg', '-//W3C//DTD SVG 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd'
    )
    SVG_BASIC = (
        'svg', '-//W3C//DTD SVG Basic 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd'
    )
    SVG_TINY = (
        'svg', '-//W3C//DTD SVG Tiny 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd'
    )
    SVG = SVG_FULL

    @classmethod
    def get(cls, name):
        """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
        declaration for the specified name.

        The lookup is case-insensitive. The following names are recognized in
        this version:
         * "html" or "html-strict" for the HTML 4.01 strict DTD
         * "html-transitional" for the HTML 4.01 transitional DTD
         * "html-frameset" for the HTML 4.01 frameset DTD
         * "html5" for the ``DOCTYPE`` proposed for HTML5
         * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
         * "xhtml-transitional" for the XHTML 1.0 transitional DTD
         * "xhtml-frameset" for the XHTML 1.0 frameset DTD
         * "xhtml11" for the XHTML 1.1 DTD
         * "svg" or "svg-full" for the SVG 1.1 DTD
         * "svg-basic" for the SVG Basic 1.1 DTD
         * "svg-tiny" for the SVG Tiny 1.1 DTD

        :param name: the name of the ``DOCTYPE``
        :return: the ``(name, pubid, sysid)`` tuple for the requested
                 ``DOCTYPE``, or ``None`` if the name is not recognized
        :since: version 0.4.1
        """
        # Every entry is resolved through ``cls`` so that constants overridden
        # in a subclass are honoured uniformly.
        return {
            'html': cls.HTML, 'html-strict': cls.HTML_STRICT,
            'html-transitional': cls.HTML_TRANSITIONAL,
            'html-frameset': cls.HTML_FRAMESET,
            'html5': cls.HTML5,
            'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
            'xhtml-transitional': cls.XHTML_TRANSITIONAL,
            'xhtml-frameset': cls.XHTML_FRAMESET,
            'xhtml11': cls.XHTML11,
            'svg': cls.SVG, 'svg-full': cls.SVG_FULL,
            'svg-basic': cls.SVG_BASIC,
            'svg-tiny': cls.SVG_TINY
        }.get(name.lower())


def __init__(self, doctype=None, strip_whitespace=True,
             namespace_prefixes=None, cache=True):
    """Set up the filter chain used by the XML serializer.

    :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                    DOCTYPE declaration that should be included at the top
                    of the generated output, or the name of a DOCTYPE as
                    defined in `DocType.get`
    :param strip_whitespace: whether extraneous whitespace should be
                             stripped from the output
    :param namespace_prefixes: optional mapping of namespace URIs to
                               prefixes, handed to the `NamespaceFlattener`
                               filter
    :param cache: whether to cache the text output per event, which
                  improves performance for repetitive markup
    :note: Changed in 0.4.2: The `doctype` parameter can now be a string.
    :note: Changed in 0.6: The `cache` parameter was added
    """
    # Filters are applied to the stream in list order.
    self.filters = chain_ = [EmptyTagFilter()]
    if strip_whitespace:
        whitespace = WhitespaceFilter(self._PRESERVE_SPACE)
        chain_.append(whitespace)
    flattener = NamespaceFlattener(prefixes=namespace_prefixes, cache=cache)
    chain_.append(flattener)
    if doctype:
        chain_.append(DocTypeInserter(doctype))
    self.cache = cache
is XML_DECL and not have_decl: + version, encoding, standalone = data + buf = ['\n') + yield Markup(''.join(buf)) + have_decl = True + + elif kind is DOCTYPE and not have_doctype: + name, pubid, sysid = data + buf = ['\n') + yield Markup(''.join(buf)) % tuple([p for p in data if p]) + have_doctype = True + + elif kind is START_CDATA: + yield Markup('') + in_cdata = False + + elif kind is PI: + yield _emit(kind, data, Markup('' % data)) + + +class XHTMLSerializer(XMLSerializer): + """Produces XHTML text from an event stream. + + >>> from genshi.builder import tag + >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) + >>> print(''.join(XHTMLSerializer()(elem.generate()))) +


def __init__(self, doctype=None, strip_whitespace=True,
             namespace_prefixes=None, drop_xml_decl=True, cache=True):
    """Initialize the XHTML serializer.

    :param doctype: DOCTYPE to insert at the top of the output; passed to
                    `DocTypeInserter` when truthy
    :param strip_whitespace: whether extraneous whitespace should be
                             stripped from the output
    :param namespace_prefixes: optional mapping of namespace URIs to
                               prefixes; the XHTML namespace is always
                               mapped to the empty prefix. The mapping
                               passed in is not modified
    :param drop_xml_decl: when true, XML declaration events are not
                          written to the output
    :param cache: whether to cache the text output per event
    """
    # The base initializer is still invoked; the filter list it builds is
    # replaced right below with the XHTML-specific chain.
    super(XHTMLSerializer, self).__init__(doctype, False)
    self.filters = [EmptyTagFilter()]
    if strip_whitespace:
        self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
    # Work on a private copy so the caller's dictionary is never mutated.
    prefixes = dict(namespace_prefixes or {})
    prefixes['http://www.w3.org/1999/xhtml'] = ''
    self.filters.append(NamespaceFlattener(prefixes=prefixes,
                                           cache=cache))
    if doctype:
        self.filters.append(DocTypeInserter(doctype))
    self.drop_xml_decl = drop_xml_decl
    self.cache = cache
if tag in empty_elems: + buf.append(' />') + else: + buf.append('>' % tag) + else: + buf.append('>') + yield _emit(kind, data, Markup(''.join(buf))) + + elif kind is END: + yield _emit(kind, data, Markup('' % data)) + + elif kind is TEXT: + if in_cdata: + yield _emit(kind, data, data) + else: + yield _emit(kind, data, escape(data, quotes=False)) + + elif kind is COMMENT: + yield _emit(kind, data, Markup('' % data)) + + elif kind is DOCTYPE and not have_doctype: + name, pubid, sysid = data + buf = ['\n') + yield Markup(''.join(buf)) % tuple([p for p in data if p]) + have_doctype = True + + elif kind is XML_DECL and not have_decl and not drop_xml_decl: + version, encoding, standalone = data + buf = ['\n') + yield Markup(''.join(buf)) + have_decl = True + + elif kind is START_CDATA: + yield Markup('') + in_cdata = False + + elif kind is PI: + yield _emit(kind, data, Markup('' % data)) + + +class HTMLSerializer(XHTMLSerializer): + """Produces HTML text from an event stream. + + >>> from genshi.builder import tag + >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) + >>> print(''.join(HTMLSerializer()(elem.generate()))) +


def __init__(self, doctype=None, strip_whitespace=True, cache=True):
    """Initialize the HTML serializer.

    :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                    DOCTYPE declaration that should be included at the top
                    of the generated output
    :param strip_whitespace: whether extraneous whitespace should be
                             stripped from the output
    :param cache: whether to cache the text output per event, which
                  improves performance for repetitive markup
    :note: Changed in 0.6: The `cache` parameter was added
    """
    # The base initializer is still invoked; the filter list it builds is
    # replaced right below with the HTML-specific chain.
    super(HTMLSerializer, self).__init__(doctype, False)
    self.filters = [EmptyTagFilter()]
    if strip_whitespace:
        self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
                                             self._NOESCAPE_ELEMS))
    self.filters.append(NamespaceFlattener(prefixes={
        'http://www.w3.org/1999/xhtml': ''
    }, cache=cache))
    if doctype:
        self.filters.append(DocTypeInserter(doctype))
    # Store the caller's choice (previously hard-coded to True), so that
    # ``cache=False`` also disables the serializer's own output cache and
    # not only the one inside the namespace flattener.
    self.cache = cache
not in attrib: + buf += [' lang="', escape(value), '"'] + elif attr != 'xmlns': + buf += [' ', attr, '="', escape(value), '"'] + buf.append('>') + if kind is EMPTY: + if tag not in empty_elems: + buf.append('' % tag) + yield _emit(kind, data, Markup(''.join(buf))) + if tag in noescape_elems: + noescape = True + + elif kind is END: + yield _emit(kind, data, Markup('' % data)) + noescape = False + + elif kind is TEXT: + if noescape: + yield _emit(kind, data, data) + else: + yield _emit(kind, data, escape(data, quotes=False)) + + elif kind is COMMENT: + yield _emit(kind, data, Markup('' % data)) + + elif kind is DOCTYPE and not have_doctype: + name, pubid, sysid = data + buf = ['\n') + yield Markup(''.join(buf)) % tuple([p for p in data if p]) + have_doctype = True + + elif kind is PI: + yield _emit(kind, data, Markup('' % data)) + + +class TextSerializer(object): + """Produces plain text from an event stream. + + Only text events are included in the output. Unlike the other serializer, + special XML characters are not escaped: + + >>> from genshi.builder import tag + >>> elem = tag.div(tag.a('', href='foo'), tag.br) + >>> print(elem) +
<Hello!>
+ >>> print(''.join(TextSerializer()(elem.generate()))) + + + If text events contain literal markup (instances of the `Markup` class), + that markup is by default passed through unchanged: + + >>> elem = tag.div(Markup('Hello & Bye!
')) + >>> print(elem.generate().render(TextSerializer, encoding=None)) + Hello & Bye!
class EmptyTagFilter(object):
    """Combines `START` and `END` events into `EMPTY` events for elements that
    have no contents.
    """

    EMPTY = StreamEventKind('EMPTY')

    def __call__(self, stream):
        """Filter `stream`, collapsing each `START` event that is immediately
        followed by an `END` event into a single `EMPTY` event.

        :param stream: an iterable of ``(kind, data, pos)`` events
        :return: a generator over the filtered events; the `EMPTY` event
                 carries the data and position of the `START` it replaces
        """
        empty_kind = self.EMPTY
        # A START event is held back until the next event shows whether the
        # element is empty.
        pending = None
        for event in stream:
            if pending is not None:
                if event[0] is END:
                    yield empty_kind, pending[1], pending[2]
                    pending = None
                    continue
                yield pending
                pending = None
            if event[0] is START:
                pending = event
            else:
                yield event
        # A stream that stops right after a START (for example a truncated
        # fragment) must not silently lose that event.
        if pending is not None:
            yield pending


EMPTY = EmptyTagFilter.EMPTY
def __call__(self, stream):
    """Rewrite `stream` so that it no longer carries namespace events.

    `START_NS` and `END_NS` events are consumed; qualified tag and attribute
    names are replaced by ``prefix:localname`` strings, and the required
    ``xmlns`` / ``xmlns:prefix`` attributes are added to the next element
    that is emitted. All other events are passed through unchanged.

    :param stream: an iterable of ``(kind, data, pos)`` events
    :return: a generator of ``(kind, data, pos)`` events
    """
    cache = {}
    cache_get = cache.get
    if self.cache:
        def _emit(kind, original, output, pos):
            cache[kind, original] = output
            return kind, output, pos
    else:
        def _emit(kind, original, output, pos):
            # Must be a full event triple, exactly as in the caching
            # variant: consumers unpack ``kind, data, pos``.
            return kind, output, pos

    # ``self.prefixes`` maps URI -> prefix; the two working tables below map
    # prefix -> stack of URIs and URI -> stack of prefixes respectively.
    prefixes = dict([(v, [k]) for k, v in self.prefixes.items()])
    namespaces = {XML_NAMESPACE.uri: ['xml']}

    def _push_ns(prefix, uri):
        namespaces.setdefault(uri, []).append(prefix)
        prefixes.setdefault(prefix, []).append(uri)
        # Cached output depends on the bindings in scope.
        cache.clear()

    def _pop_ns(prefix):
        uris = prefixes.get(prefix)
        uri = uris.pop()
        if not uris:
            del prefixes[prefix]
        if uri not in uris or uri != uris[-1]:
            uri_prefixes = namespaces[uri]
            uri_prefixes.pop()
            if not uri_prefixes:
                del namespaces[uri]
        cache.clear()
        return uri

    # Namespace declarations waiting to be attached to the next element.
    ns_attrs = []
    _push_ns_attr = ns_attrs.append

    def _make_ns_attr(prefix, uri):
        return 'xmlns%s' % (prefix and ':%s' % prefix or ''), uri

    def _gen_prefix():
        val = 0
        while 1:
            val += 1
            yield 'ns%d' % val
    _gen_prefix = _gen_prefix().next

    for kind, data, pos in stream:
        output = cache_get((kind, data))
        if output is not None:
            yield kind, output, pos

        elif kind is START or kind is EMPTY:
            tag, attrs = data

            tagname = tag.localname
            tagns = tag.namespace
            if tagns:
                if tagns in namespaces:
                    prefix = namespaces[tagns][-1]
                    if prefix:
                        tagname = '%s:%s' % (prefix, tagname)
                else:
                    # Undeclared element namespace: make it the default.
                    _push_ns_attr(('xmlns', tagns))
                    _push_ns('', tagns)

            new_attrs = []
            for attr, value in attrs:
                attrname = attr.localname
                attrns = attr.namespace
                if attrns:
                    if attrns not in namespaces:
                        # Undeclared attribute namespace: invent a prefix.
                        prefix = _gen_prefix()
                        _push_ns(prefix, attrns)
                        _push_ns_attr(('xmlns:%s' % prefix, attrns))
                    else:
                        prefix = namespaces[attrns][-1]
                    if prefix:
                        attrname = '%s:%s' % (prefix, attrname)
                new_attrs.append((attrname, value))

            yield _emit(kind, data, (tagname, Attrs(ns_attrs + new_attrs)),
                        pos)
            del ns_attrs[:]

        elif kind is END:
            tagname = data.localname
            tagns = data.namespace
            if tagns:
                prefix = namespaces[tagns][-1]
                if prefix:
                    tagname = '%s:%s' % (prefix, tagname)
            yield _emit(kind, data, tagname, pos)

        elif kind is START_NS:
            prefix, uri = data
            if uri not in namespaces:
                # NOTE(review): ``prefixes`` is keyed by prefix everywhere
                # else, but this lookup uses the URI as key - confirm
                # whether the configured prefix is meant to win here.
                prefix = prefixes.get(uri, [prefix])[-1]
                _push_ns_attr(_make_ns_attr(prefix, uri))
            _push_ns(prefix, uri)

        elif kind is END_NS:
            if data in prefixes:
                uri = _pop_ns(data)
                if ns_attrs:
                    # Drop a declaration that was never attached to an
                    # element.
                    attr = _make_ns_attr(data, uri)
                    if attr in ns_attrs:
                        ns_attrs.remove(attr)

        else:
            yield kind, data, pos