# -*- coding: utf-8 -*- # # Copyright (C) 2006-2009 Edgewall Software # All rights reserved. # # This software is licensed as described in the file COPYING, which # you should have received as part of this distribution. The terms # are also available at http://genshi.edgewall.org/wiki/License. # # This software consists of voluntary contributions made by many # individuals. For the exact contribution history, see the revision # history and logs, available at http://genshi.edgewall.org/log/. """This module provides different kinds of serialization methods for XML event streams. """ from itertools import chain import re from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \ START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] __docformat__ = 'restructuredtext en' def encode(iterator, method='xml', encoding='utf-8', out=None): """Encode serializer output into a string. :param iterator: the iterator returned from serializing a stream (basically any iterator that yields unicode objects) :param method: the serialization method; determines how characters not representable in the specified encoding are treated :param encoding: how the output string should be encoded; if set to `None`, this method returns a `unicode` object :param out: a file-like object that the output should be written to instead of being returned as one big string; note that if this is a file or socket (or similar), the `encoding` must not be `None` (that is, the output must be encoded) :return: a `str` or `unicode` object (depending on the `encoding` parameter), or `None` if the `out` parameter is provided :since: version 0.4.1 :note: Changed in 0.5: added the `out` parameter """ if encoding is not None: errors = 'replace' if method != 'text' and not isinstance(method, TextSerializer): errors = 'xmlcharrefreplace' _encode = lambda string: string.encode(encoding, errors) else: _encode = lambda string: string if out is None: return _encode(''.join(list(iterator))) for chunk in iterator: out.write(_encode(chunk)) def get_serializer(method='xml', **kwargs): """Return a serializer object for the given method. :param method: the serialization method; can be either "xml", "xhtml", "html", "text", or a custom serializer class Any additional keyword arguments are passed to the serializer, and thus depend on the `method` parameter value. :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer` :since: version 0.4.1 """ if isinstance(method, basestring): method = {'xml': XMLSerializer, 'xhtml': XHTMLSerializer, 'html': HTMLSerializer, 'text': TextSerializer}[method.lower()] return method(**kwargs) class DocType(object): """Defines a number of commonly used DOCTYPE declarations as constants.""" HTML_STRICT = ( 'html', '-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd' ) HTML_TRANSITIONAL = ( 'html', '-//W3C//DTD HTML 4.01 Transitional//EN', 'http://www.w3.org/TR/html4/loose.dtd' ) HTML_FRAMESET = ( 'html', '-//W3C//DTD HTML 4.01 Frameset//EN', 'http://www.w3.org/TR/html4/frameset.dtd' ) HTML = HTML_STRICT HTML5 = ('html', None, None) XHTML_STRICT = ( 'html', '-//W3C//DTD XHTML 1.0 Strict//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' ) XHTML_TRANSITIONAL = ( 'html', '-//W3C//DTD XHTML 1.0 Transitional//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' ) XHTML_FRAMESET = ( 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd' ) XHTML = XHTML_STRICT XHTML11 = ( 'html', '-//W3C//DTD XHTML 1.1//EN', 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' ) SVG_FULL = ( 'svg', '-//W3C//DTD SVG 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd' ) SVG_BASIC = ( 'svg', '-//W3C//DTD SVG Basic 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd' ) SVG_TINY = ( 'svg', '-//W3C//DTD SVG Tiny 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd' ) SVG = SVG_FULL @classmethod def get(cls, name): """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE`` declaration for the specified name. The following names are recognized in this version: * "html" or "html-strict" for the HTML 4.01 strict DTD * "html-transitional" for the HTML 4.01 transitional DTD * "html-frameset" for the HTML 4.01 frameset DTD * "html5" for the ``DOCTYPE`` proposed for HTML5 * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD * "xhtml-transitional" for the XHTML 1.0 transitional DTD * "xhtml-frameset" for the XHTML 1.0 frameset DTD * "xhtml11" for the XHTML 1.1 DTD * "svg" or "svg-full" for the SVG 1.1 DTD * "svg-basic" for the SVG Basic 1.1 DTD * "svg-tiny" for the SVG Tiny 1.1 DTD :param name: the name of the ``DOCTYPE`` :return: the ``(name, pubid, sysid)`` tuple for the requested ``DOCTYPE``, or ``None`` if the name is not recognized :since: version 0.4.1 """ return { 'html': cls.HTML, 'html-strict': cls.HTML_STRICT, 'html-transitional': DocType.HTML_TRANSITIONAL, 'html-frameset': DocType.HTML_FRAMESET, 'html5': cls.HTML5, 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT, 'xhtml-transitional': cls.XHTML_TRANSITIONAL, 'xhtml-frameset': cls.XHTML_FRAMESET, 'xhtml11': cls.XHTML11, 'svg': cls.SVG, 'svg-full': cls.SVG_FULL, 'svg-basic': cls.SVG_BASIC, 'svg-tiny': cls.SVG_TINY }.get(name.lower()) class XMLSerializer(object): """Produces XML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print(''.join(XMLSerializer()(elem.generate())))


""" _PRESERVE_SPACE = frozenset() def __init__(self, doctype=None, strip_whitespace=True, namespace_prefixes=None, cache=True): """Initialize the XML serializer. :param doctype: a ``(name, pubid, sysid)`` tuple that represents the DOCTYPE declaration that should be included at the top of the generated output, or the name of a DOCTYPE as defined in `DocType.get` :param strip_whitespace: whether extraneous whitespace should be stripped from the output :param cache: whether to cache the text output per event, which improves performance for repetitive markup :note: Changed in 0.4.2: The `doctype` parameter can now be a string. :note: Changed in 0.6: The `cache` parameter was added """ self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes, cache=cache)) if doctype: self.filters.append(DocTypeInserter(doctype)) self.cache = cache def __call__(self, stream): have_decl = have_doctype = False in_cdata = False cache = {} cache_get = cache.get if self.cache: def _emit(kind, input, output): cache[kind, input] = output return output else: def _emit(kind, input, output): return output for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: cached = cache_get((kind, data)) if cached is not None: yield cached elif kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: buf += [' ', attr, '="', escape(value), '"'] buf.append(kind is EMPTY and '/>' or '>') yield _emit(kind, data, Markup(''.join(buf))) elif kind is END: yield _emit(kind, data, Markup('' % data)) elif kind is TEXT: if in_cdata: yield _emit(kind, data, data) else: yield _emit(kind, data, escape(data, quotes=False)) elif kind is COMMENT: yield _emit(kind, data, Markup('' % data)) elif kind is XML_DECL and not have_decl: version, encoding, standalone = data buf = ['\n') yield Markup(''.join(buf)) have_decl = True elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['\n') yield Markup(''.join(buf)) % tuple([p for p in data if p]) have_doctype = True elif kind is START_CDATA: yield Markup('') in_cdata = False elif kind is PI: yield _emit(kind, data, Markup('' % data)) class XHTMLSerializer(XMLSerializer): """Produces XHTML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print(''.join(XHTMLSerializer()(elem.generate())))


""" _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param']) _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', 'defer', 'disabled', 'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap']) _PRESERVE_SPACE = frozenset([ QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') ]) def __init__(self, doctype=None, strip_whitespace=True, namespace_prefixes=None, drop_xml_decl=True, cache=True): super(XHTMLSerializer, self).__init__(doctype, False) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) namespace_prefixes = namespace_prefixes or {} namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes, cache=cache)) if doctype: self.filters.append(DocTypeInserter(doctype)) self.drop_xml_decl = drop_xml_decl self.cache = cache def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS drop_xml_decl = self.drop_xml_decl have_decl = have_doctype = False in_cdata = False cache = {} cache_get = cache.get if self.cache: def _emit(kind, input, output): cache[kind, input] = output return output else: def _emit(kind, input, output): return output for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: cached = cache_get((kind, data)) if cached is not None: yield cached elif kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: if attr in boolean_attrs: value = attr elif attr == 'xml:lang' and 'lang' not in attrib: buf += [' lang="', escape(value), '"'] elif attr == 'xml:space': continue buf += [' ', attr, '="', escape(value), '"'] if kind is EMPTY: if tag in empty_elems: buf.append(' />') else: buf.append('>' % tag) else: buf.append('>') yield _emit(kind, data, Markup(''.join(buf))) elif kind is END: yield _emit(kind, data, Markup('' % data)) elif kind is TEXT: if in_cdata: yield _emit(kind, data, data) else: yield _emit(kind, data, escape(data, quotes=False)) elif kind is COMMENT: yield _emit(kind, data, Markup('' % data)) elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['\n') yield Markup(''.join(buf)) % tuple([p for p in data if p]) have_doctype = True elif kind is XML_DECL and not have_decl and not drop_xml_decl: version, encoding, standalone = data buf = ['\n') yield Markup(''.join(buf)) have_decl = True elif kind is START_CDATA: yield Markup('') in_cdata = False elif kind is PI: yield _emit(kind, data, Markup('' % data)) class HTMLSerializer(XHTMLSerializer): """Produces HTML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print(''.join(HTMLSerializer()(elem.generate())))


""" _NOESCAPE_ELEMS = frozenset([ QName('script'), QName('http://www.w3.org/1999/xhtml}script'), QName('style'), QName('http://www.w3.org/1999/xhtml}style') ]) def __init__(self, doctype=None, strip_whitespace=True, cache=True): """Initialize the HTML serializer. :param doctype: a ``(name, pubid, sysid)`` tuple that represents the DOCTYPE declaration that should be included at the top of the generated output :param strip_whitespace: whether extraneous whitespace should be stripped from the output :param cache: whether to cache the text output per event, which improves performance for repetitive markup :note: Changed in 0.6: The `cache` parameter was added """ super(HTMLSerializer, self).__init__(doctype, False) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, self._NOESCAPE_ELEMS)) self.filters.append(NamespaceFlattener(prefixes={ 'http://www.w3.org/1999/xhtml': '' }, cache=cache)) if doctype: self.filters.append(DocTypeInserter(doctype)) self.cache = True def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS noescape_elems = self._NOESCAPE_ELEMS have_doctype = False noescape = False cache = {} cache_get = cache.get if self.cache: def _emit(kind, input, output): cache[kind, input] = output return output else: def _emit(kind, input, output): return output for filter_ in self.filters: stream = filter_(stream) for kind, data, _ in stream: output = cache_get((kind, data)) if output is not None: yield output if (kind is START or kind is EMPTY) \ and data[0] in noescape_elems: noescape = True elif kind is END: noescape = False elif kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: if attr in boolean_attrs: if value: buf += [' ', attr] elif ':' in attr: if attr == 'xml:lang' and 'lang' not in attrib: buf += [' lang="', escape(value), '"'] elif attr != 'xmlns': buf += [' ', attr, '="', escape(value), '"'] buf.append('>') if kind is EMPTY: if tag not in empty_elems: buf.append('' % tag) yield _emit(kind, data, Markup(''.join(buf))) if tag in noescape_elems: noescape = True elif kind is END: yield _emit(kind, data, Markup('' % data)) noescape = False elif kind is TEXT: if noescape: yield _emit(kind, data, data) else: yield _emit(kind, data, escape(data, quotes=False)) elif kind is COMMENT: yield _emit(kind, data, Markup('' % data)) elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['\n') yield Markup(''.join(buf)) % tuple([p for p in data if p]) have_doctype = True elif kind is PI: yield _emit(kind, data, Markup('' % data)) class TextSerializer(object): """Produces plain text from an event stream. Only text events are included in the output. Unlike the other serializer, special XML characters are not escaped: >>> from genshi.builder import tag >>> elem = tag.div(tag.a('', href='foo'), tag.br) >>> print(elem)
<Hello!>
>>> print(''.join(TextSerializer()(elem.generate()))) If text events contain literal markup (instances of the `Markup` class), that markup is by default passed through unchanged: >>> elem = tag.div(Markup('Hello & Bye!
')) >>> print(elem.generate().render(TextSerializer, encoding=None)) Hello & Bye!
You can use the ``strip_markup`` to change this behavior, so that tags and entities are stripped from the output (or in the case of entities, replaced with the equivalent character): >>> print(elem.generate().render(TextSerializer, strip_markup=True, ... encoding=None)) Hello & Bye! """ def __init__(self, strip_markup=False): """Create the serializer. :param strip_markup: whether markup (tags and encoded characters) found in the text should be removed """ self.strip_markup = strip_markup def __call__(self, stream): strip_markup = self.strip_markup for event in stream: if event[0] is TEXT: data = event[1] if strip_markup and type(data) is Markup: data = data.striptags().stripentities() yield unicode(data) class EmptyTagFilter(object): """Combines `START` and `STOP` events into `EMPTY` events for elements that have no contents. """ EMPTY = StreamEventKind('EMPTY') def __call__(self, stream): prev = (None, None, None) for ev in stream: if prev[0] is START: if ev[0] is END: prev = EMPTY, prev[1], prev[2] yield prev continue else: yield prev if ev[0] is not START: yield ev prev = ev EMPTY = EmptyTagFilter.EMPTY class NamespaceFlattener(object): r"""Output stream filter that removes namespace information from the stream, instead adding namespace attributes and prefixes as needed. :param prefixes: optional mapping of namespace URIs to prefixes >>> from genshi.input import XML >>> xml = XML(''' ... ... ''') >>> for kind, data, pos in NamespaceFlattener()(xml): ... print('%s %r' % (kind, data)) START (u'doc', Attrs([('xmlns', u'NS1'), (u'xmlns:two', u'NS2')])) TEXT u'\n ' START (u'two:item', Attrs()) END u'two:item' TEXT u'\n' END u'doc' """ def __init__(self, prefixes=None, cache=True): self.prefixes = {XML_NAMESPACE.uri: 'xml'} if prefixes is not None: self.prefixes.update(prefixes) self.cache = cache def __call__(self, stream): cache = {} cache_get = cache.get if self.cache: def _emit(kind, input, output, pos): cache[kind, input] = output return kind, output, pos else: def _emit(kind, input, output, pos): return output prefixes = dict([(v, [k]) for k, v in self.prefixes.items()]) namespaces = {XML_NAMESPACE.uri: ['xml']} def _push_ns(prefix, uri): namespaces.setdefault(uri, []).append(prefix) prefixes.setdefault(prefix, []).append(uri) cache.clear() def _pop_ns(prefix): uris = prefixes.get(prefix) uri = uris.pop() if not uris: del prefixes[prefix] if uri not in uris or uri != uris[-1]: uri_prefixes = namespaces[uri] uri_prefixes.pop() if not uri_prefixes: del namespaces[uri] cache.clear() return uri ns_attrs = [] _push_ns_attr = ns_attrs.append def _make_ns_attr(prefix, uri): return 'xmlns%s' % (prefix and ':%s' % prefix or ''), uri def _gen_prefix(): val = 0 while 1: val += 1 yield 'ns%d' % val _gen_prefix = _gen_prefix().next for kind, data, pos in stream: output = cache_get((kind, data)) if output is not None: yield kind, output, pos elif kind is START or kind is EMPTY: tag, attrs = data tagname = tag.localname tagns = tag.namespace if tagns: if tagns in namespaces: prefix = namespaces[tagns][-1] if prefix: tagname = '%s:%s' % (prefix, tagname) else: _push_ns_attr(('xmlns', tagns)) _push_ns('', tagns) new_attrs = [] for attr, value in attrs: attrname = attr.localname attrns = attr.namespace if attrns: if attrns not in namespaces: prefix = _gen_prefix() _push_ns(prefix, attrns) _push_ns_attr(('xmlns:%s' % prefix, attrns)) else: prefix = namespaces[attrns][-1] if prefix: attrname = '%s:%s' % (prefix, attrname) new_attrs.append((attrname, value)) yield _emit(kind, data, (tagname, Attrs(ns_attrs + new_attrs)), pos) del ns_attrs[:] elif kind is END: tagname = data.localname tagns = data.namespace if tagns: prefix = namespaces[tagns][-1] if prefix: tagname = '%s:%s' % (prefix, tagname) yield _emit(kind, data, tagname, pos) elif kind is START_NS: prefix, uri = data if uri not in namespaces: prefix = prefixes.get(uri, [prefix])[-1] _push_ns_attr(_make_ns_attr(prefix, uri)) _push_ns(prefix, uri) elif kind is END_NS: if data in prefixes: uri = _pop_ns(data) if ns_attrs: attr = _make_ns_attr(data, uri) if attr in ns_attrs: ns_attrs.remove(attr) else: yield kind, data, pos class WhitespaceFilter(object): """A filter that removes extraneous ignorable white space from the stream. """ def __init__(self, preserve=None, noescape=None): """Initialize the filter. :param preserve: a set or sequence of tag names for which white-space should be preserved :param noescape: a set or sequence of tag names for which text content should not be escaped The `noescape` set is expected to refer to elements that cannot contain further child elements (such as ``