Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/websdk/genshi/input.py
diff options
context:
space:
mode:
Diffstat (limited to 'websdk/genshi/input.py')
-rw-r--r--websdk/genshi/input.py443
1 files changed, 443 insertions, 0 deletions
diff --git a/websdk/genshi/input.py b/websdk/genshi/input.py
new file mode 100644
index 0000000..039e5e5
--- /dev/null
+++ b/websdk/genshi/input.py
@@ -0,0 +1,443 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2006-2009 Edgewall Software
+# All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://genshi.edgewall.org/wiki/License.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For the exact contribution history, see the revision
+# history and logs, available at http://genshi.edgewall.org/log/.
+
+"""Support for constructing markup streams from files, strings, or other
+sources.
+"""
+
+from itertools import chain
+import htmlentitydefs as entities
+import HTMLParser as html
+from StringIO import StringIO
+from xml.parsers import expat
+
+from genshi.core import Attrs, QName, Stream, stripentities
+from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
+ END_NS, START_CDATA, END_CDATA, PI, COMMENT
+
+__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
+__docformat__ = 'restructuredtext en'
+
+
+def ET(element):
+ """Convert a given ElementTree element to a markup stream.
+
+ :param element: an ElementTree element
+ :return: a markup stream
+ """
+ tag_name = QName(element.tag.lstrip('{'))
+ attrs = Attrs([(QName(attr.lstrip('{')), value)
+ for attr, value in element.items()])
+
+ yield START, (tag_name, attrs), (None, -1, -1)
+ if element.text:
+ yield TEXT, element.text, (None, -1, -1)
+ for child in element.getchildren():
+ for item in ET(child):
+ yield item
+ yield END, tag_name, (None, -1, -1)
+ if element.tail:
+ yield TEXT, element.tail, (None, -1, -1)
+
+
+class ParseError(Exception):
+ """Exception raised when fatal syntax errors are found in the input being
+ parsed.
+ """
+
+ def __init__(self, message, filename=None, lineno=-1, offset=-1):
+ """Exception initializer.
+
+ :param message: the error message from the parser
+ :param filename: the path to the file that was parsed
+ :param lineno: the number of the line on which the error was encountered
+ :param offset: the column number where the error was encountered
+ """
+ self.msg = message
+ if filename:
+ message += ', in ' + filename
+ Exception.__init__(self, message)
+ self.filename = filename or '<string>'
+ self.lineno = lineno
+ self.offset = offset
+
+
+class XMLParser(object):
+ """Generator-based XML parser based on roughly equivalent code in
+ Kid/ElementTree.
+
+ The parsing is initiated by iterating over the parser object:
+
+ >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
+ >>> for kind, data, pos in parser:
+ ... print('%s %s' % (kind, data))
+ START (QName('root'), Attrs([(QName('id'), u'2')]))
+ START (QName('child'), Attrs())
+ TEXT Foo
+ END child
+ END root
+ """
+
+ _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
+ entities.name2codepoint.items()]
+ _external_dtd = '\n'.join(_entitydefs)
+
+ def __init__(self, source, filename=None, encoding=None):
+ """Initialize the parser for the given XML input.
+
+ :param source: the XML text as a file-like object
+ :param filename: the name of the file, if appropriate
+ :param encoding: the encoding of the file; if not specified, the
+ encoding is assumed to be ASCII, UTF-8, or UTF-16, or
+ whatever the encoding specified in the XML declaration
+ (if any)
+ """
+ self.source = source
+ self.filename = filename
+
+ # Setup the Expat parser
+ parser = expat.ParserCreate(encoding, '}')
+ parser.buffer_text = True
+ parser.returns_unicode = True
+ parser.ordered_attributes = True
+
+ parser.StartElementHandler = self._handle_start
+ parser.EndElementHandler = self._handle_end
+ parser.CharacterDataHandler = self._handle_data
+ parser.StartDoctypeDeclHandler = self._handle_doctype
+ parser.StartNamespaceDeclHandler = self._handle_start_ns
+ parser.EndNamespaceDeclHandler = self._handle_end_ns
+ parser.StartCdataSectionHandler = self._handle_start_cdata
+ parser.EndCdataSectionHandler = self._handle_end_cdata
+ parser.ProcessingInstructionHandler = self._handle_pi
+ parser.XmlDeclHandler = self._handle_xml_decl
+ parser.CommentHandler = self._handle_comment
+
+ # Tell Expat that we'll handle non-XML entities ourselves
+ # (in _handle_other)
+ parser.DefaultHandler = self._handle_other
+ parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
+ parser.UseForeignDTD()
+ parser.ExternalEntityRefHandler = self._build_foreign
+
+ self.expat = parser
+ self._queue = []
+
+ def parse(self):
+ """Generator that parses the XML source, yielding markup events.
+
+ :return: a markup event stream
+ :raises ParseError: if the XML text is not well formed
+ """
+ def _generate():
+ try:
+ bufsize = 4 * 1024 # 4K
+ done = False
+ while 1:
+ while not done and len(self._queue) == 0:
+ data = self.source.read(bufsize)
+ if data == '': # end of data
+ if hasattr(self, 'expat'):
+ self.expat.Parse('', True)
+ del self.expat # get rid of circular references
+ done = True
+ else:
+ if isinstance(data, unicode):
+ data = data.encode('utf-8')
+ self.expat.Parse(data, False)
+ for event in self._queue:
+ yield event
+ self._queue = []
+ if done:
+ break
+ except expat.ExpatError, e:
+ msg = str(e)
+ raise ParseError(msg, self.filename, e.lineno, e.offset)
+ return Stream(_generate()).filter(_coalesce)
+
+ def __iter__(self):
+ return iter(self.parse())
+
+ def _build_foreign(self, context, base, sysid, pubid):
+ parser = self.expat.ExternalEntityParserCreate(context)
+ parser.ParseFile(StringIO(self._external_dtd))
+ return 1
+
+ def _enqueue(self, kind, data=None, pos=None):
+ if pos is None:
+ pos = self._getpos()
+ if kind is TEXT:
+ # Expat reports the *end* of the text event as current position. We
+ # try to fix that up here as much as possible. Unfortunately, the
+ # offset is only valid for single-line text. For multi-line text,
+ # it is apparently not possible to determine at what offset it
+ # started
+ if '\n' in data:
+ lines = data.splitlines()
+ lineno = pos[1] - len(lines) + 1
+ offset = -1
+ else:
+ lineno = pos[1]
+ offset = pos[2] - len(data)
+ pos = (pos[0], lineno, offset)
+ self._queue.append((kind, data, pos))
+
+ def _getpos_unknown(self):
+ return (self.filename, -1, -1)
+
+ def _getpos(self):
+ return (self.filename, self.expat.CurrentLineNumber,
+ self.expat.CurrentColumnNumber)
+
+ def _handle_start(self, tag, attrib):
+ attrs = Attrs([(QName(name), value) for name, value in
+ zip(*[iter(attrib)] * 2)])
+ self._enqueue(START, (QName(tag), attrs))
+
+ def _handle_end(self, tag):
+ self._enqueue(END, QName(tag))
+
+ def _handle_data(self, text):
+ self._enqueue(TEXT, text)
+
+ def _handle_xml_decl(self, version, encoding, standalone):
+ self._enqueue(XML_DECL, (version, encoding, standalone))
+
+ def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
+ self._enqueue(DOCTYPE, (name, pubid, sysid))
+
+ def _handle_start_ns(self, prefix, uri):
+ self._enqueue(START_NS, (prefix or '', uri))
+
+ def _handle_end_ns(self, prefix):
+ self._enqueue(END_NS, prefix or '')
+
+ def _handle_start_cdata(self):
+ self._enqueue(START_CDATA)
+
+ def _handle_end_cdata(self):
+ self._enqueue(END_CDATA)
+
+ def _handle_pi(self, target, data):
+ self._enqueue(PI, (target, data))
+
+ def _handle_comment(self, text):
+ self._enqueue(COMMENT, text)
+
+ def _handle_other(self, text):
+ if text.startswith('&'):
+ # deal with undefined entities
+ try:
+ text = unichr(entities.name2codepoint[text[1:-1]])
+ self._enqueue(TEXT, text)
+ except KeyError:
+ filename, lineno, offset = self._getpos()
+ error = expat.error('undefined entity "%s": line %d, column %d'
+ % (text, lineno, offset))
+ error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
+ error.lineno = lineno
+ error.offset = offset
+ raise error
+
+
+def XML(text):
+ """Parse the given XML source and return a markup stream.
+
+ Unlike with `XMLParser`, the returned stream is reusable, meaning it can be
+ iterated over multiple times:
+
+ >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
+ >>> print(xml)
+ <doc><elem>Foo</elem><elem>Bar</elem></doc>
+ >>> print(xml.select('elem'))
+ <elem>Foo</elem><elem>Bar</elem>
+ >>> print(xml.select('elem/text()'))
+ FooBar
+
+ :param text: the XML source
+ :return: the parsed XML event stream
+ :raises ParseError: if the XML text is not well-formed
+ """
+ return Stream(list(XMLParser(StringIO(text))))
+
+
+class HTMLParser(html.HTMLParser, object):
+ """Parser for HTML input based on the Python `HTMLParser` module.
+
+ This class provides the same interface for generating stream events as
+ `XMLParser`, and attempts to automatically balance tags.
+
+ The parsing is initiated by iterating over the parser object:
+
+ >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
+ >>> for kind, data, pos in parser:
+ ... print('%s %s' % (kind, data))
+ START (QName('ul'), Attrs([(QName('compact'), u'compact')]))
+ START (QName('li'), Attrs())
+ TEXT Foo
+ END li
+ END ul
+ """
+
+ _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'link', 'meta',
+ 'param'])
+
+ def __init__(self, source, filename=None, encoding='utf-8'):
+ """Initialize the parser for the given HTML input.
+
+ :param source: the HTML text as a file-like object
+ :param filename: the name of the file, if known
+ :param filename: encoding of the file; ignored if the input is unicode
+ """
+ html.HTMLParser.__init__(self)
+ self.source = source
+ self.filename = filename
+ self.encoding = encoding
+ self._queue = []
+ self._open_tags = []
+
+ def parse(self):
+ """Generator that parses the HTML source, yielding markup events.
+
+ :return: a markup event stream
+ :raises ParseError: if the HTML text is not well formed
+ """
+ def _generate():
+ try:
+ bufsize = 4 * 1024 # 4K
+ done = False
+ while 1:
+ while not done and len(self._queue) == 0:
+ data = self.source.read(bufsize)
+ if data == '': # end of data
+ self.close()
+ done = True
+ else:
+ self.feed(data)
+ for kind, data, pos in self._queue:
+ yield kind, data, pos
+ self._queue = []
+ if done:
+ open_tags = self._open_tags
+ open_tags.reverse()
+ for tag in open_tags:
+ yield END, QName(tag), pos
+ break
+ except html.HTMLParseError, e:
+ msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
+ raise ParseError(msg, self.filename, e.lineno, e.offset)
+ return Stream(_generate()).filter(_coalesce)
+
+ def __iter__(self):
+ return iter(self.parse())
+
+ def _enqueue(self, kind, data, pos=None):
+ if pos is None:
+ pos = self._getpos()
+ self._queue.append((kind, data, pos))
+
+ def _getpos(self):
+ lineno, column = self.getpos()
+ return (self.filename, lineno, column)
+
+ def handle_starttag(self, tag, attrib):
+ fixed_attrib = []
+ for name, value in attrib: # Fixup minimized attributes
+ if value is None:
+ value = unicode(name)
+ elif not isinstance(value, unicode):
+ value = value.decode(self.encoding, 'replace')
+ fixed_attrib.append((QName(name), stripentities(value)))
+
+ self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
+ if tag in self._EMPTY_ELEMS:
+ self._enqueue(END, QName(tag))
+ else:
+ self._open_tags.append(tag)
+
+ def handle_endtag(self, tag):
+ if tag not in self._EMPTY_ELEMS:
+ while self._open_tags:
+ open_tag = self._open_tags.pop()
+ self._enqueue(END, QName(open_tag))
+ if open_tag.lower() == tag.lower():
+ break
+
+ def handle_data(self, text):
+ if not isinstance(text, unicode):
+ text = text.decode(self.encoding, 'replace')
+ self._enqueue(TEXT, text)
+
+ def handle_charref(self, name):
+ if name.lower().startswith('x'):
+ text = unichr(int(name[1:], 16))
+ else:
+ text = unichr(int(name))
+ self._enqueue(TEXT, text)
+
+ def handle_entityref(self, name):
+ try:
+ text = unichr(entities.name2codepoint[name])
+ except KeyError:
+ text = '&%s;' % name
+ self._enqueue(TEXT, text)
+
+ def handle_pi(self, data):
+ target, data = data.split(None, 1)
+ if data.endswith('?'):
+ data = data[:-1]
+ self._enqueue(PI, (target.strip(), data.strip()))
+
+ def handle_comment(self, text):
+ self._enqueue(COMMENT, text)
+
+
+def HTML(text, encoding='utf-8'):
+ """Parse the given HTML source and return a markup stream.
+
+ Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
+ iterated over multiple times:
+
+ >>> html = HTML('<body><h1>Foo</h1></body>')
+ >>> print(html)
+ <body><h1>Foo</h1></body>
+ >>> print(html.select('h1'))
+ <h1>Foo</h1>
+ >>> print(html.select('h1/text()'))
+ Foo
+
+ :param text: the HTML source
+ :return: the parsed XML event stream
+ :raises ParseError: if the HTML text is not well-formed, and error recovery
+ fails
+ """
+ return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+
+
+def _coalesce(stream):
+ """Coalesces adjacent TEXT events into a single event."""
+ textbuf = []
+ textpos = None
+ for kind, data, pos in chain(stream, [(None, None, None)]):
+ if kind is TEXT:
+ textbuf.append(data)
+ if textpos is None:
+ textpos = pos
+ else:
+ if textbuf:
+ yield TEXT, ''.join(textbuf), textpos
+ del textbuf[:]
+ textpos = None
+ if kind:
+ yield kind, data, pos