diff options
Diffstat (limited to 'creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py')
-rw-r--r-- | creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py | 168 |
1 files changed, 168 insertions, 0 deletions
diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py new file mode 100644 index 0000000..9553349 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py @@ -0,0 +1,168 @@ +""" +From a Python file, expecting an RDF/XML pretty printed output:: + + import rdflib.graph as g + graph = g.Graph() + graph.parse('filename.html', format='rdfa') + print graph.serialize(format='pretty-xml') + +For details on RDFa, the reader should consult the `RDFa syntax document`__. + +This is an adapted version of pyRdfa (`W3C RDFa Distiller page`__) by Ivan Herman + +.. __: http://www.w3.org/TR/rdfa-syntax +.. __: http://www.w3.org/2007/08/pyRdfa/ + +""" + + +import sys +import urllib +import xml.dom.minidom + +from rdflib.term import URIRef +from rdflib.parser import Parser +from rdflib.plugins.parsers.rdfa.state import ExecutionContext +from rdflib.plugins.parsers.rdfa.parse import parse_one_node +from rdflib.plugins.parsers.rdfa.options import (Options, _add_to_comment_graph, + DIST_NS, ERROR, GENERIC_XML, XHTML_RDFA, HTML5_RDFA) + +from rdflib.plugins.parsers.rdfa.transform.headabout import head_about_transform + +__all__ = ['RDFaParser'] + +# These are part of the RDFa spec. +BUILT_IN_TRANSFORMERS = [ + head_about_transform +] + +# Exception handling. Essentially, all the different exceptions are re-packaged +# into separate exception class, to allow for an easier management on the user +# level +class RDFaError(Exception) : + """Just a wrapper around the local exceptions. It does not add any new + functionality to the Exception class.""" + pass + +# For some doctype and element name combinations an automatic switch to an +# input mode is done +_HOST_LANG = { + ("http://www.w3.org/1999/xhtml", "html"): XHTML_RDFA, + ("http://www.w3.org/2000/svg", "svg"): GENERIC_XML +} + + +class RDFaParser(Parser): + + def parse(self, source, sink, + warnings=False, space_preserve=True, + transformers=None, xhtml=True, lax=True, html5=False, encoding=None): + if transformers is None: + transformers = [] + options = Options(warnings, space_preserve, transformers, xhtml, lax) + baseURI = source.getPublicId() + stream = source.getByteStream() + if html5: + dom = _process_html5_source(stream, options, encoding) + else: + dom = _try_process_source(stream, options, encoding) + _process_DOM(dom, baseURI, sink, options) + + +def _process_DOM(dom, base, graph, options=None): + """ + Core processing. The transformers ("pre-processing") is done on the DOM + tree, the state is initialized, and the "real" RDFa parsing is done. + The result is put into the provided Graph. + + The real work is done in the parser function ``parse_one_node()``. + + Params: + dom -- XML DOM Tree node (for the top level) + base -- URI for the default "base" value (usually the URI of the file to be processed) + + Options: + obj -- `Options` for the distiller + raise RDFaError -- when called via CGI, this encapsulates the possible + exceptions raised by the RDFLib serializer or the processing itself + """ + html = dom.documentElement + # Perform the built-in and external transformations on the HTML tree. This is, + # in simulated form, the hGRDDL approach of Ben Adida. + for trans in options.transformers + BUILT_IN_TRANSFORMERS: + trans(html, options) + # Collect the initial state. This takes care of things + # like base, top level namespace settings, etc. + # Ensure the proper initialization. + state = ExecutionContext(html, graph, base=base, options=options) + # The top level subject starts with the current document; this + # is used by the recursion + subject = URIRef(state.base) + # Parse the whole thing recursively and fill the graph. + parse_one_node(html, graph, subject, state, []) + if options.comment_graph.graph != None: + # Add the content of the comment graph to the output. + graph.bind("dist", DIST_NS) + for t in options.comment_graph.graph: + graph.add(t) + +def _try_process_source(stream, options, encoding): + """ + Tries to parse input as xhtml, xml (e.g. svg) or html(5), modifying options + while figuring out input.. + + Returns a DOM tree. + """ + parse = xml.dom.minidom.parse + try: + dom = parse(stream) + # Try to second-guess the input type + # This is _not_ really kosher, but the minidom is not really namespace aware... + # In practice the goal is to have the system recognize svg content automatically + # First see if there is a default namespace defined for the document: + top = dom.documentElement + if top.hasAttribute("xmlns"): + key = (top.getAttribute("xmlns"), top.nodeName) + if key in _HOST_LANG: + options.host_language = _HOST_LANG[key] + return dom + except: + # XML Parsing error in the input + type, value, traceback = sys.exc_info() + if options.host_language == GENERIC_XML or options.lax == False: + raise RDFaError('Parsing error in input file: "%s"' % value) + + # XML Parsing error in the input + msg = "XHTML Parsing error in input file: %s. Falling back on the HTML5 parser" % value + if options != None and options.warnings: + options.comment_graph.add_warning(msg) + + # in Ivan's original code he reopened the stream if it was from urllib + if isinstance(stream, urllib.addinfourl): + stream = urllib.urlopen(stream.url) + + return _process_html5_source(stream, options, encoding) + + +def _process_html5_source(stream, options, encoding): + # Now try to see if and HTML5 parser is an alternative... + try: + from html5lib import HTMLParser, treebuilders + except ImportError: + # no alternative to the XHTML error, because HTML5 parser not available... + msg2 = 'XHTML Parsing error in input file: %s. Though parsing is lax, HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>' + raise RDFaError(msg2) + + parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom")) + parse = parser.parse + try: + dom = parse(stream, encoding) + # The host language has changed + options.host_language = HTML5_RDFA + except: + # Well, even the HTML5 parser could not do anything with this... + (type, value, traceback) = sys.exc_info() + msg2 = 'Parsing error in input file as HTML5: "%s"' % value + raise RDFaError, msg2 + + return dom |