diff options
Diffstat (limited to 'creactistore/_templates/lib/rdflib/plugins/parsers/rdfa')
8 files changed, 1218 insertions, 0 deletions
diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py new file mode 100644 index 0000000..9553349 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/__init__.py @@ -0,0 +1,168 @@ +""" +From a Python file, expecting an RDF/XML pretty printed output:: + + import rdflib.graph as g + graph = g.Graph() + graph.parse('filename.html', format='rdfa') + print graph.serialize(format='pretty-xml') + +For details on RDFa, the reader should consult the `RDFa syntax document`__. + +This is an adapted version of pyRdfa (`W3C RDFa Distiller page`__) by Ivan Herman + +.. __: http://www.w3.org/TR/rdfa-syntax +.. __: http://www.w3.org/2007/08/pyRdfa/ + +""" + + +import sys +import urllib +import xml.dom.minidom + +from rdflib.term import URIRef +from rdflib.parser import Parser +from rdflib.plugins.parsers.rdfa.state import ExecutionContext +from rdflib.plugins.parsers.rdfa.parse import parse_one_node +from rdflib.plugins.parsers.rdfa.options import (Options, _add_to_comment_graph, + DIST_NS, ERROR, GENERIC_XML, XHTML_RDFA, HTML5_RDFA) + +from rdflib.plugins.parsers.rdfa.transform.headabout import head_about_transform + +__all__ = ['RDFaParser'] + +# These are part of the RDFa spec. +BUILT_IN_TRANSFORMERS = [ + head_about_transform +] + +# Exception handling. Essentially, all the different exceptions are re-packaged +# into separate exception class, to allow for an easier management on the user +# level +class RDFaError(Exception) : + """Just a wrapper around the local exceptions. 
class RDFaParser(Parser):
    """RDFLib parser plugin: turns the RDFa found in a document into triples."""

    def parse(self, source, sink,
              warnings=False, space_preserve=True,
              transformers=None, xhtml=True, lax=True, html5=False,
              encoding=None):
        """
        Parse the input behind C{source} and add the resulting triples to C{sink}.

        @param source: input source; its public id is used as base URI and its
            byte stream is the content that gets parsed
        @param sink: the graph handed to the core DOM processing step
        @param warnings: forwarded to L{Options}
        @param space_preserve: forwarded to L{Options}
        @param transformers: extra transformers forwarded to L{Options};
            C{None} stands for an empty list
        @param xhtml: forwarded to L{Options}
        @param lax: forwarded to L{Options}
        @param html5: if true, the HTML5 parser is used right away instead of
            first trying to read the input as XML
        @param encoding: forwarded to the function building the DOM tree
        """
        extra_transformers = [] if transformers is None else transformers
        options = Options(warnings, space_preserve, extra_transformers, xhtml, lax)
        base_uri = source.getPublicId()
        byte_stream = source.getByteStream()
        # Both DOM builders share one signature, so pick one and call it once.
        build_dom = _process_html5_source if html5 else _try_process_source
        dom = build_dom(byte_stream, options, encoding)
        _process_DOM(dom, base_uri, sink, options)
+ state = ExecutionContext(html, graph, base=base, options=options) + # The top level subject starts with the current document; this + # is used by the recursion + subject = URIRef(state.base) + # Parse the whole thing recursively and fill the graph. + parse_one_node(html, graph, subject, state, []) + if options.comment_graph.graph != None: + # Add the content of the comment graph to the output. + graph.bind("dist", DIST_NS) + for t in options.comment_graph.graph: + graph.add(t) + +def _try_process_source(stream, options, encoding): + """ + Tries to parse input as xhtml, xml (e.g. svg) or html(5), modifying options + while figuring out input.. + + Returns a DOM tree. + """ + parse = xml.dom.minidom.parse + try: + dom = parse(stream) + # Try to second-guess the input type + # This is _not_ really kosher, but the minidom is not really namespace aware... + # In practice the goal is to have the system recognize svg content automatically + # First see if there is a default namespace defined for the document: + top = dom.documentElement + if top.hasAttribute("xmlns"): + key = (top.getAttribute("xmlns"), top.nodeName) + if key in _HOST_LANG: + options.host_language = _HOST_LANG[key] + return dom + except: + # XML Parsing error in the input + type, value, traceback = sys.exc_info() + if options.host_language == GENERIC_XML or options.lax == False: + raise RDFaError('Parsing error in input file: "%s"' % value) + + # XML Parsing error in the input + msg = "XHTML Parsing error in input file: %s. Falling back on the HTML5 parser" % value + if options != None and options.warnings: + options.comment_graph.add_warning(msg) + + # in Ivan's original code he reopened the stream if it was from urllib + if isinstance(stream, urllib.addinfourl): + stream = urllib.urlopen(stream.url) + + return _process_html5_source(stream, options, encoding) + + +def _process_html5_source(stream, options, encoding): + # Now try to see if and HTML5 parser is an alternative... 
+ try: + from html5lib import HTMLParser, treebuilders + except ImportError: + # no alternative to the XHTML error, because HTML5 parser not available... + msg2 = 'XHTML Parsing error in input file: %s. Though parsing is lax, HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>' + raise RDFaError(msg2) + + parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom")) + parse = parser.parse + try: + dom = parse(stream, encoding) + # The host language has changed + options.host_language = HTML5_RDFA + except: + # Well, even the HTML5 parser could not do anything with this... + (type, value, traceback) = sys.exc_info() + msg2 = 'Parsing error in input file as HTML5: "%s"' % value + raise RDFaError, msg2 + + return dom diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/embeddedrdf.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/embeddedrdf.py new file mode 100644 index 0000000..4a9b015 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/embeddedrdf.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +""" +Extracting possible embedded RDF/XML content from the file and parse it separately into the Graph. This is used, for example +by U{SVG 1.2 Tiny<http://www.w3.org/TR/SVGMobile12/>}. + +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +@contact: Ivan Herman, ivan@w3.org +""" + +from StringIO import StringIO + +__all__ = ['handle_embeddedRDF'] + +def handle_embeddedRDF(node, graph, state): + """ + Check if the node is the top level rdf element for RDF/XML. If so, the content is parsed and added to the target graph. Note that if an separate + base is defined in the state, the C{xml:base} attribute will be added to the C{rdf} node before parsing. 
+ @param node: a DOM node for the top level xml element + @param graph: target rdf graph + @type graph: RDFLib's Graph object instance + @param state: the inherited state (namespaces, lang, etc) + @type state: L{State.ExecutionContext} + @return: whether an RDF/XML content has been detected or not. If TRUE, the RDFa processing should not occur on the node and its descendents. + @rtype: Boolean + + """ + if node.localName == "RDF" and node.namespaceURI == "http://www.w3.org/1999/02/22-rdf-syntax-ns#": + node.setAttribute("xml:base",state.base) + rdf = StringIO(node.toxml()) + graph.parse(rdf) + return True + else: + return False + diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/literal.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/literal.py new file mode 100644 index 0000000..2ab9b44 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/literal.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- +""" +Implementation of the Literal handling. Details of the algorithm are described on +U{RDFa Task Force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>}. + +@summary: RDFa Literal generation +@requires: U{RDFLib package<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +import re +from rdflib.namespace import RDF +from rdflib.term import Literal + +__all__ = ['generate_literal'] + +XMLLiteral = RDF.XMLLiteral + + +def __putBackEntities(str): + """Put 'back' entities for the '&', '<', and '>' characters, to produce kosher XML string. 
+ Used by XML Literal + @param str: string to be converted + @return: string with entities + @rtype: string + """ + return str.replace('&', '&').replace('<', '<').replace('>', '>') + +#### The real meat... +def generate_literal(node, graph, subject, state): + """Generate the literal the C{@property}, taking into account datatype, etc. + Note: this method is called only if the C{@property} is indeed present, no need to check. + + This method is an encoding of the algorithm documented + U{task force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>}. + + The method returns a value whether the literal is a 'normal' literal (regardless of its datatype) + or an XML Literal. The return value is True or False, respectively. This value is used to control whether + the parser should stop recursion. This also means that that if the literal is generated from @content, + the return value is False, regardless of the possible @datatype value. + + @param node: DOM element node + @param graph: the (RDF) graph to add the properies to + @param subject: the RDFLib URIRef serving as a subject for the generated triples + @param state: the current state to be used for the CURIE-s + @type state: L{State.ExecutionContext} + @return: whether the literal is a 'normal' or an XML Literal (return value is True or False, respectively). Note that if the literal is generated from @content, the return value is False, regardless of the possible @datatype value. + @rtype: Boolean + """ + def _get_literal(Pnode): + """ + Get (recursively) the full text from a DOM Node. + + @param Pnode: DOM Node + @return: string + """ + rc = "" + for node in Pnode.childNodes: + if node.nodeType == node.TEXT_NODE: + rc = rc + node.data + elif node.nodeType == node.ELEMENT_NODE: + rc = rc + _get_literal(node) + + # The decision of the group in February 2008 is not to normalize the result by default. 
+ # This is reflected in the default value of the option + if state.options.space_preserve: + return rc + else: + return re.sub(r'(\r| |\n|\t)+', " ", rc).strip() + # end getLiteral + + def _get_XML_literal(Pnode): + """ + Get (recursively) the XML Literal content of a DOM Node. (Most of the processing is done + via a C{node.toxml} call of the xml minidom implementation.) + + @param Pnode: DOM Node + @return: string + """ + def collectPrefixes(prefixes, node): + def addPf(prefx, string): + pf = string.split(':')[0] + if pf != string and pf not in prefx : prefx.append(pf) + # edn addPf + + # first the local name of the node + addPf(prefixes, node.tagName) + # get all the attributes and children + for child in node.childNodes: + if child.nodeType == node.ELEMENT_NODE: + collectPrefixes(prefixes, child) + elif child.nodeType == node.ATTRIBUTE_NODE: + addPf(prefixes, node.child.name) + # end collectPrefixes + + rc = "" + prefixes = [] + for node in Pnode.childNodes: + if node.nodeType == node.ELEMENT_NODE: + collectPrefixes(prefixes, node) + + for node in Pnode.childNodes: + if node.nodeType == node.TEXT_NODE: + rc = rc + __putBackEntities(node.data) + elif node.nodeType == node.ELEMENT_NODE: + # Decorate the element with namespaces and lang values + for prefix in prefixes: + if prefix in state.ns and not node.hasAttribute("xmlns:%s" % prefix): + node.setAttribute("xmlns:%s" % prefix, "%s" % state.ns[prefix]) + # Set the default namespace, if not done (and is available) + if not node.getAttribute("xmlns") and state.defaultNS != None: + node.setAttribute("xmlns", state.defaultNS) + # Get the lang, if necessary + if not node.getAttribute("xml:lang") and state.lang != None: + node.setAttribute("xml:lang", state.lang) + rc = rc + node.toxml() + return rc + # If XML Literals must be canonicalized for space, then this is the return line: + #return re.sub(r'(\r| |\n|\t)+', " ", rc).strip() + # end getXMLLiteral + + # Most of the times the literal is a 'normal' one, ie, not an 
XML Literal + retval = True + + # Get the Property URI-s + props = state.get_resources(node.getAttribute("property"), prop=True) + + # Get, if exists, the value of @datatype, and figure out the language + datatype = None + dtset = False + lang = state.lang + if node.hasAttribute("datatype"): + dtset = True + dt = node.getAttribute("datatype") + if dt != "": + datatype = state.get_resource(dt) + lang = None + + # The simple case: separate @content attribute + if node.hasAttribute("content"): + val = node.getAttribute("content") + object = Literal(node.getAttribute("content"), datatype=datatype, lang=lang) + # The value of datatype has been set, and the keyword paramaters take care of the rest + else: + # see if there *is* a datatype (even if it is empty!) + if dtset: + # yep. The Literal content is the pure text part of the current element: + # We have to check whether the specified datatype is, in fact, and + # explicit XML Literal + if datatype == XMLLiteral: + object = Literal(_get_XML_literal(node), datatype=XMLLiteral) + retval = False + else: + object = Literal(_get_literal(node), datatype=datatype, lang=lang) + else: + # no controlling @datatype. We have to see if there is markup in the contained + # element + if True in [ n.nodeType == node.ELEMENT_NODE for n in node.childNodes ]: + # yep, and XML Literal should be generated + object = Literal(_get_XML_literal(node), datatype=XMLLiteral) + retval = False + else: + val = _get_literal(node) + # At this point, there might be entities in the string that are returned as real characters by the dom + # implementation. That should be turned back + object = Literal(_get_literal(node), lang=lang) + + # NOTE: rdflib<2.5 didn't equal Literal with lang="", hence this check + # proably always passed? + # All tests pass with this check removed; going with that.. + ## The object may be empty, for example in an ill-defined <meta> element... 
+ if True:#object != "": + for prop in props: + graph.add((subject, prop, object)) + + return retval + diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/options.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/options.py new file mode 100644 index 0000000..0329969 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/options.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- +""" + +Options class: collect the possible options that govern the parsing possibilities. It also includes a reference and +handling of the extra Graph for warnings, informations, errors. + + +@summary: RDFa parser (distiller) +@requires: U{RDFLib<http://rdflib.net>} +@requires: U{html5lib<http://code.google.com/p/html5lib/>} for the HTML5 parsing; note possible dependecies on Python's version on the project's web site +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} + +""" + +import sys +from rdflib.graph import Graph +from rdflib.term import BNode, Literal, URIRef +from rdflib.namespace import Namespace + +__all__ = ['CommentGraph', 'Options'] + +DIST_URI = "http://www.w3.org/2007/08/pyRdfa/distiller" +DIST_NS = DIST_URI + '#' + +ns_errors = Namespace(DIST_NS) +distillerURI = URIRef(DIST_URI) + +WARNING = 'warning' +ERROR = 'error' +INFO = 'info' +DEBUG = 'debug' + +_message_properties = { + WARNING: ns_errors["warning"], + ERROR: ns_errors["error"], + INFO: ns_errors["information"], + DEBUG: ns_errors["debug"] +} + +def _add_to_comment_graph(graph, msg, prop, uri): + """ + Add a distiller message to the graph. 
class CommentGraph(object):
    """Class to handle the 'comment graph', ie, the (RDF) Graph containing the warnings,
    error messages, and informational messages.
    """
    def __init__(self, warnings=False):
        """
        @param warnings: whether a graph should effectively be set up, or whether this
        should just be an empty shell for the various calls to work (without effect)
        """
        if warnings:
            self.graph = Graph()
        else:
            self.graph = None
        # Messages received before the base URI is known are parked here
        self.accumulated_literals = []
        self.baseURI = None

    def _add_triple(self, msg, prop):
        """Store a message: queue it while the base URI is unknown, otherwise
        add it to the comment graph (if there is one).

        @param msg: the message text
        @param prop: one of the message kinds ('warning', 'error', 'info', 'debug')
        """
        obj = Literal(msg)
        # Identity tests: the values may be RDFLib objects with their own
        # notion of equality, what matters here is only whether they are set.
        if self.baseURI is None:
            self.accumulated_literals.append((obj, prop))
        elif self.graph is not None:
            _add_to_comment_graph(self.graph, obj, prop, self.baseURI)

    def set_base_URI(self, URI):
        """Set the base URI for the comment triples.

        Note that this method I{must} be called at some point to complete the triples. Without it the triples
        added via L{add_warning<CommentGraph.add_warning>}, L{add_info<CommentGraph.add_info>}, etc, will not be added to the final graph.

        @param URI: URIRef for the subject of the comments
        """
        self.baseURI = URI
        if self.graph is not None:
            for obj, prop in self.accumulated_literals:
                _add_to_comment_graph(self.graph, obj, prop, self.baseURI)
        # The queue is emptied even if there is no graph to receive its content
        self.accumulated_literals = []

    def add_warning(self, txt):
        """Add a warning. A comment triplet is added to the separate "warning" graph.
        @param txt: the warning text
        """
        self._add_triple(txt, WARNING)

    def add_info(self, txt):
        """Add an informational comment. A comment triplet is added to the separate "warning" graph.
        @param txt: the information text
        """
        self._add_triple(txt, INFO)

    def add_error(self, txt):
        """Add an error comment. A comment triplet is added to the separate "warning" graph.
        @param txt: the error text
        """
        self._add_triple(txt, ERROR)

    def _add_debug(self, txt):
        """Add a debug comment.
        @param txt: the debug text
        """
        self._add_triple(txt, DEBUG)


# Possible values of the host language (logically an enumeration)
GENERIC_XML = 0
XHTML_RDFA = 1
HTML5_RDFA = 2


class Options(object):
    """Settable options. An instance of this class is stored in
    the L{execution context<ExecutionContext>} of the parser.

    @ivar space_preserve: whether plain literals should preserve spaces at output or not
    @type space_preserve: Boolean
    @ivar comment_graph: Graph for the storage of warnings
    @type comment_graph: L{CommentGraph}
    @ivar warnings: whether warnings should be generated or not
    @type warnings: Boolean
    @ivar transformers: extra transformers
    @type transformers: list
    @ivar host_language: the host language for the RDFa attributes. Default is XHTML_RDFA, but it can be GENERIC_XML and HTML5_RDFA
    @type host_language: integer (logically: an enumeration)
    @ivar lax: whether a 'lax' parsing of XHTML (ie, HTML5) is allowed. This means that the value of the host language might change run time
    @type lax: Boolean
    """
    def __init__(self, warnings=False, space_preserve=True, transformers=None, xhtml=True, lax=False):
        """
        @param space_preserve: whether plain literals should preserve spaces at output or not
        @type space_preserve: Boolean
        @param warnings: whether warnings should be generated or not
        @type warnings: Boolean
        @param transformers: extra transformers; C{None} (the default) stands for no extra transformers
        @type transformers: list
        @param xhtml: initial value for the host language. If True, the value is set to XHTML_RDFA, otherwise to GENERIC_XML. Note that run-time the value might be set to HTML5_RDFA, depending on the value of the lax flag and the result of parsing.
        @type xhtml: Boolean
        @param lax: whether a 'lax' parsing of XHTML (ie, HTML5) is allowed. This means that the value of the host language might change run time
        @type lax: Boolean
        """
        self.space_preserve = space_preserve
        # A fresh list per instance: a literal [] as default value would be
        # shared by every Options object created without explicit transformers.
        if transformers is None:
            transformers = []
        self.transformers = transformers
        self.comment_graph = CommentGraph(warnings)
        self.warnings = warnings
        self.lax = lax
        if xhtml:
            self.host_language = XHTML_RDFA
        else:
            self.host_language = GENERIC_XML

    def __str__(self):
        """Return a human readable listing of the current option values."""
        retval = """Current options:
        space_preserve : %s
        warnings : %s
        lax parsing : %s
        host language : %s
        """
        return retval % (self.space_preserve, self.warnings, self.lax, self.host_language)
+ +@summary: RDFa core parser processing step +@requires: U{RDFLib package<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +from rdflib.term import BNode, URIRef +from rdflib.namespace import RDF + +from rdflib.plugins.parsers.rdfa.state import ExecutionContext +from rdflib.plugins.parsers.rdfa.literal import generate_literal +from rdflib.plugins.parsers.rdfa.embeddedrdf import handle_embeddedRDF +from rdflib.plugins.parsers.rdfa.options import GENERIC_XML, XHTML_RDFA, HTML5_RDFA + +__all__ = ['parse_one_node'] + +def parse_one_node(node, graph, parent_object, incoming_state, parent_incomplete_triples): + """The (recursive) step of handling a single node. See the + U{RDFa syntax document<http://www.w3.org/TR/rdfa-syntax>} for further details. + + @param node: the DOM node to handle + @param graph: the RDF graph + @type graph: RDFLib's Graph object instance + @param parent_object: the parent's object, as an RDFLib URIRef + @param incoming_state: the inherited state (namespaces, lang, etc) + @type incoming_state: L{State.ExecutionContext} + @param parent_incomplete_triples: list of hanging triples (the missing resource set to None) to be handled (or not) + by the current node. + @return: whether the caller has to complete it's parent's incomplete triples + @rtype: Boolean + """ + def _get_resources_for_attr(attr): + """Get a series of resources encoded via CURIE-s for an attribute on a specific node. 
+ @param attr: the name of the attribute + @return: a list of RDFLib URIRef instances + """ + if not node.hasAttribute(attr): + return [] + else: + rel = (attr == "rel") or (attr == "rev") + prop = (attr == "property") + return state.get_resources(node.getAttribute(attr), rel, prop) + + # Update the state. This means, for example, the possible local settings of + # namespaces and lang + state = ExecutionContext(node, graph, inherited_state=incoming_state) + + #--------------------------------------------------------------------------------- + # Handle the special case for embedded RDF, eg, in SVG1.2. + # This may add some triples to the target graph that does not originate from RDFa parsing + # If the function return TRUE, that means that an rdf:RDF has been found. No + # RDFa parsing should be done on that subtree, so we simply return... + if state.options.host_language == GENERIC_XML and node.nodeType == node.ELEMENT_NODE and handle_embeddedRDF(node, graph, state): + return + + #--------------------------------------------------------------------------------- + # First, let us check whether there is anything to do at all. Ie, + # whether there is any relevant RDFa specific attribute on the element + # + if not _has_one_of_attributes(node, "href", "resource", "about", "property", "rel", "rev", "typeof", "src"): + # nop, there is nothing to do here, just go down the tree and return... 
+ for n in node.childNodes: + if n.nodeType == node.ELEMENT_NODE : parse_one_node(n, graph, parent_object, state, parent_incomplete_triples) + return + + + #----------------------------------------------------------------- + # The goal is to establish the subject and object for local processing + # The behaviour is slightly different depending on the presense or not + # of the @rel/@rev attributes + current_subject = None + current_object = None + + if _has_one_of_attributes(node, "rel", "rev"): + # in this case there is the notion of 'left' and 'right' of @rel/@rev + # in establishing the new Subject and the objectResource + + # set first the subject + if node.hasAttribute("about"): + current_subject = state.get_Curie_ref(node.getAttribute("about")) + elif node.hasAttribute("src"): + current_subject = state.get_URI_ref(node.getAttribute("src")) + elif node.hasAttribute("typeof"): + current_subject = BNode() + + # get_URI_ref may return None in case of an illegal Curie, so + # we have to be careful here, not use only an 'else' + if current_subject == None: + current_subject = parent_object + + # set the object resource + if node.hasAttribute("resource"): + current_object = state.get_Curie_ref(node.getAttribute("resource")) + elif node.hasAttribute("href"): + current_object = state.get_URI_ref(node.getAttribute("href")) + else: + # in this case all the various 'resource' setting attributes + # behave identically, except that their value might be different + # in terms of CURIE-s and they also have their own priority, of course + if node.hasAttribute("about"): + current_subject = state.get_Curie_ref(node.getAttribute("about")) + elif node.hasAttribute("src"): + current_subject = state.get_URI_ref(node.getAttribute("src")) + elif node.hasAttribute("resource"): + current_subject = state.get_Curie_ref(node.getAttribute("resource")) + elif node.hasAttribute("href"): + current_subject = state.get_URI_ref(node.getAttribute("href")) + elif node.hasAttribute("typeof"): + 
current_subject = BNode() + + # get_URI_ref may return None in case of an illegal Curie, so + # we have to be careful here, not use only an 'else' + if current_subject == None: + current_subject = parent_object + + # in this case no non-literal triples will be generated, so the + # only role of the current_objectResource is to be transferred to + # the children node + current_object = current_subject + + # --------------------------------------------------------------------- + # The possible typeof indicates a number of type statements on the newSubject + for defined_type in _get_resources_for_attr("typeof"): + graph.add((current_subject, RDF.type, defined_type)) + + # --------------------------------------------------------------------- + # In case of @rel/@rev, either triples or incomplete triples are generated + # the (possible) incomplete triples are collected, to be forwarded to the children + incomplete_triples = [] + for prop in _get_resources_for_attr("rel"): + theTriple = (current_subject, prop, current_object) + if current_object != None: + graph.add(theTriple) + else: + incomplete_triples.append(theTriple) + for prop in _get_resources_for_attr("rev"): + theTriple = (current_object, prop, current_subject) + if current_object != None: + graph.add(theTriple) + else: + incomplete_triples.append(theTriple) + + # ---------------------------------------------------------------------- + # Generation of the literal values. The newSubject is the subject + # A particularity of property is that it stops the parsing down the DOM tree if an XML Literal is generated, + # because everything down there is part of the generated literal. For this purpose the recurse flag is set (and used later + # in the parsing process). + if node.hasAttribute("property"): + # Generate the literal. 
It has been put it into a separate module to make it more managable + # the overall return value should be set to true if any valid triple has been generated + recurse = generate_literal(node, graph, current_subject, state) + else: + recurse = True + + # ---------------------------------------------------------------------- + # Setting the current object to a bnode is setting up a possible resource + # for the incomplete triples downwards + if current_object == None: + object_to_children = BNode() + else: + object_to_children = current_object + + #----------------------------------------------------------------------- + # Here is the recursion step for all the children + if recurse: + for n in node.childNodes: + if n.nodeType == node.ELEMENT_NODE: + parse_one_node(n, graph, object_to_children, state, incomplete_triples) + + # --------------------------------------------------------------------- + # At this point, the parent's incomplete triples may be completed + for s, p, o in parent_incomplete_triples: + if s == None: s = current_subject + if o == None: o = current_subject + graph.add((s, p, o)) + + # ------------------------------------------------------------------- + # This should be it... + # ------------------------------------------------------------------- + return + + +def _has_one_of_attributes(node, *args): + """ + Check whether one of the listed attributes is present on a (DOM) node. + @param node: DOM element node + @param args: possible attribute names + @return: True or False + @rtype: Boolean + """ + return True in [ node.hasAttribute(attr) for attr in args ] + + diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py new file mode 100644 index 0000000..31caf41 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py @@ -0,0 +1,434 @@ +# -*- coding: utf-8 -*- +""" +Parser's execution context (a.k.a. state) object and handling. 
The state includes: + + - dictionary for namespaces. Keys are the namespace prefixes, values are RDFLib Namespace instances + - language, retrieved from C{@xml:lang} + - URI base, determined by <base> (or set explicitly). This is a little bit superfluous, because the current RDFa syntax does not make use of C{@xml:base}; ie, this could be a global value. But the structure is prepared to add C{@xml:base} easily, if needed. + - options, in the form of an L{Options<pyRdfa.Options>} instance + +The execution context object is also used to turn relative URI-s and CURIES into real URI references. + +@summary: RDFa core parser processing step +@requires: U{RDFLib package<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} + +@var XHTML_PREFIX: prefix for the XHTML vocabulary namespace +@var XHTML_URI: URI prefix of the XHTML vocabulary +@var RDFa_PROFILE: the official RDFa profile URI +@var RDFa_VERSION: the official version string of RDFa +@var usual_protocols: list of "usual" protocols (used to generate warnings when CURIES are not protected) +@var _predefined_rel: list of predefined C{@rev} and C{@rel} values that should be mapped onto the XHTML vocabulary URI-s. +@var _predefined_property: list of predefined C{@property} values that should be mapped onto the XHTML vocabulary URI-s. (At present, this list is empty, but this has been an ongoing question in the group, so the I{mechanism} of checking is still there.) +@var __bnodes: dictionary of blank node names to real blank node +@var __empty_bnode: I{The} Bnode to be associated with the CURIE of the form "C{_:}". 
from rdflib.namespace import Namespace, RDF, RDFS
from rdflib.term import BNode, URIRef
from rdflib.plugins.parsers.rdfa.options import Options, GENERIC_XML, XHTML_RDFA, HTML5_RDFA

import re
import random
import urlparse

__all__ = ['ExecutionContext']

RDFa_PROFILE = "http://www.w3.org/1999/xhtml/vocab"
RDFa_VERSION = "XHTML+RDFa 1.0"
RDFa_PublicID = "-//W3C//DTD XHTML+RDFa 1.0//EN"
RDFa_SystemID = "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd"

usual_protocols = ["http", "https", "mailto", "ftp", "urn", "gopher", "tel", "ldap", "doi", "news"]

####Predefined @rel/@rev/@property values
# predefined values for the @rel and @rev values. These are considered to be part of a specific
# namespace, defined by the RDFa document.
# At the moment, there are no predefined @property values, but the code is there in case
# some will be defined
XHTML_PREFIX = "xhv"
XHTML_URI = "http://www.w3.org/1999/xhtml/vocab#"

_predefined_rel = ['alternate', 'appendix', 'cite', 'bookmark', 'chapter', 'contents',
'copyright', 'glossary', 'help', 'icon', 'index', 'meta', 'next', 'p3pv1', 'prev',
'role', 'section', 'subsection', 'start', 'license', 'up', 'last', 'stylesheet', 'first', 'top']

_predefined_property = []

#### Managing blank nodes for CURIE-s
# Registry of the blank nodes handed out so far, keyed by the CURIE label
__bnodes = {}
# The single blank node standing for the bare "_:" CURIE
__empty_bnode = BNode()
def _get_bnode_from_Curie(var):
    """
    Map the label of a blank node CURIE (the C{XXX} part of C{_:XXX}) to a BNode.

    The same label always yields the same BNode: the first request for a
    label creates the node and records it in the module-level registry, later
    requests return the recorded node. The empty label is served by one
    dedicated, shared BNode.

    @param var: CURIE BNode identifier (the string after the colon)
    @return: BNode
    """
    if len(var) == 0:
        return __empty_bnode
    try:
        return __bnodes[var]
    except KeyError:
        fresh = BNode()
        __bnodes[var] = fresh
        return fresh

#### Quote URI-s
import urllib
# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other
# special characters are converted to their %..
# Characters that urllib.quote() must leave untouched when a URI is quoted;
# every other special character is converted to its %XX equivalent.
# The backslash belongs to the value: the historical literal ':/\?=#' relied
# on "\?" being an unrecognised escape, it is spelled out explicitly here.
_unquotedChars = ':/\\?=#'
# White space characters whose presence in a URI is reported as suspicious
_warnChars = [' ', '\n', '\r', '\t']


def _quote(uri, options):
    """
    'Quote' a URI, ie, exchange special characters for their '%..' equivalents.
    The characters listed in L{_unquotedChars} stay as they are. If one of the
    characters listed in L{_warnChars} is still in the URI after stripping the
    surrounding white space, one warning is added to the comment graph.

    @param uri: URI
    @param options: invocation options; if None, no warning is recorded
    @type options: L{Options<pyRdfa.Options>}
    @return: the stripped and quoted URI
    """
    suri = uri.strip()
    if options is not None and any(c in suri for c in _warnChars):
        options.comment_graph.add_warning('Unusual character in uri:%s; possible error?' % suri)
    return urllib.quote(suri, _unquotedChars)


#### Core Class definition
class ExecutionContext(object):
    """State at a specific node, including the current set
    of namespaces in the RDFLib sense, the
    current language, and the base. The class is also used to interpret URI-s and CURIE-s to produce
    URI references for RDFLib.

    @ivar options: reference to the overall options
    @type ivar: L{Options.Options}
    @ivar base: the 'base' URI
    @ivar defaultNS: default namespace
    @ivar lang: language tag (possibly None)
    @ivar ns: dictionary of namespaces
    @type ns: dictionary, each value is an RDFLib Namespace object
    @ivar xhtml_prefix: the prefix bound to the XHTML vocabulary URI in L{ns}
    """

    def __init__(self, node, graph, inherited_state=None, base="", options=None):
        """
        @param node: the current DOM Node
        @param graph: the RDFLib Graph
        @keyword inherited_state: the state as inherited
        from upper layers. This inherited_state is mixed with the state information
        retrieved from the current node.
        @type inherited_state: L{State.ExecutionContext}
        @keyword base: string denoting the base URI for the specific node. This overrides the possible
        base inherited from the upper layers. The
        current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
        necessary for SVG (and other possible XML dialects that accept C{@xml:base})
        @keyword options: invocation option
        @type options: L{Options<pyRdfa.Options>}
        """
        #-----------------------------------------------------------------
        # settling the base
        # note that, strictly speaking, it is not necessary to add the base to the
        # context, because there is only one place to set it (<base> element of the <header>).
        # It is done because it is prepared for a possible future change in direction of
        # accepting xml:base on each element.
        if inherited_state:
            self.base = inherited_state.base
            self.options = inherited_state.options
            # for generic XML versions the xml:base attribute should be handled
            if self.options.host_language == GENERIC_XML and node.hasAttribute("xml:base"):
                self.base = node.getAttribute("xml:base")
        else:
            # this is the branch called from the very top
            self.base = ""
            # NOTE(review): when several <base href> elements exist the last one
            # wins (historical behaviour, kept); HTML itself honours the first.
            for base_element in node.getElementsByTagName("base"):
                if base_element.hasAttribute("href"):
                    self.base = base_element.getAttribute("href")
            if self.base == "":
                self.base = base

            # this is just to play safe; this branch should actually not happen...
            # Options is the class imported at module level (the historical
            # 'from pyRdfa import Options' pointed to a package not shipped here).
            if options is None:
                self.options = Options()
            else:
                self.options = options

            # xml:base is not part of XHTML+RDFa, but it is a valid setting for, say, SVG1.2
            if self.options.host_language == GENERIC_XML and node.hasAttribute("xml:base"):
                self.base = node.getAttribute("xml:base")

            # NOTE(review): the caller supplied base is reported here, not the
            # possibly overridden self.base -- kept as is, confirm the intent.
            self.options.comment_graph.set_base_URI(URIRef(_quote(base, self.options)))

            # check the presence of the DOCTYPE, @version or @profile for RDFa...
            # This is irrelevant if the host language is a generic XML one (eg, SVG)
            if self.options.host_language != GENERIC_XML:
                self._check_rdfa_identification(node)

        #-----------------------------------------------------------------
        # Stripping the fragment ID from the base URI, as demanded by RFC 3986
        self.base = urlparse.urldefrag(self.base)[0]

        #-----------------------------------------------------------------
        # Settling the language tags
        # check first the lang or xml:lang attribute
        # RDFa does not allow the lang attribute. HTML5 relies :-( on @lang;
        # I just want to be prepared here...
        # NOTE(review): this looks at the 'options' argument, not self.options, so
        # @lang is only honoured when options is passed explicitly -- confirm.
        if options is not None and options.host_language == HTML5_RDFA and node.hasAttribute("lang"):
            self.lang = node.getAttribute("lang")
            if len(self.lang) == 0:
                self.lang = None
        elif node.hasAttribute("xml:lang"):
            self.lang = node.getAttribute("xml:lang")
            if len(self.lang) == 0:
                self.lang = None
        elif inherited_state:
            self.lang = inherited_state.lang
        else:
            self.lang = None

        #-----------------------------------------------------------------
        # Handling namespaces
        # First get the local xmlns declarations/namespaces stuff.
        local_ns = {}
        for i in range(0, node.attributes.length):
            attr = node.attributes.item(i)
            if not attr.name.startswith('xmlns:'):
                continue
            # yep, there is a namespace setting
            key = attr.localName
            if key == "":
                # exclude the top level xmlns setting...
                continue
            if key == "_":
                # reported unconditionally: the historical 'if warning:' guard
                # referred to a name that does not exist in this method
                self.options.comment_graph.add_error("The '_' local CURIE prefix is reserved for blank nodes, and cannot be changed" )
            elif key.find(':') != -1:
                self.options.comment_graph.add_error("The character ':' is not valid in a CURIE Prefix" )
            else:
                # quote the URI, ie, convert special characters into %.. This is
                # true, for example, for spaces
                uri = _quote(attr.value, self.options)
                # 'bind' it in the current graph to get a nicer output, and
                # remember the corresponding Namespace
                graph.bind(key, uri)
                local_ns[key] = Namespace(uri)

        # See if anything has been collected at all.
        # If not, the namespaces of the incoming state is taken over (the very
        # same dictionary object is shared, as it always was)
        if inherited_state and len(local_ns) == 0:
            self.ns = inherited_state.ns
        elif inherited_state:
            # copy of the inherited namespaces, the newly found ones
            # possibly overwriting incoming values
            self.ns = {}
            self.ns.update(inherited_state.ns)
            self.ns.update(local_ns)
        else:
            self.ns = local_ns

        # see if the xhtml core vocabulary has been set
        self.xhtml_prefix = None
        for key in self.ns.keys():
            if XHTML_URI == str(self.ns[key]):
                self.xhtml_prefix = key
                break
        if self.xhtml_prefix is None:
            if XHTML_PREFIX not in self.ns:
                self.xhtml_prefix = XHTML_PREFIX
            else:
                # the most disagreeable thing, the user has used
                # the prefix for something else...
                self.xhtml_prefix = XHTML_PREFIX + '_' + ("%d" % random.randint(1, 1000))
            self.ns[self.xhtml_prefix] = Namespace(XHTML_URI)
            graph.bind(self.xhtml_prefix, XHTML_URI)

        # extra tricks for unusual usages...
        # if the 'rdf' prefix is not used, it is artificially added...
        if "rdf" not in self.ns:
            self.ns["rdf"] = RDF
        if "rdfs" not in self.ns:
            self.ns["rdfs"] = RDFS

        # Final touch: setting the default namespace...
        if node.hasAttribute("xmlns"):
            self.defaultNS = node.getAttribute("xmlns")
        elif inherited_state and inherited_state.defaultNS is not None:
            self.defaultNS = inherited_state.defaultNS
        else:
            self.defaultNS = None

    def _check_rdfa_identification(self, node):
        """Add an informational message to the comment graph if the document is
        identified as RDFa neither by its DOCTYPE, nor by the C{@version} of
        the top element, nor by the C{@profile} of its C{<head>}.

        @param node: the DOM node the state is built for
        """
        doctype = None
        try:
            # I am not 100% sure the HTML5 minidom implementation has this, so let us just be
            # cautious here...
            doctype = node.ownerDocument.doctype
        except Exception:
            pass
        if doctype is not None and doctype.publicId == RDFa_PublicID and doctype.systemId == RDFa_SystemID:
            return

        # next level: check the version
        html = node.ownerDocument.documentElement
        if html.hasAttribute("version") and RDFa_VERSION == html.getAttribute("version"):
            return

        # see if least the profile has been set; find the <head> element first.
        # Every child is inspected, including the last one.
        head = None
        children = html.childNodes
        for index in range(children.length):
            if children.item(index).nodeName == "head":
                head = children.item(index)
                break
        if head is not None and head.hasAttribute("profile") and RDFa_PROFILE in head.getAttribute("profile").strip().split():
            return

        if self.options.host_language == HTML5_RDFA:
            self.options.comment_graph.add_info("RDFa profile or RFDa version has not been set (for a correct identification of RDFa). This is not a requirement for RDFa, but it is advised to use one of those nevertheless. Note that in the case of HTML5, the DOCTYPE setting may not work...")
        else:
            self.options.comment_graph.add_info("None of the RDFa DOCTYPE, RDFa profile, or RFDa version has been set (for a correct identification of RDFa). This is not a requirement for RDFa, but it is advised to use one of those nevertheless.")

    def _get_predefined_rels(self, val, warning):
        """Get the predefined URI value for the C{@rel/@rev} attribute.
        @param val: attribute name
        @param warning: whether a warning should be generated or not
        @type warning: boolean
        @return: URIRef for the predefined URI (or None)
        """
        vv = val.strip().lower()
        if vv in _predefined_rel:
            return self.ns[self.xhtml_prefix][vv]
        if warning:
            self.options.comment_graph.add_warning("invalid @rel/@rev value: '%s'" % val)
        return None

    def _get_predefined_properties(self, val, warning):
        """Get the predefined value for the C{@property} attribute.
        @param val: attribute name
        @param warning: whether a warning should be generated or not
        @type warning: boolean
        @return: URIRef for the predefined URI (or None)
        """
        vv = val.strip().lower()
        if vv in _predefined_property:
            return self.ns[self.xhtml_prefix][vv]
        if warning:
            self.options.comment_graph.add_warning("invalid @property value: '%s'" % val)
        return None

    def get_resource(self, val, rel=False, prop=False, warning=True):
        """Get a resource for a CURIE.
        The input argument is a CURIE; this is interpreted
        via the current namespaces and the corresponding URI Reference is returned
        @param val: string of the form "prefix:lname"
        @keyword rel: whether the predefined C{@rel/@rev} values should also be interpreted
        @keyword prop: whether the predefined C{@property} values should also be interpreted
        @keyword warning: whether a warning should be generated for an unknown predefined value
        @return: an RDFLib URIRef instance (or None)
        """
        if val == "":
            return None

        if val.find(":") == -1:
            # No prefix at all: if the resource corresponds to a @rel or @rev or
            # @property, it may still be one of the predefined values
            if rel:
                return self._get_predefined_rels(val, warning)
            if prop:
                return self._get_predefined_properties(val, warning)
            self.options.comment_graph.add_warning("Invalid CURIE (without prefix): '%s'" % val)
            return None

        key, lname = val.split(":", 1)
        if key == "_":
            # A possible error: this method is invoked for property URI-s, which
            # should not refer to a blank node. This case is checked and a possible
            # error condition is handled
            self.options.comment_graph.add_error("Blank node CURIE cannot be used in property position: _:%s" % lname)
            return None
        if key == "":
            # This is the ":blabla" case
            key = self.xhtml_prefix

        if key not in self.ns:
            self.options.comment_graph.add_error("CURIE used with non declared prefix: %s" % key)
            return None
        if lname == "":
            return URIRef(str(self.ns[key]))
        return self.ns[key][lname]

    def get_resources(self, val, rel=False, prop=False):
        """Get a series of resources encoded in CURIE-s.
        The input argument is a list of CURIE-s; these are interpreted
        via the current namespaces and the corresponding URI References are returned.
        @param val: strings of the form prefix':'lname, separated by space
        @keyword rel: whether the predefined C{@rel/@rev} values should also be interpreted
        @keyword prop: whether the predefined C{@property} values should also be interpreted
        @return: a list of RDFLib URIRef instances (possibly empty)
        """
        # split() without argument already ignores leading/trailing white space
        # and never yields None, so no extra stripping or filtering is needed
        resources = [self.get_resource(v, rel, prop) for v in val.split()]
        return [r for r in resources if r is not None]

    def get_URI_ref(self, val):
        """Create a URI RDFLib resource for a URI.
        The input argument is a URI. An empty value stands for the stored
        base; any other value is joined with the stored base, ie, relative
        references are resolved against it. A value of the form C{[...]} is
        a misplaced CURIE and is reported as an error.
        @param val: URI string
        @return: an RDFLib URIRef instance (or None for a misplaced CURIE)
        """
        if val == "":
            return URIRef(self.base)
        if val[0] == '[' and val[-1] == ']':
            self.options.comment_graph.add_error("Illegal usage of CURIE: %s" % val)
            return None
        return URIRef(urlparse.urljoin(self.base, val))

    def get_Curie_ref(self, val):
        """Create a URI RDFLib resource for a CURIE.
        The input argument is a CURIE. This means that it is:
          - either of the form [a:b] where a:b should be resolved as an
          'unprotected' CURIE, or
          - it is a traditional URI (relative or absolute)

        If the second case the URI value is also compared to 'usual' URI
        protocols ('http', 'https', 'ftp', etc) (see L{usual_protocols}).
        If there is no match, a warning is generated (indeed, a frequent
        mistake in authoring RDFa is to forget the '[' and ']' characters to
        "protect" CURIE-s.)

        @param val: CURIE string
        @return: an RDFLib URIRef or BNode instance (or None in case of error)
        """
        if len(val) == 0:
            return URIRef(self.base)

        if val[0] == "[":
            if val[-1] != "]":
                # illegal CURIE...
                self.options.comment_graph.add_error("Illegal CURIE: %s" % val)
                return None
            curie = val[1:-1]
            # A possible Blank node reference should be separated here:
            if curie.startswith("_:"):
                return _get_bnode_from_Curie(curie[2:])
            return self.get_resource(curie)

        # check the value, to see if an error may have been made...
        # Usual protocol values in the URI
        protocol = urlparse.urlparse(val)[0]
        if protocol != "" and protocol not in usual_protocols:
            err = "Possible URI error with '%s'; the intention may have been to use a protected CURIE" % val
            self.options.comment_graph.add_warning(err)
        return self.get_URI_ref(val)
def head_about_transform(html, options):
    """
    Give every C{<head>} and C{<body>} element below the top level element an
    empty C{@about} attribute, unless the element already carries one (an
    existing value is never touched). The C{<head>} elements are handled
    first, then the C{<body>} elements.

    @param html: a DOM node for the top level html element
    @param options: invocation options (not used by this transformer)
    @type options: L{Options<pyRdfa.Options>}
    """
    for tag_name in ("head", "body"):
        for element in html.getElementsByTagName(tag_name):
            if element.hasAttribute("about"):
                continue
            element.setAttribute("about", "")