diff options
Diffstat (limited to 'creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py')
-rw-r--r-- | creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py new file mode 100644 index 0000000..31caf41 --- /dev/null +++ b/creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/state.py @@ -0,0 +1,434 @@ +# -*- coding: utf-8 -*- +""" +Parser's execution context (a.k.a. state) object and handling. The state includes: + + - dictionary for namespaces. Keys are the namespace prefixes, values are RDFLib Namespace instances + - language, retrieved from C{@xml:lang} + - URI base, determined by <base> (or set explicitly). This is a little bit superfluous, because the current RDFa syntax does not make use of C{@xml:base}; ie, this could be a global value. But the structure is prepared to add C{@xml:base} easily, if needed. + - options, in the form of an L{Options<pyRdfa.Options>} instance + +The execution context object is also used to turn relative URI-s and CURIES into real URI references. + +@summary: RDFa core parser processing step +@requires: U{RDFLib package<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} + +@var XHTML_PREFIX: prefix for the XHTML vocabulary namespace +@var XHTML_URI: URI prefix of the XHTML vocabulary +@var RDFa_PROFILE: the official RDFa profile URI +@var RDFa_VERSION: the official version string of RDFa +@var usual_protocols: list of "usual" protocols (used to generate warnings when CURIES are not protected) +@var _predefined_rel: list of predefined C{@rev} and C{@rel} values that should be mapped onto the XHTML vocabulary URI-s. +@var _predefined_property: list of predefined C{@property} values that should be mapped onto the XHTML vocabulary URI-s. (At present, this list is empty, but this has been an ongoing question in the group, so the I{mechanism} of checking is still there.) +@var __bnodes: dictionary of blank node names to real blank node +@var __empty_bnode: I{The} Bnode to be associated with the CURIE of the form "C{_:}". +""" + +from rdflib.namespace import Namespace, RDF, RDFS +from rdflib.term import BNode, URIRef +from rdflib.plugins.parsers.rdfa.options import Options, GENERIC_XML, XHTML_RDFA, HTML5_RDFA + +import re +import random +import urlparse + +__all__ = ['ExecutionContext'] + +RDFa_PROFILE = "http://www.w3.org/1999/xhtml/vocab" +RDFa_VERSION = "XHTML+RDFa 1.0" +RDFa_PublicID = "-//W3C//DTD XHTML+RDFa 1.0//EN" +RDFa_SystemID = "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd" + +usual_protocols = ["http", "https", "mailto", "ftp", "urn", "gopher", "tel", "ldap", "doi", "news"] + +####Predefined @rel/@rev/@property values +# predefined values for the @rel and @rev values. These are considered to be part of a specific +# namespace, defined by the RDFa document. +# At the moment, there are no predefined @property values, but the code is there in case +# some will be defined +XHTML_PREFIX = "xhv" +XHTML_URI = "http://www.w3.org/1999/xhtml/vocab#" + +_predefined_rel = ['alternate', 'appendix', 'cite', 'bookmark', 'chapter', 'contents', +'copyright', 'glossary', 'help', 'icon', 'index', 'meta', 'next', 'p3pv1', 'prev', +'role', 'section', 'subsection', 'start', 'license', 'up', 'last', 'stylesheet', 'first', 'top'] + +_predefined_property = [] + +#### Managing blank nodes for CURIE-s +__bnodes = {} +__empty_bnode = BNode() +def _get_bnode_from_Curie(var): + """ + 'Var' gives the string after the coloumn in a CURIE of the form C{_:XXX}. If this variable has been used + before, then the corresponding BNode is returned; otherwise a new BNode is created and + associated to that value. + @param var: CURIE BNode identifier + @return: BNode + """ + if len(var) == 0: + return __empty_bnode + if var in __bnodes: + return __bnodes[var] + else: + retval = BNode() + __bnodes[var] = retval + return retval + +#### Quote URI-s +import urllib +# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other +# special characters are converted to their %.. equivalents for namespace prefixes +_unquotedChars = ':/\?=#' +_warnChars = [' ', '\n', '\r', '\t'] +def _quote(uri, options): + """ + 'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters + may stay as they are (listed in L{_unquotedChars}. If one of the characters listed in L{_warnChars} + is also in the uri, an extra warning is also generated. + @param uri: URI + @param options: + @type options: L{Options<pyRdfa.Options>} + """ + suri = uri.strip() + for c in _warnChars: + if suri.find(c) != -1: + if options != None: + options.comment_graph.add_warning('Unusual character in uri:%s; possible error?' % suri) + break + return urllib.quote(suri, _unquotedChars) + + +#### Core Class definition +class ExecutionContext(object): + """State at a specific node, including the current set + of namespaces in the RDFLib sense, the + current language, and the base. The class is also used to interpret URI-s and CURIE-s to produce + URI references for RDFLib. + + @ivar options: reference to the overall options + @type ivar: L{Options.Options} + @ivar base: the 'base' URI + @ivar defaultNS: default namespace + @ivar lang: language tag (possibly None) + @ivar ns: dictionary of namespaces + @type ns: dictionary, each value is an RDFLib Namespace object + + """ + def __init__(self, node, graph, inherited_state=None, base="", options=None): + """ + @param node: the current DOM Node + @param graph: the RDFLib Graph + @keyword inherited_state: the state as inherited + from upper layers. This inherited_state is mixed with the state information + retrieved from the current node. + @type inherited_state: L{State.ExecutionContext} + @keyword base: string denoting the base URI for the specific node. This overrides the possible + base inherited from the upper layers. The + current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is + necessary for SVG (and other possible XML dialects that accept C{@xml:base}) + @keyword options: invocation option + @type options: L{Options<pyRdfa.Options>} + """ + #----------------------------------------------------------------- + # settling the base + # note that, strictly speaking, it is not necessary to add the base to the + # context, because there is only one place to set it (<base> element of the <header>). + # It is done because it is prepared for a possible future change in direction of + # accepting xml:base on each element. + # At the moment, it is invoked with a 'None' at the top level of parsing, that is + # when the <base> element is looked for. + if inherited_state: + self.base = inherited_state.base + self.options = inherited_state.options + # for generic XML versions the xml:base attribute should be handled + if self.options.host_language == GENERIC_XML and node.hasAttribute("xml:base"): + self.base = node.getAttribute("xml:base") + else: + # this is the branch called from the very top + self.base = "" + for bases in node.getElementsByTagName("base"): + if bases.hasAttribute("href"): + self.base = bases.getAttribute("href") + continue + if self.base == "": + self.base = base + + # this is just to play safe. I believe this branch should actually not happen... + if options == None: + from pyRdfa import Options + self.options = Options() + else: + self.options = options + + # xml:base is not part of XHTML+RDFa, but it is a valid setting for, say, SVG1.2 + if self.options.host_language == GENERIC_XML and node.hasAttribute("xml:base"): + self.base = node.getAttribute("xml:base") + + self.options.comment_graph.set_base_URI(URIRef(_quote(base, self.options))) + + # check the the presense of the @profile and or @version attribute for the RDFa profile... + # This whole branch is, however, irrelevant if the host language is a generic XML one (eg, SVG) + if self.options.host_language != GENERIC_XML: + doctype = None + try: + # I am not 100% sure the HTML5 minidom implementation has this, so let us just be + # cautious here... + doctype = node.ownerDocument.doctype + except: + pass + if doctype == None or not( doctype.publicId == RDFa_PublicID and doctype.systemId == RDFa_SystemID ): + # next level: check the version + html = node.ownerDocument.documentElement + if not( html.hasAttribute("version") and RDFa_VERSION == html.getAttribute("version") ): + # see if least the profile has been set + # Find the <head> element + head = None + for index in range(0, html.childNodes.length-1): + if html.childNodes.item(index).nodeName == "head": + head = html.childNodes.item(index) + break + if not( head != None and head.hasAttribute("profile") and RDFa_PROFILE in head.getAttribute("profile").strip().split() ): + if self.options.host_language == HTML5_RDFA: + self.options.comment_graph.add_info("RDFa profile or RFDa version has not been set (for a correct identification of RDFa). This is not a requirement for RDFa, but it is advised to use one of those nevertheless. Note that in the case of HTML5, the DOCTYPE setting may not work...") + else: + self.options.comment_graph.add_info("None of the RDFa DOCTYPE, RDFa profile, or RFDa version has been set (for a correct identification of RDFa). This is not a requirement for RDFa, but it is advised to use one of those nevertheless.") + + #----------------------------------------------------------------- + # Stripping the fragment ID from the base URI, as demanded by RFC 3986 + self.base = urlparse.urldefrag(self.base)[0] + + #----------------------------------------------------------------- + # Settling the language tags + # check first the lang or xml:lang attribute + # RDFa does not allow the lang attribute. HTML5 relies :-( on @lang; + # I just want to be prepared here... + if options != None and options.host_language == HTML5_RDFA and node.hasAttribute("lang"): + self.lang = node.getAttribute("lang") + if len(self.lang) == 0 : self.lang = None + elif node.hasAttribute("xml:lang"): + self.lang = node.getAttribute("xml:lang") + if len(self.lang) == 0 : self.lang = None + elif inherited_state: + self.lang = inherited_state.lang + else: + self.lang = None + + #----------------------------------------------------------------- + # Handling namespaces + # First get the local xmlns declarations/namespaces stuff. + dict = {} + for i in range(0, node.attributes.length): + attr = node.attributes.item(i) + if attr.name.find('xmlns:') == 0 : + # yep, there is a namespace setting + key = attr.localName + if key != "" : # exclude the top level xmlns setting... + if key == "_": + if warning: self.options.comment_graph.add_error("The '_' local CURIE prefix is reserved for blank nodes, and cannot be changed" ) + elif key.find(':') != -1: + if warning: self.options.comment_graph.add_error("The character ':' is not valid in a CURIE Prefix" ) + else : + # quote the URI, ie, convert special characters into %.. This is + # true, for example, for spaces + uri = _quote(attr.value, self.options) + # 1. create a new Namespace entry + ns = Namespace(uri) + # 2. 'bind' it in the current graph to + # get a nicer output + graph.bind(key, uri) + # 3. Add an entry to the dictionary + dict[key] = ns + + # See if anything has been collected at all. + # If not, the namespaces of the incoming state is + # taken over + self.ns = {} + if len(dict) == 0 and inherited_state: + self.ns = inherited_state.ns + else: + if inherited_state: + for k in inherited_state.ns : self.ns[k] = inherited_state.ns[k] + # copying the newly found namespace, possibly overwriting + # incoming values + for k in dict : self.ns[k] = dict[k] + else: + self.ns = dict + + # see if the xhtml core vocabulary has been set + self.xhtml_prefix = None + for key in self.ns.keys(): + if XHTML_URI == str(self.ns[key]): + self.xhtml_prefix = key + break + if self.xhtml_prefix == None: + if XHTML_PREFIX not in self.ns: + self.ns[XHTML_PREFIX] = Namespace(XHTML_URI) + self.xhtml_prefix = XHTML_PREFIX + else: + # the most disagreeable thing, the user has used + # the prefix for something else... + self.xhtml_prefix = XHTML_PREFIX + '_' + ("%d" % random.randint(1, 1000)) + self.ns[self.xhtml_prefix] = Namespace(XHTML_URI) + graph.bind(self.xhtml_prefix, XHTML_URI) + + # extra tricks for unusual usages... + # if the 'rdf' prefix is not used, it is artificially added... + if "rdf" not in self.ns: + self.ns["rdf"] = RDF + if "rdfs" not in self.ns: + self.ns["rdfs"] = RDFS + + # Final touch: setting the default namespace... + if node.hasAttribute("xmlns"): + self.defaultNS = node.getAttribute("xmlns") + elif inherited_state and inherited_state.defaultNS != None: + self.defaultNS = inherited_state.defaultNS + else: + self.defaultNS = None + + def _get_predefined_rels(self, val, warning): + """Get the predefined URI value for the C{@rel/@rev} attribute. + @param val: attribute name + @param warning: whether a warning should be generated or not + @type warning: boolean + @return: URIRef for the predefined URI (or None) + """ + vv = val.strip().lower() + if vv in _predefined_rel: + return self.ns[self.xhtml_prefix][vv] + else: + if warning: self.options.comment_graph.add_warning("invalid @rel/@rev value: '%s'" % val) + return None + + def _get_predefined_properties(self, val, warning): + """Get the predefined value for the C{@property} attribute. + @param val: attribute name + @param warning: whether a warning should be generated or not + @type warning: boolean + @return: URIRef for the predefined URI (or None) + """ + vv = val.strip().lower() + if vv in _predefined_property: + return self.ns[self.xhtml_prefix][vv] + else: + if warning: self.options.comment_graph.add_warning("invalid @property value: '%s'" % val) + return None + + def get_resource(self, val, rel=False, prop=False, warning=True): + """Get a resource for a CURIE. + The input argument is a CURIE; this is interpreted + via the current namespaces and the corresponding URI Reference is returned + @param val: string of the form "prefix:lname" + @keyword rel: whether the predefined C{@rel/@rev} values should also be interpreted + @keyword prop: whether the predefined C{@property} values should also be interpreted + @return: an RDFLib URIRef instance (or None) + """ + if val == "": + return None + elif val.find(":") != -1: + key = val.split(":", 1)[0] + lname = val.split(":", 1)[1] + if key == "_": + # A possible error: this method is invoked for property URI-s, which + # should not refer to a blank node. This case is checked and a possible + # error condition is handled + self.options.comment_graph.add_error("Blank node CURIE cannot be used in property position: _:%s" % lname) + return None + if key == "": + # This is the ":blabla" case + key = self.xhtml_prefix + else: + # if the resources correspond to a @rel or @rev or @property, then there + # may be one more possibility here, namely that it is one of the + # predefined values + if rel: + return self._get_predefined_rels(val, warning) + elif prop: + return self._get_predefined_properties(val, warning) + else: + self.options.comment_graph.add_warning("Invalid CURIE (without prefix): '%s'" % val) + return None + + if key not in self.ns: + self.options.comment_graph.add_error("CURIE used with non declared prefix: %s" % key) + return None + else: + if lname == "": + return URIRef(str(self.ns[key])) + else: + return self.ns[key][lname] + + def get_resources(self, val, rel=False, prop=False): + """Get a series of resources encoded in CURIE-s. + The input argument is a list of CURIE-s; these are interpreted + via the current namespaces and the corresponding URI References are returned. + @param val: strings of the form prefix':'lname, separated by space + @keyword rel: whether the predefined C{@rel/@rev} values should also be interpreted + @keyword prop: whether the predefined C{@property} values should also be interpreted + @return: a list of RDFLib URIRef instances (possibly empty) + """ + val.strip() + resources = [ self.get_resource(v, rel, prop) for v in val.split() if v != None ] + return [ r for r in resources if r != None ] + + def get_URI_ref(self, val): + """Create a URI RDFLib resource for a URI. + The input argument is a URI. It is checked whether it is a local + reference with a '#' or not. If yes, a URIRef combined with the + stored base value is returned. In both cases a URIRef for a full URI is created + and returned + @param val: URI string + @return: an RDFLib URIRef instance + """ + if val == "": + return URIRef(self.base) + elif val[0] == '[' and val[-1] == ']': + self.options.comment_graph.add_error("Illegal usage of CURIE: %s" % val) + return None + else: + return URIRef(urlparse.urljoin(self.base, val)) + + def get_Curie_ref(self, val): + """Create a URI RDFLib resource for a CURIE. + The input argument is a CURIE. This means that it is: + - either of the form [a:b] where a:b should be resolved as an + 'unprotected' CURIE, or + - it is a traditional URI (relative or absolute) + + If the second case the URI value is also compared to 'usual' URI + protocols ('http', 'https', 'ftp', etc) (see L{usual_protocols}). + If there is no match, a warning is generated (indeed, a frequent + mistake in authoring RDFa is to forget the '[' and ']' characters to + "protect" CURIE-s.) + + @param val: CURIE string + @return: an RDFLib URIRef instance + """ + if len(val) == 0: + return URIRef(self.base) + elif val[0] == "[": + if val[-1] == "]": + curie = val[1:-1] + # A possible Blank node reference should be separated here: + if len(curie) >= 2 and curie[0] == "_" and curie[1] == ":": + return _get_bnode_from_Curie(curie[2:]) + else: + return self.get_resource(val[1:-1]) + else: + # illegal CURIE... + self.options.comment_graph.add_error("Illegal CURIE: %s" % val) + return None + else: + # check the value, to see if an error may have been made... + # Usual protocol values in the URI + v = val.strip().lower() + protocol = urlparse.urlparse(val)[0] + if protocol != "" and protocol not in usual_protocols: + err = "Possible URI error with '%s'; the intention may have been to use a protected CURIE" % val + self.options.comment_graph.add_warning(err) + return self.get_URI_ref(val) + |