diff options
Diffstat (limited to 'creactistore/_templates/lib/rdflib/plugins/parsers/notation3.py')
-rw-r--r-- | creactistore/_templates/lib/rdflib/plugins/parsers/notation3.py | 2314 |
1 files changed, 0 insertions, 2314 deletions
diff --git a/creactistore/_templates/lib/rdflib/plugins/parsers/notation3.py b/creactistore/_templates/lib/rdflib/plugins/parsers/notation3.py deleted file mode 100644 index ac48340..0000000 --- a/creactistore/_templates/lib/rdflib/plugins/parsers/notation3.py +++ /dev/null @@ -1,2314 +0,0 @@ -#!/usr/bin/env python -u""" -notation3.py - Standalone Notation3 Parser -Derived from CWM, the Closed World Machine - -Authors of the original suite: - -* Dan Connolly <@@> -* Tim Berners-Lee <@@> -* Yosi Scharf <@@> -* Joseph M. Reagle Jr. <reagle@w3.org> -* Rich Salz <rsalz@zolera.com> - -http://www.w3.org/2000/10/swap/notation3.py - -Copyright 2000-2007, World Wide Web Consortium. -Copyright 2001, MIT. -Copyright 2001, Zolera Systems Inc. - -License: W3C Software License -http://www.w3.org/Consortium/Legal/copyright-software - -Modified by Sean B. Palmer -Copyright 2007, Sean B. Palmer. \u32E1 - -Modified to work with rdflib by Gunnar Aastrand Grimnes -Copyright 2010, Gunnar A. Grimnes - -""" - -# Python standard libraries -import types -import sys -import os -import string -import re -import time -import StringIO -import codecs - -from binascii import a2b_hex -from decimal import Decimal - -from rdflib.term import URIRef, BNode, Literal, Variable, _XSD_PFX, _unique_id -from rdflib.graph import QuotedGraph, ConjunctiveGraph -from rdflib import py3compat -b = py3compat.b - -__all__ = ['URISyntaxError', 'BadSyntax', 'N3Parser', "verbosity", "setVerbosity", "progress", "splitFrag", "splitFragP", "join", "refTo", "base", "canonical", "runNamespace", "uniqueURI", "Canonicalize", "stripCR", "dummyWrite", "toBool", "stringToN3", "backslashUify", "hexify", "dummy"] - -from rdflib.parser import Parser - -# Incestuous.. 
would be nice to separate N3 and XML -# from sax2rdf import XMLtoDOM -def XMLtoDOM(*args, **kargs): - # print >> sys.stderr, args, kargs - pass - -# SWAP http://www.w3.org/2000/10/swap -# from diag import verbosity, setVerbosity, progress -def verbosity(*args, **kargs): - # print >> sys.stderr, args, kargs - pass -def setVerbosity(*args, **kargs): - # print >> sys.stderr, args, kargs - pass -def progress(*args, **kargs): - # print >> sys.stderr, args, kargs - pass - - - -def splitFrag(uriref): - """split a URI reference between the fragment and the rest. - - Punctuation is thrown away. - - e.g. - - >>> splitFrag("abc#def") - ('abc', 'def') - - >>> splitFrag("abcdef") - ('abcdef', None) - - """ - - i = uriref.rfind("#") - if i>= 0: return uriref[:i], uriref[i+1:] - else: return uriref, None - -def splitFragP(uriref, punct=0): - """split a URI reference before the fragment - - Punctuation is kept. - - e.g. - - >>> splitFragP("abc#def") - ('abc', '#def') - - >>> splitFragP("abcdef") - ('abcdef', '') - - """ - - i = uriref.rfind("#") - if i>= 0: return uriref[:i], uriref[i:] - else: return uriref, '' - -@py3compat.format_doctest_out -def join(here, there): - """join an absolute URI and URI reference - (non-ascii characters are supported/doctested; - haven't checked the details of the IRI spec though) - - here is assumed to be absolute. - there is URI reference. - - >>> join('http://example/x/y/z', '../abc') - 'http://example/x/abc' - - Raise ValueError if there uses relative path - syntax but here has no hierarchical path. - - >>> join('mid:foo@example', '../foo') - Traceback (most recent call last): - raise ValueError, here - ValueError: Base <mid:foo@example> has no slash after colon - with relative '../foo'. 
- - >>> join('http://example/x/y/z', '') - 'http://example/x/y/z' - - >>> join('mid:foo@example', '#foo') - 'mid:foo@example#foo' - - We grok IRIs - - >>> len(u'Andr\\xe9') - 5 - - >>> join('http://example.org/', u'#Andr\\xe9') - %(u)s'http://example.org/#Andr\\xe9' - """ - - assert(here.find("#") < 0), "Base may not contain hash: '%s'"% here # caller must splitFrag (why?) - - slashl = there.find('/') - colonl = there.find(':') - - # join(base, 'foo:/') -- absolute - if colonl >= 0 and (slashl < 0 or colonl < slashl): - return there - - bcolonl = here.find(':') - assert(bcolonl >= 0), "Base uri '%s' is not absolute" % here # else it's not absolute - - path, frag = splitFragP(there) - if not path: return here + frag - - # join('mid:foo@example', '../foo') bzzt - if here[bcolonl+1:bcolonl+2] <> '/': - raise ValueError ("Base <%s> has no slash after colon - with relative '%s'." %(here, there)) - - if here[bcolonl+1:bcolonl+3] == '//': - bpath = here.find('/', bcolonl+3) - else: - bpath = bcolonl+1 - - # join('http://xyz', 'foo') - if bpath < 0: - bpath = len(here) - here = here + '/' - - # join('http://xyz/', '//abc') => 'http://abc' - if there[:2] == '//': - return here[:bcolonl+1] + there - - # join('http://xyz/', '/abc') => 'http://xyz/abc' - if there[:1] == '/': - return here[:bpath] + there - - slashr = here.rfind('/') - - while 1: - if path[:2] == './': - path = path[2:] - if path == '.': - path = '' - elif path[:3] == '../' or path == '..': - path = path[3:] - i = here.rfind('/', bpath, slashr) - if i >= 0: - here = here[:i+1] - slashr = i - else: - break - - return here[:slashr+1] + path + frag - -commonHost = re.compile(r'^[-_a-zA-Z0-9.]+:(//[^/]*)?/[^/]*$') - -def refTo(base, uri): - """figure out a relative URI reference from base to uri - - >>> refTo('http://example/x/y/z', 'http://example/x/abc') - '../abc' - - >>> refTo('file:/ex/x/y', 'file:/ex/x/q/r#s') - 'q/r#s' - - >>> refTo(None, 'http://ex/x/y') - 'http://ex/x/y' - - >>> refTo('http://ex/x/y', 
'http://ex/x/y') - '' - - Note the relationship between refTo and join: - join(x, refTo(x, y)) == y - which points out certain strings which cannot be URIs. e.g. - >>> x='http://ex/x/y';y='http://ex/x/q:r';join(x, refTo(x, y)) == y - 0 - - So 'http://ex/x/q:r' is not a URI. Use 'http://ex/x/q%3ar' instead: - >>> x='http://ex/x/y';y='http://ex/x/q%3ar';join(x, refTo(x, y)) == y - 1 - - This one checks that it uses a root-realtive one where that is - all they share. Now uses root-relative where no path is shared. - This is a matter of taste but tends to give more resilience IMHO - -- and shorter paths - - Note that base may be None, meaning no base. In some situations, there - just ain't a base. Slife. In these cases, relTo returns the absolute value. - The axiom abs(,rel(b,x))=x still holds. - This saves people having to set the base to "bogus:". - - >>> refTo('http://ex/x/y/z', 'http://ex/r') - '/r' - - """ - -# assert base # don't mask bugs -danc # not a bug. -tim - if not base: return uri - if base == uri: return "" - - # Find how many path segments in common - i=0 - while i<len(uri) and i<len(base): - if uri[i] == base[i]: i = i + 1 - else: break - # print "# relative", base, uri, " same up to ", i - # i point to end of shortest one or first difference - - m = commonHost.match(base[:i]) - if m: - k=uri.find("//") - if k<0: k=-2 # no host - l=uri.find("/", k+2) - if uri[l+1:l+2] != "/" and base[l+1:l+2] != "/" and uri[:l]==base[:l]: - return uri[l:] - - if uri[i:i+1] =="#" and len(base) == i: return uri[i:] # fragment of base - - while i>0 and uri[i-1] != '/' : i=i-1 # scan for slash - - if i < 3: return uri # No way. 
- if base.find("//", i-2)>0 \ - or uri.find("//", i-2)>0: return uri # An unshared "//" - if base.find(":", i)>0: return uri # An unshared ":" - n = base.count("/", i) - if n == 0 and i<len(uri) and uri[i] == '#': - return "./" + uri[i:] - elif n == 0 and i == len(uri): - return "./" - else: - return ("../" * n) + uri[i:] - - -def base(): - """The base URI for this process - the Web equiv of cwd - - Relative or abolute unix-standard filenames parsed relative to - this yeild the URI of the file. - If we had a reliable way of getting a computer name, - we should put it in the hostname just to prevent ambiguity - - """ -# return "file://" + hostname + os.getcwd() + "/" - return "file://" + _fixslash(os.getcwd()) + "/" - - -def _fixslash(str): - """ Fix windowslike filename to unixlike - (#ifdef WINDOWS)""" - s = str - for i in range(len(s)): - if s[i] == "\\": s = s[:i] + "/" + s[i+1:] - if s[0] != "/" and s[1] == ":": s = s[2:] # @@@ Hack when drive letter present - return s - -URI_unreserved = b("ABCDEFGHIJJLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~") - # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" - -@py3compat.format_doctest_out -def canonical(str_in): - """Convert equivalent URIs (or parts) to the same string - - There are many differenet levels of URI canonicalization - which are possible. 
See http://www.ietf.org/rfc/rfc3986.txt - Done: - - Converfting unicode IRI to utf-8 - - Escaping all non-ASCII - - De-escaping, if escaped, ALPHA (%%41-%%5A and %%61-%%7A), DIGIT (%%30-%%39), - hyphen (%%2D), period (%%2E), underscore (%%5F), or tilde (%%7E) (Sect 2.4) - - Making all escapes uppercase hexadecimal - - Not done: - - Making URI scheme lowercase - - changing /./ or /foo/../ to / with care not to change host part - - - >>> canonical("foo bar") - %(b)s'foo%%20bar' - - >>> canonical(u'http:') - %(b)s'http:' - - >>> canonical('fran%%c3%%83%%c2%%a7ois') - %(b)s'fran%%C3%%83%%C2%%A7ois' - - >>> canonical('a') - %(b)s'a' - - >>> canonical('%%4e') - %(b)s'N' - - >>> canonical('%%9d') - %(b)s'%%9D' - - >>> canonical('%%2f') - %(b)s'%%2F' - - >>> canonical('%%2F') - %(b)s'%%2F' - - """ - if type(str_in) == type(u''): - s8 = str_in.encode('utf-8') - else: - s8 = str_in - s = b('') - i = 0 - while i < len(s8): - if py3compat.PY3: - n = s8[i]; ch = bytes([n]) - else: - ch = s8[i]; n = ord(ch) - if (n > 126) or (n < 33) : # %-encode controls, SP, DEL, and utf-8 - s += b("%%%02X" % ord(ch)) - elif ch == b('%') and i+2 < len(s8): - ch2 = a2b_hex(s8[i+1:i+3]) - if ch2 in URI_unreserved: s += ch2 - else: s += b("%%%02X" % ord(ch2)) - i = i+3 - continue - else: - s += ch - i = i +1 - return s - - - - - - -CONTEXT = 0 -PRED = 1 -SUBJ = 2 -OBJ = 3 - -PARTS = PRED, SUBJ, OBJ -ALL4 = CONTEXT, PRED, SUBJ, OBJ - -SYMBOL = 0 -FORMULA = 1 -LITERAL = 2 -LITERAL_DT = 21 -LITERAL_LANG = 22 -ANONYMOUS = 3 -XMLLITERAL = 25 - -Logic_NS = "http://www.w3.org/2000/10/swap/log#" -NODE_MERGE_URI = Logic_NS + "is" # Pseudo-property indicating node merging -forSomeSym = Logic_NS + "forSome" -forAllSym = Logic_NS + "forAll" - -RDF_type_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" -RDF_NS_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" -OWL_NS = "http://www.w3.org/2002/07/owl#" -DAML_sameAs_URI = OWL_NS+"sameAs" -parsesTo_URI = Logic_NS + "parsesTo" -RDF_spec = 
"http://www.w3.org/TR/REC-rdf-syntax/" - -List_NS = RDF_NS_URI # From 20030808 -_Old_Logic_NS = "http://www.w3.org/2000/10/swap/log.n3#" - -N3_first = (SYMBOL, List_NS + "first") -N3_rest = (SYMBOL, List_NS + "rest") -N3_li = (SYMBOL, List_NS + "li") -N3_nil = (SYMBOL, List_NS + "nil") -N3_List = (SYMBOL, List_NS + "List") -N3_Empty = (SYMBOL, List_NS + "Empty") - - - -runNamespaceValue = None - -def runNamespace(): - "Return a URI suitable as a namespace for run-local objects" - # @@@ include hostname (privacy?) (hash it?) - global runNamespaceValue - if runNamespaceValue == None: - runNamespaceValue = join(base(), _unique_id()) + '#' - return runNamespaceValue - -nextu = 0 -def uniqueURI(): - "A unique URI" - global nextu - nextu += 1 - return runNamespace() + "u_" + `nextu` - -class URISyntaxError(ValueError): - """A parameter is passed to a routine that requires a URI reference""" - pass - - -tracking = False -chatty_flag = 50 - - -from xml.dom import Node -try: - from xml.ns import XMLNS -except: - class XMLNS: - BASE = "http://www.w3.org/2000/xmlns/" - XML = "http://www.w3.org/XML/1998/namespace" - - -_attrs = lambda E: (E.attributes and E.attributes.values()) or [] -_children = lambda E: E.childNodes or [] -_IN_XML_NS = lambda n: n.namespaceURI == XMLNS.XML -_inclusive = lambda n: n.unsuppressedPrefixes == None - -# Does a document/PI has lesser/greater document order than the -# first element? 
-_LesserElement, _Element, _GreaterElement = range(3) - -def _sorter(n1,n2): - '''_sorter(n1,n2) -> int - Sorting predicate for non-NS attributes.''' - - i = cmp(n1.namespaceURI, n2.namespaceURI) - if i: return i - return cmp(n1.localName, n2.localName) - - -def _sorter_ns(n1,n2): - '''_sorter_ns((n,v),(n,v)) -> int - "(an empty namespace URI is lexicographically least)."''' - - if n1[0] == 'xmlns': return -1 - if n2[0] == 'xmlns': return 1 - return cmp(n1[0], n2[0]) - -def _utilized(n, node, other_attrs, unsuppressedPrefixes): - '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean - Return true if that nodespace is utilized within the node''' - - if n.startswith('xmlns:'): - n = n[6:] - elif n.startswith('xmlns'): - n = n[5:] - if (n=="" and node.prefix in ["#default", None]) or \ - n == node.prefix or n in unsuppressedPrefixes: - return 1 - for attr in other_attrs: - if n == attr.prefix: return 1 - return 0 - -#_in_subset = lambda subset, node: not subset or node in subset -_in_subset = lambda subset, node: subset is None or node in subset # rich's tweak - -class _implementation: - '''Implementation class for C14N. This accompanies a node during it's - processing and includes the parameters and processing state.''' - - # Handler for each node type; populated during module instantiation. - handlers = {} - - def __init__(self, node, write, **kw): - '''Create and run the implementation.''' - self.write = write - self.subset = kw.get('subset') - self.comments = kw.get('comments', 0) - self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes') - nsdict = kw.get('nsdict', { 'xml': XMLNS.XML, 'xmlns': XMLNS.BASE }) - - # Processing state. 
- self.state = (nsdict, {'xml':''}, {}) #0422 - - if node.nodeType == Node.DOCUMENT_NODE: - self._do_document(node) - elif node.nodeType == Node.ELEMENT_NODE: - self.documentOrder = _Element # At document element - if not _inclusive(self): - self._do_element(node) - else: - inherited = self._inherit_context(node) - self._do_element(node, inherited) - elif node.nodeType == Node.DOCUMENT_TYPE_NODE: - pass - elif node.nodeType == Node.TEXT_NODE: - self._do_text(node) - else: - raise TypeError, str(node) - - - def _inherit_context(self, node): - '''_inherit_context(self, node) -> list - Scan ancestors of attribute and namespace context. Used only - for single element node canonicalization, not for subset - canonicalization.''' - - # Collect the initial list of xml:foo attributes. - xmlattrs = filter(_IN_XML_NS, _attrs(node)) - - # Walk up and get all xml:XXX attributes we inherit. - inherited, parent = [], node.parentNode - while parent and parent.nodeType == Node.ELEMENT_NODE: - for a in filter(_IN_XML_NS, _attrs(parent)): - n = a.localName - if n not in xmlattrs: - xmlattrs.append(n) - inherited.append(a) - parent = parent.parentNode - return inherited - - - def _do_document(self, node): - '''_do_document(self, node) -> None - Process a document node. 
documentOrder holds whether the document - element has been encountered such that PIs/comments can be written - as specified.''' - - self.documentOrder = _LesserElement - for child in node.childNodes: - if child.nodeType == Node.ELEMENT_NODE: - self.documentOrder = _Element # At document element - self._do_element(child) - self.documentOrder = _GreaterElement # After document element - elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE: - self._do_pi(child) - elif child.nodeType == Node.COMMENT_NODE: - self._do_comment(child) - elif child.nodeType == Node.DOCUMENT_TYPE_NODE: - pass - else: - raise TypeError, str(child) - handlers[Node.DOCUMENT_NODE] = _do_document - - - def _do_text(self, node): - '''_do_text(self, node) -> None - Process a text or CDATA node. Render various special characters - as their C14N entity representations.''' - if not _in_subset(self.subset, node): return - s = node.data.replace("&", "&") - s = s.replace("<", "<") - s = s.replace(">", ">") - s = s.replace("\015", "
") - if s: self.write(s) - handlers[Node.TEXT_NODE] = _do_text - handlers[Node.CDATA_SECTION_NODE] = _do_text - - - def _do_pi(self, node): - '''_do_pi(self, node) -> None - Process a PI node. Render a leading or trailing #xA if the - document order of the PI is greater or lesser (respectively) - than the document element. - ''' - if not _in_subset(self.subset, node): return - W = self.write - if self.documentOrder == _GreaterElement: W('\n') - W('<?') - W(node.nodeName) - s = node.data - if s: - W(' ') - W(s) - W('?>') - if self.documentOrder == _LesserElement: W('\n') - handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi - - - def _do_comment(self, node): - '''_do_comment(self, node) -> None - Process a comment node. Render a leading or trailing #xA if the - document order of the comment is greater or lesser (respectively) - than the document element. - ''' - if not _in_subset(self.subset, node): return - if self.comments: - W = self.write - if self.documentOrder == _GreaterElement: W('\n') - W('<!--') - W(node.data) - W('-->') - if self.documentOrder == _LesserElement: W('\n') - handlers[Node.COMMENT_NODE] = _do_comment - - - def _do_attr(self, n, value): - ''''_do_attr(self, node) -> None - Process an attribute.''' - - W = self.write - W(' ') - W(n) - W('="') - s = value.replace(value, "&", "&") - s = s.replace("<", "<") - s = s.replace('"', '"') - s = s.replace('\011', '	') - s = s.replace('\012', '
') - s = s.replace('\015', '
') - W(s) - W('"') - - - def _do_element(self, node, initial_other_attrs = []): - '''_do_element(self, node, initial_other_attrs = []) -> None - Process an element (and its children).''' - - # Get state (from the stack) make local copies. - # ns_parent -- NS declarations in parent - # ns_rendered -- NS nodes rendered by ancestors - # ns_local -- NS declarations relevant to this element - # xml_attrs -- Attributes in XML namespace from parent - # xml_attrs_local -- Local attributes in XML namespace. - ns_parent, ns_rendered, xml_attrs = \ - self.state[0], self.state[1].copy(), self.state[2].copy() #0422 - ns_local = ns_parent.copy() - xml_attrs_local = {} - - # progress("_do_element node.nodeName=", node.nodeName) - # progress("_do_element node.namespaceURI", node.namespaceURI) - # progress("_do_element node.tocml()", node.toxml()) - # Divide attributes into NS, XML, and others. - other_attrs = initial_other_attrs[:] - in_subset = _in_subset(self.subset, node) - for a in _attrs(node): - # progress("\t_do_element a.nodeName=", a.nodeName) - if a.namespaceURI == XMLNS.BASE: - n = a.nodeName - if n == "xmlns:": n = "xmlns" # DOM bug workaround - ns_local[n] = a.nodeValue - elif a.namespaceURI == XMLNS.XML: - if _inclusive(self) or in_subset: - xml_attrs_local[a.nodeName] = a #0426 - else: - other_attrs.append(a) - #add local xml:foo attributes to ancestor's xml:foo attributes - xml_attrs.update(xml_attrs_local) - - # Render the node - W, name = self.write, None - if in_subset: - name = node.nodeName - W('<') - W(name) - - # Create list of NS attributes to render. - ns_to_render = [] - for n,v in ns_local.items(): - - # If default namespace is XMLNS.BASE or empty, - # and if an ancestor was the same - if n == "xmlns" and v in [ XMLNS.BASE, '' ] \ - and ns_rendered.get('xmlns') in [ XMLNS.BASE, '', None ]: - continue - - # "omit namespace node with local name xml, which defines - # the xml prefix, if its string value is - # http://www.w3.org/XML/1998/namespace." 
- if n in ["xmlns:xml", "xml"] \ - and v in [ 'http://www.w3.org/XML/1998/namespace' ]: - continue - - - # If not previously rendered - # and it's inclusive or utilized - if (n,v) not in ns_rendered.items() \ - and (_inclusive(self) or \ - _utilized(n, node, other_attrs, self.unsuppressedPrefixes)): - ns_to_render.append((n, v)) - - # Sort and render the ns, marking what was rendered. - ns_to_render.sort(_sorter_ns) - for n,v in ns_to_render: - self._do_attr(n, v) - ns_rendered[n]=v #0417 - - # If exclusive or the parent is in the subset, add the local xml attributes - # Else, add all local and ancestor xml attributes - # Sort and render the attributes. - if not _inclusive(self) or _in_subset(self.subset,node.parentNode): #0426 - other_attrs.extend(xml_attrs_local.values()) - else: - other_attrs.extend(xml_attrs.values()) - other_attrs.sort(_sorter) - for a in other_attrs: - self._do_attr(a.nodeName, a.value) - W('>') - - # Push state, recurse, pop state. - state, self.state = self.state, (ns_local, ns_rendered, xml_attrs) - for c in _children(node): - _implementation.handlers[c.nodeType](self, c) - self.state = state - - if name: W('</%s>' % name) - handlers[Node.ELEMENT_NODE] = _do_element - - -def Canonicalize(node, output=None, **kw): - '''Canonicalize(node, output=None, **kw) -> UTF-8 - - Canonicalize a DOM document/element node and all descendents. - Return the text; if output is specified then output.write will - be called to output the text and None will be returned - Keyword parameters: - nsdict -- a dictionary of prefix:uri namespace entries - assumed to exist in the surrounding context - comments -- keep comments if non-zero (default is 0) - subset -- Canonical XML subsetting resulting from XPath (default is []) - unsuppressedPrefixes -- do exclusive C14N, and this specifies the - prefixes that should be inherited. 
- ''' - if output: - apply(_implementation, (node, output.write), kw) - else: - s = StringIO.StringIO() - apply(_implementation, (node, s.write), kw) - return s.getvalue() - -# end of xmlC14n.py - -# from why import BecauseOfData, becauseSubexpression -def BecauseOfData(*args, **kargs): - # print args, kargs - pass -def becauseSubexpression(*args, **kargs): - # print args, kargs - pass - -N3_forSome_URI = forSomeSym -N3_forAll_URI = forAllSym - -# Magic resources we know about - - - -ADDED_HASH = "#" # Stop where we use this in case we want to remove it! -# This is the hash on namespace URIs - -RDF_type = ( SYMBOL , RDF_type_URI ) -DAML_sameAs = ( SYMBOL, DAML_sameAs_URI ) - -LOG_implies_URI = "http://www.w3.org/2000/10/swap/log#implies" - -BOOLEAN_DATATYPE = _XSD_PFX + "boolean" -DECIMAL_DATATYPE = _XSD_PFX + "decimal" -DOUBLE_DATATYPE = _XSD_PFX + "double" -FLOAT_DATATYPE = _XSD_PFX + "float" -INTEGER_DATATYPE = _XSD_PFX + "integer" - -option_noregen = 0 # If set, do not regenerate genids on output - -# @@ I18n - the notname chars need extending for well known unicode non-text -# characters. The XML spec switched to assuming unknown things were name -# characaters. -# _namechars = string.lowercase + string.uppercase + string.digits + '_-' -_notQNameChars = "\t\r\n !\"#$%&'()*.,+/;<=>?@[\\]^`{|}~" # else valid qname :-/ -_notNameChars = _notQNameChars + ":" # Assume anything else valid name :-/ -_rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' - - -N3CommentCharacter = "#" # For unix script #! compatabilty - -########################################## Parse string to sink -# -# Regular expressions: -eol = re.compile(r'[ \t]*(#[^\n]*)?\r?\n') # end of line, poss. w/comment -eof = re.compile(r'[ \t]*(#[^\n]*)?$') # end of file, poss. 
w/comment -ws = re.compile(r'[ \t]*') # Whitespace not including NL -signed_integer = re.compile(r'[-+]?[0-9]+') # integer -number_syntax = re.compile(r'(?P<integer>[-+]?[0-9]+)(?P<decimal>\.[0-9]+)?(?P<exponent>e[-+]?[0-9]+)?') -digitstring = re.compile(r'[0-9]+') # Unsigned integer -interesting = re.compile(r'[\\\r\n\"]') -langcode = re.compile(r'[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?') -#" - - - -class SinkParser: - def __init__(self, store, openFormula=None, thisDoc="", baseURI=None, - genPrefix = "", flags="", - why=None): - """ note: namespace names should *not* end in #; - the # will get added during qname processing """ - - self._bindings = {} - self._flags = flags - if thisDoc != "": - assert ':' in thisDoc, "Document URI not absolute: <%s>" % thisDoc - self._bindings[""] = thisDoc + "#" # default - - self._store = store - if genPrefix: store.setGenPrefix(genPrefix) # pass it on - - self._thisDoc = thisDoc - self.lines = 0 # for error handling - self.startOfLine = 0 # For calculating character number - self._genPrefix = genPrefix - self.keywords = ['a', 'this', 'bind', 'has', 'is', 'of', 'true', 'false' ] - self.keywordsSet = 0 # Then only can others be considerd qnames - self._anonymousNodes = {} # Dict of anon nodes already declared ln: Term - self._variables = {} - self._parentVariables = {} - self._reason = why # Why the parser was asked to parse this - - self._reason2 = None # Why these triples - # was: diag.tracking - if tracking: self._reason2 = BecauseOfData( - store.newSymbol(thisDoc), because=self._reason) - - if baseURI: self._baseURI = baseURI - else: - if thisDoc: - self._baseURI = thisDoc - else: - self._baseURI = None - - assert not self._baseURI or ':' in self._baseURI - - if not self._genPrefix: - if self._thisDoc: self._genPrefix = self._thisDoc + "#_g" - else: self._genPrefix = uniqueURI() - - if openFormula ==None: - if self._thisDoc: - self._formula = store.newFormula(thisDoc + "#_formula") - else: - self._formula = store.newFormula() - else: - 
self._formula = openFormula - - - self._context = self._formula - self._parentContext = None - - - def here(self, i): - """String generated from position in file - - This is for repeatability when refering people to bnodes in a document. - This has diagnostic uses less formally, as it should point one to which - bnode the arbitrary identifier actually is. It gives the - line and character number of the '[' charcacter or path character - which introduced the blank node. The first blank node is boringly _L1C1. - It used to be used only for tracking, but for tests in general - it makes the canonical ordering of bnodes repeatable.""" - - return "%s_L%iC%i" % (self._genPrefix , self.lines, - i - self.startOfLine + 1) - - def formula(self): - return self._formula - - def loadStream(self, stream): - return self.loadBuf(stream.read()) # Not ideal - - def loadBuf(self, buf): - """Parses a buffer and returns its top level formula""" - self.startDoc() - - self.feed(buf) - return self.endDoc() # self._formula - - - def feed(self, octets): - """Feed an octet stream tothe parser - - if BadSyntax is raised, the string - passed in the exception object is the - remainder after any statements have been parsed. 
- So if there is more data to feed to the - parser, it should be straightforward to recover.""" - - if not isinstance(octets, unicode): - s = octets.decode('utf-8') - # NB already decoded, so \ufeff - if len(s) > 0 and s[0] == codecs.BOM_UTF8.decode('utf-8'): - s = s[1:] - else: - s=octets - - i = 0 - while i >= 0: - j = self.skipSpace(s, i) - if j<0: return - - i = self.directiveOrStatement(s,j) - if i<0: - print "# next char: ", `s[j]` - raise BadSyntax(self._thisDoc, self.lines, s, j, - "expected directive or statement") - - def directiveOrStatement(self, str,h): - - i = self.skipSpace(str, h) - if i<0: return i # EOF - - j = self.directive(str, i) - if j>=0: return self.checkDot(str,j) - - j = self.statement(str, i) - if j>=0: return self.checkDot(str,j) - - return j - - - #@@I18N - global _notNameChars - #_namechars = string.lowercase + string.uppercase + string.digits + '_-' - - def tok(self, tok, str, i): - """Check for keyword. Space must have been stripped on entry and - we must not be at end of file.""" - - assert tok[0] not in _notNameChars # not for punctuation - if str[i:i+1] == "@": - i = i+1 - else: - if tok not in self.keywords: - return -1 # No, this has neither keywords declaration nor "@" - - if (str[i:i+len(tok)] == tok - and (str[i+len(tok)] in _notQNameChars )): - i = i + len(tok) - return i - else: - return -1 - - def directive(self, str, i): - j = self.skipSpace(str, i) - if j<0: return j # eof - res = [] - - j = self.tok('bind', str, i) # implied "#". Obsolete. 
- if j>0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "keyword bind is obsolete: use @prefix") - - j = self.tok('keywords', str, i) - if j>0: - i = self.commaSeparatedList(str, j, res, self.bareWord) - if i < 0: - raise BadSyntax(self._thisDoc, self.lines, str, i, - "'@keywords' needs comma separated list of words") - self.setKeywords(res[:]) - # was: diag.chatty_flag - if chatty_flag > 80: progress("Keywords ", self.keywords) - return i - - - j = self.tok('forAll', str, i) - if j > 0: - i = self.commaSeparatedList(str, j, res, self.uri_ref2) - if i <0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "Bad variable list after @forAll") - for x in res: - #self._context.declareUniversal(x) - if x not in self._variables or x in self._parentVariables: - self._variables[x] = self._context.newUniversal(x) - return i - - j = self.tok('forSome', str, i) - if j > 0: - i = self. commaSeparatedList(str, j, res, self.uri_ref2) - if i <0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "Bad variable list after @forSome") - for x in res: - self._context.declareExistential(x) - return i - - - j=self.tok('prefix', str, i) # no implied "#" - if j>=0: - t = [] - i = self.qname(str, j, t) - if i<0: raise BadSyntax(self._thisDoc, self.lines, str, j, - "expected qname after @prefix") - j = self.uri_ref2(str, i, t) - if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "expected <uriref> after @prefix _qname_") - ns = self.uriOf(t[1]) - - if self._baseURI: - ns = join(self._baseURI, ns) - elif ":" not in ns: - raise BadSyntax(self._thisDoc, self.lines, str, j, - "With no base URI, cannot use relative URI in @prefix <"+ns+">") - assert ':' in ns # must be absolute - self._bindings[t[0][0]] = ns - self.bind(t[0][0], hexify(ns)) - return j - - j=self.tok('base', str, i) # Added 2007/7/7 - if j >= 0: - t = [] - i = self.uri_ref2(str, j, t) - if i<0: raise BadSyntax(self._thisDoc, self.lines, str, j, - "expected <uri> after @base ") - ns = self.uriOf(t[0]) - - if 
self._baseURI: - ns = join(self._baseURI, ns) - else: - raise BadSyntax(self._thisDoc, self.lines, str, j, - "With no previous base URI, cannot use relative URI in @base <"+ns+">") - assert ':' in ns # must be absolute - self._baseURI = ns - return i - - return -1 # Not a directive, could be something else. - - def bind(self, qn, uri): - assert isinstance(uri, - types.StringType), "Any unicode must be %x-encoded already" - if qn == "": - self._store.setDefaultNamespace(uri) - else: - self._store.bind(qn, uri) - - def setKeywords(self, k): - "Takes a list of strings" - if k == None: - self.keywordsSet = 0 - else: - self.keywords = k - self.keywordsSet = 1 - - - def startDoc(self): - # was: self._store.startDoc() - self._store.startDoc(self._formula) - - def endDoc(self): - """Signal end of document and stop parsing. returns formula""" - self._store.endDoc(self._formula) # don't canonicalize yet - return self._formula - - def makeStatement(self, quadruple): - #$$$$$$$$$$$$$$$$$$$$$ -# print "# Parser output: ", `quadruple` - self._store.makeStatement(quadruple, why=self._reason2) - - - - def statement(self, str, i): - r = [] - - i = self.object(str, i, r) # Allow literal for subject - extends RDF - if i<0: return i - - j = self.property_list(str, i, r[0]) - - if j<0: raise BadSyntax(self._thisDoc, self.lines, - str, i, "expected propertylist") - return j - - def subject(self, str, i, res): - return self.item(str, i, res) - - def verb(self, str, i, res): - """ has _prop_ - is _prop_ of - a - = - _prop_ - >- prop -> - <- prop -< - _operator_""" - - j = self.skipSpace(str, i) - if j<0:return j # eof - - r = [] - - j = self.tok('has', str, i) - if j>=0: - i = self.prop(str, j, r) - if i < 0: raise BadSyntax(self._thisDoc, self.lines, - str, j, "expected property after 'has'") - res.append(('->', r[0])) - return i - - j = self.tok('is', str, i) - if j>=0: - i = self.prop(str, j, r) - if i < 0: raise BadSyntax(self._thisDoc, self.lines, str, j, - "expected <property> after 
'is'") - j = self.skipSpace(str, i) - if j<0: - raise BadSyntax(self._thisDoc, self.lines, str, i, - "End of file found, expected property after 'is'") - return j # eof - i=j - j = self.tok('of', str, i) - if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "expected 'of' after 'is' <prop>") - res.append(('<-', r[0])) - return j - - j = self.tok('a', str, i) - if j>=0: - res.append(('->', RDF_type)) - return j - - - if str[i:i+2] == "<=": - res.append(('<-', self._store.newSymbol(Logic_NS+"implies"))) - return i+2 - - if str[i:i+1] == "=": - if str[i+1:i+2] == ">": - res.append(('->', self._store.newSymbol(Logic_NS+"implies"))) - return i+2 - res.append(('->', DAML_sameAs)) - return i+1 - - if str[i:i+2] == ":=": - # patch file relates two formulae, uses this @@ really? - res.append(('->', Logic_NS+"becomes")) - return i+2 - - j = self.prop(str, i, r) - if j >= 0: - res.append(('->', r[0])) - return j - - if str[i:i+2] == ">-" or str[i:i+2] == "<-": - raise BadSyntax(self._thisDoc, self.lines, str, j, - ">- ... -> syntax is obsolete.") - - return -1 - - def prop(self, str, i, res): - return self.item(str, i, res) - - def item(self, str, i, res): - return self.path(str, i, res) - - def blankNode(self, uri=None): - if "B" not in self._flags: - return self._context.newBlankNode(uri, why=self._reason2) - x = self._context.newSymbol(uri) - self._context.declareExistential(x) - return x - - def path(self, str, i, res): - """Parse the path production. - """ - j = self.nodeOrLiteral(str, i, res) - if j<0: return j # nope - - while str[j:j+1] in "!^.": # no spaces, must follow exactly (?) - ch = str[j:j+1] # @@ Allow "." followed IMMEDIATELY by a node. 
def anonymousNode(self, ln):
    """Remember or generate a term for one of these _: anonymous nodes.

    ln -- the local name that followed the ``_:`` prefix.

    The first request for a given *ln* asks the store for a new blank node
    (in the current context, with the current reason) and caches it in
    self._anonymousNodes; later requests return the cached term.
    """
    term = self._anonymousNodes.get(ln)
    # ``is not None`` rather than ``!= None``: a cached term whose class
    # overrides comparison must still count as "already known".
    if term is not None:
        return term
    term = self._store.newBlankNode(self._context, why=self._reason2)
    self._anonymousNodes[ln] = term
    return term
self.skipSpace(str, i) - if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "EOF when ']' expected after [ <propertyList>") - if str[j:j+1] != "]": - raise BadSyntax(self._thisDoc, - self.lines, str, j, "']' expected") - res.append(subj) - return j+1 - - if ch == "{": - ch2 = str[i+1:i+2] - if ch2 == '$': - i += 1 - j = i + 1 - List = [] - first_run = True - while 1: - i = self.skipSpace(str, j) - if i<0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "needed '$}', found end.") - if str[i:i+2] == '$}': - j = i+2 - break - - if not first_run: - if str[i:i+1] == ',': - i+=1 - else: - raise BadSyntax(self._thisDoc, self.lines, - str, i, "expected: ','") - else: first_run = False - - item = [] - j = self.item(str,i, item) #@@@@@ should be path, was object - if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "expected item in set or '$}'") - List.append(self._store.intern(item[0])) - res.append(self._store.newSet(List, self._context)) - return j - else: - j=i+1 - oldParentContext = self._parentContext - self._parentContext = self._context - parentAnonymousNodes = self._anonymousNodes - grandParentVariables = self._parentVariables - self._parentVariables = self._variables - self._anonymousNodes = {} - self._variables = self._variables.copy() - reason2 = self._reason2 - self._reason2 = becauseSubexpression - if subj is None: subj = self._store.newFormula() - self._context = subj - - while 1: - i = self.skipSpace(str, j) - if i<0: raise BadSyntax(self._thisDoc, self.lines, - str, i, "needed '}', found end.") - - if str[i:i+1] == "}": - j = i+1 - break - - j = self.directiveOrStatement(str,i) - if j<0: raise BadSyntax(self._thisDoc, self.lines, - str, i, "expected statement or '}'") - - self._anonymousNodes = parentAnonymousNodes - self._variables = self._parentVariables - self._parentVariables = grandParentVariables - self._context = self._parentContext - self._reason2 = reason2 - self._parentContext = oldParentContext - res.append(subj.close()) # 
No use until closed - return j - - if ch == "(": - thing_type = self._store.newList - ch2 = str[i+1:i+2] - if ch2 == '$': - thing_type = self._store.newSet - i += 1 - j=i+1 - - List = [] - while 1: - i = self.skipSpace(str, j) - if i<0: raise BadSyntax(self._thisDoc, self.lines, - str, i, "needed ')', found end.") - if str[i:i+1] == ')': - j = i+1 - break - - item = [] - j = self.item(str,i, item) #@@@@@ should be path, was object - if j<0: raise BadSyntax(self._thisDoc, self.lines, str, i, - "expected item in list or ')'") - List.append(self._store.intern(item[0])) - res.append(thing_type(List, self._context)) - return j - - j = self.tok('this', str, i) # This context - if j>=0: - raise BadSyntax(self._thisDoc, self.lines, str, i, - "Keyword 'this' was ancient N3. Now use @forSome and @forAll keywords.") - res.append(self._context) - return j - - #booleans - j = self.tok('true', str, i) - if j>=0: - res.append(True) - return j - j = self.tok('false', str, i) - if j>=0: - res.append(False) - return j - - if subj is None: # If this can be a named node, then check for a name. 
def property_list(self, str, i, subj):
    """Parse a property list for *subj*, starting at index *i*.

    The list is a sequence of ``verb objectList`` pairs separated by ';'.
    For every object one statement is emitted through self.makeStatement():
    a verb direction of '->' puts *subj* in subject position, any other
    direction swaps subject and object.  A ``:-`` token re-parses the
    following node with *subj* as its already-known subject.

    Returns the index of the terminating punctuation, which is left in
    the buffer.  Raises BadSyntax on end of input, on a failed ``:-`` node
    and on a verb that is not followed by an object list.
    """
    while 1:
        j = self.skipSpace(str, i)
        if j < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                "EOF found when expected verb in property list")

        if str[j:j+2] == ":-":
            i = j + 2
            res = []
            j = self.node(str, i, res, subj)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, str, i,
                    "bad {} or () or [] node after :- ")
            i = j
            continue

        i = j
        v = []
        j = self.verb(str, i, v)
        if j <= 0:
            return i  # void but valid

        objs = []
        i = self.objectList(str, j, objs)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, str, j,
                "objectList expected")
        dir, sym = v[0]
        for obj in objs:
            if dir == '->':
                self.makeStatement((self._context, sym, subj, obj))
            else:
                self.makeStatement((self._context, sym, obj, subj))

        j = self.skipSpace(str, i)
        if j < 0:
            # Report at i: j is the -1 EOF marker here, not a usable offset.
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                "EOF found in list of objects")
        # Look at the character found *after* skipping white space, so a
        # ';' is recognised even if objectList stopped in front of blanks.
        if str[j:j+1] != ";":
            return j
        i = j + 1  # skip semicolon and continue
def uri_ref2(self, str, i, res):
    """Generate uri from n3 representation and append the term to res.

    Note that the RDF convention of directly concatenating
    NS and local name is used.

    Tried in order:

    * a qname ``pfx:local`` -- the prefix is looked up in self._bindings;
      an unbound ``_`` prefix yields self.anonymousNode(local);
    * ``?name`` -- delegated to self.variable();
    * ``<uri>`` -- joined onto self._baseURI when one is set;
    * a bare word, only while self.keywordsSet is true -- resolved against
      the binding of the empty prefix.

    A symbol that is a key of self._variables is replaced by the mapped
    variable.  Returns the index just past the reference, or -1 when
    nothing matched.  Raises BadSyntax for an unbound prefix, for an
    unterminated ``<...>`` and for a keyword used as a bare word.
    """
    qn = []
    j = self.qname(str, i, qn)
    if j >= 0:
        pfx, ln = qn[0]
        if pfx is None:
            assert 0, "not used?"
            ns = self._baseURI + ADDED_HASH
        else:
            try:
                ns = self._bindings[pfx]
            except KeyError:
                if pfx == "_":  # Magic prefix 2001/05/30, can be overridden
                    res.append(self.anonymousNode(ln))
                    return j
                raise BadSyntax(self._thisDoc, self.lines, str, i,
                    "Prefix \"%s:\" not bound" % (pfx))
        symb = self._store.newSymbol(ns + ln)
        if symb in self._variables:
            res.append(self._variables[symb])
        else:
            res.append(symb)  # @@@ "#" CONVENTION
        # str.find() returns -1 (true) when '#' is absent, so the old
        # ``not ns.find("#")`` only fired for a namespace *starting* with
        # '#'.  Test for absence directly.
        if "#" not in ns:
            progress("Warning: no # on namespace %s," % ns)
        return j

    i = self.skipSpace(str, i)
    if i < 0:
        return -1

    if str[i] == "?":
        v = []
        j = self.variable(str, i, v)
        if j > 0:  # Forget variables as a class, only in context.
            res.append(v[0])
            return j
        return -1

    elif str[i] == "<":
        st = i + 1
        end = str.find(">", st)
        if end < 0:
            # Point at the opening '<'; j still holds qname's -1 here.
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                "unterminated URI reference")
        uref = str[st:end]  # the join should deal with ""
        if self._baseURI:
            uref = join(self._baseURI, uref)  # was: uripath.join
        else:
            assert ":" in uref, \
                "With no base URI, cannot deal with relative URIs"
        if str[end-1:end] == "#" and not uref[-1:] == "#":
            uref = uref + "#"  # She meant it!  Weirdness in urlparse?
        symb = self._store.newSymbol(uref)
        if symb in self._variables:
            res.append(self._variables[symb])
        else:
            res.append(symb)
        return end + 1

    elif self.keywordsSet:
        v = []
        j = self.bareWord(str, i, v)
        if j < 0:
            return -1
        if v[0] in self.keywords:
            raise BadSyntax(self._thisDoc, self.lines, str, i,
                'Keyword "%s" not allowed here.' % v[0])
        res.append(self._store.newSymbol(self._bindings[""] + v[0]))
        return j
    else:
        return -1
def variable(self, str, i, res):
    """Parse ``?abc`` into the variable for ``<base>#abc``.

    The variable's URI is self._baseURI + "#" + name.  Without a parent
    context the variable is created on self._context and cached in
    self._variables; inside a nested context it is created on
    self._parentContext and cached in self._parentVariables.  The cached
    variable is appended to *res*.

    Returns the index just past the name, or -1 when the input (after
    white space) does not start with '?'.  Raises BadSyntax when the name
    starts with a digit or '-'.
    """
    j = self.skipSpace(str, i)
    if j < 0:
        return -1

    if str[j:j+1] != "?":
        return -1
    j = j + 1
    i = j
    # Slice instead of indexing: '?' may be the last character of the
    # buffer, in which case str[j] raised IndexError.  An empty slice
    # falls through and yields a variable with an empty local name.
    first = str[j:j+1]
    if first and first in "0123456789-":
        raise BadSyntax(self._thisDoc, self.lines, str, j,
            "Varible name can't start with '%s'" % first)
    while i < len(str) and str[i] not in _notNameChars:
        i = i + 1

    varURI = self._store.newSymbol(self._baseURI + "#" + str[j:i])
    if self._parentContext is None:
        variables, context = self._variables, self._context
    else:
        variables, context = self._parentVariables, self._parentContext
    if varURI not in variables:
        variables[varURI] = context.newUniversal(varURI, why=self._reason2)
    res.append(variables[varURI])
    return i
def object(self, str, i, res):
    """Parse an object and append it to res.

    Anything self.subject() accepts is taken first.  Failing that, a
    quoted string (``"..."`` or triple-quoted) is read with
    self.strconst() and turned into a plain literal by the store.

    Returns the index just past the object, or -1 when nothing matched or
    only white space remains.
    """
    j = self.subject(str, i, res)
    if j >= 0:
        return j

    j = self.skipSpace(str, i)
    if j < 0:
        return -1
    i = j

    if str[i] != '"':
        return -1

    if str[i:i+3] == '"""':
        delim = '"""'
    else:
        delim = '"'
    i = i + len(delim)

    j, s = self.strconst(str, i, delim)

    # The store's newLiteral takes (string, datatype, language) -- that is
    # how nodeOrLiteral calls it -- so the one-argument call used here
    # raised TypeError.  A bare string has neither datatype nor language.
    res.append(self._store.newLiteral(s, None, None))
    progress("New string const ", s, j)
    return j
def uriOf(self, sym):
    """Return the URI carried by *sym*.

    A tuple is the old ``--pipe`` representation and its second element is
    returned; any other value is returned unchanged.
    """
    # Builtin ``tuple`` instead of types.TupleType: same class on
    # Python 2, and the only spelling that still exists on Python 3.
    if isinstance(sym, tuple):
        return sym[1]  # old system for --pipe
    # return sym.uriref() # cwm api
    return sym
def uEscape(self, str, i, startline):
    """Decode the four hex digits of a backslash-u escape starting at i.

    Digits are read one at a time, case-insensitively.  Returns a pair
    (index just past the fourth digit, the character with that code).

    Raises BadSyntax, reported against *startline* at offset *i*, when the
    buffer ends before four digits were read or when a non-hex character
    is met.
    """
    hexDigits = "0123456789abcdef"
    codePoint = 0
    for pos in range(i, i + 4):  # Get 4 more characters
        digit = str[pos:pos+1].lower()
        # sbp http://ilrt.org/discovery/chatlogs/rdfig/2002-07-05
        if not digit:
            raise BadSyntax(self._thisDoc, startline, str, i,
                "unterminated string literal(3)")
        nibble = hexDigits.find(digit)
        if nibble < 0:
            raise BadSyntax(self._thisDoc, startline, str, i,
                "bad string literal hex escape")
        codePoint = (codePoint << 4) | nibble
    return i + 4, unichr(codePoint)
class BadSyntax(SyntaxError):
    """Syntax error raised by the N3 parser, carrying its position.

    uri   -- identifier of the document being parsed
    lines -- line counter at the point of failure (reported as lines + 1)
    str   -- the text that was being parsed
    i     -- character offset of the error within str
    why   -- human-readable reason
    """

    def __init__(self, uri, lines, str, i, why):
        # Keep the text as given.  *i* is a character offset, so the
        # excerpt must be cut before any encoding: cutting UTF-8 bytes at
        # a character offset misplaces the caret after non-ASCII input.
        self._str = str
        self._i = i
        self._why = why
        self.lines = lines
        self._uri = uri

    def __str__(self):
        text = self._str
        # Clamp the offset: several call sites pass the -1 "not found"
        # marker, which would otherwise slice from the end of the text.
        i = min(max(self._i, 0), len(text))
        if i > 60:
            pre = "..."
            st = i - 60
        else:
            pre = ""
            st = 0
        if len(text) - i > 60:
            post = "..."
        else:
            post = ""

        message = 'at line %i of <%s>:\nBad syntax (%s) at ^ in:\n"%s%s^%s%s"' \
            % (self.lines + 1, self._uri, self._why, pre,
               text[st:i], text[i:i+60], post)
        if not isinstance(message, type('')):
            # Python 2: interpolating unicode text gives unicode, but
            # __str__ has to return a byte string.
            message = message.encode('utf-8')
        return message
def newList(self, n, f):
    """Build an RDF collection holding the items of *n* in formula *f*.

    n -- sequence of already-parsed items
    f -- the formula the rdf:first / rdf:rest statements are added to

    Returns the rdf:nil symbol for an empty sequence, otherwise the blank
    node heading the list.  One blank node is created per item with
    self.newBlankNode(f); statements go through self.makeStatement().

    The list is built iteratively.  The earlier recursive form sliced
    ``n[1:]`` at every level (quadratic copying) and exceeded the
    interpreter's recursion limit on long collections.  Statements are
    still emitted in the order the recursion produced: every rdf:first
    front to back, then every rdf:rest back to front.
    """
    nil = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'
    if not n:
        return self.newSymbol(nil)

    first = self.newSymbol(
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#first')
    rest = self.newSymbol(
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')

    cells = []
    for item in n:
        cell = self.newBlankNode(f)
        self.makeStatement((f, first, cell, item))
        cells.append(cell)

    # Link the cells from the tail towards the head.
    tail = self.newSymbol(nil)
    for cell in reversed(cells):
        self.makeStatement((f, rest, cell, tail))
        tail = cell
    return tail
def normalise(self, f, n):
    """Convert a parser-level value into the term stored in the graph.

    f -- the formula the statement belongs to; only f.existentials is read
    n -- the value to convert

    * a tuple becomes a URIRef built from its second element;
    * bool, int/long, Decimal and float become typed Literals;
    * a value that is a key of f.existentials is replaced by the node
      mapped there;
    * anything else is returned unchanged.
    """
    if isinstance(n, tuple):
        return URIRef(unicode(n[1]))

    # bool is tested before int because bool is a subclass of int.
    if isinstance(n, bool):
        return Literal(str(n).lower(), datatype=BOOLEAN_DATATYPE)

    if isinstance(n, (int, long)):
        return Literal(unicode(n), datatype=INTEGER_DATATYPE)

    if isinstance(n, Decimal):
        norm = n.normalize()
        sign, digits, exponent = norm.as_tuple()
        if not isinstance(exponent, int):
            # NaN / Infinity have no positional spelling.
            value = str(norm)
        else:
            # Spell the number out positionally.  str() of a normalized
            # Decimal switches to exponent notation ('1E+2', '1E-7'),
            # which is not a legal xsd:decimal lexical form.
            text = "".join([str(d) for d in digits])
            if exponent >= 0:
                value = text + "0" * exponent
            else:
                text = text.rjust(1 - exponent, "0")
                value = text[:exponent] + "." + text[exponent:]
            # Negative zero is written as plain '0'.
            if sign and tuple(digits) != (0,):
                value = "-" + value
        return Literal(value, datatype=DECIMAL_DATATYPE)

    if isinstance(n, float):
        return Literal(str(n), datatype=DOUBLE_DATATYPE)

    # ``in`` instead of the deprecated dict.has_key().
    if n in f.existentials:
        return f.existentials[n]

    return n
def backslashUify(ustr):
    """Return an ASCII rendering of *ustr* using backslash escapes.

    Characters above U+FFFF are written as backslash, 'U' and eight
    upper-case hex digits; other characters above code 126 as backslash,
    'u' and four upper-case hex digits; everything else is copied through.
    The assembled string is passed through ``b()`` before being returned.
    """
    pieces = []
    for ch in ustr:
        code = ord(ch)
        if code > 65535:
            pieces.append("\\U%08X" % code)
        elif code > 126:
            pieces.append("\\u%04X" % code)
        else:
            pieces.append("%c" % code)
    # One join instead of repeated ``s = s + ch``, which copies the whole
    # accumulated string on every character.
    return b("".join(pieces))
def main():
    """Parse the N3 file named by sys.argv[1] and print every quad.

    The base URI is ``file://`` plus the file's path joined onto the
    current working directory, and the empty prefix is bound to that base
    followed by '#'.  The whole file is read as bytes and fed to a
    SinkParser writing into a fresh ConjunctiveGraph.
    """
    g = ConjunctiveGraph()

    sink = RDFSink(g)
    base = 'file://' + os.path.join(os.getcwd(), sys.argv[1])

    p = SinkParser(sink, baseURI=base)
    p._bindings[''] = p._baseURI + '#'
    p.startDoc()

    f = open(sys.argv[1], 'rb')
    try:
        # try/finally so the handle is released even when read() fails.
        data = f.read()
    finally:
        f.close()

    p.feed(data)
    p.endDoc()
    for t in g.quads((None, None, None)):
        # Parenthesised single argument: prints the same text under the
        # print statement and the print function.
        print(t)