1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
|
# -*- coding: utf-8 -*-
"""
Implementation of the Literal handling. Details of the algorithm are described on
U{RDFa Task Force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>}.
@summary: RDFa Literal generation
@requires: U{RDFLib package<http://rdflib.net>}
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
"""
import re
from rdflib.namespace import RDF
from rdflib.term import Literal
# Public API of this module: only the literal generator is exported.
__all__ = ['generate_literal']
# Datatype term used to mark XML Literals, taken from rdflib's RDF namespace.
XMLLiteral = RDF.XMLLiteral
def __putBackEntities(str):
    """Put 'back' entities for the '&', '<', and '>' characters, to produce kosher XML string.
    Used by XML Literal: the DOM hands back text nodes with the entities already
    resolved into real characters, so they have to be escaped again before the
    text can be embedded into serialized XML.

    Note: the parameter name shadows the C{str} builtin; it is kept unchanged
    for backward compatibility with keyword callers.

    @param str: string to be converted
    @return: string with entities
    @rtype: string
    """
    # '&' must be escaped first; otherwise the ampersands introduced by the
    # '&lt;' and '&gt;' replacements would themselves be escaped a second time.
    return str.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
#### The real meat...
def generate_literal(node, graph, subject, state):
    """Generate the literal for the C{@property}, taking into account datatype, etc.
    Note: this method is called only if the C{@property} is indeed present, no need to check.

    This method is an encoding of the algorithm documented on the
    U{task force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>}.

    The literal is chosen as follows:
      - if C{@content} is present, its value is used, together with the
        C{@datatype} (if any) or the current language;
      - otherwise, if C{@datatype} is present and denotes an XML Literal, the
        serialized XML content of the element is used;
      - otherwise, if C{@datatype} is present (possibly empty), the pure text
        content of the element is used;
      - otherwise (no C{@datatype}), an XML Literal is generated if the element
        has at least one element child, and a plain literal from the text
        content if it has none.

    One triple C{(subject, property, literal)} is added to the graph for each
    property URI obtained from C{@property}.

    The method returns a value whether the literal is a 'normal' literal (regardless of its datatype)
    or an XML Literal. The return value is True or False, respectively. This value is used to control whether
    the parser should stop recursion. A literal generated from C{@content} is always reported
    as a 'normal' one (True), regardless of the possible C{@datatype} value.

    @param node: DOM element node
    @param graph: the (RDF) graph to add the properties to
    @param subject: the RDFLib URIRef serving as a subject for the generated triples
    @param state: the current state to be used for the CURIE-s
    @type state: L{State.ExecutionContext}
    @return: whether the literal is a 'normal' or an XML Literal (return value is True or False, respectively). Note that if the literal is generated from @content, the return value is True, regardless of the possible @datatype value.
    @rtype: Boolean
    """
    def _get_literal(Pnode):
        """
        Get (recursively) the full text from a DOM Node.

        @param Pnode: DOM Node
        @return: string
        """
        parts = []
        for child in Pnode.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                parts.append(_get_literal(child))
        text = "".join(parts)
        # The decision of the group in February 2008 is not to normalize the result by default.
        # This is reflected in the default value of the option
        if state.options.space_preserve:
            return text
        # Note that, because of the recursion, the text of every nested element
        # is normalized (and stripped) on its own before being concatenated.
        return re.sub(r'(\r| |\n|\t)+', " ", text).strip()
    # end _get_literal

    def _get_XML_literal(Pnode):
        """
        Get (recursively) the XML Literal content of a DOM Node. (Most of the processing is done
        via a C{node.toxml} call of the xml minidom implementation.)

        Side effect: the element children of C{Pnode} are decorated in place
        with the namespace declarations and the language they inherit from
        the current state, so that the serialized fragment is self-contained.

        @param Pnode: DOM Node
        @return: string
        """
        def collectPrefixes(prefixes, element):
            """Collect (recursively) into C{prefixes} the namespace prefixes used in C{element}."""
            def addPf(prefx, string):
                # Register the part before the first ':' of a qualified name;
                # names without a ':' have no prefix and are ignored.
                pf = string.split(':')[0]
                if pf != string and pf not in prefx:
                    prefx.append(pf)
            # end addPf

            # first the name of the element itself
            addPf(prefixes, element.tagName)
            # then the children
            for child in element.childNodes:
                if child.nodeType == child.ELEMENT_NODE:
                    collectPrefixes(prefixes, child)
                elif child.nodeType == child.ATTRIBUTE_NODE:
                    # NOTE(review): with xml.dom.minidom attributes do not seem to
                    # appear among childNodes, so this branch is presumably never
                    # taken -- confirm. It used to refer to the nonexistent
                    # 'node.child.name', which would have raised AttributeError.
                    addPf(prefixes, child.name)
        # end collectPrefixes

        # First pass: gather the prefixes of *all* element children, because each
        # of them is decorated below with the complete set.
        prefixes = []
        for child in Pnode.childNodes:
            if child.nodeType == child.ELEMENT_NODE:
                collectPrefixes(prefixes, child)

        # Second pass: serialize the children
        parts = []
        for child in Pnode.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(__putBackEntities(child.data))
            elif child.nodeType == child.ELEMENT_NODE:
                # Decorate the element with namespaces and lang values
                for prefix in prefixes:
                    xmlns_attr = "xmlns:%s" % prefix
                    if prefix in state.ns and not child.hasAttribute(xmlns_attr):
                        child.setAttribute(xmlns_attr, "%s" % state.ns[prefix])
                # Set the default namespace, if not done (and is available)
                if not child.getAttribute("xmlns") and state.defaultNS is not None:
                    child.setAttribute("xmlns", state.defaultNS)
                # Get the lang, if necessary
                if not child.getAttribute("xml:lang") and state.lang is not None:
                    child.setAttribute("xml:lang", state.lang)
                parts.append(child.toxml())
        # The XML Literal is deliberately *not* canonicalized for space.
        return "".join(parts)
    # end _get_XML_literal

    # Most of the times the literal is a 'normal' one, ie, not an XML Literal
    retval = True

    # Get the Property URI-s
    props = state.get_resources(node.getAttribute("property"), prop=True)

    # Get, if exists, the value of @datatype, and figure out the language
    datatype = None
    dtset = node.hasAttribute("datatype")
    lang = state.lang
    if dtset:
        dt = node.getAttribute("datatype")
        if dt != "":
            datatype = state.get_resource(dt)
            # a typed literal does not carry a language tag
            lang = None

    if node.hasAttribute("content"):
        # The simple case: separate @content attribute.
        # The value of datatype has been set, and the keyword parameters take care of the rest
        literal = Literal(node.getAttribute("content"), datatype=datatype, lang=lang)
    elif dtset:
        # There *is* a datatype (even if it is empty!). The Literal content is the
        # pure text part of the current element, unless the specified datatype is,
        # in fact, an explicit XML Literal
        if datatype == XMLLiteral:
            literal = Literal(_get_XML_literal(node), datatype=XMLLiteral)
            retval = False
        else:
            literal = Literal(_get_literal(node), datatype=datatype, lang=lang)
    elif any(n.nodeType == node.ELEMENT_NODE for n in node.childNodes):
        # No controlling @datatype, but there is markup in the contained
        # element: an XML Literal should be generated
        literal = Literal(_get_XML_literal(node), datatype=XMLLiteral)
        retval = False
    else:
        # No controlling @datatype and no markup: plain literal
        literal = Literal(_get_literal(node), lang=lang)

    # NOTE: the triples are added even if the literal is empty (for example in
    # an ill-defined <meta> element); an earlier 'object != ""' guard had
    # already been disabled.
    for prop in props:
        graph.add((subject, prop, literal))

    return retval
|