creactistore/_templates/lib/rdflib/plugins/parsers/rdfa/literal.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180

# -*- coding: utf-8 -*-
"""
Implementation of the Literal handling. Details of the algorithm are described on
U{RDFa Task Force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>}.

@summary: RDFa Literal generation
@requires: U{RDFLib package<http://rdflib.net>}
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
"""

import re
from rdflib.namespace import RDF
from rdflib.term import Literal

# Only the entry point is exported; everything else in this module is internal.
__all__ = ['generate_literal']

# Module-level shorthand for rdflib's RDF.XMLLiteral term; it is used below as
# the datatype of literals that are generated from element markup.
XMLLiteral = RDF.XMLLiteral


def __putBackEntities(str):
    """Re-escape the '&', '<', and '>' characters as XML entities.

    Used when building an XML Literal, so that text taken from the DOM ends up
    as a well-formed XML string again.

    @param str: string to be converted
    @return: the same string with the three characters replaced by entities
    @rtype: string
    """
    # The ampersand has to be handled first, otherwise the '&' introduced by
    # the '&lt;' / '&gt;' replacements would be escaped a second time.
    substitutions = (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'))
    escaped = str
    for character, entity in substitutions:
        escaped = escaped.replace(character, entity)
    return escaped

#### The real meat...
def generate_literal(node, graph, subject, state):
    """Generate the literal for the C{@property} attribute, taking into account datatype, etc.
    Note: this method is called only if the C{@property} is indeed present, no need to check.

    This method is an encoding of the algorithm documented on the
    U{task force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>}.

    One triple C{(subject, property, literal)} is added to the graph for every property
    URI found in C{@property}. The literal is built, in order of precedence, from:
      - the C{@content} attribute, if present (with the C{@datatype}, if any);
      - the XML serialization of the children, if C{@datatype} is rdf:XMLLiteral;
      - the text content of the element, if any other (possibly empty) C{@datatype} is set;
      - the XML serialization of the children, if there is no C{@datatype} but the
        element has element children;
      - the text content of the element otherwise.

    The return value tells whether a 'normal' literal (regardless of its datatype) or an
    XML Literal has been generated; it is used to control whether the parser should stop
    recursion. A literal generated from C{@content} always counts as a 'normal' one,
    regardless of the possible C{@datatype} value.

    @param node: DOM element node
    @param graph: the (RDF) graph to add the properties to
    @param subject: the RDFLib URIRef serving as a subject for the generated triples
    @param state: the current state to be used for the CURIE-s
    @type state: L{State.ExecutionContext}
    @return: True if a 'normal' literal was generated (this includes the C{@content} case),
    False if an XML Literal was generated from the element's markup
    @rtype: Boolean
    """
    def _collect_text(Pnode):
        """
        Get (recursively) the raw, non-normalized text content of a DOM Node.

        @param Pnode: DOM Node
        @return: string
        """
        parts = []
        for child in Pnode.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                parts.append(_collect_text(child))
        return "".join(parts)
    # end _collect_text

    def _get_literal(Pnode):
        """
        Get the full text from a DOM Node, normalizing white space unless the
        C{space_preserve} option is set.

        @param Pnode: DOM Node
        @return: string
        """
        text = _collect_text(Pnode)
        # The decision of the group in February 2008 is not to normalize the result by default.
        # This is reflected in the default value of the option
        if state.options.space_preserve:
            return text
        # Normalization is applied once, on the complete text. Doing it at every level of
        # the recursion would strip the white space at element boundaries and glue words
        # together ("a<b> c </b>d" would become "acd" instead of "a c d").
        return re.sub(r'(\r| |\n|\t)+', " ", text).strip()
    # end _get_literal

    def _get_XML_literal(Pnode):
        """
        Get the XML Literal content of a DOM Node. (Most of the processing is done
        via a C{node.toxml} call of the xml minidom implementation.)

        Note that the element children of C{Pnode} are modified in place: namespace
        declarations and C{xml:lang} are added to them before they are serialized.

        @param Pnode: DOM Node
        @return: string
        """
        # Prefixes that are predefined by the XML Namespaces specification and must
        # therefore not be (re)declared on the serialized elements.
        reserved = ("xml", "xmlns")

        def collect_prefixes(prefixes, element):
            """Add to C{prefixes} every namespace prefix used in the subtree of C{element}."""
            def add_prefix(qname):
                prefix, colon, _local = qname.partition(':')
                if colon and prefix not in prefixes:
                    prefixes.append(prefix)
            # end add_prefix

            # first the name of the element itself
            add_prefix(element.tagName)
            # then the attributes; these live in the attribute map of the element,
            # they never appear among its child nodes
            attributes = element.attributes
            if attributes is not None:
                for name in attributes.keys():
                    if name.split(':')[0] not in reserved:
                        add_prefix(name)
            # finally the element children
            for child in element.childNodes:
                if child.nodeType == child.ELEMENT_NODE:
                    collect_prefixes(prefixes, child)
        # end collect_prefixes

        prefixes = []
        for child in Pnode.childNodes:
            if child.nodeType == child.ELEMENT_NODE:
                collect_prefixes(prefixes, child)

        parts = []
        for child in Pnode.childNodes:
            if child.nodeType == child.TEXT_NODE:
                parts.append(__putBackEntities(child.data))
            elif child.nodeType == child.ELEMENT_NODE:
                # Decorate the element with namespaces and lang values, so that the
                # serialized fragment is self-contained
                for prefix in prefixes:
                    declaration = "xmlns:%s" % prefix
                    if prefix in state.ns and not child.hasAttribute(declaration):
                        child.setAttribute(declaration, "%s" % state.ns[prefix])
                # Set the default namespace, if not done (and is available)
                if not child.getAttribute("xmlns") and state.defaultNS is not None:
                    child.setAttribute("xmlns", state.defaultNS)
                # Get the lang, if necessary
                if not child.getAttribute("xml:lang") and state.lang is not None:
                    child.setAttribute("xml:lang", state.lang)
                parts.append(child.toxml())
        # XML Literals are returned as they are, i.e., white space is not normalized.
        return "".join(parts)
    # end _get_XML_literal

    # Most of the times the literal is a 'normal' one, ie, not an XML Literal
    retval = True

    # Get the Property URI-s
    props = state.get_resources(node.getAttribute("property"), prop=True)

    # Get, if exists, the value of @datatype, and figure out the language.
    # An empty @datatype counts as "set" (it forces a plain text literal), but it
    # neither yields a datatype URI nor does it cancel the language.
    datatype = None
    dtset = node.hasAttribute("datatype")
    lang = state.lang
    if dtset:
        dt = node.getAttribute("datatype")
        if dt != "":
            datatype = state.get_resource(dt)
            lang = None

    if node.hasAttribute("content"):
        # The simple case: separate @content attribute. The value of datatype has been
        # set, and the keyword parameters take care of the rest
        literal = Literal(node.getAttribute("content"), datatype=datatype, lang=lang)
    elif dtset:
        # There *is* a datatype (even if it is empty!). The Literal content is the pure
        # text part of the current element, unless the specified datatype is, in fact,
        # an explicit XML Literal
        if datatype == XMLLiteral:
            literal = Literal(_get_XML_literal(node), datatype=XMLLiteral)
            retval = False
        else:
            literal = Literal(_get_literal(node), datatype=datatype, lang=lang)
    elif any(child.nodeType == node.ELEMENT_NODE for child in node.childNodes):
        # no controlling @datatype, but there is markup in the contained element:
        # an XML Literal should be generated
        literal = Literal(_get_XML_literal(node), datatype=XMLLiteral)
        retval = False
    else:
        # no controlling @datatype and no markup: plain literal with the current language
        literal = Literal(_get_literal(node), lang=lang)

    # NOTE: the literal is added even if it is empty (for example in an ill-defined
    # <meta> element); an earlier check against the empty string has been removed.
    for prop in props:
        graph.add((subject, prop, literal))

    return retval