From b2d38f17e56bf3b391fe20b77532df23e5721413 Mon Sep 17 00:00:00 2001 From: Aleksey Lim Date: Thu, 12 Aug 2010 16:20:23 +0000 Subject: Add copyright notes to python files --- (limited to 'aiml/AimlParser.py') diff --git a/aiml/AimlParser.py b/aiml/AimlParser.py new file mode 100644 index 0000000..75c2cf1 --- /dev/null +++ b/aiml/AimlParser.py @@ -0,0 +1,545 @@ +from xml.sax.handler import ContentHandler +from xml.sax.xmlreader import Locator +import sys +import xml.sax +import xml.sax.handler + +class AimlParserError(Exception): pass + +class AimlHandler(ContentHandler): + # The legal states of the AIML parser + _STATE_OutsideAiml = 0 + _STATE_InsideAiml = 1 + _STATE_InsideCategory = 2 + _STATE_InsidePattern = 3 + _STATE_AfterPattern = 4 + _STATE_InsideThat = 5 + _STATE_AfterThat = 6 + _STATE_InsideTemplate = 7 + _STATE_AfterTemplate = 8 + + def __init__(self, encoding = "UTF-8"): + self.categories = {} + self._encoding = encoding + self._state = self._STATE_OutsideAiml + self._version = "" + self._namespace = "" + self._forwardCompatibleMode = False + self._currentPattern = "" + self._currentThat = "" + self._currentTopic = "" + self._insideTopic = False + self._currentUnknown = "" # the name of the current unknown element + + # This is set to true when a parse error occurs in a category. + self._skipCurrentCategory = False + + # Counts the number of parse errors in a particular AIML document. + # query with getNumErrors(). If 0, the document is AIML-compliant. + self._numParseErrors = 0 + + # TODO: select the proper validInfo table based on the version number. + self._validInfo = self._validationInfo101 + + # This stack of bools is used when parsing
  <li> elements inside + # <condition> elements, to keep track of whether or not an + # attribute-less "default"
  <li> element has been found yet. Only + # one default
  <li> is allowed in each <condition> element. We need + # a stack in order to correctly handle nested <condition> tags. + self._foundDefaultLiStack = [] + + # This stack of strings indicates what the current whitespace-handling + # behavior should be. Each string in the stack is either "default" or + # "preserve". When a new AIML element is encountered, a new string is + # pushed onto the stack, based on the value of the element's "xml:space" + # attribute (if absent, the top of the stack is pushed again). When + # ending an element, pop an object off the stack. + self._whitespaceBehaviorStack = ["default"] + + self._elemStack = [] + self._locator = Locator() + self.setDocumentLocator(self._locator) + + def getNumErrors(self): + "Return the number of errors found while parsing the current document." + return self._numParseErrors + + def setEncoding(self, encoding): + """Set the text encoding to use when encoding strings read from XML. + + Defaults to 'UTF-8'. + + """ + self._encoding = encoding + + def _location(self): + "Return a string describing the current location in the source file." + line = self._locator.getLineNumber() + column = self._locator.getColumnNumber() + return "(line %d, column %d)" % (line, column) + + def _pushWhitespaceBehavior(self, attr): + """Push a new string onto the whitespaceBehaviorStack. + + The string's value is taken from the "xml:space" attribute, if it exists + and has a legal value ("default" or "preserve"). Otherwise, the previous + stack element is duplicated. + + """ + assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!" 
+ try: + if attr["xml:space"] == "default" or attr["xml:space"] == "preserve": + self._whitespaceBehaviorStack.append(attr["xml:space"]) + else: + raise AimlParserError, "Invalid value for xml:space attribute "+self._location() + except KeyError: + self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1]) + + def startElementNS(self, name, qname, attr): + print "QNAME:", qname + print "NAME:", name + uri,elem = name + if (elem == "bot"): print "name:", attr.getValueByQName("name"), "a'ite?" + self.startElement(elem, attr) + pass + + def startElement(self, name, attr): + # Wrapper around _startElement, which catches errors in _startElement() + # and keeps going. + + # If we're inside an unknown element, ignore everything until we're + # out again. + if self._currentUnknown != "": + return + # If we're skipping the current category, ignore everything until + # it's finished. + if self._skipCurrentCategory: + return + + # process this start-element. + try: self._startElement(name, attr) + except AimlParserError, msg: + # Print the error message + sys.stderr.write("PARSE ERROR: %s\n" % msg) + + self._numParseErrors += 1 # increment error count + # In case of a parse error, if we're inside a category, skip it. + if self._state >= self._STATE_InsideCategory: + self._skipCurrentCategory = True + + def _startElement(self, name, attr): + if name == "aiml": + # tags are only legal in the OutsideAiml state + if self._state != self._STATE_OutsideAiml: + raise AimlParserError, "Unexpected tag "+self._location() + self._state = self._STATE_InsideAiml + self._insideTopic = False + self._currentTopic = u"" + try: self._version = attr["version"] + except KeyError: + # This SHOULD be a syntax error, but so many AIML sets out there are missing + # "version" attributes that it just seems nicer to let it slide. 
+ #raise AimlParserError, "Missing 'version' attribute in tag "+self._location() + #print "WARNING: Missing 'version' attribute in tag "+self._location() + #print " Defaulting to version 1.0" + self._version = "1.0" + self._forwardCompatibleMode = (self._version != "1.0.1") + self._pushWhitespaceBehavior(attr) + # Not sure about this namespace business yet... + #try: + # self._namespace = attr["xmlns"] + # if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1": + # raise AimlParserError, "Incorrect namespace for AIML v1.0.1 "+self._location() + #except KeyError: + # if self._version != "1.0": + # raise AimlParserError, "Missing 'version' attribute(s) in tag "+self._location() + elif self._state == self._STATE_OutsideAiml: + # If we're outside of an AIML element, we ignore all tags. + return + elif name == "topic": + # tags are only legal in the InsideAiml state, and only + # if we're not already inside a topic. + if (self._state != self._STATE_InsideAiml) or self._insideTopic: + raise AimlParserError, "Unexpected tag", self._location() + try: self._currentTopic = unicode(attr['name']) + except KeyError: + raise AimlParserError, "Required \"name\" attribute missing in element "+self._location() + self._insideTopic = True + elif name == "category": + # tags are only legal in the InsideAiml state + if self._state != self._STATE_InsideAiml: + raise AimlParserError, "Unexpected tag "+self._location() + self._state = self._STATE_InsideCategory + self._currentPattern = u"" + self._currentThat = u"" + # If we're not inside a topic, the topic is implicitly set to * + if not self._insideTopic: self._currentTopic = u"*" + self._elemStack = [] + self._pushWhitespaceBehavior(attr) + elif name == "pattern": + # tags are only legal in the InsideCategory state + if self._state != self._STATE_InsideCategory: + raise AimlParserError, "Unexpected tag "+self._location() + self._state = self._STATE_InsidePattern + elif name == "that" and self._state == 
 self._STATE_AfterPattern: + # <that> elements are legal either inside a