From 8934f10685a6cec558cc9b9aeacf6c530b2cc6c6 Mon Sep 17 00:00:00 2001
From: Assim Deodia
Date: Wed, 18 Jun 2008 04:44:47 +0000
Subject: dictionary and word class interface for sqlite DB

---
Review notes (not part of the commit message):

* Layout: this patch had been collapsed onto a few long lines. One diff line
  per line is restored. Indentation and blank context lines are a
  best-effort reconstruction, and the hunk line counts are recomputed from
  the body below (originally conv.sh "+1,43", dictionary.py "-2,91 +2,124").
  The index hashes are the original ones and no longer match the new side.
* dict/index.php: the top of the file (everything before "0){") and the
  markup inside the commented-out echo were lost when the page was stripped
  of tags. That hunk is therefore incomplete; its original "+1,69" header
  is kept unchanged.
* conv.sh: POSIX "=" in the argument test, non-zero exit on usage error,
  quoted "$1", the promised 15 second pause is actually performed, sqlite3
  stderr is captured so the error check can fail, numeric error-count test.
* dict/index.php: "$num_rows" in the final test, lemma escaped like the
  definition and sample columns.
* dictionary.py: getnumwords() returns an int and caches per level;
  getrandomwordid() no longer indexes past the end of the list, tracks the
  cached level for level 0 and raises LookupError on an empty result;
  word.__init__ raises instead of returning a string and reports a missing
  row; synsetid_list exists before getdef()/getusage() test it; no-op
  module-level "global" statements and the stale XML comment are dropped.

diff --git a/conv.sh b/conv.sh
new file mode 100644
index 0000000..8f8a9f4
--- /dev/null
+++ b/conv.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Convert a MySQL database into an SQLite file named <database>.db.
+# Usage: conv.sh <database>
+
+if [ "x$1" = "x" ]; then
+    echo "Usage: $0 <database>"
+    exit 1
+fi
+
+if [ -e "$1.db" ]; then
+    echo "$1.db already exists. I will overwrite it in 15 seconds if you do not press CTRL-C."
+    # Actually give the user the promised time to abort.
+    sleep 15
+    rm "$1.db"
+fi
+
+# Dump schema and data, then rewrite MySQL-only syntax for SQLite.
+mysqldump -h 10.250.100.69 -u assim -p --compact --compatible=ansi --default-character-set=binary "$1" |
+grep -v ' KEY "' |
+grep -v ' UNIQUE KEY "' |
+grep -v ' PRIMARY KEY ' |
+sed 's/ unsigned / /g' |
+sed 's/ auto_increment/ primary key autoincrement/gi' |
+sed 's/ smallint([0-9]*) / integer /gi' |
+sed 's/ tinyint([0-9]*) / integer /gi' |
+sed 's/ int([0-9]*) / integer /gi' |
+sed 's/ character set [^ ]* / /gi' |
+sed 's/ enum([^)]*) / varchar(255) /gi' |
+sed 's/ on update [^,]*//gi' |
+perl -e 'local $/;$_=<>;s/,\n\)/\n\)/gs;print "begin;\n";print;print "commit;\n"' |
+perl -pe '
+if (/^(INSERT.+?)\(/) {
+$a=$1;
+s/\\'\''/'\'\''/g;
+s/\\n/\n/g;
+s/\),\(/\);\n$a\(/g;
+}
+' > "$1.sql"
+# sqlite3 reports errors on stderr, so capture both streams.
+sqlite3 "$1.db" < "$1.sql" > "$1.err" 2>&1
+ERRORS=$(wc -l < "$1.err")
+if [ "$ERRORS" -eq 0 ]; then
+    echo "Conversion completed without error. Output file: $1.db"
+    rm "$1.sql"
+    rm "$1.err"
+else
+    echo "There were errors during conversion. Please review $1.err and $1.sql for details."
+fi
diff --git a/dict.db b/dict.db
new file mode 100644
index 0000000..8765420
--- /dev/null
+++ b/dict.db
Binary files differ
diff --git a/dict/dict.sql.zip b/dict/dict.sql.zip
new file mode 100644
index 0000000..5a8d304
--- /dev/null
+++ b/dict/dict.sql.zip
Binary files differ
diff --git a/dict/index.php b/dict/index.php
new file mode 100644
index 0000000..b9588f4
--- /dev/null
+++ b/dict/index.php
@@ -0,0 +1,69 @@
+ 0){
+
+
+    $row = mysql_fetch_assoc($result);
+    $q2 = "INSERT into `las_word` (`wordid`, `lemma`) VALUES ( '".$row['wordid']."', '".addslashes($row['lemma'])."' )";
+    $result2 = mysql_query($q2) or print('mysql_query:2'.mysql_error());
+
+    $q3 = "SELECT * FROM `sense` WHERE `wordid` = '".$row['wordid']."'";
+    $result3 = mysql_query($q3) or print('mysql_query:3'.mysql_error());
+    $num_rows3 = mysql_num_rows($result3);
+
+
+    for($i = 0; $i< $num_rows3; $i++){
+
+        $row3 = mysql_fetch_assoc($result3);
+
+        $q4 = "INSERT INTO `las_sense` (`wordid`, `synsetid`, `rank`) VALUES ('".$row3['wordid']."', '".$row3['synsetid']."', '".$row3['rank']."')";
+        $result4 = mysql_query($q4) or print('mysql_query:4'.mysql_error());
+
+        $q5 = "SELECT * FROM `synset` WHERE `synsetid` = '".$row3['synsetid']."'";
+        $q6 = "SELECT * FROM `sample` WHERE `synsetid` = '".$row3['synsetid']."'";
+
+        $result5 = mysql_query($q5) or print('mysql_query:5'.mysql_error());
+        $num_rows5 = mysql_num_rows($result5);
+
+        $result6 = mysql_query($q6) or print('mysql_query:6'.mysql_error());
+        $num_rows6 = mysql_num_rows($result6);
+
+        for($j = 0;$j < $num_rows5; $j++){
+
+            $row5 = mysql_fetch_assoc($result5);
+            $def = addslashes($row5['definition']);
+            $q7 = "INSERT INTO `las_synset` (`synsetid`, `pos`, `definition`) VALUES ('".$row5['synsetid']."', '".$row5['pos']."', '".$def."')";
+            $result7 = mysql_query($q7) or print('mysql_query:7'.mysql_error().$q7);
+
+        }
+
+        for($k = 0;$k < $num_rows6; $k++){
+
+            $row6 = mysql_fetch_assoc($result6);
+            $sample = addslashes($row6['sample']);
+            $q8 = "INSERT INTO `las_sample` (`synsetid`, `sampleid`, `sample`) VALUES ('".$row6['synsetid']."', '".$row6['sampleid']."', '".$sample."')";
+            $result8 = mysql_query($q8) or print('mysql_query:8'.mysql_error());
+
+        }
+
+    }
+
+    $num_rows --;
+    //echo "".$row['wordid'].":".$row['lemma'];
+}
+if($num_rows == 0)
+    echo "Done :)";
+?>
\ No newline at end of file
diff --git a/dictionary.py b/dictionary.py
index b962714..a7ad273 100644
--- a/dictionary.py
+++ b/dictionary.py
@@ -2,87 +2,133 @@ import sys
 
 # Provides the API to control the dictionary.
 
-global __root__
-global __element__
-global __word__
-global __def__
-global __phnm__
-global __pos__
-global __src__
-global __fld__
-global __debug
-global __numword
-global __DBName
-global __DBTableName
 
-__root__ = "dictionary"
-__element__ = "p"
-__word__ = "hw"
-__def__ = "def"
-__phnm__ = "pr"
-__pos__ = "pos"
-__src__ = "source"
-__fld__ = "fld"
-__debug = True
-__numword = -1
-__DBName = "/tmp/las"
-__DBTableName = "dictionary" #Currently not used
+__debug = True
+DBname = "dict.db"
+word_list = []
 
-#strings which are tag in XML
+
 class dictionary:
-    def __init__(self, filename):
-        from xml.etree.ElementTree import ElementTree
-        #create an ElementTree instance from an XML file
-        self.ETree = ElementTree(file=filename)
-        if self.ETree.getroot().tag != __root__:
-            print "Invalid File"
-            sys.exit(0)
-
-    def getroottag(self):
-        return self.ETree.getroot().tag
-    def getnumwords(self):
-        self.iter = self.ETree.getiterator(__element__)
-        __numword = len(self.iter)
-        return __numword
-    def loadict(self):
-        self.MakeDB()
-        for element in self.iter:
-            if element.getchildren():
-                #Can also use: "for child in element.getchildren():"
-                tempDict = {}
-                tempDict["word"] = ""
-                tempDict["def"] = ""
-                tempDict["phnm"] = ""
-                tempDict["src"] = ""
-                tempDict["pos"] = ""
-                for child in element:
-                    #Child element tag name
-                    if child.tag == __word__:
-                        tempDict["word"] = child.text
-                    elif child.tag == __def__:
-                        tempDict["def"] = child.text
-                    elif child.tag == __phnm__:
-                        tempDict["phnm"] = child.text
-                    elif child.tag == __src__:
-                        tempDict["src"] = child.text
-                    elif child.tag == __pos__:
-                        tempDict["pos"] = child.text
-                t = ( tempDict["word"],tempDict["def"],tempDict["phnm"],tempDict["src"],tempDict["pos"] )
-                self.c.execute( "insert into dict values (?, ? , ?, ?, ?)", t )
-        self.conn.commit()
-        self.c.close()
-        self.conn.close()
+    """Read-only interface to the word list stored in the SQLite database."""
+
+    def __init__(self, DBname):
+        """Open the SQLite database file ``DBname`` and set up the caches."""
 
-    def MakeDB(self):
         import sqlite3
+        # Same isolation level the connection previously ended up with.
+        self.conn = sqlite3.connect(DBname, isolation_level="IMMEDIATE")
+        self.cur = self.conn.cursor()
+        # Word counts already fetched, keyed by level.
+        self._numwords = {}
+        self.wordid_list = []
+        # Level the cached wordid_list was loaded for (None: not loaded yet).
+        self.level = None
+
+    def getnumwords(self, level=0):
+        """Return the number of words as an int.
+
+        level 0 counts every word; any other value counts only the words
+        whose ``length`` column equals ``level``.
+        """
+        if level not in self._numwords:
+            if level == 0:
+                self.cur.execute("SELECT COUNT(wordid) from las_word")
+            else:
+                self.cur.execute("SELECT COUNT(wordid) from las_word where length = ?", (level,))
+            # fetchone() returns a 1-tuple; unwrap it so callers get the number.
+            self._numwords[level] = self.cur.fetchone()[0]
+        return self._numwords[level]
+
+    def getrandomwordid(self, level=0):
+        """Return the wordid of a randomly chosen word.
+
+        level selects words as in getnumwords().  Raises LookupError when no
+        word matches.
+        """
+        if not self.wordid_list or self.level != level:
+            if level == 0:
+                self.cur.execute("SELECT wordid from las_word")
+            else:
+                self.cur.execute("SELECT wordid from las_word where length = ?", (level,))
+            self.wordid_list = self.cur.fetchall()
+            # Record the level for every reload, including level 0.
+            self.level = level
+        if not self.wordid_list:
+            raise LookupError("no words found for level %r" % (level,))
         import random
-        self.conn = sqlite3.connect("las.db")
-        self.c = self.conn.cursor()
-        self.c.execute('''DROP TABLE IF EXISTS dict''')
-        self.c.execute('''CREATE table IF NOT EXISTS dict (word text, def text, phnm text, src text, pos text)''')
+        # choice() cannot index past the end, unlike randint(0, count).
+        (wordid,) = random.choice(self.wordid_list)
+        return wordid
+
+
+class word:
+    """One row of las_word with lookups of its senses, definitions and samples."""
+
+    # las_word column searched for each accepted identifier keyword.
+    _COLUMNS = {"las_word_id": "laswid", "wordid": "wordid", "word": "lemma"}
+
+    def __init__(self, identifier, value):
+        """Load the word whose ``identifier`` column equals ``value``.
+
+        identifier is "las_word_id", "wordid" or "word".  Raises ValueError
+        for any other identifier and LookupError when no row matches.
+        """
+        import sqlite3
+        if identifier not in self._COLUMNS:
+            # __init__ must not return a value, so report misuse by raising.
+            raise ValueError("Invalid Usage: unknown identifier %r" % (identifier,))
+        self.conn = sqlite3.connect(DBname, isolation_level="IMMEDIATE")
+        self.cur = self.conn.cursor()
+        # The column name comes from _COLUMNS, never from the caller.
+        self.cur.execute("SELECT laswid, wordid, lemma, length from las_word where %s = ?"
+                         % self._COLUMNS[identifier], (value,))
+        row = self.cur.fetchone()
+        if row is None:
+            raise LookupError("no word with %s = %r" % (identifier, value))
+        (self.las_word_id, self.wordid, self.word, self.length) = row
+        # Filled in by getsynsetid(); defined here so getdef()/getusage() can test it.
+        self.synsetid_list = []
+
+    def getword(self):
+        """Return the word text (lemma)."""
+        return self.word
+
+    def getsynsetid(self):
+        """Fetch, store and return the synset ids of this word."""
+        self.cur.execute("SELECT synsetid from las_sense where wordid = ?", (self.wordid,))
+        self.synsetid_list = [synsetid for (synsetid,) in self.cur]
+        return self.synsetid_list
+
+    def getdef(self):
+        """Return a list of (synsetid, pos, definition) tuples for this word."""
+        if not self.synsetid_list:
+            self.getsynsetid()
+        self.def_list = []
+        for synsetid in self.synsetid_list:
+            self.cur.execute("SELECT synsetid, pos, definition from las_synset where synsetid = ?", (synsetid,))
+            self.def_list.extend(self.cur.fetchall())
+        return self.def_list
+
+    def getusage(self):
+        """Return a list of (synsetid, sampleid, sample) tuples for this word."""
+        if not self.synsetid_list:
+            self.getsynsetid()
+        self.usage_list = []
+        for synsetid in self.synsetid_list:
+            self.cur.execute("SELECT synsetid, sampleid, sample from las_sample where synsetid = ?", (synsetid,))
+            self.usage_list.extend(self.cur.fetchall())
+        return self.usage_list
+
 
 if __name__ == "__main__":
-    k = dictionary("dict/k.xml")
-    print k.getroottag()
-    print k.getnumwords()
-    k.loadict()
\ No newline at end of file
+    k = dictionary("dict.db")
+    num_words = k.getnumwords()
+    print(num_words)
+
+    wordid = k.getrandomwordid(15)
+    l = word("wordid", wordid)
+
+    print(l.getword())
+    l.getsynsetid()
+    print(l.getdef())
+    print(l.getusage())
-- 
cgit v0.9.1