diff options
author | Assim Deodia <assim.deodia@gmail.com> | 2008-06-18 04:44:47 (GMT) |
---|---|---|
committer | Assim Deodia <assim.deodia@gmail.com> | 2008-06-18 04:44:47 (GMT) |
commit | 8934f10685a6cec558cc9b9aeacf6c530b2cc6c6 (patch) | |
tree | 3c3fa0ad5edcefda2639867a170c83e3d677a4d6 | |
parent | 2385830803ca8b6eaf9a2666f26c0f5586167c46 (diff) |
dictionary and word class interface for sqlite DB
-rw-r--r-- | conv.sh | 43 | ||||
-rw-r--r-- | dict.db | bin | 0 -> 13402112 bytes | |||
-rw-r--r-- | dict/dict.sql.zip | bin | 0 -> 4913531 bytes | |||
-rw-r--r-- | dict/index.php | 69 | ||||
-rw-r--r-- | dictionary.py | 181 |
5 files changed, 219 insertions, 74 deletions
#!/bin/sh
#
# conv.sh - dump a MySQL database and load it into a SQLite file.
#
# Usage: conv.sh <dbname>
#
# Runs mysqldump on <dbname> in ANSI-compatible mode, rewrites the
# MySQL-specific DDL and string escaping so that sqlite3 accepts it, and
# loads the result into <dbname>.db.  On success the intermediate
# <dbname>.sql and <dbname>.err files are removed; on failure both are kept
# so they can be inspected.
#
# The MySQL host and user default to the values this script has always used
# and can be overridden through the MYSQL_HOST / MYSQL_USER environment
# variables.

MYSQL_HOST="${MYSQL_HOST:-10.250.100.69}"
MYSQL_USER="${MYSQL_USER:-assim}"

# `=` (not `==`) is the portable string comparison for /bin/sh; -z is simpler still.
if [ -z "$1" ]; then
    echo "Usage: $0 <dbname>" >&2
    exit 1
fi

if [ -e "$1.db" ]; then
    echo "$1.db already exists. I will overwrite it in 15 seconds if you do not press CTRL-C."
    # Actually give the user the grace period the message promises.
    sleep 15
    rm -- "$1.db"
fi

# Each filter strips or rewrites one MySQL construct that SQLite rejects:
#   grep -v ...          drop KEY / UNIQUE KEY / PRIMARY KEY lines
#   sed  ...             map integer types, auto_increment, enum, charset, "on update"
#   perl -e ...          remove the comma left dangling before ")" and wrap
#                        the whole dump in a single begin/commit
#   perl -pe ...         turn \' into '', \n into a newline, and split
#                        multi-row INSERTs into one INSERT per row
mysqldump -h "$MYSQL_HOST" -u "$MYSQL_USER" -p --compact --compatible=ansi --default-character-set=binary "$1" |
grep -v ' KEY "' |
grep -v ' UNIQUE KEY "' |
grep -v ' PRIMARY KEY ' |
sed 's/ unsigned / /g' |
sed 's/ auto_increment/ primary key autoincrement/gi' |
sed 's/ smallint([0-9]*) / integer /gi' |
sed 's/ tinyint([0-9]*) / integer /gi' |
sed 's/ int([0-9]*) / integer /gi' |
sed 's/ character set [^ ]* / /gi' |
sed 's/ enum([^)]*) / varchar(255) /gi' |
sed 's/ on update [^,]*//gi' |
perl -e 'local $/;$_=<>;s/,\n\)/\n\)/gs;print "begin;\n";print;print "commit;\n"' |
perl -pe '
if (/^(INSERT.+?)\(/) {
    $a=$1;
    s/\\'\''/'\'\''/g;
    s/\\n/\n/g;
    s/\),\(/\);\n$a\(/g;
}
' > "$1.sql"

# Capture stderr as well: sqlite3 may report SQL errors there, and the
# success check below relies on every diagnostic ending up in the .err file.
sqlite3 "$1.db" < "$1.sql" > "$1.err" 2>&1

# -s is true for a non-empty file; this avoids comparing the output of
# `wc -l`, which some platforms pad with spaces.
if [ ! -s "$1.err" ]; then
    echo "Conversion completed without error. Output file: $1.db"
    rm -- "$1.sql"
    rm -- "$1.err"
else
    echo "There were errors during conversion. Please review $1.err and $1.sql for details."
    exit 1
fi
<?php
/**
 * One-off copy script for the `wordnet30` MySQL database.
 *
 * Selects every row of `word` whose lemma matches the pattern
 * ^[a-z]*[a-z]$ (compared under the latin1_swedish_ci collation) and copies
 * it into `las_word`.  For each such word its rows in `sense` are copied to
 * `las_sense`, and for each sense the matching `synset` and `sample` rows
 * are copied to `las_synset` and `las_sample`.
 *
 * A failing INSERT/SELECT inside the loop is reported and skipped; only a
 * failure to connect or to run the initial word query aborts the script.
 *
 * NOTE(review): a synset reached through several words is inserted once per
 * sense that references it; whether `las_synset` tolerates that depends on
 * its key definition, which is not visible here.
 * NOTE: this script uses the legacy ext/mysql API (mysql_*), which was
 * removed in PHP 7; it needs PHP 5 or a port to mysqli/PDO to run.
 */

set_time_limit(0); // the copy walks every matching word, so lift the time limit

/**
 * Run a statement; on failure print $label followed by the MySQL error
 * (and the statement itself when $show_sql is true), then carry on.
 *
 * Returns whatever mysql_query() returned (false on failure).
 */
function las_query($sql, $label, $show_sql = false)
{
    $result = mysql_query($sql);
    if ($result === false) {
        print($label . mysql_error() . ($show_sql ? $sql : ''));
    }
    return $result;
}

/**
 * Quote a value as an SQL string literal using the connection-aware
 * escaper rather than addslashes().
 */
function las_quote($value)
{
    return "'" . mysql_real_escape_string($value) . "'";
}

mysql_connect('localhost', 'root', 'kandisa') or die('mysql_connect:' . mysql_error());
mysql_select_db('wordnet30') or die('mysql_select_db:' . mysql_error());

$q1 = "SELECT *
FROM `word`
WHERE `lemma` REGEXP CONVERT( _utf8 '^[a-z]*[a-z]$'
USING latin1 )
COLLATE latin1_swedish_ci";
$words = mysql_query($q1) or die('mysql_query1:' . mysql_error());

// Fetch until the result set is exhausted instead of maintaining a row counter.
while ($row = mysql_fetch_assoc($words)) {

    las_query(
        "INSERT into `las_word` (`wordid`, `lemma`) VALUES ( "
            . las_quote($row['wordid']) . ", " . las_quote($row['lemma']) . " )",
        'mysql_query:2'
    );

    $senses = las_query(
        "SELECT * FROM `sense` WHERE `wordid` = " . las_quote($row['wordid']),
        'mysql_query:3'
    );
    if ($senses === false) {
        continue; // already reported; nothing to copy for this word
    }

    while ($sense = mysql_fetch_assoc($senses)) {

        las_query(
            "INSERT INTO `las_sense` (`wordid`, `synsetid`, `rank`) VALUES ("
                . las_quote($sense['wordid']) . ", "
                . las_quote($sense['synsetid']) . ", "
                . las_quote($sense['rank']) . ")",
            'mysql_query:4'
        );

        $synsetid = las_quote($sense['synsetid']);

        $synsets = las_query("SELECT * FROM `synset` WHERE `synsetid` = " . $synsetid, 'mysql_query:5');
        $samples = las_query("SELECT * FROM `sample` WHERE `synsetid` = " . $synsetid, 'mysql_query:6');

        if ($synsets !== false) {
            while ($synset = mysql_fetch_assoc($synsets)) {
                las_query(
                    "INSERT INTO `las_synset` (`synsetid`, `pos`, `definition`) VALUES ("
                        . las_quote($synset['synsetid']) . ", "
                        . las_quote($synset['pos']) . ", "
                        . las_quote($synset['definition']) . ")",
                    'mysql_query:7',
                    true // definitions are free text; show the statement that failed
                );
            }
            mysql_free_result($synsets);
        }

        if ($samples !== false) {
            while ($sample = mysql_fetch_assoc($samples)) {
                las_query(
                    "INSERT INTO `las_sample` (`synsetid`, `sampleid`, `sample`) VALUES ("
                        . las_quote($sample['synsetid']) . ", "
                        . las_quote($sample['sampleid']) . ", "
                        . las_quote($sample['sample']) . ")",
                    'mysql_query:8'
                );
            }
            mysql_free_result($samples);
        }
    }

    mysql_free_result($senses);
    //echo "<br/>".$row['wordid'].":".$row['lemma'];
}

// The loop only ends once every word has been processed.
echo "Done :)";
?>
import random
import sqlite3
import sys

# Provides the API to control the dictionary.

__debug = True
# Default SQLite database file used by the `word` class.
DBname = "dict.db"
word_list = []


class dictionary:
    """Query interface over the ``las_word`` table of the dictionary DB.

    ``level`` arguments filter on the ``length`` column of ``las_word``;
    ``level == 0`` means "no filter".
    """

    def __init__(self, DBname):
        """Open the SQLite database file *DBname*."""
        self.conn = sqlite3.connect(DBname, isolation_level="IMMEDIATE")
        self.cur = self.conn.cursor()
        self.numwords = -1
        self.wordid_list = []
        self.level = 0
        # One cached count per level, so asking for a different level
        # never returns a count computed for another one.
        self._numwords_by_level = {}

    def close(self):
        """Close the cursor and the database connection."""
        self.cur.close()
        self.conn.close()

    def getnumwords(self, level=0):
        """Return the number of words, optionally only those of one level.

        The result is an ``int`` and is cached per *level*; the most recent
        result is also kept in ``self.numwords``.
        """
        if level not in self._numwords_by_level:
            if level == 0:
                self.cur.execute("SELECT COUNT(wordid) from las_word")
            else:
                self.cur.execute(
                    "SELECT COUNT(wordid) from las_word where length = ?",
                    (level,))
            # fetchone() yields a 1-tuple; unpack it so callers get the count.
            (count,) = self.cur.fetchone()
            self._numwords_by_level[level] = count
        self.numwords = self._numwords_by_level[level]
        return self.numwords

    def getrandomwordid(self, level=0):
        """Return the ``wordid`` of a randomly chosen word.

        The candidate ids are loaded once per *level* and reused until a
        different level is requested.

        Raises:
            IndexError: if no word matches *level*.
        """
        if not self.wordid_list or self.level != level:
            if level == 0:
                self.cur.execute("SELECT wordid from las_word")
            else:
                self.cur.execute(
                    "SELECT wordid from las_word where length = ?",
                    (level,))
            self.wordid_list = self.cur.fetchall()
            # Always record which level the cached list belongs to;
            # otherwise a list loaded for level 0 would later be served for
            # a filtered request.
            self.level = level
        if not self.wordid_list:
            raise IndexError("no words found for level %r" % (level,))
        # random.choice never indexes past the end, unlike
        # randint(0, len(list)), whose upper bound is inclusive.
        (wordid,) = random.choice(self.wordid_list)
        return wordid


class word:
    """One row of ``las_word`` plus lookups of its senses, definitions and
    usage samples.

    Attributes set by the constructor: ``las_word_id``, ``wordid``,
    ``word`` (the lemma) and ``length``.
    """

    # Accepted *identifier* values and the las_word column each one selects.
    _COLUMNS = {
        "las_word_id": "laswid",
        "wordid": "wordid",
        "word": "lemma",
    }

    def __init__(self, identifier, value, dbname=None):
        """Load the word whose *identifier* column equals *value*.

        Args:
            identifier: ``"las_word_id"``, ``"wordid"`` or ``"word"``.
            value: value to look up in the corresponding column.
            dbname: SQLite file to open; defaults to the module-level
                ``DBname``.

        Raises:
            ValueError: if *identifier* is not one of the accepted names.
            LookupError: if no row matches.
        """
        if identifier not in self._COLUMNS:
            # __init__ cannot return a value; raise instead.
            raise ValueError("Invalid Usage: unknown identifier %r"
                             % (identifier,))
        if dbname is None:
            dbname = DBname
        self.conn = sqlite3.connect(dbname, isolation_level="IMMEDIATE")
        self.cur = self.conn.cursor()
        # Initialised here so getdef()/getusage() work without a prior
        # getsynsetid() call.
        self.synsetid_list = []
        self.def_list = []
        self.usage_list = []

        # The column name comes from the fixed table above, never from the
        # caller, so interpolating it is safe; the value stays parameterized.
        self.cur.execute(
            "SELECT laswid, wordid, lemma, length from las_word where %s = ?"
            % self._COLUMNS[identifier], (value,))
        row = self.cur.fetchone()
        if row is None:
            self.close()
            raise LookupError("no word with %s = %r" % (identifier, value))
        (self.las_word_id, self.wordid, self.word, self.length) = row

    def close(self):
        """Close the cursor and the database connection."""
        self.cur.close()
        self.conn.close()

    def getword(self):
        """Return the lemma of this word."""
        return self.word

    def getsynsetid(self):
        """Return (and cache in ``synsetid_list``) the synset ids of this
        word, read from ``las_sense``."""
        self.cur.execute(
            "SELECT synsetid from las_sense where wordid = ?",
            (self.wordid,))
        self.synsetid_list = [synsetid for (synsetid,) in self.cur.fetchall()]
        return self.synsetid_list

    def getdef(self):
        """Return the ``las_synset`` rows for every synset of this word as a
        list of tuples."""
        if not self.synsetid_list:
            self.getsynsetid()
        self.def_list = []
        for synsetid in self.synsetid_list:
            self.cur.execute(
                "SELECT * from las_synset where synsetid = ?", (synsetid,))
            self.def_list.extend(self.cur.fetchall())
        return self.def_list

    def getusage(self):
        """Return the ``las_sample`` rows for every synset of this word as a
        list of tuples."""
        if not self.synsetid_list:
            self.getsynsetid()
        self.usage_list = []
        for synsetid in self.synsetid_list:
            self.cur.execute(
                "SELECT * from las_sample where synsetid = ?", (synsetid,))
            self.usage_list.extend(self.cur.fetchall())
        return self.usage_list


if __name__ == "__main__":
    k = dictionary(DBname)
    num_words = k.getnumwords()
    print(num_words)

    wordid = k.getrandomwordid(15)
    entry = word("wordid", wordid)

    print(entry.getword())
    entry.getsynsetid()
    print(entry.getdef())
    print(entry.getusage())
\ No newline at end of file |