Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAssim Deodia <assim.deodia@gmail.com>2008-06-18 04:44:47 (GMT)
committer Assim Deodia <assim.deodia@gmail.com>2008-06-18 04:44:47 (GMT)
commit8934f10685a6cec558cc9b9aeacf6c530b2cc6c6 (patch)
tree3c3fa0ad5edcefda2639867a170c83e3d677a4d6
parent2385830803ca8b6eaf9a2666f26c0f5586167c46 (diff)
dictionary and word class interface for sqlite DB
-rw-r--r--conv.sh43
-rw-r--r--dict.dbbin0 -> 13402112 bytes
-rw-r--r--dict/dict.sql.zipbin0 -> 4913531 bytes
-rw-r--r--dict/index.php69
-rw-r--r--dictionary.py181
5 files changed, 219 insertions, 74 deletions
diff --git a/conv.sh b/conv.sh
new file mode 100644
index 0000000..8f8a9f4
--- /dev/null
+++ b/conv.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# conv.sh - convert a MySQL database into an SQLite database file.
+#
+# Usage: conv.sh <dbname>
+#
+# Dumps <dbname> with mysqldump, rewrites the MySQL-specific parts of the
+# dump into statements sqlite3 accepts, and loads the result into
+# <dbname>.db.  <dbname>.sql and <dbname>.err are removed on success and
+# kept for inspection when sqlite3 produced any output.
+#
+# Exits with status 1 when no database name is given.
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 <dbname>"
+    exit 1
+fi
+
+if [ -e "$1.db" ]; then
+    echo "$1.db already exists. I will overwrite it in 15 seconds if you do not press CTRL-C."
+    # Actually wait, so the user gets the grace period the message promises.
+    sleep 15
+    rm "$1.db"
+fi
+
+# Filter chain, in order:
+#   grep -v  drop the KEY / UNIQUE KEY / PRIMARY KEY lines of CREATE TABLE
+#   sed      map MySQL column types and attributes onto SQLite spellings
+#   perl #1  remove the comma left dangling before ")" and wrap the whole
+#            dump in one begin/commit transaction
+#   perl #2  split multi-row INSERTs into one statement per row and turn the
+#            MySQL escapes \' and \n into '' and a real newline
+mysqldump -h 10.250.100.69 -u assim -p --compact --compatible=ansi --default-character-set=binary "$1" |
+grep -v ' KEY "' |
+grep -v ' UNIQUE KEY "' |
+grep -v ' PRIMARY KEY ' |
+sed 's/ unsigned / /g' |
+sed 's/ auto_increment/ primary key autoincrement/gi' |
+sed 's/ smallint([0-9]*) / integer /gi' |
+sed 's/ tinyint([0-9]*) / integer /gi' |
+sed 's/ int([0-9]*) / integer /gi' |
+sed 's/ character set [^ ]* / /gi' |
+sed 's/ enum([^)]*) / varchar(255) /gi' |
+sed 's/ on update [^,]*//gi' |
+perl -e 'local $/;$_=<>;s/,\n\)/\n\)/gs;print "begin;\n";print;print "commit;\n"' |
+perl -pe '
+if (/^(INSERT.+?)\(/) {
+    $a=$1;
+    s/\\'\''/'\'\''/g;
+    s/\\n/\n/g;
+    s/\),\(/\);\n$a\(/g;
+}
+' > "$1.sql"
+
+# Capture stderr as well: current sqlite3 shells report errors there, so
+# redirecting stdout alone would make every conversion look clean.
+sqlite3 "$1.db" < "$1.sql" > "$1.err" 2>&1
+
+# Any output at all counts as a failure.  Testing for a non-empty file avoids
+# the non-POSIX "==" and the padded output some wc implementations print.
+if [ ! -s "$1.err" ]; then
+    echo "Conversion completed without error. Output file: $1.db"
+    rm "$1.sql"
+    rm "$1.err"
+else
+    echo "There were errors during conversion. Please review $1.err and $1.sql for details."
+fi
+
diff --git a/dict.db b/dict.db
new file mode 100644
index 0000000..8765420
--- /dev/null
+++ b/dict.db
Binary files differ
diff --git a/dict/dict.sql.zip b/dict/dict.sql.zip
new file mode 100644
index 0000000..5a8d304
--- /dev/null
+++ b/dict/dict.sql.zip
Binary files differ
diff --git a/dict/index.php b/dict/index.php
new file mode 100644
index 0000000..b9588f4
--- /dev/null
+++ b/dict/index.php
@@ -0,0 +1,69 @@
+<?PHP
+/*
+ * One-off data migration inside the `wordnet30` MySQL database.
+ *
+ * For every row of `word` whose lemma consists only of the letters a-z, the
+ * word is copied into `las_word`, its rows of `sense` into `las_sense`, and,
+ * for each of those senses, the matching rows of `synset` and `sample` into
+ * `las_synset` and `las_sample`.
+ *
+ * A failure of the initial SELECT aborts the script; a failure of any later
+ * query is printed and the copy carries on with the next row.
+ *
+ * NOTE(review): this uses the legacy mysql_* extension, as the original did.
+ */
+set_time_limit ( 0 ) ;
+// NOTE(review): database credentials are hard-coded in this file.
+mysql_connect('localhost', 'root', 'kandisa');
+mysql_select_db('wordnet30');
+
+$q1 = "SELECT *
+FROM `word`
+WHERE `lemma` REGEXP CONVERT( _utf8 '^[a-z]*[a-z]$'
+USING latin1 )
+COLLATE latin1_swedish_ci";
+$result = mysql_query($q1) or die('mysql_query1:'.mysql_error());
+
+// Fetch until the result set is exhausted instead of counting rows down.
+while ($row = mysql_fetch_assoc($result)) {
+
+    $q2 = "INSERT into `las_word` (`wordid`, `lemma`) VALUES ( '".$row['wordid']."', '".$row['lemma']."' )";
+    mysql_query($q2) or print('mysql_query:2'.mysql_error());
+
+    $q3 = "SELECT * FROM `sense` WHERE `wordid` = '".$row['wordid']."'";
+    $result3 = mysql_query($q3) or print('mysql_query:3'.mysql_error());
+    if (!$result3) {
+        // The error was printed above; there is no result set to walk.
+        continue;
+    }
+
+    while ($row3 = mysql_fetch_assoc($result3)) {
+
+        $q4 = "INSERT INTO `las_sense` (`wordid`, `synsetid`, `rank`) VALUES ('".$row3['wordid']."', '".$row3['synsetid']."', '".$row3['rank']."')";
+        mysql_query($q4) or print('mysql_query:4'.mysql_error());
+
+        $q5 = "SELECT * FROM `synset` WHERE `synsetid` = '".$row3['synsetid']."'";
+        $q6 = "SELECT * FROM `sample` WHERE `synsetid` = '".$row3['synsetid']."'";
+
+        $result5 = mysql_query($q5) or print('mysql_query:5'.mysql_error());
+        $result6 = mysql_query($q6) or print('mysql_query:6'.mysql_error());
+
+        // Definitions are free text (quotes, backslashes), so they are
+        // escaped with the connection-aware function rather than addslashes.
+        while ($result5 && ($row5 = mysql_fetch_assoc($result5))) {
+            $def = mysql_real_escape_string($row5['definition']);
+            $q7 = "INSERT INTO `las_synset` (`synsetid`, `pos`, `definition`) VALUES ('".$row5['synsetid']."', '".$row5['pos']."', '".$def."')";
+            mysql_query($q7) or print('mysql_query:7'.mysql_error().$q7);
+        }
+
+        // Sample sentences are free text as well.
+        while ($result6 && ($row6 = mysql_fetch_assoc($result6))) {
+            $sample = mysql_real_escape_string($row6['sample']);
+            $q8 = "INSERT INTO `las_sample` (`synsetid`, `sampleid`, `sample`) VALUES ('".$row6['synsetid']."', '".$row6['sampleid']."', '".$sample."')";
+            mysql_query($q8) or print('mysql_query:8'.mysql_error());
+        }
+
+    }
+
+    //echo "<br/>".$row['wordid'].":".$row['lemma'];
+}
+// The loop above only ends once every selected word has been processed.
+echo "Done :)";
+?>
\ No newline at end of file
diff --git a/dictionary.py b/dictionary.py
index b962714..a7ad273 100644
--- a/dictionary.py
+++ b/dictionary.py
@@ -2,91 +2,124 @@
import sys
# Provides the API to control the dictionary.
-global __root__
-global __element__
-global __word__
-global __def__
-global __phnm__
-global __pos__
-global __src__
-global __fld__
global __debug
-global __numword
-global __DBName
-global __DBTableName
+global DBname
+global word_list
-__root__ = "dictionary"
-__element__ = "p"
-__word__ = "hw"
-__def__ = "def"
-__phnm__ = "pr"
-__pos__ = "pos"
-__src__ = "source"
-__fld__ = "fld"
-__debug = True
-__numword = -1
-__DBName = "/tmp/las"
-__DBTableName = "dictionary" #Currently not used
+# Debug switch; not read by any of the code added in this change.
+__debug = True
+# Path of the SQLite database file that the word class connects to.
+DBname = "dict.db"
+# Module-level list, initially empty; not referenced by the code added here.
+word_list = []
#strings which are tag in XML
class dictionary:
- def __init__(self, filename):
- from xml.etree.ElementTree import ElementTree
- #create an ElementTree instance from an XML file
- self.ETree = ElementTree(file=filename)
- if self.ETree.getroot().tag != __root__:
- print "Invalid File"
- sys.exit(0)
-
- def getroottag(self):
- return self.ETree.getroot().tag
- def getnumwords(self):
- self.iter = self.ETree.getiterator(__element__)
- __numword = len(self.iter)
- return __numword
- def loadict(self):
- self.MakeDB()
- for element in self.iter:
- if element.getchildren():
- #Can also use: "for child in element.getchildren():"
- tempDict = {}
- tempDict["word"] = ""
- tempDict["def"] = ""
- tempDict["phnm"] = ""
- tempDict["src"] = ""
- tempDict["pos"] = ""
- for child in element:
- #Child element tag name
- if child.tag == __word__:
- tempDict["word"] = child.text
- elif child.tag == __def__:
- tempDict["def"] = child.text
- elif child.tag == __phnm__:
- tempDict["phnm"] = child.text
- elif child.tag == __src__:
- tempDict["src"] = child.text
- elif child.tag == __pos__:
- tempDict["pos"] = child.text
- t = ( tempDict["word"],tempDict["def"],tempDict["phnm"],tempDict["src"],tempDict["pos"] )
- self.c.execute( "insert into dict values (?, ? , ?, ?, ?)", t )
- self.conn.commit()
- self.c.close()
- self.conn.close()
+ def __init__(self, DBname):
+ """Open the SQLite database DBname and initialise the lookup caches."""
- def MakeDB(self):
import sqlite3
+ self.conn = sqlite3.connect(DBname, isolation_level=None)
+ # NOTE(review): isolation_level=None above opens the connection in
+ # autocommit mode, but the assignment below replaces that setting
+ # with "IMMEDIATE", so only the latter stays in effect.
+ self.conn.isolation_level = "IMMEDIATE"
+ self.cur = self.conn.cursor()
+ # -1 marks the word count as not queried yet (checked by getnumwords).
+ self.numwords = -1
+ # Cache of wordid rows, filled on demand by getrandomwordid.
+ self.wordid_list = []
+ # Compared by getrandomwordid to decide whether wordid_list is reloaded.
+ self.level = 0
+
+
+    def getnumwords(self, level = 0):
+        """Return the number of words in las_word as an integer.
+
+        level -- 0 counts every word; any other value counts only the words
+                 whose length column equals level.
+
+        The count is cached together with the level it was computed for, so
+        asking for a different level runs a fresh query instead of handing
+        back the previous answer.
+        """
+        if self.numwords == -1 or getattr(self, "_numwords_level", None) != level:
+            if level == 0:
+                self.cur.execute("SELECT COUNT(wordid) from las_word")
+            else:
+                self.cur.execute("SELECT COUNT(wordid) from las_word where length = ?", (level, ))
+            # fetchone() yields a one-column row; unpack it to get the number.
+            (self.numwords,) = self.cur.fetchone()
+            self._numwords_level = level
+        return self.numwords
+
+
+    def getrandomwordid(self, level=0):
+        """Return the wordid of a randomly chosen word.
+
+        level -- 0 chooses among all words; any other value chooses only
+                 among words whose length column equals level.
+
+        The wordid rows are cached in self.wordid_list and reloaded whenever
+        the cache is empty or was loaded for a different level.
+
+        Raises IndexError when no word matches.
+        """
+        if not self.wordid_list or self.level != level:
+            if level == 0:
+                self.cur.execute("SELECT wordid from las_word")
+            else:
+                self.cur.execute("SELECT wordid from las_word where length = ?", (level, ))
+            self.wordid_list = self.cur.fetchall()
+            # Record the level for every reload, including 0, so the cache is
+            # never mistaken for one loaded with a different filter.
+            self.level = level
import random
- self.conn = sqlite3.connect("las.db")
- self.c = self.conn.cursor()
- self.c.execute('''DROP TABLE IF EXISTS dict''')
- self.c.execute('''CREATE table IF NOT EXISTS dict (word text, def text, phnm text, src text, pos text)''')
+        # choice() never picks an index past the end, unlike randint(0, count).
+        (wordid,) = random.choice(self.wordid_list)
+        return wordid
+
+class word:
+    """One row of the las_word table plus lookups of its related data.
+
+    Each instance opens its own connection to the database named by the
+    module-level DBname.
+    """
+
+    def __init__(self, identifier, value):
+        """Load the word selected by identifier and value.
+
+        identifier -- names the column that value is matched against:
+                      "las_word_id" (laswid), "wordid" (wordid) or
+                      "word" (lemma).
+        value      -- the value to look up in that column.
+
+        Raises ValueError for an unknown identifier and LookupError when no
+        row matches.
+        """
+        import sqlite3
+        columns = {"las_word_id": "laswid", "wordid": "wordid", "word": "lemma"}
+        # __init__ cannot hand back a message (a non-None return is itself a
+        # TypeError), so an unknown identifier is rejected with an exception.
+        if identifier not in columns:
+            raise ValueError("Invalid Usage")
+
+        self.conn = sqlite3.connect(DBname, isolation_level=None)
+        # The assignment below overrides the autocommit setting passed above.
+        self.conn.isolation_level = "IMMEDIATE"
+        self.cur = self.conn.cursor()
+        # The column name comes from the fixed table above, never from the
+        # caller; the looked-up value itself is bound as a parameter.
+        self.cur.execute("SELECT * from las_word where %s = ?" % columns[identifier], (value,))
+        row = self.cur.fetchone()
+        if row is None:
+            raise LookupError("no las_word row with %s = %r" % (identifier, value))
+
+        (self.las_word_id, self.wordid, self.word, self.length) = row
+        # Initialised here so getdef/getusage can test it before
+        # getsynsetid has ever been called.
+        self.synsetid_list = []
+
+    def getword(self):
+        """Return the lemma of this word."""
+        return self.word
+
+    def getsynsetid(self):
+        """Query las_sense and return the list of synset ids of this word.
+
+        The list is also stored in self.synsetid_list.
+        """
+        self.cur.execute("SELECT * from las_sense where wordid = ?", (self.wordid,))
+        self.synsetid_list = [synsetid for (wordid, synsetid, rank) in self.cur]
+        return self.synsetid_list
+
+    def getdef(self):
+        """Return a list of (synsetid, pos, definition) tuples for this word.
+
+        The synset ids are loaded first when they have not been fetched yet.
+        The list is also stored in self.def_list.
+        """
+        if not self.synsetid_list:
+            self.getsynsetid()
+        self.def_list = []
+        for synsetid in self.synsetid_list:
+            self.cur.execute("SELECT * from las_synset where synsetid = ?", (synsetid,) )
+            for (row_synsetid, pos, definition) in self.cur.fetchall():
+                self.def_list.append( (row_synsetid, pos, definition))
+        return self.def_list
+
+    def getusage(self):
+        """Return a list of (synsetid, sampleid, sample) tuples for this word.
+
+        The synset ids are loaded first when they have not been fetched yet.
+        The list is also stored in self.usage_list.
+        """
+        if not self.synsetid_list:
+            self.getsynsetid()
+        self.usage_list = []
+        for synsetid in self.synsetid_list:
+            self.cur.execute("SELECT * from las_sample where synsetid = ?", (synsetid,))
+            for (row_synsetid, sampleid, sample) in self.cur.fetchall():
+                self.usage_list.append( (row_synsetid, sampleid, sample))
+        return self.usage_list
+
if __name__ == "__main__":
- k = dictionary("dict/k.xml")
- print k.getroottag()
- print k.getnumwords()
- k.loadict() \ No newline at end of file
+ # Manual smoke test against dict.db: print the total word count.
+ k = dictionary("dict.db")
+ num_words = k.getnumwords()
+ print num_words
+
+ # Pick a random word whose length column is 15 and load it by wordid.
+ wordid = k.getrandomwordid(15)
+ l = word("wordid", wordid )
+
+ # Print the lemma, then its definitions and usage samples.
+ print l.getword()
+ l.getsynsetid()
+ print l.getdef()
+ print l.getusage() \ No newline at end of file