Commit

Data from OMW 1.2
fcbond committed Nov 6, 2020
1 parent 9445a83 commit 291d2c2
Showing 1,352 changed files with 3,733,603 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -1,2 +1,8 @@
# omw-data

This packages up data for the Open Multilingual Wordnet (OMW).


The directory *wns* has the wordnet data from OMW 1.2, with some small fixes:
* added a citation for the Icelandic wordnet
* added human readable citations in ``omw-citations.tab``
37 changes: 37 additions & 0 deletions wns/README
@@ -0,0 +1,37 @@
This directory contains redistributable wordnets in further sub-directories.

The structure is:

lang/wn-data-lang.tab      synset-lemma pairs (see below)
lang/LICENSE               original license file (or equivalent)
lang/README                any notes about the conversion
lang/lang2tab.py           Python script used to extract the data
                           (may rely on WordNet version mappings)
lang/wn-data-lang.tab.log  any notes from the conversion
lang/citation.bib          the canonical citation reference(s)

Note that a single directory may have wordnets for multiple languages.

The wn-data files are formatted as follows:
# name<tab>lang<tab>url<tab>license
offset-pos<tab>type<tab>lemma
offset-pos<tab>type<tab>lemma
...

name is the name of the project
lang is the ISO three-letter code for the language
url is the url of the project
license is a short name for the license
offset is the 8-digit Princeton WordNet 3.0 synset offset
pos is one of [a,s,v,n,r]
lemma is the lemma (word separator normalized to ' ')
type is the language:relationship (e.g. eng:lemma)

Example:
# Thai tha http://th.asianwordnet.org/ wordnet
13567960-n tha:lemma กระบวนการทรานแอมมิแนชัน
00155298-n tha:lemma การปฏิเสธ
14369530-n tha:lemma ภาวะการหายใจเร็วของทารกแรกเกิด
10850469-n tha:lemma เบธัน
11268326-n tha:lemma เรินต์เกน
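
For illustration, a minimal Python sketch (not part of the distribution;
read_wn_data and its column handling are assumptions based on the format
above) that collects the lemma rows of such a file, keyed by offset-pos:

    import codecs
    from collections import defaultdict

    def read_wn_data(path):
        """Read the lemma rows of a wn-data tab file."""
        synsets = defaultdict(list)
        with codecs.open(path, "r", "utf-8") as f:
            f.readline()  # skip the "# name<tab>lang<tab>url<tab>license" header
            for line in f:
                fields = line.rstrip("\n").split("\t")
                if len(fields) >= 3 and fields[1].endswith(":lemma"):
                    synsets[fields[0]].append(fields[2])
        return synsets

    lemmas = read_wn_data("tha/wn-data-tha.tab")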

5 changes: 5 additions & 0 deletions wns/als/LICENSE
@@ -0,0 +1,5 @@
The Albanian wordnet is dual-licensed under either:

* GPL 3
* CC BY 3.0

11 changes: 11 additions & 0 deletions wns/als/README
@@ -0,0 +1,11 @@
I cleaned up the source XML a little.

------------------------------------------------------------------------
This conversion uses the following mappings:

WN-Map: Mappings between WordNet versions
Copyright (c) 2003 TALP Research Center
Universitat Politècnica de Catalunya.
Barcelona, Spain
http://www.lsi.upc.es/~nlp
--------------------------------------------------------------------------
97 changes: 97 additions & 0 deletions wns/als/als2tab.py
@@ -0,0 +1,97 @@
#!/usr/bin/python
# -*- encoding: utf-8 -*-
#
# Extract synset-word pairs, definitions and examples from the Albanian Wordnet
#
#

import sys
import codecs
import re, collections


wndata="/home/bond/work/wns/albanian/"
wnname = "Albanet"
wnlang = "als"
wnurl = "http://fjalnet.com/"
wnlicense = "CC BY 3.0"

#
# header
#
outfile = "wn-data-%s.tab" % wnlang
o = codecs.open(outfile, "w", "utf-8" )
log = codecs.open("log", "w", "utf-8" )

o.write("# %s\t%s\t%s\t%s \n" % (wnname, wnlang, wnurl, wnlicense))

###
### mappings
###
mapdir = "/home/bond/work/wn/mapps/mapping-20-30/"
maps = ["wn20-30.adj", "wn20-30.adv", "wn20-30.noun", "wn20-30.verb"]
pos = {"wn20-30.adj": "a", "wn20-30.adv": "r",
       "wn20-30.noun": "n", "wn20-30.verb": "v"}
map2030 = collections.defaultdict(lambda: 'unknown')
for m in maps:
    mf = codecs.open(mapdir + m, "r", "utf-8")
    p = pos[m]
    for l in mf:
        lst = l.strip().split()
        fsfrom = lst[0] + "-" + p
        fsto = sorted([(lst[i+1], lst[i]) for i in range(1, len(lst), 2)])[-1][1]
        ##print "%s-%s\t%s-%s" % (fsfrom, p, fsto, p)
        map2030[fsfrom] = "%s-%s" % (fsto, p)
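
# At this point map2030 maps each WordNet 2.0 offset-pos to its
# highest-scoring WordNet 3.0 offset-pos, falling back to 'unknown'.
# Each mapping line is assumed to hold a source offset followed by
# (target offset, score) pairs, e.g. (hypothetical values):
#   00001740 00001740 1.0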


#
# Data is in the file shqip.xml
#
# But the xml parser complains :-) so back to regexp
#


f = codecs.open(wndata + "shqip.xml", "r", "utf-8" )

synset = unicode()
synset20 = unicode()
lemma = unicode()
#of20 = unicode()
defid = 0
exid = 0

for l in f:
    ##print l, "EOS"
    ### synset
    m = re.search(r"<ID>(.*)</ID>", l.strip())
    if m:
        synset20 = m.group(1).strip()[6:]
        synset = map2030[synset20]
        defid = 0
        exid = 0
    ### lemma
    m = re.search(r"<LITERAL>(.+)<SENSE>(.*)</SENSE>", l.strip())
    if m:
        lemma = m.group(1).strip()
        sense = m.group(2).strip()
        if lemma == '' or synset == 'unknown':
            log.write("Problem with synset/lemma: {} ({}) {}\n".format(synset20, synset, sense))
        else:
            o.write("%s\t%s:%s\t%s\n" % (synset, wnlang, 'lemma', lemma))
    ### Definition
    m = re.search(r"<DEF>(.+)</DEF>", l.strip())
    if m:
        df = m.group(1).strip()
        if df == '' or synset == 'unknown':
            log.write("Problem with synset/def: {} ({})\n".format(synset20, synset))
        else:
            o.write("%s\t%s:%s\t%d\t%s\n" % (synset, wnlang, 'def', defid, df))
            defid += 1
    ### Example
    m = re.search(r"<USAGE>(.+)</USAGE>", l.strip())
    if m:
        ex = m.group(1).strip()
        if ex == '' or synset == 'unknown':
            log.write("Problem with synset/exe: {} ({})\n".format(synset20, synset))
        else:
            o.write("%s\t%s:%s\t%d\t%s\n" % (synset, wnlang, 'exe', exid, ex))
            exid += 1
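
# For reference, the rows written above are tab-separated and take three
# shapes (offsets and text here are placeholders, not real data):
#   12345678-n  als:lemma  <lemma>
#   12345678-n  als:def  0  <definition>
#   12345678-n  als:exe  0  <example sentence>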
12 changes: 12 additions & 0 deletions wns/als/citation.bib
@@ -0,0 +1,12 @@
Citation for alsWN

@TechReport{Ruci:2008,
  author      = "Ervin Ruci",
  title       = "On the current state of {Albanet} and related applications",
  institution = "University of Vlora",
  year        = 2008,
  note        = "(\url{http://fjalnet.com/technicalreportalbanet.pdf})"
}

7 changes: 7 additions & 0 deletions wns/als/log
@@ -0,0 +1,7 @@
Problem with synset/lemma: 01505508-a (unknown) 1
Problem with synset/lemma: 01505508-a (unknown) 1
Problem with synset/def: 01505508-a (unknown)
Problem with synset/exe: 01505508-a (unknown)
Problem with synset/lemma: 02002046-a (unknown) 1
Problem with synset/def: 02002046-a (unknown)
Problem with synset/exe: 02002046-a (unknown)