Skip to content

Commit

Permalink
Merge pull request #410 from globalwordnet/2020-release-candidate
Browse files Browse the repository at this point in the history
2020 release candidate
  • Loading branch information
jmccrae committed Apr 16, 2020
2 parents e98f380 + e07e810 commit 66a6736
Show file tree
Hide file tree
Showing 53 changed files with 31,580 additions and 2,802 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
src/deprecations.csv merge=union
src/*.csv merge=union
121 changes: 3 additions & 118 deletions scripts/assign-sense-key.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,133 +3,18 @@
from glob import glob
import re
from sys import exit

lex_filenums = {
"src/wn-adj.all.xml": 0,
"src/wn-adj.pert.xml": 1,
"src/wn-adv.all.xml": 2,
"src/wn-noun.Tops.xml": 3,
"src/wn-noun.act.xml": 4,
"src/wn-noun.animal.xml": 5,
"src/wn-noun.artifact.xml": 6,
"src/wn-noun.attribute.xml": 7,
"src/wn-noun.body.xml": 8,
"src/wn-noun.cognition.xml": 9,
"src/wn-noun.communication.xml": 10,
"src/wn-noun.event.xml": 11,
"src/wn-noun.feeling.xml": 12,
"src/wn-noun.food.xml": 13,
"src/wn-noun.group.xml": 14,
"src/wn-noun.location.xml": 15,
"src/wn-noun.motive.xml": 16,
"src/wn-noun.object.xml": 17,
"src/wn-noun.person.xml": 18,
"src/wn-noun.phenomenon.xml": 19,
"src/wn-noun.plant.xml": 20,
"src/wn-noun.possession.xml": 21,
"src/wn-noun.process.xml": 22,
"src/wn-noun.quantity.xml": 23,
"src/wn-noun.relation.xml": 24,
"src/wn-noun.shape.xml": 25,
"src/wn-noun.state.xml": 26,
"src/wn-noun.substance.xml": 27,
"src/wn-noun.time.xml": 28,
"src/wn-verb.body.xml": 29,
"src/wn-verb.change.xml": 30,
"src/wn-verb.cognition.xml": 31,
"src/wn-verb.communication.xml": 32,
"src/wn-verb.competition.xml": 33,
"src/wn-verb.consumption.xml": 34,
"src/wn-verb.contact.xml": 35,
"src/wn-verb.creation.xml": 36,
"src/wn-verb.emotion.xml": 37,
"src/wn-verb.motion.xml": 38,
"src/wn-verb.perception.xml": 39,
"src/wn-verb.possession.xml": 40,
"src/wn-verb.social.xml": 41,
"src/wn-verb.stative.xml": 42,
"src/wn-verb.weather.xml": 43,
"src/wn-adj.ppl.xml": 44,
"src/wn-contrib.colloq.xml": 50,
"src/wn-contrib.plwn.xml": 51 }

ss_types = {
PartOfSpeech.NOUN: 1,
PartOfSpeech.VERB: 2,
PartOfSpeech.ADJECTIVE: 3,
PartOfSpeech.ADVERB: 4,
PartOfSpeech.ADJECTIVE_SATELLITE: 5
}

sense_id_lex_id = re.compile(".*%\d:\d\d:(\d\d):.*")
id_lemma = re.compile("ewn-(.*)-a-\d{8}-\d{2}")

def gen_lex_id(swn, e, s):
max_id = 0
unseen = 1
seen = False
for s2 in e.senses:
if s2.sense_key:
m = re.match(sense_id_lex_id, s2.sense_key)
max_id = max(max_id, int(m.group(1)))
else:
if not seen:
if s2.id == s.id:
seen = True
else:
unseen += 1
return max_id + unseen


def sense_for_entry_synset_id(wn, ss_id, lemma):
return [
s for e in wn.entry_by_lemma(lemma)
for s in wn.entry_by_id(e).senses
if s.synset == ss_id][0]

def get_head_word(wn, s):
ss = wn.synset_by_id(s.synset)
srs = [r for r in ss.synset_relations if r.rel_type == SynsetRelType.SIMILAR]
if len(srs) != 1:
print(srs)
print(s.id)
print("Could not deduce target of satellite")
else:
s2s = [sense_for_entry_synset_id(wn, srs[0].target, m) for m in wn.members_by_id(srs[0].target)]
s2s = sorted(s2s, key = lambda s2: s2.id[-2:])
s2 = s2s[0]

entry_id = re.match(id_lemma, s2.id).group(1)
if s2.sense_key:
return entry_id, re.match(sense_id_lex_id, s2.sense_key).group(1)
else:
print("No sense key for target of satellite! Marking as 99... please fix for " + s.id)
return entry_id, "99"
print("Failed to find target for satellite synset")
exit(-1)



import sense_keys

def assign_keys(wn, wn_file):
swn = parse_wordnet(wn_file)
for e in swn.entries:
for s in e.senses:
if not s.sense_key:
lemma = e.lemma.written_form.replace(" ", "_").replace("&apos","'").lower()
ss_type = ss_types[e.lemma.part_of_speech]
lex_filenum = lex_filenums[wn_file]
lex_id = gen_lex_id(swn, e, s)
if e.lemma.part_of_speech == PartOfSpeech.ADJECTIVE_SATELLITE:
head_word, head_id = get_head_word(wn, s)
else:
head_word = ""
head_id = ""
s.sense_key = "%s%%%d:%02d:%02d:%s:%s" % (lemma, ss_type, lex_filenum,
lex_id, head_word, head_id)
s.sense_key = sense_keys.get_sense_key(wn, swn, e, s, wn_file)
with open(wn_file, "w") as outp:
swn.to_xml(outp, True)


if __name__ == "__main__":
wn = change_manager.load_wordnet()
for f in glob ("src/wn-*.xml"):
Expand Down
26 changes: 21 additions & 5 deletions scripts/change-definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ def update_def(wn, synset, defn, add):
with open("src/wn-%s.xml" % synset.lex_name, "w") as out:
wn_synset.to_xml(out, True)

def update_ili_def(wn, synset, defn):
wn_synset = wordnet.parse_wordnet("src/wn-%s.xml" % synset.lex_name)
ss = wn_synset.synset_by_id(synset.id)
ss.ili_definition = wordnet.Definition(defn)
with open("src/wn-%s.xml" % synset.lex_name, "w") as out:
wn_synset.to_xml(out, True)


def main():
parser = argparse.ArgumentParser(description="Change a definition within the wordnet")
parser.add_argument('id', metavar='ID', type=str, nargs="?",
Expand All @@ -30,6 +38,8 @@ def main():
help="Add the new definition and retain the previous definition (otherwise this definition replaces previous definitions)")
parser.add_argument('--defn', type=str,
help="The new definition")
parser.add_argument('--ili', action='store_true',
help="Set the ILI definition")

args = parser.parse_args()

Expand All @@ -52,13 +62,19 @@ def main():
print("Could not find the synset %s" % args.id)
sys.exit(-1)

if not args.defn:
print("Definition : " + synset.definitions[0].text)
defn = input("New Definition : ")
if args.ili:
if not args.defn:
args.defn = synset.definitions[0].text

update_ili_def(wn, synset, args.defn)
else:
defn = args.defn
if not args.defn:
print("Definition : " + synset.definitions[0].text)
defn = input("New Definition : ")
else:
defn = args.defn

update_def(wn, synset, defn, args.add)
update_def(wn, synset, defn, args.add)

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion scripts/change_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def change_sense_idx(wn, sense_id, new_idx):
for f in glob("src/wn-*.xml"):
with fileinput.FileInput(f, inplace=True) as file:
for line in file:
print(line.replace(sense_id, new_sense_id), end='')
print(line.replace(sense_id, new_sense_id).rstrip())

def sense_ids_for_synset(wn, synset):
return [sense.id for lemma in wn.members_by_id(synset.id)
Expand Down
6 changes: 3 additions & 3 deletions scripts/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ def wn_merge():
<Lexicon id="ewn"
label="English WordNet"
language="en"
email="[email protected]"
license="https://wordnet.princeton.edu/license-and-commercial-use"
version="2019"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="2020"
url="https://github.com/globalwordnet/english-wordnet">""")
lex_entries = {}

Expand Down
132 changes: 132 additions & 0 deletions scripts/sense_keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from wordnet import *
import change_manager
from glob import glob
import re
from sys import exit

lex_filenums = {
"src/wn-adj.all.xml": 0,
"src/wn-adj.pert.xml": 1,
"src/wn-adv.all.xml": 2,
"src/wn-noun.Tops.xml": 3,
"src/wn-noun.act.xml": 4,
"src/wn-noun.animal.xml": 5,
"src/wn-noun.artifact.xml": 6,
"src/wn-noun.attribute.xml": 7,
"src/wn-noun.body.xml": 8,
"src/wn-noun.cognition.xml": 9,
"src/wn-noun.communication.xml": 10,
"src/wn-noun.event.xml": 11,
"src/wn-noun.feeling.xml": 12,
"src/wn-noun.food.xml": 13,
"src/wn-noun.group.xml": 14,
"src/wn-noun.location.xml": 15,
"src/wn-noun.motive.xml": 16,
"src/wn-noun.object.xml": 17,
"src/wn-noun.person.xml": 18,
"src/wn-noun.phenomenon.xml": 19,
"src/wn-noun.plant.xml": 20,
"src/wn-noun.possession.xml": 21,
"src/wn-noun.process.xml": 22,
"src/wn-noun.quantity.xml": 23,
"src/wn-noun.relation.xml": 24,
"src/wn-noun.shape.xml": 25,
"src/wn-noun.state.xml": 26,
"src/wn-noun.substance.xml": 27,
"src/wn-noun.time.xml": 28,
"src/wn-verb.body.xml": 29,
"src/wn-verb.change.xml": 30,
"src/wn-verb.cognition.xml": 31,
"src/wn-verb.communication.xml": 32,
"src/wn-verb.competition.xml": 33,
"src/wn-verb.consumption.xml": 34,
"src/wn-verb.contact.xml": 35,
"src/wn-verb.creation.xml": 36,
"src/wn-verb.emotion.xml": 37,
"src/wn-verb.motion.xml": 38,
"src/wn-verb.perception.xml": 39,
"src/wn-verb.possession.xml": 40,
"src/wn-verb.social.xml": 41,
"src/wn-verb.stative.xml": 42,
"src/wn-verb.weather.xml": 43,
"src/wn-adj.ppl.xml": 44,
"src/wn-contrib.colloq.xml": 50,
"src/wn-contrib.plwn.xml": 51 }

ss_types = {
PartOfSpeech.NOUN: 1,
PartOfSpeech.VERB: 2,
PartOfSpeech.ADJECTIVE: 3,
PartOfSpeech.ADVERB: 4,
PartOfSpeech.ADJECTIVE_SATELLITE: 5
}

sense_id_lex_id = re.compile(".*%\d:\d\d:(\d\d):.*")
id_lemma = re.compile("ewn-(.*?)(-(a|ip|p))?-[as]-\d{8}-\d{2}")

def gen_lex_id(swn, e, s):
max_id = 0
unseen = 1
seen = False
for s2 in e.senses:
if s2.sense_key:
m = re.match(sense_id_lex_id, s2.sense_key)
max_id = max(max_id, int(m.group(1)))
else:
if not seen:
if s2.id == s.id:
seen = True
else:
unseen += 1
return max_id + unseen

def extract_lex_id(sense_key):
return int(re.match(sense_id_lex_id, sense_key).group(1))


def sense_for_entry_synset_id(wn, ss_id, lemma):
return [
s for e in wn.entry_by_lemma(lemma)
for s in wn.entry_by_id(e).senses
if s.synset == ss_id][0]

def get_head_word(wn, s):
ss = wn.synset_by_id(s.synset)
# The hack here is we don't care about satellites in non-Princeton sets
srs = [r for r in ss.synset_relations if r.rel_type == SynsetRelType.SIMILAR and not r.target.startswith("ewn-9") and not r.target.startswith("ewn-8")]
if len(srs) != 1:
print([r.target for r in srs])
print(s.id)
print("Could not deduce target of satellite")
else:
s2s = [sense_for_entry_synset_id(wn, srs[0].target, m) for m in wn.members_by_id(srs[0].target)]
s2s = sorted(s2s, key = lambda s2: s2.id[-2:])
s2 = s2s[0]

if not re.match(id_lemma, s2.id):
print(s2.id)
entry_id = re.match(id_lemma, s2.id).group(1)
if s2.sense_key:
return entry_id, re.match(sense_id_lex_id, s2.sense_key).group(1)
else:
print("No sense key for target of satellite! Marking as 99... please fix for " + s.id)
return entry_id, "99"
print("Failed to find target for satellite synset")
exit(-1)

def get_sense_key(wn, swn, e, s, wn_file):
"""Calculate the sense key for a sense of an entry"""
lemma = e.lemma.written_form.replace(" ", "_").replace("&apos","'").lower()
ss_type = ss_types[e.lemma.part_of_speech]
lex_filenum = lex_filenums[wn_file]
if s.sense_key:
lex_id = extract_lex_id(s.sense_key)
else:
lex_id = gen_lex_id(swn, e, s)
if e.lemma.part_of_speech == PartOfSpeech.ADJECTIVE_SATELLITE:
head_word, head_id = get_head_word(wn, s)
else:
head_word = ""
head_id = ""
return "%s%%%d:%02d:%02d:%s:%s" % (lemma, ss_type, lex_filenum,
lex_id, head_word, head_id)
Loading

0 comments on commit 66a6736

Please sign in to comment.