Skip to content

Commit

Permalink
implemented the multilingual prefLabels
Browse files Browse the repository at this point in the history
  • Loading branch information
mdorf committed Oct 15, 2024
1 parent 812dd78 commit e348c4b
Show file tree
Hide file tree
Showing 7 changed files with 26,195 additions and 266 deletions.
26 changes: 11 additions & 15 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,10 @@ GEM
launchy (>= 2.1, < 4.0)
mail (~> 2.7)
eventmachine (1.2.7)
faraday (2.12.0)
faraday-net_http (>= 2.0, < 3.4)
json
logger
faraday-net_http (3.3.0)
net-http
ffi (1.17.0-aarch64-linux-gnu)
ffi (1.17.0-arm64-darwin)
ffi (1.17.0-x86_64-linux-gnu)
faraday (1.2.0)
multipart-post (>= 1.2, < 3)
ruby2_keywords
ffi (1.17.0)
hashie (5.0.0)
htmlentities (4.3.4)
http-accept (1.7.0)
Expand All @@ -87,9 +82,10 @@ GEM
net-pop
net-smtp
method_source (1.1.0)
mime-types (3.5.2)
mime-types (3.6.0)
logger
mime-types-data (~> 3.2015)
mime-types-data (3.2024.0903)
mime-types-data (3.2024.1001)
mini_mime (1.1.5)
minitest (4.7.5)
minitest-reporters (0.14.24)
Expand All @@ -98,8 +94,7 @@ GEM
minitest (>= 2.12, < 5.0)
powerbar
multi_json (1.15.0)
net-http (0.4.1)
uri
multipart-post (2.4.1)
net-http-persistent (2.9.4)
net-imap (0.4.16)
date
Expand Down Expand Up @@ -128,7 +123,7 @@ GEM
pry (0.14.2)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (6.0.1)
public_suffix (5.1.1)
racc (1.8.1)
rack (2.2.9)
rack-test (0.8.3)
Expand Down Expand Up @@ -166,6 +161,7 @@ GEM
rubocop-ast (1.32.3)
parser (>= 3.3.1.0)
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.5)
rubyzip (1.3.0)
simplecov (0.22.0)
docile (~> 1.1)
Expand All @@ -185,13 +181,13 @@ GEM
timeout (0.4.1)
tzinfo (0.3.62)
unicode-display_width (2.6.0)
uri (0.13.1)
uuid (2.3.9)
macaddr (~> 1.0)

PLATFORMS
aarch64-linux
arm64-darwin-22
arm64-darwin-23
x86_64-linux

DEPENDENCIES
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def process(logger, options = {})

def handle_missing_labels(file_path, logger)
callbacks = {
include_languages: true,
missing_labels: {
op_name: 'Missing Labels Generation',
required: true,
Expand Down Expand Up @@ -60,6 +61,10 @@ def loop_classes(logger, raw_paging, submission, callbacks)
size = 2500
count_classes = 0
acr = submission.id.to_s.split("/")[-1]

# include all languages in attributes of classes if asked for
incl_lang = callbacks.delete(:include_languages)
RequestStore.store[:requested_lang] = :ALL if incl_lang
operations = callbacks.values.map { |v| v[:op_name] }.join(", ")

time = Benchmark.realtime do
Expand Down Expand Up @@ -161,6 +166,7 @@ def loop_classes(logger, raw_paging, submission, callbacks)
@submission.save
end
end
RequestStore.store[:requested_lang] = nil if incl_lang
end

def generate_missing_labels_pre(artifacts = {}, logger, paging)
Expand All @@ -185,26 +191,35 @@ def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, p
prefLabel = nil

if c.prefLabel.nil?
rdfs_labels = c.label
lang_rdfs_labels = c.label(include_languages: true)
lang_rdfs_labels = {none: []} if lang_rdfs_labels.empty?

if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first
lang_rdfs_labels&.each do |lang, rdfs_labels|
if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first

rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
end
rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
end

rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array)
label = nil
rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array)
label = nil

if rdfs_labels && rdfs_labels.length > 0
# this sort is needed for a predictable label selection
label = rdfs_labels.sort[0]
else
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
if rdfs_labels && rdfs_labels.length > 0
# this sort is needed for a predictable label selection
label = rdfs_labels.sort[0]
else
label = LinkedData::Utils::Triples.last_iri_fragment c.id.to_s
end

if lang === :none
lang = nil
prefLabel = label
end
prefLabel = label if !prefLabel && lang === Goo.portal_language
prefLabel = label unless prefLabel
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label, lang)
end
artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
c.id, Goo.vocabulary(:metadata_def)[:prefLabel], label)
prefLabel = label
else
prefLabel = c.prefLabel
end
Expand Down Expand Up @@ -381,238 +396,6 @@ def delete_and_append(triples_file_path, logger, mime_type = nil)
logger.flush
end

# Runs one lifecycle hook on every registered callback and drops the callbacks
# whose hook failed.
#
# For each callback hash that has a value under +action_name+, that value is
# used as a method name looked up on @submission; the resulting Method object
# is yielded together with the callback hash, and the caller's block decides
# how to invoke it.
#
# When the hook raises, the error is logged, the error variant of
# callback[:status] (if any) is recorded on the submission, and then:
# * the exception is re-raised if the callback is marked :required, or if it
#   is a process-control exception (SignalException, SystemExit,
#   NoMemoryError) that must never be silently swallowed;
# * otherwise the callback is removed from +callbacks+ so later hooks skip it.
#
# @param logger [#error, #flush] destination for failure reports
# @param callbacks [Hash] name => callback hash; mutated in place
# @param action_name [Symbol] key of the hook to run (e.g. :caller_on_pre)
# @yield [callable, callback] the bound Method and its callback hash
# @return [Hash] +callbacks+, minus the entries whose hook failed
def process_callbacks(logger, callbacks, action_name, &block)
  callbacks.delete_if do |_, callback|
    begin
      if callback[action_name]
        callable = @submission.method(callback[action_name])
        yield(callable, callback)
      end
      # hook succeeded (or is not defined for this callback): keep it
      false
    rescue Exception => e
      logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}")
      logger.flush

      if callback[:status]
        add_submission_status(callback[:status].get_error_status)
        @submission.save
      end

      # Interrupts, exit requests and out-of-memory conditions are not
      # callback failures; swallowing them would make the job unstoppable.
      process_control = [SignalException, SystemExit, NoMemoryError].any? { |klass| e.is_a?(klass) }

      # halt the entire processing if :required is set to true
      raise e if callback[:required] || process_control
      # continue processing of other callbacks, but not this one
      true
    end
  end
end

# Iterates over +raw_paging+ page by page and drives the lifecycle hooks of
# every entry in +callbacks+: :caller_on_pre once before the first page,
# :caller_on_pre_page / :caller_on_each / :caller_on_post_page for each page,
# and :caller_on_post once after the last page. Afterwards, callback[:status]
# is added to the submission for every callback still present in +callbacks+.
#
# logger     - receives progress and error messages (info/error/flush)
# raw_paging - paged query object; must respond to page(page, size)
# callbacks  - Hash of name => callback Hash. Keys read here: :op_name,
#              :artifacts (defaulted to {}), :caller_on_each, :status
#
# Raises RuntimeError when a page that was expected to contain records is
# still empty after the configured number of retries.
def loop_classes(logger, raw_paging, callbacks)
page = 1
# number of records requested per page
size = 2500
count_classes = 0
# last "/"-separated segment of the submission id; used in log messages only
acr = @submission.id.to_s.split("/")[-1]
# human-readable list of the operations being run, for log messages
operations = callbacks.values.map { |v| v[:op_name] }.join(", ")

# time receives the elapsed wall-clock seconds of the whole block
time = Benchmark.realtime do
paging = raw_paging.page(page, size)
cls_count_set = false
cls_count = class_count(logger)

# a value greater than -1 is treated as an already-known class count
if cls_count > -1
# prevent a COUNT SPARQL query if possible
paging.page_count_set(cls_count)
cls_count_set = true
else
# count unknown: it is accumulated page by page further below
cls_count = 0
end

iterate_classes = false
# 1. init artifacts hash if not explicitly passed in the callback
# 2. determine if class-level iteration is required
callbacks.each { |_, callback| callback[:artifacts] ||= {}; iterate_classes = true if callback[:caller_on_each] }

process_callbacks(logger, callbacks, :caller_on_pre) {
|callable, callback| callable.call(callback[:artifacts], logger, paging) }

# -1 means "no page retrieved yet"
page_len = -1
prev_page_len = -1

# do-while: the body runs at least once and repeats until page becomes nil
begin
t0 = Time.now
page_classes = paging.page(page, size).all
total_pages = page_classes.total_pages
page_len = page_classes.length

# nothing retrieved even though we're expecting more records
if total_pages > 0 && page_classes.empty? && (prev_page_len == -1 || prev_page_len == size)
j = 0
num_calls = LinkedData.settings.num_retries_4store

# re-query the same page, pausing 2 seconds between attempts
while page_classes.empty? && j < num_calls do
j += 1
logger.error("Empty page encountered. Retrying #{j} times...")
sleep(2)
page_classes = paging.page(page, size).all
logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty?
end

# retries exhausted: abort the whole run
if page_classes.empty?
msg = "Empty page #{page} of #{total_pages} persisted after retrying #{j} times. #{operations} of #{acr} aborted..."
logger.error(msg)
raise msg
end
end

# an empty page that did not qualify for the retry path above ends the iteration
if page_classes.empty?
if total_pages > 0
logger.info("The number of pages reported for #{acr} - #{total_pages} is higher than expected #{page - 1}. Completing #{operations}...")
else
logger.info("Ontology #{acr} contains #{total_pages} pages...")
end
break
end

prev_page_len = page_len
logger.info("#{acr}: page #{page} of #{total_pages} - #{page_len} ontology terms retrieved in #{Time.now - t0} sec.")
logger.flush
count_classes += page_classes.length

process_callbacks(logger, callbacks, :caller_on_pre_page) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) }

# per-class hooks run only if at least one callback defined :caller_on_each
page_classes.each { |c|
process_callbacks(logger, callbacks, :caller_on_each) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page, c) }
} if iterate_classes

process_callbacks(logger, callbacks, :caller_on_post_page) {
|callable, callback| callable.call(callback[:artifacts], logger, paging, page_classes, page) }
# accumulate the class count only when it was not known up front
cls_count += page_classes.length unless cls_count_set

# nil terminates the do-while loop below
page = page_classes.next? ? page + 1 : nil
end while !page.nil?

# expose the class count to the :caller_on_post hooks
callbacks.each { |_, callback| callback[:artifacts][:count_classes] = cls_count }
process_callbacks(logger, callbacks, :caller_on_post) {
|callable, callback| callable.call(callback[:artifacts], logger, paging) }
end

logger.info("Completed #{operations}: #{acr} in #{time} sec. #{count_classes} classes.")
logger.flush

# set the status on actions that have completed successfully
# (process_callbacks removes a callback from the hash when its hook fails)
callbacks.each do |_, callback|
if callback[:status]
add_submission_status(callback[:status])
@submission.save
end
end
end

# Prepares the label-generation run: derives the output paths next to the
# source file, appends the custom-property triples to the submission graph and
# opens the two output files.
#
# Reads artifacts[:file_path]; writes artifacts[:save_in_file],
# artifacts[:save_in_file_mappings], artifacts[:fsave] and
# artifacts[:fsave_mappings]. Both file handles are left open for later steps.
#
# @param artifacts [Hash] shared state for the missing-labels callbacks
# @param logger unused here
# @param paging unused here
# @return [File] the handle opened for the mappings file
def generate_missing_labels_pre(artifacts = {}, logger, paging)
  output_dir = File.dirname(artifacts[:file_path])
  labels_path = File.join(output_dir, "labels.ttl")
  mappings_path = File.join(output_dir, "mappings.ttl")
  artifacts[:save_in_file] = labels_path
  artifacts[:save_in_file_mappings] = mappings_path

  # push the custom property definitions to the store before opening any file
  custom_properties = LinkedData::Utils::Triples.rdf_for_custom_properties(@submission)
  Goo.sparql_data_client.append_triples(@submission.id, custom_properties, "application/x-turtle")

  labels_io = File.open(labels_path, "w")
  labels_io.write(custom_properties)
  mappings_io = File.open(mappings_path, "w")

  artifacts[:fsave] = labels_io
  artifacts[:fsave_mappings] = mappings_io
end

# Resets the per-page accumulators before a page of classes is processed.
#
# @param artifacts [Hash] shared state; :label_triples and :mapping_triples
#   are each replaced with a fresh empty Array
# @return [Array] the new (empty) :mapping_triples array
def generate_missing_labels_pre_page(artifacts = {}, logger, paging, page_classes, page)
  artifacts.store(:label_triples, [])
  artifacts.store(:mapping_triples, [])
end

# Processes a single class: generates a prefLabel triple when the class has
# none, then queues its mapping triples.
#
# Label selection when c.prefLabel is nil:
# 1. start from c.label;
# 2. if there are several labels and the class has synonyms, prefer a label
#    that is not also a synonym (falling back to all labels if none is left);
# 3. pick the smallest label in sort order, so the choice does not depend on
#    the order in which the labels were returned;
# 4. with no label at all, use the last fragment of the class IRI.
#
# Mapping triples are queued only when the submission's ontology is not a
# view (viewOf is nil): a loom mapping when the transformed label is longer
# than 2 characters, and always a same-URI mapping.
#
# @param artifacts [Hash] shared state; appends to :label_triples and
#   :mapping_triples
# @param c the class being processed; must respond to id, prefLabel, label
#   and synonym
def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, page, c)
  prefLabel = c.prefLabel

  if prefLabel.nil?
    rdfs_labels = c.label

    if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0
      # a single non-synonym label (or nil when every label is a synonym)
      rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first
      rdfs_labels = c.label if rdfs_labels.nil? || rdfs_labels.length == 0
    end

    # normalise a scalar label to a one-element list
    rdfs_labels = [rdfs_labels] if rdfs_labels && !rdfs_labels.instance_of?(Array)

    prefLabel =
      if rdfs_labels && rdfs_labels.length > 0
        # taking the minimum makes the label selection predictable
        rdfs_labels.min
      else
        LinkedData::Utils::Triples.last_iri_fragment(c.id.to_s)
      end

    artifacts[:label_triples] << LinkedData::Utils::Triples.label_for_class_triple(
      c.id, Goo.vocabulary(:metadata_def)[:prefLabel], prefLabel)
  end

  if @submission.ontology.viewOf.nil?
    loomLabel = OntologySubmission.loom_transform_literal(prefLabel.to_s)

    if loomLabel.length > 2
      artifacts[:mapping_triples] << LinkedData::Utils::Triples.loom_mapping_triple(
        c.id, Goo.vocabulary(:metadata_def)[:mappingLoom], loomLabel)
    end
    artifacts[:mapping_triples] << LinkedData::Utils::Triples.uri_mapping_triple(
      c.id, Goo.vocabulary(:metadata_def)[:mappingSameURI], c.id)
  end
end

# Flushes the triples collected for one page: each non-empty batch is written
# to its output file and appended to the submission graph.
#
# REST mappings migrated for the ontology acronym are added to
# artifacts[:mapping_triples] first. For each non-empty batch, the Array under
# :label_triples / :mapping_triples is replaced by its newline-joined String
# form before being written and appended.
#
# @param artifacts [Hash] shared state; reads :fsave and :fsave_mappings,
#   rewrites :label_triples and :mapping_triples
# @param logger [#info, #flush] progress log
# @param page [Integer] page number, used in the "no labels" message
def generate_missing_labels_post_page(artifacts = {}, logger, paging, page_classes, page)
  migrated = LinkedData::Mappings.migrate_rest_mappings(@submission.ontology.acronym)
  artifacts[:mapping_triples].concat(migrated)

  if artifacts[:label_triples].empty?
    logger.info("No labels generated in page #{page}.")
    logger.flush
  else
    logger.info("Asserting #{artifacts[:label_triples].length} labels in #{@submission.id.to_ntriples}")
    logger.flush
    artifacts[:label_triples] = artifacts[:label_triples].join("\n")
    artifacts[:fsave].write(artifacts[:label_triples])
    started_at = Time.now
    Goo.sparql_data_client.append_triples(@submission.id, artifacts[:label_triples], "application/x-turtle")
    finished_at = Time.now
    logger.info("Labels asserted in #{finished_at - started_at} sec.")
    logger.flush
  end

  # nothing to persist for mappings on this page
  return if artifacts[:mapping_triples].empty?

  logger.info("Asserting #{artifacts[:mapping_triples].length} mappings in #{@submission.id.to_ntriples}")
  logger.flush
  artifacts[:mapping_triples] = artifacts[:mapping_triples].join("\n")
  artifacts[:fsave_mappings].write(artifacts[:mapping_triples])

  started_at = Time.now
  Goo.sparql_data_client.append_triples(@submission.id, artifacts[:mapping_triples], "application/x-turtle")
  finished_at = Time.now
  logger.info("Mapping labels asserted in #{finished_at - started_at} sec.")
  logger.flush
end

# Final step of the missing-labels run: reports the totals and closes the two
# output files referenced by artifacts[:fsave] and artifacts[:fsave_mappings].
#
# @param artifacts [Hash] shared state; reads :count_classes, :save_in_file,
#   :fsave and :fsave_mappings
# @param logger [#info, #flush] progress log
# @return whatever logger.flush returns
def generate_missing_labels_post(artifacts = {}, logger, paging)
  [
    "end generate_missing_labels traversed #{artifacts[:count_classes]} classes",
    "Saved generated labels in #{artifacts[:save_in_file]}"
  ].each { |message| logger.info(message) }

  artifacts.values_at(:fsave, :fsave_mappings).each(&:close)
  logger.flush
end

def generate_obsolete_classes(logger, file_path)
@submission.bring(:obsoleteProperty) if @submission.bring?(:obsoleteProperty)
@submission.bring(:obsoleteParent) if @submission.bring?(:obsoleteParent)
Expand Down
Loading

0 comments on commit e348c4b

Please sign in to comment.