Skip to content

Commit

Permalink
Create lookup file maps from Strapi, not Allegro data (RPB-154)
Browse files: browse the repository at this point in the history
  • Loading branch information
fsteeg committed Apr 12, 2024
1 parent 392021c commit 1998c95
Show file tree
Hide file tree
Showing 9 changed files with 14,681 additions and 14,676 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ RPB-Export_HBZ_SWN.txt
RPB-Export_HBZ_Syst.txt
RPB-Export_HBZ_ZSS.txt
conf/RPBEXP/*.ZIP
conf/strapi-export.tar.gz
nohup.out*
27,080 changes: 13,541 additions & 13,539 deletions conf/maps/gndId-to-rppdId.tsv

Large diffs are not rendered by default.

2,248 changes: 1,124 additions & 1,124 deletions conf/maps/rppdId-with-label.tsv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions conf/rppd-rppdId-with-label-map.fix
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ unless any_contain("gndIdentifier", "Keine")
reject()
end

copy_field("rppdId", "rppdUri")
prepend("rppdUri", "https://rppd.lobid.org/")
copy_field("rppdId", "uri")
prepend("uri", "https://rppd.lobid.org/")
replace_all("preferredName", "\\/\\s(ca\\.|um)?-?\\s?\\d.+$", "")
trim("preferredName")

retain("rppdId", "rppdUri", "preferredName")
retain("rppdId", "preferredName", "uri")
4 changes: 2 additions & 2 deletions conf/rppd-rppdId-with-label-map.flux
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FLUX_DIR + "output/output-rppd-strapi.ndjson"
FLUX_DIR + "output/rppd-export.jsonl"
| open-file
| as-lines
| decode-json
| decode-json(recordPath="data")
| fix(FLUX_DIR + "rppd-rppdId-with-label-map.fix")
| encode-csv(includeheader="true", noquotes="true",separator="\t")
| write(FLUX_DIR + "maps/rppdId-with-label.tsv")
Expand Down
6 changes: 4 additions & 2 deletions conf/rppd-to-gnd-map.fix
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ if any_contain("gndIdentifier", "Keine")
reject()
end

prepend("gndIdentifier", "https://d-nb.info/gnd/")
copy_field("gndIdentifier", "uri")
prepend("uri", "https://d-nb.info/gnd/")
replace_all("preferredName", "\\/\\s(ca\\.|um)?-?\\s?\\d.+$", "")
trim("preferredName")
retain("rppdId", "gndIdentifier", "preferredName")

retain("rppdId", "preferredName", "uri")
4 changes: 2 additions & 2 deletions conf/rppd-to-gnd-mapping.flux
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FLUX_DIR + "output/output-rppd-strapi.ndjson"
FLUX_DIR + "output/rppd-export.jsonl"
| open-file
| as-lines
| decode-json
| decode-json(recordPath="data")
| fix(FLUX_DIR + "rppd-to-gnd-map.fix")
| encode-csv(includeheader="true", noquotes="true",separator="\t")
| write(FLUX_DIR + "maps/gndId-to-rppdId.tsv")
Expand Down
2 changes: 1 addition & 1 deletion conf/rppd-to-lobid.fix
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ do once("map")
put_filemap("conf/maps/gndGeographicName.tsv", "gnd_spatial_map", key_column:"0", value_column:"1", sep_char: "\t", expected_columns:"2")

# maps for relatedPerson lookup
put_filemap("conf/maps/gndId-to-rppdId.tsv", "map_rel_preferredName",key_column:"1",value_column:"2", sep_char: "\t", expected_columns:"-1")
put_filemap("conf/maps/gndId-to-rppdId.tsv", "map_rel_preferredName",key_column:"2",value_column:"1", sep_char: "\t", expected_columns:"-1")
put_filemap("conf/maps/rppdId-with-label.tsv", "map_rel_rppdLabel",key_column:"0",value_column:"1", sep_char: "\t", expected_columns:"-1")

# maps for depiction
Expand Down
6 changes: 3 additions & 3 deletions transformRppd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ set -u

bash transformBeacons.sh
rm conf/output/bulk/rppd/*
sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson"
sbt "runMain rpb.ETL conf/rppd-to-gnd-mapping.flux"
sbt "runMain rpb.ETL conf/rppd-rppdId-with-label-map.flux"
# Here, we used to import Allegro data:
# sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson"
# sbt "runMain rpb.ETL conf/rppd-to-lobid.flux"
# But now we use the Strapi export:
zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.jsonl
sbt "runMain rpb.ETL conf/rppd-to-gnd-mapping.flux"
sbt "runMain rpb.ETL conf/rppd-rppdId-with-label-map.flux"
sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.jsonl RECORD_PATH=data"

# Indexing happens in rppd/transformAndIndexRppd.sh (lobid-gnd repo, branch 'rppd'), which calls this script

0 comments on commit 1998c95

Please sign in to comment.