Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

60 split json2solr output into batches #66

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 45 additions & 35 deletions dataload/json2solr/src/main/java/JSON2Solr.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,15 @@ public static void main(String[] args) throws IOException {
String inputFilePath = cmd.getOptionValue("input");
String outPath = cmd.getOptionValue("outDir");

PrintStream ontologiesWriter = null;
PrintStream classesWriter = null;
PrintStream propertiesWriter = null;
PrintStream individualsWriter = null;
PrintStream autocompleteWriter = null;


String ontologiesOutName = outPath + "/ontologies.jsonl";
String classesOutName = outPath + "/classes.jsonl";
String propertiesOutName = outPath + "/properties.jsonl";
String individualsOutName = outPath + "/individuals.jsonl";
String autocompleteOutName = outPath + "/autocomplete.jsonl";
File file = new File(outPath);
try {
file.mkdirs();
file.createNewFile();
} catch (IOException ioe) {
ioe.printStackTrace();
}

ontologiesWriter = new PrintStream(ontologiesOutName);
classesWriter = new PrintStream(classesOutName);
propertiesWriter = new PrintStream(propertiesOutName);
individualsWriter = new PrintStream(individualsOutName);
autocompleteWriter = new PrintStream(autocompleteOutName);
Map <String,PrintStream> writers = new HashMap<>();


JsonReader reader = new JsonReader(new InputStreamReader(new FileInputStream(inputFilePath)));
Expand Down Expand Up @@ -98,10 +89,9 @@ public static void main(String[] args) throws IOException {
flattenedClass.put("id", entityId);

flattenProperties(_class, flattenedClass);
writeEntity("classes",ontologyId,flattenedClass,outPath,writers);

classesWriter.println(gson.toJson(flattenedClass));

writeAutocompleteEntries(ontologyId, entityId, flattenedClass, autocompleteWriter);
writeAutocompleteEntries(ontologyId, entityId, flattenedClass, outPath, writers);
}

reader.endArray();
Expand All @@ -123,9 +113,9 @@ public static void main(String[] args) throws IOException {

flattenProperties(property, flattenedProperty);

propertiesWriter.println(gson.toJson(flattenedProperty));
writeEntity("properties",ontologyId,flattenedProperty,outPath,writers);

writeAutocompleteEntries(ontologyId, entityId, flattenedProperty, autocompleteWriter);
writeAutocompleteEntries(ontologyId, entityId, flattenedProperty,outPath,writers);
}

reader.endArray();
Expand All @@ -147,9 +137,9 @@ public static void main(String[] args) throws IOException {

flattenProperties(individual, flattenedIndividual);

individualsWriter.println(gson.toJson(flattenedIndividual));
writeEntity("individuals",ontologyId,flattenedIndividual,outPath,writers);

writeAutocompleteEntries(ontologyId, entityId, flattenedIndividual, autocompleteWriter);
writeAutocompleteEntries(ontologyId, entityId, flattenedIndividual,outPath,writers);
}

reader.endArray();
Expand All @@ -176,7 +166,7 @@ public static void main(String[] args) throws IOException {

flattenProperties(ontology, flattenedOntology);

ontologiesWriter.println(gson.toJson(flattenedOntology));
writeEntity("ontologies",ontologyId,flattenedOntology,outPath,writers);

reader.endObject(); // ontology
}
Expand All @@ -192,6 +182,26 @@ public static void main(String[] args) throws IOException {

reader.endObject();
reader.close();
for (PrintStream printStream : writers.values())
printStream.close();
}

/**
 * Appends one entity as a single JSON line to the per-ontology, per-type
 * batch file "&lt;outPath&gt;/&lt;ontologyId&gt;_&lt;type&gt;.jsonl", lazily opening the
 * output stream the first time a given ontology/type pair is seen.
 *
 * @param type            entity category, e.g. "ontologies", "classes",
 *                        "properties" or "individuals"
 * @param ontologyId      id of the ontology the entity belongs to
 * @param flattenedEntity entity properties already flattened for Solr
 * @param outPath         directory the .jsonl batch files are written into
 * @param writers         cache of open streams keyed by "&lt;ontologyId&gt;_&lt;type&gt;";
 *                        the caller is responsible for closing them
 * @throws FileNotFoundException if a new batch file cannot be created
 */
static private void writeEntity(String type, String ontologyId, Map<String,Object> flattenedEntity, String outPath, Map <String,PrintStream> writers) throws FileNotFoundException {
    String key = ontologyId + "_" + type;
    PrintStream writer = writers.get(key);
    if (writer == null) {
        // First entity of this type for this ontology: open its batch file.
        // (computeIfAbsent is not usable here because the PrintStream
        // constructor throws a checked FileNotFoundException.)
        writer = new PrintStream(outPath + "/" + key + ".jsonl");
        writers.put(key, writer);
    }
    writer.println(gson.toJson(flattenedEntity));
}

/**
 * Appends one autocomplete entry as a single JSON line to the per-ontology
 * batch file "&lt;outPath&gt;/&lt;ontologyId&gt;_autocomplete.jsonl", lazily opening
 * the output stream the first time the ontology is seen.
 *
 * @param ontologyId      id of the ontology the entry belongs to
 * @param flattenedEntity autocomplete entry fields (string key/value pairs)
 * @param outPath         directory the .jsonl batch files are written into
 * @param writers         cache of open streams keyed by "&lt;ontologyId&gt;_autocomplete";
 *                        the caller is responsible for closing them
 * @throws FileNotFoundException if a new batch file cannot be created
 */
static private void writeAutocomplete(String ontologyId, Map<String,String> flattenedEntity, String outPath, Map <String,PrintStream> writers) throws FileNotFoundException {
    String key = ontologyId + "_autocomplete";
    PrintStream writer = writers.get(key);
    if (writer == null) {
        // First entry for this ontology: open its batch file. A plain
        // get/put is used because the PrintStream constructor throws a
        // checked exception, which computeIfAbsent cannot propagate.
        writer = new PrintStream(outPath + "/" + key + ".jsonl");
        writers.put(key, writer);
    }
    writer.println(gson.toJson(flattenedEntity, Map.class));
}

static private void flattenProperties(Map<String,Object> properties, Map<String,Object> flattened) {
Expand Down Expand Up @@ -233,24 +243,24 @@ static private void flattenProperties(Map<String,Object> properties, Map<String,
// (4) It's reification { type: reification|related, ...., value: ... }
//
// (5) it's some random json object from the ontology config
//
//
// In the case of (1), we discard the datatype and keep the value
//
// In the case of (2), we don't store anything in solr fields. Class
// expressions should already have been evaluated into separate "related"
// fields by the RelatedAnnotator in rdf2json.
//
// In the case of (3), we create a Solr document for each language (see
// In the case of (3), we create a Solr document for each language (see
// above), and the language is passed into this function so we know which
// language's strings to keep.
//
// In the case of (4), we discard any metadata (in Neo4j the metadata is
// preserved for edges, but in Solr we don't care about it).
//
//
// In the case of (5) we discard it in solr because json objects won't be
// queryable anyway.
//
//
//
public static Object discardMetadata(Object obj) {

if (obj instanceof Map) {
Expand Down Expand Up @@ -283,7 +293,7 @@ public static Object discardMetadata(Object obj) {
}

} else {

return obj;
}
}
Expand All @@ -299,26 +309,26 @@ public static String objToString(Object obj) {



static void writeAutocompleteEntries(String ontologyId, String entityId, Map<String,Object> flattenedEntity, PrintStream autocompleteWriter) {
/**
 * Writes one autocomplete entry per label and per synonym of the given
 * entity into the ontology's autocomplete batch file. Labels are written
 * before synonyms, matching the previous behaviour.
 *
 * @param ontologyId      id of the ontology the entity belongs to
 * @param entityId        id of the entity the labels/synonyms describe
 * @param flattenedEntity flattened entity; its "label" and "synonym" values
 *                        may each be a String, a List of Strings, or absent
 * @param outPath         directory the .jsonl batch files are written into
 * @param writers         cache of open streams shared with writeAutocomplete
 * @throws FileNotFoundException if a new batch file cannot be created
 */
static void writeAutocompleteEntries(String ontologyId, String entityId, Map<String,Object> flattenedEntity, String outPath, Map <String,PrintStream> writers) throws FileNotFoundException {
    writeAutocompleteValues(ontologyId, entityId, flattenedEntity.get("label"), outPath, writers);
    writeAutocompleteValues(ontologyId, entityId, flattenedEntity.get("synonym"), outPath, writers);
}

/**
 * Writes an autocomplete entry for a single String value, or one entry per
 * element of a List of Strings. Any other type (including null for an
 * absent field) is silently ignored, as before.
 */
private static void writeAutocompleteValues(String ontologyId, String entityId, Object values, String outPath, Map <String,PrintStream> writers) throws FileNotFoundException {
    if (values instanceof List) {
        for (Object value : (List<Object>) values) {
            writeAutocomplete(ontologyId, makeAutocompleteEntry(ontologyId, entityId, (String) value), outPath, writers);
        }
    } else if (values instanceof String) {
        writeAutocomplete(ontologyId, makeAutocompleteEntry(ontologyId, entityId, (String) values), outPath, writers);
    }
}

Expand Down
39 changes: 15 additions & 24 deletions dataload/load_into_solr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,24 @@ fi
$1/bin/solr start -force -Djetty.host=127.0.0.1
sleep 10

wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file $2/ontologies.jsonl \
http://127.0.0.1:8983/solr/ols4_entities/update/json/docs?commit=true

wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file $2/classes.jsonl \
http://127.0.0.1:8983/solr/ols4_entities/update/json/docs?commit=true

wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file $2/properties.jsonl \
http://127.0.0.1:8983/solr/ols4_entities/update/json/docs?commit=true

wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file $2/individuals.jsonl \
http://127.0.0.1:8983/solr/ols4_entities/update/json/docs?commit=true

wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file $2/autocomplete.jsonl \
http://127.0.0.1:8983/solr/ols4_autocomplete/update/json/docs?commit=true

# Load every per-ontology .jsonl batch produced by JSON2Solr into Solr.
# Entity batches (*_ontologies/_classes/_properties/_individuals.jsonl) go
# to the ols4_entities core; *_autocomplete.jsonl batches go to the
# ols4_autocomplete core. Paths and URLs are quoted so directories with
# spaces (and the '?' in the URLs) are handled safely.
for f in "$2"/*_*.jsonl
do
    # If the glob matched nothing, the literal pattern is passed through;
    # skip it rather than POSTing a nonexistent file.
    [ -e "$f" ] || continue
    echo "$f"
    case "$f" in
        *_ontologies.jsonl|*_classes.jsonl|*_properties.jsonl|*_individuals.jsonl)
            echo 'entity'
            wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file "$f" 'http://127.0.0.1:8983/solr/ols4_entities/update/json/docs?commit=true'
            ;;
        *_autocomplete.jsonl)
            echo 'autocomplete'
            wget --method POST --no-proxy -O - --server-response --content-on-error=on --header="Content-Type: application/json" --body-file "$f" 'http://127.0.0.1:8983/solr/ols4_autocomplete/update/json/docs?commit=true'
            ;;
    esac
done
sleep 5

# Issue explicit commits so all batches become visible in one go.
echo 'update entities'
wget --no-proxy 'http://127.0.0.1:8983/solr/ols4_entities/update?commit=true'

sleep 5

echo 'update autocomplete'
wget --no-proxy 'http://127.0.0.1:8983/solr/ols4_autocomplete/update?commit=true'

sleep 5
echo 'loading solr finished'

"$1"/bin/solr stop