Commit
Moved saturation function back to end of class
leifeld committed Oct 21, 2023
1 parent aa584f4 commit ee1a2a1
Showing 1 changed file with 109 additions and 109 deletions.
218 changes: 109 additions & 109 deletions dna/src/main/java/export/Exporter.java
@@ -2941,115 +2941,6 @@ public void saveNestedBackboneResult() {
Exporter.this.simulatedAnnealingBackboneResult = null;
}

/**
* For a vector of document IDs (the already coded set of documents), generate a number of randomly resampled
* sequences of cumulative additions to the network and measure the normalized Euclidean network distance between
* consecutively sampled documents from the document ID vector.
*
* @param codedDocumentIds Vector of document IDs that have been coded and are eligible for training.
* @param numSamples The number of randomly resampled document sequences to return.
* @param maxNumDocuments The calculations stop at this maximum number of documents, usually the number of document
* IDs provided.
* @return A two-dimensional array containing the different sequences in the first dimension and the cumulative
* network distances after each added document in the second dimension.
*/
public double[][] saturation(int[] codedDocumentIds, int numSamples, int maxNumDocuments) {
// drop document IDs that are not among the filtered documents
Set<Integer> filteredDocumentIds = new HashSet<>();
this.filteredStatements.stream().mapToInt(s -> s.getDocumentId()).forEach(filteredDocumentIds::add);
ArrayList<Integer> docIds = IntStream.of(codedDocumentIds)
.peek(id -> {
if (!filteredDocumentIds.contains(id)) {
LogEvent l = new LogEvent(Logger.WARNING,
"Document ID " + id + " skipped when predicting saturation.",
"Document ID " + id + " skipped. The document ID was not found in the set of filtered statements as per the network export options. It could hence not be used as a coded document to predict saturation and was skipped. If this was not intended, please check the network export arguments and change them as needed.");
Dna.logger.log(l);
}
})
.filter(id -> filteredDocumentIds.contains(id))
.boxed()
.collect(Collectors.toCollection(ArrayList::new));

// check if the arguments are valid
if (numSamples < 1) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid number of resamples for saturation prediction.",
"'numSamples' changed from " + numSamples + " to 1. At least one sequence must be computed to predict saturation, preferably multiple sequences. Less than one sequence was requested. The parameter was changed to one sequence.");
numSamples = 1;
Dna.logger.log(l);
}
if (numSamples > docIds.size()) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid number of resamples for saturation prediction.",
"'numSamples' changed from " + numSamples + " to " + docIds.size() + ". At most as many sequences must be computed to predict saturation as there are coded documents in order to avoid multiple counting of sequences. The parameter was changed to " + docIds.size() + " sequences, which corresponds to the number of coded documents that were provided as a training set.");
numSamples = docIds.size();
Dna.logger.log(l);
}
if (maxNumDocuments < 2) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid maximum number of documents to compute saturation.",
"'maxNumDocuments' changed from " + maxNumDocuments + " to " + docIds.size() + ". At least two sampled documents are needed to predict saturation, ideally as many as were coded. Setting the number of resampled documents (i.e., the sequence length) to " + docIds.size() + " instead of " + maxNumDocuments + ".");
maxNumDocuments = docIds.size();
Dna.logger.log(l);
}
if (maxNumDocuments > docIds.size()) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid maximum number of documents to compute saturation.",
"'maxNumDocuments' changed from " + maxNumDocuments + " to " + docIds.size() + ". At most as many documents must be sampled for predicting saturation as there are coded documents in order to avoid multiple counting of coded documents. The parameter was changed to a maximum of " + docIds.size() + " coded documents (i.e., sequence length), which corresponds to the number of coded documents that were provided as a training set.");
maxNumDocuments = docIds.size();
Dna.logger.log(l);
}
if (!this.timeWindow.equals("no")) {
LogEvent l = new LogEvent(Logger.WARNING,
"Time windows not supported by saturation prediction.",
"Attempted to use time windows with saturation prediction, but this is not supported. Switched off the time window option.");
this.timeWindow = "no";
Dna.logger.log(l);
}
if (!this.networkType.equals("onemode") && !this.networkType.equals("twomode")) {
LogEvent l = new LogEvent(Logger.WARNING,
"Network type not recognized by saturation prediction.",
"Attempted to use network type \"" + this.networkType + "\" with saturation prediction, but only \"onemode\" and \"twomode\" are permitted. Using \"onemode\".");
this.networkType = "onemode";
Dna.logger.log(l);
}
if (!this.isolates) {
this.isolates = true;
LogEvent l = new LogEvent(Logger.WARNING,
"Isolates switched on for saturation prediction.",
"Isolates were switched off for saturation prediction although isolates are required. Switched isolates on.");
Dna.logger.log(l);
}

// resample documents in a random sequence and save sequence of cumulative networks
double[][] results = new double[numSamples][maxNumDocuments];
ArrayList<Matrix> networks = new ArrayList<>();
ArrayList<ExportStatement> cumulativeStatements = new ArrayList<>();
for (int i = 0; i < numSamples; i++) {
networks.clear();
cumulativeStatements.clear();
Collections.shuffle(docIds);
for (int j = 0; j < maxNumDocuments; j++) {
final int jFinal = j;
cumulativeStatements.addAll(this.filteredStatements.stream().filter(s -> s.getDocumentId() == docIds.get(jFinal)).collect(Collectors.toCollection(ArrayList::new)));
if (networkType.equals("onemode")) {
networks.add(this.computeOneModeMatrix(cumulativeStatements, this.qualifierAggregation, this.startDateTime, this.stopDateTime));
} else {
networks.add(this.computeTwoModeMatrix(cumulativeStatements, this.startDateTime, this.stopDateTime));
}
if (j == 0) {
double[][] zeroMat = new double[networks.get(j).getMatrix().length][networks.get(j).getMatrix()[0].length];
results[i][j] = normalizedEuclideanNetworkDistance(zeroMat, networks.get(j).getMatrix());
} else {
// TODO: may need to stop adding the previous result for easier non-linear modeling...
results[i][j] = normalizedEuclideanNetworkDistance(networks.get(j - 1).getMatrix(), networks.get(j).getMatrix()) + results[i][j - 1];
}
}
}
return results;
}

/**
* Compute the Euclidean distance between two normalized network matrices. The normalization ensures that the two
* matrices have compatible magnitudes and differ only in structure (i.e., redundant information is discounted).
@@ -3409,4 +3300,113 @@ public double[] evaluateBackboneSolution(String[] backboneEntities, int p) {

return results;
}

/**
* For a vector of document IDs (the already coded set of documents), generate a number of randomly resampled
* sequences of cumulative additions to the network and measure the normalized Euclidean network distance between
* consecutively sampled documents from the document ID vector.
*
* @param codedDocumentIds Vector of document IDs that have been coded and are eligible for training.
* @param numSamples The number of randomly resampled document sequences to return.
* @param maxNumDocuments The calculations stop at this maximum number of documents, usually the number of document
* IDs provided.
* @return A two-dimensional array containing the different sequences in the first dimension and the cumulative
* network distances after each added document in the second dimension.
*/
public double[][] saturation(int[] codedDocumentIds, int numSamples, int maxNumDocuments) {
// drop document IDs that are not among the filtered documents
Set<Integer> filteredDocumentIds = new HashSet<>();
this.filteredStatements.stream().mapToInt(s -> s.getDocumentId()).forEach(filteredDocumentIds::add);
ArrayList<Integer> docIds = IntStream.of(codedDocumentIds)
.peek(id -> {
if (!filteredDocumentIds.contains(id)) {
LogEvent l = new LogEvent(Logger.WARNING,
"Document ID " + id + " skipped when predicting saturation.",
"Document ID " + id + " skipped. The document ID was not found in the set of filtered statements as per the network export options. It could hence not be used as a coded document to predict saturation and was skipped. If this was not intended, please check the network export arguments and change them as needed.");
Dna.logger.log(l);
}
})
.filter(id -> filteredDocumentIds.contains(id))
.boxed()
.collect(Collectors.toCollection(ArrayList::new));

// check if the arguments are valid
if (numSamples < 1) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid number of resamples for saturation prediction.",
"'numSamples' changed from " + numSamples + " to 1. At least one sequence must be computed to predict saturation, preferably multiple sequences. Less than one sequence was requested. The parameter was changed to one sequence.");
numSamples = 1;
Dna.logger.log(l);
}
if (numSamples > docIds.size()) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid number of resamples for saturation prediction.",
"'numSamples' changed from " + numSamples + " to " + docIds.size() + ". At most as many sequences must be computed to predict saturation as there are coded documents in order to avoid multiple counting of sequences. The parameter was changed to " + docIds.size() + " sequences, which corresponds to the number of coded documents that were provided as a training set.");
numSamples = docIds.size();
Dna.logger.log(l);
}
if (maxNumDocuments < 2) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid maximum number of documents to compute saturation.",
"'maxNumDocuments' changed from " + maxNumDocuments + " to " + docIds.size() + ". At least two sampled documents are needed to predict saturation, ideally as many as were coded. Setting the number of resampled documents (i.e., the sequence length) to " + docIds.size() + " instead of " + maxNumDocuments + ".");
maxNumDocuments = docIds.size();
Dna.logger.log(l);
}
if (maxNumDocuments > docIds.size()) {
LogEvent l = new LogEvent(Logger.WARNING,
"Invalid maximum number of documents to compute saturation.",
"'maxNumDocuments' changed from " + maxNumDocuments + " to " + docIds.size() + ". At most as many documents must be sampled for predicting saturation as there are coded documents in order to avoid multiple counting of coded documents. The parameter was changed to a maximum of " + docIds.size() + " coded documents (i.e., sequence length), which corresponds to the number of coded documents that were provided as a training set.");
maxNumDocuments = docIds.size();
Dna.logger.log(l);
}
if (!this.timeWindow.equals("no")) {
LogEvent l = new LogEvent(Logger.WARNING,
"Time windows not supported by saturation prediction.",
"Attempted to use time windows with saturation prediction, but this is not supported. Switched off the time window option.");
this.timeWindow = "no";
Dna.logger.log(l);
}
if (!this.networkType.equals("onemode") && !this.networkType.equals("twomode")) {
LogEvent l = new LogEvent(Logger.WARNING,
"Network type not recognized by saturation prediction.",
"Attempted to use network type \"" + this.networkType + "\" with saturation prediction, but only \"onemode\" and \"twomode\" are permitted. Using \"onemode\".");
this.networkType = "onemode";
Dna.logger.log(l);
}
if (!this.isolates) {
this.isolates = true;
LogEvent l = new LogEvent(Logger.WARNING,
"Isolates switched on for saturation prediction.",
"Isolates were switched off for saturation prediction although isolates are required. Switched isolates on.");
Dna.logger.log(l);
}

// resample documents in a random sequence and save sequence of cumulative networks
double[][] results = new double[numSamples][maxNumDocuments];
ArrayList<Matrix> networks = new ArrayList<>();
ArrayList<ExportStatement> cumulativeStatements = new ArrayList<>();
for (int i = 0; i < numSamples; i++) {
networks.clear();
cumulativeStatements.clear();
Collections.shuffle(docIds);
for (int j = 0; j < maxNumDocuments; j++) {
final int jFinal = j;
cumulativeStatements.addAll(this.filteredStatements.stream().filter(s -> s.getDocumentId() == docIds.get(jFinal)).collect(Collectors.toCollection(ArrayList::new)));
if (networkType.equals("onemode")) {
networks.add(this.computeOneModeMatrix(cumulativeStatements, this.qualifierAggregation, this.startDateTime, this.stopDateTime));
} else {
networks.add(this.computeTwoModeMatrix(cumulativeStatements, this.startDateTime, this.stopDateTime));
}
if (j == 0) {
double[][] zeroMat = new double[networks.get(j).getMatrix().length][networks.get(j).getMatrix()[0].length];
results[i][j] = normalizedEuclideanNetworkDistance(zeroMat, networks.get(j).getMatrix());
} else {
// TODO: may need to stop adding the previous result for easier non-linear modeling...
results[i][j] = normalizedEuclideanNetworkDistance(networks.get(j - 1).getMatrix(), networks.get(j).getMatrix()) + results[i][j - 1];
}
}
}
return results;
}
}
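
For orientation, here is a minimal usage sketch of the relocated method. The exporter instance and the concrete argument values are hypothetical and not part of this commit; an Exporter is assumed to have been configured through the usual network export options beforehand.

// Hypothetical usage sketch; 'exporter' and the argument values are assumptions, not part of this commit.
int[] codedDocumentIds = new int[] {12, 17, 23, 31, 48}; // IDs of already coded documents
int numSamples = 5;                                      // number of shuffled sequences to draw
int maxNumDocuments = codedDocumentIds.length;           // evaluate the full sequence length
double[][] distances = exporter.saturation(codedDocumentIds, numSamples, maxNumDocuments);
// Each row is one shuffled sequence; entry j holds the cumulative normalized Euclidean
// network distance after adding the (j + 1)-th document to the network.
for (double[] sequence : distances) {
    System.out.println(java.util.Arrays.toString(sequence));
}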

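The Javadoc fragment for normalizedEuclideanNetworkDistance in the first hunk describes the measure only in outline, and the method body is collapsed in this diff. The following is a self-contained sketch of one plausible reading of that description, not the actual implementation in Exporter.java: both matrices are normalized to comparable magnitude before the element-wise Euclidean distance is taken.

// Illustrative sketch only; the real normalizedEuclideanNetworkDistance is not shown in this diff.
// Both matrices are assumed to have identical dimensions, as produced by the cumulative networks above.
static double normalizedEuclideanDistanceSketch(double[][] a, double[][] b) {
    double sumA = 0.0;
    double sumB = 0.0;
    for (double[] row : a) for (double v : row) sumA += Math.abs(v);
    for (double[] row : b) for (double v : row) sumB += Math.abs(v);
    double squaredDistance = 0.0;
    for (int r = 0; r < a.length; r++) {
        for (int c = 0; c < a[r].length; c++) {
            double na = (sumA == 0.0) ? 0.0 : a[r][c] / sumA; // scale away magnitude differences
            double nb = (sumB == 0.0) ? 0.0 : b[r][c] / sumB;
            squaredDistance += (na - nb) * (na - nb);
        }
    }
    return Math.sqrt(squaredDistance);
}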