Completeness: use proper CSV library to generate .csv #216
pkiraly committed Mar 13, 2023
1 parent ef32c2d commit 18829e6
Showing 3 changed files with 36 additions and 21 deletions.
18 changes: 18 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/CsvUtils.java
@@ -14,6 +14,10 @@ public static String createCsv(List<? extends Serializable> values) {
return createCsv(asArray(values));
}

public static String createCsvFromObjects(List<Object> values) {
return createCsv(asArrayFromObject(values));
}

public static String createCsv(String[] values) {
String csv = null;

@@ -42,6 +46,20 @@ private static String[] asArray(List<? extends Serializable> values) {
return strings.toArray(new String[strings.size()]);
}

private static String[] asArrayFromObject(List<Object> values) {
List<String> strings = new ArrayList<>();
for (Object value : values) {
if (value instanceof String) {
strings.add((String) value);
} else if (value == null) {
strings.add("");
} else {
strings.add(value.toString());
}
}
return strings.toArray(new String[strings.size()]);
}


private static String[] cleanRow(String[] values) {
List<Object> quoted = new ArrayList<>();
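As a quick illustration of what the new helper does, here is a self-contained sketch of the conversion rule used by asArrayFromObject() (strings pass through, null becomes an empty string, everything else goes through toString()). The class name and main method below are illustrative only and not part of this commit.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Illustrative stand-alone sketch; mirrors the logic added to CsvUtils above.
public class CsvConversionSketch {

  // Same rule as asArrayFromObject(): String as-is, null -> "", otherwise toString().
  static String[] asArrayFromObject(List<Object> values) {
    List<String> strings = new ArrayList<>();
    for (Object value : values) {
      if (value instanceof String) {
        strings.add((String) value);
      } else if (value == null) {
        strings.add("");
      } else {
        strings.add(value.toString());
      }
    }
    return strings.toArray(new String[0]);
  }

  public static void main(String[] args) {
    // Arrays.asList is used because List.of() rejects null elements.
    List<Object> row = Arrays.asList("library", null, 42, 3.14);
    System.out.println(Arrays.toString(asArrayFromObject(row)));
    // prints: [library, , 42, 3.14]
  }
}
```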
33 changes: 14 additions & 19 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
@@ -193,10 +193,10 @@ private void saveLibraries003(String fileExtension, char separator) {
logger.info("Saving libraries003...");
var path = Paths.get(parameters.getOutputDir(), "libraries003" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write("library" + separator + "count\n");
writer.write(CsvUtils.createCsv(List.of("library", "count")));
completenessDAO.getLibrary003Counter().forEach((key, value) -> {
try {
writer.write(String.format("\"%s\"%s%d%n", key, separator, value));
writer.write(CsvUtils.createCsv(List.of(key, value)));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries003", e);
}
@@ -209,15 +209,15 @@ private void saveMarcElements(String fileExtension, char separator) {
private void saveMarcElements(String fileExtension, char separator) {
Path path = Paths.get(parameters.getOutputDir(), "marc-elements" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow(
writer.write(CsvUtils.createCsv(List.of(
"documenttype", "path", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
));
)));
completenessDAO.getElementCardinality().forEach((documentType, cardinalities) -> {
cardinalities.forEach((marcPath, cardinality) -> {
try {
writer.write(formatCardinality(separator, marcPath, cardinality, documentType, null));
writer.write(formatCardinality(marcPath, cardinality, documentType, null));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveMarcElements", e);
}
@@ -241,7 +241,7 @@ private void saveGrouppedMarcElements(String fileExtension, char separator) {
documentTypes.forEach((documentType, cardinalities) -> {
cardinalities.forEach((marcPath, cardinality) -> {
try {
writer.write(formatCardinality(separator, marcPath, cardinality, documentType, groupId));
writer.write(formatCardinality(marcPath, cardinality, documentType, groupId));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveMarcElements", e);
}
@@ -257,7 +257,7 @@ private void savePackages(String fileExtension, char separator) {
logger.info("saving packages...");
var path = Paths.get(parameters.getOutputDir(), "packages" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow(separator, "documenttype", "packageid", "name", "label", "iscoretag", "count"));
writer.write(CsvUtils.createCsv(List.of("documenttype", "packageid", "name", "label", "iscoretag", "count")));
completenessDAO.getPackageCounter().forEach((documentType, packages) -> {
packages.forEach((packageName, count) -> {
try {
@@ -274,9 +274,7 @@ } else {
} else {
logger.severe(packageName + " has not been found in TagCategory");
}
writer.write(createRow(
separator, quote(documentType), id, quote(range), quote(label), isPartOfMarcScore, count
));
writer.write(CsvUtils.createCsv(List.of(documentType, id, range, label, isPartOfMarcScore, count)));
} catch (IOException e) {
logger.log(Level.SEVERE, "savePackages", e);
}
@@ -291,7 +289,7 @@ private void saveGrouppedPackages(String fileExtension, char separator) {
logger.info("saving groupped packages...");
var path = Paths.get(parameters.getOutputDir(), "completeness-groupped-packages" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow(separator, "group", "documenttype", "packageid", "name", "label", "iscoretag", "count"));
writer.write(CsvUtils.createCsv(List.of("group", "documenttype", "packageid", "name", "label", "iscoretag", "count")));
completenessDAO.getGrouppedPackageCounter().forEach((groupId, documentTypes) -> {
documentTypes.forEach((documentType, packages) -> {
packages.forEach((packageName, count) -> {
@@ -309,9 +307,7 @@ } else {
} else {
logger.severe(packageName + " has not been found in TagCategory");
}
writer.write(createRow(
separator, quote(groupId), quote(documentType), id, quote(range), quote(label), isPartOfMarcScore, count
));
writer.write(CsvUtils.createCsv(List.of(groupId, documentType, id, range, label, isPartOfMarcScore, count)));
} catch (IOException e) {
logger.log(Level.SEVERE, "savePackages", e);
}
@@ -327,11 +323,10 @@ private void saveLibraries(String fileExtension, char separator) {
logger.info("Saving libraries...");
var path = Paths.get(parameters.getOutputDir(), "libraries" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write("library" + separator + "count\n");
writer.write(CsvUtils.createCsv(List.of("library", "count")));
completenessDAO.getLibraryCounter().forEach((key, value) -> {
try {
writer.write(CsvUtils.createCsv(List.of(key, value)));
// writer.write(String.format("\"%s\"%s%d%n", key, separator, value));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries", e);
}
@@ -359,8 +354,7 @@ private void saveGroups(String fileExtension, char separator) {
}
}

private String formatCardinality(char separator,
String marcPath,
private String formatCardinality(String marcPath,
int cardinality,
String documentType,
String groupId) {
@@ -415,7 +409,8 @@ private String formatCardinality(char separator,
if (groupId != null)
values.add(0, groupId);

return StringUtils.join(values, separator) + "\n";
return CsvUtils.createCsvFromObjects(values);
// return StringUtils.join(values, separator) + "\n";
}

private char getSeparator(ValidationErrorFormat format) {
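To show why the Completeness writers switch from hand-built rows to CsvUtils, here is a minimal sketch contrasting the two styles. csvRow() below is a simplified stand-in for CsvUtils.createCsv(List), written only for illustration; the commit itself delegates this work to a proper CSV library.

```java
import java.util.List;
import java.util.stream.Collectors;

// Illustrative sketch only: csvRow() is a simplified stand-in for
// CsvUtils.createCsv(List), shown to make the point that one shared helper
// owning quoting/escaping is safer than String.format at every call site.
public class RowFormattingSketch {

  static String csvRow(List<?> values) {
    return values.stream()
        .map(v -> v == null ? "" : v.toString())
        .map(s -> s.contains(",") || s.contains("\"") || s.contains("\n")
            ? "\"" + s.replace("\"", "\"\"") + "\""   // quote and escape only when needed
            : s)
        .collect(Collectors.joining(",")) + "\n";
  }

  public static void main(String[] args) {
    char separator = ',';
    String library = "Library \"North\", Branch A";

    // Old style: quoting and separator handled by hand at the call site;
    // embedded quotes or separators break the output.
    String oldRow = String.format("\"%s\"%c%d%n", library, separator, 42);

    // New style: the row is built from values, and the helper owns the CSV rules.
    String newRow = csvRow(List.of(library, 42));

    System.out.print(oldRow);
    System.out.print(newRow);
  }
}
```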
@@ -124,7 +124,8 @@ public void completeness_pica_groupBy() throws Exception {
int occurrences = Integer.parseInt(record[8]);
assertTrue(records <= occurrences);
int total = 0;
for (String expr : record[13].split("; ")) {
String histogram = record[13].replaceAll("^\"(.*)\"$", "$1");
for (String expr : histogram.split("; ")) {
String[] parts = expr.split("=");
total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]);
}
@@ -167,7 +168,8 @@ public void completeness_pica_groupBy_file() throws Exception {
int occurrences = Integer.parseInt(record[8]);
assertTrue(records <= occurrences);
int total = 0;
for (String expr : record[13].split("; ")) {
String histogram = record[13].replaceAll("^\"(.*)\"$", "$1");
for (String expr : histogram.split("; ")) {
String[] parts = expr.split("=");
total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]);
}
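The test changes above strip surrounding double quotes before parsing the histogram column, since the CSV writer may quote that field. A small self-contained sketch of that parsing step, using a hypothetical sample value rather than the actual test fixtures:

```java
// Illustrative sketch of the parsing step added to the tests; the sample
// histogram value below is hypothetical.
public class HistogramParseSketch {
  public static void main(String[] args) {
    // The CSV library may wrap the histogram column in double quotes.
    String record13 = "\"1=5; 2=3\"";

    // Same stripping as in the test: drop one pair of surrounding quotes.
    String histogram = record13.replaceAll("^\"(.*)\"$", "$1");

    int total = 0;
    for (String expr : histogram.split("; ")) {
      String[] parts = expr.split("=");
      total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]);
    }
    System.out.println(total);  // 1*5 + 2*3 = 11
  }
}
```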
