feat(prepare): Add CLI options for output and errors. Default award i…

…tem classification scheme. chore: Use csv crate to write CSV. Reduce indentation with guard clauses. perf: Use Arc<Mutex<BufWriter>>.
open-contracting · May 30, 2023 · 61d6b27 · 61d6b27
1 parent 85302be
commit 61d6b27
Show file tree

Hide file tree

Showing 5 changed files with 189 additions and 130 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,13 +15,14 @@ keywords = ["cli", "json"]
 anyhow = "1.0"
 clap = { version = "4.0", features = ["derive", "wrap_help"] }
 config = {version = "0.13", features = ["ini"] }
+csv = "1.0"
 human-panic = "1.0"
 indexmap = { version = "1.0", features = ["serde"] }
 log = "0.4"
 num_cpus = "1.0"
 pretty_env_logger = "0.4"
 rayon = "1.0"
-serde = "1.0"
+serde = { version ="1.0", features = ["derive"] }
 serde_json = { version = "1.0", features = ["preserve_order"] }
 statrs = "0.16"
 

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,5 +1,14 @@
 # Changelog
 
+## 0.0.4 (2023-05-30)
+
+### Changed
+
+- {doc}`cli/prepare` command:
+
+  - Add `--output` (`-o`) and `--errors` (`-e`) options, instead of using shell redirection.
+  - Fill in `/awards[]/items[]/classification/scheme` with `item_classification_scheme`.
+
 ## 0.0.3 (2023-05-29)
 
 ### Added

diff --git a/docs/cli/prepare.md b/docs/cli/prepare.md
@@ -13,7 +13,7 @@ Corrected data is written to standard output as line-delimited JSON.
 Quality issues are written to standard error as CSV rows with the columns: line, ocid, path, array
 indexes, incorrect value, error description.
 
-Usage: ocdscardinal[EXE] prepare [OPTIONS] <FILE>
+Usage: ocdscardinal[EXE] prepare [OPTIONS] --output <OUTPUT> --errors <ERRORS> <FILE>
 
 Arguments:
   <FILE>
@@ -27,6 +27,12 @@ Options:
   -v, --verbose...
           Increase verbosity
 
+  -o, --output <OUTPUT>
+          The file to which to write corrected data (or "-" for standard output)
+
+  -e, --errors <ERRORS>
+          The file to which to write quality issues (or "-" for standard output)
+
   -h, --help
           Print help (see a summary with '-h')
 
@@ -48,7 +54,7 @@ Before following this command's workflow, follow the earlier steps in the {doc}`
 1. Run the `prepare` command. For example, if your data is in `input.jsonl`, this command writes the corrected data to `prepared.jsonl` and the quality issues to `issues.csv`:
 
    ```bash
-   ocdscardinal prepare --settings settings.ini input.jsonl > prepared.jsonl 2> issues.csv
+   ocdscardinal prepare --settings settings.ini --output prepared.jsonl --errors issues.csv input.jsonl
    ```
 
 1. Review the quality issues in the `issues.csv` file. Don't worry if many issues are reported: most are repetitive and can be fixed at once. Read the [demonstration](#demonstration) to learn how to interpret results.
@@ -63,16 +69,6 @@ This command is designed to only warn about quality issues (1) that it can fix a
 
 ## Demonstration
 
-Corrected data is written to standard output. Quality issues are written to standard error.
-
-Without redirection (`>`), standard output and standard error are both written to the console.
-
-It is recommended to redirect standard output and standard error to separate files. For example:
-
-```bash
-ocdscardinal prepare --settings settings.ini input.jsonl > prepared.jsonl 2> issues.csv
-```
-
 ::::{admonition} Example
 :class: seealso
 
@@ -84,11 +80,11 @@ This simplified file contains a bid without a status:
 :language: json
 :::
 
-Without redirection, the `prepare` command writes both the quality issue and the (unchanged) data to the console:
+For this demonstration, write both the quality issue and the (unchanged) data to the console:
 
 ```console
-$ ocdscardinal prepare docs/examples/prepare.jsonl
-1,"ocds-213czf-1",/bids/details[]/status,0,,not set
+$ ocdscardinal prepare --output - --errors - docs/examples/prepare.jsonl
+1,ocds-213czf-1,/bids/details[]/status,0,,not set
 {"ocid":"ocds-213czf-1","bids":{"details":[{"id":1}]}}
 
 ```
@@ -101,7 +97,7 @@ Quality issues are reported as CSV rows. Adding a header and rendering the row a
 1,"ocds-213czf-1",/bids/details[]/status,0,,not set
 :::
 
-If you redirect the quality issues to a file, you can open the CSV as a spreadsheet.
+If you write the quality issues to a file instead of the console, you can open the CSV as a spreadsheet.
 
 ::::
 
@@ -156,8 +152,9 @@ This behavior can't be disabled. If you need to disable it, [create an issue on
 The command supports filling in:
 
 - `/bids/details[]/value/currency`
-- `/bids/details[]/items/classification/scheme`
+- `/bids/details[]/items[]/classification/scheme`
 - `/bids/details[]/status`
+- `/awards[]/items[]/classification/scheme`
 - `/awards[]/status`
 
 To fill in one or more of these fields when the field isn't set, add a `[defaults]` section with relevant properties to your {doc}`../topics/settings`. For example:

diff --git a/src/lib.rs b/src/lib.rs
@@ -6,8 +6,9 @@ pub mod standard;
 
 use std::collections::HashMap;
 use std::fs::File;
-use std::io::{self, BufRead, Write};
+use std::io::{self, BufRead, BufWriter, Write};
 use std::path::PathBuf;
+use std::sync::{Arc, Mutex};
 
 use anyhow::Result;
 use indexmap::IndexMap;
@@ -280,9 +281,20 @@ impl Prepare {
     /// # Panics
     ///
     #[allow(clippy::cognitive_complexity)]
-    pub fn run(buffer: impl BufRead + Send, settings: Settings) {
+    #[allow(clippy::too_many_lines)]
+    // https://github.com/rust-lang/rust-clippy/issues/10413
+    #[allow(clippy::significant_drop_tightening)]
+    pub fn run<W: Write + Send>(
+        buffer: impl BufRead + Send,
+        settings: Settings,
+        output: &mut W,
+        errors: &mut W,
+    ) -> Result<(), anyhow::Error> {
         let default = HashMap::new();
 
+        let output = Arc::new(Mutex::new(BufWriter::new(output)));
+        let errors = Arc::new(Mutex::new(BufWriter::new(errors)));
+
         let defaults = settings.defaults.unwrap_or_default();
         // Closed codelists.
         let currency_default = defaults.currency.map(Value::String);
@@ -294,110 +306,127 @@ impl Prepare {
         let bid_status = codelists.get(&Codelist::BidStatus).unwrap_or(&default);
         let award_status = codelists.get(&Codelist::AwardStatus).unwrap_or(&default);
 
-        buffer.lines().enumerate().par_bridge().for_each(|(i, lines_result)| {
-            match lines_result {
-                Ok(string) => {
-                    match serde_json::from_str(&string) {
-                        Ok(value) => {
-                            if let Value::Object(mut release) = value {
-                                let ocid = release.get("ocid").map_or_else(|| Value::Null, std::clone::Clone::clone);
+        buffer.lines().enumerate().par_bridge().try_for_each(|(i, lines)| -> Result<(), anyhow::Error> {
+            // Use guard clauses to reduce indentation and ease readability.
+            let string = match lines {
+                Ok(string) => string,
+                Err(e) => return Ok(warn!("Line {} caused an I/O error, skipping. [{e}]", i + 1)),
+            };
+
+            let mut value: Value = match serde_json::from_str(&string) {
+                Ok(value) => value,
+                Err(e) => {
+                    if !string.as_bytes().iter().all(u8::is_ascii_whitespace) {
+                        warn!("Line {} is invalid JSON, skipping. [{e}]", i + 1);
+                    }
+                    return Ok(());
+                }
+            };
 
-                                prepare_id_object!(release, "buyer");
+            let Some(release) = value.as_object_mut() else {
+                return Ok(warn!("Line {} is not a JSON object, skipping.", i + 1))
+            };
 
-                                // /tender
-                                if let Some(Value::Object(tender)) = release.get_mut("tender") {
-                                    prepare_id_object!(tender, "procuringEntity");
-                                }
+            let mut rows = csv::Writer::from_writer(vec![]);
 
-                                // /bids
-                                if let Some(Value::Object(bids)) = release.get_mut("bids")
-                                    && let Some(Value::Array(details)) = bids.get_mut("details")
-                                {
-                                    for (j, bid) in details.iter_mut().enumerate() {
-                                        if let Some(Value::Object(value)) = bid.get_mut("value")
-                                            && !value.contains_key("currency")
-                                        {
-                                            currency_default.as_ref().map_or_else(|| {
-                                                eprintln!("{},{ocid},/bids/details[]/value/currency,{j},,not set", i + 1);
-                                            }, |currency| {
-                                                value.insert("currency".into(), currency.clone());
-                                            });
-                                        }
-
-                                        if let Some(Value::Array(items)) = bid.get_mut("items") {
-                                            for (k, item) in items.iter_mut().enumerate() {
-                                                if let Some(Value::Object(classification)) = item.get_mut("classification")
-                                                    && !classification.contains_key("scheme")
-                                                {
-                                                    item_classification_scheme_default.as_ref().map_or_else(|| {
-                                                        eprintln!("{},{ocid},/bids/details[]/items[]/classification/scheme,{j}.{k},,not set", i + 1);
-                                                    }, |scheme| {
-                                                        classification.insert("scheme".into(), scheme.clone());
-                                                    });
-                                                }
-                                            }
-                                        }
-
-                                        if let Some(Value::String(status)) = bid.get_mut("status") {
-                                            if bid_status.contains_key(status) {
-                                                *status = bid_status[status].clone();
-                                            }
-                                            if !BID_STATUS.contains(status.as_str()) {
-                                                eprintln!("{},{ocid},/bids/details[]/status,{j},\"{status}\",invalid", i + 1);
-                                            }
-                                        } else if bid.get("status").is_none() {
-                                            bid_status_default.as_ref().map_or_else(|| {
-                                                eprintln!("{},{ocid},/bids/details[]/status,{j},,not set", i + 1);
-                                            }, |status| {
-                                                bid["status"] = status.clone();
-                                            });
-                                        }
-
-                                        prepare_id_array!(bid, "tenderers");
-                                    }
-                                }
+            let ocid = release.get("ocid").map_or_else(|| Value::Null, std::clone::Clone::clone);
 
-                                // /awards
-                                if let Some(Value::Array(awards)) = release.get_mut("awards") {
-                                    for (j, award) in awards.iter_mut().enumerate() {
-                                        if let Some(Value::String(status)) = award.get_mut("status") {
-                                            if award_status.contains_key(status) {
-                                                *status = award_status[status].clone();
-                                            }
-                                            if !AWARD_STATUS.contains(status.as_str()) {
-                                                eprintln!("{},{ocid},/awards[]/status,{j},\"{status}\",invalid", i + 1);
-                                            }
-                                        } else if award.get("status").is_none() {
-                                            award_status_default.as_ref().map_or_else(|| {
-                                                eprintln!("{},{ocid},/awards[]/status,{j},,not set", i + 1);
-                                            }, |status| {
-                                                award["status"] = status.clone();
-                                            });
-                                        }
-
-                                        prepare_id_array!(award, "suppliers");
-                                    }
-                                }
+            prepare_id_object!(release, "buyer");
+
+            // /tender
+            if let Some(Value::Object(tender)) = release.get_mut("tender") {
+                prepare_id_object!(tender, "procuringEntity");
+            }
+
+            // /bids
+            if let Some(Value::Object(bids)) = release.get_mut("bids")
+                && let Some(Value::Array(details)) = bids.get_mut("details")
+            {
+                for (j, bid) in details.iter_mut().enumerate() {
+                    if let Some(Value::Object(value)) = bid.get_mut("value")
+                        && !value.contains_key("currency")
+                    {
+                        if let Some(default) = &currency_default {
+                            value.insert("currency".into(), default.clone());
+                        } else {
+                            rows.serialize((i + 1, &ocid, "/bids/details[]/value/currency", j, "", "not set"))?;
+                        }
+                    }
 
-                                println!("{}", serde_json::to_string(&release).unwrap());
-                            } else {
-                                warn!("Line {} is not a JSON object, skipping.", i + 1);
+                    if let Some(Value::Array(items)) = bid.get_mut("items") {
+                        for (k, item) in items.iter_mut().enumerate() {
+                            if let Some(Value::Object(classification)) = item.get_mut("classification")
+                                && !classification.contains_key("scheme")
+                            {
+                                if let Some(default) = &item_classification_scheme_default {
+                                    classification.insert("scheme".into(), default.clone());
+                                } else {
+                                    rows.serialize((i + 1, &ocid, "/bids/details[]/items[]/classification/scheme", format!("{j}.{k}"), "", "not set"))?;
+                                }
                             }
                         }
-                        Err(e) => {
-                            // Skip empty lines silently.
-                            // https://stackoverflow.com/a/64361042/244258
-                            if !string.as_bytes().iter().all(u8::is_ascii_whitespace) {
-                                warn!("Line {} is invalid JSON, skipping. [{e}]", i + 1);
+                    }
+
+                    if let Some(Value::String(status)) = bid.get_mut("status") {
+                        if bid_status.contains_key(status) {
+                            *status = bid_status[status].clone();
+                        }
+                        if !BID_STATUS.contains(status.as_str()) {
+                            rows.serialize((i + 1, &ocid, "/bids/details[]/status", j, status, "invalid"))?;
+                        }
+                    } else if bid.get("status").is_none() {
+                        if let Some(default) = &bid_status_default {
+                            bid["status"] = default.clone();
+                        } else {
+                            rows.serialize((i + 1, &ocid, "/bids/details[]/status", j, "", "not set"))?;
+                        }
+                    }
+
+                    prepare_id_array!(bid, "tenderers");
+                }
+            }
+
+            // /awards
+            if let Some(Value::Array(awards)) = release.get_mut("awards") {
+                for (j, award) in awards.iter_mut().enumerate() {
+                    if let Some(Value::Array(items)) = award.get_mut("items") {
+                        for (k, item) in items.iter_mut().enumerate() {
+                            if let Some(Value::Object(classification)) = item.get_mut("classification")
+                                && !classification.contains_key("scheme")
+                            {
+                                if let Some(default) = &item_classification_scheme_default {
+                                    classification.insert("scheme".into(), default.clone());
+                                } else {
+                                    rows.serialize((i + 1, &ocid, "/awards[]/items[]/classification/scheme", format!("{j}.{k}"), "", "not set"))?;
+                                }
                             }
                         }
                     }
+
+                    if let Some(Value::String(status)) = award.get_mut("status") {
+                        if award_status.contains_key(status) {
+                            *status = award_status[status].clone();
+                        }
+                        if !AWARD_STATUS.contains(status.as_str()) {
+                            rows.serialize((i + 1, &ocid, "/awards[]/status", j, status, "invalid"))?;
+                        }
+                    } else if award.get("status").is_none() {
+                        if let Some(default) = &award_status_default {
+                            award["status"] = default.clone();
+                        } else {
+                            rows.serialize((i + 1, &ocid, "/awards[]/status", j, "", "not set"))?;
+                        }
+                    }
+
+                    prepare_id_array!(award, "suppliers");
                 }
-                // Err: https://doc.rust-lang.org/std/io/enum.ErrorKind.html
-                // https://github.com/rust-lang/rust/blob/1.65.0/library/std/src/io/buffered/bufreader.rs#L362-L365
-                Err(e) => warn!("Line {} caused an I/O error, skipping. [{e}]", i + 1),
             }
-        });
+
+            writeln!(output.lock().unwrap(), "{}", &serde_json::to_string(&release)?)?;
+            errors.lock().unwrap().write_all(&rows.into_inner()?)?;
+
+            Ok(())
+        })
     }
 }