diff --git a/Cargo.toml b/Cargo.toml index 2161c05..ad9b1c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,13 +15,14 @@ keywords = ["cli", "json"] anyhow = "1.0" clap = { version = "4.0", features = ["derive", "wrap_help"] } config = {version = "0.13", features = ["ini"] } +csv = "1.0" human-panic = "1.0" indexmap = { version = "1.0", features = ["serde"] } log = "0.4" num_cpus = "1.0" pretty_env_logger = "0.4" rayon = "1.0" -serde = "1.0" +serde = { version ="1.0", features = ["derive"] } serde_json = { version = "1.0", features = ["preserve_order"] } statrs = "0.16" diff --git a/docs/changelog.md b/docs/changelog.md index d6e03bb..7039e38 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,14 @@ # Changelog +## 0.0.4 (2023-06-30) + +### Changed + +- {doc}`cli/prepare` command: + + - Add `--output` (`-o`) and `--errors` (`-e`) options, instead of using shell redirection. + - Fill in `/awards[]/items[]/classification/scheme` with `item_classification_scheme`. + ## 0.0.3 (2023-05-29) ### Added diff --git a/docs/cli/prepare.md b/docs/cli/prepare.md index 38dd980..e38dd2e 100644 --- a/docs/cli/prepare.md +++ b/docs/cli/prepare.md @@ -13,7 +13,7 @@ Corrected data is written to standard output as line-delimited JSON. Quality issues are written to standard error as CSV rows with the columns: line, ocid, path, array indexes, incorrect value, error description. -Usage: ocdscardinal[EXE] prepare [OPTIONS] +Usage: ocdscardinal[EXE] prepare [OPTIONS] --output --errors Arguments: @@ -27,6 +27,12 @@ Options: -v, --verbose... Increase verbosity + -o, --output + The file to which to write corrected data (or "-" for standard output) + + -e, --errors + The file to which to write quality issues (or "-" for standard output) + -h, --help Print help (see a summary with '-h') @@ -48,7 +54,7 @@ Before following this command's workflow, follow the earlier steps in the {doc}` 1. Run the `prepare` command. 
For example, if your data is in `input.jsonl`, this command writes the corrected data to `prepared.jsonl` and the quality issues to `issues.csv`: ```bash - ocdscardinal prepare --settings settings.ini input.jsonl > prepared.jsonl 2> issues.csv + ocdscardinal prepare --settings settings.ini --output prepared.jsonl --errors issues.csv input.jsonl ``` 1. Review the quality issues in the `issues.csv` file. Don't worry if many issues are reported: most are repetitive and can be fixed at once. Read the [demonstration](#demonstration) to learn how to interpret results. @@ -63,16 +69,6 @@ This command is designed to only warn about quality issues (1) that it can fix a ## Demonstration -Corrected data is written to standard output. Quality issues are written to standard error. - -Without redirection (`>`), standard output and standard error are both written to the console. - -It is recommended to redirect standard output and standard error to separate files. For example: - -```bash -ocdscardinal prepare --settings settings.ini input.jsonl > prepared.jsonl 2> issues.csv -``` - ::::{admonition} Example :class: seealso @@ -84,11 +80,11 @@ This simplified file contains a bid without a status: :language: json ::: -Without redirection, the `prepare` command writes both the quality issue and the (unchanged) data to the console: +For this demonstration, write both the quality issue and the (unchanged) data to the console: ```console -$ ocdscardinal prepare docs/examples/prepare.jsonl -1,"ocds-213czf-1",/bids/details[]/status,0,,not set +$ ocdscardinal prepare --output - --errors - docs/examples/prepare.jsonl +1,ocds-213czf-1,/bids/details[]/status,0,,not set {"ocid":"ocds-213czf-1","bids":{"details":[{"id":1}]}} ``` @@ -101,7 +97,7 @@ Quality issues are reported as CSV rows. Adding a header and rendering the row a 1,"ocds-213czf-1",/bids/details[]/status,0,,not set ::: -If you redirect the quality issues to a file, you can open the CSV as a spreadsheet. 
+If you write the quality issues to a file instead of the console, you can open the CSV as a spreadsheet. :::: @@ -156,8 +152,9 @@ This behavior can't be disabled. If you need to disable it, [create an issue on The command supports filling in: - `/bids/details[]/value/currency` -- `/bids/details[]/items/classification/scheme` +- `/bids/details[]/items[]/classification/scheme` - `/bids/details[]/status` +- `/awards[]/items[]/classification/scheme` - `/awards[]/status` To fill in one or more of these fields when the field isn't set, add a `[defaults]` section with relevant properties to your {doc}`../topics/settings`. For example: diff --git a/src/lib.rs b/src/lib.rs index bbcc2c3..caae1bc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,8 +6,9 @@ pub mod standard; use std::collections::HashMap; use std::fs::File; -use std::io::{self, BufRead, Write}; +use std::io::{self, BufRead, BufWriter, Write}; use std::path::PathBuf; +use std::sync::{Arc, Mutex}; use anyhow::Result; use indexmap::IndexMap; @@ -280,9 +281,20 @@ impl Prepare { /// # Panics /// #[allow(clippy::cognitive_complexity)] - pub fn run(buffer: impl BufRead + Send, settings: Settings) { + #[allow(clippy::too_many_lines)] + // https://github.com/rust-lang/rust-clippy/issues/10413 + #[allow(clippy::significant_drop_tightening)] + pub fn run<W: Write + Send>( + buffer: impl BufRead + Send, + settings: Settings, + output: &mut W, + errors: &mut W, + ) -> Result<(), anyhow::Error> { let default = HashMap::new(); + let output = Arc::new(Mutex::new(BufWriter::new(output))); + let errors = Arc::new(Mutex::new(BufWriter::new(errors))); + let defaults = settings.defaults.unwrap_or_default(); // Closed codelists. 
let currency_default = defaults.currency.map(Value::String); @@ -294,110 +306,127 @@ impl Prepare { let bid_status = codelists.get(&Codelist::BidStatus).unwrap_or(&default); let award_status = codelists.get(&Codelist::AwardStatus).unwrap_or(&default); - buffer.lines().enumerate().par_bridge().for_each(|(i, lines_result)| { - match lines_result { - Ok(string) => { - match serde_json::from_str(&string) { - Ok(value) => { - if let Value::Object(mut release) = value { - let ocid = release.get("ocid").map_or_else(|| Value::Null, std::clone::Clone::clone); + buffer.lines().enumerate().par_bridge().try_for_each(|(i, lines)| -> Result<(), anyhow::Error> { + // Use guard clauses to reduce indentation and ease readability. + let string = match lines { + Ok(string) => string, + Err(e) => return Ok(warn!("Line {} caused an I/O error, skipping. [{e}]", i + 1)), + }; + + let mut value: Value = match serde_json::from_str(&string) { + Ok(value) => value, + Err(e) => { + if !string.as_bytes().iter().all(u8::is_ascii_whitespace) { + warn!("Line {} is invalid JSON, skipping. 
[{e}]", i + 1); + } + return Ok(()); + } + }; - prepare_id_object!(release, "buyer"); + let Some(release) = value.as_object_mut() else { + return Ok(warn!("Line {} is not a JSON object, skipping.", i + 1)) + }; - // /tender - if let Some(Value::Object(tender)) = release.get_mut("tender") { - prepare_id_object!(tender, "procuringEntity"); - } + let mut rows = csv::Writer::from_writer(vec![]); - // /bids - if let Some(Value::Object(bids)) = release.get_mut("bids") - && let Some(Value::Array(details)) = bids.get_mut("details") - { - for (j, bid) in details.iter_mut().enumerate() { - if let Some(Value::Object(value)) = bid.get_mut("value") - && !value.contains_key("currency") - { - currency_default.as_ref().map_or_else(|| { - eprintln!("{},{ocid},/bids/details[]/value/currency,{j},,not set", i + 1); - }, |currency| { - value.insert("currency".into(), currency.clone()); - }); - } - - if let Some(Value::Array(items)) = bid.get_mut("items") { - for (k, item) in items.iter_mut().enumerate() { - if let Some(Value::Object(classification)) = item.get_mut("classification") - && !classification.contains_key("scheme") - { - item_classification_scheme_default.as_ref().map_or_else(|| { - eprintln!("{},{ocid},/bids/details[]/items[]/classification/scheme,{j}.{k},,not set", i + 1); - }, |scheme| { - classification.insert("scheme".into(), scheme.clone()); - }); - } - } - } - - if let Some(Value::String(status)) = bid.get_mut("status") { - if bid_status.contains_key(status) { - *status = bid_status[status].clone(); - } - if !BID_STATUS.contains(status.as_str()) { - eprintln!("{},{ocid},/bids/details[]/status,{j},\"{status}\",invalid", i + 1); - } - } else if bid.get("status").is_none() { - bid_status_default.as_ref().map_or_else(|| { - eprintln!("{},{ocid},/bids/details[]/status,{j},,not set", i + 1); - }, |status| { - bid["status"] = status.clone(); - }); - } - - prepare_id_array!(bid, "tenderers"); - } - } + let ocid = release.get("ocid").map_or_else(|| Value::Null, 
std::clone::Clone::clone); - prepare_id_object!(release, "buyer"); + prepare_id_object!(release, "buyer"); + + // /tender + if let Some(Value::Object(tender)) = release.get_mut("tender") { + prepare_id_object!(tender, "procuringEntity"); + } + + // /bids + if let Some(Value::Object(bids)) = release.get_mut("bids") + && let Some(Value::Array(details)) = bids.get_mut("details") + { + for (j, bid) in details.iter_mut().enumerate() { + if let Some(Value::Object(value)) = bid.get_mut("value") + && !value.contains_key("currency") + { + if let Some(default) = &currency_default { + value.insert("currency".into(), default.clone()); + } else { + rows.serialize((i + 1, &ocid, "/bids/details[]/value/currency", j, "", "not set"))?; + } + } - println!("{}", serde_json::to_string(&release).unwrap()); - } else { - warn!("Line {} is not a JSON object, skipping.", i + 1); + if let Some(Value::Array(items)) = bid.get_mut("items") { + for (k, item) in items.iter_mut().enumerate() { + if let Some(Value::Object(classification)) = item.get_mut("classification") + && !classification.contains_key("scheme") + { + if let Some(default) = &item_classification_scheme_default { + classification.insert("scheme".into(), default.clone()); + } else { + rows.serialize((i + 1, &ocid, "/bids/details[]/items[]/classification/scheme", format!("{j}.{k}"), "", "not set"))?; + } } } - 
Err(e) => { - // Skip empty lines silently. - // https://stackoverflow.com/a/64361042/244258 - if !string.as_bytes().iter().all(u8::is_ascii_whitespace) { - warn!("Line {} is invalid JSON, skipping. [{e}]", i + 1); + } + + if let Some(Value::String(status)) = bid.get_mut("status") { + if bid_status.contains_key(status) { + *status = bid_status[status].clone(); + } + if !BID_STATUS.contains(status.as_str()) { + rows.serialize((i + 1, &ocid, "/bids/details[]/status", j, status, "invalid"))?; + } + } else if bid.get("status").is_none() { + if let Some(default) = &bid_status_default { + bid["status"] = default.clone(); + } else { + rows.serialize((i + 1, &ocid, "/bids/details[]/status", j, "", "not set"))?; + } + } + + prepare_id_array!(bid, "tenderers"); + } + } + + // /awards + if let Some(Value::Array(awards)) = release.get_mut("awards") { + for (j, award) in awards.iter_mut().enumerate() { + if let Some(Value::Array(items)) = award.get_mut("items") { + for (k, item) in items.iter_mut().enumerate() { + if let Some(Value::Object(classification)) = item.get_mut("classification") + && !classification.contains_key("scheme") + { + if let Some(default) = &item_classification_scheme_default { + classification.insert("scheme".into(), default.clone()); + } else { + rows.serialize((i + 1, &ocid, "/awards[]/items[]/classification/scheme", format!("{j}.{k}"), "", "not set"))?; + } } } } + + if let Some(Value::String(status)) = award.get_mut("status") { + if award_status.contains_key(status) { + *status = award_status[status].clone(); + } + if !AWARD_STATUS.contains(status.as_str()) { + rows.serialize((i + 1, &ocid, "/awards[]/status", j, status, "invalid"))?; + } + } else if award.get("status").is_none() { + if let Some(default) = &award_status_default { + award["status"] = default.clone(); + } else { + rows.serialize((i + 1, &ocid, "/awards[]/status", j, "", "not set"))?; + } + } + + prepare_id_array!(award, "suppliers"); } - // Err: 
https://doc.rust-lang.org/std/io/enum.ErrorKind.html - // https://github.com/rust-lang/rust/blob/1.65.0/library/std/src/io/buffered/bufreader.rs#L362-L365 - Err(e) => warn!("Line {} caused an I/O error, skipping. [{e}]", i + 1), } - }); + + writeln!(output.lock().unwrap(), "{}", &serde_json::to_string(&release)?)?; + errors.lock().unwrap().write_all(&rows.into_inner()?)?; + + Ok(()) + }) } } diff --git a/src/main.rs b/src/main.rs index 42cdeef..7dc07e7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ #![feature(unix_sigpipe)] use std::fs::File; -use std::io::{self, BufReader, Read}; +use std::io::{self, BufReader, Read, Write}; use std::path::{Path, PathBuf}; use std::process; @@ -48,6 +48,12 @@ enum Commands { /// The path to the settings file #[arg(long, short, value_parser = settings_parser)] settings: Option, + /// The file to which to write corrected data (or "-" for standard output) + #[arg(long, short)] + output: PathBuf, + /// The file to which to write quality issues (or "-" for standard output) + #[arg(long, short)] + errors: PathBuf, }, /// Calculate procurement indicators from OCDS compiled releases in a line-delimited JSON file /// @@ -89,6 +95,11 @@ fn application_error(e: &anyhow::Error) -> ! { process::exit(1); } +fn input_output_error(message: &str, e: &io::Error) -> ! 
{ + eprintln!("I/O error: {message}: {e:#}"); + process::exit(1); +} + fn reader(file: &PathBuf) -> BufReader<Box<dyn Read + Send>> { if file == &PathBuf::from("-") { BufReader::new(Box::new(io::stdin())) @@ -104,6 +115,16 @@ fn reader(file: &PathBuf) -> BufReader<Box<dyn Read + Send>> { } } +fn create(file: &PathBuf) -> Box<dyn Write + Send> { + if file == &PathBuf::from("-") { + Box::new(io::stdout()) + } else { + Box::new(File::create(file).unwrap_or_else(|e| { + input_output_error(&format!("Couldn't open {file:?} for writing"), &e); + })) + } +} + #[unix_sigpipe = "sig_dfl"] fn main() { setup_panic!(); @@ -122,24 +143,28 @@ fn main() { match &cli.command { Commands::Init { file } => match ocdscardinal::init(file) { - Err(e) => { - eprintln!("Error writing to {file:?}: {e}"); - } - Ok(false) => { - println!("Settings written to {file:?}."); - } - _ => {} + Err(e) => eprintln!("Error writing to {file:?}: {e}"), + Ok(false) => println!("Settings written to {file:?}."), + Ok(true) => {} // written to standard output }, Commands::Coverage { file } => match ocdscardinal::Coverage::run(reader(file)) { - Ok(item) => { - println!("{:?}", item.results()); - } - Err(e) => { + Ok(item) => println!("{:?}", item.results()), + Err(e) => application_error(&e), + }, + Commands::Prepare { + file, + settings, + output, + errors, + } => { + if let Err(e) = ocdscardinal::Prepare::run( + reader(file), + settings.clone().unwrap_or_default(), + &mut create(output), + &mut create(errors), + ) { application_error(&e); } - }, - Commands::Prepare { file, settings } => { - ocdscardinal::Prepare::run(reader(file), settings.clone().unwrap_or_default()); } Commands::Indicators { file, count, settings } => { match ocdscardinal::Indicators::run(reader(file), settings.clone().unwrap_or_default()) { @@ -151,9 +176,7 @@ fn main() { } } } - Err(e) => { - application_error(&e); - } + Err(e) => application_error(&e), } } }