Skip to content

Commit

Permalink
Implement --csv-trim-leading-space flag (#1272)
Browse files Browse the repository at this point in the history
* Implement --csv-trim-leading-space flag

* Artifacts from `make dev`
  • Loading branch information
johnkerl authored Apr 20, 2023
1 parent ae61dc5 commit 5801f5c
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 13 deletions.
5 changes: 4 additions & 1 deletion docs/src/manpage.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ MILLER(1) MILLER(1)
fill remaining keys with empty string. If a data line
has more fields than the header line, use integer
field labels as in the implicit-header case.
--csv-trim-leading-space Trims leading spaces in CSV data. Use this for data
like '"foo", "bar"' which is non-RFC-4180 compliant,
but common.
--headerless-csv-output or --ho or --headerless-tsv-output
Print only CSV/TSV data lines; do not print CSV/TSV
header lines.
Expand Down Expand Up @@ -3354,5 +3357,5 @@ MILLER(1) MILLER(1)



2023-04-16 MILLER(1)
2023-04-20 MILLER(1)
</pre>
5 changes: 4 additions & 1 deletion docs/src/manpage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ MILLER(1) MILLER(1)
fill remaining keys with empty string. If a data line
has more fields than the header line, use integer
field labels as in the implicit-header case.
--csv-trim-leading-space Trims leading spaces in CSV data. Use this for data
like '"foo", "bar"' which is non-RFC-4180 compliant,
but common.
--headerless-csv-output or --ho or --headerless-tsv-output
Print only CSV/TSV data lines; do not print CSV/TSV
header lines.
Expand Down Expand Up @@ -3333,4 +3336,4 @@ MILLER(1) MILLER(1)



2023-04-16 MILLER(1)
2023-04-20 MILLER(1)
1 change: 1 addition & 0 deletions docs/src/reference-main-flag-list.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ These are flags which are applicable to CSV format.
**Flags:**

* `--allow-ragged-csv-input or --ragged or --allow-ragged-tsv-input`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case.
* `--csv-trim-leading-space`: Trims leading spaces in CSV data. Use this for data like '"foo", "bar"' which is non-RFC-4180 compliant, but common.
* `--headerless-csv-output or --ho or --headerless-tsv-output`: Print only CSV/TSV data lines; do not print CSV/TSV header lines.
* `--implicit-csv-header or --headerless-csv-input or --hi or --implicit-tsv-header`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers.
* `--lazy-quotes`: Accepts quotes appearing in unquoted fields, and non-doubled quotes appearing in quoted fields.
Expand Down
9 changes: 9 additions & 0 deletions internal/pkg/cli/option_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -2173,6 +2173,15 @@ var CSVTSVOnlyFlagSection = FlagSection{
},
},

{
name: "--csv-trim-leading-space",
help: `Trims leading spaces in CSV data. Use this for data like '"foo", "bar' which is non-RFC-4180 compliant, but common.`,
parser: func(args []string, argc int, pargi *int, options *TOptions) {
options.ReaderOptions.CSVTrimLeadingSpace = true
*pargi += 1
},
},

{
name: "--quote-all",
help: "Force double-quoting of CSV fields.",
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/cli/option_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ type TReaderOptions struct {
UseImplicitCSVHeader bool
AllowRaggedCSVInput bool
CSVLazyQuotes bool
CSVTrimLeadingSpace bool

CommentHandling TCommentHandling
CommentString string
Expand Down
19 changes: 11 additions & 8 deletions internal/pkg/input/record_reader_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ import (

// ----------------------------------------------------------------
type RecordReaderCSV struct {
readerOptions *cli.TReaderOptions
recordsPerBatch int64 // distinct from readerOptions.RecordsPerBatch for join/repl
ifs0 byte // Go's CSV library only lets its 'Comma' be a single character
csvLazyQuotes bool // Maps directly to Go's CSV library's LazyQuotes
readerOptions *cli.TReaderOptions
recordsPerBatch int64 // distinct from readerOptions.RecordsPerBatch for join/repl
ifs0 byte // Go's CSV library only lets its 'Comma' be a single character
csvLazyQuotes bool // Maps directly to Go's CSV library's LazyQuotes
csvTrimLeadingSpace bool // Maps directly to Go's CSV library's TrimLeadingSpace

filename string
rowNumber int64
Expand All @@ -40,10 +41,11 @@ func NewRecordReaderCSV(
return nil, fmt.Errorf("for CSV, IFS can only be a single character")
}
return &RecordReaderCSV{
readerOptions: readerOptions,
ifs0: readerOptions.IFS[0],
recordsPerBatch: recordsPerBatch,
csvLazyQuotes: readerOptions.CSVLazyQuotes,
readerOptions: readerOptions,
ifs0: readerOptions.IFS[0],
recordsPerBatch: recordsPerBatch,
csvLazyQuotes: readerOptions.CSVLazyQuotes,
csvTrimLeadingSpace: readerOptions.CSVTrimLeadingSpace,
}, nil
}

Expand Down Expand Up @@ -105,6 +107,7 @@ func (reader *RecordReaderCSV) processHandle(
csvReader := csv.NewReader(NewBOMStrippingReader(handle))
csvReader.Comma = rune(reader.ifs0)
csvReader.LazyQuotes = reader.csvLazyQuotes
csvReader.TrimLeadingSpace = reader.csvTrimLeadingSpace
csvRecordsChannel := make(chan *list.List, recordsPerBatch)
go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel,
recordsPerBatch)
Expand Down
5 changes: 4 additions & 1 deletion man/manpage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ MILLER(1) MILLER(1)
fill remaining keys with empty string. If a data line
has more fields than the header line, use integer
field labels as in the implicit-header case.
--csv-trim-leading-space Trims leading spaces in CSV data. Use this for data
like '"foo", "bar"' which is non-RFC-4180 compliant,
but common.
--headerless-csv-output or --ho or --headerless-tsv-output
Print only CSV/TSV data lines; do not print CSV/TSV
header lines.
Expand Down Expand Up @@ -3333,4 +3336,4 @@ MILLER(1) MILLER(1)



2023-04-16 MILLER(1)
2023-04-20 MILLER(1)
7 changes: 5 additions & 2 deletions man/mlr.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2023-04-16
.\" Date: 2023-04-20
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2023-04-16" "\ \&" "\ \&"
.TH "MILLER" "1" "2023-04-20" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -366,6 +366,9 @@ These are flags which are applicable to CSV format.
fill remaining keys with empty string. If a data line
has more fields than the header line, use integer
field labels as in the implicit-header case.
--csv-trim-leading-space Trims leading spaces in CSV data. Use this for data
like '"foo", "bar"' which is non-RFC-4180 compliant,
but common.
--headerless-csv-output or --ho or --headerless-tsv-output
Print only CSV/TSV data lines; do not print CSV/TSV
header lines.
Expand Down
2 changes: 2 additions & 0 deletions test/cases/help/0018/expout
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
--csv-trim-leading-space
Trims leading spaces in CSV data. Use this for data like '"foo", "bar' which is non-RFC-4180 compliant, but common.
--csv
Use CSV format for input and output data.
--csvlite
Expand Down

0 comments on commit 5801f5c

Please sign in to comment.