From 4bdf34fd8ced59626636255ab50eaba18c108374 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 3 Aug 2021 20:27:25 +0200 Subject: [PATCH 1/3] added fwrite sep='' --- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 8 ++++++++ src/fwrite.c | 12 ++++++------ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 3f85ff1ea..c822b0567 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -42,7 +42,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } stopifnot(is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), - is.character(sep) && length(sep)==1L && nchar(sep) == 1L, + is.character(sep) && length(sep)==1L && (nchar(sep) == 1L || sep == ""), is.character(sep2) && length(sep2)==3L && nchar(sep2[2L])==1L, is.character(dec) && length(dec)==1L && nchar(dec) == 1L, dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e30cd255d..20b2300c2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17810,3 +17810,11 @@ setDF(d) d[1:50, "a"] = d[51:100, "a"] setDT(d) test(2200, nrow(d[a==99]), 2L) + +# fwrite now allows sep="", #4817 +test(2201.1, fwrite(data.frame(a="id", b=letters[1:5], c=1:5), sep=""), + output = c("abc", paste0("id", letters[1:5], 1:5))) +test(2201.2, fwrite(data.frame(a="id", b=1:1e2), sep=""), + output = c("ab", paste0("id", 1:1e2))) +test(2201.3, fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar')), sep=""), + output=c("ab", "foo", "2", "3.01bar")) diff --git a/src/fwrite.c b/src/fwrite.c index 7bad0cd16..2ce3403ba 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -716,13 +716,13 @@ void fwriteMain(fwriteMainArgs args) if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv - *ch++ = sep; + if (sep != '\0') *ch++ = sep; } for (int j=0; j=1 because 0-columns was caught earlier. + if (sep != '\0') ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. write_chars(args.eol, &ch); // overwrite last sep with eol instead } // compress buffer if gzip From 810ab5151badb38c07eb4f003975c7d801d01f95 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 4 Aug 2021 17:45:09 -0600 Subject: [PATCH 2/3] added news item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index f944a2ffb..b5b93e354 100644 --- a/NEWS.md +++ b/NEWS.md @@ -97,6 +97,8 @@ 15. New convenience function `%plike%` maps to `like(..., perl=TRUE)`, [#3702](https://github.com/Rdatatable/data.table/issues/3702). `%plike%` uses Perl-compatible regular expressions (PCRE) which extend TRE, and may be more efficient in some cases. Thanks @KyleHaynes for the suggestion and PR. +16. `fwrite()` now accepts `sep=""`, [#4817](https://github.com/Rdatatable/data.table/issues/4817). The motivation is an example where the result of `paste0()` needs to be written to file but `paste0()` takes 40 minutes due to constructing a very large number of unique long strings in R's global character cache. Allowing `fwrite(, sep="")` avoids the `paste0` and saves 40 mins. Thanks to Jan Gorecki for the request, and Ben Schwen for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. From 5ec80e5b55a6e731a5227c71a458896d65da7572 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 4 Aug 2021 18:03:08 -0600 Subject: [PATCH 3/3] replace branches with ch+=sepLen --- src/fwrite.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 2ce3403ba..f7f400318 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -35,6 +35,7 @@ // Globals for this file only. Written once to hold parameters passed from R level. static const char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files +static int sepLen; // 0 when sep="" for #4817, otherwise 1 static char sep2; // '|' within list columns. Used here to know if field should be quoted and in freadR.c to write sep2 in list columns static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 static int8_t doQuote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) @@ -590,6 +591,7 @@ void fwriteMain(fwriteMainArgs args) na = args.na; sep = args.sep; + sepLen = sep=='\0' ? 0 : 1; sep2 = args.sep2; dec = args.dec; scipen = args.scipen; @@ -635,10 +637,10 @@ void fwriteMain(fwriteMainArgs args) // could be console output) and writing column names to it. double t0 = wallclock(); - size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + 1/*sep*/); + size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + sepLen); if (args.doRowNames) { maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames, args.nrow)*2 : 1+(int)log10(args.nrow); // the width of the row number - maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + sepLen; } for (int j=0; j> column name) + headerLen += args.ncol*(sepLen+(doQuote!=0)*2) + eolLen + 3; // 3 in case doRowNames and doQuote (the first blank <<"",>> column name) } if (headerLen) { char *buff = malloc(headerLen); @@ -716,13 +718,15 @@ void fwriteMain(fwriteMainArgs args) if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv - if (sep != '\0') *ch++ = sep; + *ch = sep; + ch += sepLen; } for (int j=0; j=1 because 0-columns was caught earlier. + ch -= sepLen; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. write_chars(args.eol, &ch); // overwrite last sep with eol instead } // compress buffer if gzip