Fwrite integer rownames (#5098)

Rdatatable · Aug 10, 2021 · c3d1100 · c3d1100
1 parent 78da3bd
commit c3d1100
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 24 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -163,6 +163,8 @@
 
 30. `fread(file=URL)` now works rather than failing with the error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR.
 
+31. `fwrite(DF, row.names=TRUE)` where `DF` has specific integer rownames (e.g. using `rownames(DF) <- c(10L,20L,30L)`) would ignore the integer rownames and write the row numbers instead, [#4957](https://github.com/Rdatatable/data.table/issues/4957). Thanks to @dgarrimar for reporting and @ColeMiller1 for the PR. Further, when `quote='auto'` (default) and the rownames are integers (either default or specific), they are no longer quoted.
+
 ## NOTES
 
 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -10706,23 +10706,30 @@ test(1733.2, fwrite(data.table(c(1.2,-8.0,pi,67.99),1:4),dec=",",sep=";"),
 
 # fwrite implied and actual row.names
 DT = data.table(foo=1:3,bar=c(1.2,9.8,-6.0))
-test(1734.1, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)),
-             capture.output(write.csv(DT,quote=FALSE)))
-test(1734.2, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)),
-             capture.output(write.csv(DT)))
-test(1734.3, fwrite(DT,row.names=TRUE,quote='auto'),   # same other than 'foo' and 'bar' column names not quoted
-             output="\"\",foo,bar\n\"1\",1,1.2\n\"2\",2,9.8\n\"3\",3,-6")
+test(1734.01, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)),
+              capture.output(write.csv(DT,quote=FALSE)))
+test(1734.02, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)),
+              capture.output(write.csv(DT)))
+test(1734.03, fwrite(DT,row.names=TRUE,quote='auto'),   # same other than 'foo' and 'bar' column names not quoted
+              output="\"\",foo,bar\n1,1,1.2\n2,2,9.8\n3,3,-6")
 DF = as.data.frame(DT)
-test(1734.4, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)),
-             capture.output(write.csv(DF,quote=FALSE)))
-test(1734.5, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)),
-             capture.output(write.csv(DF)))
+test(1734.04, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)),
+              capture.output(write.csv(DF,quote=FALSE)))
+test(1734.05, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)),
+              capture.output(write.csv(DF)))
 rownames(DF)[2] = "someName"
 rownames(DF)[3] = "another"
-test(1734.6, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)),
-             capture.output(write.csv(DF,quote=FALSE)))
-test(1734.7, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)),
-             capture.output(write.csv(DF)))
+test(1734.06, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)),
+              capture.output(write.csv(DF,quote=FALSE)))
+test(1734.07, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)),
+              capture.output(write.csv(DF)))
+rownames(DF) = c(10L, -20L, 30L) ## test for #4957
+test(1734.08, capture.output(fwrite(DF, row.names=TRUE, quote=TRUE)),
+              capture.output(write.csv(DF)))
+test(1734.09, capture.output(fwrite(DF, row.names=TRUE, quote=FALSE)),
+              capture.output(write.csv(DF, quote=FALSE)))
+test(1734.10, fwrite(DF, row.names=TRUE, quote='auto'),
+              output=c('"",foo,bar','10,1,1.2','-20,2,9.8','30,3,-6'))
 
 # list columns and sep2
 set.seed(1)

diff --git a/src/fwrite.c b/src/fwrite.c
@@ -623,8 +623,8 @@ void fwriteMain(fwriteMainArgs args)
       DTPRINT(_("... "));
       for (int j=args.ncol-10; j<args.ncol; j++) DTPRINT(_("%d "), args.whichFun[j]);
     }
-    DTPRINT(_("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%"PRId64" args.ncol=%d eolLen=%d\n"),
-          args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen);
+    DTPRINT(_("\nargs.doRowNames=%d args.rowNames=%p args.rowNameFun=%d doQuote=%d args.nrow=%"PRId64" args.ncol=%d eolLen=%d\n"),
+          args.doRowNames, args.rowNames, args.rowNameFun, doQuote, args.nrow, args.ncol, eolLen);
   }
 
   // Calculate upper bound for line length. Numbers use a fixed maximum (e.g. 12 for integer) while strings find the longest
@@ -639,8 +639,10 @@ void fwriteMain(fwriteMainArgs args)
   double t0 = wallclock();
   size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + sepLen);
   if (args.doRowNames) {
-    maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames, args.nrow)*2 : 1+(int)log10(args.nrow);  // the width of the row number
-    maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + sepLen;
+    maxLineLen += args.rowNames==NULL ? 1+(int)log10(args.nrow)   // the width of the row number
+                  : (args.rowNameFun==WF_String ? getMaxStringLen(args.rowNames, args.nrow)*2  // *2 in case longest row name is all quotes (!) and all get escaped
+                  : 11); // specific integer names could be MAX_INT 2147483647 (10 chars) even on a 5 row table, and data.frame allows negative integer rownames hence 11 for the sign
+    maxLineLen += 2/*possible quotes*/ + sepLen;
   }
   for (int j=0; j<args.ncol; j++) {
     int width = writerMaxLen[args.whichFun[j]];
@@ -871,15 +873,17 @@ void fwriteMain(fwriteMainArgs args)
       if (failed) continue;  // Not break. Because we don't use #omp cancel yet.
       int64_t end = ((args.nrow - start)<rowsPerBatch) ? args.nrow : start + rowsPerBatch;
       for (int64_t i=start; i<end; i++) {
-        // Tepid starts here (once at beginning of each per line)
+        // Tepid starts here (once at beginning of each line)
         if (args.doRowNames) {
           if (args.rowNames==NULL) {
-            if (doQuote!=0/*NA'auto' or true*/) *ch++='"';
+            if (doQuote==1) *ch++='"';
             int64_t rn = i+1;
             writeInt64(&rn, 0, &ch);
-            if (doQuote!=0) *ch++='"';
+            if (doQuote==1) *ch++='"';
           } else {
-            writeString(args.rowNames, i, &ch);
+            if (args.rowNameFun != WF_String && doQuote==1) *ch++='"';
+            (args.funs[args.rowNameFun])(args.rowNames, i, &ch);  // #5098
+            if (args.rowNameFun != WF_String && doQuote==1) *ch++='"';
           }
           *ch = sep;
           ch += sepLen;

diff --git a/src/fwrite.h b/src/fwrite.h
@@ -84,7 +84,8 @@ typedef struct fwriteMainArgs
 
   const void *colNames;   // NULL means no header, otherwise ncol strings
   bool doRowNames;        // optional, likely false
-  const void *rowNames;   // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output.
+  const void *rowNames;   // if doRowNames is true and rowNames is NULL then row numbers are output
+  uint8_t rowNameFun;     // when rowNames is not NULL, which writer to use for them
   char sep;
   char sep2;
   char dec;

diff --git a/src/fwriteR.c b/src/fwriteR.c
@@ -256,10 +256,22 @@ SEXP fwriteR(
   // so we need a separate boolean flag as well as the row names should they exist (rare)
   args.doRowNames = LOGICAL(rowNames_Arg)[0];
   args.rowNames = NULL;
+  args.rowNameFun = 0;
   if (args.doRowNames) {
     SEXP rn = PROTECT(getAttrib(DF, R_RowNamesSymbol));
     protecti++;
-    args.rowNames = isString(rn) ? DATAPTR_RO(rn) : NULL;
+    if (isInteger(rn)) {
+      if (xlength(rn)!=2 || INTEGER(rn)[0]==NA_INTEGER) {
+        // not R's default rownames c(NA,-nrow)
+        if (xlength(rn) != args.nrow)
+          error(_("input has specific integer rownames but their length (%"PRId64") != nrow (%"PRId64")"), xlength(rn), args.nrow);  // # nocov
+        args.rowNames = INTEGER(rn);
+        args.rowNameFun = WF_Int32;
+      }
+    } else if (isString(rn)) {
+      args.rowNames = DATAPTR_RO(rn);
+      args.rowNameFun = WF_String;
+    }
   }
 
   args.sep = *CHAR(STRING_ELT(sep_Arg, 0));  // DO NOT DO: allow multichar separator (bad idea)