Rdatatable · mattdowle · Jul 14, 2020 · May 20, 2020 · May 20, 2020 · May 21, 2020
@@ -81,6 +81,8 @@ unit = "s")
 
 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.
 
+15. `fread` now supports native parsing of standard date and time formats: dates like `%Y-%m-%d`, timestamps like `%Y-%m-%d %H:%M:%OS`, and timestamps in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format, i.e. like `%Y-%m-%dT%H:%M:%OS%Z`, where `%Z` is a time zone marker such as `Z` for UTC or `+%H:%M`/`+%H%M`/`+%H` to encode a UTC offset. Detected dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and detected times are returned as `POSIXct` columns with time zone set to `UTC` (with offset indicated by `%Z` applied). To display this column in the time zone of your choosing, recall that time zones in `POSIXct` vectors are just printing artifacts -- all `POSIXct` columns use "numeric seconds since 1970-01-01 00:00:00 UTC" as the underlying representation, and the time zone is only applied when `print` (or `format`) is invoked. As such, changing time zones can be done near instantaneously & with no copies by `setattr(tzcol, 'tzone', 'Asia/Yangon')` (or any other time zone available in `OlsonNames()`).
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).

@@ -8609,6 +8609,8 @@ if (test_R.utils) {
 # fix for #1573
 ans1 = fread(testDir("issue_1573_fill.txt"), fill=TRUE, na.strings="")
 ans2 = setDT(read.table(testDir("issue_1573_fill.txt"), header=TRUE, fill=TRUE, stringsAsFactors=FALSE, na.strings=""))
+date_cols = c('SD2', 'SD3', 'SD4')
+ans2[ , (date_cols) := lapply(.SD, as.IDate), .SDcols = date_cols]
 test(1622.1, ans1, ans2)
 test(1622.2, ans1, fread(testDir("issue_1573_fill.txt"), fill=TRUE, sep=" ", na.strings=""))
 
@@ -10720,7 +10722,7 @@ test(1743.08, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "int
 test(1743.09, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex"))
 test(1743.10, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), c(a="Date",b="integer",c="complex",d="integer"))
 test(1743.11, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), c(a="Date",b="integer",c="complex",d="raw"))
-test(1743.12, x = vapply(fread("a,b\n2015-01-01,2015-01-01", colClasses = c(NA, "IDate")), inherits, what = "IDate", FUN.VALUE = logical(1)), y = c(a=FALSE, b=TRUE))
+test(1743.12, x = vapply(fread("a,b\n1+3i,2015-01-01", colClasses = c(NA, "IDate")), inherits, what = "IDate", FUN.VALUE = logical(1)), y = c(a=FALSE, b=TRUE))
 
 ## Attempts to impose incompatible colClasses is a warning (not an error)
 ## and does not change the value of the columns
@@ -16853,3 +16855,34 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN))
 test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
 A = data.table(A=as.complex(rep(NA, 5)))
 test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A)))
+
+# native reading of [-]?[0-9]+[-][0-9]{2}[-][0-9]{2} dates and
+#   <date>[T ][0-9]{2}[:][0-9]{2}[:][0-9]{2}(?:[.][0-9]+)?(?:Z|[+-][0-9]{2}(?:[:]?[0-9]{2})?)? timestamps
+# Each test writes DT to a temp file in one textual format and checks that fread()
+# recovers an IDate column and a UTC POSIXct column without any colClasses hint.
+dates = as.IDate(c(9610, 19109, 19643, 20385, -1413, 9847, 4116, -11145, -2327, 1760))
+times = .POSIXct(tz = 'UTC', c(
+  937402277.067304, -626563403.382897, -506636228.039861, -2066740882.02417,
+  -2398617863.28256, -1054008563.60793, 1535199547.55902, 2075410085.54399,
+  1201364458.72486, 939956943.690777
+))
+DT = data.table(dates, times)
+
+tmp = tempfile()
+
+## ISO8601 format (%FT%TZ) by default
+fwrite(DT, tmp)
+test(2139.01, fread(tmp), DT)
+
+## timestamps as written by fwrite's dateTimeAs='write.csv' option
+fwrite(DT, tmp, dateTimeAs='write.csv')
+test(2139.02, fread(tmp), DT)
+
+## explicit zero UTC offset, with a colon ...
+fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp)
+test(2139.03, fread(tmp), DT)
+
+## ... and without one
+fwrite(copy(DT)[ , times := format(times, '%FT%T+0000')], tmp)
+test(2139.04, fread(tmp), DT)
+
+## non-zero offsets: the written wall-clock time is ahead of UTC by the offset,
+##   so the instant read back is earlier by 1h15m = 4500s ...
+fwrite(copy(DT)[ , times := format(times, '%FT%T+0115')], tmp)
+test(2139.05, fread(tmp), copy(DT)[ , times := times - 4500])
+
+## ... or by 1h = 3600s when only the hour part of the offset is given
+fwrite(copy(DT)[ , times := format(times, '%FT%T+01')], tmp)
+test(2139.06, fread(tmp), copy(DT)[ , times := times - 3600])
+
+## don't leave the scratch file behind in the session temp directory
+unlink(tmp)
@@ -2,11 +2,11 @@
 \alias{fread}
 \title{ Fast and friendly file finagler }
 \description{
-   Similar to \code{read.table} but faster and more convenient. All controls such as \code{sep}, \code{colClasses} and \code{nrows} are automatically detected. \code{bit64::integer64} types are also detected and read directly without needing to read as character before converting.
+   Similar to \code{read.table} but faster and more convenient. All controls such as \code{sep}, \code{colClasses} and \code{nrows} are automatically detected.
 
-   Dates are read as character currently. They can be converted afterwards using the excellent \code{fasttime} package or standard base functions.
+   \code{bit64::integer64}, \code{\link{IDate}}, and \code{\link{POSIXct}} types are also detected and read directly without needing to read as character before converting.
 
-   `fread` is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector.
+   \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector.
 }
 \usage{
 fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"",

@@ -66,8 +66,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL;
 static lenOff *colNames = NULL;
 static freadMainArgs args;  // global for use by DTPRINT
 
-const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "string"};
-int8_t     typeSize[NUMTYPE]     = { 0,      1,       1,       1,       1,       4,       8,       8,         8,         8,         8      };
+const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"};
+int8_t     typeSize[NUMTYPE]     = { 0,      1,       1,       1,       1,       4,       8,       8,         8,         8,         4,       8       , 8       };
 
 // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043
 // So we assign them through below init function.
@@ -571,11 +571,9 @@ static void Field(FieldParseContext *ctx)
   }
 }
 
-
-static void StrtoI32(FieldParseContext *ctx)
+static void str_to_i32_core(const char **pch, int32_t *target)
 {
-  const char *ch = *(ctx->ch);
-  int32_t *target = (int32_t*) ctx->targets[sizeof(int32_t)];
+  const char *ch = *pch;
 
   if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return;
   bool neg = *ch=='-';
@@ -605,12 +603,17 @@ static void StrtoI32(FieldParseContext *ctx)
   //     (acc==0 && ch-start==1) ) {
   if ((sf || ch>start) && sf<=10 && acc<=INT32_MAX) {
     *target = neg ? -(int32_t)acc : (int32_t)acc;
-    *(ctx->ch) = ch;
+    *pch = ch;
   } else {
     *target = NA_INT32;  // empty field ideally, contains NA and fall through to check if NA (in which case this write is important), or just plain invalid
   }
 }
 
+// Field parser for int32 columns: hands the context's cursor and the target slot
+// indexed by sizeof(int32_t) to str_to_i32_core(), which does the actual parsing.
+static void StrtoI32(FieldParseContext *ctx)
+{
+  int32_t *dest = (int32_t*) ctx->targets[sizeof(int32_t)];
+  str_to_i32_core(ctx->ch, dest);
+}
+
 
 static void StrtoI64(FieldParseContext *ctx)
 {
@@ -669,11 +672,10 @@ cat("1.0E300L\n};\n", file=f, append=TRUE)
  * of precision, for example `1.2439827340958723094785103` will not be parsed
  * as a double.
  */
-static void parse_double_regular(FieldParseContext *ctx)
+static void parse_double_regular_core(const char **pch, double *target)
 {
   #define FLOAT_MAX_DIGITS 18
-  const char *ch = *(ctx->ch);
-  double *target = (double*) ctx->targets[sizeof(double)];
+  const char *ch = *pch;
 
   if (*ch=='0' && args.keepLeadingZeros && (uint_fast8_t)(ch[1]-'0')<10) return;
   bool neg, Eneg;
@@ -784,13 +786,16 @@ static void parse_double_regular(FieldParseContext *ctx)
 
   r *= pow10lookup[e];
   *target = (double)(neg? -r : r);
-  *(ctx->ch) = ch;
+  *pch = ch;
   return;
 
   fail:
     *target = NA_FLOAT64;
 }
 
+// Field parser for regular (decimal) doubles: hands the context's cursor and the target
+// slot indexed by sizeof(double) to parse_double_regular_core(), which does the parsing.
+static void parse_double_regular(FieldParseContext *ctx) {
+  double *dest = (double*) ctx->targets[sizeof(double)];
+  parse_double_regular_core(ctx->ch, dest);
+}
 
 
 /**
@@ -937,6 +942,126 @@ static void parse_double_hexadecimal(FieldParseContext *ctx)
     *target = NA_FLOAT64;
 }
 
+/*
+f = 'src/freadLookups.h'
+cat('const double cumDaysCycleYears[401] = {\n', file=f, append=TRUE)
+t = format(as.double(difftime(as.Date(sprintf('%04d-01-01', 1600:1999)), .Date(0), units='days')))
+rows = paste0(apply(matrix(t, ncol = 4L, byrow = TRUE), 1L, paste, collapse = ', '), ',\n')
+cat(rows, sep='', file=f, append=TRUE)
+cat(146097, '// total days in 400 years\n};\n', sep = '', file=f, append=TRUE)
+*/
+/*
+ * Parse a date written as [-]YYYY-MM-DD at *pch and store it in *target as the number
+ * of days since 1970-01-01.
+ *
+ * pch    - in/out cursor. It is advanced past the day digits only when parsing succeeds;
+ *          on failure it is left where it was.
+ * target - receives the day count on success, NA_INT32 otherwise.
+ *
+ * Leap years follow the rule coded below: divisible by 4, except century years that are
+ * not divisible by 400.
+ */
+static void parse_iso8601_date_core(const char **pch, int32_t *target)
+{
+  const char *ch = *pch;
+
+  // Pre-set to NA: str_to_i32_core() has an early return that leaves its target
+  // unwritten, and such a component must be rejected below rather than read uninitialized.
+  int32_t year = NA_INT32, month = NA_INT32, day = NA_INT32;
+
+  str_to_i32_core(&ch, &year);
+
+  // .Date(.Machine$integer.max*c(-1, 1)):
+  //  -5877641-06-24 -- 5881580-07-11
+  //  rather than fiddle with dates within those terminal years (unlikely
+  //  to be showing up in data sets any time soon), just truncate towards 0
+  if (year == NA_INT32 || year < -5877640 || year > 5881579 || *ch != '-')
+    goto fail;
+
+  bool isLeapYear = year % 4 == 0 && (year % 100 != 0 || year/100 % 4 == 0);
+  ch++;
+
+  str_to_i32_core(&ch, &month);
+  if (month == NA_INT32 || month < 1 || month > 12 || *ch != '-')
+    goto fail;
+  ch++;
+
+  str_to_i32_core(&ch, &day);
+  if (day == NA_INT32 || day < 1 ||
+      (day > (isLeapYear ? leapYearDays[month-1] : normYearDays[month-1])))
+    goto fail;
+
+  // C's / and % truncate toward zero, so for a negative year `year % 400` is negative and
+  // would index before the start of cumDaysCycleYears. Use floored division instead so
+  // the position within the 400-year cycle always lies in [0, 399].
+  int32_t cycle = year / 400, yearInCycle = year % 400;
+  if (yearInCycle < 0) {
+    yearInCycle += 400;
+    cycle--;
+  }
+
+  *target =
+    (cycle - 4)*cumDaysCycleYears[400] + // days to beginning of 400-year cycle (cycle 4 starts at 1600, the table's base year)
+    cumDaysCycleYears[yearInCycle] + // days to beginning of year within 400-year cycle
+    (isLeapYear ? cumDaysCycleMonthsLeap[month-1] : cumDaysCycleMonthsNorm[month-1]) + // days to beginning of month within year
+    day-1; // day within month (subtract 1: 1970-01-01 -> 0)
+
+  *pch = ch;
+  return;
+
+  fail:
+    // target is an int32_t, so the integer NA sentinel is the one callers compare against
+    *target = NA_INT32;
+}
+
+// Field parser for date columns: hands the context's cursor and the target slot indexed
+// by sizeof(int32_t) to parse_iso8601_date_core(), which does the actual parsing.
+static void parse_iso8601_date(FieldParseContext *ctx) {
+  int32_t *dest = (int32_t*) ctx->targets[sizeof(int32_t)];
+  parse_iso8601_date_core(ctx->ch, dest);
+}
+
+/*
+ * Parse a timestamp of the form <date>[T ]HH:MM:SS[.fff] followed by an optional zone
+ * suffix, and store it in the context's double target as seconds since
+ * 1970-01-01 00:00:00 UTC.
+ *
+ * Recognized zone suffixes: 'Z', or (optionally after one space) a signed offset written
+ * as [+-]HH:MM, [+-]HHMM or [+-]HH. The offset is subtracted so the stored value is UTC.
+ *
+ * On success the context's cursor is advanced past everything consumed. On failure the
+ * cursor is left untouched and the target is set to NA_FLOAT64.
+ */
+static void parse_iso8601_timestamp(FieldParseContext *ctx)
+{
+  const char *ch = *(ctx->ch);
+  double *target = (double*) ctx->targets[sizeof(double)];
+
+  // Pre-set to NA: the integer and double helpers have an early return that leaves their
+  // target unwritten, and such a component must be rejected rather than read uninitialized.
+  int32_t date = NA_INT32, hour = NA_INT32, minute = NA_INT32;
+  double second = NA_FLOAT64;
+
+  parse_iso8601_date_core(&ch, &date);
+  if (date == NA_INT32 || (*ch != ' ' && *ch != 'T'))
+    goto fail;
+  ch++;
+
+  str_to_i32_core(&ch, &hour);
+  if (hour == NA_INT32 || hour < 0 || hour > 23 || *ch != ':')
+    goto fail;
+  ch++;
+
+  str_to_i32_core(&ch, &minute);
+  if (minute == NA_INT32 || minute < 0 || minute > 59 || *ch != ':')
+    goto fail;
+  ch++;
+
+  parse_double_regular_core(&ch, &second);
+  // Written as a negated range test on purpose: NA_FLOAT64 is presumably a NaN (confirm
+  // against its definition), and a NaN compares false against everything, including
+  // itself, so `second == NA_FLOAT64` could never detect a failed parse.
+  if (!(second >= 0 && second < 60))
+    goto fail;
+
+  int32_t tz_hour = 0, tz_minute = 0;
+  if (*ch == 'Z') {
+    ch++; // "Zulu time"=UTC
+  } else {
+    if (*ch == ' ')
+      ch++;
+    if (*ch == '+' || *ch == '-') {
+      const char *start = ch; // facilitates distinguishing +04, +0004, +0000, +00:00
+      const bool tz_neg = *start == '-';
+      // three recognized formats: [+-]AA:BB, [+-]AABB, and [+-]AA
+      str_to_i32_core(&ch, &tz_hour);
+      if (tz_hour == NA_INT32)
+        goto fail;
+      if (ch - start == 5 && tz_hour != 0) { // +AABB
+        if (abs(tz_hour) > 2400)
+          goto fail;
+        // truncating % and / both keep the sign here, so hours and minutes agree
+        tz_minute = tz_hour % 100;
+        tz_hour /= 100;
+      } else if (ch - start == 3) {
+        if (abs(tz_hour) > 24)
+          goto fail;
+        if (*ch == ':') {
+          ch++;
+          str_to_i32_core(&ch, &tz_minute);
+          if (tz_minute == NA_INT32 || tz_minute < 0 || tz_minute > 59)
+            goto fail;
+          // The minutes are parsed separately from the signed hours, so the sign has to
+          // be applied by hand: -01:30 is 90 minutes behind UTC, not 1h behind plus 30m
+          // ahead. Testing the sign character (not tz_hour) also covers -00:30.
+          if (tz_neg)
+            tz_minute = -tz_minute;
+        }
+      }
+    }
+  }
+
+  //Rprintf("date=%d\thour=%d\tz_hour=%d\tminute=%d\ttz_minute=%d\tsecond=%.1f\n", date, hour, tz_hour, minute, tz_minute, second);
+  // cast upfront needed to prevent silent overflow
+  *target = 86400*(double)date + 3600*(hour - tz_hour) + 60*(minute - tz_minute) + second;
+
+  *(ctx->ch) = ch;
+  return;
+
+  fail:
+    *target = NA_FLOAT64;
+}
 
 /* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */
 static void parse_bool_numeric(FieldParseContext *ctx)
@@ -1005,7 +1130,13 @@ static void parse_bool_lowercase(FieldParseContext *ctx)
 }
 
 
-
+/* How to register a new parser
+ *  (1) Write the parser
+ *  (2) Add it to fun array here
+ *  (3) Extend disabled_parsers, typeName, and typeSize here as appropriate
+ *  (4) Extend colType typedef in fread.h as appropriate
+ *  (5) Extend typeSxp, typeRName, typeEnum in freadR.c as appropriate
+ */
 typedef void (*reader_fun_t)(FieldParseContext *ctx);
 static reader_fun_t fun[NUMTYPE] = {
   (reader_fun_t) &Field,
@@ -1018,10 +1149,12 @@ static reader_fun_t fun[NUMTYPE] = {
   (reader_fun_t) &parse_double_regular,
   (reader_fun_t) &parse_double_extended,
   (reader_fun_t) &parse_double_hexadecimal,
+  (reader_fun_t) &parse_iso8601_date,
+  (reader_fun_t) &parse_iso8601_timestamp,
   (reader_fun_t) &Field
 };
 
-static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
 static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) {
   // used in sampling column types and whether column names are present

@@ -13,19 +13,21 @@
 
 // Ordered hierarchy of types
 typedef enum {
-  NEG = -1,       // dummy to force signed type; sign bit used for out-of-sample type bump management
-  CT_DROP = 0,    // skip column requested by user; it is navigated as a string column with the prevailing quoteRule
-  CT_BOOL8_N,     // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1.
+  NEG = -1,        // dummy to force signed type; sign bit used for out-of-sample type bump management
+  CT_DROP = 0,     // skip column requested by user; it is navigated as a string column with the prevailing quoteRule
+  CT_BOOL8_N,      // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1.
   CT_BOOL8_U,
   CT_BOOL8_T,
   CT_BOOL8_L,
-  CT_INT32,       // int32_t
-  CT_INT64,       // int64_t
-  CT_FLOAT64,     // double (64-bit IEEE 754 float)
-  CT_FLOAT64_EXT, // double, with NAN/INF literals
-  CT_FLOAT64_HEX, // double, in hexadecimal format
-  CT_STRING,      // lenOff struct below
-  NUMTYPE         // placeholder for the number of types including drop; used for allocation and loop bounds
+  CT_INT32,        // int32_t
+  CT_INT64,        // int64_t
+  CT_FLOAT64,      // double (64-bit IEEE 754 float)
+  CT_FLOAT64_EXT,  // double, with NAN/INF literals
+  CT_FLOAT64_HEX,  // double, in hexadecimal format
+  CT_ISO8601_DATE, // integer, as read from a date in ISO-8601 format
+  CT_ISO8601_TIME, // double, as read from a timestamp in ISO-8601 format
+  CT_STRING,       // lenOff struct below
+  NUMTYPE          // placeholder for the number of types including drop; used for allocation and loop bounds
 } colType;
 
 extern int8_t typeSize[NUMTYPE];