avoid mmap on large input files: fixes #160

johnkerl · Jan 2, 2018 · 430d18d · 430d18d
1 parent bc53aa2
commit 430d18d
Show file tree

Hide file tree

Showing 12 changed files with 102 additions and 10 deletions.
diff --git a/c/cli/mlrcli.c b/c/cli/mlrcli.c
@@ -32,6 +32,7 @@
 #define DEFAULT_JSON_FLATTEN_SEPARATOR   ":"
 #define DEFAULT_OOSVAR_FLATTEN_SEPARATOR ":"
 #define DEFAULT_COMMENT_STRING           "#"
+#define DEFAULT_MAX_FILE_SIZE_FOR_MMAP (4LL*1024LL*1024LL*1024LL)
 
 // ----------------------------------------------------------------
 static mapper_setup_t* mapper_lookup_table[] = {
@@ -251,12 +252,32 @@ cli_opts_t* parse_command_line(int argc, char** argv, sllv_t** ppmapper_list) {
 		slls_append(popts->filenames, argv[argi], NO_FREE);
 	}
 
+	// Check for use of mmap. It's fractionally faster than stdio (due to fewer data copies
+	// -- lrecs can be pointer-backed by mmap memory) but we can't use it in all situations.
 	if (no_input) {
 		slls_free(popts->filenames);
 		popts->filenames = NULL;
 	} else if (popts->filenames->length == 0) {
 		// No filenames means read from standard input, and standard input cannot be mmapped.
 		popts->reader_opts.use_mmap_for_read = FALSE;
+	} else if (popts->reader_opts.use_mmap_for_read == TRUE) {
+		// https://github.com/johnkerl/miller/issues/160: don't use mmap for large files.
+		//
+		// If any input files don't exist, don't error out just yet ... it's possible that the user
+		// is doing some complex put-with-tee or somesuch which will create the input file by the
+		// time it's needed. In that case we of course can't know the size yet, so avoid mmap there
+		// to be safe.
+		int all_exist_and_are_small_enough = TRUE;
+		for (sllse_t* pe = popts->filenames->phead; pe != NULL; pe = pe->pnext) {
+			ssize_t file_size = get_file_size(pe->value);
+			if (file_size == (ssize_t)(-1) || file_size >= popts->reader_opts.max_file_size_for_mmap) {
+				all_exist_and_are_small_enough = FALSE;
+				break;
+			}
+		}
+		if (!all_exist_and_are_small_enough) {
+			popts->reader_opts.use_mmap_for_read = FALSE;
+		}
 	}
 
 	if (popts->do_in_place && (popts->filenames == NULL || popts->filenames->length == 0)) {
@@ -763,6 +784,14 @@ static void main_usage_data_format_options(FILE* o, char* argv0) {
 	fprintf(o, "\n");
 	fprintf(o, "  -p is a keystroke-saver for --nidx --fs space --repifs\n");
 	fprintf(o, "\n");
+	fprintf(o, "  --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or\n");
+	fprintf(o, "                                  for files less than n bytes in size. Default is for\n");
+	fprintf(o, "                                  files less than %lld bytes in size.\n", DEFAULT_MAX_FILE_SIZE_FOR_MMAP);
+	fprintf(o, "                                  'Whenever possible' means always except for when reading\n");
+	fprintf(o, "                                  standard input which is not mmappable. If you don't know\n");
+	fprintf(o, "                                  what this means, don't worry about it -- it's a minor\n");
+	fprintf(o, "                                  performance optimization.\n");
+	fprintf(o, "\n");
 	fprintf(o, "  Examples: --csv for CSV-formatted input and output; --idkvp --opprint for\n");
 	fprintf(o, "  DKVP-formatted input and pretty-printed output.\n");
 }
@@ -1021,6 +1050,8 @@ void cli_reader_opts_init(cli_reader_opts_t* preader_opts) {
 	preader_opts->comment_string                 = NULL;
 	preader_opts->comment_handling               = COMMENTS_ARE_DATA;
 
+	preader_opts->max_file_size_for_mmap         = DEFAULT_MAX_FILE_SIZE_FOR_MMAP;
+
 	// xxx temp
 	preader_opts->generator_opts.field_name     = "i";
 	preader_opts->generator_opts.start          = 0LL;
@@ -1474,6 +1505,17 @@ int cli_handle_reader_options(char** argv, int argc, int *pargi, cli_reader_opts
 		preader_opts->use_mmap_for_read = FALSE;
 		argi += 1;
 
+	} else if (streq(argv[argi], "--mmap-below")) {
+		check_arg_count(argv, argi, argc, 2);
+		preader_opts->use_mmap_for_read = TRUE;
+		long long llmax;
+		if (sscanf(argv[argi+1], "%lld", &llmax) != 1) {
+			fprintf(stderr, "%s: could not scan \"%s\".\n",
+				MLR_GLOBALS.bargv0, argv[argi+1]);
+		}
+		preader_opts->max_file_size_for_mmap = llmax;
+		argi += 2;
+
 	} else if (streq(argv[argi], "--prepipe")) {
 		check_arg_count(argv, argi, argc, 2);
 		preader_opts->prepipe = argv[argi+1];

diff --git a/c/cli/mlrcli.h b/c/cli/mlrcli.h
@@ -44,6 +44,9 @@ typedef struct _cli_reader_opts_t {
 	char* comment_string;
 	comment_handling_t comment_handling;
 
+	// https://github.com/johnkerl/miller/issues/160
+	ssize_t max_file_size_for_mmap;
+
 	// Fake internal-data-generator 'reader'
 	generator_opts_t generator_opts;
 

diff --git a/c/draft-release-notes.md b/c/draft-release-notes.md
@@ -41,7 +41,7 @@ longer-term follow-on [**issue 151**](https://github.com/johnkerl/miller/issues/
 
 * [**Issue 159**](https://github.com/johnkerl/miller/issues/159) fixes regex-match of literal dot.
 
-* [**Issue 160**](https://github.com/johnkerl/miller/issues/160) fixes out-of-memory cases for huge files. This is an old bug, as old as Miller, and is due to inadequate testing of huge-file cases. The problem is simple: Miller prefers memory-mapped I/O (using `mmap`) over `stdio` since `mmap` is fractionally faster. Yet as any processing steps through an input file, more and more pages are faulted in -- and, unfortunately, previous pages are not freed once memory pressure increases. (This despite gallant attempts with `madvise`.) Once all processing is done, the memory is released; there is no leak per se. But the Miller process can crash before the entire file is read. The solution is equally simple: to prefer `stdio` over `mmap` for files over xxx finish typing me up re size and command-line flags and doc link.
+* [**Issue 160**](https://github.com/johnkerl/miller/issues/160) fixes out-of-memory cases for huge files. This is an old bug, as old as Miller, and is due to inadequate testing of huge-file cases. The problem is simple: Miller prefers memory-mapped I/O (using `mmap`) over `stdio` since `mmap` is fractionally faster. Yet as any processing (even `mlr cat`) steps through an input file, more and more pages are faulted in -- and, unfortunately, previous pages are not paged out once memory pressure increases. (This despite gallant attempts with `madvise`.) Once all processing is done, the memory is released; there is no leak per se. But the Miller process can crash before the entire file is read. The solution is equally simple: to prefer `stdio` over `mmap` for files over xxx finish typing me up re size and command-line flags and doc link.
 
 * [**Issue 161**](https://github.com/johnkerl/miller/issues/161) fixes a CSV-parse error (with `unwrapped double quote at line 0`) when a CSV file starts with the UTF-8 bill-of-materials ("BOM") sequence `0xef` `0xbb` `0xbf` and the header line has double-quoted fields. ([Release 5.2.0](https://github.com/johnkerl/miller/releases/tag/v5.2.0) introduced handling for UTF-8 BOMs, but missed the case of double-quoted header line.)
 

diff --git a/c/lib/mlrutil.c b/c/lib/mlrutil.c
@@ -508,6 +508,17 @@ char* mlr_alloc_double_backslash(char* input) {
 	return output;
 }
 
+// ----------------------------------------------------------------
+// Returns -1 on error
+ssize_t get_file_size(char* filename) {
+	struct stat statbuf;
+	if (stat(filename, &statbuf) < 0) {
+		return (ssize_t)(-1);
+	} else {
+		return statbuf.st_size;
+	}
+}
+
 // ----------------------------------------------------------------
 char* read_file_into_memory(char* filename, size_t* psize) {
 	struct stat statbuf;

diff --git a/c/lib/mlrutil.h b/c/lib/mlrutil.h
@@ -168,6 +168,9 @@ char* mlr_alloc_unbackslash(char* input);
 // as desired.
 char* mlr_alloc_double_backslash(char* input);
 
+// Returns -1 on error
+ssize_t get_file_size(char* filename);
+
 // The caller should free the return value.
 char* read_file_into_memory(char* filename, size_t* psize);
 // The caller should free the return value.

diff --git a/c/mapping/mapper_join.c b/c/mapping/mapper_join.c
@@ -107,7 +107,7 @@ static void mapper_join_usage(FILE* o, char* argv0, char* verb) {
 	fprintf(o, "  --ips {pair-separator character}\n");
 	fprintf(o, "  --repifs\n");
 	fprintf(o, "  --repips\n");
-	fprintf(o, "  --use-mmap\n");
+	fprintf(o, "  --mmap\n");
 	fprintf(o, "  --no-mmap\n");
 	fprintf(o, "Please use \"%s --usage-separator-options\" for information on specifying separators.\n",
 		argv0);

diff --git a/c/todo.txt b/c/todo.txt
@@ -21,6 +21,7 @@ BUGFIXES
 ! count-similar
   -> ut
 * sql in/outs -> own page?
+* sql-setup doc somewhere .......
 
 ================================================================
 COMMENTS:

diff --git a/doc/manpage.html b/doc/manpage.html
@@ -400,6 +400,14 @@
 
 	 -p is a keystroke-saver for --nidx --fs space --repifs
 
+	 --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
+					 for files less than n bytes in size. Default is for
+					 files less than 4294967296 bytes in size.
+					 'Whenever possible' means always except for when reading
+					 standard input which is not mmappable. If you don't know
+					 what this means, don't worry about it -- it's a minor
+					 performance optimization.
+
 	 Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
 	 DKVP-formatted input and pretty-printed output.
 
@@ -822,7 +830,7 @@
 	 --ips {pair-separator character}
 	 --repifs
 	 --repips
-	 --use-mmap
+	 --mmap
 	 --no-mmap
        Please use "mlr --usage-separator-options" for information on specifying separators.
        Please see http://johnkerl.org/miller/doc/reference.html for more information
@@ -2294,7 +2302,7 @@
 
 
 
-				  2018-01-01			     MILLER(1)
+				  2018-01-02			     MILLER(1)
 </pre>
 </div>
 <p/>

diff --git a/doc/manpage.txt b/doc/manpage.txt
@@ -206,6 +206,14 @@ OPTIONS
 
 	 -p is a keystroke-saver for --nidx --fs space --repifs
 
+	 --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
+					 for files less than n bytes in size. Default is for
+					 files less than 4294967296 bytes in size.
+					 'Whenever possible' means always except for when reading
+					 standard input which is not mmappable. If you don't know
+					 what this means, don't worry about it -- it's a minor
+					 performance optimization.
+
 	 Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
 	 DKVP-formatted input and pretty-printed output.
 
@@ -628,7 +636,7 @@ VERBS
 	 --ips {pair-separator character}
 	 --repifs
 	 --repips
-	 --use-mmap
+	 --mmap
 	 --no-mmap
        Please use "mlr --usage-separator-options" for information on specifying separators.
        Please see http://johnkerl.org/miller/doc/reference.html for more information
@@ -2100,4 +2108,4 @@ SEE ALSO
 
 
 
-				  2018-01-01			     MILLER(1)
+				  2018-01-02			     MILLER(1)
diff --git a/doc/mlr.1 b/doc/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2018-01-01
+.\"      Date: 2018-01-02
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2018-01-01" "\ \&" "\ \&"
+.TH "MILLER" "1" "2018-01-02" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -261,6 +261,14 @@ Please use "mlr --help-function {function name}" for function-specific help.
 
   -p is a keystroke-saver for --nidx --fs space --repifs
 
+  --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
+                                  for files less than n bytes in size. Default is for
+                                  files less than 4294967296 bytes in size.
+                                  'Whenever possible' means always except for when reading
+                                  standard input which is not mmappable. If you don't know
+                                  what this means, don't worry about it -- it's a minor
+                                  performance optimization.
+
   Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
   DKVP-formatted input and pretty-printed output.
 .fi
@@ -835,7 +843,7 @@ the main "mlr --help" for more information on syntax for these arguments.
   --ips {pair-separator character}
   --repifs
   --repips
-  --use-mmap
+  --mmap
   --no-mmap
 Please use "mlr --usage-separator-options" for information on specifying separators.
 Please see http://johnkerl.org/miller/doc/reference.html for more information

diff --git a/doc/reference-verbs.html b/doc/reference-verbs.html
@@ -1593,7 +1593,7 @@
   --ips {pair-separator character}
   --repifs
   --repips
-  --use-mmap
+  --mmap
   --no-mmap
 Please use "mlr --usage-separator-options" for information on specifying separators.
 Please see http://johnkerl.org/miller/doc/reference.html for more information

diff --git a/doc/reference.html b/doc/reference.html
@@ -1439,6 +1439,14 @@
 
   -p is a keystroke-saver for --nidx --fs space --repifs
 
+  --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
+                                  for files less than n bytes in size. Default is for
+                                  files less than 4294967296 bytes in size.
+                                  'Whenever possible' means always except for when reading
+                                  standard input which is not mmappable. If you don't know
+                                  what this means, don't worry about it -- it's a minor
+                                  performance optimization.
+
   Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
   DKVP-formatted input and pretty-printed output.