Skip to content

Commit

Permalink
avoid mmap on large input files: fixes #160
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Jan 2, 2018
1 parent bc53aa2 commit 430d18d
Show file tree
Hide file tree
Showing 12 changed files with 102 additions and 10 deletions.
42 changes: 42 additions & 0 deletions c/cli/mlrcli.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#define DEFAULT_JSON_FLATTEN_SEPARATOR ":"
#define DEFAULT_OOSVAR_FLATTEN_SEPARATOR ":"
#define DEFAULT_COMMENT_STRING "#"
#define DEFAULT_MAX_FILE_SIZE_FOR_MMAP (4LL*1024LL*1024LL*1024LL)

// ----------------------------------------------------------------
static mapper_setup_t* mapper_lookup_table[] = {
Expand Down Expand Up @@ -251,12 +252,32 @@ cli_opts_t* parse_command_line(int argc, char** argv, sllv_t** ppmapper_list) {
slls_append(popts->filenames, argv[argi], NO_FREE);
}

// Check for use of mmap. It's fractionally faster than stdio (due to fewer data copies
// -- lrecs can be pointer-backed by mmap memory) but we can't use it in all situations.
if (no_input) {
slls_free(popts->filenames);
popts->filenames = NULL;
} else if (popts->filenames->length == 0) {
// No filenames means read from standard input, and standard input cannot be mmapped.
popts->reader_opts.use_mmap_for_read = FALSE;
} else if (popts->reader_opts.use_mmap_for_read == TRUE) {
// https://github.com/johnkerl/miller/issues/160: don't use mmap for large files.
//
// If any input files don't exist, don't error out just yet ... it's possible that the user
// is doing some complex put-with-tee or somesuch which will create the input file by the
// time it's needed. In that case we of course can't know the size yet, so avoid mmap there
// to be safe.
int all_exist_and_are_small_enough = TRUE;
for (sllse_t* pe = popts->filenames->phead; pe != NULL; pe = pe->pnext) {
ssize_t file_size = get_file_size(pe->value);
if (file_size == (ssize_t)(-1) || file_size >= popts->reader_opts.max_file_size_for_mmap) {
all_exist_and_are_small_enough = FALSE;
break;
}
}
if (!all_exist_and_are_small_enough) {
popts->reader_opts.use_mmap_for_read = FALSE;
}
}

if (popts->do_in_place && (popts->filenames == NULL || popts->filenames->length == 0)) {
Expand Down Expand Up @@ -763,6 +784,14 @@ static void main_usage_data_format_options(FILE* o, char* argv0) {
fprintf(o, "\n");
fprintf(o, " -p is a keystroke-saver for --nidx --fs space --repifs\n");
fprintf(o, "\n");
fprintf(o, " --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or\n");
fprintf(o, " for files less than n bytes in size. Default is for\n");
fprintf(o, " files less than %lld bytes in size.\n", DEFAULT_MAX_FILE_SIZE_FOR_MMAP);
fprintf(o, " 'Whenever possible' means always except for when reading\n");
fprintf(o, " standard input which is not mmappable. If you don't know\n");
fprintf(o, " what this means, don't worry about it -- it's a minor\n");
fprintf(o, " performance optimization.\n");
fprintf(o, "\n");
fprintf(o, " Examples: --csv for CSV-formatted input and output; --idkvp --opprint for\n");
fprintf(o, " DKVP-formatted input and pretty-printed output.\n");
}
Expand Down Expand Up @@ -1021,6 +1050,8 @@ void cli_reader_opts_init(cli_reader_opts_t* preader_opts) {
preader_opts->comment_string = NULL;
preader_opts->comment_handling = COMMENTS_ARE_DATA;

preader_opts->max_file_size_for_mmap = DEFAULT_MAX_FILE_SIZE_FOR_MMAP;

// xxx temp
preader_opts->generator_opts.field_name = "i";
preader_opts->generator_opts.start = 0LL;
Expand Down Expand Up @@ -1474,6 +1505,17 @@ int cli_handle_reader_options(char** argv, int argc, int *pargi, cli_reader_opts
preader_opts->use_mmap_for_read = FALSE;
argi += 1;

} else if (streq(argv[argi], "--mmap-below")) {
check_arg_count(argv, argi, argc, 2);
preader_opts->use_mmap_for_read = TRUE;
long long llmax;
if (sscanf(argv[argi+1], "%lld", &llmax) != 1) {
fprintf(stderr, "%s: could not scan \"%s\".\n",
MLR_GLOBALS.bargv0, argv[argi+1]);
}
preader_opts->max_file_size_for_mmap = llmax;
argi += 2;

} else if (streq(argv[argi], "--prepipe")) {
check_arg_count(argv, argi, argc, 2);
preader_opts->prepipe = argv[argi+1];
Expand Down
3 changes: 3 additions & 0 deletions c/cli/mlrcli.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ typedef struct _cli_reader_opts_t {
char* comment_string;
comment_handling_t comment_handling;

// https://github.com/johnkerl/miller/issues/160
ssize_t max_file_size_for_mmap;

// Fake internal-data-generator 'reader'
generator_opts_t generator_opts;

Expand Down
2 changes: 1 addition & 1 deletion c/draft-release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ longer-term follow-on [**issue 151**](https://github.com/johnkerl/miller/issues/

* [**Issue 159**](https://github.com/johnkerl/miller/issues/159) fixes regex-match of literal dot.

* [**Issue 160**](https://github.com/johnkerl/miller/issues/160) fixes out-of-memory cases for huge files. This is an old bug, as old as Miller, and is due to inadequate testing of huge-file cases. The problem is simple: Miller prefers memory-mapped I/O (using `mmap`) over `stdio` since `mmap` is fractionally faster. Yet as any processing steps through an input file, more and more pages are faulted in -- and, unfortunately, previous pages are not freed once memory pressure increases. (This despite gallant attempts with `madvise`.) Once all processing is done, the memory is released; there is no leak per se. But the Miller process can crash before the entire file is read. The solution is equally simple: to prefer `stdio` over `mmap` for files over xxx finish typing me up re size and command-line flags and doc link.
* [**Issue 160**](https://github.com/johnkerl/miller/issues/160) fixes out-of-memory cases for huge files. This is an old bug, as old as Miller, and is due to inadequate testing of huge-file cases. The problem is simple: Miller prefers memory-mapped I/O (using `mmap`) over `stdio` since `mmap` is fractionally faster. Yet as any processing (even `mlr cat`) steps through an input file, more and more pages are faulted in -- and, unfortunately, previous pages are not paged out once memory pressure increases. (This despite gallant attempts with `madvise`.) Once all processing is done, the memory is released; there is no leak per se. But the Miller process can crash before the entire file is read. The solution is equally simple: to prefer `stdio` over `mmap` for files over xxx finish typing me up re size and command-line flags and doc link.

* [**Issue 161**](https://github.com/johnkerl/miller/issues/161) fixes a CSV-parse error (with `unwrapped double quote at line 0`) when a CSV file starts with the UTF-8 bill-of-materials ("BOM") sequence `0xef` `0xbb` `0xbf` and the header line has double-quoted fields. ([Release 5.2.0](https://github.com/johnkerl/miller/releases/tag/v5.2.0) introduced handling for UTF-8 BOMs, but missed the case of double-quoted header line.)

Expand Down
11 changes: 11 additions & 0 deletions c/lib/mlrutil.c
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,17 @@ char* mlr_alloc_double_backslash(char* input) {
return output;
}

// ----------------------------------------------------------------
// Returns -1 on error
ssize_t get_file_size(char* filename) {
struct stat statbuf;
if (stat(filename, &statbuf) < 0) {
return (ssize_t)(-1);
} else {
return statbuf.st_size;
}
}

// ----------------------------------------------------------------
char* read_file_into_memory(char* filename, size_t* psize) {
struct stat statbuf;
Expand Down
3 changes: 3 additions & 0 deletions c/lib/mlrutil.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ char* mlr_alloc_unbackslash(char* input);
// as desired.
char* mlr_alloc_double_backslash(char* input);

// Returns -1 on error
ssize_t get_file_size(char* filename);

// The caller should free the return value.
char* read_file_into_memory(char* filename, size_t* psize);
// The caller should free the return value.
Expand Down
2 changes: 1 addition & 1 deletion c/mapping/mapper_join.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ static void mapper_join_usage(FILE* o, char* argv0, char* verb) {
fprintf(o, " --ips {pair-separator character}\n");
fprintf(o, " --repifs\n");
fprintf(o, " --repips\n");
fprintf(o, " --use-mmap\n");
fprintf(o, " --mmap\n");
fprintf(o, " --no-mmap\n");
fprintf(o, "Please use \"%s --usage-separator-options\" for information on specifying separators.\n",
argv0);
Expand Down
1 change: 1 addition & 0 deletions c/todo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ BUGFIXES
! count-similar
-> ut
* sql in/outs -> own page?
* sql-setup doc somewhere .......

================================================================
COMMENTS:
Expand Down
12 changes: 10 additions & 2 deletions doc/manpage.html
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,14 @@

-p is a keystroke-saver for --nidx --fs space --repifs

--mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
for files less than n bytes in size. Default is for
files less than 4294967296 bytes in size.
'Whenever possible' means always except for when reading
standard input which is not mmappable. If you don't know
what this means, don't worry about it -- it's a minor
performance optimization.

Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
DKVP-formatted input and pretty-printed output.

Expand Down Expand Up @@ -822,7 +830,7 @@
--ips {pair-separator character}
--repifs
--repips
--use-mmap
--mmap
--no-mmap
Please use "mlr --usage-separator-options" for information on specifying separators.
Please see http://johnkerl.org/miller/doc/reference.html for more information
Expand Down Expand Up @@ -2294,7 +2302,7 @@



2018-01-01 MILLER(1)
2018-01-02 MILLER(1)
</pre>
</div>
<p/>
Expand Down
12 changes: 10 additions & 2 deletions doc/manpage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,14 @@ OPTIONS

-p is a keystroke-saver for --nidx --fs space --repifs

--mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
for files less than n bytes in size. Default is for
files less than 4294967296 bytes in size.
'Whenever possible' means always except for when reading
standard input which is not mmappable. If you don't know
what this means, don't worry about it -- it's a minor
performance optimization.

Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
DKVP-formatted input and pretty-printed output.

Expand Down Expand Up @@ -628,7 +636,7 @@ VERBS
--ips {pair-separator character}
--repifs
--repips
--use-mmap
--mmap
--no-mmap
Please use "mlr --usage-separator-options" for information on specifying separators.
Please see http://johnkerl.org/miller/doc/reference.html for more information
Expand Down Expand Up @@ -2100,4 +2108,4 @@ SEE ALSO



2018-01-01 MILLER(1)
2018-01-02 MILLER(1)
14 changes: 11 additions & 3 deletions doc/mlr.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2018-01-01
.\" Date: 2018-01-02
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2018-01-01" "\ \&" "\ \&"
.TH "MILLER" "1" "2018-01-02" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -261,6 +261,14 @@ Please use "mlr --help-function {function name}" for function-specific help.

-p is a keystroke-saver for --nidx --fs space --repifs

--mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
for files less than n bytes in size. Default is for
files less than 4294967296 bytes in size.
'Whenever possible' means always except for when reading
standard input which is not mmappable. If you don't know
what this means, don't worry about it -- it's a minor
performance optimization.

Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
DKVP-formatted input and pretty-printed output.
.fi
Expand Down Expand Up @@ -835,7 +843,7 @@ the main "mlr --help" for more information on syntax for these arguments.
--ips {pair-separator character}
--repifs
--repips
--use-mmap
--mmap
--no-mmap
Please use "mlr --usage-separator-options" for information on specifying separators.
Please see http://johnkerl.org/miller/doc/reference.html for more information
Expand Down
2 changes: 1 addition & 1 deletion doc/reference-verbs.html
Original file line number Diff line number Diff line change
Expand Up @@ -1593,7 +1593,7 @@
--ips {pair-separator character}
--repifs
--repips
--use-mmap
--mmap
--no-mmap
Please use "mlr --usage-separator-options" for information on specifying separators.
Please see http://johnkerl.org/miller/doc/reference.html for more information
Expand Down
8 changes: 8 additions & 0 deletions doc/reference.html
Original file line number Diff line number Diff line change
Expand Up @@ -1439,6 +1439,14 @@

-p is a keystroke-saver for --nidx --fs space --repifs

--mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or
for files less than n bytes in size. Default is for
files less than 4294967296 bytes in size.
'Whenever possible' means always except for when reading
standard input which is not mmappable. If you don't know
what this means, don't worry about it -- it's a minor
performance optimization.

Examples: --csv for CSV-formatted input and output; --idkvp --opprint for
DKVP-formatted input and pretty-printed output.

Expand Down

0 comments on commit 430d18d

Please sign in to comment.