From cae20bb5e6058e04aa966b3443d61e83001e27c1 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Wed, 13 Apr 2016 18:05:24 -0700 Subject: [PATCH 1/5] Speculatively open files to avoid overrunning the system limits --- geojson.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/geojson.c b/geojson.c index 99f7972ae..7e42f4f8d 100644 --- a/geojson.c +++ b/geojson.c @@ -81,6 +81,7 @@ struct source { int CPUS; int TEMP_FILES; +long long MAX_FILES; static long long diskfree; #define MAX_ZOOM 24 @@ -147,11 +148,39 @@ void init_cpus() { CPUS = 1 << (int) (log(CPUS) / log(2)); TEMP_FILES = 64; + MAX_FILES = 64; + struct rlimit rl; if (getrlimit(RLIMIT_NOFILE, &rl) != 0) { perror("getrlimit"); } else { - TEMP_FILES = rl.rlim_cur / 3; + // MacOS can run out of system file descriptors + // even if we stay under the rlimit, so try to + // find out the real limit. + + long long fds[rl.rlim_cur]; + long long i; + for (i = 0; i < rl.rlim_cur; i++) { + fds[i] = open("/dev/null", O_RDONLY); + if (fds[i] < 0) { + break; + } + } + + MAX_FILES = i * 3 / 4; + if (MAX_FILES > 2000) { + MAX_FILES = 2000; + } + + long long j; + for (j = 0; j < i; j++) { + if (close(fds[j]) < 0) { + perror("close"); + exit(EXIT_FAILURE); + } + } + + TEMP_FILES = MAX_FILES / 3; if (TEMP_FILES > CPUS * 4) { TEMP_FILES = CPUS * 4; } @@ -1477,13 +1506,6 @@ void radix(struct reader *reader, int nreaders, FILE *geomfile, int geomfd, FILE // Then concatenate each of the sub-outputs into a final output. - struct rlimit rl; - - if (getrlimit(RLIMIT_NOFILE, &rl) != 0) { - perror("getrlimit"); - exit(EXIT_FAILURE); - } - long long mem; #ifdef __APPLE__ @@ -1511,12 +1533,7 @@ void radix(struct reader *reader, int nreaders, FILE *geomfile, int geomfd, FILE mem = 8192; } - // Don't use huge numbers of files that will trouble the file system - if (rl.rlim_cur > 5000) { - rl.rlim_cur = 5000; - } - - long long availfiles = rl.rlim_cur - 2 * nreaders // each reader has a geom and an index + long long availfiles = MAX_FILES - 2 * nreaders // each reader has a geom and an index - 4 // pool, meta, mbtiles, mbtiles journal - 4 // top-level geom and index output, both FILE and fd - 3; // stdin, stdout, stderr From 137bb46db5388c0f20b60aca3a00321462535ee1 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Wed, 13 Apr 2016 20:19:41 -0700 Subject: [PATCH 2/5] Clean up the maximum-files-open check a little --- CHANGELOG.md | 4 +++ geojson.c | 70 ++++++++++++++++++++++++++++------------------------ version.h | 2 +- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fcec72f3..a3127866e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.9.13 + +* Don't trust the OS so much about how many files can be open + ## 1.9.12 * Limit the size of the parallel parsing streaming input buffer diff --git a/geojson.c b/geojson.c index 7e42f4f8d..ef16bcab6 100644 --- a/geojson.c +++ b/geojson.c @@ -147,43 +147,49 @@ void init_cpus() { // Round down to a power of 2 CPUS = 1 << (int) (log(CPUS) / log(2)); - TEMP_FILES = 64; - MAX_FILES = 64; - struct rlimit rl; if (getrlimit(RLIMIT_NOFILE, &rl) != 0) { perror("getrlimit"); + exit(EXIT_FAILURE); } else { - // MacOS can run out of system file descriptors - // even if we stay under the rlimit, so try to - // find out the real limit. + MAX_FILES = rl.rlim_cur; + } - long long fds[rl.rlim_cur]; - long long i; - for (i = 0; i < rl.rlim_cur; i++) { - fds[i] = open("/dev/null", O_RDONLY); - if (fds[i] < 0) { - break; - } - } + // Don't really want too many temporary files, because the file system + // will start to bog down eventually + if (MAX_FILES > 2000) { + MAX_FILES = 2000; + } - MAX_FILES = i * 3 / 4; - if (MAX_FILES > 2000) { - MAX_FILES = 2000; + // MacOS can run out of system file descriptors + // even if we stay under the rlimit, so try to + // find out the real limit. + long long fds[MAX_FILES]; + long long i; + for (i = 0; i < MAX_FILES; i++) { + fds[i] = open("/dev/null", O_RDONLY); + if (fds[i] < 0) { + break; } - - long long j; - for (j = 0; j < i; j++) { - if (close(fds[j]) < 0) { - perror("close"); - exit(EXIT_FAILURE); - } + } + long long j; + for (j = 0; j < i; j++) { + if (close(fds[j]) < 0) { + perror("close"); + exit(EXIT_FAILURE); } + } - TEMP_FILES = MAX_FILES / 3; - if (TEMP_FILES > CPUS * 4) { - TEMP_FILES = CPUS * 4; - } + // Scale down because we really don't want to run the system out of files + MAX_FILES = i * 3 / 4; + if (MAX_FILES < 32) { + fprintf(stderr, "Can't open a useful number of files: %lld\n", MAX_FILES); + exit(EXIT_FAILURE); + } + + TEMP_FILES = (MAX_FILES - 10) / 2; + if (TEMP_FILES > CPUS * 4) { + TEMP_FILES = CPUS * 4; } } @@ -1533,10 +1539,10 @@ void radix(struct reader *reader, int nreaders, FILE *geomfile, int geomfd, FILE mem = 8192; } - long long availfiles = MAX_FILES - 2 * nreaders // each reader has a geom and an index - - 4 // pool, meta, mbtiles, mbtiles journal - - 4 // top-level geom and index output, both FILE and fd - - 3; // stdin, stdout, stderr + long long availfiles = MAX_FILES - 2 * nreaders // each reader has a geom and an index + - 4 // pool, meta, mbtiles, mbtiles journal + - 4 // top-level geom and index output, both FILE and fd + - 3; // stdin, stdout, stderr // 4 because for each we have output and input FILE and fd for geom and index int splits = availfiles / 4; diff --git a/version.h b/version.h index fb760ae7b..06899da3a 100644 --- a/version.h +++ b/version.h @@ -1 +1 @@ -#define VERSION "tippecanoe v1.9.12\n" +#define VERSION "tippecanoe v1.9.13\n" From 222735004de5b8f04034b6f164b24175494d17ec Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Wed, 13 Apr 2016 20:33:01 -0700 Subject: [PATCH 3/5] Add a sanity check to make sure I remember to close all open files --- geojson.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/geojson.c b/geojson.c index ef16bcab6..7b427d482 100644 --- a/geojson.c +++ b/geojson.c @@ -2476,6 +2476,7 @@ int main(int argc, char **argv) { pool_init(&include, 0); int exclude_all = 0; int read_parallel = 0; + int files_open_at_start; for (i = 0; i < 256; i++) { prevent[i] = 0; @@ -2713,6 +2714,9 @@ int main(int argc, char **argv) { } } + files_open_at_start = open("/dev/null", O_RDONLY); + close(files_open_at_start); + if (maxzoom > MAX_ZOOM) { maxzoom = MAX_ZOOM; fprintf(stderr, "Highest supported zoom is %d\n", maxzoom); @@ -2789,5 +2793,12 @@ int main(int argc, char **argv) { muntrace(); #endif + i = open("/dev/null", O_RDONLY); + // i < files_open_at_start is not an error, because reading from a pipe closes stdin + if (i > files_open_at_start) { + fprintf(stderr, "Internal error: did not close all files: %d\n", i); + exit(EXIT_FAILURE); + } + return ret; } From 732a51d68485080c9d0139e15f95a406e7e1d755 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Thu, 14 Apr 2016 10:06:01 -0700 Subject: [PATCH 4/5] Fix the option for testing radix sort Add a check to make sure I don't make the same mistake again --- geojson.c | 22 ++++++++++++++++++++++ tile.h | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/geojson.c b/geojson.c index 7b427d482..801c5a225 100644 --- a/geojson.c +++ b/geojson.c @@ -2483,6 +2483,28 @@ int main(int argc, char **argv) { additional[i] = 0; } + { + char dup[256]; + + memset(dup, 0, sizeof(dup)); + for (i = 0; i < sizeof(additional_options) / sizeof(additional_options[0]); i++) { + if (dup[additional_options[i]]) { + fprintf(stderr, "Internal error: reused -a%c\n", additional_options[i]); + exit(EXIT_FAILURE); + } + dup[additional_options[i]] = 1; + } + + memset(dup, 0, sizeof(dup)); + for (i = 0; i < sizeof(prevent_options) / sizeof(prevent_options[0]); i++) { + if (dup[prevent_options[i]]) { + fprintf(stderr, "Internal error: reused -p%c\n", prevent_options[i]); + exit(EXIT_FAILURE); + } + dup[prevent_options[i]] = 1; + } + } + static struct option long_options[] = { {"name", required_argument, 0, 'n'}, {"layer", required_argument, 0, 'l'}, diff --git a/tile.h b/tile.h index 904fef80a..bbcb35fb6 100644 --- a/tile.h +++ b/tile.h @@ -52,7 +52,7 @@ static int additional_options[] = { A_LINE_DROP, #define A_POLYGON_DROP ((int) 'p') A_POLYGON_DROP, -#define A_PREFER_RADIX_SORT ((int) 'r') +#define A_PREFER_RADIX_SORT ((int) 'R') A_PREFER_RADIX_SORT, }; From 81517a0cb465a27351bc4e97025850b500872241 Mon Sep 17 00:00:00 2001 From: Eric Fischer Date: Thu, 14 Apr 2016 10:20:28 -0700 Subject: [PATCH 5/5] Fix tracking of number of available file descriptors --- geojson.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/geojson.c b/geojson.c index 801c5a225..9a08623f8 100644 --- a/geojson.c +++ b/geojson.c @@ -1201,7 +1201,7 @@ void start_parsing(int fd, FILE *fp, long long offset, long long len, volatile i } } -void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int splits, long long mem, const char *tmpdir, int availfiles, FILE *geomfile, FILE *indexfile, long long *geompos_out, long long *progress, long long *progress_max, long long *progress_reported) { +void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int splits, long long mem, const char *tmpdir, long long *availfiles, FILE *geomfile, FILE *indexfile, long long *geompos_out, long long *progress, long long *progress_max, long long *progress_reported) { // Arranged as bits to facilitate subdividing again if a subdivided file is still huge int splitbits = log(splits) / log(2); splits = 1 << splitbits; @@ -1243,7 +1243,7 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split exit(EXIT_FAILURE); } - availfiles -= 4; + *availfiles -= 4; unlink(geomname); unlink(indexname); @@ -1320,7 +1320,7 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split exit(EXIT_FAILURE); } - availfiles += 2; + *availfiles += 2; } for (i = 0; i < splits; i++) { @@ -1333,7 +1333,7 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split exit(EXIT_FAILURE); } - availfiles += 2; + *availfiles += 2; } for (i = 0; i < splits; i++) { @@ -1482,7 +1482,7 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split // counter backward but will be an honest estimate of the work remaining. *progress_max += geomst.st_size / 4; - radix1(&geomfds[i], &indexfds[i], 1, prefix + splitbits, availfiles / 4, mem, tmpdir, availfiles, geomfile, indexfile, geompos_out, progress, progress_max, progress_reported); + radix1(&geomfds[i], &indexfds[i], 1, prefix + splitbits, *availfiles / 4, mem, tmpdir, availfiles, geomfile, indexfile, geompos_out, progress, progress_max, progress_reported); already_closed = 1; } } @@ -1496,9 +1496,9 @@ void radix1(int *geomfds_in, int *indexfds_in, int inputs, int prefix, int split perror("close index"); exit(EXIT_FAILURE); } - } - availfiles += 2; + *availfiles += 2; + } } } @@ -1568,7 +1568,13 @@ void radix(struct reader *reader, int nreaders, FILE *geomfile, int geomfd, FILE } long long progress = 0, progress_max = geom_total, progress_reported = -1; - radix1(geomfds, indexfds, nreaders, 0, splits, mem, tmpdir, availfiles, geomfile, indexfile, geompos, &progress, &progress_max, &progress_reported); + long long availfiles_before = availfiles; + radix1(geomfds, indexfds, nreaders, 0, splits, mem, tmpdir, &availfiles, geomfile, indexfile, geompos, &progress, &progress_max, &progress_reported); + + if (availfiles - 2 * nreaders != availfiles_before) { + fprintf(stderr, "Internal error: miscounted available file descriptors: %lld vs %lld\n", availfiles - 2 * nreaders, availfiles); + exit(EXIT_FAILURE); + } } int read_json(int argc, struct source **sourcelist, char *fname, const char *layername, int maxzoom, int minzoom, int basezoom, double basezoom_marker_width, sqlite3 *outdb, struct pool *exclude, struct pool *include, int exclude_all, double droprate, int buffer, const char *tmpdir, double gamma, int *prevent, int *additional, int read_parallel, int forcetable, const char *attribution) {