diff --git a/src/common/autoconfig.h.in b/src/common/autoconfig.h.in index 27feb403..71945aca 100644 --- a/src/common/autoconfig.h.in +++ b/src/common/autoconfig.h.in @@ -20,6 +20,9 @@ #undef HAVE_POSIX_SPAWN #undef USE_POSIX_SPAWN +/* Define to 1 if you have the setrlimit() call. */ +#undef HAVE_SETRLIMIT + /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H diff --git a/src/configure.ac b/src/configure.ac index d4ee1812..f103993c 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -34,7 +34,7 @@ AC_SYS_LARGEFILE # OpenBSD needs sys/param.h for mount.h to compile AC_CHECK_HEADERS([sys/param.h, spawn.h]) -AC_CHECK_FUNCS([posix_spawn]) +AC_CHECK_FUNCS([posix_spawn setrlimit]) if test "x$ac_cv_func_posix_spawn" = xyes; then : AC_ARG_ENABLE(posix_spawn, diff --git a/src/doc/user/usermanual.xml b/src/doc/user/usermanual.xml index 2a7315ab..b23072b4 100644 --- a/src/doc/user/usermanual.xml +++ b/src/doc/user/usermanual.xml @@ -6022,18 +6022,29 @@ mondelaypatterns = *.log:20 "this one has spaces*:10" - monioniceclass, monioniceclassdata - These allow defining the - ionice class and data used by the - indexer (default class 3, no data). - - + monioniceclass, monioniceclassdata + These allow defining the + ionice class and data used by the + indexer (default class 3, no data). + + + + filtermaxseconds + Maximum handler execution time, after which it + is aborted. Some postscript programs just loop... + + + + filtermaxmbytes + &RCL; 1.20.7 and later. Maximum handler memory + utilisation. This uses setrlimit(RLIMIT_AS) on most systems + (total virtual memory space size limit). Some programs may start + with 500 MBytes of mapped shared libraries, so take this into + account when choosing a value. The default is a liberal + 2000MB. + + - filtermaxseconds - Maximum handler execution time, after which it - is aborted. Some postscript programs just loop... 
- - filtersdir A directory to search for the external input handler scripts used to index some types of files. The diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 6bd6ee61..94ccdbc8 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -77,6 +77,8 @@ bool MimeHandlerExec::next_document() int filtermaxseconds = 900; m_config->getConfParam("filtermaxseconds", &filtermaxseconds); + int filtermaxmbytes = 0; + m_config->getConfParam("filtermaxmbytes", &filtermaxmbytes); if (params.empty()) { // Hu ho @@ -103,6 +105,7 @@ bool MimeHandlerExec::next_document() mexec.putenv("RECOLL_CONFDIR", m_config->getConfDir()); mexec.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" : "RECOLL_FILTER_FORPREVIEW=no"); + mexec.setrlimit_as(filtermaxmbytes); int status; try { diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 3897bc2f..74a7f911 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -47,6 +47,9 @@ bool MimeHandlerExecMultiple::startCmd() // Command name string cmd = params.front(); + int filtermaxmbytes = 0; + m_config->getConfParam("filtermaxmbytes", &filtermaxmbytes); + m_maxmemberkb = 50000; m_config->getConfParam("membermaxkbs", &m_maxmemberkb); ostringstream oss; @@ -57,6 +60,8 @@ bool MimeHandlerExecMultiple::startCmd() m_cmd.putenv(m_forPreview ? "RECOLL_FILTER_FORPREVIEW=yes" : "RECOLL_FILTER_FORPREVIEW=no"); + m_cmd.setrlimit_as(filtermaxmbytes); + // Build parameter list: delete cmd name vectormyparams(params.begin() + 1, params.end()); diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in index a9390da0..ef1d8953 100644 --- a/src/sampleconf/recoll.conf.in +++ b/src/sampleconf/recoll.conf.in @@ -250,6 +250,11 @@ textfilepagekbs = 1000 # Maximum external filter execution time. Default 20mn. 
This is mainly # to avoid infinite loops in postscript files (loop.ps) filtermaxseconds = 1200 +# Maximum virtual memory space for filter process (setrlimit(RLIMIT_AS)), +# in megabytes. Note that this includes any mapped libs (there is no +# reliable Linux way to limit the data space only), so we need to be a +# bit generous here. Anything over 2000 will be ignored on 32 bits machines. +filtermaxmbytes = 2000 # Length of abstracts we store while indexing. Longer will make for a # bigger db diff --git a/src/utils/execmd.cpp b/src/utils/execmd.cpp index 32da0ee5..267052c3 100644 --- a/src/utils/execmd.cpp +++ b/src/utils/execmd.cpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -300,6 +302,39 @@ inline void ExecCmd::dochild(const string &cmd, const char **argv, pthread_sigmask(SIG_UNBLOCK, &sset, 0); sigprocmask(SIG_UNBLOCK, &sset, 0); +#ifdef HAVE_SETRLIMIT +#if defined RLIMIT_AS || defined RLIMIT_VMEM || defined RLIMIT_DATA + if (m_rlimit_as_mbytes > 2000 && sizeof(rlim_t) < 8) { + // Impossible limit, don't use it + m_rlimit_as_mbytes = 0; + } + if (m_rlimit_as_mbytes > 0) { + struct rlimit ram_limit = { + static_cast<rlim_t>(m_rlimit_as_mbytes) * 1024 * 1024, /* widen to rlim_t first: int arithmetic overflows from 2048 MB up */ + RLIM_INFINITY + }; + int resource; + + // RLIMIT_AS and RLIMIT_VMEM are usually synonyms when VMEM is + // defined. RLIMIT_AS is Posix. Both don't really do what we + // want, because they count e.g. shared lib mappings, which we + // don't really care about. + // RLIMIT_DATA only limits the data segment. Modern mallocs + // use mmap and will not be bound. (Otoh if we only have this, + // we're probably not modern). + // So we're unsatisfied either way. 
+#ifdef RLIMIT_AS + resource = RLIMIT_AS; +#elif defined RLIMIT_VMEM + resource = RLIMIT_VMEM; +#else + resource = RLIMIT_DATA; +#endif + setrlimit(resource, &ram_limit); + } +#endif +#endif // have_setrlimit + if (has_input) { close(m_pipein[1]); if (m_pipein[0] != 0) { @@ -347,6 +382,11 @@ inline void ExecCmd::dochild(const string &cmd, const char **argv, _exit(127); } +void ExecCmd::setrlimit_as(int mbytes) +{ + m_rlimit_as_mbytes = mbytes; +} + int ExecCmd::startExec(const string &cmd, const vector& args, bool has_input, bool has_output) { @@ -427,6 +467,7 @@ int ExecCmd::startExec(const string &cmd, const vector& args, //////////////////////////////// End vfork child prepare section. #if HAVE_POSIX_SPAWN && USE_POSIX_SPAWN + // Note that posix_spawn provides no way to setrlimit() the child. { posix_spawnattr_t attrs; posix_spawnattr_init (&attrs); diff --git a/src/utils/execmd.h b/src/utils/execmd.h index 25c3c307..9b42956a 100644 --- a/src/utils/execmd.h +++ b/src/utils/execmd.h @@ -83,6 +83,14 @@ class ExecCmd { void putenv(const std::string &envassign); void putenv(const std::string &name, const std::string& value); + /** + * Try to set a limit on child process vm size. This will use + * setrlimit() and RLIMIT_AS/VMEM (else RLIMIT_DATA) if available. Parameter is in + * megabytes (2**20 bytes). Must be called before starting the command, default + * (0) is to inherit the limit from the parent. + */ + void setrlimit_as(int mbytes); + /** * Set function objects to call whenever new data is available or on * select timeout / whenever new data is needed to send. Must be called @@ -158,7 +166,7 @@ class ExecCmd { void zapChild() {setKill(); (void)wait();} ExecCmd() - : m_advise(0), m_provide(0), m_timeoutMs(1000) + : m_advise(0), m_provide(0), m_timeoutMs(1000), m_rlimit_as_mbytes(0) { reset(); } @@ -191,6 +199,7 @@ class ExecCmd { ExecCmdProvide *m_provide; bool m_killRequest; int m_timeoutMs; + int m_rlimit_as_mbytes; std::string m_stderrFile; // Pipe for data going to the command int m_pipein[2];