From a20be2976efe4d8bdbf601ffae47a278a666f52f Mon Sep 17 00:00:00 2001 From: Robert Stepanek Date: Thu, 20 Apr 2023 09:25:13 +0200 Subject: [PATCH] attachextract: add options to handle timeout and idle connections Signed-off-by: Robert Stepanek --- cassandane/Cassandane/Cyrus/SearchFuzzy.pm | 52 +++++++++++++++++++ cassandane/Cassandane/Cyrus/TestCase.pm | 8 +-- ...squatter_nolock_attachment_text_extraction | 20 +++++++ imap/attachextract.c | 51 ++++++++++++------ lib/imapoptions | 13 +++++ 5 files changed, 124 insertions(+), 20 deletions(-) create mode 100644 changes/next/squatter_nolock_attachment_text_extraction diff --git a/cassandane/Cassandane/Cyrus/SearchFuzzy.pm b/cassandane/Cassandane/Cyrus/SearchFuzzy.pm index 04dbd5ea653..dcf564d6e10 100644 --- a/cassandane/Cassandane/Cyrus/SearchFuzzy.pm +++ b/cassandane/Cassandane/Cyrus/SearchFuzzy.pm @@ -2170,4 +2170,56 @@ sub test_squatter_attachextract_nolock $self->assert_num_lt($reacquired_timestamp, $extractor_timestamp); } +sub test_squatter_attachextract_timeout + :min_version_3_9 :needs_search_xapian :SearchAttachmentExtractor :NoCheckSyslog +{ + my ($self) = @_; + my $instance = $self->{instance}; + my $imap = $self->{store}->get_client(); + + my $tracedir = tempdir (DIR => $instance->{basedir} . "/tmp"); + + # SearchAttachmentExtractor magic configures Cyrus to + # wait at most 3 seconds for a response from extractor + + $self->start_echo_extractor( + tracedir => $tracedir, + response_delay_seconds => 4, + ); + + xlog $self, "Make message with attachment"; + $self->make_message("msg2", + mime_type => "multipart/related", + mime_boundary => "123456789abcdef", + body => "" + ."\r\n--123456789abcdef\r\n" + ."Content-Type: text/plain\r\n" + ."\r\n" + ."bodyterm" + ."\r\n--123456789abcdef\r\n" + ."Content-Type: application/pdf\r\n" + ."\r\n" + ."attachterm" + ."\r\n--123456789abcdef--\r\n"); + + xlog $self, "Clear syslog"; + $self->{instance}->getsyslog(); + + xlog $self, "Run squatter (allowing partials)"; + $self->{instance}->run_command({cyrus => 1}, 'squatter', '-v', '-p'); + + xlog "Assert text body is indexed"; + my $uids = $imap->search('fuzzy', 'body', 'bodyterm'); + $self->assert_deep_equals([1], $uids); + + xlog "Assert attachement is not indexed"; + $uids = $imap->search('fuzzy', 'xattachmentbody', 'attachterm'); + $self->assert_deep_equals([], $uids); + + xlog "Assert extractor got only called once"; + my @tracefiles = glob($tracedir."/*"); + $self->assert_num_equals(1, scalar @tracefiles); + $self->assert_matches(qr/req1_GET_/, $tracefiles[0]); +} + 1; diff --git a/cassandane/Cassandane/Cyrus/TestCase.pm b/cassandane/Cassandane/Cyrus/TestCase.pm index 287fa20075f..0c027cef6ec 100644 --- a/cassandane/Cassandane/Cyrus/TestCase.pm +++ b/cassandane/Cassandane/Cyrus/TestCase.pm @@ -369,12 +369,14 @@ magic(JMAPExtensions => sub { }); magic(SearchAttachmentExtractor => sub { my $port = Cassandane::PortManager::alloc("localhost"); - shift->config_set('search_attachment_extractor_url' => + my $self = shift; + $self->config_set('search_attachment_extractor_url' => "http://localhost:$port/extractor"); + $self->config_set('search_attachment_extractor_request_timeout' => '3s'); + $self->config_set('search_attachment_extractor_idle_timeout' => '3s'); }); magic(SearchLanguage => sub { - my $self = shift; - $self->config_set('search_index_language' => 'yes'); + shift->config_set('search_index_language' => 'yes'); }); magic(SieveUTF8Fileinto => sub { shift->config_set('sieve_utf8fileinto' => 'yes'); diff --git a/changes/next/squatter_nolock_attachment_text_extraction b/changes/next/squatter_nolock_attachment_text_extraction new file mode 100644 index 00000000000..3aa9f39f034 --- /dev/null +++ b/changes/next/squatter_nolock_attachment_text_extraction @@ -0,0 +1,20 @@ +Description: + +Updates squatter to not lock a mailbox while extracting text +from attachments. + + +Config changes: + +search_attachment_extractor_request_timeout +search_attachment_extractor_idle_timeout + + +Upgrade instructions: + +None. + + +GitHub issue: + +None. diff --git a/imap/attachextract.c b/imap/attachextract.c index 87d229f9150..a3b80b676a9 100644 --- a/imap/attachextract.c +++ b/imap/attachextract.c @@ -67,6 +67,8 @@ struct extractor_ctx { static char *attachextract_cachedir = NULL; static int attachextract_cacheonly = 0; +static unsigned attachextract_idle_timeout = 5 * 60; +static unsigned attachextract_request_timeout = 5 * 60; static struct extractor_ctx *global_extractor = NULL; @@ -92,13 +94,13 @@ static void extractor_disconnect(struct extractor_ctx *ext) } static struct prot_waitevent * -extractor_timeout(struct protstream *s __attribute__((unused)), - struct prot_waitevent *ev __attribute__((unused)), - void *rock) +extractor_idle_timeout_cb(struct protstream *s __attribute__((unused)), + struct prot_waitevent *ev __attribute__((unused)), + void *rock) { struct extractor_ctx *ext = rock; - syslog(LOG_DEBUG, "extractor_timeout(%p)", ext); + syslog(LOG_DEBUG, "extractor_idle_timeout(%p)", ext); /* too long since we last used the extractor - disconnect */ extractor_disconnect(ext); @@ -106,9 +108,6 @@ extractor_timeout(struct protstream *s __attribute__((unused)), return NULL; } - -#define IDLE_TIMEOUT (5 * 60) /* 5 min */ - static int login(struct backend *s __attribute__((unused)), const char *userid __attribute__((unused)), sasl_callback_t *cb __attribute__((unused)), @@ -143,7 +142,9 @@ static int extractor_connect(struct extractor_ctx *ext) be = ext->be; if (be && be->sock != -1) { // extend the timeout - if (be->timeout) be->timeout->mark = now + IDLE_TIMEOUT; + if (be->timeout) { + be->timeout->mark = now + attachextract_idle_timeout; + } return 0; } @@ -158,12 +159,14 @@ static int extractor_connect(struct extractor_ctx *ext) return IMAP_IOERROR; } + // set request timeout + prot_settimeout(be->in, attachextract_request_timeout); + if (ext->clientin) { - /* add a default timeout */ + /* set idle timeout */ be->clientin = ext->clientin; be->timeout = prot_addwaitevent(ext->clientin, - now + IDLE_TIMEOUT, - extractor_timeout, ext); + now + attachextract_idle_timeout, extractor_idle_timeout_cb, ext); } return 0; @@ -282,6 +285,8 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec, hostlen = strcspn(ext->hostname, "/"); guidstr = message_guid_encode(&axrec->guid); + prot_settimeout(be->in, attachextract_idle_timeout); + /* try to fetch previously extracted text */ unsigned statuscode = 0; prot_printf(be->out, @@ -295,7 +300,7 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec, "\r\n", ext->path, guidstr, HTTP_VERSION, (int) hostlen, be->hostname, CYRUS_VERSION, - IDLE_TIMEOUT, config_search_maxsize); + attachextract_idle_timeout, config_search_maxsize); prot_flush(be->out); /* Read GET response */ @@ -357,7 +362,7 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec, "X-Truncate-Length: " SIZE_T_FMT "\r\n" "\r\n", ext->path, guidstr, HTTP_VERSION, - (int) hostlen, be->hostname, CYRUS_VERSION, IDLE_TIMEOUT, + (int) hostlen, be->hostname, CYRUS_VERSION, attachextract_idle_timeout, buf_cstring(&ctypehdr), buf_len(data), config_search_maxsize); prot_putbuf(be->out, data); prot_flush(be->out); @@ -383,13 +388,14 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec, goto gotdata; } - if (statuscode >= 400 && statuscode <= 499) { + if ((statuscode >= 400 && statuscode <= 499) || statuscode == 599) { /* indexer can't extract this for some reason, never try again */ goto done; } /* any other status code is an error */ - syslog(LOG_ERR, "extract GOT STATUSCODE %d with timeout %d: %s", statuscode, IDLE_TIMEOUT, errstr); + syslog(LOG_ERR, "extract GOT STATUSCODE %d with timeout %d: %s", + statuscode, attachextract_request_timeout, errstr); } // dropped out of the loop? Then we failed! @@ -447,12 +453,23 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec, EXPORTED void attachextract_init(struct protstream *clientin) { + syslog(LOG_DEBUG, "extractor_init(%p)", clientin); + + /* Read config */ + attachextract_idle_timeout = + config_getduration(IMAPOPT_SEARCH_ATTACHMENT_EXTRACTOR_IDLE_TIMEOUT, 's'); + + attachextract_request_timeout = + config_getduration(IMAPOPT_SEARCH_ATTACHMENT_EXTRACTOR_REQUEST_TIMEOUT, 's'); + + if (attachextract_idle_timeout < attachextract_request_timeout) + attachextract_idle_timeout = attachextract_request_timeout; + const char *exturl = config_getstring(IMAPOPT_SEARCH_ATTACHMENT_EXTRACTOR_URL); if (!exturl) return; - syslog(LOG_DEBUG, "extractor_init(%p)", clientin); - + /* Initialize extractor URL */ char scheme[6], server[100], path[256], *p; unsigned https, port; diff --git a/lib/imapoptions b/lib/imapoptions index 175604340bf..bc0293190d7 100644 --- a/lib/imapoptions +++ b/lib/imapoptions @@ -2356,7 +2356,20 @@ If all partitions are over that limit, this feature is not used anymore. to the extractor. Xapian only. */ +{ "search_attachment_extractor_request_timeout", "5m", DURATION, "UNRELEASED" } +/* Defines the duration after which to cancel non-responding + requests to the search attachment extractor service. + If no unit is specified, seconds is assumed. */ + */ +{ "search_attachment_extractor_idle_timeout", "5m", DURATION, "UNRELEASED" } +/* Defines the duration after which to close unused connections to + the search attachment extractor service. If the idle timeout is + less than search_attachment_extractor_request_timeout, then it + is ignored and request timeout used instead. + + If no unit is specified, seconds is assumed. */ + */ { "search_index_language", 0, SWITCH, "3.3.1" } /* If enabled, then messages bodies are stemmed by detected language