Skip to content

Commit

Permalink
attachextract: add options to handle timeout and idle connections
Browse files Browse the repository at this point in the history
Signed-off-by: Robert Stepanek <[email protected]>
  • Loading branch information
rsto committed Apr 26, 2023
1 parent 821132d commit a20be29
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 20 deletions.
52 changes: 52 additions & 0 deletions cassandane/Cassandane/Cyrus/SearchFuzzy.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2170,4 +2170,56 @@ sub test_squatter_attachextract_nolock
$self->assert_num_lt($reacquired_timestamp, $extractor_timestamp);
}

sub test_squatter_attachextract_timeout
:min_version_3_9 :needs_search_xapian :SearchAttachmentExtractor :NoCheckSyslog
{
my ($self) = @_;
my $instance = $self->{instance};
my $imap = $self->{store}->get_client();

my $tracedir = tempdir (DIR => $instance->{basedir} . "/tmp");

# SearchAttachmentExtractor magic configures Cyrus to
# wait at most 3 seconds for a response from extractor

$self->start_echo_extractor(
tracedir => $tracedir,
response_delay_seconds => 4,
);

xlog $self, "Make message with attachment";
$self->make_message("msg2",
mime_type => "multipart/related",
mime_boundary => "123456789abcdef",
body => ""
."\r\n--123456789abcdef\r\n"
."Content-Type: text/plain\r\n"
."\r\n"
."bodyterm"
."\r\n--123456789abcdef\r\n"
."Content-Type: application/pdf\r\n"
."\r\n"
."attachterm"
."\r\n--123456789abcdef--\r\n");

xlog $self, "Clear syslog";
$self->{instance}->getsyslog();

xlog $self, "Run squatter (allowing partials)";
$self->{instance}->run_command({cyrus => 1}, 'squatter', '-v', '-p');

xlog "Assert text body is indexed";
my $uids = $imap->search('fuzzy', 'body', 'bodyterm');
$self->assert_deep_equals([1], $uids);

xlog "Assert attachement is not indexed";
$uids = $imap->search('fuzzy', 'xattachmentbody', 'attachterm');
$self->assert_deep_equals([], $uids);

xlog "Assert extractor got only called once";
my @tracefiles = glob($tracedir."/*");
$self->assert_num_equals(1, scalar @tracefiles);
$self->assert_matches(qr/req1_GET_/, $tracefiles[0]);
}

1;
8 changes: 5 additions & 3 deletions cassandane/Cassandane/Cyrus/TestCase.pm
Original file line number Diff line number Diff line change
Expand Up @@ -369,12 +369,14 @@ magic(JMAPExtensions => sub {
});
magic(SearchAttachmentExtractor => sub {
my $port = Cassandane::PortManager::alloc("localhost");
shift->config_set('search_attachment_extractor_url' =>
my $self = shift;
$self->config_set('search_attachment_extractor_url' =>
"http://localhost:$port/extractor");
$self->config_set('search_attachment_extractor_request_timeout' => '3s');
$self->config_set('search_attachment_extractor_idle_timeout' => '3s');
});
magic(SearchLanguage => sub {
my $self = shift;
$self->config_set('search_index_language' => 'yes');
shift->config_set('search_index_language' => 'yes');
});
magic(SieveUTF8Fileinto => sub {
shift->config_set('sieve_utf8fileinto' => 'yes');
Expand Down
20 changes: 20 additions & 0 deletions changes/next/squatter_nolock_attachment_text_extraction
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Description:

Updates squatter to not lock a mailbox while extracting text
from attachments.


Config changes:

search_attachment_extractor_request_timeout
search_attachment_extractor_idle_timeout


Upgrade instructions:

None.


GitHub issue:

None.
51 changes: 34 additions & 17 deletions imap/attachextract.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ struct extractor_ctx {

static char *attachextract_cachedir = NULL;
static int attachextract_cacheonly = 0;
static unsigned attachextract_idle_timeout = 5 * 60;
static unsigned attachextract_request_timeout = 5 * 60;

static struct extractor_ctx *global_extractor = NULL;

Expand All @@ -92,23 +94,20 @@ static void extractor_disconnect(struct extractor_ctx *ext)
}

static struct prot_waitevent *
extractor_timeout(struct protstream *s __attribute__((unused)),
struct prot_waitevent *ev __attribute__((unused)),
void *rock)
extractor_idle_timeout_cb(struct protstream *s __attribute__((unused)),
struct prot_waitevent *ev __attribute__((unused)),
void *rock)
{
struct extractor_ctx *ext = rock;

syslog(LOG_DEBUG, "extractor_timeout(%p)", ext);
syslog(LOG_DEBUG, "extractor_idle_timeout(%p)", ext);

/* too long since we last used the extractor - disconnect */
extractor_disconnect(ext);

return NULL;
}


#define IDLE_TIMEOUT (5 * 60) /* 5 min */

static int login(struct backend *s __attribute__((unused)),
const char *userid __attribute__((unused)),
sasl_callback_t *cb __attribute__((unused)),
Expand Down Expand Up @@ -143,7 +142,9 @@ static int extractor_connect(struct extractor_ctx *ext)
be = ext->be;
if (be && be->sock != -1) {
// extend the timeout
if (be->timeout) be->timeout->mark = now + IDLE_TIMEOUT;
if (be->timeout) {
be->timeout->mark = now + attachextract_idle_timeout;
}
return 0;
}

Expand All @@ -158,12 +159,14 @@ static int extractor_connect(struct extractor_ctx *ext)
return IMAP_IOERROR;
}

// set request timeout
prot_settimeout(be->in, attachextract_request_timeout);

if (ext->clientin) {
/* add a default timeout */
/* set idle timeout */
be->clientin = ext->clientin;
be->timeout = prot_addwaitevent(ext->clientin,
now + IDLE_TIMEOUT,
extractor_timeout, ext);
now + attachextract_idle_timeout, extractor_idle_timeout_cb, ext);
}

return 0;
Expand Down Expand Up @@ -282,6 +285,8 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec,
hostlen = strcspn(ext->hostname, "/");
guidstr = message_guid_encode(&axrec->guid);

prot_settimeout(be->in, attachextract_idle_timeout);

/* try to fetch previously extracted text */
unsigned statuscode = 0;
prot_printf(be->out,
Expand All @@ -295,7 +300,7 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec,
"\r\n",
ext->path, guidstr, HTTP_VERSION,
(int) hostlen, be->hostname, CYRUS_VERSION,
IDLE_TIMEOUT, config_search_maxsize);
attachextract_idle_timeout, config_search_maxsize);
prot_flush(be->out);

/* Read GET response */
Expand Down Expand Up @@ -357,7 +362,7 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec,
"X-Truncate-Length: " SIZE_T_FMT "\r\n"
"\r\n",
ext->path, guidstr, HTTP_VERSION,
(int) hostlen, be->hostname, CYRUS_VERSION, IDLE_TIMEOUT,
(int) hostlen, be->hostname, CYRUS_VERSION, attachextract_idle_timeout,
buf_cstring(&ctypehdr), buf_len(data), config_search_maxsize);
prot_putbuf(be->out, data);
prot_flush(be->out);
Expand All @@ -383,13 +388,14 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec,
goto gotdata;
}

if (statuscode >= 400 && statuscode <= 499) {
if ((statuscode >= 400 && statuscode <= 499) || statuscode == 599) {
/* indexer can't extract this for some reason, never try again */
goto done;
}

/* any other status code is an error */
syslog(LOG_ERR, "extract GOT STATUSCODE %d with timeout %d: %s", statuscode, IDLE_TIMEOUT, errstr);
syslog(LOG_ERR, "extract GOT STATUSCODE %d with timeout %d: %s",
statuscode, attachextract_request_timeout, errstr);
}

// dropped out of the loop? Then we failed!
Expand Down Expand Up @@ -447,12 +453,23 @@ EXPORTED int attachextract_extract(const struct attachextract_record *axrec,

EXPORTED void attachextract_init(struct protstream *clientin)
{
syslog(LOG_DEBUG, "extractor_init(%p)", clientin);

/* Read config */
attachextract_idle_timeout =
config_getduration(IMAPOPT_SEARCH_ATTACHMENT_EXTRACTOR_IDLE_TIMEOUT, 's');

attachextract_request_timeout =
config_getduration(IMAPOPT_SEARCH_ATTACHMENT_EXTRACTOR_REQUEST_TIMEOUT, 's');

if (attachextract_idle_timeout < attachextract_request_timeout)
attachextract_idle_timeout = attachextract_request_timeout;

const char *exturl =
config_getstring(IMAPOPT_SEARCH_ATTACHMENT_EXTRACTOR_URL);
if (!exturl) return;

syslog(LOG_DEBUG, "extractor_init(%p)", clientin);

/* Initialize extractor URL */
char scheme[6], server[100], path[256], *p;
unsigned https, port;

Expand Down
13 changes: 13 additions & 0 deletions lib/imapoptions
Original file line number Diff line number Diff line change
Expand Up @@ -2356,7 +2356,20 @@ If all partitions are over that limit, this feature is not used anymore.
to the extractor.
Xapian only.
*/
{ "search_attachment_extractor_request_timeout", "5m", DURATION, "UNRELEASED" }
/* Defines the duration after which to cancel non-responding
requests to the search attachment extractor service.

If no unit is specified, seconds is assumed. */
*/
{ "search_attachment_extractor_idle_timeout", "5m", DURATION, "UNRELEASED" }
/* Defines the duration after which to close unused connections to
the search attachment extractor service. If the idle timeout is
less than search_attachment_extractor_request_timeout, then it
is ignored and request timeout used instead.

If no unit is specified, seconds is assumed. */
*/
{ "search_index_language", 0, SWITCH, "3.3.1" }
/*
If enabled, then messages bodies are stemmed by detected language
Expand Down

0 comments on commit a20be29

Please sign in to comment.