Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: classify emails by importance based on subjects #10277

Open
wants to merge 37 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
bca2af7
Classify emails based on subjects
st3iny Jan 4, 2023
dc47dc1
fixup! Classify emails based on subjects
st3iny Jan 9, 2023
056f67f
fixup! Classify emails based on subjects
st3iny Jan 18, 2023
0be1a21
Cache features per sender
st3iny Jan 24, 2023
99dfa0a
Implement preprocess command
st3iny Jan 24, 2023
28762e6
feat(importance-classifier): Reduce text feature vector
ChristophWurst Jan 26, 2023
af70adf
fixup! feat(importance-classifier): Reduce text feature vector
ChristophWurst Jan 26, 2023
08b1e1b
fixup! feat(importance-classifier): Reduce text feature vector
ChristophWurst Jan 27, 2023
d7cca9c
fixup! feat(importance-classifier): Reduce text feature vector
ChristophWurst Jan 30, 2023
a9f7399
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Jan 31, 2023
cee58bf
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Jan 31, 2023
0e82c52
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Jan 31, 2023
bd82bea
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Jan 31, 2023
6c6e2ca
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 2, 2023
18767c7
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 3, 2023
c764944
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 21, 2023
974ee46
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 24, 2023
f68501b
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 24, 2023
7127cf7
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 24, 2023
c8c214c
fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 28, 2023
51f31bf
fixup! fixup! feat(importance-classifier): Reduce text feature vector
st3iny Mar 30, 2023
3bc398b
Try wcv -> tfidf pipeline
st3iny Mar 30, 2023
fed2011
Fix transformer persistence
st3iny May 15, 2023
e2c057c
Refactor classification of new messages
st3iny May 15, 2023
bb9056d
Refactor persistence
st3iny May 17, 2023
71b1b5f
Adjust meta estimator params
st3iny Jun 14, 2023
877be83
Change training sample size to 300
st3iny Jun 14, 2023
7c740b1
Adjust tuned knn params
st3iny Jun 14, 2023
1a5c5b5
Fix reuse compliance
st3iny Oct 17, 2024
a7510ca
Run composer cs:fix
st3iny Oct 17, 2024
dade1df
Fix most psalm issues
st3iny Oct 17, 2024
a808b2e
Persist classifiers in memory cache only
st3iny Oct 21, 2024
379484a
Revert "Adjust tuned knn params"
st3iny Oct 22, 2024
7da784e
Finalize code changes
st3iny Oct 22, 2024
7b62b39
Run composer cs:fix
st3iny Oct 22, 2024
21d45eb
Fix all remaining psalm issues
st3iny Oct 22, 2024
a7ea9c0
Run composer cs:fix
st3iny Oct 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion appinfo/info.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ The rating depends on the installed text processing backend. See [the rating ove

Learn more about the Nextcloud Ethical AI Rating [in our blog](https://nextcloud.com/blog/nextcloud-ethical-ai-rating/).
]]></description>
<version>4.1.0-alpha.2</version>
<version>4.1.0-alpha.3</version>
<licence>agpl</licence>
<author homepage="https://github.com/ChristophWurst">Christoph Wurst</author>
<author homepage="https://github.com/GretaD">GretaD</author>
Expand Down Expand Up @@ -90,6 +90,8 @@ Learn more about the Nextcloud Ethical AI Rating [in our blog](https://nextcloud
<command>OCA\Mail\Command\TrainAccount</command>
<command>OCA\Mail\Command\UpdateAccount</command>
<command>OCA\Mail\Command\UpdateSystemAutoresponders</command>
<command>OCA\Mail\Command\PreprocessAccount</command>
<command>OCA\Mail\Command\RunMetaEstimator</command>
</commands>
<settings>
<admin>OCA\Mail\Settings\AdminSettings</admin>
Expand Down
2 changes: 0 additions & 2 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
use OCA\Mail\Listener\MessageCacheUpdaterListener;
use OCA\Mail\Listener\MessageKnownSinceListener;
use OCA\Mail\Listener\MoveJunkListener;
use OCA\Mail\Listener\NewMessageClassificationListener;
use OCA\Mail\Listener\NewMessagesNotifier;
use OCA\Mail\Listener\OauthTokenRefreshListener;
use OCA\Mail\Listener\OptionalIndicesListener;
Expand Down Expand Up @@ -124,7 +123,6 @@ public function register(IRegistrationContext $context): void {
$context->registerEventListener(MessageDeletedEvent::class, MessageCacheUpdaterListener::class);
$context->registerEventListener(MessageSentEvent::class, AddressCollectionListener::class);
$context->registerEventListener(MessageSentEvent::class, InteractionListener::class);
$context->registerEventListener(NewMessagesSynchronized::class, NewMessageClassificationListener::class);
$context->registerEventListener(NewMessagesSynchronized::class, MessageKnownSinceListener::class);
$context->registerEventListener(NewMessagesSynchronized::class, NewMessagesNotifier::class);
$context->registerEventListener(SynchronizationEvent::class, AccountSynchronizedThreadUpdaterListener::class);
Expand Down
5 changes: 1 addition & 4 deletions lib/BackgroundJob/TrainImportanceClassifierJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,7 @@ protected function run($argument) {
}

try {
$this->classifier->train(
$account,
$this->logger
);
$this->classifier->train($account, $this->logger);
} catch (Throwable $e) {
$this->logger->error('Cron importance classifier training failed: ' . $e->getMessage(), [
'exception' => $e,
Expand Down
23 changes: 14 additions & 9 deletions lib/Command/PredictImportance.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
use OCA\Mail\Db\Message;
use OCA\Mail\Service\AccountService;
use OCA\Mail\Service\Classification\ImportanceClassifier;
use OCA\Mail\Support\ConsoleLoggerDecorator;
use OCP\AppFramework\Db\DoesNotExistException;
use OCP\IConfig;
use Psr\Log\LoggerInterface;
Expand All @@ -25,6 +26,7 @@
class PredictImportance extends Command {
public const ARGUMENT_ACCOUNT_ID = 'account-id';
public const ARGUMENT_SENDER = 'sender';
public const ARGUMENT_SUBJECT = 'subject';

private AccountService $accountService;
private ImportanceClassifier $classifier;
Expand All @@ -43,26 +45,27 @@ public function __construct(AccountService $service,
$this->config = $config;
}

/**
* @return void
*/
protected function configure() {
protected function configure(): void {
$this->setName('mail:predict-importance');
$this->setDescription('Predict importance of an incoming message');
$this->addArgument(self::ARGUMENT_ACCOUNT_ID, InputArgument::REQUIRED);
$this->addArgument(self::ARGUMENT_SENDER, InputArgument::REQUIRED);
$this->addArgument(self::ARGUMENT_SUBJECT, InputArgument::OPTIONAL);
}

public function isEnabled() {
public function isEnabled(): bool {
return $this->config->getSystemValueBool('debug');
}

/**
* @return int
*/
protected function execute(InputInterface $input, OutputInterface $output): int {
$accountId = (int)$input->getArgument(self::ARGUMENT_ACCOUNT_ID);
$sender = $input->getArgument(self::ARGUMENT_SENDER);
$subject = $input->getArgument(self::ARGUMENT_SUBJECT) ?? '';

$consoleLogger = new ConsoleLoggerDecorator(
$this->logger,
$output
);

try {
$account = $this->accountService->findById($accountId);
Expand All @@ -73,9 +76,11 @@ protected function execute(InputInterface $input, OutputInterface $output): int
$fakeMessage = new Message();
$fakeMessage->setUid(0);
$fakeMessage->setFrom(AddressList::parse("Name <$sender>"));
$fakeMessage->setSubject($subject);
[$prediction] = $this->classifier->classifyImportance(
$account,
[$fakeMessage]
[$fakeMessage],
$consoleLogger
);
if ($prediction) {
$output->writeln('Message is important');
Expand Down
65 changes: 65 additions & 0 deletions lib/Command/PreprocessAccount.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?php

declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\Mail\Command;

use OCA\Mail\Service\AccountService;
use OCA\Mail\Service\PreprocessingService;
use OCP\AppFramework\Db\DoesNotExistException;
use Psr\Log\LoggerInterface;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use function memory_get_peak_usage;

class PreprocessAccount extends Command {
public const ARGUMENT_ACCOUNT_ID = 'account-id';

private AccountService $accountService;
private PreprocessingService $preprocessingService;
private LoggerInterface $logger;

/**
 * Wire up the collaborators of the preprocess command.
 *
 * @param AccountService $service resolves the account for the given account id
 * @param PreprocessingService $preprocessingService performs the actual preprocessing run
 * @param LoggerInterface $logger stored on the command instance
 */
public function __construct(AccountService $service,
	PreprocessingService $preprocessingService,
	LoggerInterface $logger) {
	// The base command has to be initialised without an explicit name;
	// the name is assigned later in configure().
	parent::__construct();

	$this->logger = $logger;
	$this->preprocessingService = $preprocessingService;
	$this->accountService = $service;
}

/**
 * Declare the command name, its description and the single required
 * account id argument.
 *
 * The native `void` return type replaces the former `@return void`
 * annotation so this command matches the other mail commands.
 */
protected function configure(): void {
	$this->setName('mail:account:preprocess');
	$this->setDescription('Preprocess all mailboxes of an IMAP account');
	$this->addArgument(self::ARGUMENT_ACCOUNT_ID, InputArgument::REQUIRED);
}

protected function execute(InputInterface $input, OutputInterface $output): int {
$accountId = (int)$input->getArgument(self::ARGUMENT_ACCOUNT_ID);

try {
$account = $this->accountService->findById($accountId);
} catch (DoesNotExistException $e) {
$output->writeln("<error>Account $accountId does not exist</error>");
return 1;
}

$this->preprocessingService->process(4294967296, $account);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need an explanation for 4294967296

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lol, I don't even ...

I'll remove this whole command. It only made sense when the preview text was considered, so we needed to do the pre-processing first. Now only the subject is needed, so the command doesn't make sense anymore.


$mbs = (int)(memory_get_peak_usage() / 1024 / 1024);
$output->writeln('<info>' . $mbs . 'MB of memory used</info>');

return 0;
}
}
116 changes: 116 additions & 0 deletions lib/Command/RunMetaEstimator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
<?php

declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

namespace OCA\Mail\Command;

use OCA\Mail\Service\AccountService;
use OCA\Mail\Service\Classification\ImportanceClassifier;
use OCA\Mail\Support\ConsoleLoggerDecorator;
use OCP\AppFramework\Db\DoesNotExistException;
use OCP\IConfig;
use Psr\Log\LoggerInterface;
use Rubix\ML\Backends\Amp;
use Rubix\ML\Classifiers\KNearestNeighbors;
use Rubix\ML\CrossValidation\KFold;
use Rubix\ML\CrossValidation\Metrics\FBeta;
use Rubix\ML\GridSearch;
use Rubix\ML\Kernels\Distance\Euclidean;
use Rubix\ML\Kernels\Distance\Jaccard;
use Rubix\ML\Kernels\Distance\Manhattan;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;

/**
 * Console command that trains the importance classifier of one account with
 * a grid search over k-nearest-neighbours hyper-parameters and prints the
 * best estimator that was found.
 *
 * The command is only enabled when the `debug` system value is true.
 */
class RunMetaEstimator extends Command {
	public const ARGUMENT_ACCOUNT_ID = 'account-id';
	public const ARGUMENT_SHUFFLE = 'shuffle';

	public function __construct(
		private AccountService $accountService,
		private LoggerInterface $logger,
		private ImportanceClassifier $classifier,
		private IConfig $config,
	) {
		parent::__construct();
	}

	/**
	 * Register name, description, the required account id argument and the
	 * value-less shuffle option.
	 */
	protected function configure(): void {
		$this->setName('mail:account:run-meta-estimator');
		$this->setDescription('Run the meta estimator for an account');
		$this->addArgument(self::ARGUMENT_ACCOUNT_ID, InputArgument::REQUIRED);
		$this->addOption(self::ARGUMENT_SHUFFLE, null, null, 'Shuffle data set before training');
	}

	/**
	 * The command is hidden unless the instance runs in debug mode.
	 */
	public function isEnabled(): bool {
		return $this->config->getSystemValueBool('debug');
	}

	/**
	 * Run the grid search for the requested account.
	 *
	 * @return int 1 if the account does not exist, 0 otherwise
	 */
	protected function execute(InputInterface $input, OutputInterface $output): int {
		$accountId = (int)$input->getArgument(self::ARGUMENT_ACCOUNT_ID);
		$shuffle = (bool)$input->getOption(self::ARGUMENT_SHUFFLE);

		try {
			$account = $this->accountService->findById($accountId);
		} catch (DoesNotExistException) {
			$output->writeln("<error>Account $accountId does not exist</error>");
			return 1;
		}

		// Mirror everything that is logged to the console as well
		$consoleLogger = new ConsoleLoggerDecorator($this->logger, $output);

		// Factory handed to the classifier; it builds the grid search lazily
		$buildGridSearch = static function () use ($consoleLogger): GridSearch {
			$searchSpace = [
				[5, 10, 15, 20, 25, 30, 35, 40], // Neighbors
				[true, false], // Weighted?
				[new Euclidean(), new Manhattan(), new Jaccard()], // Kernel
			];

			$gridSearch = new GridSearch(
				KNearestNeighbors::class,
				$searchSpace,
				new FBeta(),
				new KFold(5)
			);
			$gridSearch->setLogger($consoleLogger);
			$gridSearch->setBackend(new Amp());

			return $gridSearch;
		};

		// NOTE(review): the last argument is presumably the "persist" flag
		// (TrainAccount passes !$dryRun in the same position) - confirm
		// against ImportanceClassifier::train().
		/** @var GridSearch|null $metaEstimator */
		$metaEstimator = $this->classifier->train(
			$account,
			$consoleLogger,
			$buildGridSearch,
			$shuffle,
			false,
		);

		if ($metaEstimator !== null) {
			$output->writeln("<info>Best estimator: {$metaEstimator->base()}</info>");
		}

		$peakMegabytes = (int)(memory_get_peak_usage() / 1024 / 1024);
		$output->writeln('<info>' . $peakMegabytes . 'MB of memory used</info>');

		return 0;
	}
}
38 changes: 28 additions & 10 deletions lib/Command/TrainAccount.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
declare(strict_types=1);

/**
* SPDX-FileCopyrightText: 2019 Nextcloud GmbH and Nextcloud contributors
* SPDX-FileCopyrightText: 2019-2024 Nextcloud GmbH and Nextcloud contributors
* SPDX-License-Identifier: AGPL-3.0-or-later
*/

Expand All @@ -23,6 +23,9 @@

class TrainAccount extends Command {
public const ARGUMENT_ACCOUNT_ID = 'account-id';
public const ARGUMENT_SHUFFLE = 'shuffle';
public const ARGUMENT_DRY_RUN = 'dry-run';
public const ARGUMENT_FORCE = 'force';

private AccountService $accountService;
private ImportanceClassifier $classifier;
Expand All @@ -41,28 +44,39 @@ public function __construct(AccountService $service,
$this->classificationSettingsService = $classificationSettingsService;
}

/**
* @return void
*/
protected function configure() {
protected function configure(): void {
$this->setName('mail:account:train');
$this->setDescription('Train the classifier of new messages');
$this->addArgument(self::ARGUMENT_ACCOUNT_ID, InputArgument::REQUIRED);
$this->addOption(self::ARGUMENT_SHUFFLE, null, null, 'Shuffle data set before training');
$this->addOption(
self::ARGUMENT_DRY_RUN,
null,
null,
'Don\'t persist classifier after training'
);
$this->addOption(
self::ARGUMENT_FORCE,
null,
null,
'Train an estimator even if the classification is disabled by the user'
);
}

/**
* @return int
*/
protected function execute(InputInterface $input, OutputInterface $output): int {
$accountId = (int)$input->getArgument(self::ARGUMENT_ACCOUNT_ID);
$shuffle = (bool)$input->getOption(self::ARGUMENT_SHUFFLE);
$dryRun = (bool)$input->getOption(self::ARGUMENT_DRY_RUN);
$force = (bool)$input->getOption(self::ARGUMENT_FORCE);

try {
$account = $this->accountService->findById($accountId);
} catch (DoesNotExistException $e) {
$output->writeln("<error>account $accountId does not exist</error>");
return 1;
}
if (!$this->classificationSettingsService->isClassificationEnabled($account->getUserId())) {

if (!$force && !$this->classificationSettingsService->isClassificationEnabled($account->getUserId())) {
$output->writeln("<info>classification is turned off for account $accountId</info>");
return 2;
}
Expand All @@ -71,9 +85,13 @@ protected function execute(InputInterface $input, OutputInterface $output): int
$this->logger,
$output
);

$this->classifier->train(
$account,
$consoleLogger
$consoleLogger,
null,
$shuffle,
!$dryRun
);

$mbs = (int)(memory_get_peak_usage() / 1024 / 1024);
Expand Down
Loading
Loading