fix: auto delete old transfer-processes if their count exceeds a certain limit (#449)

Co-authored-by: Simon Engmann <[email protected]>
Co-authored-by: Tim Berthold <[email protected]>
3 people authored Jul 24, 2023
1 parent 62569ef commit 049f051
Showing 3 changed files with 50 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -5,7 +5,7 @@ EDC_OAUTH_CLIENT_ID=override this via ENV vars
# For docker-compose.yaml only:
# Override images via ENV vars
RELEASE_EDC_IMAGE=ghcr.io/sovity/edc-ce-mds:4.0.1
-RELEASE_EDC_UI_IMAGE=ghcr.io/sovity/edc-ui:0.0.1-milestone-8-sovity11
+RELEASE_EDC_UI_IMAGE=ghcr.io/sovity/edc-ui:0.0.1-milestone-8-sovity12

# For docker-compose-dev.yaml only:
# Override images via ENV vars
10 changes: 8 additions & 2 deletions DatabaseMigrationManager.java
@@ -20,6 +20,7 @@

import java.util.Arrays;
import java.util.List;
+import java.util.stream.Collectors;

public class DatabaseMigrationManager {
@Setting
@@ -45,9 +46,14 @@ public void migrateAllDataSources() {

    private List<String> getDataSourceNames(Config config) {
        var edcDatasourceConfig = config.getConfig(EDC_DATASOURCE_PREFIX);
-       return edcDatasourceConfig.partition().toList().stream()
+       var dataSourceNames = edcDatasourceConfig.partition().toList().stream()
                .map(Config::currentNode)
-               .toList();
+               .collect(Collectors.toList());
+       // The default data source is always migrated last
+       if (dataSourceNames.remove(DEFAULT_DATASOURCE)) {
+           dataSourceNames.add(DEFAULT_DATASOURCE);
+       }
+       return dataSourceNames;
    }

public List<String> getAdditionalFlywayMigrationLocations(String datasourceName) {
41 changes: 41 additions & 0 deletions (new SQL migration)
@@ -0,0 +1,41 @@
-- Required for reasonably fast ON DELETE CASCADE from edc_transfer_process
create index data_request_transfer_process_id_idx
    on edc_data_request (transfer_process_id);

-- Speed up sort + limit query
-- Include transferprocess_id to enable index-only scan
create index transfer_process_created_at_idx
    on edc_transfer_process (created_at) include (transferprocess_id);

-- Delete the oldest rows when the table size exceeds 3000 rows
-- The row count should mostly stabilize slightly above 3000, as the reltuples data in pg_class is only updated by VACUUM
-- Unfortunately, I was not able to get conclusive results on the behavior under concurrent inserts
-- One problem: the table might still grow over time if concurrent inserts end up deleting the same row
-- To counter this, the trigger deletes two rows instead of just one
-- The table then shrinks until the next auto-vacuum detects that it is below 3000 rows again
create function transfer_process_delete_old_rows() returns trigger as $$
begin
    delete from edc_transfer_process o
        using (
            select i2.transferprocess_id
            from edc_transfer_process i2
            order by i2.created_at
            limit 2
        ) i,
        (
            -- Hack to avoid count(*), which takes several hundred milliseconds
            -- Not perfectly accurate, but close enough
            -- Idea taken from: https://www.cybertec-postgresql.com/en/postgresql-count-made-fast/
            select pgc.reltuples::bigint as count
            from pg_catalog.pg_class pgc
            where pgc.relname = 'edc_transfer_process'
        ) c
    where i.transferprocess_id = o.transferprocess_id and c.count > 3000;

    return null;
end;
$$ language plpgsql;

create trigger delete_old_rows after insert
on edc_transfer_process
for each row
execute function transfer_process_delete_old_rows();
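
A note on the first index in this migration: ON DELETE CASCADE has to find every edc_data_request row referencing a deleted transfer process, and without an index on the referencing column PostgreSQL falls back to a sequential scan per deleted row. A minimal way to observe the effect, assuming a psql session against the migrated database (the id value is a hypothetical placeholder):

explain (analyze)
delete from edc_transfer_process
where transferprocess_id = 'some-transfer-process-id';
-- In the ANALYZE output, the "Trigger for constraint ..." line that enforces
-- the cascade should now be cheap, since it can use
-- data_request_transfer_process_id_idx instead of scanning edc_data_request.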
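Similarly, the include (transferprocess_id) clause exists so the trigger's sort + limit subquery can be answered from the index alone. A sketch of how to verify this, assuming the table has been vacuumed so its visibility map is populated:

explain (analyze, buffers)
select i2.transferprocess_id
from edc_transfer_process i2
order by i2.created_at
limit 2;
-- Expected plan node: "Index Only Scan using transfer_process_created_at_idx"
-- with "Heap Fetches: 0" once VACUUM has marked the pages all-visible.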
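Finally, the reltuples shortcut inside the trigger trades accuracy for speed: pg_class.reltuples is a planner estimate refreshed by VACUUM and ANALYZE, which is why the comment above predicts the row count stabilizing slightly above 3000 rather than at it. A quick sanity check of the estimate against the exact count, for an interactive session:

analyze edc_transfer_process;  -- refreshes reltuples in pg_class

select pgc.reltuples::bigint as estimated_rows
from pg_catalog.pg_class pgc
where pgc.relname = 'edc_transfer_process';

select count(*) as exact_rows  -- exact, but slow on large tables
from edc_transfer_process;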
