fix: auto delete old transfer-processes if their count exceeds a certain limit (#449)

Co-authored-by: Simon Engmann <[email protected]>
Co-authored-by: Tim Berthold <[email protected]>
3 people authored Jul 24, 2023
1 parent 62569ef commit 049f051
Showing 3 changed files with 50 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -5,7 +5,7 @@ EDC_OAUTH_CLIENT_ID=override this via ENV vars
# For docker-compose.yaml only:
# Override images via ENV vars
RELEASE_EDC_IMAGE=ghcr.io/sovity/edc-ce-mds:4.0.1
-RELEASE_EDC_UI_IMAGE=ghcr.io/sovity/edc-ui:0.0.1-milestone-8-sovity11
+RELEASE_EDC_UI_IMAGE=ghcr.io/sovity/edc-ui:0.0.1-milestone-8-sovity12

# For docker-compose-dev.yaml only:
# Override images via ENV vars
10 changes: 8 additions & 2 deletions DatabaseMigrationManager.java
@@ -20,6 +20,7 @@

import java.util.Arrays;
import java.util.List;
+import java.util.stream.Collectors;

public class DatabaseMigrationManager {
@Setting
@@ -45,9 +46,14 @@ public void migrateAllDataSources() {

    private List<String> getDataSourceNames(Config config) {
        var edcDatasourceConfig = config.getConfig(EDC_DATASOURCE_PREFIX);
-       return edcDatasourceConfig.partition().toList().stream()
+       var dataSourceNames = edcDatasourceConfig.partition().toList().stream()
                .map(Config::currentNode)
-               .toList();
+               .collect(Collectors.toList());
+       // The default data source is always migrated last
+       if (dataSourceNames.remove(DEFAULT_DATASOURCE)) {
+           dataSourceNames.add(DEFAULT_DATASOURCE);
+       }
+       return dataSourceNames;
    }

public List<String> getAdditionalFlywayMigrationLocations(String datasourceName) {
41 changes: 41 additions & 0 deletions (new SQL migration)
@@ -0,0 +1,41 @@
-- Required for reasonably fast ON DELETE CASCADE from edc_transfer_process
create index data_request_transfer_process_id_idx
    on edc_data_request (transfer_process_id);

-- Speed up sort + limit query
-- Include transferprocess_id to enable index-only scan
create index transfer_process_created_at_idx
    on edc_transfer_process (created_at) include (transferprocess_id);

-- Delete the oldest rows when the table size exceeds 3000 rows
-- The row count should mostly stabilize slightly above 3000, as the reltuples data in pg_class is only updated by VACUUM
-- Unfortunately, I was not able to get conclusive results on the behavior under concurrent inserts
-- One problem: the table might still grow over time if concurrent inserts end up deleting the same row
-- To counter this, the trigger deletes two rows instead of just one
-- The table then shrinks until the next auto-vacuum detects that it is below 3000 rows again
create function transfer_process_delete_old_rows() returns trigger as $$
begin
    delete from edc_transfer_process o
        using (
            select i2.transferprocess_id
            from edc_transfer_process i2
            order by i2.created_at
            limit 2
        ) i,
        (
            -- Hack to avoid count(*), which takes several hundred milliseconds
            -- Not perfectly accurate, but close enough
            -- Idea taken from: https://www.cybertec-postgresql.com/en/postgresql-count-made-fast/
            select pgc.reltuples::bigint as count
            from pg_catalog.pg_class pgc
            where pgc.relname = 'edc_transfer_process'
        ) c
    where i.transferprocess_id = o.transferprocess_id and c.count > 3000;

    return null;
end;
$$ language plpgsql;

create trigger delete_old_rows after insert
on edc_transfer_process
for each row
execute function transfer_process_delete_old_rows();
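
A note on the first index in this migration: ON DELETE CASCADE has to find every edc_data_request row referencing a deleted transfer process, and without an index on the referencing column PostgreSQL falls back to a sequential scan per deleted row. A minimal way to observe the effect, assuming a psql session against the migrated database (the id value is a hypothetical placeholder):

explain (analyze)
delete from edc_transfer_process
where transferprocess_id = 'some-transfer-process-id';
-- In the ANALYZE output, the "Trigger for constraint ..." line that enforces
-- the cascade should now be cheap, since it can use
-- data_request_transfer_process_id_idx instead of scanning edc_data_request.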
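Similarly, the include (transferprocess_id) clause exists so the trigger's sort + limit subquery can be answered from the index alone. A sketch of how to verify this, assuming the table has been vacuumed so its visibility map is populated:

explain (analyze, buffers)
select i2.transferprocess_id
from edc_transfer_process i2
order by i2.created_at
limit 2;
-- Expected plan node: "Index Only Scan using transfer_process_created_at_idx"
-- with "Heap Fetches: 0" once VACUUM has marked the pages all-visible.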
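Finally, the reltuples shortcut inside the trigger trades accuracy for speed: pg_class.reltuples is a planner estimate refreshed by VACUUM and ANALYZE, which is why the comment above predicts the row count stabilizing slightly above 3000 rather than at it. A quick sanity check of the estimate against the exact count, for an interactive session:

analyze edc_transfer_process;  -- refreshes reltuples in pg_class

select pgc.reltuples::bigint as estimated_rows
from pg_catalog.pg_class pgc
where pgc.relname = 'edc_transfer_process';

select count(*) as exact_rows  -- exact, but slow on large tables
from edc_transfer_process;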
