From 83026bdb264507fcb786a5c4975cbf5354a8d76a Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Mon, 28 Oct 2024 22:56:07 -0600 Subject: [PATCH] Remove duplicated entries from relationships and nodes (#1333) --- .semversioner/next-release/patch-20241029003722814474.json | 4 ++++ graphrag/index/flows/create_final_nodes.py | 3 ++- graphrag/index/flows/create_final_relationships.py | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 .semversioner/next-release/patch-20241029003722814474.json diff --git a/.semversioner/next-release/patch-20241029003722814474.json b/.semversioner/next-release/patch-20241029003722814474.json new file mode 100644 index 0000000000..06ae5d083a --- /dev/null +++ b/.semversioner/next-release/patch-20241029003722814474.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Remove duplicated relationships and nodes" +} diff --git a/graphrag/index/flows/create_final_nodes.py b/graphrag/index/flows/create_final_nodes.py index fb0b68904a..871f280e62 100644 --- a/graphrag/index/flows/create_final_nodes.py +++ b/graphrag/index/flows/create_final_nodes.py @@ -69,4 +69,5 @@ async def create_final_nodes( ) joined.rename(columns={"label": "title", "cluster": "community"}, inplace=True) - return joined + # TODO: Find duplication source + return joined.drop_duplicates(subset=["title", "community"]) diff --git a/graphrag/index/flows/create_final_relationships.py b/graphrag/index/flows/create_final_relationships.py index ba82c5bc63..0f283cf621 100644 --- a/graphrag/index/flows/create_final_relationships.py +++ b/graphrag/index/flows/create_final_relationships.py @@ -66,4 +66,5 @@ async def create_final_relationships( "text_unit_ids" ].str.split(",") - return edge_combined_degree + # TODO: Find duplication source + return edge_combined_degree.drop_duplicates(subset=["source", "target"])