Skip to content

Commit

Permalink
Update Incremental Indexing to new embeddings workflow (#1359)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlonsoGuevara authored Nov 5, 2024
1 parent 83bd5ce commit 80c0c7b
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 105 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241105223157965625.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Update Incremental Indexing to new embeddings workflow"
}
2 changes: 2 additions & 0 deletions graphrag/index/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,15 @@ async def run_pipeline_with_config(
):
tables_dict[table.workflow] = table.result

progress_reporter.success("Finished running workflows on new documents.")
await update_dataframe_outputs(
dataframe_dict=tables_dict,
storage=storage,
update_storage=update_index_storage,
config=config,
cache=cache,
callbacks=NoopVerbCallbacks(),
progress_reporter=progress_reporter,
)

else:
Expand Down
10 changes: 4 additions & 6 deletions graphrag/index/run/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ async def _process_workflow(


def _find_workflow_config(
config: PipelineConfig, workflow_name: str, step: str
config: PipelineConfig, workflow_name: str, step: str | None = None
) -> dict:
"""Find a workflow in the pipeline configuration.
Expand Down Expand Up @@ -147,8 +147,6 @@ def _find_workflow_config(
)
raise ValueError(error_message) from err

return (
workflow.config.get(step, {})
if workflow.config and step in workflow.config
else {}
)
if not workflow.config:
return {}
return workflow.config if not step else workflow.config.get(step, {})
49 changes: 0 additions & 49 deletions graphrag/index/update/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.config.pipeline import PipelineConfig
from graphrag.index.operations.embed_text import embed_text
from graphrag.index.operations.summarize_descriptions.strategies import (
run_graph_intelligence as run_entity_summarization,
)
Expand Down Expand Up @@ -67,8 +66,6 @@ def _group_and_resolve_entities(
"description": lambda x: list(x.astype(str)), # Ensure str
# Concatenate nd.array into a single list
"text_unit_ids": lambda x: ",".join(str(i) for j in x.tolist() for i in j),
# Keep only descriptions where the original value wasn't modified
"description_embedding": lambda x: x.iloc[0] if len(x) == 1 else np.nan,
})
.reset_index()
)
Expand All @@ -87,7 +84,6 @@ def _group_and_resolve_entities(
"human_readable_id",
"graph_embedding",
"text_unit_ids",
"description_embedding",
],
]

Expand Down Expand Up @@ -141,48 +137,3 @@ async def process_row(row):
entities_df["description"] = results

return entities_df


async def _run_entity_description_embedding(
    entities_df: pd.DataFrame,
    config: PipelineConfig,
    cache: PipelineCache,
    callbacks: VerbCallbacks,
) -> pd.DataFrame:
    """Embed each entity's "name:description" text.

    The embedding settings are read from the ``entity_name_description_embed``
    step of the ``create_final_entities`` workflow in the pipeline
    configuration; only the ``strategy`` entry is used, defaulting to an
    empty dict when it is absent.

    Note that ``entities_df`` is modified in place: a temporary
    ``name_description`` column and the ``description_embedding`` column are
    both assigned on the frame that was passed in. The returned frame is the
    result of dropping the temporary column.

    Parameters
    ----------
    entities_df : pd.DataFrame
        The entities dataframe; must provide ``name`` and ``description``
        columns.
    config : PipelineConfig
        The pipeline configuration.
    cache : PipelineCache
        Pipeline cache handed to the embedding operation.
    callbacks : VerbCallbacks
        Callbacks handed to the embedding operation.

    Returns
    -------
    pd.DataFrame
        The entities dataframe with a ``description_embedding`` column and
        without the temporary ``name_description`` column.
    """
    step_config = _find_workflow_config(
        config, "create_final_entities", "entity_name_description_embed"
    )

    # Text that gets embedded: entity name and description joined by a colon.
    names = entities_df["name"]
    descriptions = entities_df["description"]
    entities_df["name_description"] = names + ":" + descriptions

    embeddings = await embed_text(
        entities_df,
        callbacks,
        cache,
        embed_column="name_description",
        embedding_name="entity.description",
        strategy=step_config.get("strategy", {}),
    )
    entities_df["description_embedding"] = embeddings

    # The concatenated text column was only needed as embedding input.
    return entities_df.drop(columns=["name_description"])
Loading

0 comments on commit 80c0c7b

Please sign in to comment.