Detect the prerequisites of user query (#307)
IANTHEREAL authored Sep 26, 2024
1 parent 465cfe6 commit fe31504
Showing 8 changed files with 156 additions and 34 deletions.
1 change: 1 addition & 0 deletions backend/app/core/config.py
@@ -78,6 +78,7 @@ def server_host(self) -> str:
TIDB_AI_API_KEY: SecretStr | None = None

COMPLIED_INTENT_ANALYSIS_PROGRAM_PATH: str | None = None
COMPLIED_PREREQUISITE_ANALYSIS_PROGRAM_PATH: str | None = None

# CAUTION: Do not change EMBEDDING_DIMS after initializing the database.
# Changing the embedding dimensions requires recreating the database and tables.
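As an aside, a minimal sketch of pointing the new setting at the compiled program added later in this commit; reading it from an environment variable is an assumption based on how settings modules like this are usually populated, not something shown in the diff:

import os

# Hypothetical wiring: mirror the existing COMPLIED_INTENT_ANALYSIS_PROGRAM_PATH
# by pointing the new setting at the compiled DSPy program shipped in this commit.
os.environ["COMPLIED_PREREQUISITE_ANALYSIS_PROGRAM_PATH"] = (
    "backend/dspy_compiled_program/decompose_prerequisites_program"
)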
6 changes: 3 additions & 3 deletions backend/app/rag/chat.py
@@ -256,7 +256,7 @@ def _get_llamaindex_callback_manager():
),
)
graph_index._callback_manager = _get_llamaindex_callback_manager()
intent_relationships = graph_index.intent_analyze(
sub_queries = graph_index.intent_analyze(
self.user_question,
self.chat_history,
)
@@ -269,8 +269,8 @@
),
)
graph_index._callback_manager = _get_llamaindex_callback_manager()
result = graph_index.intent_based_search(
intent_relationships,
result = graph_index.graph_semantic_search(
sub_queries,
include_meta=True,
relationship_meta_filters=kg_config.relationship_meta_filters,
)
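Read together, the renames make graph_semantic_search operate on plain string sub-queries, so output from either analyzer can feed it. A rough sketch of that flow, reusing names from the method above (graph_index, self.user_question, kg_config); whether chat.py actually combines both analyzers is not shown in this diff:

# Sketch only, not code from this commit.
sub_queries = graph_index.intent_analyze(self.user_question, self.chat_history)
prerequisite_questions = graph_index.prerequisite_analyze(
    self.user_question, self.chat_history
)
# Both calls now return List[str], so they can share one search entry point.
result = graph_index.graph_semantic_search(
    sub_queries + prerequisite_questions,
    include_meta=True,
    relationship_meta_filters=kg_config.relationship_meta_filters,
)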
49 changes: 31 additions & 18 deletions backend/app/rag/default_prompt.py
@@ -1,5 +1,5 @@
DEFAULT_INTENT_GRAPH_KNOWLEDGE = """\
Given a list of relationships of a knowledge graph as follows. When there is a conflict in meaning between knowledge relationships, the relationship with the higher `weight` and newer `last_modified_at` value takes precedence.
Given a list of prerequisite questions and their relevant knowledge for the user's main question, when conflicts in meaning arise, prioritize the relationship with the higher weight and the more recent version.
Knowledge sub-queries:
@@ -21,8 +21,7 @@
{% for relationship in data['relationships'] %}
- Description: {{ relationship.rag_description }}
- Last Modified At: {{ relationship.last_modified_at }}
- Meta: {{ relationship.meta | tojson(indent=2) }}
- Weight: {{ relationship.weight }}
{% endfor %}
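For reference, a self-contained sketch of how a fragment like the loop above renders with plain Jinja2; the sample relationship values are invented, and only fields visible in the template are used:

from jinja2 import Template

fragment = Template(
    "{% for relationship in data['relationships'] %}"
    "- Description: {{ relationship.rag_description }}\n"
    "  Last Modified At: {{ relationship.last_modified_at }}\n"
    "  Weight: {{ relationship.weight }}\n"
    "{% endfor %}"
)
sample = {"relationships": [{
    "rag_description": "TiDB 8.1 improves query performance over TiDB 6.5.",
    "last_modified_at": "2024-09-01",
    "weight": 0.9,
}]}
print(fragment.render(data=sample))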
@@ -58,38 +57,52 @@

DEFAULT_CONDENSE_QUESTION_PROMPT = """\
---------------------
Knowledge graph information is below
The prerequisite questions and their relevant knowledge for the user's main question.
---------------------
{{graph_knowledges}}
---------------------
Given a conversation (between Human and Assistant) and a follow-up message from the Human, use the context from the previous conversation to rewrite the follow-up message into a standalone, detailed question (Note: The language should be consistent with the follow-up message from Human). Ensure the refined question captures all relevant context and is written in a way that maximizes the effectiveness of a vector search to retrieve precise and comprehensive information.
Task:
Given the conversation between the Human and Assistant, along with the follow-up message from the Human, and the provided prerequisite questions and relevant knowledge, refine the Human’s follow-up message into a standalone, detailed question.
Key considerations:
Instructions:
1. Focus on the latest query from the Human, ensuring it is given the most weight.
2. Utilize knowledge graph and the history messages to provide relevant context and background information.
3. Ensure the refined question is suitable for vector search by emphasizing specific and relevant terms.
4. Ensure the refined question is grounded and factual, directly based on the user's follow-up question.
2. Incorporate Key Information:
- Use the prerequisite questions and their relevant knowledge to add specific details to the follow-up question.
- Replace ambiguous terms or references in the follow-up question with precise information from the provided knowledge. Example: Replace “latest version” with the actual version number mentioned in the knowledge.
3. Utilize Conversation Context:
- Incorporate relevant context and background information from the conversation history to enhance the question's specificity.
4. Optimize for Retrieval:
- Ensure the refined question emphasizes specific and relevant terms to maximize the effectiveness of a vector search for retrieving precise and comprehensive information.
5. Grounded and Factual:
- Make sure the refined question is grounded in and directly based on the user's follow-up question and the provided knowledge.
- Do not introduce information that is not supported by the knowledge or conversation history.
Example:
Chat history:
Chat History:
Human: "I'm exploring options for a database solution for my company's needs."
Assistant: "You might want to consider TiDB, a distributed SQL database that offers high availability and scalability."
Human: "I'm interested in the performance improvements in the latest version of TiDB."
Assistant: "TiDB version 8.1 was released recently with significant performance enhancements over version 6.5."
Followup question:
Follow-up Question:
"What is TiDB?"
Refined standalone question:
"Can you tell me more about these improvements?"
Can you provide an in-depth explanation of TiDB, including its architecture, key features, and how it ensures high availability and scalability?
Prerequisite Questions and Relevant Knowledge:
- Prerequisite Question: What is the latest version of TiDB?
- Relevant Knowledge: The latest version of TiDB is 8.1.
---------------------
...
Refined Standalone Question:
"Can you provide detailed information about the performance improvements introduced in TiDB version 8.1 compared to version 6.5?"
Your Turn:
Chat history:
52 changes: 42 additions & 10 deletions backend/app/rag/knowledge_graph/base.py
@@ -18,6 +18,7 @@

from app.rag.knowledge_graph.extractor import SimpleGraphExtractor
from app.rag.knowledge_graph.intent import IntentAnalyzer, RelationshipReasoning
from app.rag.knowledge_graph.prerequisite import PrerequisiteAnalyzer, Prerequisites
from app.rag.types import MyCBEventType
from app.core.config import settings
from app.core.db import Scoped_Session
@@ -88,6 +89,10 @@ def __init__(
dspy_lm=dspy_lm,
complied_program_path=settings.COMPLIED_INTENT_ANALYSIS_PROGRAM_PATH,
)
self._prerequisites_analyzer = PrerequisiteAnalyzer(
dspy_lm=dspy_lm,
compiled_program_path=settings.COMPLIED_PREREQUISITE_ANALYSIS_PROGRAM_PATH,
)

super().__init__(
nodes=nodes,
@@ -210,7 +215,7 @@ def intent_analyze(
self,
query: str,
chat_history: list = [],
) -> List[RelationshipReasoning]:
) -> List[str]:
"""Analyze the intent of the query."""
chat_content = query
if len(chat_history) > 0:
@@ -230,12 +235,44 @@
payload={EventPayload.QUERY_STR: chat_content},
) as event:
intents = self._intents.analyze(chat_content)
event.on_end(payload={"relationships": intents.relationships})
return intents.relationships
semantic_queries = [
f"{r.source_entity} -> {r.relationship_desc} -> {r.target_entity}"
for r in intents.relationships
]
event.on_end(payload={"semantic_queries": semantic_queries})

return semantic_queries

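Concretely (values invented for illustration), the strings returned by intent_analyze now look like the following, rather than RelationshipReasoning objects:

# Hypothetical output for a question about TiDB performance:
semantic_queries = [
    "TiDB -> has latest version -> TiDB 8.1",
    "TiDB 8.1 -> improves performance over -> TiDB 6.5",
]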
def prerequisite_analyze(
self,
query: str,
chat_history: list = [],
) -> List[str]:
"""Analyze the prerequisite of the query."""
chat_content = query
if len(chat_history) > 0:
chat_history_strings = [
f"{message.role.value}: {message.content}" for message in chat_history
]
query_with_history = (
"++++ Chat History ++++\n"
+ "\n".join(chat_history_strings)
+ "++++ Chat History ++++\n"
)
chat_content = query_with_history + "\n\nThen the user asks:\n" + query

def intent_based_search(
with self._callback_manager.as_trace("prerequisites_based_search"):
with self._callback_manager.event(
MyCBEventType.INTENT_DECOMPOSITION,
payload={EventPayload.QUERY_STR: chat_content},
) as event:
prerequisites = self._prerequisites_analyzer.analyze(chat_content)
event.on_end(payload={"semantic_queries": prerequisites.questions})
return prerequisites.questions

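A small self-contained illustration of the chat_content string that prerequisite_analyze builds before handing it to the analyzer; the message objects here are stand-ins for whatever chat-history type the caller uses (anything exposing .role.value and .content):

from types import SimpleNamespace

# Invented two-turn history; real callers pass their own message objects.
chat_history = [
    SimpleNamespace(role=SimpleNamespace(value="user"),
                    content="I'm interested in the latest TiDB improvements."),
    SimpleNamespace(role=SimpleNamespace(value="assistant"),
                    content="TiDB 8.1 adds significant performance enhancements."),
]
query = "Can you tell me more about these improvements?"

chat_history_strings = [f"{m.role.value}: {m.content}" for m in chat_history]
chat_content = (
    "++++ Chat History ++++\n"
    + "\n".join(chat_history_strings)
    + "++++ Chat History ++++\n"
    + "\n\nThen the user asks:\n"
    + query
)
print(chat_content)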
def graph_semantic_search(
self,
intent_relationships: List[RelationshipReasoning],
semantic_queries: List[str],
depth: int = 2,
include_meta: bool = False,
relationship_meta_filters: Dict = {},
@@ -272,11 +309,6 @@ def add_relationships(relationships):
}
)

semantic_queries = [
f"{r.source_entity} -> {r.relationship_desc} -> {r.target_entity}"
for r in intent_relationships
]

def process_query(sub_query):
logger.info(f"Processing query: {sub_query}")
tmp_session = Scoped_Session()
57 changes: 57 additions & 0 deletions backend/app/rag/knowledge_graph/prerequisite.py
@@ -0,0 +1,57 @@
import logging
import dspy
from dspy.functional import TypedChainOfThought, TypedPredictor
from typing import List, Optional
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)

class Prerequisites(BaseModel):
"""Decomposed prerequisite questions extracted from the main query"""

questions: List[str] = Field(
description="List of prerequisite questions necessary for answering the main query."
)

class DecomposePrerequisites(dspy.Signature):
"""You are an expert in query analysis and decomposition. Your task is to identify any prerequisite questions that need to be answered in order to fully address the main query.
Step-by-Step Analysis:
1. Analyze the main query to identify terms, references, or concepts that are unclear, ambiguous, or require additional information.
- Focus on entities like versions, specific terminologies, or any references that may not be immediately clear.
2. Formulate clear and concise prerequisite questions that, when answered, will provide the necessary information to address the main query effectively.
## Instructions:
- Limit the number of prerequisite questions to no more than 3.
- Ensure that the prerequisite questions are directly relevant and necessary for answering the main query.
- Do not include unnecessary or unrelated questions.
- Ensure that the questions are grounded and factual, based on the query provided.
"""

query: str = dspy.InputField(
desc="The main query text that needs to be analyzed for prerequisite questions."
)
prerequisites: Prerequisites = dspy.OutputField(
desc="The decomposed prerequisite questions extracted from the main query."
)

class DecomposePrerequisitesModule(dspy.Module):
def __init__(self, dspy_lm: dspy.LM):
super().__init__()
self.dspy_lm = dspy_lm
self.prog = TypedChainOfThought(DecomposePrerequisites)

def forward(self, query):
with dspy.settings.context(lm=self.dspy_lm):
return self.prog(query=query)

class PrerequisiteAnalyzer:
def __init__(self, dspy_lm: dspy.LM, compiled_program_path: Optional[str] = None):
self.prerequisite_analysis_prog = DecomposePrerequisitesModule(dspy_lm=dspy_lm)
if compiled_program_path is not None:
self.prerequisite_analysis_prog.load(compiled_program_path)

def analyze(self, query: str) -> Prerequisites:
return self.prerequisite_analysis_prog(query=query).prerequisites
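A hedged usage sketch for the new module; the LM construction is an assumption (any DSPy-compatible LM works), and the sample query is borrowed from the compiled program's demos further below:

import dspy
from app.rag.knowledge_graph.prerequisite import PrerequisiteAnalyzer

# Assumption: an OpenAI-backed DSPy LM; substitute whatever LM the deployment uses.
lm = dspy.OpenAI(model="gpt-4o-mini", max_tokens=1024)

analyzer = PrerequisiteAnalyzer(
    dspy_lm=lm,
    # Path relative to the repository root, as added in this commit.
    compiled_program_path="backend/dspy_compiled_program/decompose_prerequisites_program",
)
prerequisites = analyzer.analyze(
    "summary the performance improvement from version 6.5 to newest version for TiDB"
)
print(prerequisites.questions)
# Expected shape (exact wording depends on the LM):
# ['What is the newest version of TiDB?', 'What performance enhancements ...']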
6 changes: 3 additions & 3 deletions backend/app/rag/retrieve.py
@@ -67,9 +67,9 @@ def _retrieve(self, question: str, top_k: int) -> List[Document]:
)

if kg_config.using_intent_search:
intent_relationships = graph_index.intent_analyze(question)
result = graph_index.intent_based_search(
intent_relationships, include_meta=True
sub_queries = graph_index.intent_analyze(question)
result = graph_index.graph_semantic_search(
sub_queries, include_meta=True
)

graph_knowledges = get_prompt_by_jinja2_template(
19 changes: 19 additions & 0 deletions backend/dspy_compiled_program/decompose_prerequisites_program
@@ -0,0 +1,19 @@
{
"prog.predictor": {
"lm": null,
"traces": [],
"train": [],
"demos": [
{
"query": "Explain the impact of artificial intelligence on modern education.",
"prerequisites": "{\"questions\":[\"What are the current applications of artificial intelligence in education?\",\"What defines modern education in today's context?\",\"How is artificial intelligence different from traditional educational technologies?\"]}"
},
{
"query": "summary the performance improvement from version 6.5 to newest version for TiDB",
"prerequisites": "{\"questions\":[\"What is the newest version of TiDB?\",\"What performance enhancements have been made in TiDB versions between 6.5 and the newest version?\"]}"
}
],
"signature_instructions": "You are an expert in query analysis and decomposition. Your task is to identify any prerequisite questions that need to be answered in order to fully address the main query.\n\nStep-by-Step Analysis:\n\n1. Analyze the main query to identify terms, references, or concepts that are unclear, ambiguous, or require additional information.\n - Focus on entities like versions, specific terminologies, or any references that may not be immediately clear.\n2. Formulate clear and concise prerequisite questions that, when answered, will provide the necessary information to address the main query effectively.\n\n## Instructions:\n\n- Limit the number of prerequisite questions to no more than 3.\n- Ensure that the prerequisite questions are directly relevant and necessary for answering the main query.\n- Do not include unnecessary or unrelated questions.\n- Ensure that the questions are grounded and factual, based on the query provided.",
"signature_prefix": "Prerequisites:"
}
}
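Note that the "prerequisites" values in the demos above are JSON-encoded strings; they deserialize into the Prerequisites model defined in prerequisite.py. A standalone check (the model is redeclared here only so the snippet runs on its own; model_validate_json assumes pydantic v2):

from typing import List
from pydantic import BaseModel, Field

class Prerequisites(BaseModel):
    questions: List[str] = Field(
        description="List of prerequisite questions necessary for answering the main query."
    )

raw = '{"questions":["What is the newest version of TiDB?","What performance enhancements have been made in TiDB versions between 6.5 and the newest version?"]}'
print(Prerequisites.model_validate_json(raw).questions)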
Binary file not shown.
