From ae9cceebc837716b02d74d07642259f8abad4c1c Mon Sep 17 00:00:00 2001 From: Ian Date: Wed, 25 Sep 2024 16:27:17 +0800 Subject: [PATCH] step-by-step intent detection (#308) --- backend/app/rag/knowledge_graph/intent.py | 26 +- .../decompose_query_program | 39 +- .../decompose_query_samples.json | 135 +++++++ backend/dspy_compiled_program/sql_sample.csv | 21 -- .../sql_sample_extraction.csv | 85 ----- .../sql_sample_extraction_program | 19 - .../sql_sample_gen_program | 20 - .../sql_samples_gen.ipynb | 354 ------------------ 8 files changed, 161 insertions(+), 538 deletions(-) create mode 100644 backend/dspy_compiled_program/decompose_query_samples.json delete mode 100644 backend/dspy_compiled_program/sql_sample.csv delete mode 100644 backend/dspy_compiled_program/sql_sample_extraction.csv delete mode 100644 backend/dspy_compiled_program/sql_sample_extraction_program delete mode 100644 backend/dspy_compiled_program/sql_sample_gen_program delete mode 100644 backend/dspy_compiled_program/sql_samples_gen.ipynb diff --git a/backend/app/rag/knowledge_graph/intent.py b/backend/app/rag/knowledge_graph/intent.py index 3fff209c..b9cf9f98 100644 --- a/backend/app/rag/knowledge_graph/intent.py +++ b/backend/app/rag/knowledge_graph/intent.py @@ -14,7 +14,7 @@ class RelationshipReasoning(Relationship): reasoning: str = Field( description=( - "Category reasoning for the relationship, e.g., 'the main conerns of the user', 'the problem the user is facing', 'the user case scenario', etc." + "Explanation of the user's intention for this step." ) ) @@ -23,32 +23,28 @@ class DecomposedFactors(BaseModel): """Decomposed factors extracted from the query to form the knowledge graph""" relationships: List[RelationshipReasoning] = Field( - description="List of relationships to represent critical concepts and their relationships extracted from the query." + description="List of relationships representing the user's prerequisite and step-by-step intentions extracted from the query."
) class DecomposeQuery(dspy.Signature): - """You are a knowledge base graph expert and are very good at building knowledge graphs. Now you are assigned to extract the most critical concepts and their relationships from the query. Step-by-Step Analysis: - - 1. Extract Meaningful user intents and questions: - - Identify the question what the user itentionally asked, focusing on the the critial information about user's main concerns/questions/problems/use cases, etc. - - Make this question simple and clear and ensure that it is directly related to the user's main concerns. Simple and clear question can improve the search accuracy. - 2. Establish Relationships to describe the user's intents: - - Define relationships that accurately represent the user's query intent and information needs. - - Format each relationship as: (Source Entity) - [Relationship] -> (Target Entity), where the relationship describes what the user wants to know about the connection between these entities. + """You are a knowledge base graph expert and are very good at building knowledge graphs. Now you are assigned to extract the user's step-by-step intentions from the query. ## Instructions: - - Limit to no more than 3 pairs. These pairs must accurately reflect the user's real (sub)questions. - - Ensure that the extracted pairs are of high quality and do not introduce unnecessary search elements. - - Ensure that the relationships and intents are grounded and factual, based on the information provided in the query. + - Break down the user's query into a sequence of prerequisite questions (e.g., identifying specific versions) and step-by-step intentions. + - Represent each prerequisite and intention as a relationship: (Source Entity) - [Relationship] -> (Target Entity). + - Provide reasoning for each relationship, explaining the user's intention at that step. + - Limit to no more than 5 relationships. + - Ensure that the extracted relationships accurately reflect the user's real intentions. 
+ - Ensure that the relationships and intentions are grounded and factual, based on the information provided in the query. """ query: str = dspy.InputField( - desc="The query text to extract the most critical concepts and their relationships from the query." + desc="The query text to extract the user's step-by-step intentions." ) factors: DecomposedFactors = dspy.OutputField( - desc="Factors representation of the critical concepts and their relationships extracted from the query." + desc="Representation of the user's step-by-step intentions extracted from the query." ) diff --git a/backend/dspy_compiled_program/decompose_query_program b/backend/dspy_compiled_program/decompose_query_program index dfda65e2..f5df3eee 100644 --- a/backend/dspy_compiled_program/decompose_query_program +++ b/backend/dspy_compiled_program/decompose_query_program @@ -5,48 +5,39 @@ "train": [], "demos": [ { - "augmented": true, - "query": "Chat2query is returning an error message saying \"Query timeout expired\". Additionally, I am unable to locate this SQL query in the slow query log.", - "factors": "```json\n{\n \"relationships\": [\n {\n \"source_entity\": \"Chat2query\",\n \"target_entity\": \"Error Message\",\n \"relationship_desc\": \"Chat2query is returning an error message saying 'Query timeout expired'.\",\n \"reasoning\": \"The main problem the user is facing.\"\n },\n {\n \"source_entity\": \"SQL Query\",\n \"target_entity\": \"Slow Query Log\",\n \"relationship_desc\": \"The user is unable to locate the SQL query in the slow query log.\",\n \"reasoning\": \"The secondary issue the user is facing.\"\n }\n ]\n}\n```" - }, - { - "query": "Hi, how do u setup tidb on debian vps?", - "factors": "{\"relationships\":[{\"source_entity\":\"TiDB Cluster\",\"target_entity\":\"Debian VPS\",\"relationship_desc\":\"How to deploy a TiDB Cluster on a Debian VPS? 
Should I use TiUP or TiDB Operator?\",\"reasoning\":\"The main question the user is asking.\"}]}" - }, - { - "query": "We are new to TiDB and don't quite understand the potential impact on our application architecture. We are using TiDB for audit logs and continue to direct traffic to TiDB. We noticed a sudden jump ID from 1 to 30,001. Are there any impacts? Do we need to address this? If we have 100 connections from several applications, what will happen? In summary, what should we do for Auto Increment or do nothing?", - "factors": "{\"relationships\":[{\"source_entity\":\"Auto Increment\",\"target_entity\":\"ID Jump\",\"relationship_desc\":\"Why Auto Increment in TiDB causes a sudden increase in the ID values?\",\"reasoning\":\"The main concerns that the user itentionally asked.\"},{\"source_entity\":\"Connections Impact\",\"target_entity\":\"TiDB\",\"relationship_desc\":\"How 100 connections from several applications affect TiDB, especially when the Auto Increment causes a sudden jump in ID values?\",\"reasoning\":\"The second most important question that the user itentionally asked.\"},{\"source_entity\":\"TiDB\",\"target_entity\":\"Audit Logs\",\"relationship_desc\":\"TiDB is used for storing audit logs and receiving continuous traffic.\",\"reasoning\":\"The user case what the user wants to achieve\"}]}" + "query": "tidb lighting to sync to serverless cluster,but the load command and the tidb-lighting tools dont have the tls config like --ssl-ca or --ca. 
so i can not sync to the full back data to the serverless", + "factors": "{\"relationships\":[{\"source_entity\":\"TiDB Lighting\",\"target_entity\":\"Serverless Cluster\",\"relationship_desc\":\"Sync data to a serverless cluster using TiDB Lighting.\",\"reasoning\":\"The user case what the user wants to achieve\"},{\"source_entity\":\"Load Command and TiDB Lighting Tools\",\"target_entity\":\"TLS Configuration\",\"relationship_desc\":\"How to configure TLS for TiDB Lightning?\",\"reasoning\":\"The basic question what the user itentionally asked.\"},{\"source_entity\":\"Lack of TLS Configuration\",\"target_entity\":\"Sync Issue\",\"relationship_desc\":\"The sync issue is caused by the lack of TLS configuration options for TiDB Lightning.\",\"reasoning\":\"The problem that the user is facing.\"}]}" }, { - "query": "my cluster shows \"available\" but my app containers can no longer reach the database", - "factors": "{\"relationships\":[{\"source_entity\":\"App Containers\",\"target_entity\":\"TiDB Database\",\"relationship_desc\":\"How to solve the connection issue between the app containers and the TiDB database?\",\"reasoning\":\"The main problem the user is facing.\"},{\"source_entity\":\"Connectivity Issue\",\"target_entity\":\"Cluster Status\",\"relationship_desc\":\"The connectivity issue exists despite the cluster status showing 'available'.\",\"reasoning\":\"The discrepancy the user is concerned about.\"}]}" + "query": "summary the performance improvement from version 6.5 to newest version for TiDB", + "factors": "{\"relationships\":[{\"source_entity\":\"TiDB\",\"target_entity\":\"the newest version\",\"relationship_desc\":\"what is the newest version of TiDB?\",\"reasoning\":\"The prerequist question need to be figured out.\"},{\"source_entity\":\"Performance Improvement\",\"target_entity\":\"TiDB 6.5 to Newest Version\",\"relationship_desc\":\"The performance improvement from TiDB 6.5 to the newest version.\",\"reasoning\":\"The main question the user is 
asking.\"}]}" }, { - "query": "I am current using tidb serverless, but as my product grows, I really need a dalicated cluster. Is there a solution helps finish the migration?", - "factors": "{\"relationships\":[{\"source_entity\":\"TiDB Serverless\",\"target_entity\":\"Dedicated Cluster\",\"relationship_desc\":\"How to migrate from TiDB serverless to TiDB dedicated cluster?\",\"reasoning\":\"The main concern of the user.\"}]}" + "query": "I am designing a table based on TiDB's TTL feature, but when I try to create the table using a cluster created with Serverless, I get a `'TTL' is not supported on TiDB Serverless` error.\n\nI plan to use Dedicated on my production environment and Serverless on my development environment, so it would be helpful if the TTL feature could be used in a Serverless environment.\n\nI've read the documentation that says Serverless will support TTL features in the future, but is there a specific timeline for this?\n\nAlso, is it possible to prevent TTL syntax from causing errors in Serverless?", + "factors": "{\"relationships\":[{\"source_entity\":\"TTL Feature\",\"target_entity\":\"TiDB Serverless\",\"relationship_desc\":\"The TTL feature is not currently supported in TiDB Serverless.\",\"reasoning\":\"The problem the user is facing.\"},{\"source_entity\":\"TTL Feature\",\"target_entity\":\"Roadmap Support Timeline\",\"relationship_desc\":\"What's the roadmap timeline on when the TTL feature will be supported in TiDB Serverless.\",\"reasoning\":\"The main question the user is asking.\"},{\"source_entity\":\"TTL SQL Syntax\",\"target_entity\":\"Workaround for SQL Syntax Error\",\"relationship_desc\":\"Workaround to prevent TTL feature SQL syntax from causing errors in TiDB Serverless.\",\"reasoning\":\"The secondary question the user is asking.\"}]}" }, { - "query": "Please speak Chinese", - "factors": "{\"relationships\":[{\"source_entity\":\"User\",\"target_entity\":\"Language\",\"relationship_desc\":\"The user is requesting to 
communicate in Chinese.\",\"reasoning\":\"the main concern of the user\"}]}" + "query": "We are new to TiDB and don't quite understand the potential impact on our application architecture. We are using TiDB for audit logs and continue to direct traffic to TiDB. We noticed a sudden jump ID from 1 to 30,001. Are there any impacts? Do we need to address this? If we have 100 connections from several applications, what will happen? In summary, what should we do for Auto Increment or do nothing?", + "factors": "{\"relationships\":[{\"source_entity\":\"Auto Increment\",\"target_entity\":\"ID Jump\",\"relationship_desc\":\"Why Auto Increment in TiDB causes a sudden increase in the ID values?\",\"reasoning\":\"The main concerns that the user itentionally asked.\"},{\"source_entity\":\"Connections Impact\",\"target_entity\":\"TiDB\",\"relationship_desc\":\"How 100 connections from several applications affect TiDB, especially when the Auto Increment causes a sudden jump in ID values?\",\"reasoning\":\"The second most important question that the user itentionally asked.\"},{\"source_entity\":\"TiDB\",\"target_entity\":\"Audit Logs\",\"relationship_desc\":\"TiDB is used for storing audit logs and receiving continuous traffic.\",\"reasoning\":\"The user case what the user wants to achieve\"}]}" }, { - "query": "I'm attempting to download a specific backup from the database hosted on TiDB Cloud. So far, I've tried accessing the backup through SQL queries, but haven't found a way to execute this operation.\n\nThe instructions provided suggested using SSH to transfer the backup, however, I don't have SSH access to the server where the backups are stored.\n\nI would like to request guidance on how I can proceed to download this backup without direct access to the server. 
Is there an alternative or different method that I can use to obtain the desired backup?\n\nThank you in advance for any assistance or guidance you can provide on this matter.", - "factors": "{\"relationships\":[{\"source_entity\":\"Backup data\",\"target_entity\":\"TiDB Cloud\",\"relationship_desc\":\"How to download a specific backup from TiDB Cloud?\",\"reasoning\":\"The main question the user is asking\"},{\"source_entity\":\"Backup SQL\",\"target_entity\":\"Backup data\",\"relationship_desc\":\"I can't find a way to execute Backup SQL queries to download the backup.\",\"reasoning\":\"The problem the user is facing\"}]}" + "query": "What are the feature changes in the latest version compared to v7.0 for TiDB?", + "factors": "{\"relationships\":[{\"source_entity\":\"TiDB\",\"target_entity\":\"Latest Version\",\"relationship_desc\":\"What is the latest version of TiDB?\",\"reasoning\":\"The prerequist question need to be figured out.\"},{\"source_entity\":\"New and Deprecated Features\",\"target_entity\":\"TiDB v7.0\",\"relationship_desc\":\"what are the new features added and any features that have been deprecated or removed since TiDB v7.0.\",\"reasoning\":\"The sub question to answer the main question.\"},{\"source_entity\":\"Feature Changes\",\"target_entity\":\"TiDB 7.0 to Latest Version\",\"relationship_desc\":\"The feature changes from TiDB 7.0 to the latest version.\",\"reasoning\":\"The main question the user is asking.\"}]}" }, { "query": "Upgrade TiDB Serverless to 7.4 or latest for enhanced MySQL 8.0 compatibility", "factors": "{\"relationships\":[{\"source_entity\":\"TiDB 7.4 or Latest version\",\"target_entity\":\"MySQL 8.0 Compatibility\",\"relationship_desc\":\"TiDB 7.4 or the latest version enhances compatibility with MySQL 8.0\",\"reasoning\":\"The reasoning why user wants to upgrade TiDB Serverless to 7.4 or latest for enhanced MySQL 8.0 compatibility\"},{\"source_entity\":\"TiDB 
Serverless\",\"target_entity\":\"Upgrade\",\"relationship_desc\":\"How to upgrade TiDB Serverless?\",\"reasoning\":\"The basic question what the user itentionally asked.\"}]}" }, { - "query": "I am designing a table based on TiDB's TTL feature, but when I try to create the table using a cluster created with Serverless, I get a `'TTL' is not supported on TiDB Serverless` error.\n\nI plan to use Dedicated on my production environment and Serverless on my development environment, so it would be helpful if the TTL feature could be used in a Serverless environment.\n\nI've read the documentation that says Serverless will support TTL features in the future, but is there a specific timeline for this?\n\nAlso, is it possible to prevent TTL syntax from causing errors in Serverless?", - "factors": "{\"relationships\":[{\"source_entity\":\"TTL Feature\",\"target_entity\":\"TiDB Serverless\",\"relationship_desc\":\"The TTL feature is not currently supported in TiDB Serverless.\",\"reasoning\":\"The problem the user is facing.\"},{\"source_entity\":\"TTL Feature\",\"target_entity\":\"Roadmap Support Timeline\",\"relationship_desc\":\"What's the roadmap timeline on when the TTL feature will be supported in TiDB Serverless.\",\"reasoning\":\"The main question the user is asking.\"},{\"source_entity\":\"TTL SQL Syntax\",\"target_entity\":\"Workaround for SQL Syntax Error\",\"relationship_desc\":\"Workaround to prevent TTL feature SQL syntax from causing errors in TiDB Serverless.\",\"reasoning\":\"The secondary question the user is asking.\"}]}" + "query": "I am current using tidb serverless, but as my product grows, I really need a dalicated cluster. 
Is there a solution helps finish the migration?", + "factors": "{\"relationships\":[{\"source_entity\":\"TiDB Serverless\",\"target_entity\":\"Dedicated Cluster\",\"relationship_desc\":\"How to migrate from TiDB serverless to TiDB dedicated cluster?\",\"reasoning\":\"The main concern of the user.\"}]}" }, { - "query": "tidb lighting to sync to serverless cluster,but the load command and the tidb-lighting tools dont have the tls config like --ssl-ca or --ca. so i can not sync to the full back data to the serverless", - "factors": "{\"relationships\":[{\"source_entity\":\"TiDB Lighting\",\"target_entity\":\"Serverless Cluster\",\"relationship_desc\":\"Sync data to a serverless cluster using TiDB Lighting.\",\"reasoning\":\"The user case what the user wants to achieve\"},{\"source_entity\":\"Load Command and TiDB Lighting Tools\",\"target_entity\":\"TLS Configuration\",\"relationship_desc\":\"How to configure TLS for TiDB Lightning?\",\"reasoning\":\"The basic question what the user itentionally asked.\"},{\"source_entity\":\"Lack of TLS Configuration\",\"target_entity\":\"Sync Issue\",\"relationship_desc\":\"The sync issue is caused by the lack of TLS configuration options for TiDB Lightning.\",\"reasoning\":\"The problem that the user is facing.\"}]}" + "query": "Chat2query is returning an error message saying \"Query timeout expired\". 
Additionally, I am unable to locate this SQL query in the slow query log.", + "factors": "{\"relationships\":[{\"source_entity\":\"Chat2query\",\"target_entity\":\"Error Message\",\"relationship_desc\":\"Chat2query is returning an error message saying 'Query timeout expired'.\",\"reasoning\":\"The main problem the user is facing.\"},{\"source_entity\":\"SQL Query\",\"target_entity\":\"Slow Query Log\",\"relationship_desc\":\"The reason why not to locate the SQL query in the slow query log.\",\"reasoning\":\"The secondary problem the user is facing.\"}]}" } ], - "signature_instructions": "You are a knowledge base graph expert and are very good at building knowledge graphs. Now you are assigned to extract the most critical concepts and their relationships from the query. Step-by-Step Analysis:\n\n1. Extract Meaningful user intents and questions:\n - Identify the question what the user itentionally asked, focusing on the the critial information about user's main concerns\/questions\/problems\/use cases, etc.\n - Make this question simple and clear and ensure that it is directly related to the user's main concerns. Simple and clear question can improve the search accuracy.\n2. Establish Relationships to describe the user's intents:\n - Define relationships that accurately represent the user's query intent and information needs.\n - Format each relationship as: (Source Entity) - [Relationship] -> (Target Entity), where the relationship describes what the user wants to know about the connection between these entities.\n\n## Instructions:\n\n- Limit to no more than 3 pairs. 
These pairs must accurately reflect the user's real (sub)questions.\n- Ensure that the extracted pairs are of high quality and do not introduce unnecessary search elements.\n- Ensure that the relationships and intents are grounded and factual, based on the information provided in the query.", + "signature_instructions": "You are a knowledge base graph expert and are very good at building knowledge graphs. Now you are assigned to extract the user's step-by-step intentions from the query.\n\n## Instructions:\n\n- Break down the user's query into a sequence of prerequisite questions (e.g., identifying specific versions) and step-by-step intentions.\n- Represent each prerequisite and intention as a relationship: (Source Entity) - [Relationship] -> (Target Entity).\n- Provide reasoning for each relationship, explaining the user's intention at that step.\n- Limit to no more than 5 relationships.\n- Ensure that the extracted relationships accurately reflect the user's real intentions.\n- Ensure that the relationships and intentions are grounded and factual, based on the information provided in the query.", "signature_prefix": "Factors:" } } \ No newline at end of file diff --git a/backend/dspy_compiled_program/decompose_query_samples.json b/backend/dspy_compiled_program/decompose_query_samples.json new file mode 100644 index 00000000..91569bc5 --- /dev/null +++ b/backend/dspy_compiled_program/decompose_query_samples.json @@ -0,0 +1,135 @@ +[ + { + "query":"Chat2query is returning an error message saying \"Query timeout expired\". Additionally, I am unable to locate this SQL query in the slow query log.", + "source_entity":"Chat2query", + "target_entity":"Error Message", + "relationship_desc":"Chat2query is returning an error message saying 'Query timeout expired'.", + "reasoning":"The main problem the user is facing." + }, + { + "query":"Chat2query is returning an error message saying \"Query timeout expired\". 
Additionally, I am unable to locate this SQL query in the slow query log.", + "source_entity":"SQL Query", + "target_entity":"Slow Query Log", + "relationship_desc":"The reason why not to locate the SQL query in the slow query log.", + "reasoning":"The secondary problem the user is facing." + }, + { + "query":"I am current using tidb serverless, but as my product grows, I really need a dalicated cluster. Is there a solution helps finish the migration?", + "source_entity":"TiDB Serverless", + "target_entity":"Dedicated Cluster", + "relationship_desc":"How to migrate from TiDB serverless to TiDB dedicated cluster?", + "reasoning":"The main concern of the user." + }, + { + "query":"I am designing a table based on TiDB's TTL feature, but when I try to create the table using a cluster created with Serverless, I get a `'TTL' is not supported on TiDB Serverless` error.\n\nI plan to use Dedicated on my production environment and Serverless on my development environment, so it would be helpful if the TTL feature could be used in a Serverless environment.\n\nI've read the documentation that says Serverless will support TTL features in the future, but is there a specific timeline for this?\n\nAlso, is it possible to prevent TTL syntax from causing errors in Serverless?", + "source_entity":"TTL Feature", + "target_entity":"TiDB Serverless", + "relationship_desc":"The TTL feature is not currently supported in TiDB Serverless.", + "reasoning":"The problem the user is facing." 
+ }, + { + "query":"I am designing a table based on TiDB's TTL feature, but when I try to create the table using a cluster created with Serverless, I get a `'TTL' is not supported on TiDB Serverless` error.\n\nI plan to use Dedicated on my production environment and Serverless on my development environment, so it would be helpful if the TTL feature could be used in a Serverless environment.\n\nI've read the documentation that says Serverless will support TTL features in the future, but is there a specific timeline for this?\n\nAlso, is it possible to prevent TTL syntax from causing errors in Serverless?", + "source_entity":"TTL Feature", + "target_entity":"Roadmap Support Timeline", + "relationship_desc":"What's the roadmap timeline on when the TTL feature will be supported in TiDB Serverless.", + "reasoning":"The main question the user is asking." + }, + { + "query":"I am designing a table based on TiDB's TTL feature, but when I try to create the table using a cluster created with Serverless, I get a `'TTL' is not supported on TiDB Serverless` error.\n\nI plan to use Dedicated on my production environment and Serverless on my development environment, so it would be helpful if the TTL feature could be used in a Serverless environment.\n\nI've read the documentation that says Serverless will support TTL features in the future, but is there a specific timeline for this?\n\nAlso, is it possible to prevent TTL syntax from causing errors in Serverless?", + "source_entity":"TTL SQL Syntax", + "target_entity":"Workaround for SQL Syntax Error", + "relationship_desc":"Workaround to prevent TTL feature SQL syntax from causing errors in TiDB Serverless.", + "reasoning":"The secondary question the user is asking." 
+ }, + { + "query":"Upgrade TiDB Serverless to 7.4 or latest for enhanced MySQL 8.0 compatibility", + "source_entity":"TiDB 7.4 or Latest version", + "target_entity":"MySQL 8.0 Compatibility", + "relationship_desc":"TiDB 7.4 or the latest version enhances compatibility with MySQL 8.0", + "reasoning":"The reasoning why user wants to upgrade TiDB Serverless to 7.4 or latest for enhanced MySQL 8.0 compatibility" + }, + { + "query":"Upgrade TiDB Serverless to 7.4 or latest for enhanced MySQL 8.0 compatibility", + "source_entity":"TiDB Serverless", + "target_entity":"Upgrade", + "relationship_desc":"How to upgrade TiDB Serverless?", + "reasoning":"The basic question what the user itentionally asked." + }, + { + "query":"We are new to TiDB and don't quite understand the potential impact on our application architecture. We are using TiDB for audit logs and continue to direct traffic to TiDB. We noticed a sudden jump ID from 1 to 30,001. Are there any impacts? Do we need to address this? If we have 100 connections from several applications, what will happen? In summary, what should we do for Auto Increment or do nothing?", + "source_entity":"Auto Increment", + "target_entity":"ID Jump", + "relationship_desc":"Why Auto Increment in TiDB causes a sudden increase in the ID values?", + "reasoning":"The main concerns that the user itentionally asked." + }, + { + "query":"We are new to TiDB and don't quite understand the potential impact on our application architecture. We are using TiDB for audit logs and continue to direct traffic to TiDB. We noticed a sudden jump ID from 1 to 30,001. Are there any impacts? Do we need to address this? If we have 100 connections from several applications, what will happen? 
In summary, what should we do for Auto Increment or do nothing?", + "source_entity":"Connections Impact", + "target_entity":"TiDB", + "relationship_desc":"How 100 connections from several applications affect TiDB, especially when the Auto Increment causes a sudden jump in ID values?", + "reasoning":"The second most important question that the user itentionally asked." + }, + { + "query":"We are new to TiDB and don't quite understand the potential impact on our application architecture. We are using TiDB for audit logs and continue to direct traffic to TiDB. We noticed a sudden jump ID from 1 to 30,001. Are there any impacts? Do we need to address this? If we have 100 connections from several applications, what will happen? In summary, what should we do for Auto Increment or do nothing?", + "source_entity":"TiDB", + "target_entity":"Audit Logs", + "relationship_desc":"TiDB is used for storing audit logs and receiving continuous traffic.", + "reasoning":"The user case what the user wants to achieve" + }, + { + "query":"tidb lighting to sync to serverless cluster,but the load command and the tidb-lighting tools dont have the tls config like --ssl-ca or --ca. so i can not sync to the full back data to the serverless", + "source_entity":"TiDB Lighting", + "target_entity":"Serverless Cluster", + "relationship_desc":"Sync data to a serverless cluster using TiDB Lighting.", + "reasoning":"The user case what the user wants to achieve" + }, + { + "query":"tidb lighting to sync to serverless cluster,but the load command and the tidb-lighting tools dont have the tls config like --ssl-ca or --ca. so i can not sync to the full back data to the serverless", + "source_entity":"Load Command and TiDB Lighting Tools", + "target_entity":"TLS Configuration", + "relationship_desc":"How to configure TLS for TiDB Lightning?", + "reasoning":"The basic question what the user itentionally asked." 
+ }, + { + "query":"tidb lighting to sync to serverless cluster,but the load command and the tidb-lighting tools dont have the tls config like --ssl-ca or --ca. so i can not sync to the full back data to the serverless", + "source_entity":"Lack of TLS Configuration", + "target_entity":"Sync Issue", + "relationship_desc":"The sync issue is caused by the lack of TLS configuration options for TiDB Lightning.", + "reasoning":"The problem that the user is facing." + }, + { + "query":"summary the performance improvement from version 6.5 to newest version for TiDB", + "source_entity":"TiDB", + "target_entity":"the newest version", + "relationship_desc":"what is the newest version of TiDB?", + "reasoning":"The prerequist question need to be figured out." + }, + { + "query":"summary the performance improvement from version 6.5 to newest version for TiDB", + "source_entity":"Performance Improvement", + "target_entity":"TiDB 6.5 to Newest Version", + "relationship_desc":"The performance improvement from TiDB 6.5 to the newest version.", + "reasoning":"The main question the user is asking." + }, + { + "query":"What are the feature changes in the latest version compared to v7.0 for TiDB?", + "source_entity":"TiDB", + "target_entity":"Latest Version", + "relationship_desc":"What is the latest version of TiDB?", + "reasoning":"The prerequist question need to be figured out." + }, + { + "query":"What are the feature changes in the latest version compared to v7.0 for TiDB?", + "source_entity":"New and Deprecated Features", + "target_entity":"TiDB v7.0", + "relationship_desc":"what are the new features added and any features that have been deprecated or removed since TiDB v7.0.", + "reasoning":"The sub question to answer the main question." 
+ }, + { + "query":"What are the feature changes in the latest version compared to v7.0 for TiDB?", + "source_entity":"Feature Changes", + "target_entity":"TiDB 7.0 to Latest Version", + "relationship_desc":"The feature changes from TiDB 7.0 to the latest version.", + "reasoning":"The main question the user is asking." + } + ] \ No newline at end of file diff --git a/backend/dspy_compiled_program/sql_sample.csv b/backend/dspy_compiled_program/sql_sample.csv deleted file mode 100644 index d8a4bd23..00000000 --- a/backend/dspy_compiled_program/sql_sample.csv +++ /dev/null @@ -1,21 +0,0 @@ -QA_content,no_reasonable_example,steps -"User question: Does TiDB support FOREIGN KEY? -Answer: Yes, TiDB supports FOREIGN KEY constraints starting from version 6.6.0 as an experimental feature. However, there are key considerations: - 1. Experimental Status: The feature is experimental and may change or be removed in future releases. - 2. Performance Impact: Foreign keys can lead to performance issues, especially with large datasets. - 3. Implementation: FOREIGN KEY constraints can be created, modified, and viewed using standard SQL commands (e.g., CREATE TABLE, ALTER TABLE). - 4. Upgrade Considerations: If upgrading from a version prior to 6.6.0, foreign keys will not be automatically enforced and must be recreated. - 5. Caution Advised: Given the potential performance impacts and experimental status, use foreign keys cautiously in TiDB. -In conclusion, while supported, foreign keys in TiDB are still experimental and may have significant performance implications. -",False,"[{""explanation"": ""Create a 'users' table which will be referenced by the 'orders' table via a foreign key."", ""sql"": ""CREATE TABLE IF NOT EXISTS sample_db.users (user_id INT PRIMARY KEY, name VARCHAR(255));""}, {""explanation"": ""Create an 'orders' table with a FOREIGN KEY referencing the 'user_id' column in the 'users' table. 
Ensure the foreign key constraint is specified in the table creation statement."", ""sql"": ""CREATE TABLE IF NOT EXISTS sample_db.orders (order_id INT PRIMARY KEY, order_date DATE NOT NULL, user_id INT, FOREIGN KEY (user_id) REFERENCES sample_db.users(user_id));""}, {""explanation"": ""Insert sample data into the 'users' table."", ""sql"": ""INSERT INTO sample_db.users (user_id, name) VALUES (1, 'Alice'), (2, 'Bob');""}, {""explanation"": ""Insert sample data into the 'orders' table, ensuring the 'user_id' values correspond to existing 'user_id' values in the 'users' table."", ""sql"": ""INSERT INTO sample_db.orders (order_id, order_date, user_id) VALUES (101, '2022-01-01', 1), (102, '2022-01-02', 2);""}, {""explanation"": ""Attempt to insert a record into the 'orders' table with a 'user_id' value that does not exist in the 'users' table. This should trigger a foreign key constraint violation."", ""sql"": ""INSERT INTO sample_db.orders (order_id, order_date, user_id) VALUES (103, '2022-01-03', 3);""}, {""explanation"": ""View the error message that indicates the foreign key constraint violation."", ""sql"": ""SHOW WARNINGS;""}, {""explanation"": ""Drop the 'orders' table to remove the foreign key constraint."", ""sql"": ""DROP TABLE IF EXISTS sample_db.orders;""}, {""explanation"": ""Drop the 'users' table to clean up the example."", ""sql"": ""DROP TABLE IF EXISTS sample_db.users;""}]" -"User question: How to implement rolling upgrade in TiDB? -Answer: 1. Preparation: - - Verify no ongoing DDL operations and ensure the cluster meets the minimum topology requirements. - - Stop the Drainer service if using TiDB Binlog. -2. Use TiUP for Upgrade: - - Run tiup cluster upgrade , following the upgrade order: PD, TiProxy (if applicable), TiFlash, TiKV, TiDB, and TiCDC. -3. Upgrade Process: - - TiUP upgrades nodes one by one, handling leader transfers automatically to minimize impact. - - If any issues are detected, the process halts for manual intervention. -... 
-",True,[] diff --git a/backend/dspy_compiled_program/sql_sample_extraction.csv b/backend/dspy_compiled_program/sql_sample_extraction.csv deleted file mode 100644 index fe1eea34..00000000 --- a/backend/dspy_compiled_program/sql_sample_extraction.csv +++ /dev/null @@ -1,85 +0,0 @@ -QA_content,no_reasonable_example,steps -"User question: TiDB 支持 vector search 吗? -Answer: 是的,TiDB 支持向量搜索(Vector Search)功能。让我详细为您解释 TiDB 的向量搜索功能: - -1. 功能概述: -TiDB 的向量搜索功能允许用户在 TiDB 数据库中存储和查询向量数据,实现高效的相似性搜索[^1]。这项功能目前处于 beta 阶段,仅适用于 TiDB Serverless 集群[^2]。 - -2. 工作原理: -TiDB 向量搜索使用近似最近邻(ANN)搜索算法来加速 K-最近邻(KNN)查询[^3]。它通过创建和利用向量搜索索引来实现毫秒级的查询速度。目前,TiDB 支持 HNSW(Hierarchical Navigable Small World)算法作为向量搜索索引[^3]。 - -3. 支持的向量类型: -TiDB 引入了新的向量数据类型来支持向量搜索。它可以存储最多 16,000 维的向量,这足以满足大多数嵌入模型的需求[^4]。 - -4. 使用方法: -要使用 TiDB 进行向量搜索,您需要: - a. 创建一个包含向量数据类型列的表 - b. 插入向量数据 - c. 创建向量搜索索引 - d. 执行向量相似性查询 - -5. 集成与应用: -TiDB 向量搜索可以与多种 AI 框架和嵌入模型集成,如 Langchain 和 LlamaIndex[^5]。这使得它可以应用于各种场景,如语义搜索、推荐系统等。 - -6. 性能优化: -TiDB 提供了多种方法来优化向量搜索性能,包括索引调优和查询优化[^6]。 - -7. 
SQL 支持: -TiDB 扩展了 MySQL 语法以支持向量搜索,引入了新的向量函数和操作符[^7]。这使得用户可以使用熟悉的 SQL 语句来执行向量搜索操作。 - -总的来说,TiDB 的向量搜索功能为用户提供了一个强大的工具,可以在关系数据库中高效地进行向量相似性搜索,适用于各种 AI 和机器学习应用场景。 - -Example SQL: - -以下是使用 TiDB 进行向量搜索的示例 SQL: - -```sql --- 创建包含向量数据的表 -CREATE TABLE vector_data ( - id INT PRIMARY KEY, - description VARCHAR(255), - embedding VECTOR(128) -- 假设我们使用128维的向量 -); - --- 插入向量数据 -INSERT INTO vector_data (id, description, embedding) VALUES -(1, 'Sample 1', '[1.0, 2.0, 3.0, ..., 128.0]'), -(2, 'Sample 2', '[2.0, 3.0, 4.0, ..., 129.0]'); - --- 创建向量搜索索引 -CREATE INDEX idx_embedding ON vector_data USING HNSW(embedding); - --- 执行向量相似性查询 -SELECT id, description, VECTOR_DISTANCE(embedding, '[1.1, 2.1, 3.1, ..., 128.1]') AS distance -FROM vector_data -ORDER BY distance -LIMIT 5; -``` - -请注意,这些 SQL 示例是概念性的,实际使用时需要根据您的具体数据和需求进行调整。 - -[^1]: [Vector Search Overview | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-overview) -[^2]: [Get Started with Vector Search via SQL | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-get-started-using-sql) -[^3]: [Vector Search Index | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-index) -[^4]: [Vector Search Integration Overview | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-integration-overview) -[^5]: [AI frameworks | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-integration-overview#ai-frameworks) -[^6]: [Improve Vector Search Performance | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-improve-performance) -[^7]: [Vector Functions and Operators | PingCAP Docs](https://docs.pingcap.com/tidbcloud/vector-search-functions-and-operators)",False,"[{""explanation"": ""Create a table with a column of vector data type. However to simplify the demo, Let's set the vector length to 3. 
"", ""sql"": ""CREATE TABLE vector_data (id INT PRIMARY KEY, description VARCHAR(255), embedding VECTOR(3));""}, {""explanation"": ""Insert sample data with vector values."", ""sql"": ""INSERT INTO vector_data (id, description, embedding) VALUES (1, 'Sample 1', '[1.0, 2.0, 3.0]'), (2, 'Sample 2', '[2.0, 3.0, 4.0]');""}, {""explanation"": ""Create a vector search index using the HNSW algorithm. However, since TiDB does not yet support creating indexes using the CREATE INDEX statement, we will skip this step."", ""sql"": ""-- CREATE INDEX idx_embedding ON vector_data USING HNSW(embedding);""}, {""explanation"": ""Execute a vector similarity query to find the closest vectors. But TiDB don't support `VECTOR_DISTANCE` function, let's use `VEC_COSINE_DISTANCE`."", ""sql"": ""SELECT id, description, VEC_COSINE_DISTANCE(embedding, '[1.1, 2.1, 3.1]') AS distance FROM vector_data ORDER BY distance LIMIT 5;""}, {""explanation"": ""Clear the table after the demo."", ""sql"": ""DROP TABLE vector_data;""}]" -"What is TiDB?? -Answer: TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads[^1]. It is designed to provide a scalable, highly available, and MySQL-compatible database solution for large-scale data management. Let's explore TiDB's key features and functionalities: - -1. Distributed Architecture: -TiDB has a distributed architecture that separates computing from storage, allowing for flexible and elastic scalability[^2]. This design enables users to scale out or scale in both computing and storage resources independently, adapting to changing workloads without disrupting operations. - -.... - -8. Comprehensive Toolset: -TiDB provides a rich set of tools for data migration, replication, and backup, facilitating easy integration with existing database systems and ensuring data safety[^3]. 
- -By addressing the challenges of scalability and high availability in a distributed environment while maintaining MySQL compatibility, TiDB positions itself as a powerful solution for businesses dealing with large-scale data and requiring both transactional and analytical processing capabilities. - -[^1]: [TiDB Introduction | PingCAP Docs](https://docs.pingcap.com/tidb/v6.1/overview) -[^2]: [TiDB Architecture | PingCAP Docs](https://docs.pingcap.com/tidb/v7.1/tidb-architecture) -[^3]: [TiDB Architecture | PingCAP Docs](https://docs.pingcap.com/tidb/stable/tidb-architecture) -[^4]: [TiDB Key Features | PingCAP Docs](https://docs.pingcap.com/tidb/v5.4/overview) -",True,[] diff --git a/backend/dspy_compiled_program/sql_sample_extraction_program b/backend/dspy_compiled_program/sql_sample_extraction_program deleted file mode 100644 index 468c2087..00000000 --- a/backend/dspy_compiled_program/sql_sample_extraction_program +++ /dev/null @@ -1,19 +0,0 @@ -{ - "prog.predictor": { - "lm": null, - "traces": [], - "train": [], - "demos": [ - { - "QA_content": "What is TiDB??\nAnswer: TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads[^1]. It is designed to provide a scalable, highly available, and MySQL-compatible database solution for large-scale data management. Let's explore TiDB's key features and functionalities:\n\n1. Distributed Architecture:\nTiDB has a distributed architecture that separates computing from storage, allowing for flexible and elastic scalability[^2]. This design enables users to scale out or scale in both computing and storage resources independently, adapting to changing workloads without disrupting operations.\n\n....\n\n8. 
Comprehensive Toolset:\nTiDB provides a rich set of tools for data migration, replication, and backup, facilitating easy integration with existing database systems and ensuring data safety[^3].\n\nBy addressing the challenges of scalability and high availability in a distributed environment while maintaining MySQL compatibility, TiDB positions itself as a powerful solution for businesses dealing with large-scale data and requiring both transactional and analytical processing capabilities.\n\n[^1]: [TiDB Introduction | PingCAP Docs](https:\/\/docs.pingcap.com\/tidb\/v6.1\/overview)\n[^2]: [TiDB Architecture | PingCAP Docs](https:\/\/docs.pingcap.com\/tidb\/v7.1\/tidb-architecture)\n[^3]: [TiDB Architecture | PingCAP Docs](https:\/\/docs.pingcap.com\/tidb\/stable\/tidb-architecture)\n[^4]: [TiDB Key Features | PingCAP Docs](https:\/\/docs.pingcap.com\/tidb\/v5.4\/overview)\n", - "sample": "{\"no_reasonable_example\":true,\"steps\":[]}" - }, - { - "QA_content": "User question: TiDB \u652f\u6301 vector search \u5417\uff1f\nAnswer: \u662f\u7684\uff0cTiDB \u652f\u6301\u5411\u91cf\u641c\u7d22\uff08Vector Search\uff09\u529f\u80fd\u3002\u8ba9\u6211\u8be6\u7ec6\u4e3a\u60a8\u89e3\u91ca TiDB \u7684\u5411\u91cf\u641c\u7d22\u529f\u80fd\uff1a\n\n1. \u529f\u80fd\u6982\u8ff0\uff1a\nTiDB \u7684\u5411\u91cf\u641c\u7d22\u529f\u80fd\u5141\u8bb8\u7528\u6237\u5728 TiDB \u6570\u636e\u5e93\u4e2d\u5b58\u50a8\u548c\u67e5\u8be2\u5411\u91cf\u6570\u636e\uff0c\u5b9e\u73b0\u9ad8\u6548\u7684\u76f8\u4f3c\u6027\u641c\u7d22[^1]\u3002\u8fd9\u9879\u529f\u80fd\u76ee\u524d\u5904\u4e8e beta \u9636\u6bb5\uff0c\u4ec5\u9002\u7528\u4e8e TiDB Serverless \u96c6\u7fa4[^2]\u3002\n\n2. 
\u5de5\u4f5c\u539f\u7406\uff1a\nTiDB \u5411\u91cf\u641c\u7d22\u4f7f\u7528\u8fd1\u4f3c\u6700\u8fd1\u90bb\uff08ANN\uff09\u641c\u7d22\u7b97\u6cd5\u6765\u52a0\u901f K-\u6700\u8fd1\u90bb\uff08KNN\uff09\u67e5\u8be2[^3]\u3002\u5b83\u901a\u8fc7\u521b\u5efa\u548c\u5229\u7528\u5411\u91cf\u641c\u7d22\u7d22\u5f15\u6765\u5b9e\u73b0\u6beb\u79d2\u7ea7\u7684\u67e5\u8be2\u901f\u5ea6\u3002\u76ee\u524d\uff0cTiDB \u652f\u6301 HNSW\uff08Hierarchical Navigable Small World\uff09\u7b97\u6cd5\u4f5c\u4e3a\u5411\u91cf\u641c\u7d22\u7d22\u5f15[^3]\u3002\n\n3. \u652f\u6301\u7684\u5411\u91cf\u7c7b\u578b\uff1a\nTiDB \u5f15\u5165\u4e86\u65b0\u7684\u5411\u91cf\u6570\u636e\u7c7b\u578b\u6765\u652f\u6301\u5411\u91cf\u641c\u7d22\u3002\u5b83\u53ef\u4ee5\u5b58\u50a8\u6700\u591a 16,000 \u7ef4\u7684\u5411\u91cf\uff0c\u8fd9\u8db3\u4ee5\u6ee1\u8db3\u5927\u591a\u6570\u5d4c\u5165\u6a21\u578b\u7684\u9700\u6c42[^4]\u3002\n\n4. \u4f7f\u7528\u65b9\u6cd5\uff1a\n\u8981\u4f7f\u7528 TiDB \u8fdb\u884c\u5411\u91cf\u641c\u7d22\uff0c\u60a8\u9700\u8981\uff1a\n a. \u521b\u5efa\u4e00\u4e2a\u5305\u542b\u5411\u91cf\u6570\u636e\u7c7b\u578b\u5217\u7684\u8868\n b. \u63d2\u5165\u5411\u91cf\u6570\u636e\n c. \u521b\u5efa\u5411\u91cf\u641c\u7d22\u7d22\u5f15\n d. \u6267\u884c\u5411\u91cf\u76f8\u4f3c\u6027\u67e5\u8be2\n\n5. \u96c6\u6210\u4e0e\u5e94\u7528\uff1a\nTiDB \u5411\u91cf\u641c\u7d22\u53ef\u4ee5\u4e0e\u591a\u79cd AI \u6846\u67b6\u548c\u5d4c\u5165\u6a21\u578b\u96c6\u6210\uff0c\u5982 Langchain \u548c LlamaIndex[^5]\u3002\u8fd9\u4f7f\u5f97\u5b83\u53ef\u4ee5\u5e94\u7528\u4e8e\u5404\u79cd\u573a\u666f\uff0c\u5982\u8bed\u4e49\u641c\u7d22\u3001\u63a8\u8350\u7cfb\u7edf\u7b49\u3002\n\n6. \u6027\u80fd\u4f18\u5316\uff1a\nTiDB \u63d0\u4f9b\u4e86\u591a\u79cd\u65b9\u6cd5\u6765\u4f18\u5316\u5411\u91cf\u641c\u7d22\u6027\u80fd\uff0c\u5305\u62ec\u7d22\u5f15\u8c03\u4f18\u548c\u67e5\u8be2\u4f18\u5316[^6]\u3002\n\n7. 
SQL \u652f\u6301\uff1a\nTiDB \u6269\u5c55\u4e86 MySQL \u8bed\u6cd5\u4ee5\u652f\u6301\u5411\u91cf\u641c\u7d22\uff0c\u5f15\u5165\u4e86\u65b0\u7684\u5411\u91cf\u51fd\u6570\u548c\u64cd\u4f5c\u7b26[^7]\u3002\u8fd9\u4f7f\u5f97\u7528\u6237\u53ef\u4ee5\u4f7f\u7528\u719f\u6089\u7684 SQL \u8bed\u53e5\u6765\u6267\u884c\u5411\u91cf\u641c\u7d22\u64cd\u4f5c\u3002\n\n\u603b\u7684\u6765\u8bf4\uff0cTiDB \u7684\u5411\u91cf\u641c\u7d22\u529f\u80fd\u4e3a\u7528\u6237\u63d0\u4f9b\u4e86\u4e00\u4e2a\u5f3a\u5927\u7684\u5de5\u5177\uff0c\u53ef\u4ee5\u5728\u5173\u7cfb\u6570\u636e\u5e93\u4e2d\u9ad8\u6548\u5730\u8fdb\u884c\u5411\u91cf\u76f8\u4f3c\u6027\u641c\u7d22\uff0c\u9002\u7528\u4e8e\u5404\u79cd AI \u548c\u673a\u5668\u5b66\u4e60\u5e94\u7528\u573a\u666f\u3002\n\nExample SQL:\n\n\u4ee5\u4e0b\u662f\u4f7f\u7528 TiDB \u8fdb\u884c\u5411\u91cf\u641c\u7d22\u7684\u793a\u4f8b SQL\uff1a\n\n```sql\n-- \u521b\u5efa\u5305\u542b\u5411\u91cf\u6570\u636e\u7684\u8868\nCREATE TABLE vector_data (\n id INT PRIMARY KEY,\n description VARCHAR(255),\n embedding VECTOR(128) -- \u5047\u8bbe\u6211\u4eec\u4f7f\u7528128\u7ef4\u7684\u5411\u91cf\n);\n\n-- \u63d2\u5165\u5411\u91cf\u6570\u636e\nINSERT INTO vector_data (id, description, embedding) VALUES \n(1, 'Sample 1', '[1.0, 2.0, 3.0, ..., 128.0]'),\n(2, 'Sample 2', '[2.0, 3.0, 4.0, ..., 129.0]');\n\n-- \u521b\u5efa\u5411\u91cf\u641c\u7d22\u7d22\u5f15\nCREATE INDEX idx_embedding ON vector_data USING HNSW(embedding);\n\n-- \u6267\u884c\u5411\u91cf\u76f8\u4f3c\u6027\u67e5\u8be2\nSELECT id, description, VECTOR_DISTANCE(embedding, '[1.1, 2.1, 3.1, ..., 128.1]') AS distance\nFROM vector_data\nORDER BY distance\nLIMIT 5;\n```\n\n\u8bf7\u6ce8\u610f\uff0c\u8fd9\u4e9b SQL \u793a\u4f8b\u662f\u6982\u5ff5\u6027\u7684\uff0c\u5b9e\u9645\u4f7f\u7528\u65f6\u9700\u8981\u6839\u636e\u60a8\u7684\u5177\u4f53\u6570\u636e\u548c\u9700\u6c42\u8fdb\u884c\u8c03\u6574\u3002\n\n[^1]: [Vector Search Overview | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-overview)\n[^2]: [Get 
Started with Vector Search via SQL | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-get-started-using-sql)\n[^3]: [Vector Search Index | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-index)\n[^4]: [Vector Search Integration Overview | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-integration-overview)\n[^5]: [AI frameworks | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-integration-overview#ai-frameworks)\n[^6]: [Improve Vector Search Performance | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-improve-performance)\n[^7]: [Vector Functions and Operators | PingCAP Docs](https:\/\/docs.pingcap.com\/tidbcloud\/vector-search-functions-and-operators)", - "sample": "{\"no_reasonable_example\":false,\"steps\":[{\"explanation\":\"Create a table with a column of vector data type. However to simplify the demo, Let's set the vector length to 3. \",\"sql\":\"CREATE TABLE vector_data (id INT PRIMARY KEY, description VARCHAR(255), embedding VECTOR(3));\"},{\"explanation\":\"Insert sample data with vector values.\",\"sql\":\"INSERT INTO vector_data (id, description, embedding) VALUES (1, 'Sample 1', '[1.0, 2.0, 3.0]'), (2, 'Sample 2', '[2.0, 3.0, 4.0]');\"},{\"explanation\":\"Create a vector search index using the HNSW algorithm. However, since TiDB does not yet support creating indexes using the CREATE INDEX statement, we will skip this step.\",\"sql\":\"-- CREATE INDEX idx_embedding ON vector_data USING HNSW(embedding);\"},{\"explanation\":\"Execute a vector similarity query to find the closest vectors. 
But TiDB don't support `VECTOR_DISTANCE` function, let's use `VEC_COSINE_DISTANCE`.\",\"sql\":\"SELECT id, description, VEC_COSINE_DISTANCE(embedding, '[1.1, 2.1, 3.1]') AS distance FROM vector_data ORDER BY distance LIMIT 5;\"},{\"explanation\":\"Clear the table after the demo.\",\"sql\":\"DROP TABLE vector_data;\"}]}" - } - ], - "signature_instructions": "Your goal is to extract the \"SQL Example\" from the provided answer. This section contains SQL queries intended to address the user's issue. \n\nFollow these steps to ensure the SQL examples are accurate and executable:\n\n 1. Extract the SQL Example: Identify and extract the SQL Example Section from the given answer. This section should contain all the SQL queries that were generated.\n 2. Review the SQL Example:\n - Carefully review each SQL query to ensure it is accurate and can be executed directly.\n - Verify that the SQL syntax is fully supported and executable in TiDB Serverless.\n - Check for missing details, or syntax that might be unsupported in TiDB.\n 3. Manual Adjustments If Necessary: If necessary, manually adjust the SQL queries to make them fully executable in TiDB Serverless. This may include adding missing content, replacing placeholder table names, or modifying syntax to match TiDB's supported features\n 4. 
Generate the Final SQL List: Provide the final, executable SQL Example Section as a list of SQL queries that can be directly used in TiDB Serverless.\n\nBy following these instructions, you will help the user not only resolve their current query but also deepen their understanding of the topic through practical application.", - "signature_prefix": "Sample:" - } -} \ No newline at end of file diff --git a/backend/dspy_compiled_program/sql_sample_gen_program b/backend/dspy_compiled_program/sql_sample_gen_program deleted file mode 100644 index fd059b47..00000000 --- a/backend/dspy_compiled_program/sql_sample_gen_program +++ /dev/null @@ -1,20 +0,0 @@ -{ - "prog.predictor": { - "lm": null, - "traces": [], - "train": [], - "demos": [ - { - "augmented": true, - "QA_content": "User question: How to implement rolling upgrade in TiDB?\nAnswer: 1. Preparation:\n - Verify no ongoing DDL operations and ensure the cluster meets the minimum topology requirements.\n - Stop the Drainer service if using TiDB Binlog.\n2. Use TiUP for Upgrade:\n - Run tiup cluster upgrade , following the upgrade order: PD, TiProxy (if applicable), TiFlash, TiKV, TiDB, and TiCDC.\n3. Upgrade Process:\n - TiUP upgrades nodes one by one, handling leader transfers automatically to minimize impact.\n - If any issues are detected, the process halts for manual intervention.\n...\n", - "sample": "```json\n{\n \"no_reasonable_example\": true,\n \"steps\": []\n}\n```" - }, - { - "QA_content": "User question: Does TiDB support FOREIGN KEY?\nAnswer: Yes, TiDB supports FOREIGN KEY constraints starting from version 6.6.0 as an experimental feature. However, there are key considerations:\n 1. Experimental Status: The feature is experimental and may change or be removed in future releases.\n 2. Performance Impact: Foreign keys can lead to performance issues, especially with large datasets.\n 3. 
Implementation: FOREIGN KEY constraints can be created, modified, and viewed using standard SQL commands (e.g., CREATE TABLE, ALTER TABLE).\n 4. Upgrade Considerations: If upgrading from a version prior to 6.6.0, foreign keys will not be automatically enforced and must be recreated.\n 5. Caution Advised: Given the potential performance impacts and experimental status, use foreign keys cautiously in TiDB.\nIn conclusion, while supported, foreign keys in TiDB are still experimental and may have significant performance implications.\n", - "sample": "{\"no_reasonable_example\":false,\"steps\":[{\"explanation\":\"Create a 'users' table which will be referenced by the 'orders' table via a foreign key.\",\"sql\":\"CREATE TABLE IF NOT EXISTS sample_db.users (user_id INT PRIMARY KEY, name VARCHAR(255));\"},{\"explanation\":\"Create an 'orders' table with a FOREIGN KEY referencing the 'user_id' column in the 'users' table. Ensure the foreign key constraint is specified in the table creation statement.\",\"sql\":\"CREATE TABLE IF NOT EXISTS sample_db.orders (order_id INT PRIMARY KEY, order_date DATE NOT NULL, user_id INT, FOREIGN KEY (user_id) REFERENCES sample_db.users(user_id));\"},{\"explanation\":\"Insert sample data into the 'users' table.\",\"sql\":\"INSERT INTO sample_db.users (user_id, name) VALUES (1, 'Alice'), (2, 'Bob');\"},{\"explanation\":\"Insert sample data into the 'orders' table, ensuring the 'user_id' values correspond to existing 'user_id' values in the 'users' table.\",\"sql\":\"INSERT INTO sample_db.orders (order_id, order_date, user_id) VALUES (101, '2022-01-01', 1), (102, '2022-01-02', 2);\"},{\"explanation\":\"Attempt to insert a record into the 'orders' table with a 'user_id' value that does not exist in the 'users' table. 
This should trigger a foreign key constraint violation.\",\"sql\":\"INSERT INTO sample_db.orders (order_id, order_date, user_id) VALUES (103, '2022-01-03', 3);\"},{\"explanation\":\"View the error message that indicates the foreign key constraint violation.\",\"sql\":\"SHOW WARNINGS;\"},{\"explanation\":\"Drop the 'orders' table to remove the foreign key constraint.\",\"sql\":\"DROP TABLE IF EXISTS sample_db.orders;\"},{\"explanation\":\"Drop the 'users' table to clean up the example.\",\"sql\":\"DROP TABLE IF EXISTS sample_db.users;\"}]}" - } - ], - "signature_instructions": "You are a technical assistant at TiDB, dedicated to providing users with precise and actionable guidance. \nYour mission is to ensure that users receive not only accurate answers but also valuable learning opportunities through practical, step-by-step examples. \nTo achieve this, adhere to the following instructions:\n\n1. Understand the User's question and answer: Carefully review the user's question and answer provided. Ensure you fully grasp the technical context, the core issue, and any relevant background information.\n2. Determine the Feasibility of Providing a Complete Example:\n - Evaluate whether a step-by-step example can be provided to help the user better understand the topic at hand. Consider the technical details involved, \n and ensure that any example you provide is fully executable without requiring additional adjustments.\n - Ensure the example is comprehensive, and is designed to be directly usable in TiDB Serverless.\n3. Generate and Present a Complete Example:\n - Create a clear, detailed SQLs guide that the user can follow step-by-step. This example should include all necessary SQL commands and should be self-contained without requiring additional adjustments.\n - **Each step should include a single SQL Query (only SQL are allowed)**. he example should be self-contained, requiring no additional adjustments or assumptions from the user. 
\n Avoid combining multiple SQL commands within a single step to maintain clarity and prevent confusion.\n\n\nBy following these instructions, you will help the user not only resolve their current query but also deepen their understanding of the topic through practical application.", - "signature_prefix": "Sample:" - } -} \ No newline at end of file diff --git a/backend/dspy_compiled_program/sql_samples_gen.ipynb b/backend/dspy_compiled_program/sql_samples_gen.ipynb deleted file mode 100644 index ae108216..00000000 --- a/backend/dspy_compiled_program/sql_samples_gen.ipynb +++ /dev/null @@ -1,354 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ianz/miniconda3/envs/ad/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import dspy\n", - "import os\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()\n", - "\n", - "turbo = dspy.OpenAI(model='gpt-4o', api_key=os.getenv(\"OPENAI_API_KEY\"), max_tokens=1000)\n", - "dspy.settings.configure(lm=turbo)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from pydantic import BaseModel, Field\n", - "from typing import List\n", - "\n", - "class Step(BaseModel):\n", - " explanation: str\n", - " sql: str\n", - "\n", - "class SQLSample(BaseModel):\n", - " no_reasonable_example: bool = Field(description=\"Whether it is not possible to provide a reasonable example\")\n", - " steps: List[Step] = Field(description=\"List of steps to execute the SQL query in TiDB Serverless\")\n", - "\n", - "class SampleGen(dspy.Signature):\n", - " \"\"\"You are a technical assistant at TiDB, dedicated to providing users with 
precise and actionable guidance. \n", - " Your mission is to ensure that users receive not only accurate answers but also valuable learning opportunities through practical, step-by-step examples. \n", - " To achieve this, adhere to the following instructions:\n", - "\n", - " 1. Understand the User's question and answer: Carefully review the user's question and answer provided. Ensure you fully grasp the technical context, the core issue, and any relevant background information.\n", - " 2. Determine the Feasibility of Providing a Complete Example:\n", - " - Evaluate whether a step-by-step example can be provided to help the user better understand the topic at hand. Consider the technical details involved, \n", - " and ensure that any example you provide is fully executable without requiring additional adjustments.\n", - " - Ensure the example is comprehensive, and is designed to be directly usable in TiDB Serverless.\n", - " 3. Generate and Present a Complete Example:\n", - " - Create a clear, detailed SQLs guide that the user can follow step-by-step. This example should include all necessary SQL commands and should be self-contained without requiring additional adjustments.\n", - " - **Each step should include a single SQL Query (only SQL are allowed)**. he example should be self-contained, requiring no additional adjustments or assumptions from the user. 
\n", - " Avoid combining multiple SQL commands within a single step to maintain clarity and prevent confusion.\n", - "\n", - " By following these instructions, you will help the user not only resolve their current query but also deepen their understanding of the topic through practical application.\n", - " \"\"\"\n", - "\n", - " QA_content: str = dspy.InputField(\n", - " desc=\"The user's query that requires a step-by-step example to be generated.\"\n", - " )\n", - " sample : SQLSample = dspy.OutputField(\n", - " desc=\"Step-by-step example to execute the SQL query in TiDB Serverless.\"\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from dspy.functional import TypedPredictor\n", - "\n", - "class SQLGenModule(dspy.Module):\n", - " def __init__(self, dspy_lm: dspy.LM):\n", - " super().__init__()\n", - " self.dspy_lm = dspy_lm\n", - " self.prog = TypedPredictor(SampleGen)\n", - "\n", - " def forward(self, QA_content: str):\n", - " with dspy.settings.context(lm=self.dspy_lm):\n", - " return self.prog(QA_content=QA_content)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
QA_contentno_reasonable_examplesteps
0User question: Does TiDB support FOREIGN KEY?\\...False[{\"explanation\": \"Create a 'users' table which...
1User question: How to implement rolling upgrad...True[]
\n", - "
" - ], - "text/plain": [ - " QA_content no_reasonable_example \\\n", - "0 User question: Does TiDB support FOREIGN KEY?\\... False \n", - "1 User question: How to implement rolling upgrad... True \n", - "\n", - " steps \n", - "0 [{\"explanation\": \"Create a 'users' table which... \n", - "1 [] " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "samples = pd.read_csv(\"sql_sample.csv\")\n", - "samples" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Example({'QA_content': 'User question: Does TiDB support FOREIGN KEY?\\nAnswer: Yes, TiDB supports FOREIGN KEY constraints starting from version 6.6.0 as an experimental feature. However, there are key considerations:\\n 1. Experimental Status: The feature is experimental and may change or be removed in future releases.\\n 2. Performance Impact: Foreign keys can lead to performance issues, especially with large datasets.\\n 3. Implementation: FOREIGN KEY constraints can be created, modified, and viewed using standard SQL commands (e.g., CREATE TABLE, ALTER TABLE).\\n 4. Upgrade Considerations: If upgrading from a version prior to 6.6.0, foreign keys will not be automatically enforced and must be recreated.\\n 5. Caution Advised: Given the potential performance impacts and experimental status, use foreign keys cautiously in TiDB.\\nIn conclusion, while supported, foreign keys in TiDB are still experimental and may have significant performance implications.\\n', 'sample': SQLSample(no_reasonable_example=False, steps=[Step(explanation=\"Create a 'users' table which will be referenced by the 'orders' table via a foreign key.\", sql='CREATE TABLE IF NOT EXISTS sample_db.users (user_id INT PRIMARY KEY, name VARCHAR(255));'), Step(explanation=\"Create an 'orders' table with a FOREIGN KEY referencing the 'user_id' column in the 'users' table. 
Ensure the foreign key constraint is specified in the table creation statement.\", sql='CREATE TABLE IF NOT EXISTS sample_db.orders (order_id INT PRIMARY KEY, order_date DATE NOT NULL, user_id INT, FOREIGN KEY (user_id) REFERENCES sample_db.users(user_id));'), Step(explanation=\"Insert sample data into the 'users' table.\", sql=\"INSERT INTO sample_db.users (user_id, name) VALUES (1, 'Alice'), (2, 'Bob');\"), Step(explanation=\"Insert sample data into the 'orders' table, ensuring the 'user_id' values correspond to existing 'user_id' values in the 'users' table.\", sql=\"INSERT INTO sample_db.orders (order_id, order_date, user_id) VALUES (101, '2022-01-01', 1), (102, '2022-01-02', 2);\"), Step(explanation=\"Attempt to insert a record into the 'orders' table with a 'user_id' value that does not exist in the 'users' table. This should trigger a foreign key constraint violation.\", sql=\"INSERT INTO sample_db.orders (order_id, order_date, user_id) VALUES (103, '2022-01-03', 3);\"), Step(explanation='View the error message that indicates the foreign key constraint violation.', sql='SHOW WARNINGS;'), Step(explanation=\"Drop the 'orders' table to remove the foreign key constraint.\", sql='DROP TABLE IF EXISTS sample_db.orders;'), Step(explanation=\"Drop the 'users' table to clean up the example.\", sql='DROP TABLE IF EXISTS sample_db.users;')])}) (input_keys=None),\n", - " Example({'QA_content': 'User question: How to implement rolling upgrade in TiDB?\\nAnswer: 1. Preparation:\\n - Verify no ongoing DDL operations and ensure the cluster meets the minimum topology requirements.\\n - Stop the Drainer service if using TiDB Binlog.\\n2. Use TiUP for Upgrade:\\n - Run tiup cluster upgrade , following the upgrade order: PD, TiProxy (if applicable), TiFlash, TiKV, TiDB, and TiCDC.\\n3. 
Upgrade Process:\\n - TiUP upgrades nodes one by one, handling leader transfers automatically to minimize impact.\\n - If any issues are detected, the process halts for manual intervention.\\n...\\n', 'sample': SQLSample(no_reasonable_example=True, steps=[])}) (input_keys=None)]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "dataset = []\n", - "for _, row in samples.iterrows():\n", - " qa = row['QA_content']\n", - " no_reasonable_example = row['no_reasonable_example']\n", - " steps = [Step(explanation=step[\"explanation\"], sql=step[\"sql\"]) for step in json.loads(row['steps'])]\n", - "\n", - " sample = SQLSample(no_reasonable_example=no_reasonable_example, steps=steps)\n", - " dataset.append(\n", - " dspy.Example(\n", - " QA_content=qa,\n", - " sample=sample\n", - " )\n", - " )\n", - "\n", - "dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trainset = [x.with_inputs('QA_content') for x in dataset]\n", - "len(trainset)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 2/2 [00:00<00:00, 99.83it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "score: 0.7\n", - "score: 1.0\n", - "Bootstrapped 1 full traces after 2 examples in round 0.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from dspy.teleprompt import BootstrapFewShot\n", - "from dspy.functional import TypedPredictor\n", - "import traceback\n", - "\n", - "class AssessmentReuslt(BaseModel):\n", - " \"\"\"The assessment result of the entities and relationships\"\"\"\n", - "\n", - " score: float = Field(\n", - " 
description=\"float between 0 and 1 indicating the quality of the sql samples generated, 1 being the best, 0 being the worst\"\n", - " )\n", - " \n", - "class SQLSampleAssess(dspy.Signature):\n", - " \"\"\"\n", - " Assess the quality of the SQL samples generated by the model.\n", - " You need to consider the following factors:\n", - " 1. Completeness: Ensure that the steps cover all necessary actions to execute the SQL query.\n", - " 2. Correctness: Verify that the SQL commands are accurate and error-free.\n", - " 3. Clarity: Evaluate the clarity of the explanations and SQL commands provided.\n", - " \"\"\"\n", - "\n", - " assessed_samples: SQLSample = dspy.InputField(desc=\"the assessed sql samples\")\n", - " gold_samples: SQLSample = dspy.InputField(desc=\"the gold sql samples\")\n", - " result:AssessmentReuslt = dspy.OutputField(desc=\"the assessment result\")\n", - "\n", - "def assessment_metric(gold, pred, trace=None):\n", - " with dspy.context(lm=turbo):\n", - " try:\n", - " score = TypedPredictor(SQLSampleAssess)(\n", - " assessed_samples=pred.sample,\n", - " gold_samples=gold.sample,\n", - " config={\n", - " \"response_format\":{ \"type\": \"json_object\" },\n", - " }\n", - " )\n", - " except Exception as e:\n", - " print(f\"Error: {e}\")\n", - " traceback.print_exc()\n", - " return False\n", - " \n", - " print(f\"score: {score.result.score}\")\n", - "\n", - " if trace is None: # if we're doing evaluation or optimization\n", - " #return relationship_score.result.score >= 0.85\n", - " pass\n", - "\n", - " return score.result.score >= 0.85\n", - "\n", - "\n", - "teleprompter = BootstrapFewShot(metric=assessment_metric)\n", - "\n", - "# Compile!\n", - "compiled_program = teleprompter.compile(SQLGenModule(turbo), trainset=trainset)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('prog.predictor', Predict(SampleGen(QA_content -> sample\n", - " 
instructions=\"You are a technical assistant at TiDB, dedicated to providing users with precise and actionable guidance. \\nYour mission is to ensure that users receive not only accurate answers but also valuable learning opportunities through practical, step-by-step examples. \\nTo achieve this, adhere to the following instructions:\\n\\n1. Understand the User's question and answer: Carefully review the user's question and answer provided. Ensure you fully grasp the technical context, the core issue, and any relevant background information.\\n2. Determine the Feasibility of Providing a Complete Example:\\n - Evaluate whether a step-by-step example can be provided to help the user better understand the topic at hand. Consider the technical details involved, \\n and ensure that any example you provide is fully executable without requiring additional adjustments.\\n - Ensure the example is comprehensive, and is designed to be directly usable in TiDB Serverless.\\n3. Generate and Present a Complete Example:\\n - Create a clear, detailed SQLs guide that the user can follow step-by-step. This example should include all necessary SQL commands and should be self-contained without requiring additional adjustments.\\n - **Each step should include a single SQL Query (only SQL are allowed)**. he example should be self-contained, requiring no additional adjustments or assumptions from the user. 
\\n Avoid combining multiple SQL commands within a single step to maintain clarity and prevent confusion.\\n\\n\\nBy following these instructions, you will help the user not only resolve their current query but also deepen their understanding of the topic through practical application.\"\n", - " QA_content = Field(annotation=str required=True json_schema_extra={'desc': \"The user's query that requires a step-by-step example to be generated.\", '__dspy_field_type': 'input', 'prefix': 'QA Content:'})\n", - " sample = Field(annotation=SQLSample required=True json_schema_extra={'desc': 'Step-by-step example to execute the SQL query in TiDB Serverless.', '__dspy_field_type': 'output', 'prefix': 'Sample:'})\n", - ")))]\n" - ] - } - ], - "source": [ - "compiled_program.save(\"./sql_sample_gen_program\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from app.utils.sql_sample_gen import SQlGenerator\n", - "\n", - "generator = SQlGenerator(turbo)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ad", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}