From a250dff400cfb6f30368819b9d44119a143bc348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Sat, 14 Sep 2024 10:38:26 +0800 Subject: [PATCH 1/3] add vector index part in other document MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- vector-search-data-types.md | 9 +++++++-- vector-search-integrate-with-django-orm.md | 19 ++++++++++++++++++- vector-search-integrate-with-peewee.md | 19 ++++++++++++++++++- vector-search-integrate-with-sqlalchemy.md | 14 ++++++++++++++ vector-search-limitations.md | 1 + 5 files changed, 58 insertions(+), 4 deletions(-) diff --git a/vector-search-data-types.md b/vector-search-data-types.md index 146fa67db45d..97aab2c602e8 100644 --- a/vector-search-data-types.md +++ b/vector-search-data-types.md @@ -14,6 +14,7 @@ summary: 本文介绍 TiDB 的向量数据类型。 与使用 [`JSON`](/data-type-json.md) 类型相比,使用向量类型具有以下优势: +- 支持向量索引。 可以通过构建[向量搜索索引](/vector-search-index.md)加速查询。 - 可指定维度。指定一个固定维度后,不符合维度的数据将被阻止写入到表中。 - 存储格式更优。向量数据类型针对向量数据进行了特别优化,在空间利用和性能效率上都优于 `JSON` 类型。 @@ -52,7 +53,8 @@ ERROR 1105 (HY000): Invalid vector text: [5, ] ERROR 1105 (HY000): vector has 2 dimensions, does not fit VECTOR(3) ``` -可参阅[向量函数与操作符](/vector-search-functions-and-operators.md)了解向量数据类型支持的所有函数和操作符。 +可参阅 [向量函数与操作符](/vector-search-functions-and-operators.md) 了解向量数据类型支持的所有函数和操作符。 +可参阅 [向量搜索索引](/vector-search-index.md) 了解向量搜索索引的信息。 ## 混合存储不同维度的向量 @@ -68,6 +70,8 @@ INSERT INTO vector_table VALUES (1, '[0.3, 0.5, -0.1]'); -- 3 dimensions vector, INSERT INTO vector_table VALUES (2, '[0.3, 0.5]'); -- 2 dimensions vector, OK ``` +但是,我们不能为存储了不同维度的向量列构建 [向量搜索索引](/vector-search-index.md),因为向量距离只能在具有相同维度的向量之间计算。 + ## 比较 [比较运算符](/vector-search-functions-and-operators.md#扩展的内置函数和运算符) 如 `=`, `!=`, `<`, `>`, `<=` 和 `>=` 等都能正常对向量数据进行比较。可参阅[向量函数与操作符](/vector-search-functions-and-operators.md#扩展的内置函数和运算符)了解向量数据类型支持的所有函数和操作符。 @@ -239,4 +243,5 @@ ERROR 1105 (HY000): vectors have different dimensions: 1 
and 3 ## 另请参阅 -- [向量函数和操作符](/vector-search-functions-and-operators.md) \ No newline at end of file +- [向量函数和操作符](/vector-search-functions-and-operators.md) +- [向量搜索索引](/vector-search-index.md) \ No newline at end of file diff --git a/vector-search-integrate-with-django-orm.md b/vector-search-integrate-with-django-orm.md index 9c7ce2f86691..c279a9bef2d0 100644 --- a/vector-search-integrate-with-django-orm.md +++ b/vector-search-integrate-with-django-orm.md @@ -224,6 +224,22 @@ Document.objects.create(content="fish", embedding=[1, 2, 4]) Document.objects.create(content="tree", embedding=[1, 0, 0]) ``` +#### 用索引定义优化的向量列 + +定义三维向量列,并使用 [向量搜索索引 (HNSW 索引)](/vector-search-index.md) 对其进行优化。 + +```python +class DocumentWithIndex(models.Model): + content = models.TextField() + # Note: + # - Using comment to add hnsw index is a temporary solution. In the future it will use `CREATE INDEX` syntax. + # - Currently the HNSW index cannot be changed after the table has been created. + # - Only Django >= 4.2 supports `db_comment`. 
+ embedding = VectorField(dimensions=3, db_comment="VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))") +``` + +TiDB 将使用该索引来加速基于余弦距离函数的向量搜索查询。 + ### 搜索近邻向量 TiDB 向量支持以下距离函数: @@ -253,4 +269,5 @@ results = Document.objects.annotate( ## 另请参阅 -- [向量数据类型](/vector-search-data-types.md) \ No newline at end of file +- [向量数据类型](/vector-search-data-types.md) +- [向量搜索索引](/vector-search-index.md) \ No newline at end of file diff --git a/vector-search-integrate-with-peewee.md b/vector-search-integrate-with-peewee.md index d06e1e52f38d..3a3fc3d23d5e 100644 --- a/vector-search-integrate-with-peewee.md +++ b/vector-search-integrate-with-peewee.md @@ -223,6 +223,22 @@ Document.create(content='fish', embedding=[1, 2, 4]) Document.create(content='tree', embedding=[1, 0, 0]) ``` +#### 用索引定义优化的向量列 + +定义三维向量列,并使用 [向量搜索索引](/vector-search-index.md) (HNSW 索引) 对其进行优化。 + +```python +class DocumentWithIndex(Model): + class Meta: + database = db + table_name = 'peewee_demo_documents_with_index' + + content = TextField() + embedding = VectorField(3, constraints=[SQL("VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))")]) +``` + +TiDB 将使用该索引来加速基于余弦距离函数的向量搜索查询。 + ### 搜索近邻向量 可以选择使用余弦距离 (`CosineDistance`) 函数,查询与向量 `[1, 2, 3]` 语义最接近的前 3 个 `document`。 @@ -244,4 +260,5 @@ results = Document.select(Document, distance).where(distance_expression < 0.2).o ## 另请参阅 -- [向量数据类型](/vector-search-data-types.md) \ No newline at end of file +- [向量数据类型](/vector-search-data-types.md) +- [向量搜索索引](/vector-search-index.md) \ No newline at end of file diff --git a/vector-search-integrate-with-sqlalchemy.md b/vector-search-integrate-with-sqlalchemy.md index 5650b5bb2cb5..6094302d28e1 100644 --- a/vector-search-integrate-with-sqlalchemy.md +++ b/vector-search-integrate-with-sqlalchemy.md @@ -186,6 +186,20 @@ with Session(engine) as session: session.commit() ``` +#### 用索引定义优化的向量列 + +定义三维向量列,并使用 [向量搜索索引](/vector-search-index.md) (HNSW 索引) 对其进行优化。 + +```python +class 
DocumentWithIndex(Base): + __tablename__ = 'sqlalchemy_demo_documents_with_index' + id = Column(Integer, primary_key=True) + content = Column(Text) + embedding = Column(VectorType(3), comment="VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))") +``` + +TiDB 将使用该索引来加速基于余弦距离函数的矢量搜索查询。 + ### 搜索近邻向量 可以选择使用余弦距离 (`CosineDistance`) 函数,查询与向量 `[1, 2, 3]` 语义最接近的前 3 个 `document`。 diff --git a/vector-search-limitations.md b/vector-search-limitations.md index 5c0af31dc65e..838cf8216687 100644 --- a/vector-search-limitations.md +++ b/vector-search-limitations.md @@ -9,6 +9,7 @@ summary: 了解 TiDB 向量搜索功能的限制。 - 向量最大支持 16383 维。 - 向量数据中不支持 `NaN`、`Infinity` 和 `-Infinity` 浮点数。 +- 创建 [向量搜索索引](/vector-search-index.md) 时只支持余弦距离和 L2 距离。 - 目前,向量数据类型不支持存储双精度浮点数(该功能计划在未来的版本中支持)。当向 TiDB 中的向量字段插入或存储数据时,如果这些数据的类型是双精度浮点数,TiDB 会将这些双精度浮点数自动转换为单精度浮点数。 ## 反馈 From 634c602580107c14eca2c63c6124e3bff8403eae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Sat, 14 Sep 2024 10:44:03 +0800 Subject: [PATCH 2/3] modify index name when create vector index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- vector-search-index.md | 4 ++-- vector-search-integrate-with-django-orm.md | 2 +- vector-search-integrate-with-peewee.md | 2 +- vector-search-integrate-with-sqlalchemy.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vector-search-index.md b/vector-search-index.md index 51c11ed96922..b250805af730 100644 --- a/vector-search-index.md +++ b/vector-search-index.md @@ -26,7 +26,7 @@ TiDB 目前支持以下向量搜索索引算法: id INT PRIMARY KEY, data VECTOR(5), data64 VECTOR64(10), - VECTOR INDEX data USING HNSW ((VEC_COSINE_DISTANCE(data))) + VECTOR INDEX idx_data USING HNSW ((VEC_COSINE_DISTANCE(data))) ); ``` @@ -122,7 +122,7 @@ CREATE TABLE docs ( ver VARCHAR(10), doc TEXT, embedding VECTOR(3), - VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding))) + VECTOR INDEX idx_embedding 
USING HNSW ((VEC_COSINE_DISTANCE(embedding))) ) PARTITION BY LIST COLUMNS (ver) ( PARTITION p_v1_0 VALUES IN ('v1.0'), PARTITION p_v1_1 VALUES IN ('v1.1'), diff --git a/vector-search-integrate-with-django-orm.md b/vector-search-integrate-with-django-orm.md index c279a9bef2d0..02a8ac58d029 100644 --- a/vector-search-integrate-with-django-orm.md +++ b/vector-search-integrate-with-django-orm.md @@ -235,7 +235,7 @@ class DocumentWithIndex(models.Model): # - Using comment to add hnsw index is a temporary solution. In the future it will use `CREATE INDEX` syntax. # - Currently the HNSW index cannot be changed after the table has been created. # - Only Django >= 4.2 supports `db_comment`. - embedding = VectorField(dimensions=3, db_comment="VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))") + embedding = VectorField(dimensions=3, db_comment="VECTOR INDEX idx_embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))") ``` TiDB 将使用该索引来加速基于余弦距离函数的向量搜索查询。 diff --git a/vector-search-integrate-with-peewee.md b/vector-search-integrate-with-peewee.md index 3a3fc3d23d5e..611277c0d0b9 100644 --- a/vector-search-integrate-with-peewee.md +++ b/vector-search-integrate-with-peewee.md @@ -234,7 +234,7 @@ class DocumentWithIndex(Model): table_name = 'peewee_demo_documents_with_index' content = TextField() - embedding = VectorField(3, constraints=[SQL("VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))")]) + embedding = VectorField(3, constraints=[SQL("VECTOR INDEX idx_embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))")]) ``` TiDB 将使用该索引来加速基于余弦距离函数的向量搜索查询。 diff --git a/vector-search-integrate-with-sqlalchemy.md b/vector-search-integrate-with-sqlalchemy.md index 6094302d28e1..484e52b62292 100644 --- a/vector-search-integrate-with-sqlalchemy.md +++ b/vector-search-integrate-with-sqlalchemy.md @@ -195,7 +195,7 @@ class DocumentWithIndex(Base): __tablename__ = 'sqlalchemy_demo_documents_with_index' id = Column(Integer, primary_key=True) content = 
Column(Text) - embedding = Column(VectorType(3), comment="VECTOR INDEX embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))") + embedding = Column(VectorType(3), comment="VECTOR INDEX idx_embedding USING HNSW ((VEC_COSINE_DISTANCE(embedding)))") ``` TiDB 将使用该索引来加速基于余弦距离函数的矢量搜索查询。 From 4b54e6dd86390494d44627e6006ce63d9aa76059 Mon Sep 17 00:00:00 2001 From: EricZequan <110292382+EricZequan@users.noreply.github.com> Date: Sat, 14 Sep 2024 11:04:20 +0800 Subject: [PATCH 3/3] Update vector-search-improve-performance.md --- vector-search-improve-performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector-search-improve-performance.md b/vector-search-improve-performance.md index 6c9cc96a171b..6928a79f7ae6 100644 --- a/vector-search-improve-performance.md +++ b/vector-search-improve-performance.md @@ -17,7 +17,7 @@ summary: 了解优化 TiDB 向量搜索性能的最佳实践。 ## 减少向量维数或缩短嵌入时间 -随着向量大小的增加,向量搜索索引和查询的计算复杂度会显著增加,因为这意味着要进行更多的浮点数比较运算。 +随着向量维度大小的增加,向量搜索索引和查询的计算复杂度会显著增加,因为这意味着要进行更多的浮点数比较运算。 为了优化性能,可以考虑尽可能地减少向量的维数。这通常需要切换到另一种嵌入模型。在切换模型时,你需要确保改变嵌入模型对向量查询准确性的影响。