From 5fb4aa342723c848e4af2f063c50d3c4149dfbb3 Mon Sep 17 00:00:00 2001 From: Abirdcfly Date: Fri, 15 Mar 2024 17:04:00 +0800 Subject: [PATCH] feat: show pdf info in reference Signed-off-by: Abirdcfly --- .../arcadia_v1alpha1_model_reranking_bce.yaml | 1 + controllers/base/knowledgebase_controller.go | 2 +- .../documentloader/documentloader.go | 2 +- pkg/documentloaders/pdf.go | 11 ++++++++--- pkg/documentloaders/qa_csv.go | 19 +++++++++++++------ pkg/vectorstore/vectorstore.go | 5 ++++- 6 files changed, 28 insertions(+), 12 deletions(-) diff --git a/config/samples/arcadia_v1alpha1_model_reranking_bce.yaml b/config/samples/arcadia_v1alpha1_model_reranking_bce.yaml index f43091d92..0913632e1 100644 --- a/config/samples/arcadia_v1alpha1_model_reranking_bce.yaml +++ b/config/samples/arcadia_v1alpha1_model_reranking_bce.yaml @@ -13,3 +13,4 @@ spec: HuggingFace: https://huggingface.co/maidalun1020/bce-reranker-base_v1 types: "reranking" huggingFaceRepo: maidalun1020/bce-reranker-base_v1 + modelSource: huggingface diff --git a/controllers/base/knowledgebase_controller.go b/controllers/base/knowledgebase_controller.go index 55fdf81a7..26749326c 100644 --- a/controllers/base/knowledgebase_controller.go +++ b/controllers/base/knowledgebase_controller.go @@ -537,7 +537,7 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge case ".html", ".htm": loader = documentloaders.NewHTML(dataReader) case ".pdf": - loader = pkgdocumentloaders.NewPDF(dataReader) + loader = pkgdocumentloaders.NewPDF(dataReader, fileName) // TODO: support .mp3,.wav default: loader = documentloaders.NewText(dataReader) diff --git a/pkg/appruntime/documentloader/documentloader.go b/pkg/appruntime/documentloader/documentloader.go index 89f899a05..99f607735 100644 --- a/pkg/appruntime/documentloader/documentloader.go +++ b/pkg/appruntime/documentloader/documentloader.go @@ -127,7 +127,7 @@ func (dl *DocumentLoader) Run(ctx context.Context, cli client.Client, args map[s loader = 
documentloaders.NewHTML(dataReader) case ".pdf": dataReader := bytes.NewReader(data) - loader = arcadiadocumentloaders.NewPDF(dataReader) + loader = arcadiadocumentloaders.NewPDF(dataReader, file) // loader = documentloaders.NewPDF(dataReader, int64(len(data))) default: dataReader := bytes.NewReader(data) diff --git a/pkg/documentloaders/pdf.go b/pkg/documentloaders/pdf.go index 307cfd6c2..10164304b 100644 --- a/pkg/documentloaders/pdf.go +++ b/pkg/documentloaders/pdf.go @@ -29,11 +29,12 @@ import ( ) type PDF struct { - r io.Reader + r io.Reader + fileName string } -func NewPDF(r io.Reader) *PDF { - return &PDF{r: r} +func NewPDF(r io.Reader, fileName string) *PDF { + return &PDF{r: r, fileName: fileName} } func (p *PDF) Load(ctx context.Context) ([]schema.Document, error) { @@ -59,6 +60,8 @@ func (p *PDF) Load(ctx context.Context) ([]schema.Document, error) { Metadata: map[string]any{ "page": page, "total_pages": page, + FileNameCol: p.fileName, + PageNumberCol: strconv.Itoa(page), }, }) break @@ -68,6 +71,8 @@ func (p *PDF) Load(ctx context.Context) ([]schema.Document, error) { Metadata: map[string]any{ "page": page, "total_pages": pages, + FileNameCol: p.fileName, + PageNumberCol: strconv.Itoa(page), }, }) from = idx + len(key) + 1 diff --git a/pkg/documentloaders/qa_csv.go b/pkg/documentloaders/qa_csv.go index 990551309..96dfbe08f 100644 --- a/pkg/documentloaders/qa_csv.go +++ b/pkg/documentloaders/qa_csv.go @@ -31,13 +31,20 @@ import ( ) const ( - QuestionCol = "q" - AnswerCol = "a" - FileNameCol = "file_name" - PageNumberCol = "page_number" + // QuestionCol the question column, which will be embedded + QuestionCol = "q" + // AnswerCol the answer column, will be added to QuestionCol when using qachain and knowledgebase retriever + AnswerCol = "a" + // FileNameCol the file name column, will show in reference + FileNameCol = "file_name" + // PageNumberCol the page number column, will show in reference + PageNumberCol = "page_number" + // ChunkContentCol the chunk
content column, will show in reference ChunkContentCol = "chunk_content" - LineNumber = "line_number" - QAFileName = "qafile_name" + // LineNumber the qafile line number column + LineNumber = "line_number" + // QAFileName the qafile name + QAFileName = "qafile_name" ) // QACSV represents a QA CSV document loader. diff --git a/pkg/vectorstore/vectorstore.go b/pkg/vectorstore/vectorstore.go index 9ea609470..36d75bea5 100644 --- a/pkg/vectorstore/vectorstore.go +++ b/pkg/vectorstore/vectorstore.go @@ -118,17 +118,20 @@ func AddDocuments(ctx context.Context, log logr.Logger, vs *arcadiav1alpha1.Vect log.Info("handle file: add documents to embedder") if store, ok := s.(*PGVectorStore); ok { // now only pgvector support Row-level updates - log.V(3).Info("handle file: use pgvector, filter out exist documents") + log.V(3).Info("handle file: use pgvector, filter out exist documents...") if documents, err = store.RemoveExist(ctx, log, documents); err != nil { return err } + log.V(3).Info("handle file: use pgvector, filter out exist documents done") } for i, doc := range documents { log.V(5).Info(fmt.Sprintf("add doc to vectorstore, document[%d]: embedding:%s, metadata:%v", i, doc.PageContent, doc.Metadata)) } + log.V(3).Info("handle file: add documents, may take long time...") if _, err = s.AddDocuments(ctx, documents); err != nil { return err } + log.V(3).Info("handle file: add documents done") if finish != nil { finish() }