Skip to content

Commit

Permalink
Merge pull request kubeagi#871 from Abirdcfly/pdf
Browse files Browse the repository at this point in the history
feat: show pdf info in reference
  • Loading branch information
bjwswang authored Mar 15, 2024
2 parents 30cbb0c + 5fb4aa3 commit fbd6bfb
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 12 deletions.
1 change: 1 addition & 0 deletions config/samples/arcadia_v1alpha1_model_reranking_bce.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ spec:
HuggingFace: https://huggingface.co/maidalun1020/bce-reranker-base_v1
types: "reranking"
huggingFaceRepo: maidalun1020/bce-reranker-base_v1
modelSource: huggingface
2 changes: 1 addition & 1 deletion controllers/base/knowledgebase_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge
case ".html", ".htm":
loader = documentloaders.NewHTML(dataReader)
case ".pdf":
loader = pkgdocumentloaders.NewPDF(dataReader)
loader = pkgdocumentloaders.NewPDF(dataReader, fileName)
// TODO: support .mp3,.wav
default:
loader = documentloaders.NewText(dataReader)
Expand Down
2 changes: 1 addition & 1 deletion pkg/appruntime/documentloader/documentloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (dl *DocumentLoader) Run(ctx context.Context, cli client.Client, args map[s
loader = documentloaders.NewHTML(dataReader)
case ".pdf":
dataReader := bytes.NewReader(data)
loader = arcadiadocumentloaders.NewPDF(dataReader)
loader = arcadiadocumentloaders.NewPDF(dataReader, file)
// loader = documentloaders.NewPDF(dataReader, int64(len(data)))
default:
dataReader := bytes.NewReader(data)
Expand Down
11 changes: 8 additions & 3 deletions pkg/documentloaders/pdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@ import (
)

type PDF struct {
r io.Reader
r io.Reader
fileName string
}

func NewPDF(r io.Reader) *PDF {
return &PDF{r: r}
func NewPDF(r io.Reader, fileName string) *PDF {
return &PDF{r: r, fileName: fileName}
}

func (p *PDF) Load(ctx context.Context) ([]schema.Document, error) {
Expand All @@ -59,6 +60,8 @@ func (p *PDF) Load(ctx context.Context) ([]schema.Document, error) {
Metadata: map[string]any{
"page": page,
"total_pages": page,
FileNameCol: p.fileName,
PageNumberCol: strconv.Itoa(page),
},
})
break
Expand All @@ -68,6 +71,8 @@ func (p *PDF) Load(ctx context.Context) ([]schema.Document, error) {
Metadata: map[string]any{
"page": page,
"total_pages": pages,
FileNameCol: p.fileName,
PageNumberCol: strconv.Itoa(page),
},
})
from = idx + len(key) + 1
Expand Down
19 changes: 13 additions & 6 deletions pkg/documentloaders/qa_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,20 @@ import (
)

const (
QuestionCol = "q"
AnswerCol = "a"
FileNameCol = "file_name"
PageNumberCol = "page_number"
// QuestionCol is the question column, whose content will be embedded
QuestionCol = "q"
// AnswerCol is the answer column; it will be added to QuestionCol when using the qachain and knowledgebase retriever
AnswerCol = "a"
// FileNameCol the file name column, will show in reference
FileNameCol = "file_name"
// PageNumberCol the page number column, will show in reference
PageNumberCol = "page_number"
// ChunkContentCol the chunk content column, will show in reference
ChunkContentCol = "chunk_content"
LineNumber = "line_number"
QAFileName = "qafile_name"
// LineNumber is the qafile line number column
LineNumber = "line_number"
// QAFileName the qafile name
QAFileName = "qafile_name"
)

// QACSV represents a QA CSV document loader.
Expand Down
5 changes: 4 additions & 1 deletion pkg/vectorstore/vectorstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,17 +118,20 @@ func AddDocuments(ctx context.Context, log logr.Logger, vs *arcadiav1alpha1.Vect
log.Info("handle file: add documents to embedder")
if store, ok := s.(*PGVectorStore); ok {
// now only pgvector support Row-level updates
log.V(3).Info("handle file: use pgvector, filter out exist documents")
log.V(3).Info("handle file: use pgvector, filter out exist documents...")
if documents, err = store.RemoveExist(ctx, log, documents); err != nil {
return err
}
log.V(3).Info("handle file: use pgvector, filter out exist documents done")
}
for i, doc := range documents {
log.V(5).Info(fmt.Sprintf("add doc to vectorstore, document[%d]: embedding:%s, metadata:%v", i, doc.PageContent, doc.Metadata))
}
log.V(3).Info("handle file: add documents, may take long time...")
if _, err = s.AddDocuments(ctx, documents); err != nil {
return err
}
log.V(3).Info("handle file: add documents done")
if finish != nil {
finish()
}
Expand Down

0 comments on commit fbd6bfb

Please sign in to comment.