-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: get rid of langchain fully
- Loading branch information
1 parent
d10c036
commit 0763052
Showing
15 changed files
with
692 additions
and
1,068 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,80 @@ | ||
from abc import ABC | ||
from abc import ABC, abstractmethod | ||
from typing import Any | ||
|
||
from langchain.embeddings import HuggingFaceEmbeddings | ||
|
||
|
||
class Embedder(ABC):
    """Abstract interface for text-embedding backends.

    Concrete subclasses must implement both methods: one that embeds a
    batch of documents and one that embeds a single query string.
    """

    @abstractmethod
    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed search docs.

        Args:
            texts: The texts to embed.

        Returns:
            A list of embeddings, one for each text.
        """

    @abstractmethod
    def embed_query(self, text: str) -> list[float]:
        """Embed query text.

        Args:
            text: The text to embed.

        Returns:
            The embedding for the text.
        """
|
||
|
||
class HuggingFaceEmbedder(Embedder):
    """HuggingFace sentence_transformers embedding models.

    To use, you should have the ``sentence_transformers`` python package
    installed.

    Configuration is passed as keyword arguments to the constructor; every
    option falls back to the class-level default below when omitted.
    """

    client: Any  #: :meta private:
    # Model name to use.
    model_name: str = "all-MiniLM-L6-v2"
    # Path to store models.
    # Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.
    cache_folder: str | None = None
    # Keyword arguments to pass to the model.
    model_kwargs: dict[str, Any] = {}
    # Keyword arguments to pass when calling the `encode` method of the model.
    encode_kwargs: dict[str, Any] = {}
    # Run encode() on multiple GPUs.
    multi_process: bool = False

    # Names accepted as constructor keyword arguments.
    _FIELDS = (
        "model_name",
        "cache_folder",
        "model_kwargs",
        "encode_kwargs",
        "multi_process",
    )

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer.

        Args:
            **kwargs: Any of ``model_name``, ``cache_folder``,
                ``model_kwargs``, ``encode_kwargs`` and ``multi_process``.

        Raises:
            TypeError: If an unknown keyword argument is given.
            ImportError: If ``sentence_transformers`` is not installed.
        """
        # The base class defines no __init__, so the keyword arguments must
        # be consumed here instead of being forwarded to object.__init__.
        super().__init__()

        unknown = sorted(set(kwargs) - set(self._FIELDS))
        if unknown:
            raise TypeError(
                f"Unexpected keyword argument(s): {', '.join(unknown)}"
            )
        for name, value in kwargs.items():
            setattr(self, name, value)

        # Give every instance its own dicts so the shared class-level
        # defaults are never mutated through an instance.
        self.model_kwargs = dict(self.model_kwargs or {})
        self.encode_kwargs = dict(self.encode_kwargs or {})

        try:
            import sentence_transformers
        except ImportError as exc:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "Please install it with `pip install sentence-transformers`."
            ) from exc

        self.client = sentence_transformers.SentenceTransformer(
            self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
        )

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        if not texts:
            return []

        # Newlines are replaced with spaces before encoding.
        cleaned = [text.replace("\n", " ") for text in texts]

        if self.multi_process:
            pool = self.client.start_multi_process_pool()
            try:
                embeddings = self.client.encode_multi_process(cleaned, pool)
            finally:
                # Always stop the worker pool, even when encoding fails.
                self.client.stop_multi_process_pool(pool)
        else:
            embeddings = self.client.encode(cleaned, **self.encode_kwargs)

        return embeddings.tolist()

    def embed_query(self, text: str) -> list[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from enum import Enum | ||
|
||
|
||
class Format(Enum):
    """Document formats identified by their lowercase string name."""

    MARKDOWN = "markdown"
    HTML = "html"
|
||
|
||
# Maps each supported format name to its list of separator strings.
# Every list ends with the empty string.
SUPPORTED_FORMATS = {
    Format.MARKDOWN.value: [
        # First, try to split along Markdown headings (levels 1 to 6)
        "\n#{1,6} ",
        # Note the alternative syntax for headings (below) is not handled here
        # Heading level 2
        # ---------------
        # End of code block
        "```\n",
        # Horizontal lines made of three or more *, - or _ characters
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        # Note that horizontal lines written with spaces between the
        # markers (e.g. "* * *") are not handled
        "\n\n",
        "\n",
        " ",
        "",
    ],
    Format.HTML.value: [
        # First, try to split along HTML tags
        "<body",
        "<div",
        "<p",
        "<br",
        "<li",
        "<h1",
        "<h2",
        "<h3",
        "<h4",
        "<h5",
        "<h6",
        "<span",
        "<table",
        "<tr",
        "<td",
        "<th",
        "<ul",
        "<ol",
        "<header",
        "<footer",
        "<nav",
        # Head
        "<head",
        "<style",
        "<script",
        "<meta",
        "<title",
        "",
    ],
}
|
||
|
||
def get_separators(format: str):
    """Return the separator list registered for *format*.

    Args:
        format: A key of ``SUPPORTED_FORMATS`` (for example ``"markdown"``),
            or an enum member whose ``value`` is such a key.

    Returns:
        The list stored in ``SUPPORTED_FORMATS`` for that format. It is the
        shared list object, not a copy.

    Raises:
        KeyError: If the format is not supported.
    """
    # Accept enum members as well as their plain string values.
    key = format.value if isinstance(format, Enum) else format
    separators = SUPPORTED_FORMATS.get(key)

    # validate input
    if separators is None:
        # An f-string keeps this a KeyError even when the key is not a str.
        raise KeyError(f"{key} is not a supported format")

    return separators
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.