Skip to content

Commit

Permalink
Merge pull request #172 from airtai/171-update-semantic-search-script…
Browse files Browse the repository at this point in the history
…-to-use-examples-from-faststream-gen-repo

Update semantic search script to use examples from faststream gen repo
  • Loading branch information
rjambrecic authored Oct 10, 2023
2 parents 2dd261f + fb020f9 commit 1572bf0
Show file tree
Hide file tree
Showing 15 changed files with 488 additions and 119 deletions.
35 changes: 25 additions & 10 deletions faststream_gen/_code_generator/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
__all__ = ['APPLICATION_FILE_PATH', 'TEST_FILE_PATH', 'TOML_FILE_NAME', 'LOGS_DIR_NAME', 'STEP_LOG_DIR_NAMES', 'DEFAULT_PARAMS',
'MAX_RETRIES', 'MAX_RESTARTS', 'MAX_ASYNC_SPEC_RETRIES', 'TOKEN_TYPES', 'MODEL_PRICING',
'OPENAI_KEY_EMPTY_ERROR', 'OPENAI_KEY_NOT_SET_ERROR', 'EMPTY_DESCRIPTION_ERROR', 'INCOMPLETE_DESCRIPTION',
'DESCRIPTION_EXAMPLE', 'MAX_NUM_FIXES_MSG', 'INCOMPLETE_APP_ERROR_MSG', 'FASTSTREAM_REPO_ZIP_URL',
'FASTSTREAM_DOCS_DIR_SUFFIX', 'FASTSTREAM_EXAMPLES_DIR_SUFFIX', 'FASTSTREAM_EXAMPLE_FILES',
'FASTSTREAM_TMP_DIR_PREFIX', 'FASTSTREAM_DIR_TO_EXCLUDE', 'FASTSTREAM_TEMPLATE_ZIP_URL',
'DESCRIPTION_EXAMPLE', 'MAX_NUM_FIXES_MSG', 'INCOMPLETE_APP_ERROR_MSG', 'FASTSTREAM_GEN_REPO_ZIP_URL',
'FASTSTREAM_GEN_EXAMPLES_DIR_SUFFIX', 'FASTSTREAM_REPO_ZIP_URL', 'FASTSTREAM_ROOT_DIR_NAME',
'FASTSTREAM_DOCS_DIR_SUFFIX', 'FASTSTREAM_EN_DOCS_DIR', 'FASTSTREAM_EXAMPLE_FILES',
'FASTSTREAM_TMP_DIR_PREFIX', 'FASTSTREAM_DIR_TO_EXCLUDE', 'STAT_0o775', 'FASTSTREAM_TEMPLATE_ZIP_URL',
'FASTSTREAM_TEMPLATE_DIR_SUFFIX', 'OpenAIModel']

# %% ../../nbs/Constants.ipynb 2
# %% ../../nbs/Constants.ipynb 1
import stat

# %% ../../nbs/Constants.ipynb 3
APPLICATION_FILE_PATH = "app/application.py"
TEST_FILE_PATH = "tests/test_application.py"
TOML_FILE_NAME = "pyproject.toml"
Expand All @@ -21,7 +25,7 @@
"requirements": "requirements-generation-logs",
}

# %% ../../nbs/Constants.ipynb 4
# %% ../../nbs/Constants.ipynb 5
DEFAULT_PARAMS = {
"temperature": 0.7,
}
Expand All @@ -37,7 +41,7 @@ class OpenAIModel(str, Enum):
gpt4 = "gpt-4"


# %% ../../nbs/Constants.ipynb 7
# %% ../../nbs/Constants.ipynb 8
TOKEN_TYPES = ["prompt_tokens", "completion_tokens", "total_tokens"]

MODEL_PRICING = {
Expand All @@ -51,7 +55,7 @@ class OpenAIModel(str, Enum):
},
}

# %% ../../nbs/Constants.ipynb 9
# %% ../../nbs/Constants.ipynb 10
OPENAI_KEY_EMPTY_ERROR = "Error: OPENAI_API_KEY cannot be empty. Please set a valid OpenAI API key in OPENAI_API_KEY environment variable and try again.\nYou can generate API keys in the OpenAI web interface. See https://platform.openai.com/account/api-keys for details."
OPENAI_KEY_NOT_SET_ERROR = "Error: OPENAI_API_KEY not found in environment variables. Set a valid OpenAI API key in OPENAI_API_KEY environment variable and try again. You can generate API keys in the OpenAI web interface. See https://platform.openai.com/account/api-keys for details."

Expand Down Expand Up @@ -79,14 +83,25 @@ class OpenAIModel(str, Enum):
Please run the following command to start manual debugging:"""

# %% ../../nbs/Constants.ipynb 11
# %% ../../nbs/Constants.ipynb 12
FASTSTREAM_GEN_REPO_ZIP_URL = "http://github.com/airtai/faststream-gen/archive/main.zip"
FASTSTREAM_GEN_EXAMPLES_DIR_SUFFIX = "faststream-gen-main/search/examples"

FASTSTREAM_REPO_ZIP_URL = "http://github.com/airtai/faststream/archive/main.zip"
FASTSTREAM_DOCS_DIR_SUFFIX = "faststream-main/.faststream_gen"
FASTSTREAM_EXAMPLES_DIR_SUFFIX = "faststream-main/faststream_gen_examples"
FASTSTREAM_ROOT_DIR_NAME = "faststream-main"
FASTSTREAM_DOCS_DIR_SUFFIX = ".faststream_gen"
FASTSTREAM_EN_DOCS_DIR = "docs/docs/en"

FASTSTREAM_EXAMPLE_FILES = ['description.txt', 'app_skeleton.py', 'app.py', 'test_app.py']
FASTSTREAM_TMP_DIR_PREFIX = "appended_examples"
FASTSTREAM_DIR_TO_EXCLUDE = "api"

# %% ../../nbs/Constants.ipynb 13
# Permission bits rwxrwxr-x: owner and group get read/write/execute,
# everyone else gets read and execute only.
STAT_0o775 = stat.S_IRWXU | stat.S_IRWXG | stat.S_IROTH | stat.S_IXOTH


# %% ../../nbs/Constants.ipynb 15
FASTSTREAM_TEMPLATE_ZIP_URL = "http://github.com/airtai/faststream-template/archive/main.zip"
FASTSTREAM_TEMPLATE_DIR_SUFFIX = "faststream-template-main"
5 changes: 2 additions & 3 deletions faststream_gen/_code_generator/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
# %% auto 0
__all__ = ['logger', 'examples_delimiter', 'set_cwd', 'set_logger_level', 'retry_on_error', 'ensure_openai_api_key_set',
'add_tokens_usage', 'get_relevant_prompt_examples', 'strip_white_spaces', 'write_file_contents',
'read_file_contents', 'mock_openai_create', 'download_and_extract_faststream_archive',
'validate_python_code']
'read_file_contents', 'mock_openai_create', 'download_and_extract_github_repo', 'validate_python_code']

# %% ../../nbs/Helper.ipynb 1
from typing import *
Expand Down Expand Up @@ -308,7 +307,7 @@ def _fetch_content(url: str) -> requests.models.Response: # type: ignore

# %% ../../nbs/Helper.ipynb 35
@contextmanager
def download_and_extract_faststream_archive(url: str) -> Generator[Path, None, None]:
def download_and_extract_github_repo(url: str) -> Generator[Path, None, None]:
with TemporaryDirectory() as d:
try:
input_path = Path(f"{d}/archive.zip")
Expand Down
137 changes: 112 additions & 25 deletions faststream_gen/_components/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# %% ../../nbs/Embeddings_CLI.ipynb 1
from typing import *
import shutil
import re
import os
from tempfile import TemporaryDirectory
from contextlib import contextmanager
from pathlib import Path
Expand All @@ -22,13 +24,16 @@
from faststream_gen._code_generator.constants import (
FASTSTREAM_REPO_ZIP_URL,
FASTSTREAM_DOCS_DIR_SUFFIX,
FASTSTREAM_EXAMPLES_DIR_SUFFIX,
FASTSTREAM_GEN_REPO_ZIP_URL,
FASTSTREAM_GEN_EXAMPLES_DIR_SUFFIX,
FASTSTREAM_EXAMPLE_FILES,
FASTSTREAM_TMP_DIR_PREFIX,
FASTSTREAM_DIR_TO_EXCLUDE
FASTSTREAM_DIR_TO_EXCLUDE,
FASTSTREAM_ROOT_DIR_NAME,
FASTSTREAM_EN_DOCS_DIR,
)
from .package_data import get_root_data_path
from .._code_generator.helper import download_and_extract_faststream_archive
from .._code_generator.helper import download_and_extract_github_repo

# %% ../../nbs/Embeddings_CLI.ipynb 3
def _create_documents(
Expand Down Expand Up @@ -119,6 +124,82 @@ def _delete_directory(d: str) -> None:
print(f"Error deleting directory: {e}")

# %% ../../nbs/Embeddings_CLI.ipynb 11
def _read_lines_from_file(file_path: Path, lines_spec: str) -> str:
    """Return the lines of a text file selected by a line specification.

    Args:
        file_path: Path of the text file to read.
        lines_spec: Comma-separated, 1-based selection such as ``"3"``,
            ``"1-10"`` or ``"1-3,7"``. A range bound may be left out
            (``"5-"`` or ``"-3"``), in which case the range runs to the end
            of the file or from its first line. An empty string selects the
            whole file.

    Returns:
        The selected lines, concatenated in the order given by the spec.
        Line numbers that fall outside the file are ignored.

    Raises:
        ValueError: If a line number in ``lines_spec`` is not an integer.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(file_path, "r", encoding="utf-8") as file:
        all_lines = file.readlines()

    # An empty spec means "include the whole file".
    if not lines_spec:
        return "".join(all_lines)

    selected_lines: List[str] = []
    for line_spec in lines_spec.split(","):
        line_spec = line_spec.strip()
        if not line_spec:
            # Tolerate stray commas such as "1,2,".
            continue

        if "-" in line_spec:
            # Line range, e.g. "1-10"; a missing bound is open-ended.
            start_text, _, end_text = line_spec.partition("-")
            start = int(start_text) if start_text.strip() else 1
            end = int(end_text) if end_text.strip() else len(all_lines)
            # Clamp the bounds: a start below 1 would otherwise become a
            # negative slice index and silently select lines from the end.
            selected_lines.extend(all_lines[max(start, 1) - 1 : max(end, 0)])
        else:
            # Single line number.
            line_number = int(line_spec)
            if 1 <= line_number <= len(all_lines):
                selected_lines.append(all_lines[line_number - 1])

    return "".join(selected_lines)


def _extract_lines(embedded_line: str, root_path: Path) -> str:
    """Resolve a ``{!> path [ln:spec] !}`` include marker to file content.

    Args:
        embedded_line: A markdown line containing an include marker. The
            marker holds a file path, optionally followed by a ``[ln:...]``
            line specification.
        root_path: Directory that contains either ``docs/docs_src`` or
            ``docs_src``. The include path is resolved against
            ``<root_path>/docs`` in the first case and against
            ``<root_path>`` in the second.

    Returns:
        The content of the referenced file, limited to the requested lines
        when a line specification is present.

    Raises:
        ValueError: If the line holds no complete include marker, or if no
            ``docs_src`` directory can be found under ``root_path``.
    """
    match = re.search(r"\{!>(.*?)!\}", embedded_line)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"Couldn't find a complete include marker in line: {embedded_line!r}"
        )

    include_target = match.group(1).strip()
    lines_spec = ""
    if "[ln:" in include_target:
        include_target, lines_spec = include_target.split("[ln:", 1)
        include_target = include_target.strip()
        # Drop the closing "]" of the "[ln:...]" suffix.
        lines_spec = lines_spec.rstrip().rstrip("]")

    root = Path(root_path)
    if (root / "docs" / "docs_src").exists():
        base_dir = root / "docs"
    elif (root / "docs_src").exists():
        base_dir = root
    else:
        raise ValueError("Couldn't find docs_src directory")

    return _read_lines_from_file(base_dir / include_target, lines_spec)


def _expand_markdown(
    input_markdown_path: Path,
    output_markdown_path: Path,
    root_path: Path
) -> None:
    """Copy a markdown file, replacing include markers with file content.

    Lines without the ``{!>`` marker are written unchanged. A line that
    contains the marker is replaced as a whole by the text that
    ``_extract_lines`` returns for it.

    Args:
        input_markdown_path: Markdown file to read.
        output_markdown_path: File to write the expanded markdown to; it is
            opened in write mode.
        root_path: Root directory passed on to ``_extract_lines`` for
            resolving include paths.
    """
    # Use UTF-8 on both sides so the copy does not depend on the locale.
    with open(input_markdown_path, "r", encoding="utf-8") as input_file, open(
        output_markdown_path, "w", encoding="utf-8"
    ) as output_file:
        for line in input_file:
            if "{!>" not in line:
                # Plain markdown line: copy it through untouched.
                output_file.write(line)
                continue

            snippet = _extract_lines(embedded_line=line, root_path=root_path)
            # The marker line ended with a newline; keep that boundary so
            # the included text is not glued to the next markdown line.
            if snippet and not snippet.endswith("\n"):
                snippet += "\n"
            output_file.write(snippet)

# %% ../../nbs/Embeddings_CLI.ipynb 12
def _expand_faststream_docs(root_path: Path) -> None:
    """Write expanded copies of the markdown docs found under ``root_path``.

    Every ``*.md`` file found recursively under
    ``root_path / FASTSTREAM_EN_DOCS_DIR`` is expanded with
    ``_expand_markdown`` and written to the same relative location under
    ``root_path / FASTSTREAM_DOCS_DIR_SUFFIX``.

    Args:
        root_path: Root directory of the extracted repository.
    """
    output_root = root_path / FASTSTREAM_DOCS_DIR_SUFFIX
    output_root.mkdir(exist_ok=True)
    en_docs_root = root_path / FASTSTREAM_EN_DOCS_DIR

    for md_file in en_docs_root.glob("**/*.md"):
        # Mirror the file's position inside the docs tree. relative_to()
        # avoids string-replacing a hard-coded, separator-dependent prefix.
        output_path = output_root / md_file.relative_to(en_docs_root)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        _expand_markdown(
            input_markdown_path=md_file,
            output_markdown_path=output_path,
            root_path=root_path,
        )

# %% ../../nbs/Embeddings_CLI.ipynb 14
def _generate_docs_db(input_path: Path, output_path: Path) -> None:
"""Generate Document Embeddings Database.
Expand All @@ -127,20 +208,22 @@ def _generate_docs_db(input_path: Path, output_path: Path) -> None:
to the specified output directory.
Args:
input_path (Path): The path to the directory containing input documents.
input_path (Path): The path to the directory containing the extracted files.
output_path (Path): The path to the directory where the embeddings
database will be saved.
"""
with yaspin(
text="Creating embeddings for the docs...", color="cyan", spinner="clock"
) as sp:
docs = _create_documents(input_path)
_expand_faststream_docs(input_path / FASTSTREAM_ROOT_DIR_NAME)

docs = _create_documents(input_path / FASTSTREAM_ROOT_DIR_NAME / FASTSTREAM_DOCS_DIR_SUFFIX)
_save_embeddings_db(docs, output_path)

sp.text = ""
sp.ok(f" ✔ Docs embeddings created and saved to: {output_path}")

# %% ../../nbs/Embeddings_CLI.ipynb 13
# %% ../../nbs/Embeddings_CLI.ipynb 16
def _check_all_files_exist(d: Path, required_files: List[str]) -> bool:
"""Check if all required files exist in a directory.
Expand All @@ -155,7 +238,7 @@ def _check_all_files_exist(d: Path, required_files: List[str]) -> bool:
"""
return all((d / file_name).exists() for file_name in required_files)

# %% ../../nbs/Embeddings_CLI.ipynb 16
# %% ../../nbs/Embeddings_CLI.ipynb 19
def _append_file_contents(d: Path, parent_d: Path, required_files: List[str]) -> None:
"""Append contents of specified files to a result file.
Expand All @@ -179,7 +262,7 @@ def _append_file_contents(d: Path, parent_d: Path, required_files: List[str]) ->
f"==== {file_name} starts ====\n{file.read()}\n==== {file_name} ends ====\n"
)

# %% ../../nbs/Embeddings_CLI.ipynb 18
# %% ../../nbs/Embeddings_CLI.ipynb 21
def _format_examples(input_path: Path, required_files: List[str]) -> None:
"""Format Examples by Appending File Contents.
Expand Down Expand Up @@ -237,12 +320,12 @@ def _generate_examples_db(
sp.text = ""
sp.ok(f" ✔ Examples embeddings created and saved to: {output_path}")

# %% ../../nbs/Embeddings_CLI.ipynb 20
# %% ../../nbs/Embeddings_CLI.ipynb 23
app = typer.Typer(
short_help="Download the zipped FastKafka documentation markdown files, generate embeddings, and save them in a vector database.",
)

# %% ../../nbs/Embeddings_CLI.ipynb 21
# %% ../../nbs/Embeddings_CLI.ipynb 24
@app.command(
"generate",
help="Download the docs and examples from FastStream repo, generate embeddings, and save them in a vector database.",
Expand All @@ -256,26 +339,30 @@ def generate(
)
) -> None:
typer.echo(
f"Downloading files docs and examples from FastStream repo and generating embeddings."
f"Downloading documentation and examples for semantic search."
)
try:
_delete_directory(db_path)

with download_and_extract_faststream_archive(
FASTSTREAM_REPO_ZIP_URL
) as extracted_path:
try:
_delete_directory(db_path)
with download_and_extract_github_repo(
FASTSTREAM_REPO_ZIP_URL
) as extracted_path:
_generate_docs_db(
extracted_path / FASTSTREAM_DOCS_DIR_SUFFIX, Path(db_path) / "docs"
extracted_path, Path(db_path) / "docs"
)

with download_and_extract_github_repo(
FASTSTREAM_GEN_REPO_ZIP_URL
) as extracted_path:
_generate_examples_db(
extracted_path / FASTSTREAM_EXAMPLES_DIR_SUFFIX,
extracted_path / FASTSTREAM_GEN_EXAMPLES_DIR_SUFFIX,
Path(db_path) / "examples",
)

typer.echo(
f"\nSuccessfully generated all the embeddings and saved to: {db_path}"
)
except Exception as e:
fg = typer.colors.RED
typer.secho(f"Unexpected internal error: {e}", err=True, fg=fg)
raise typer.Exit(code=1)
typer.echo(
f"\nSuccessfully generated all the embeddings and saved to: {db_path}"
)
except Exception as e:
fg = typer.colors.RED
typer.secho(f"Unexpected internal error: {e}", err=True, fg=fg)
raise typer.Exit(code=1)
1 change: 0 additions & 1 deletion faststream_gen/_components/integration_test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from .._code_generator.chat import CustomAIChat, ValidateAndFixResponse

from faststream_gen._code_generator.helper import (
download_and_extract_faststream_archive,
write_file_contents,
read_file_contents,
set_cwd,
Expand Down
13 changes: 4 additions & 9 deletions faststream_gen/_components/new_project_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,29 @@
from yaspin import yaspin
import shutil
import os
import stat


from faststream_gen._code_generator.helper import (
download_and_extract_faststream_archive,
download_and_extract_github_repo,
)
from faststream_gen._code_generator.constants import (
FASTSTREAM_TEMPLATE_ZIP_URL,
FASTSTREAM_TEMPLATE_DIR_SUFFIX,
APPLICATION_FILE_PATH,
TEST_FILE_PATH,
STAT_0o775
)

from .._code_generator.helper import write_file_contents

# %% ../../nbs/New_Project_Generator.ipynb 3
_STAT_0o775 = ( stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
| stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
| stat.S_IROTH | stat.S_IXOTH )


def create_project(
output_path: str,
) -> None:
with yaspin(
text="Creating a new FastStream project...", color="cyan", spinner="clock"
) as sp:
with download_and_extract_faststream_archive(
with download_and_extract_github_repo(
FASTSTREAM_TEMPLATE_ZIP_URL
) as extracted_path:
app_file = str(
Expand All @@ -51,7 +46,7 @@ def create_project(
for p in (
Path(extracted_path) / FASTSTREAM_TEMPLATE_DIR_SUFFIX / "scripts"
).glob("*.sh"):
p.chmod(_STAT_0o775)
p.chmod(STAT_0o775)

shutil.copytree(
str(extracted_path / FASTSTREAM_TEMPLATE_DIR_SUFFIX),
Expand Down
12 changes: 10 additions & 2 deletions faststream_gen/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@
'faststream_gen/_code_generator/helper.py'),
'faststream_gen._code_generator.helper.add_tokens_usage': ( 'helper.html#add_tokens_usage',
'faststream_gen/_code_generator/helper.py'),
'faststream_gen._code_generator.helper.download_and_extract_faststream_archive': ( 'helper.html#download_and_extract_faststream_archive',
'faststream_gen/_code_generator/helper.py'),
'faststream_gen._code_generator.helper.download_and_extract_github_repo': ( 'helper.html#download_and_extract_github_repo',
'faststream_gen/_code_generator/helper.py'),
'faststream_gen._code_generator.helper.ensure_openai_api_key_set': ( 'helper.html#ensure_openai_api_key_set',
'faststream_gen/_code_generator/helper.py'),
'faststream_gen._code_generator.helper.get_relevant_prompt_examples': ( 'helper.html#get_relevant_prompt_examples',
Expand Down Expand Up @@ -86,12 +86,20 @@
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._delete_directory': ( 'embeddings_cli.html#_delete_directory',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._expand_faststream_docs': ( 'embeddings_cli.html#_expand_faststream_docs',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._expand_markdown': ( 'embeddings_cli.html#_expand_markdown',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._extract_lines': ( 'embeddings_cli.html#_extract_lines',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._format_examples': ( 'embeddings_cli.html#_format_examples',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._generate_docs_db': ( 'embeddings_cli.html#_generate_docs_db',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._generate_examples_db': ( 'embeddings_cli.html#_generate_examples_db',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._read_lines_from_file': ( 'embeddings_cli.html#_read_lines_from_file',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._save_embeddings_db': ( 'embeddings_cli.html#_save_embeddings_db',
'faststream_gen/_components/embeddings.py'),
'faststream_gen._components.embeddings._split_document_into_chunks': ( 'embeddings_cli.html#_split_document_into_chunks',
Expand Down
Binary file modified faststream_gen/package_data/docs/index.faiss
Binary file not shown.
Binary file modified faststream_gen/package_data/docs/index.pkl
Binary file not shown.
Binary file modified faststream_gen/package_data/examples/index.faiss
Binary file not shown.
Binary file modified faststream_gen/package_data/examples/index.pkl
Binary file not shown.
Loading

0 comments on commit 1572bf0

Please sign in to comment.