Commit 8981203

expose chunk tokens for pdf
zzstoatzz committed Feb 23, 2024
1 parent 4a8885e commit 8981203
Showing 9 changed files with 252 additions and 35 deletions.
46 changes: 46 additions & 0 deletions examples/chat_with_X/repo.py
@@ -0,0 +1,46 @@
import asyncio

from marvin.beta.assistants import Assistant
from marvin.utilities.tools import custom_partial
from rich.status import Status

from raggy.loaders.github import GitHubRepoLoader
from raggy.vectorstores.tpuf import TurboPuffer, query_namespace

TPUF_NS = "demo"


async def ingest_repo(repo: str):
loader = GitHubRepoLoader(repo=repo)

with Status(f"Loading {repo}"):
documents = await loader.load()

    async with TurboPuffer(namespace=TPUF_NS) as tpuf:
        with Status(f"Ingesting {repo}"):  # rich's Status is sync-only, so nest it
            await tpuf.upsert(documents)


async def chat_with_repo(repo: str, clean_up: bool = True):
await ingest_repo(repo)

try:
with Assistant(
name="Raggy Expert",
instructions=(
"You use `query_namespace` to answer questions about a github"
f" repo called {repo}!. You MUST use this tool to answer questions."
),
tools=[custom_partial(query_namespace, namespace=TPUF_NS)],
) as assistant:
assistant.chat()

finally:
if clean_up:
            async with TurboPuffer(namespace=TPUF_NS) as tpuf:
                with Status(f"Cleaning up namespace {TPUF_NS}"):  # sync context manager
                    await tpuf.reset()


if __name__ == "__main__":
asyncio.run(chat_with_repo("zzstoatzz/raggy"))
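
Here `custom_partial` pre-binds `namespace=TPUF_NS` so the assistant-facing tool only exposes the query arguments. A rough sketch of the idea (an assumption, not marvin's actual implementation): behave like `functools.partial` while keeping the wrapped function's name and docstring, which tool-schema generation relies on:

import functools

def custom_partial_sketch(fn, **bound):
    """Hypothetical stand-in for marvin's custom_partial."""
    @functools.wraps(fn)  # preserve __name__ / __doc__ for the tool schema
    def wrapper(*args, **kwargs):
        return fn(*args, **{**bound, **kwargs})
    return wrapper
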
3 changes: 3 additions & 0 deletions examples/chat_with_X/requirements.txt
@@ -0,0 +1,3 @@
marvin
raggy
turbopuffer
24 changes: 22 additions & 2 deletions pyproject.toml
@@ -19,8 +19,11 @@ dependencies = [
"bs4",
"chardet",
"fake-useragent",
"gh-util",
"openai>1.0.0",
"pypdf",
"tenacity",
"tiktoken",
"xxhash",
"yake",
]
@@ -35,15 +38,28 @@ dev = [
"mkdocstrings[python]~=0.22",
"mypy",
"pre-commit>=2.21,<4.0",
"pydantic[dotenv]",
"ruff",
"types-aiofiles",
"raggy[tests]",
]

tests = [
"pytest-asyncio>=0.18.2,!=0.22.0,<0.23.0",
"pytest-env>=0.8,<2.0",
"pytest-rerunfailures>=10,<14",
"pytest-sugar>=0.9,<2.0",
"pytest~=7.3.1",
"pytest-timeout",
"pytest-xdist",
]

chroma = ["chromadb"]
tpuf = ["turbopuffer"]
pdf = ["pypdf"]

[project.scripts]
raggy = "raggy.cli:app"

[project.urls]
Code = "https://github.com/zzstoatzz/raggy"

@@ -70,7 +86,11 @@ asyncio_mode = 'auto'
filterwarnings = [
"ignore:'crypt' is deprecated and slated for removal in Python 3.13:DeprecationWarning",
]
env = [
    'D:RAGGY_LOG_VERBOSE=1',
    'D:RAGGY_LOG_LEVEL=DEBUG',
    'PYTEST_TIMEOUT=20',
]
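
(In pytest-env, the `D:` prefix marks a default: the variable is applied only if it isn't already set in the environment, so local overrides still win.)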

[tool.ruff]
extend-select = ["I"]
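
With the extras above split out, the heavier backends stay optional; standard pip extras syntax lets consumers pick only what they need, for example:

pip install "raggy[tpuf,pdf]"
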
175 changes: 175 additions & 0 deletions src/raggy/cli/__init__.py
@@ -0,0 +1,175 @@
import argparse
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import openai
from prompt_toolkit import PromptSession
from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
from prompt_toolkit.history import FileHistory
from rich.console import Console, ConsoleOptions, RenderResult
from rich.live import Live
from rich.markdown import CodeBlock, Markdown
from rich.status import Status
from rich.syntax import Syntax
from rich.text import Text


class SimpleCodeBlock(CodeBlock):
def __rich_console__(
self, console: Console, options: ConsoleOptions
) -> RenderResult:
code = str(self.text).rstrip()
yield Text(self.lexer_name, style="dim")
yield Syntax(
code,
self.lexer_name,
theme=self.theme,
background_color="default",
word_wrap=True,
)
yield Text(f"/{self.lexer_name}", style="dim")


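# route all fenced code blocks through the plain, dim rendering defined above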
Markdown.elements["fence"] = SimpleCodeBlock


def app() -> int:
parser = argparse.ArgumentParser(
prog="aicli",
description="""\
OpenAI-powered AI CLI (thank you samuelcolvin)
Special prompts:
* `show-markdown` - show the markdown output from the previous response
* `multiline` - toggle multiline mode
""",
)
parser.add_argument(
"prompt", nargs="?", help="AI Prompt, if omitted fall into interactive mode"
)

parser.add_argument(
"--no-stream",
action="store_true",
help="Whether to stream responses from OpenAI",
)

parser.add_argument("--version", action="store_true", help="Show version and exit")

args = parser.parse_args()

console = Console()
console.print("OpenAI powered AI CLI", style="green bold", highlight=False)
if args.version:
return 0

try:
openai_api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
console.print(
"You must set the OPENAI_API_KEY environment variable", style="red"
)
return 1

client = openai.OpenAI(api_key=openai_api_key)

now_utc = datetime.now(timezone.utc)
t = now_utc.astimezone().tzinfo.tzname(now_utc) # type: ignore
setup = f"""\
Help the user by responding to their request; the output should
be concise and always written in markdown. The current date and time
is {datetime.now()} {t}. The user is running {sys.platform}."""

stream = not args.no_stream
messages = [{"role": "system", "content": setup}]

if args.prompt:
messages.append({"role": "user", "content": args.prompt})
try:
ask_openai(client, messages, stream, console)
except KeyboardInterrupt:
pass
return 0

    history = Path.home() / ".openai-prompt-history.txt"
session = PromptSession(history=FileHistory(str(history)))
multiline = False

while True:
try:
text = session.prompt(
"aicli ➤ ", auto_suggest=AutoSuggestFromHistory(), multiline=multiline
)
except (KeyboardInterrupt, EOFError):
return 0

if not text.strip():
continue

ident_prompt = text.lower().strip(" ").replace(" ", "-")
if ident_prompt == "show-markdown":
last_content = messages[-1]["content"]
console.print("[dim]Last markdown output of last question:[/dim]\n")
console.print(
Syntax(last_content, lexer="markdown", background_color="default")
)
continue
elif ident_prompt == "multiline":
multiline = not multiline
if multiline:
console.print(
"Enabling multiline mode. "
"[dim]Press [Meta+Enter] or [Esc] followed by [Enter] to accept input.[/dim]"
)
else:
console.print("Disabling multiline mode.")
continue

messages.append({"role": "user", "content": text})

try:
content = ask_openai(client, messages, stream, console)
except KeyboardInterrupt:
return 0
messages.append({"role": "assistant", "content": content})


def ask_openai(
client: openai.OpenAI,
messages: list[dict[str, str]],
stream: bool,
console: Console,
) -> str:
with Status("[dim]Working on it…[/dim]", console=console):
response = client.chat.completions.create(
model="gpt-4", messages=messages, stream=stream
)

console.print("\nResponse:", style="green")
if stream:
content = ""
interrupted = False
with Live("", refresh_per_second=15, console=console) as live:
try:
for chunk in response:
if chunk.choices[0].finish_reason is not None:
break
                    chunk_text = chunk.choices[0].delta.content
                    if chunk_text is not None:  # role-only chunks carry no content
                        content += chunk_text
live.update(Markdown(content))
except KeyboardInterrupt:
interrupted = True

if interrupted:
console.print("[dim]Interrupted[/dim]")
else:
content = response.choices[0].message.content
console.print(Markdown(content))

return content


if __name__ == "__main__":
sys.exit(app())
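
Since pyproject.toml registers raggy = "raggy.cli:app" under [project.scripts], this module doubles as the `raggy` console command once installed. Based on the argparse setup above, typical invocations look like (the prompts are illustrative):

raggy "what changed in PDFLoader?"        # one-shot prompt
raggy                                     # interactive mode
raggy --no-stream "summarize this repo"   # wait for the full response
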
35 changes: 3 additions & 32 deletions src/raggy/loaders/github.py
@@ -3,14 +3,14 @@
import functools
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple

import aiofiles
import chardet
import httpx
from gh_util.types import GitHubComment, GitHubIssue
from pydantic import Field, field_validator, model_validator

from raggy.documents import Document, document_to_excerpts
from raggy.loaders import Loader
@@ -28,39 +28,10 @@ async def read_file_with_chardet(file_path, errors="replace"):
return text


class GitHubUser(BaseModel):
login: str


class GitHubComment(BaseModel):
body: str = Field(default="")
user: GitHubUser = Field(default_factory=GitHubUser)


class GitHubLabel(BaseModel):
name: str = Field(default="")


class GitHubIssue(BaseModel):
created_at: datetime = Field(...)
html_url: str = Field(...)
number: int = Field(...)
title: str = Field(default="")
body: str | None = Field(default="")
labels: List[GitHubLabel] = Field(default_factory=GitHubLabel)
user: GitHubUser = Field(default_factory=GitHubUser)

@field_validator("body")
def validate_body(cls, v):
if not v:
return ""
return v


class GitHubIssueLoader(Loader):
"""Loader for GitHub issues in a given repository.
    **Beware** the [GitHub API rate limit](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api).
Attributes:
repo: The GitHub repository in the format 'owner/repo'.
4 changes: 3 additions & 1 deletion src/raggy/loaders/pdf.py
@@ -56,6 +56,7 @@ class PDFLoader(Loader):
"""

file_path: str
chunk_tokens: int = 500

@asynccontextmanager
async def open_pdf_file(self, file_path: str):
@@ -79,6 +80,7 @@ async def load(self) -> List[Document]:
                Document(
                    text=page.extract_text(),
                    metadata={"page": i + 1, "file_path": self.file_path},
                ),
                chunk_tokens=self.chunk_tokens,
            )
        ]
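
With chunk_tokens now exposed, callers can trade excerpt size against excerpt count; a minimal usage sketch (the PDF path is hypothetical, and the default stays at 500 tokens):

import asyncio

from raggy.documents import Document
from raggy.loaders.pdf import PDFLoader

async def main():
    # smaller chunks produce more, finer-grained excerpts
    loader = PDFLoader(file_path="papers/example.pdf", chunk_tokens=300)
    documents: list[Document] = await loader.load()
    print(f"loaded {len(documents)} excerpts")

asyncio.run(main())
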
Empty file added tests/__init__.py
Empty file added tests/loaders/__init__.py
Empty file added tests/loaders/test_github.py
Empty file.
