Commit 8981203

expose chunk tokens for pdf
zzstoatzz committed Feb 23, 2024
1 parent 4a8885e commit 8981203
Showing 9 changed files with 252 additions and 35 deletions.
46 changes: 46 additions & 0 deletions examples/chat_with_X/repo.py
@@ -0,0 +1,46 @@
import asyncio

from marvin.beta.assistants import Assistant
from marvin.utilities.tools import custom_partial
from rich.status import Status

from raggy.loaders.github import GitHubRepoLoader
from raggy.vectorstores.tpuf import TurboPuffer, query_namespace

TPUF_NS = "demo"


async def ingest_repo(repo: str):
loader = GitHubRepoLoader(repo=repo)

with Status(f"Loading {repo}"):
documents = await loader.load()

    async with TurboPuffer(namespace=TPUF_NS) as tpuf:
        with Status(f"Ingesting {repo}"):  # rich's Status is sync-only, so nest it
            await tpuf.upsert(documents)


async def chat_with_repo(repo: str, clean_up: bool = True):
await ingest_repo(repo)

try:
with Assistant(
name="Raggy Expert",
instructions=(
"You use `query_namespace` to answer questions about a github"
f" repo called {repo}!. You MUST use this tool to answer questions."
),
tools=[custom_partial(query_namespace, namespace=TPUF_NS)],
) as assistant:
assistant.chat()

finally:
if clean_up:
            async with TurboPuffer(namespace=TPUF_NS) as tpuf:
                with Status(f"Cleaning up namespace {TPUF_NS}"):  # sync context manager
                    await tpuf.reset()


if __name__ == "__main__":
asyncio.run(chat_with_repo("zzstoatzz/raggy"))
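
Here `custom_partial` pre-binds `namespace=TPUF_NS` so the assistant-facing tool only exposes the query arguments. A rough sketch of the idea (an assumption, not marvin's actual implementation): behave like `functools.partial` while keeping the wrapped function's name and docstring, which tool-schema generation relies on:

import functools

def custom_partial_sketch(fn, **bound):
    """Hypothetical stand-in for marvin's custom_partial."""
    @functools.wraps(fn)  # preserve __name__ / __doc__ for the tool schema
    def wrapper(*args, **kwargs):
        return fn(*args, **{**bound, **kwargs})
    return wrapper
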
3 changes: 3 additions & 0 deletions examples/chat_with_X/requirements.txt
@@ -0,0 +1,3 @@
marvin
raggy
turbopuffer
24 changes: 22 additions & 2 deletions pyproject.toml
@@ -19,8 +19,11 @@ dependencies = [
"bs4",
"chardet",
"fake-useragent",
"gh-util",
"openai>1.0.0",
"pypdf",
"tenacity",
"tiktoken",
"xxhash",
"yake",
]
@@ -35,15 +38,28 @@ dev = [
"mkdocstrings[python]~=0.22",
"mypy",
"pre-commit>=2.21,<4.0",
"pydantic[dotenv]",
"ruff",
"types-aiofiles",
"raggy[tests]",
]

tests = [
"pytest-asyncio>=0.18.2,!=0.22.0,<0.23.0",
"pytest-env>=0.8,<2.0",
"pytest-rerunfailures>=10,<14",
"pytest-sugar>=0.9,<2.0",
"pytest~=7.3.1",
"pytest-timeout",
"pytest-xdist",
]

chroma = ["chromadb"]
tpuf = ["turbopuffer"]
pdf = ["pypdf"]

[project.scripts]
raggy = "raggy.cli:app"

[project.urls]
Code = "https://github.com/zzstoatzz/raggy"

@@ -70,7 +86,11 @@ asyncio_mode = 'auto'
filterwarnings = [
"ignore:'crypt' is deprecated and slated for removal in Python 3.13:DeprecationWarning",
]
env = [
    'D:RAGGY_LOG_VERBOSE=1',
    'D:RAGGY_LOG_LEVEL=DEBUG',
    'PYTEST_TIMEOUT=20',
]
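
(In pytest-env, the `D:` prefix marks a default: the variable is applied only if it isn't already set in the environment, so local overrides still win.)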

[tool.ruff]
extend-select = ["I"]
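
With the extras above split out, the heavier backends stay optional; standard pip extras syntax lets consumers pick only what they need, for example:

pip install "raggy[tpuf,pdf]"
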
175 changes: 175 additions & 0 deletions src/raggy/cli/__init__.py
@@ -0,0 +1,175 @@
import argparse
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import openai
from prompt_toolkit import PromptSession
from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
from prompt_toolkit.history import FileHistory
from rich.console import Console, ConsoleOptions, RenderResult
from rich.live import Live
from rich.markdown import CodeBlock, Markdown
from rich.status import Status
from rich.syntax import Syntax
from rich.text import Text


class SimpleCodeBlock(CodeBlock):
def __rich_console__(
self, console: Console, options: ConsoleOptions
) -> RenderResult:
code = str(self.text).rstrip()
yield Text(self.lexer_name, style="dim")
yield Syntax(
code,
self.lexer_name,
theme=self.theme,
background_color="default",
word_wrap=True,
)
yield Text(f"/{self.lexer_name}", style="dim")


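# route all fenced code blocks through the plain, dim rendering defined above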
Markdown.elements["fence"] = SimpleCodeBlock


def app() -> int:
parser = argparse.ArgumentParser(
prog="aicli",
description="""\
OpenAI-powered AI CLI (thank you samuelcolvin)
Special prompts:
* `show-markdown` - show the markdown output from the previous response
* `multiline` - toggle multiline mode
""",
)
parser.add_argument(
"prompt", nargs="?", help="AI Prompt, if omitted fall into interactive mode"
)

parser.add_argument(
"--no-stream",
action="store_true",
help="Whether to stream responses from OpenAI",
)

parser.add_argument("--version", action="store_true", help="Show version and exit")

args = parser.parse_args()

console = Console()
console.print("OpenAI powered AI CLI", style="green bold", highlight=False)
if args.version:
return 0

try:
openai_api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
console.print(
"You must set the OPENAI_API_KEY environment variable", style="red"
)
return 1

client = openai.OpenAI(api_key=openai_api_key)

now_utc = datetime.now(timezone.utc)
t = now_utc.astimezone().tzinfo.tzname(now_utc) # type: ignore
setup = f"""\
Help the user by responding to their request; the output should
be concise and always written in markdown. The current date and time
is {datetime.now()} {t}. The user is running {sys.platform}."""

stream = not args.no_stream
messages = [{"role": "system", "content": setup}]

if args.prompt:
messages.append({"role": "user", "content": args.prompt})
try:
ask_openai(client, messages, stream, console)
except KeyboardInterrupt:
pass
return 0

    history = Path.home() / ".openai-prompt-history.txt"
session = PromptSession(history=FileHistory(str(history)))
multiline = False

while True:
try:
text = session.prompt(
"aicli ➤ ", auto_suggest=AutoSuggestFromHistory(), multiline=multiline
)
except (KeyboardInterrupt, EOFError):
return 0

if not text.strip():
continue

ident_prompt = text.lower().strip(" ").replace(" ", "-")
if ident_prompt == "show-markdown":
last_content = messages[-1]["content"]
console.print("[dim]Last markdown output of last question:[/dim]\n")
console.print(
Syntax(last_content, lexer="markdown", background_color="default")
)
continue
elif ident_prompt == "multiline":
multiline = not multiline
if multiline:
console.print(
"Enabling multiline mode. "
"[dim]Press [Meta+Enter] or [Esc] followed by [Enter] to accept input.[/dim]"
)
else:
console.print("Disabling multiline mode.")
continue

messages.append({"role": "user", "content": text})

try:
content = ask_openai(client, messages, stream, console)
except KeyboardInterrupt:
return 0
messages.append({"role": "assistant", "content": content})


def ask_openai(
client: openai.OpenAI,
messages: list[dict[str, str]],
stream: bool,
console: Console,
) -> str:
with Status("[dim]Working on it…[/dim]", console=console):
response = client.chat.completions.create(
model="gpt-4", messages=messages, stream=stream
)

console.print("\nResponse:", style="green")
if stream:
content = ""
interrupted = False
with Live("", refresh_per_second=15, console=console) as live:
try:
for chunk in response:
if chunk.choices[0].finish_reason is not None:
break
                    chunk_text = chunk.choices[0].delta.content
                    if chunk_text is not None:  # role-only chunks carry no content
                        content += chunk_text
live.update(Markdown(content))
except KeyboardInterrupt:
interrupted = True

if interrupted:
console.print("[dim]Interrupted[/dim]")
else:
content = response.choices[0].message.content
console.print(Markdown(content))

return content


if __name__ == "__main__":
sys.exit(app())
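
Since pyproject.toml registers raggy = "raggy.cli:app" under [project.scripts], this module doubles as the `raggy` console command once installed. Based on the argparse setup above, typical invocations look like (the prompts are illustrative):

raggy "what changed in PDFLoader?"        # one-shot prompt
raggy                                     # interactive mode
raggy --no-stream "summarize this repo"   # wait for the full response
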
35 changes: 3 additions & 32 deletions src/raggy/loaders/github.py
@@ -3,14 +3,14 @@
import functools
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple

import aiofiles
import chardet
import httpx
from gh_util.types import GitHubComment, GitHubIssue
from pydantic import Field, field_validator, model_validator

from raggy.documents import Document, document_to_excerpts
from raggy.loaders import Loader
@@ -28,39 +28,10 @@ async def read_file_with_chardet(file_path, errors="replace"):
return text


class GitHubUser(BaseModel):
login: str


class GitHubComment(BaseModel):
body: str = Field(default="")
user: GitHubUser = Field(default_factory=GitHubUser)


class GitHubLabel(BaseModel):
name: str = Field(default="")


class GitHubIssue(BaseModel):
created_at: datetime = Field(...)
html_url: str = Field(...)
number: int = Field(...)
title: str = Field(default="")
body: str | None = Field(default="")
labels: List[GitHubLabel] = Field(default_factory=GitHubLabel)
user: GitHubUser = Field(default_factory=GitHubUser)

@field_validator("body")
def validate_body(cls, v):
if not v:
return ""
return v


class GitHubIssueLoader(Loader):
"""Loader for GitHub issues in a given repository.
    **Beware** the [GitHub API rate limit](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api).
Attributes:
repo: The GitHub repository in the format 'owner/repo'.
4 changes: 3 additions & 1 deletion src/raggy/loaders/pdf.py
@@ -56,6 +56,7 @@ class PDFLoader(Loader):
"""

file_path: str
chunk_tokens: int = 500

@asynccontextmanager
async def open_pdf_file(self, file_path: str):
@@ -79,6 +80,7 @@ async def load(self) -> List[Document]:
                Document(
                    text=page.extract_text(),
                    metadata={"page": i + 1, "file_path": self.file_path},
                ),
                chunk_tokens=self.chunk_tokens,
            )
        ]
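
With chunk_tokens now exposed, callers can trade excerpt size against excerpt count; a minimal usage sketch (the PDF path is hypothetical, and the default stays at 500 tokens):

import asyncio

from raggy.documents import Document
from raggy.loaders.pdf import PDFLoader

async def main():
    # smaller chunks produce more, finer-grained excerpts
    loader = PDFLoader(file_path="papers/example.pdf", chunk_tokens=300)
    documents: list[Document] = await loader.load()
    print(f"loaded {len(documents)} excerpts")

asyncio.run(main())
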
Empty file added tests/__init__.py
Empty file added tests/loaders/__init__.py
Empty file added tests/loaders/test_github.py
Empty file.
