Streaming app #72 (merged, 4 commits, Jul 22, 2023)
68 changes: 68 additions & 0 deletions examples/citation_with_extraction/README.md
@@ -0,0 +1,68 @@
# Citation with Extraction

This repository contains a FastAPI application that uses an OpenAI GPT model to answer questions from a given context and extract the relevant facts with correct, exact citations. The extracted facts are streamed back as JSON events using Server-Sent Events (SSE).

## How it Works

The FastAPI app defines an endpoint `/extract` that accepts a POST request with JSON data containing a `context` and a `query`. The `context` represents the text from which the question is being asked, and the `query` is the question itself.

The app uses an OpenAI chat model with function calling (`gpt-3.5-turbo-0613` in the current code) to generate answers and extract the relevant facts, ensuring that each extracted fact is quoted directly from the given context.
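The "exact citations" come from locating each quoted fact as a character span inside the original context. A minimal sketch of that idea, using only exact substring search (the app itself uses the third-party `regex` package so it can also match fuzzily), might look like:

```python
def find_span(quote, context):
    # Exact substring search: return the (start, end) character span of
    # the quote inside the context, or None if it does not occur verbatim.
    start = context.find(quote)
    if start == -1:
        return None
    return (start, start + len(quote))

context = "I studied Computational Mathematics and physics."
print(find_span("Computational Mathematics", context))  # (10, 35)
```

Fuzzy matching relaxes this by tolerating a small number of character errors, which is useful when the model's quote differs slightly from the source text.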

## Example Usage

To use the `/extract` endpoint, send a POST request using `curl` (or any other HTTP client) in the following format:

```bash
curl -X POST -H "Content-Type: application/json" -d '{
"context": "My name is Jason Liu, and I grew up in Toronto, Canada but I was born in China. I went to an arts high school but in university I studied Computational Mathematics and physics. As part of co-op I worked at many companies including Stitchfix, Facebook. I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.",
"query": "What did the author do in school?"
}' -N http://localhost:8000/extract
```

Replace `http://localhost:8000` with the actual URL of your FastAPI app if it's running on a different host and port. The API will respond with Server-Sent Events (SSE) containing the extracted facts in real-time.
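Each SSE event arrives as a text line of the form `data: <payload>`. Assuming the payload is JSON (the exact encoding depends on the server), a minimal client-side parser might look like:

```python
import json

def parse_sse_line(line):
    # Decode the JSON payload of a "data: ..." SSE line; other lines
    # (comments, blank keep-alives) yield None.
    if not line.startswith("data: "):
        return None
    return json.loads(line[len("data: "):])

event = parse_sse_line('data: {"body": "He studied math", "spans": [[12, 40]]}')
print(event["body"])  # He studied math
```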

## Bring your own API key

If you have your own API key but don't want to deploy the app yourself, you're welcome to use my
Modal instance here; the code is public and I do not store your key.

```bash
curl -X 'POST' \
'https://jxnl--rag-citation-fastapi-app.modal.run/extract' \
-H 'accept: */*' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <OPENAI_API_KEY>' \
-d '{
"context": "My name is Jason Liu, and I grew up in Toronto, Canada but I was born in China. I went to an arts high school but in university I studied Computational Mathematics and physics. As part of co-op I worked at many companies including Stitchfix, Facebook. I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.",
"query": "What did the author do in school?"
}'
```
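The same request can be built from Python using only the standard library. This is a sketch, not part of the app; `sk-...` below is a placeholder, not a real key:

```python
import json
import urllib.request

def build_extract_request(url, api_key, context, query):
    # Build (but do not send) a POST request carrying the question payload
    # and the OpenAI key as a Bearer token in the Authorization header.
    payload = json.dumps({"context": context, "query": query}).encode()
    return urllib.request.Request(
        url,
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
    )

req = build_extract_request(
    "http://localhost:8000/extract", "sk-...", "some context", "a question"
)
print(req.get_header("Authorization"))  # Bearer sk-...
```

Sending it with `urllib.request.urlopen(req)` and iterating over the response lines yields the SSE events as they stream in.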


## Requirements

To run this application, first install the required Python packages:

```bash
pip install -r requirements.txt
```

## Running the App

To run the FastAPI app, execute the following command:

```bash
uvicorn main:app --reload
```

This will start the server, and the `/extract` endpoint will be available at `http://localhost:8000/extract`.

## Note

Ensure that you have a valid OpenAI API key with access to the model used. If you don't have one, you can obtain one from the OpenAI website.

Please use this application responsibly and be mindful of any usage limits or restrictions from OpenAI's API usage policy.

## License

This project is licensed under the [MIT License](LICENSE). Feel free to use, modify, and distribute it as you see fit.
5 changes: 1 addition & 4 deletions examples/citation_with_extraction/citation_fuzzy_match.py
@@ -92,10 +92,7 @@ def ask_ai(question: str, context: str) -> QuestionAnswer:

question = "What did the author do during college?"
context = """
My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.
I went to an arts highschool but in university I studied Computational Mathematics and physics.
As part of coop I worked at many companies including Stitchfix, Facebook.
I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.
My name is Jason Liu, and I grew up in Toronto, Canada but I was born in China. I went to an arts high school but in university I studied Computational Mathematics and physics. As part of co-op I worked at many companies including Stitchfix, Facebook. I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.
"""


124 changes: 124 additions & 0 deletions examples/citation_with_extraction/main.py
@@ -0,0 +1,124 @@
from typing import Iterable, List
from fastapi import FastAPI, Request, HTTPException
from fastapi.params import Depends
from openai_function_call import MultiTask
from pydantic import BaseModel, Field
from starlette.responses import StreamingResponse

import os
import openai


# FastAPI app
app = FastAPI(
    title="Citation with Extraction",
)


class SubResponse(BaseModel):
    """
    If there are multiple phrases with different citations, each one should be
    its own object. Make sure to break them apart so that each one only uses
    the sources that are relevant to it.
    """

    body: str = Field(..., description="Body of the sentence, as part of a response")
    substring_quotes: List[str] = Field(
        ...,
        description="Each source should be a direct quote from the context, as a substring of the original content, but wide enough to capture the context of the quote. Each citation should be at least a full sentence.",
    )

    def _get_span(self, quote, context):
        import regex

        # Escape the quote so regex metacharacters are matched literally,
        # then widen the allowed fuzzy-match error budget one character at a
        # time until a match is found or the budget exceeds 5% of the context.
        minor = regex.escape(quote)
        major = context

        errs_ = 0
        s = regex.search(f"({minor}){{e<={errs_}}}", major)
        while s is None and errs_ <= len(context) * 0.05:
            errs_ += 1
            s = regex.search(f"({minor}){{e<={errs_}}}", major)

        if s is not None:
            yield from s.spans()

    def get_spans(self, context):
        if self.substring_quotes:
            for quote in self.substring_quotes:
                yield from self._get_span(quote, context)


Answers = MultiTask(
    SubResponse,
    name="Answer",
    description="Correctly answer questions based on a context. Quotes should be full sentences when possible.",
)


class Question(BaseModel):
    context: str = Field(..., description="Context to extract answers from")
    query: str = Field(..., description="Question to answer")


# Function to extract cited answers from the context using GPT-3.5
def stream_extract(question: Question) -> Iterable[SubResponse]:
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        temperature=0,
        stream=True,
        functions=[Answers.openai_schema],
        function_call={"name": Answers.openai_schema["name"]},
        messages=[
            {
                "role": "system",
                "content": "You are a world class algorithm to answer questions with correct and exact citations.",
            },
            {"role": "user", "content": "Answer the question using the following context."},
            {"role": "user", "content": f"{question.context}"},
            {"role": "user", "content": f"Question: {question.query}"},
            {
                "role": "user",
                "content": "Tips: Make sure to cite your sources, and use the exact words from the context.",
            },
        ],
        max_tokens=2000,
    )
    return Answers.from_streaming_response(completion)


def get_api_key(request: Request):
    """
    Get the OpenAI API key, preferring the OPENAI_API_KEY environment
    variable and falling back to the request's Authorization header.
    """
    if "OPENAI_API_KEY" in os.environ:
        return os.environ["OPENAI_API_KEY"]

    auth = request.headers.get("Authorization")
    if auth is None:
        raise HTTPException(status_code=401, detail="Missing Authorization header")

    if auth.startswith("Bearer "):
        return auth.replace("Bearer ", "", 1)

    raise HTTPException(status_code=401, detail="Malformed Authorization header")


# Route to stream the extracted facts back to the client as SSE events
@app.post("/extract", response_class=StreamingResponse)
async def extract(question: Question, openai_key=Depends(get_api_key)):
    openai.api_key = openai_key
    facts = stream_extract(question)

    async def generate():
        import json

        for fact in facts:
            spans = list(fact.get_spans(question.context))
            resp = {
                "body": fact.body,
                "spans": spans,
                "citation": [question.context[a:b] for (a, b) in spans],
            }
            # SSE events carry a JSON payload in a "data:" field,
            # terminated by a blank line
            yield f"data: {json.dumps(resp)}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
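The framing of each streamed event can be sketched as a small helper. This mirrors what `generate()` emits, assuming JSON-encoded payloads and standard SSE framing (a `data:` field terminated by a blank line):

```python
import json

def sse_event(payload):
    # One SSE event: a "data:" field carrying JSON, terminated by a blank line
    return f"data: {json.dumps(payload)}\n\n"

fact = {"body": "He studied Computational Mathematics", "spans": [[10, 35]]}
print(sse_event(fact), end="")
```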
14 changes: 14 additions & 0 deletions examples/citation_with_extraction/modal_main.py
@@ -0,0 +1,14 @@
from main import app
import modal

stub = modal.Stub("rag-citation")

image = modal.Image.debian_slim().pip_install(
    "fastapi", "openai_function_call>=0.2.1", "regex"
)


@stub.function(image=image)
@modal.asgi_app()
def fastapi_app():
    return app
5 changes: 5 additions & 0 deletions examples/citation_with_extraction/requirements.txt
@@ -0,0 +1,5 @@
fastapi
uvicorn
openai
pydantic
openai_function_call
6 changes: 3 additions & 3 deletions openai_function_call/dsl/multitask.py
@@ -1,5 +1,5 @@
from pydantic import create_model, Field
from typing import Optional, List, Type
from pydantic import BaseModel, create_model, Field
from typing import Optional, List, Type, Union
from openai_function_call import OpenAISchema


@@ -48,7 +48,7 @@ def get_object(str, stack):


def MultiTask(
    subtask_class: Type[OpenAISchema],
    subtask_class: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
):