From 95b62953070c9c8bee1e4028ed830f7047ec9f69 Mon Sep 17 00:00:00 2001
From: Jack Retterer
Date: Mon, 4 Sep 2023 09:15:50 -0700
Subject: [PATCH] Jack/update documentation (#1190)

Updated:
- Added back supported document types for partitioning
- Added more tabs for Python code in the API page
- Added a RAG section in Key Concepts
- Added a Common Use Cases section in the overview
---
 CHANGELOG.md                              |   7 +
 docs/source/api.rst                       | 503 +++++++++++++++++-----
 docs/source/bricks/partition.rst          |  48 +++
 docs/source/introduction/key_concepts.rst |  35 +-
 docs/source/introduction/overview.rst     |  22 +-
 unstructured/__version__.py               |   2 +-
 6 files changed, 505 insertions(+), 112 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ce9acc4e4..609e673239 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## 0.10.13-dev0
+
+### Enhancements
+
+* Updated documentation: Added back supported doc types for partitioning, more Python code examples in the API page, a RAG definition, and common use cases.
+
+
 ## 0.10.12
 
 ### Enhancements
diff --git a/docs/source/api.rst b/docs/source/api.rst
index b2ee5da4c5..ffc7030e73 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -23,31 +23,26 @@ Now you can get started with this quick example:
 
 .. code:: python
 
-    # Define the URL
+    import requests
+
     url = 'https://api.unstructured.io/general/v0/general'
 
-    # Define the headers
     headers = {
         'accept': 'application/json',
         'unstructured-api-key': '',
     }
 
-    # Define the form data
     data = {
         'strategy': 'auto',
     }
 
-    # Define the file data
     file_path = "/Path/To/File"
     file_data = {'files': open(file_path, 'rb')}
 
-    # Make the POST request
     response = requests.post(url, headers=headers, data=data, files=file_data)
 
-    # Close the file
     file_data['files'].close()
 
-    # Parse the JSON response
     json_response = response.json()
 
 Below, you will find a more comprehensive overview of the API capabilities. For detailed information on request and response schemas, refer to the `API documentation `_.
@@ -77,84 +72,235 @@ Coordinates
 When elements are extracted from PDFs or images, it may be useful to get their bounding boxes as well. Set the ``coordinates`` parameter to ``true`` to add this field to the elements in the response.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/layout-parser-paper.pdf' \
-    -F 'coordinates=true' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/layout-parser-paper.pdf' \
+         -F 'coordinates=true' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         # Do not set Content-Type here; requests generates the multipart boundary itself
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "coordinates": "true"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 Encoding
 =========
 
 You can specify the encoding to use to decode the text input. If no value is provided, ``utf-8`` will be used.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/fake-power-point.pptx' \
-    -F 'encoding=utf_8' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/fake-power-point.pptx' \
+         -F 'encoding=utf_8' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         # Do not set Content-Type here; requests generates the multipart boundary itself
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "encoding": "utf_8"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 OCR Languages
 ==============
 
 You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. See the `Tesseract documentation `_ for a full list of languages and install instructions. OCR is only applied if the text is not already available in the PDF document.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/english-and-korean.png' \
-    -F 'strategy=ocr_only' \
-    -F 'ocr_languages=eng' \
-    -F 'ocr_languages=kor' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/english-and-korean.png' \
+         -F 'strategy=ocr_only' \
+         -F 'ocr_languages=eng' \
+         -F 'ocr_languages=kor' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "strategy": "ocr_only",
+             "ocr_languages": ["eng", "kor"]
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 Output Format
 ==============
 
 By default the result will be in ``json``, but it can be set to ``text/csv`` to get data in ``csv`` format:
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/family-day.eml' \
-    -F 'output_format="text/csv"'
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/family-day.eml' \
+         -F 'output_format="text/csv"'
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "output_format": "text/csv"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 Page Break
 ===========
 
 Pass the `include_page_breaks` parameter to `true` to include `PageBreak` elements in the output.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/family-day.eml' \
-    -F 'include_page_breaks=true' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/family-day.eml' \
+         -F 'include_page_breaks=true' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "include_page_breaks": "true"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 Strategies
@@ -164,16 +310,46 @@ Four strategies are available for processing PDF/Images files: ``hi_res``, ``fas
 On the other hand, ``hi_res`` is the better choice for PDFs that may have text within embedded images, or for achieving greater precision of `element types `_ in the response JSON. Be aware that ``hi_res`` requests may take 20 times longer to process compared to the ``fast`` option. See the example below for making a ``hi_res`` request.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/layout-parser-paper.pdf' \
-    -F 'strategy=hi_res' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/layout-parser-paper.pdf' \
+         -F 'strategy=hi_res' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "strategy": "hi_res"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 The ``ocr_only`` strategy runs the document through Tesseract for OCR. Currently, ``hi_res`` has difficulty ordering elements for documents with multiple columns. If you have a document with multiple columns that do not have extractable text, it's recommended that you use the ``ocr_only`` strategy.
 Please be aware that ``ocr_only`` will fall back to another strategy if Tesseract is not available.
@@ -184,18 +360,49 @@ Beta Version: ``hi_res`` Strategy with Chipper Model
 To use the ``hi_res`` strategy with **Chipper** model, pass the argument for ``hi_res_model_name`` as shown in the code block below.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'strategy=hi_res' \
-    -F 'hi_res_model_name=chipper' \
-    -F 'files=@example-docs/layout-parser-paper-fast.pdf' \
-    -F 'strategy=hi_res' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'strategy=hi_res' \
+         -F 'hi_res_model_name=chipper' \
+         -F 'files=@example-docs/layout-parser-paper-fast.pdf' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "strategy": "hi_res",
+             "hi_res_model_name": "chipper"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 *Please note that the Chipper model does not currently support the coordinates argument.*
 
@@ -207,50 +414,142 @@ PDF Table Extraction
 To extract the table structure from PDF files using the ``hi_res`` strategy, ensure that the ``pdf_infer_table_structure`` parameter is set to ``true``. This setting includes the table's text content in the response. By default, this parameter is set to ``false`` because table extraction is computationally expensive.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/layout-parser-paper.pdf' \
-    -F 'strategy=hi_res' \
-    -F 'pdf_infer_table_structure=true' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/layout-parser-paper.pdf' \
+         -F 'strategy=hi_res' \
+         -F 'pdf_infer_table_structure=true' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "strategy": "hi_res",
+             "pdf_infer_table_structure": "true"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 Table Extraction for other filetypes
 ------------------------------------
 
 We also provide support for enabling and disabling table extraction for file types other than PDF files.
 
 Set parameter ``skip_infer_table_types`` to specify the document types that you want to skip table extraction with.
 By default, we skip table extraction for PDFs and Images, which are ``pdf``, ``jpg`` and ``png``. Note that table extraction only works with ``hi_res`` strategy.
 
 For example, if you don't want to skip table extraction for images, you can pass an empty value to ``skip_infer_table_types`` with:
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/layout-parser-paper-with-table.jpg' \
-    -F 'strategy=hi_res' \
-    -F 'skip_infer_table_types=[]' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/layout-parser-paper-with-table.jpg' \
+         -F 'strategy=hi_res' \
+         -F 'skip_infer_table_types=[]' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "strategy": "hi_res",
+             "skip_infer_table_types": "[]"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 XML Tags
 =========
 
 When processing XML documents, set the ``xml_keep_tags`` parameter to ``true`` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags.
 
-.. code:: shell
+.. tabs::
 
-    curl -X 'POST' \
-    'https://api.unstructured.io/general/v0/general' \
-    -H 'accept: application/json' \
-    -H 'Content-Type: multipart/form-data' \
-    -H 'unstructured-api-key: ' \
-    -F 'files=@example-docs/fake-xml.xml' \
-    -F 'xml_keep_tags=true' \
-    | jq -C . | less -R
+   .. tab:: Shell
+
+      .. code:: shell
+
+         curl -X 'POST' \
+         'https://api.unstructured.io/general/v0/general' \
+         -H 'accept: application/json' \
+         -H 'Content-Type: multipart/form-data' \
+         -H 'unstructured-api-key: ' \
+         -F 'files=@example-docs/fake-xml.xml' \
+         -F 'xml_keep_tags=true' \
+         | jq -C . | less -R
+
+   .. tab:: Python
+
+      .. code:: python
+
+         import requests
+
+         url = "https://api.unstructured.io/general/v0/general"
+
+         headers = {
+             "accept": "application/json",
+             "unstructured-api-key": ""
+         }
+
+         data = {
+             "xml_keep_tags": "true"
+         }
+
+         file_path = "/Path/To/File"
+         file_data = {'files': open(file_path, 'rb')}
+
+         response = requests.post(url, headers=headers, files=file_data, data=data)
+
+         file_data['files'].close()
+
+         json_response = response.json()
 
 
 Using the API Locally
diff --git a/docs/source/bricks/partition.rst b/docs/source/bricks/partition.rst
index 22245f202b..2c839d8236 100644
--- a/docs/source/bricks/partition.rst
+++ b/docs/source/bricks/partition.rst
@@ -13,6 +13,54 @@ The easiest way to partition documents in unstructured is to use the ``partition
 If you call the ``partition`` brick, ``unstructured`` will use ``libmagic`` to automatically determine the file type and invoke the appropriate partition function. In cases where ``libmagic`` is not available, filetype detection will fall back to using the file extension.
 
+The following table shows the document types the ``unstructured`` library currently supports. 
``partition`` will recognize each of these document types and route the document +to the appropriate partitioning function. If you already know your document type, you can use the partitioning function listed in the table directly. + ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Document Type | Partition Function | Strategies | Table Support | Options | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| CSV Files (`.csv`) | `partition_csv` | N/A | Yes | None | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| E-mails (`.eml`) | `partition_eml` | N/A | No | Encoding; Max Partition; Process Attachments | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition; Process Attachments | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| EPubs (`.epub`) | `partition_epub` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Excel Documents (`.xlsx`/`.xls`) | `partition_xlsx` | N/A | Yes | None | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Images (`.png`/`.jpg`) | `partition_image` | "auto", "hi_res", "ocr_only" | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks | 
++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| PDFs (`.pdf`) | `partition_pdf` | "auto", "fast", "hi_res", "ocr_only" | Yes | Encoding; Include Page Breaks; Infer Table Structure; Max Partition; OCR Languages, Strategy | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding; Max Partition; Paragraph Grouper | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Power Points (`.ppt`) | `partition_ppt` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| TSV Files (`.tsv`) | `partition_tsv` | N/A | Yes | None | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks | 
++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ +| XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; Max Partition; XML Keep Tags | ++----------------------------------------------+--------------------------------+----------------------------------------+----------------+------------------------------------------------------------------------------------------------------------------+ + + As shown in the examples below, the ``partition`` function accepts both filenames and file-like objects as input. ``partition`` also has some optional kwargs. For example, if you set ``include_page_breaks=True``, the output will include ``PageBreak`` elements if the filetype supports it. diff --git a/docs/source/introduction/key_concepts.rst b/docs/source/introduction/key_concepts.rst index 6eecfc69cb..a6d7aab12e 100644 --- a/docs/source/introduction/key_concepts.rst +++ b/docs/source/introduction/key_concepts.rst @@ -1,12 +1,12 @@ Key Concepts ------------ -Natural Language Processing (NLP) encompasses a broad spectrum of tasks and methodologies. This section introduces some fundamental concepts crucial for most NLP projects. +Natural Language Processing (NLP) encompasses a broad spectrum of tasks and methodologies. This section introduces some fundamental concepts crucial for most NLP projects that involve Unstructured's products. Data Ingestion ^^^^^^^^^^^^^^^ -The initial step in any NLP task involves ingesting data from varied sources. This might include reading texts from files, scraping websites, listening to speech, or tapping into databases. Efficient data ingestion is vital to ensure that data is accessible and usable for downstream tasks. +Unstructured's ``upstream connectors`` make data ingestion easy. They ensure that your data is accessible, up to date, and usable for any downstream task. If you'd like to read more on our upstream connectors, you can find details `here <../upstream_connectors.html>`_. Data Preprocessing ^^^^^^^^^^^^^^^^^^^ @@ -45,3 +45,34 @@ Large Language Models (LLMs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ LLMs, like GPT, are trained on vast amounts of data and have the capacity to comprehend and generate human-like text. They have achieved state-of-the-art results across a multitude of NLP tasks and can be fine-tuned to cater to specific domains or requirements. + +Retrieval Augmented Generation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Large Language Models (LLMs) like OpenAI's ChatGPT and Anthropic's Claude have revolutionized the AI landscape with their prowess. However, they inherently suffer from significant drawbacks. One major issue is their static nature, which means they're "frozen in time". +For instance, ChatGPT's knowledge is limited up to September 2021, leaving it blind to any developments or information post that period. Despite this, LLMs might often respond to newer queries with unwarranted confidence, a phenomenon known as "hallucination". 
+Such errors can be highly detrimental, especially when these models serve critical real-world applications.
+
+Retrieval Augmented Generation (RAG) is a groundbreaking technique designed to counteract the limitations of foundational LLMs. By pairing an LLM with a RAG pipeline, we can enable users to access the underlying data sources that the model uses. This transparent approach not
+only ensures that an LLM's claims can be verified for accuracy but also builds trust among users.
+
+Moreover, RAG offers a cost-effective solution. Instead of bearing the extensive computational and financial burdens of training custom models or fine-tuning existing ones, RAG can, in many situations, serve as a sufficient alternative. This reduction in resource consumption
+is particularly beneficial for organizations that lack the means to develop and deploy foundational models from scratch.
+
+A RAG workflow can be broken down into the following steps:
+
+1. **Data ingestion**: The first step is acquiring data from your relevant sources. At Unstructured we make this super easy with our `data connectors `_.
+
+2. **Data preprocessing and cleaning**: Once you've identified and collected your data sources, a good practice is to remove any unnecessary artifacts within the dataset. At Unstructured we have a variety of tools for removing unnecessary elements; you can find them `here `_.
+
+3. **Chunking**: The next step is to break your text down into digestible pieces for your LLM to consume. LangChain, Llama Index and Haystack offer chunking functionality.
+
+4. **Embedding**: After chunking, you will need to convert the text into a numerical representation (vector embedding) that an LLM can understand. OpenAI, Cohere, and Hugging Face all offer embedding models.
+
+5. **Vector Database**: The next step is to choose a location for storing your chunked embeddings. There are lots of options to choose from for your vector database (Pinecone, Milvus, ChromaDB, Weaviate and more).
+
+6. **User Prompt**: Take the user prompt and retrieve the most relevant chunks of information from the vector database via similarity search.
+
+7. **LLM Generation**: Once you've retrieved the relevant chunks, pass the prompt along with that context to the LLM so it can generate a more accurate response.
+
+For a full guide on how to implement RAG, check out this `blog post `_
\ No newline at end of file
diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst
index 7206e29260..a1b587ae30 100644
--- a/docs/source/introduction/overview.rst
+++ b/docs/source/introduction/overview.rst
@@ -7,6 +7,15 @@ Introduction
 The ``unstructured`` library aims to simplify and streamline the preprocessing of structured and unstructured documents for downstream tasks. And what that means is no matter where your data is and no matter what format that data is in, Unstructured's toolkit will transform and preprocess that data into an easily digestable and usable format.
 
+Product Offerings
+^^^^^^^^^^^^^^^^^
+
+- **Python Library**: Unstructured's open-source software `(library) `_.
+
+- **Hosted API**: The easiest and most scalable way to process documents in large quantities `(library) `_.
+
+- **Enterprise Product**: In development, with the hope of launching in late 2023.
+
 Key Features
 ^^^^^^^^^^^^^
 
@@ -18,14 +27,13 @@ Key Features
 
 - **Customizability**: Easily extend and customize the library to fit specific requirements or unique use cases.
-Key Concepts -^^^^^^^^^^^^^ - -- **Connectors**: Interfaces that enable the library to interact with different data sources and sinks, like cloud storage or databases. - -- **Bricks**: Modular units of the library that allow users to partition, clean, and stage data efficiently. +Common Use Cases +^^^^^^^^^^^^^^^^ -- **Metadata**: Data about data. In ``unstructured``, metadata helps in keeping track of the source, type, and other essential attributes of the data. +- **Pretraining Models** +- **Fine-tuning Models** +- **Retrieval Augmented Generation (RAG)** +- **Traditional ETL** Quickstart Tutorial ^^^^^^^^^^^^^^^^^^^^ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7a431ee589..ba254c7ae9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.12" # pragma: no cover +__version__ = "0.10.13-dev0" # pragma: no cover