From 035ef6abe32d403b14cd512faf5be43961fc9101 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Mon, 18 Sep 2023 14:23:15 -0400 Subject: [PATCH] update wikipedia doc --- docs/source/source_connectors/wikipedia.rst | 79 +++++++++------------ 1 file changed, 33 insertions(+), 46 deletions(-) diff --git a/docs/source/source_connectors/wikipedia.rst b/docs/source/source_connectors/wikipedia.rst index 7d81160994..cf6a6af061 100644 --- a/docs/source/source_connectors/wikipedia.rst +++ b/docs/source/source_connectors/wikipedia.rst @@ -28,28 +28,21 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "wikipedia", - "--page-title", "Open Source Software", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.runner.wikipedia import wikipedia + from unstructured.ingest.interfaces import ReadConfig, PartitionConfig + + + if __name__ == "__main__": + wikipedia( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="wikipedia-ingest-output", + num_processes=2 + ), + page_title="Open Source Software", + auto_suggest=False, + ) Run via the API --------------- @@ -75,30 +68,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "wikipedia", - "--page-title", "Open Source Software", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.wikipedia import wikipedia + + if __name__ == "__main__": + wikipedia( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="wikipedia-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + page_title="Open Source Software", + auto_suggest=False, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.