Merge pull request #106 from agentcoinorg/evan/min-scraped-sites
evangriffiths authored Aug 8, 2024
2 parents 0564033 + 1ad9c99 commit 487b774
Showing 3 changed files with 60 additions and 29 deletions.
26 changes: 24 additions & 2 deletions prediction_prophet/functions/research.py
@@ -20,6 +20,8 @@ def research(
model: str = "gpt-4-0125-preview",
initial_subqueries_limit: int = 20,
subqueries_limit: int = 4,
max_results_per_search: int = 5,
min_scraped_sites: int = 0,
scrape_content_split_chunk_size: int = 800,
scrape_content_split_chunk_overlap: int = 225,
top_k_per_query: int = 8,
@@ -28,6 +30,14 @@
tavily_api_key: SecretStr | None = None,
logger: t.Union[logging.Logger, "Logger"] = logging.getLogger()
) -> str:
# Validate args
if min_scraped_sites > max_results_per_search * subqueries_limit:
raise ValueError(
f"min_scraped_sites ({min_scraped_sites}) must be less than or "
f"equal to max_results_per_search ({max_results_per_search}) * "
f"subqueries_limit ({subqueries_limit})."
)

logger.info("Started subqueries generation")
queries = generate_subqueries(query=goal, limit=initial_subqueries_limit, model=model, api_key=openai_api_key)

@@ -42,8 +52,9 @@

logger.info(f"Started web searching")
search_results_with_queries = search(
        queries,
lambda result: not result.url.startswith("https://www.youtube"),
max_results_per_search=max_results_per_search,
tavily_api_key=tavily_api_key
)

@@ -64,7 +75,18 @@
content=result.raw_content,
) for result in scrape_args if result.raw_content]
scraped = [result for result in scraped if result.content != ""]


unique_scraped_websites = set([result.url for result in scraped])
if len(scraped) < min_scraped_sites:
# Get urls that were not scraped
raise ValueError(
f"Only successfully scraped content from "
f"{len(unique_scraped_websites)} websites, out of a possible "
f"{len(websites_to_scrape)} websites, which is less than the "
f"minimum required ({min_scraped_sites}). The following websites "
f"were not scraped: {websites_to_scrape - unique_scraped_websites}"
)

logger.info(f"Scraped content from {len(scraped)} websites")

text_splitter = RecursiveCharacterTextSplitter(
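Taken together, the two new arguments let a caller cap how many results each subquery requests and abort early when too little content was scraped. A minimal usage sketch, not part of the commit, assuming research() accepts a goal question and pydantic SecretStr keys as the type hints suggest (all values below are placeholders):

from pydantic import SecretStr

from prediction_prophet.functions.research import research

# Illustrative keys and question; in practice these would come from config or the environment.
report = research(
    goal="Will the ECB cut interest rates before October 2024?",
    subqueries_limit=4,
    max_results_per_search=5,   # at most 5 * 4 = 20 sites can be scraped in total
    min_scraped_sites=3,        # raise if fewer than 3 sites are successfully scraped
    openai_api_key=SecretStr("sk-..."),
    tavily_api_key=SecretStr("tvly-..."),
)
print(report)

With these values the upfront validation passes (3 <= 5 * 4); setting min_scraped_sites above 20 would make research() raise a ValueError before any work is done.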
9 changes: 7 additions & 2 deletions prediction_prophet/functions/search.py
@@ -13,13 +13,18 @@ def safe_web_search(query: str, max_results: int = 5, tavily_api_key: SecretStr
return None


def search(queries: list[str], filter: t.Callable[[WebSearchResult], bool] = lambda x: True, tavily_api_key: SecretStr | None = None) -> list[tuple[str, WebSearchResult]]:
def search(
queries: list[str],
filter: t.Callable[[WebSearchResult], bool] = lambda x: True,
tavily_api_key: SecretStr | None = None,
max_results_per_search: int = 5,
) -> list[tuple[str, WebSearchResult]]:
maybe_results: list[t.Optional[list[WebSearchResult]]] = []

# Each result will have a query associated with it
# We only want to keep the results that are unique
with ThreadPoolExecutor(max_workers=5) as executor:
futures = {executor.submit(safe_web_search, query, 5, tavily_api_key) for query in queries}
futures = {executor.submit(safe_web_search, query, max_results_per_search, tavily_api_key) for query in queries}
for future in as_completed(futures):
maybe_results.append(future.result())

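search() now forwards max_results_per_search to safe_web_search for every query it fans out on the thread pool. A minimal sketch of calling it directly, with illustrative queries and key; the filter mirrors the YouTube exclusion used in research.py:

from pydantic import SecretStr

from prediction_prophet.functions.search import search

results = search(
    queries=["ECB rate decision 2024", "eurozone inflation forecast"],
    filter=lambda result: not result.url.startswith("https://www.youtube"),
    tavily_api_key=SecretStr("tvly-..."),  # placeholder
    max_results_per_search=3,              # passed through to safe_web_search for each query
)
# search() returns (query, WebSearchResult) pairs, keeping only unique results.
for query, result in results:
    print(query, result.url)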
54 changes: 29 additions & 25 deletions prediction_prophet/functions/web_scrape.py
@@ -16,34 +16,38 @@ def fetch_html(url: str, timeout: int) -> Response:
response = requests.get(url, headers=headers, timeout=timeout)
return response

def web_scrape_strict(url: str, timeout: int = 10) -> str:
response = fetch_html(url=url, timeout=timeout)

if 'text/html' in response.headers.get('Content-Type', ''):
soup = BeautifulSoup(response.content, "html.parser")

[x.extract() for x in soup.findAll('script')]
[x.extract() for x in soup.findAll('style')]
[x.extract() for x in soup.findAll('noscript')]
[x.extract() for x in soup.findAll('link')]
[x.extract() for x in soup.findAll('head')]
[x.extract() for x in soup.findAll('image')]
[x.extract() for x in soup.findAll('img')]

text: str = soup.get_text()
text = markdownify(text)
text = " ".join([x.strip() for x in text.split("\n")])
text = " ".join([x.strip() for x in text.split(" ")])

return text
else:
print("Non-HTML content received")
logging.warning("Non-HTML content received")
return ""

def web_scrape(url: str, timeout: int = 10) -> str:
"""
Do not throw if the HTTP request fails.
"""
try:
response = fetch_html(url=url, timeout=timeout)

if 'text/html' in response.headers.get('Content-Type', ''):
soup = BeautifulSoup(response.content, "html.parser")

[x.extract() for x in soup.findAll('script')]
[x.extract() for x in soup.findAll('style')]
[x.extract() for x in soup.findAll('noscript')]
[x.extract() for x in soup.findAll('link')]
[x.extract() for x in soup.findAll('head')]
[x.extract() for x in soup.findAll('image')]
[x.extract() for x in soup.findAll('img')]

text: str = soup.get_text()
text = markdownify(text)
text = " ".join([x.strip() for x in text.split("\n")])
text = " ".join([x.strip() for x in text.split(" ")])

return text
else:
print("Non-HTML content received")
logging.warning("Non-HTML content received")
return ""

return web_scrape_strict(url=url, timeout=timeout)
except requests.RequestException as e:
print(f"HTTP request failed: {e}")
logging.error(f"HTTP request failed: {e}")
logging.warning(f"HTTP request failed: {e}")
return ""

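The scraping code is split so that web_scrape_strict raises on request failures while web_scrape keeps the old never-throw behaviour by delegating to it inside a try/except. A minimal sketch of the difference for callers (the URL is a placeholder):

import requests

from prediction_prophet.functions.web_scrape import web_scrape, web_scrape_strict

url = "https://example.com/article"  # placeholder

# web_scrape never raises on request failures: it logs a warning and returns "".
text = web_scrape(url, timeout=10)
if not text:
    print("Scrape failed or page was not HTML")

# web_scrape_strict lets requests exceptions propagate, so the caller decides what to do.
try:
    strict_text = web_scrape_strict(url, timeout=10)
except requests.RequestException as e:
    print(f"Request failed: {e}")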