Merge pull request #106 from agentcoinorg/evan/min-scraped-sites
evangriffiths authored Aug 8, 2024
2 parents 0564033 + 1ad9c99 commit 487b774
Showing 3 changed files with 60 additions and 29 deletions.
26 changes: 24 additions & 2 deletions prediction_prophet/functions/research.py
@@ -20,6 +20,8 @@ def research(
model: str = "gpt-4-0125-preview",
initial_subqueries_limit: int = 20,
subqueries_limit: int = 4,
max_results_per_search: int = 5,
min_scraped_sites: int = 0,
scrape_content_split_chunk_size: int = 800,
scrape_content_split_chunk_overlap: int = 225,
top_k_per_query: int = 8,
@@ -28,6 +30,14 @@
tavily_api_key: SecretStr | None = None,
logger: t.Union[logging.Logger, "Logger"] = logging.getLogger()
) -> str:
# Validate args
if min_scraped_sites > max_results_per_search * subqueries_limit:
raise ValueError(
f"min_scraped_sites ({min_scraped_sites}) must be less than or "
f"equal to max_results_per_search ({max_results_per_search}) * "
f"subqueries_limit ({subqueries_limit})."
)

logger.info("Started subqueries generation")
queries = generate_subqueries(query=goal, limit=initial_subqueries_limit, model=model, api_key=openai_api_key)

@@ -42,8 +52,9 @@

logger.info(f"Started web searching")
search_results_with_queries = search(
        queries,
lambda result: not result.url.startswith("https://www.youtube"),
max_results_per_search=max_results_per_search,
tavily_api_key=tavily_api_key
)

@@ -64,7 +75,18 @@
content=result.raw_content,
) for result in scrape_args if result.raw_content]
scraped = [result for result in scraped if result.content != ""]


unique_scraped_websites = set([result.url for result in scraped])
if len(scraped) < min_scraped_sites:
# Get urls that were not scraped
raise ValueError(
f"Only successfully scraped content from "
f"{len(unique_scraped_websites)} websites, out of a possible "
f"{len(websites_to_scrape)} websites, which is less than the "
f"minimum required ({min_scraped_sites}). The following websites "
f"were not scraped: {websites_to_scrape - unique_scraped_websites}"
)

logger.info(f"Scraped content from {len(scraped)} websites")

text_splitter = RecursiveCharacterTextSplitter(
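Taken together, the two new arguments let a caller cap how many results each subquery requests and abort early when too little content was scraped. A minimal usage sketch, not part of the commit, assuming research() accepts a goal question and pydantic SecretStr keys as the type hints suggest (all values below are placeholders):

from pydantic import SecretStr

from prediction_prophet.functions.research import research

# Illustrative keys and question; in practice these would come from config or the environment.
report = research(
    goal="Will the ECB cut interest rates before October 2024?",
    subqueries_limit=4,
    max_results_per_search=5,   # at most 5 * 4 = 20 sites can be scraped in total
    min_scraped_sites=3,        # raise if fewer than 3 sites are successfully scraped
    openai_api_key=SecretStr("sk-..."),
    tavily_api_key=SecretStr("tvly-..."),
)
print(report)

With these values the upfront validation passes (3 <= 5 * 4); setting min_scraped_sites above 20 would make research() raise a ValueError before any work is done.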
9 changes: 7 additions & 2 deletions prediction_prophet/functions/search.py
@@ -13,13 +13,18 @@ def safe_web_search(query: str, max_results: int = 5, tavily_api_key: SecretStr
return None


def search(queries: list[str], filter: t.Callable[[WebSearchResult], bool] = lambda x: True, tavily_api_key: SecretStr | None = None) -> list[tuple[str, WebSearchResult]]:
def search(
queries: list[str],
filter: t.Callable[[WebSearchResult], bool] = lambda x: True,
tavily_api_key: SecretStr | None = None,
max_results_per_search: int = 5,
) -> list[tuple[str, WebSearchResult]]:
maybe_results: list[t.Optional[list[WebSearchResult]]] = []

# Each result will have a query associated with it
# We only want to keep the results that are unique
with ThreadPoolExecutor(max_workers=5) as executor:
futures = {executor.submit(safe_web_search, query, 5, tavily_api_key) for query in queries}
futures = {executor.submit(safe_web_search, query, max_results_per_search, tavily_api_key) for query in queries}
for future in as_completed(futures):
maybe_results.append(future.result())

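search() now forwards max_results_per_search to safe_web_search for every query it fans out on the thread pool. A minimal sketch of calling it directly, with illustrative queries and key; the filter mirrors the YouTube exclusion used in research.py:

from pydantic import SecretStr

from prediction_prophet.functions.search import search

results = search(
    queries=["ECB rate decision 2024", "eurozone inflation forecast"],
    filter=lambda result: not result.url.startswith("https://www.youtube"),
    tavily_api_key=SecretStr("tvly-..."),  # placeholder
    max_results_per_search=3,              # passed through to safe_web_search for each query
)
# search() returns (query, WebSearchResult) pairs, keeping only unique results.
for query, result in results:
    print(query, result.url)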
54 changes: 29 additions & 25 deletions prediction_prophet/functions/web_scrape.py
@@ -16,34 +16,38 @@ def fetch_html(url: str, timeout: int) -> Response:
response = requests.get(url, headers=headers, timeout=timeout)
return response

def web_scrape_strict(url: str, timeout: int = 10) -> str:
response = fetch_html(url=url, timeout=timeout)

if 'text/html' in response.headers.get('Content-Type', ''):
soup = BeautifulSoup(response.content, "html.parser")

[x.extract() for x in soup.findAll('script')]
[x.extract() for x in soup.findAll('style')]
[x.extract() for x in soup.findAll('noscript')]
[x.extract() for x in soup.findAll('link')]
[x.extract() for x in soup.findAll('head')]
[x.extract() for x in soup.findAll('image')]
[x.extract() for x in soup.findAll('img')]

text: str = soup.get_text()
text = markdownify(text)
text = " ".join([x.strip() for x in text.split("\n")])
text = " ".join([x.strip() for x in text.split(" ")])

return text
else:
print("Non-HTML content received")
logging.warning("Non-HTML content received")
return ""

def web_scrape(url: str, timeout: int = 10) -> str:
"""
Do not throw if the HTTP request fails.
"""
try:
response = fetch_html(url=url, timeout=timeout)

if 'text/html' in response.headers.get('Content-Type', ''):
soup = BeautifulSoup(response.content, "html.parser")

[x.extract() for x in soup.findAll('script')]
[x.extract() for x in soup.findAll('style')]
[x.extract() for x in soup.findAll('noscript')]
[x.extract() for x in soup.findAll('link')]
[x.extract() for x in soup.findAll('head')]
[x.extract() for x in soup.findAll('image')]
[x.extract() for x in soup.findAll('img')]

text: str = soup.get_text()
text = markdownify(text)
text = " ".join([x.strip() for x in text.split("\n")])
text = " ".join([x.strip() for x in text.split(" ")])

return text
else:
print("Non-HTML content received")
logging.warning("Non-HTML content received")
return ""

return web_scrape_strict(url=url, timeout=timeout)
except requests.RequestException as e:
print(f"HTTP request failed: {e}")
logging.error(f"HTTP request failed: {e}")
logging.warning(f"HTTP request failed: {e}")
return ""

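The scraping code is split so that web_scrape_strict raises on request failures while web_scrape keeps the old never-throw behaviour by delegating to it inside a try/except. A minimal sketch of the difference for callers (the URL is a placeholder):

import requests

from prediction_prophet.functions.web_scrape import web_scrape, web_scrape_strict

url = "https://example.com/article"  # placeholder

# web_scrape never raises on request failures: it logs a warning and returns "".
text = web_scrape(url, timeout=10)
if not text:
    print("Scrape failed or page was not HTML")

# web_scrape_strict lets requests exceptions propagate, so the caller decides what to do.
try:
    strict_text = web_scrape_strict(url, timeout=10)
except requests.RequestException as e:
    print(f"Request failed: {e}")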