Merge branch 'main' into fixed-relationships-correct
ngupta10 authored Apr 16, 2024
2 parents b2e6a29 + 2559a2e commit 0d2e910
Showing 10 changed files with 142 additions and 105 deletions.
19 changes: 11 additions & 8 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -206,14 +206,17 @@ async def process_tokens(self, data: IngestedTokens):
                     if self.sample_relationships:
                         embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
                     for triple in embedding_triples:
-                        graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
-                        if graph_json:
-                            current_state = EventState(EventType.Graph, 1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector, 1.0, vector_json, file)
-                            await self.set_state(new_state=current_state)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
+                            if graph_json:
+                                current_state = EventState(EventType.Graph, 1.0, graph_json, file)
+                                await self.set_state(new_state=current_state)
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector, 1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return
                 else:
                     return
             else:
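The same guard lands in each engine in this commit: before a triple's Graph and Vector events are emitted, the loop now checks the engine's termination_event and returns early, so a shutdown request stops emission mid-batch instead of after the batch drains. Below is a minimal runnable sketch of the pattern, assuming an asyncio.Event-style flag; the Emitter class and emit method are illustrative stand-ins, not querent APIs.

```python
import asyncio


class Emitter:
    """Illustrative stand-in for the patched engine classes."""

    def __init__(self) -> None:
        # Assumed to behave like the engines' termination_event:
        # set once when the workflow is asked to shut down.
        self.termination_event = asyncio.Event()

    async def process(self, triples) -> None:
        for triple in triples:
            # The commit's change: bail out as soon as termination is
            # requested instead of draining the whole batch first.
            if self.termination_event.is_set():
                return
            await self.emit(triple)

    async def emit(self, triple) -> None:
        await asyncio.sleep(0)  # stand-in for await self.set_state(...)
        print("emitted", triple)


async def main() -> None:
    emitter = Emitter()
    emitter.termination_event.set()  # simulate a shutdown request
    await emitter.process(range(10))  # returns without emitting anything


asyncio.run(main())
```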
19 changes: 11 additions & 8 deletions querent/core/transformers/fixed_entities_set_opensourcellm.py
@@ -183,14 +183,17 @@ async def process_tokens(self, data: IngestedTokens):
                     if self.sample_relationships:
                         embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
                     for triple in embedding_triples:
-                        graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
-                        if graph_json:
-                            current_state = EventState(EventType.Graph, 1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector, 1.0, vector_json, file)
-                            await self.set_state(new_state=current_state)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
+                            if graph_json:
+                                current_state = EventState(EventType.Graph, 1.0, graph_json, file)
+                                await self.set_state(new_state=current_state)
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector, 1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return
                 else:
                     return
             else:
@@ -287,14 +287,17 @@ async def process_tokens(self, data: IngestedTokens):
                     if self.sample_relationships:
                         embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
                     for triple in embedding_triples:
-                        graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
-                        if graph_json:
-                            current_state = EventState(EventType.Graph, 1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector, 1.0, vector_json, file)
-                            await self.set_state(new_state=current_state)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
+                            if graph_json:
+                                current_state = EventState(EventType.Graph, 1.0, graph_json, file)
+                                await self.set_state(new_state=current_state)
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector, 1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return
         except Exception as e:
             self.logger.error(f"Invalid {self.__class__.__name__} configuration. Unable to extract predicates using GPT. {e}")
             raise Exception(f"An error occurred while extracting predicates using GPT: {e}")
23 changes: 13 additions & 10 deletions querent/core/transformers/gpt_llm_gpt_ner.py
@@ -243,17 +243,20 @@ async def process_tokens(self, data: IngestedTokens):
                     final_triples = self.remove_duplicate_triplets(final_triples)
                     if len(final_triples) > 0:
                         for triple in final_triples:
-                            graph_json = json.dumps(triple)
-                            if graph_json:
-                                current_state = EventState(EventType.Graph, 1.0, graph_json, file)
-                                await self.set_state(new_state=current_state)
-                            context_embeddings = self.create_emb.get_embeddings([triple['sentence']])[0]
-                            triple['context_embeddings'] = context_embeddings
-                            triple['context'] = triple['sentence']
-                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson((triple['subject'], json.dumps(triple), triple['object'])))
-                            if vector_json:
-                                current_state = EventState(EventType.Vector, 1.0, vector_json, file)
+                            if not self.termination_event.is_set():
+                                graph_json = json.dumps(triple)
+                                if graph_json:
+                                    current_state = EventState(EventType.Graph, 1.0, graph_json, file)
+                                    await self.set_state(new_state=current_state)
+                                context_embeddings = self.create_emb.get_embeddings([triple['sentence']])[0]
+                                triple['context_embeddings'] = context_embeddings
+                                triple['context'] = triple['sentence']
+                                vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson((triple['subject'], json.dumps(triple), triple['object'])))
+                                if vector_json:
+                                    current_state = EventState(EventType.Vector, 1.0, vector_json, file)
                                     await self.set_state(new_state=current_state)
+                            else:
+                                return

         except Exception as e:
             self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to extract predicates using GPT NER LLM class. {e}")
4 changes: 2 additions & 2 deletions querent/kg/rel_helperfunctions/triple_to_json.py
@@ -59,7 +59,7 @@ def convert_vectorjson(triple):
             if data is None:
                 return {}

-            id_format = f"{TripleToJsonConverter._normalize_text(subject)}_{TripleToJsonConverter._normalize_text(data.get('predicate', ''))}_{TripleToJsonConverter._normalize_text(object_)}"
+            id_format = f"{TripleToJsonConverter._normalize_text(subject)}-{TripleToJsonConverter._normalize_text(data.get('predicate', ''))}-{TripleToJsonConverter._normalize_text(object_)}"
             json_object = {
                 "id": TripleToJsonConverter._normalize_text(id_format, replace_space=True).replace(",", "_"),
                 "embeddings": data.get("context_embeddings", []),
@@ -76,7 +76,7 @@ def convert_vectorjson(triple):
     @staticmethod
     def replace_special_chars_with_underscore(data):
         # This pattern will match anything that is not a letter, number, or underscore
-        pattern = r'[^a-zA-Z0-9_]'
+        pattern = r'[^-a-zA-Z0-9_]'
         # Replace matched patterns with an underscore
         return re.sub(pattern, '_', data)
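Read together, these two one-character changes define a new id scheme for vector payloads: the subject, predicate, and object components are now joined with hyphens, and the sanitizer's character class admits hyphens, so component boundaries survive while every other special character still collapses to an underscore. A small sketch of the net effect follows; the regex comes from the diff, while _normalize_text is approximated here as lowercase-and-strip with optional space replacement.

```python
import re


def replace_special_chars_with_underscore(data: str) -> str:
    # Hyphen sits inside the character class, so it is preserved.
    return re.sub(r'[^-a-zA-Z0-9_]', '_', data)


def normalize(text: str, replace_space: bool = False) -> str:
    # Rough approximation of TripleToJsonConverter._normalize_text.
    text = text.strip().lower()
    return text.replace(" ", "_") if replace_space else text


subject, predicate, object_ = "Eagle Ford", "deposited in", "shale"
id_format = f"{normalize(subject)}-{normalize(predicate)}-{normalize(object_)}"
print(replace_special_chars_with_underscore(normalize(id_format, replace_space=True)))
# eagle_ford-deposited_in-shale  <- hyphens mark the component boundaries
```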
33 changes: 30 additions & 3 deletions querent/workflow/workflow.py
@@ -20,11 +20,38 @@
 async def start_workflow(config_dict: dict):
     # Start the workflow
     workflow_config = config_dict.get("workflow")
-    engine_params = workflow_config.get("config").get("engine_params", None)
+    engine_params = workflow_config.get("config", None)
     is_engine_params = False
     try:
         if engine_params is not None:
             engine_params = json.loads(engine_params)
+            engine_params_json = {}
+
+            if engine_params.get("fixed_entities") is not None:
+                engine_params_json["fixed_entities"] = [x for x in engine_params.get("fixed_entities").split(",")]
+
+            if engine_params.get("sample_entities") is not None:
+                engine_params_json["sample_entities"] = [x for x in engine_params.get("sample_entities").split(",")]
+
+            if engine_params.get("ner_model_name") is not None:
+                engine_params_json["ner_model_name"] = engine_params.get("ner_model_name")
+
+            if engine_params.get("enable_filtering") is not None:
+                engine_params_json["enable_filtering"] = engine_params.get("enable_filtering")
+
+            engine_params_json["filter_params"] = {
+                "score_threshold": float(engine_params.get("score_threshold")) if engine_params.get("score_threshold") is not None else None,
+                "attention_score_threshold": float(engine_params.get("attention_score_threshold")) if engine_params.get("attention_score_threshold") is not None else None,
+                "similarity_threshold": float(engine_params.get("similarity_threshold")) if engine_params.get("similarity_threshold") is not None else None,
+                "min_cluster_size": int(engine_params.get("min_cluster_size")) if engine_params.get("min_cluster_size") is not None else None,
+                "min_samples": int(engine_params.get("min_samples")) if engine_params.get("min_samples") is not None else None,
+                "cluster_persistence_threshold": float(engine_params.get("cluster_persistence_threshold")) if engine_params.get("cluster_persistence_threshold") is not None else None,
+            }
+
+            if engine_params.get("is_confined_search") is not None:
+                engine_params_json["is_confined_search"] = engine_params.get("is_confined_search")
+
+            if engine_params.get("user_context") is not None:
+                engine_params_json["user_context"] = engine_params.get("user_context")
             is_engine_params = True
     except Exception as e:
         logger.error("Got error while loading engine params: ", e)
@@ -37,7 +64,7 @@ async def start_workflow(config_dict: dict):
     engines = []
     for engine_config in engine_configs:
         if is_engine_params:
-            engine_config.update(engine_params)
+            engine_config.update(engine_params_json)
         engine_config_source = engine_config.get("config", {})
         if engine_config["name"] == "knowledge_graph_using_openai":
             engine_config.update({"openai_api_key": engine_config["config"]["openai_api_key"]})
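start_workflow now reads the whole workflow config value as a JSON string of flat, string-typed engine parameters and normalizes it into the typed engine_params_json dict that each engine_config is updated with, coercing the numeric filter thresholds and leaving absent ones as None. Here is a hedged sketch of that transformation; the field names come from the diff, while the sample values and the standalone shape of the code are made up.

```python
import json

# A workflow "config" value as it might arrive: one JSON string, all flat keys.
raw_config = json.dumps({
    "fixed_entities": "well,reservoir,porosity",
    "ner_model_name": "dbmdz/bert-large-cased-finetuned-conll03-english",
    "enable_filtering": True,
    "score_threshold": "0.6",
    "min_cluster_size": "5",
    "is_confined_search": True,
})

engine_params = json.loads(raw_config)
engine_params_json = {
    # Comma-separated strings become lists.
    "fixed_entities": engine_params["fixed_entities"].split(","),
    "ner_model_name": engine_params["ner_model_name"],
    "enable_filtering": engine_params["enable_filtering"],
    "filter_params": {
        # Numeric fields arrive as strings and are coerced; absent ones stay None.
        "score_threshold": float(engine_params["score_threshold"]),
        "min_cluster_size": int(engine_params["min_cluster_size"]),
        "similarity_threshold": None,
    },
    "is_confined_search": engine_params["is_confined_search"],
}
print(engine_params_json["fixed_entities"])  # ['well', 'reservoir', 'porosity']
```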
13 changes: 6 additions & 7 deletions requirements.txt
@@ -1,7 +1,7 @@
 cachetools==5.3.3
-aiohttp==3.9.2
+aiohttp==3.9.3
 attrs==23.1.0
-beautifulsoup4==4.12.2
+beautifulsoup4==4.12.3
 boto3==1.26.146
 botocore==1.29.146
 bs4==0.0.1
@@ -11,7 +11,7 @@ hdbscan==0.8.33
 jira==3.6.0
 jmespath==1.0.1
 joblib==1.2.0
-json5==0.9.14
+json5==0.9.24
 jsonmerge==1.9.0
 jsonschema==4.17.3
 kombu==5.2.4
@@ -22,7 +22,7 @@ lxml==4.9.2
 newspaper3k==0.2.8
 nltk==3.8.1
 numpy==1.24.3
-Pillow==10.0.1
+Pillow==10.3.0
 pydantic==2.6.4
 PyJWT==2.4.0
 pytest==7.3.2
@@ -31,15 +31,15 @@ redis==5.0.3
 regex==2023.5.5
 sentence-transformers==2.2.2
 spacy==3.7.2
-uvicorn==0.22.0
+uvicorn==0.29.0
 slack-sdk==3.26.1
 pylint==2.17.4
 pytest-cov==4.1.0
 pytest-mock==3.11.1
 tensorflow==2.14.0
 transformers==4.36.0
 torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu
-pymupdf==1.23.26
+pymupdf==1.24.0
 asyncio==3.4.3
 prometheus-client==0.17.1
 rdflib==7.0.0
@@ -63,7 +63,6 @@ psutil==5.9.8
 dropbox==11.36.2
 requests==2.31.0
 google-api-python-client==2.105.0
-rapidocr-onnxruntime==1.3.9
 pybase64==1.3.1
 pdfminer==20191125
 requests_html==0.10.0
15 changes: 7 additions & 8 deletions setup.py
@@ -6,9 +6,9 @@

 requirements = [
     "cachetools==5.3.3",
-    "aiohttp==3.9.2",
+    "aiohttp==3.9.3",
     "attrs==23.1.0",
-    "beautifulsoup4==4.12.2",
+    "beautifulsoup4==4.12.3",
     "boto3==1.26.146",
     "botocore==1.29.146",
     "bs4==0.0.1",
@@ -18,7 +18,7 @@
     "jira==3.6.0",
     "jmespath==1.0.1",
     "joblib==1.2.0",
-    "json5==0.9.14",
+    "json5==0.9.24",
     "jsonmerge==1.9.0",
     "jsonschema==4.17.3",
     "kombu==5.2.4",
@@ -27,7 +27,7 @@
     "newspaper3k==0.2.8",
     "nltk==3.8.1",
     "numpy==1.24.3",
-    "Pillow==10.0.1",
+    "Pillow==10.3.0",
     "pydantic==2.6.4",
     "PyJWT==2.4.0",
     "pytest==7.3.2",
@@ -36,7 +36,7 @@
     "regex==2023.5.5",
     "sentence-transformers==2.2.2",
     "spacy==3.7.2",
-    "uvicorn==0.22.0",
+    "uvicorn==0.29.0",
     "slack-sdk==3.26.1",
     "pylint==2.17.4",
     "pytest-cov==4.1.0",
@@ -50,7 +50,7 @@
     "pytest-asyncio==0.23.2",
     "pyshacl==0.25.0",
     "google-cloud-storage==2.14.0",
-    "PyMuPDF==1.23.26",
+    "PyMuPDF==1.24.0",
     "pydub==0.25.1",
     "SpeechRecognition==3.10.1",
     "pytesseract==0.3.10",
@@ -67,7 +67,6 @@
     "requests==2.31.0",
     "google-api-python-client==2.105.0",
     "requests_html==0.10.0",
-    "rapidocr-onnxruntime==1.3.9",
     "pybase64==1.3.1",
     "pdfminer==20191125",
     "unidecode==1.3.7",
@@ -84,7 +83,7 @@

 setup(
     name="querent",
-    version="3.0.0",
+    version="3.0.2",
     author="Querent AI",
     description="The Asynchronous Data Dynamo and Graph Neural Network Catalyst",
     long_description=long_description,