Commit b2e6a29: changes

ngupta10 committed Apr 16, 2024
1 parent 0c862a3 commit b2e6a29
Showing 8 changed files with 233 additions and 157 deletions.
42 changes: 18 additions & 24 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -25,22 +25,6 @@
from querent.kg.rel_helperfunctions.triple_to_json import TripleToJsonConverter
from querent.kg.rel_helperfunctions.embedding_store import EmbeddingStore
from querent.kg.rel_helperfunctions.filter_semantic_triples import SemanticTripleFilter

"""
BERTLLM is a class derived from BaseEngine designed for processing language models, particularly focusing on named entity recognition and relationship extraction in text. It integrates various components for handling different types of input data (messages, images, code, tokens), extracting entities, filtering relevant information, and constructing knowledge graphs.
Key functionalities include:
- Initializing with a specific configuration for named entity recognition (NER) and language model processing.
- Validating the presence of NER models and tokenizers.
- Processing various types of input data like messages, images, code, and tokens.
- Implementing methods for counting entity pairs, setting filter parameters, and processing tokens.
- Extracting and clustering entities and relationships from the text, and converting them into graph and vector formats.
- Handling errors and maintaining robustness in data processing.
The class also incorporates mechanisms for filtering and clustering entities and relationships, as well as extracting embeddings and generating output in different formats.
"""


class BERTLLM(BaseEngine):
def __init__(
self,
@@ -97,7 +81,7 @@ def __init__(
self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(self.predicate_json)
elif self.sample_relationships:
self.predicate_context_extractor = FixedPredicateExtractor(predicate_types=self.sample_relationships,model = self.nlp_model)
self.predicate_json = self.predicate_context_extractor.construct_predicate_json(self.sample_relationships)
self.predicate_json = self.predicate_context_extractor.construct_predicate_json(relationship_types=self.sample_relationships)
self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(self.predicate_json)
else:
self.predicate_context_extractor = None
@@ -167,8 +151,8 @@ async def process_tokens(self, data: IngestedTokens):
if content:
if self.fixed_entities:
content = self.entity_context_extractor.find_entity_sentences(content)
if self.fixed_relationships:
content = self.predicate_context_extractor.find_predicate_sentences(content)
# if self.fixed_relationships:
# content = self.predicate_context_extractor.find_predicate_sentences(content)
tokens = self.ner_llm_instance._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
@@ -180,6 +164,7 @@ async def process_tokens(self, data: IngestedTokens):
if self.sample_entities:
doc_entity_pairs = self.entity_context_extractor.process_entity_types(doc_entities=doc_entity_pairs)
if any(doc_entity_pairs):
print("Found doc_entity_pairs-------------------------------------", len(doc_entity_pairs))
doc_entity_pairs = self.ner_llm_instance.remove_duplicates(doc_entity_pairs)
pairs_withattn = self.attn_scores_instance.extract_and_append_attention_weights(doc_entity_pairs)
if self.enable_filtering == True and not self.entity_context_extractor and self.count_entity_pairs(pairs_withattn)>1 and not self.predicate_context_extractor:
@@ -188,30 +173,38 @@ async def process_tokens(self, data: IngestedTokens):
else:
pairs_withemb = pairs_withattn
pairs_with_predicates = process_data(pairs_withemb, file)
print("Found doc_entity_pairs-------------------------------------", len(pairs_with_predicates))
if self.enable_filtering == True and not self.entity_context_extractor and self.count_entity_pairs(pairs_withattn)>1 and not self.predicate_context_extractor:
cluster_output = self.triple_filter.cluster_triples(pairs_with_predicates)
clustered_triples = cluster_output['filtered_triples']
cluster_labels = cluster_output['cluster_labels']
cluster_persistence = cluster_output['cluster_persistence']
# final_clustered_triples = self.triple_filter.filter_by_cluster_persistence(pairs_with_predicates, cluster_persistence, cluster_labels)
if clustered_triples:
filtered_triples, reduction_count = self.triple_filter.filter_triples(clustered_triples)
else:
# filtered_triples, _ = self.triple_filter.filter_triples(clustered_triples)
self.logger.debug(f"Filtering in {self.__class__.__name__} producing 0 entity pairs. Filtering Disabled. ")
filtered_triples = pairs_with_predicates
else:
filtered_triples = pairs_with_predicates
print("Found doc_entity_pairs-------------------------------------", len(filtered_triples))
if not filtered_triples:
self.logger.debug("No entity pairs")
return
elif not self.skip_inferences:
relationships = self.semantic_extractor.process_tokens(filtered_triples)
print("Extracting Entities-------------------------------------", filtered_triples)
relationships = self.semantic_extractor.process_tokens(filtered_triples[:5])
relationships = self.semantictriplefilter.filter_triples(relationships)
print("Relationships: {}".format(relationships))
if len(relationships) > 0:
embedding_triples = self.create_emb.generate_embeddings(relationships)
if self.fixed_relationships and self.sample_relationships:
embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True, generate_embeddings_with_fixed_relationship = True)
elif self.sample_relationships:
print("Only for sample_relationships")
embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True)
else:
embedding_triples = self.create_emb.generate_embeddings(relationships)
if self.sample_relationships:
embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
for triple in embedding_triples:
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
if graph_json:
@@ -226,4 +219,5 @@ async def process_tokens(self, data: IngestedTokens):
else:
return filtered_triples, file
except Exception as e:
print("Exception Caught: %s" % e)
self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
18 changes: 14 additions & 4 deletions querent/core/transformers/fixed_entities_set_opensourcellm.py
@@ -78,8 +78,12 @@ def __init__(
raise ValueError("If specific predicates are provided, their types should also be provided.")
if self.fixed_relationships and self.sample_relationships:
self.predicate_context_extractor = FixedPredicateExtractor(fixed_predicates=self.fixed_relationships, predicate_types=self.sample_relationships,model = self.nlp_model)
self.predicate_json = self.predicate_context_extractor.construct_predicate_json(self.fixed_relationships, self.sample_relationships)
self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(self.predicate_json)
elif self.sample_relationships:
self.predicate_context_extractor = FixedPredicateExtractor(predicate_types=self.sample_relationships,model = self.nlp_model)
self.predicate_json = self.predicate_context_extractor.construct_predicate_json(relationship_types=self.sample_relationships)
self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(self.predicate_json)
else:
self.predicate_context_extractor = None
self.user_context = config.user_context
@@ -145,8 +149,8 @@ async def process_tokens(self, data: IngestedTokens):
if content:
if self.fixed_entities:
content = self.entity_context_extractor.find_entity_sentences(content)
if self.fixed_relationships:
content = self.predicate_context_extractor.find_predicate_sentences(content)
# if self.fixed_relationships:
# content = self.predicate_context_extractor.find_predicate_sentences(content)
tokens = self.ner_llm_instance._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
@@ -169,9 +173,15 @@ async def process_tokens(self, data: IngestedTokens):
self.logger.debug(f"length of relationships {len(relationships)}")
relationships = self.semantictriplefilter.filter_triples(relationships)
if len(relationships) > 0:
embedding_triples = self.create_emb.generate_embeddings(relationships)
if self.fixed_relationships and self.sample_relationships:
embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True, generate_embeddings_with_fixed_relationship = True)
elif self.sample_relationships:
print("Only for sample_relationships")
embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True)
else:
embedding_triples = self.create_emb.generate_embeddings(relationships)
if self.sample_relationships:
embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
for triple in embedding_triples:
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
if graph_json:
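
This constructor now pre-computes predicate embeddings the same way as bert_ner_opensourcellm.py above. A reduced sketch of the setup path (class and method names from this diff; the import path is an assumption based on the repository layout):

# Reduced sketch of the predicate setup added to __init__ above. The import
# path is assumed from this commit's file layout; treat it as illustrative.
from querent.kg.ner_helperfunctions.fixed_predicate import FixedPredicateExtractor

def build_predicate_context(fixed_relationships, sample_relationships, nlp_model, create_emb):
    if fixed_relationships and not sample_relationships:
        raise ValueError("If specific predicates are provided, their types should also be provided.")
    if fixed_relationships and sample_relationships:
        extractor = FixedPredicateExtractor(fixed_predicates=fixed_relationships,
                                            predicate_types=sample_relationships,
                                            model=nlp_model)
        predicate_json = extractor.construct_predicate_json(fixed_relationships, sample_relationships)
    elif sample_relationships:
        extractor = FixedPredicateExtractor(predicate_types=sample_relationships,
                                            model=nlp_model)
        predicate_json = extractor.construct_predicate_json(relationship_types=sample_relationships)
    else:
        return None, None
    # Predicate embeddings are cached for the similarity step in process_tokens.
    return extractor, create_emb.generate_relationship_embeddings(predicate_json)
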
(third changed file; filename not captured)
@@ -66,15 +66,19 @@ def __init__(
self.user_context = config.user_context
self.nlp_model = NER_LLM.set_nlp_model(config.spacy_model_path)
self.nlp_model = NER_LLM.get_class_variable()
self.create_emb = EmbeddingStore(inference_api_key=config.huggingface_token)
if self.fixed_relationships and not self.sample_relationships:
raise ValueError("If specific predicates are provided, their types should also be provided.")
if self.fixed_relationships and self.sample_relationships:
self.predicate_context_extractor = FixedPredicateExtractor(fixed_predicates=self.fixed_relationships, predicate_types=self.sample_relationships, model = self.nlp_model)
self.predicate_context_extractor = FixedPredicateExtractor(fixed_predicates=self.fixed_relationships, predicate_types=self.sample_relationships,model = self.nlp_model)
self.predicate_json = self.predicate_context_extractor.construct_predicate_json(self.fixed_relationships, self.sample_relationships)
self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(self.predicate_json)
elif self.sample_relationships:
self.predicate_context_extractor = FixedPredicateExtractor(predicate_types=self.sample_relationships, model = self.nlp_model)
self.predicate_context_extractor = FixedPredicateExtractor(predicate_types=self.sample_relationships,model = self.nlp_model)
self.predicate_json = self.predicate_context_extractor.construct_predicate_json(relationship_types=self.sample_relationships)
self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(self.predicate_json)
else:
self.predicate_context_extractor = None
self.create_emb = EmbeddingStore(inference_api_key=config.huggingface_token)
if config.is_confined_search:
self.llm_instance = Fixed_Entities_LLM(input_queue, llm_config)
else :
@@ -273,9 +277,15 @@ async def process_tokens(self, data: IngestedTokens):
output_tuple = self.generate_output_tuple(result, context_json)
relationships.append(output_tuple)
if len(relationships) > 0:
embedding_triples = self.create_emb.generate_embeddings(relationships)
if self.fixed_relationships and self.sample_relationships:
embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True, generate_embeddings_with_fixed_relationship = True)
elif self.sample_relationships:
print("Only for sample_relationships")
embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True)
else:
embedding_triples = self.create_emb.generate_embeddings(relationships)
if self.sample_relationships:
embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
for triple in embedding_triples:
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
if graph_json:
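
Note that the EmbeddingStore construction moves above the predicate branches in this file: the new branches call self.create_emb.generate_relationship_embeddings(...) inside __init__, so self.create_emb must already exist. A minimal illustration of the ordering constraint (hypothetical reduced classes, not the real transformer):

from querent.kg.rel_helperfunctions.embedding_store import EmbeddingStore

# Hypothetical reduction: referencing self.create_emb before it is assigned
# raises AttributeError, which is why the EmbeddingStore line moved up.
class Before:
    def __init__(self, predicate_json):
        self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(predicate_json)  # AttributeError
        self.create_emb = EmbeddingStore(inference_api_key="hf_...")

class After:
    def __init__(self, predicate_json):
        self.create_emb = EmbeddingStore(inference_api_key="hf_...")  # constructed first
        self.predicate_json_emb = self.create_emb.generate_relationship_embeddings(predicate_json)
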
62 changes: 36 additions & 26 deletions querent/kg/ner_helperfunctions/fixed_predicate.py
@@ -137,42 +137,52 @@ def process_predicate_types(self, doc_predicates):
except Exception as e:
raise Exception(f"Error processing predicate types: {e}")

def construct_predicate_json(relationships=None, relationship_types=None):
def construct_predicate_json(self, relationships=None, relationship_types=None):
predicate_values = []
if relationships and relationship_types:
if len(relationships) != len(relationship_types):
raise Exception("'relationships' and 'relationship_types' lists must have the same length.")
for relationship, relationship_type in zip(relationships, relationship_types):
predicate_value = f"{relationship} ({relationship_type})"
predicate_values.append({"predicate_value": predicate_value, "relationship": relationship, "type": relationship_type})
predicate_values.append(json.dumps({"predicate_value": predicate_value, "relationship": relationship, "type": relationship_type}))
elif relationship_types:
for relationship_type in relationship_types:
predicate_values.append({"predicate_value": relationship_type, "type": relationship_type})
predicate_values.append(json.dumps({"predicate_value": relationship_type, "type": relationship_type}))
else:

return
return []

return predicate_values

return json.dumps(predicate_values)
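
A standalone reproduction of the new construct_predicate_json behaviour (self and the extractor machinery stripped; the logic is copied from the hunk above). Note the double serialization: each element is JSON-encoded, and the return value is the JSON-encoded list of those strings:

import json

def construct_predicate_json(relationships=None, relationship_types=None):
    predicate_values = []
    if relationships and relationship_types:
        if len(relationships) != len(relationship_types):
            raise Exception("'relationships' and 'relationship_types' lists must have the same length.")
        for relationship, relationship_type in zip(relationships, relationship_types):
            predicate_value = f"{relationship} ({relationship_type})"
            predicate_values.append(json.dumps({"predicate_value": predicate_value,
                                                "relationship": relationship,
                                                "type": relationship_type}))
    elif relationship_types:
        for relationship_type in relationship_types:
            predicate_values.append(json.dumps({"predicate_value": relationship_type,
                                                "type": relationship_type}))
    else:
        return []

    return json.dumps(predicate_values)

print(construct_predicate_json(relationship_types=["causes"]))
# ["{\"predicate_value\": \"causes\", \"type\": \"causes\"}"]
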



def update_embedding_triples_with_similarity(predicate_json_emb, embedding_triples):
predicate_json_emb = [json.loads(item) for item in predicate_json_emb]
embedding_triples = [json.loads(item) for item in embedding_triples]
predicate_emb_list = [item["predicate_emb"] for item in predicate_json_emb if item["predicate_emb"] != "Not Implemented"]
predicate_emb_matrix = np.array(predicate_emb_list)
for triple in embedding_triples:
if triple["predicate_emb"] == "Not Implemented":
def update_embedding_triples_with_similarity(self, predicate_json_emb, embedding_triples):
try:
print("Updating embedding------------------------------")
predicate_json_emb = [json.loads(item) for item in predicate_json_emb]
predicate_emb_list = [item["predicate_emb"] for item in predicate_json_emb if item["predicate_emb"] != "Not Implemented"]
print("Updating embedding------------------------------1")
predicate_emb_matrix = np.array(predicate_emb_list)
print("Updating embedding------------------------------2")
updated_embedding_triples = []
for triple in embedding_triples:
entity, triple_json, study_field = triple
triple_data = json.loads(triple_json)

continue

current_predicate_emb = np.array(triple["predicate_emb"]).reshape(1, -1)
similarities = cosine_similarity(current_predicate_emb, predicate_emb_matrix)
max_similarity_index = np.argmax(similarities)
most_similar_predicate_details = predicate_json_emb[max_similarity_index]
triple["predicate_type"] = most_similar_predicate_details["type"]
if most_similar_predicate_details["relationship"].lower() != "unlabelled":
triple["predicate"] = most_similar_predicate_details["relationship"]
updated_embedding_triples = [json.dumps(item) for item in embedding_triples]

return updated_embedding_triples
if triple_data["predicate_emb"] == "Not Implemented":
updated_embedding_triples.append(triple)
continue

current_predicate_emb = np.array(triple_data["predicate_emb"]).reshape(1, -1)
similarities = cosine_similarity(current_predicate_emb, predicate_emb_matrix)
max_similarity_index = np.argmax(similarities)
most_similar_predicate_details = predicate_json_emb[max_similarity_index]
print("Score: ", similarities[0][max_similarity_index])
if similarities[0][max_similarity_index] > 0.4:
triple_data["predicate_type"] = most_similar_predicate_details["type"]
if most_similar_predicate_details["relationship"].lower() != "unlabelled":
triple_data["predicate"] = most_similar_predicate_details["relationship"]
updated_triple_json = json.dumps(triple_data)
updated_embedding_triples.append((entity, updated_triple_json, study_field))
return updated_embedding_triples
except Exception as e:
raise Exception(f"Error processing predicate types: {e}")