
Commit

Fixed LLaMA and OpenAI process_images
ngupta10 committed Apr 25, 2024
1 parent e938505 commit 1376006
Showing 11 changed files with 281 additions and 124 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -200,3 +200,7 @@ lib/vis-9.1.2/vis-network.css
lib/vis-9.1.2/vis-network.min.js
tests/data/llm/cleaned_graph_event (copy).csv
tests/data/llm/cleaned_graph_event1.csv
graph.png
my_subgraph_data.csv
subgraph_output_2.csv
subgraph_output.csv
88 changes: 86 additions & 2 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -113,8 +113,92 @@ def validate(self) -> bool:
    def process_messages(self, data: IngestedMessages):
        return super().process_messages(data)

    @staticmethod
    def validate_ingested_images(data: IngestedImages) -> bool:
        if data.is_error():
            return False
        return True

    async def process_images(self, data: IngestedImages):
        return super().process_images(data)
        doc_entity_pairs = []
        doc_entity_pairs_ocr = []
        entity_ocr = []
        number_sentences = 0
        try:
            doc_source = data.doc_source
            if not BERTLLM.validate_ingested_images(data):
                self.set_termination_event()
                return
            if data.ocr_text:
                ocr_text = ' '.join(data.ocr_text)
            if data.text:
                content = ' '.join(data.text)
            file = data.file
            ocr_content = ocr_text
            if ocr_content or content:
                ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content)
                for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens:
                    (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                    if entities:
                        entity_ocr.append(entities)
                    if entity_pairs:
                        doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                    number_sentences = number_sentences + 1
                if len(doc_entity_pairs_ocr) >= 1:
                    results = doc_entity_pairs_ocr
                elif len(doc_entity_pairs_ocr) == 0:
                    if content:
                        if self.fixed_entities:
                            content = self.entity_context_extractor.find_entity_sentences(content)
                        tokens = self.ner_llm_instance._tokenize_and_chunk(content)
                        for tokenized_sentence, original_sentence, sentence_idx in tokens:
                            (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                            if entity_pairs:
                                doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                            number_sentences = number_sentences + 1
                        if len(doc_entity_pairs) > 0 and len(entity_ocr) >= 1:
                            results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)]
                        elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0:
                            results = doc_entity_pairs
                        else:
                            return
                if results:
                    doc_entity_pairs = self.ner_llm_instance.remove_duplicates(results)
                    filtered_triples = process_data(doc_entity_pairs, file)
                    if self.skip_inferences:
                        return filtered_triples, file
                    else:
                        unique_id = str(hash(data.image))
                        for triple in filtered_triples:
                            if not self.termination_event.is_set():
                                updated_data = []
                                entity, info_json, second_entity = triple
                                info = json.loads(info_json)
                                info['subject_type'] = info.pop('entity1_label')
                                info['object_type'] = info.pop('entity2_label')
                                info['predicate'] = "has image"
                                info['predicate_type'] = "has image"
                                info['context_embeddings'] = self.create_emb.embeddings.embed_query(info['context'])
                                updated_json = json.dumps(info)
                                updated_tuple = (entity, updated_json, second_entity)
                                graph_json = TripleToJsonConverter.convert_graphjson(updated_tuple)
                                graph_json['unique_image_id'] = unique_id
                                graph_json = json.dumps(graph_json)
                                if graph_json:
                                    current_state = EventState(EventType.Graph, 1.0, graph_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
                                vector_json = TripleToJsonConverter.convert_vectorjson(updated_tuple)
                                vector_json['unique_image_id'] = unique_id
                                vector_json = json.dumps(vector_json)
                                if vector_json:
                                    current_state = EventState(EventType.Vector, 1.0, vector_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
            else:
                return
        except Exception as e:
            print("Exception -----------", e)
            self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

    async def process_tables(self, data: IngestedTables):
        return super().process_tables(data)
@@ -206,7 +290,7 @@ async def process_tokens(self, data: IngestedTokens):
            if not filtered_triples:
                return
            elif not self.skip_inferences:
                relationships = self.semantic_extractor.process_tokens(filtered_triples)
                relationships = self.semantic_extractor.process_tokens(filtered_triples[:1])
                relationships = self.semantictriplefilter.filter_triples(relationships)
                if len(relationships) > 0:
                    embedding_triples = self.create_emb.generate_embeddings(relationships)
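For orientation, here is a minimal standalone sketch of the triple enrichment that the new process_images body applies to each (entity, info_json, entity) tuple coming out of process_data. The sample values are invented; the context_embeddings lookup and the TripleToJsonConverter conversion are omitted because they need the surrounding class, and the commit attaches unique_image_id to the converted graph/vector JSON rather than to info itself.

import json

def enrich_image_triple(triple):
    """Rename the entity-label keys and tag the relation as image-derived,
    mirroring the per-triple loop in process_images above (sketch only)."""
    entity, info_json, second_entity = triple
    info = json.loads(info_json)
    info['subject_type'] = info.pop('entity1_label')
    info['object_type'] = info.pop('entity2_label')
    info['predicate'] = "has image"
    info['predicate_type'] = "has image"
    return (entity, json.dumps(info), second_entity)

# Invented triple shaped like the tuples process_data returns
sample = ("eocene", json.dumps({
    "entity1_label": "GeoTime",
    "entity2_label": "GeoLoc",
    "context": "Eocene sands of the Gulf of Mexico basin.",
}), "gulf of mexico")

print(enrich_image_triple(sample))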
102 changes: 22 additions & 80 deletions querent/core/transformers/fixed_entities_set_opensourcellm.py
@@ -101,8 +101,7 @@ def process_tables(self, data: IngestedTables):
    async def process_images(self, data: IngestedImages):
        doc_entity_pairs = []
        doc_entity_pairs_ocr = []
        entities_list = []
        final_entities_list = []
        entity_ocr = []
        number_sentences = 0
        try:
            doc_source = data.doc_source
@@ -111,101 +110,44 @@ async def process_images(self, data: IngestedImages):
                return
            if data.ocr_text:
                ocr_text = ' '.join(data.ocr_text)
            else:
                ocr_text = data.ocr_text

            if data.text:
                clean_text = ' '.join(data.text)
            else:
                clean_text = data.text

            file, content = data.file, clean_text

            ocr_content = ocr_text

            if ocr_content:
                if self.fixed_entities:
                    ocr_content = self.entity_context_extractor.find_entity_sentences(ocr_content)
                content = ' '.join(data.text)
            file = data.file
            ocr_content = ocr_text
            if ocr_content or content:
                ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content)
                for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens:
                    (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                    print("Entities ---------", entities)
                    print("Entities pairs ---------------------------", entity_pairs)
                    if entities:
                        entity_ocr.append(entities)
                    if entity_pairs:
                        doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                    else:
                        continue
                    number_sentences = number_sentences + 1

                print("Doc entity pairs --------", doc_entity_pairs)

                if len(doc_entity_pairs_ocr) == 0 and len(ocr_content) != 0:
                if len(doc_entity_pairs_ocr) >= 1:
                    results = doc_entity_pairs_ocr
                elif len(doc_entity_pairs_ocr) == 0:
                    if content:
                        if self.fixed_entities:
                            content = self.entity_context_extractor.find_entity_sentences(content)
                        tokens = self.ner_llm_instance._tokenize_and_chunk(content)
                        doc_entity_pairs_ocr = self.ner_llm_instance.extract_entities_from_sentence_for_given_sentence(ocr_content, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                        print("doc_entity_pairs_ocr-----------------------", doc_entity_pairs_ocr)
                        for tokenized_sentence, original_sentence, sentence_idx in tokens:
                            # return list of entities from document, and entity pair
                            print("Here inside for loop")
                            (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_chunk(original_sentence, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                            print("Entity pairs found from content", entity_pairs)
                            print("Entities found from content", entities)
                            (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                            if entity_pairs:
                                doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                                entities_list.append(entities)
                            number_sentences = number_sentences + 1
                        # process those entities and the OCR entity here
                        # if fixed entities are set, find the most frequently occurring one
                        # if not, find the entity pair where one entity is the OCR text and the other is whichever entity occurs most often or has the higher confidence
                        final_entities_list = self.ner_llm_instance.create_subject_object_sentence_tuples(doc_entity_pairs_ocr, entities_list)

                elif len(ocr_content) == 0:
                    # highest-confidence entity pair from the page text
                    sample_entity_pair = [{'entity': 'Image', 'label': 'image_data', 'score': 1.0, 'start_idx': 1, 'noun_chunk': 'image', 'noun_chunk_length': 1}]
                    final_entities_list = self.ner_llm_instance.create_subject_object_sentence_tuples(sample_entity_pair, entities_list)

                print("Final entities ------", final_entities_list)

                if self.sample_entities:
                    doc_entity_pairs = self.entity_context_extractor.process_entity_types(doc_entities=final_entities_list)
                if doc_entity_pairs and any(doc_entity_pairs):
                    doc_entity_pairs = self.ner_llm_instance.remove_duplicates(final_entities_list)
                filtered_triples = process_data(doc_entity_pairs, file)
                if not filtered_triples:
                    self.logger.debug("No entity pairs")
                    return
                elif not self.skip_inferences:
                    relationships = self.semantic_extractor.process_tokens(filtered_triples)
                    self.logger.debug(f"length of relationships {len(relationships)}")
                    relationships = self.semantictriplefilter.filter_triples(relationships)
                    if len(relationships) > 0:
                        embedding_triples = self.create_emb.generate_embeddings(relationships)
                        if self.sample_relationships:
                            embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
                        for triple in embedding_triples:
                            if not self.termination_event.is_set():
                                graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
                                if graph_json:
                                    current_state = EventState(EventType.Graph, 1.0, graph_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
                                vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
                                if vector_json:
                                    current_state = EventState(EventType.Vector, 1.0, vector_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
                            else:
                                return
                        if len(doc_entity_pairs) > 0 and len(entity_ocr) >= 1:
                            results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)]
                        elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0:
                            results = doc_entity_pairs
                        else:
                            return
                else:
                    return filtered_triples, file
                    return
                if results:
                    doc_entity_pairs = self.ner_llm_instance.remove_duplicates(results)
                    filtered_triples = process_data(doc_entity_pairs, file)
                    if self.skip_inferences:
                        return filtered_triples, file
                    else:
                        return
            return
        except Exception as e:
            self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

@@ -102,15 +102,38 @@ async def process_images(self, data: IngestedImages):
            if not GPTLLM.validate_ingested_images(data):
                self.set_termination_event()
                return

            unique_id = str(hash(data.image))
            doc_source = data.doc_source
            relationships = []
            unique_keys = set()
            result = await self.llm_instance.process_images(data)
            if not result:
                return

            return None
            if not result: return
            else:
                filtered_triples, file = result
                for triple in filtered_triples:
                    if not self.termination_event.is_set():
                        updated_data = []
                        entity, info_json, second_entity = triple
                        info = json.loads(info_json)
                        info['subject_type'] = info.pop('entity1_label')
                        info['object_type'] = info.pop('entity2_label')
                        info['predicate'] = "has image"
                        info['predicate_type'] = "has image"
                        info['context_embeddings'] = self.create_emb.embeddings.embed_query(info['context'])
                        updated_json = json.dumps(info)
                        updated_tuple = (entity, updated_json, second_entity)
                        graph_json = TripleToJsonConverter.convert_graphjson(updated_tuple)
                        graph_json['unique_image_id'] = unique_id
                        graph_json = json.dumps(graph_json)
                        if graph_json:
                            current_state = EventState(EventType.Graph, 1.0, graph_json, file, doc_source=doc_source)
                            await self.set_state(new_state=current_state)
                        vector_json = TripleToJsonConverter.convert_vectorjson(updated_tuple)
                        vector_json['unique_image_id'] = unique_id
                        vector_json = json.dumps(vector_json)
                        if vector_json:
                            current_state = EventState(EventType.Vector, 1.0, vector_json, file, doc_source=doc_source)
                            await self.set_state(new_state=current_state)

        except Exception as e:
            self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
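One note on the unique_id = str(hash(data.image)) line shared by both implementations: Python's built-in hash() of str/bytes is randomized per interpreter process, so the resulting id is only stable within a single run. A rough sketch of the difference, assuming data.image holds the raw image bytes (a hashlib digest would stay identical across runs if cross-run deduplication were ever needed):

import hashlib

image_bytes = b"\x89PNG\r\n\x1a\n"  # placeholder payload standing in for data.image

per_run_id = str(hash(image_bytes))                  # what the commit computes; changes between interpreter runs
stable_id = hashlib.sha256(image_bytes).hexdigest()  # content digest; identical across runs

print(per_run_id, stable_id[:16])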
49 changes: 21 additions & 28 deletions querent/kg/ner_helperfunctions/ner_llm_transformer.py
@@ -319,7 +319,6 @@ def extract_fixed_entities_from_chunk(self, chunk: List[str], fixed_entities: List[str],


    def extract_entities_from_sentence_for_given_sentence(self, sentence: str, sentence_idx: int, all_sentences: List[str], fixed_entities_flag: bool, fixed_entities: List[str], entity_types: List[str]):
        print("Extracting entity pair")
        try:
            tokens = self.tokenize_sentence(sentence)
            chunks = self.get_chunks(tokens)
@@ -452,33 +451,27 @@ def find_most_frequent_entity_pair(self, binary_pairs):
"contexts": contexts
}

def create_subject_object_sentence_tuples(self, ocr_entities, entity_list):
# Prepare the list to hold the result tuples
results = []

for single_entity in ocr_entities:

# Iterate through each entity in the list
for entity in entity_list:
# Create a tuple with the single entity as 'subject', the current entity as 'object', and use the 'sentence' from the object entity
if 'sentence' in entity:
result_tuple = (
single_entity,
entity['sentence'],
entity
)

results.append(result_tuple)
else:
# Handle cases where 'sentence' might not be present in the entity dictionary
result_tuple = (
single_entity,
entity,
"No sentence available"
)
results.append(result_tuple)

return results
def filter_matching_entities(self, tuples_nested_list, entities_nested_list):
# Initialize the list to store matching tuples
matched_tuples = []

# Loop through each list of entities
for entities_list in entities_nested_list:
# Loop through each entity dictionary in the current list
for entity_dict in entities_list:
entity_name = entity_dict['entity'] # Extract the entity name

# Loop through each list of tuples
for tuples_list in tuples_nested_list:
# Loop through each tuple in the current list
for tup in tuples_list:
# Check if the entity is in the 1st or 3rd element of the tuple
if entity_name in tup[0] or entity_name in tup[2]:
# Add the tuple to the result list if it's not already included
if tup not in matched_tuples:
matched_tuples.append(tup)

return matched_tuples
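A quick usage sketch of the new filter_matching_entities helper with invented inputs; the body is reproduced as a standalone function so the example runs outside the class, and the entity names and fields are illustrative rather than taken from the repository.

def filter_matching_entities(tuples_nested_list, entities_nested_list):
    # Keep only triples whose subject or object contains an OCR-detected entity name
    matched_tuples = []
    for entities_list in entities_nested_list:
        for entity_dict in entities_list:
            entity_name = entity_dict['entity']
            for tuples_list in tuples_nested_list:
                for tup in tuples_list:
                    if (entity_name in tup[0] or entity_name in tup[2]) and tup not in matched_tuples:
                        matched_tuples.append(tup)
    return matched_tuples

# Hypothetical per-sentence entity pairs and OCR entities
doc_entity_pairs = [[
    ("eocene", '{"context": "..."}', "gulf of mexico"),
    ("porosity", '{"context": "..."}', "sandstone"),
]]
ocr_entities = [[{"entity": "eocene", "label": "GeoTime", "score": 0.98}]]

print(filter_matching_entities(doc_entity_pairs, ocr_entities))
# -> [('eocene', '{"context": "..."}', 'gulf of mexico')]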


