
Commit

Fixed LLaMA and OpenAI process_images
ngupta10 committed Apr 25, 2024
1 parent e938505 commit 1376006
Showing 11 changed files with 281 additions and 124 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -200,3 +200,7 @@ lib/vis-9.1.2/vis-network.css
lib/vis-9.1.2/vis-network.min.js
tests/data/llm/cleaned_graph_event (copy).csv
tests/data/llm/cleaned_graph_event1.csv
graph.png
my_subgraph_data.csv
subgraph_output_2.csv
subgraph_output.csv
88 changes: 86 additions & 2 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -113,8 +113,92 @@ def validate(self) -> bool:
    def process_messages(self, data: IngestedMessages):
        return super().process_messages(data)

    @staticmethod
    def validate_ingested_images(data: IngestedImages) -> bool:
        if data.is_error():
            return False
        return True

    async def process_images(self, data: IngestedImages):
        return super().process_images(data)
        doc_entity_pairs = []
        doc_entity_pairs_ocr = []
        entity_ocr = []
        number_sentences = 0
        try:
            doc_source = data.doc_source
            if not BERTLLM.validate_ingested_images(data):
                self.set_termination_event()
                return
            if data.ocr_text:
                ocr_text = ' '.join(data.ocr_text)
            if data.text:
                content = ' '.join(data.text)
            file = data.file
            ocr_content = ocr_text
            if ocr_content or content:
                ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content)
                for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens:
                    (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                    if entities:
                        entity_ocr.append(entities)
                    if entity_pairs:
                        doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                    number_sentences = number_sentences + 1
                if len(doc_entity_pairs_ocr) >= 1:
                    results = doc_entity_pairs_ocr
                elif len(doc_entity_pairs_ocr) == 0:
                    if content:
                        if self.fixed_entities:
                            content = self.entity_context_extractor.find_entity_sentences(content)
                        tokens = self.ner_llm_instance._tokenize_and_chunk(content)
                        for tokenized_sentence, original_sentence, sentence_idx in tokens:
                            (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                            if entity_pairs:
                                doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                            number_sentences = number_sentences + 1
                        if len(doc_entity_pairs) > 0 and len(entity_ocr) >= 1:
                            results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)]
                        elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0:
                            results = doc_entity_pairs
                        else:
                            return
                if results:
                    doc_entity_pairs = self.ner_llm_instance.remove_duplicates(results)
                    filtered_triples = process_data(doc_entity_pairs, file)
                    if self.skip_inferences:
                        return filtered_triples, file
                    else:
                        unique_id = str(hash(data.image))
                        for triple in filtered_triples:
                            if not self.termination_event.is_set():
                                updated_data = []
                                entity, info_json, second_entity = triple
                                info = json.loads(info_json)
                                info['subject_type'] = info.pop('entity1_label')
                                info['object_type'] = info.pop('entity2_label')
                                info['predicate'] = "has image"
                                info['predicate_type'] = "has image"
                                info['context_embeddings'] = self.create_emb.embeddings.embed_query(info['context'])
                                updated_json = json.dumps(info)
                                updated_tuple = (entity, updated_json, second_entity)
                                graph_json = TripleToJsonConverter.convert_graphjson(updated_tuple)
                                graph_json['unique_image_id'] = unique_id
                                graph_json = json.dumps(graph_json)
                                if graph_json:
                                    current_state = EventState(EventType.Graph, 1.0, graph_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
                                vector_json = TripleToJsonConverter.convert_vectorjson(updated_tuple)
                                vector_json['unique_image_id'] = unique_id
                                vector_json = json.dumps(vector_json)
                                if vector_json:
                                    current_state = EventState(EventType.Vector, 1.0, vector_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
            else:
                return
        except Exception as e:
            print("Exception -----------", e)
            self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

    async def process_tables(self, data: IngestedTables):
        return super().process_tables(data)
@@ -206,7 +290,7 @@ async def process_tokens(self, data: IngestedTokens):
            if not filtered_triples:
                return
            elif not self.skip_inferences:
                relationships = self.semantic_extractor.process_tokens(filtered_triples)
                relationships = self.semantic_extractor.process_tokens(filtered_triples[:1])
                relationships = self.semantictriplefilter.filter_triples(relationships)
                if len(relationships) > 0:
                    embedding_triples = self.create_emb.generate_embeddings(relationships)
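For orientation, here is a minimal standalone sketch of the triple enrichment that the new process_images body applies to each (entity, info_json, entity) tuple coming out of process_data. The sample values are invented; the context_embeddings lookup and the TripleToJsonConverter conversion are omitted because they need the surrounding class, and the commit attaches unique_image_id to the converted graph/vector JSON rather than to info itself.

import json

def enrich_image_triple(triple):
    """Rename the entity-label keys and tag the relation as image-derived,
    mirroring the per-triple loop in process_images above (sketch only)."""
    entity, info_json, second_entity = triple
    info = json.loads(info_json)
    info['subject_type'] = info.pop('entity1_label')
    info['object_type'] = info.pop('entity2_label')
    info['predicate'] = "has image"
    info['predicate_type'] = "has image"
    return (entity, json.dumps(info), second_entity)

# Invented triple shaped like the tuples process_data returns
sample = ("eocene", json.dumps({
    "entity1_label": "GeoTime",
    "entity2_label": "GeoLoc",
    "context": "Eocene sands of the Gulf of Mexico basin.",
}), "gulf of mexico")

print(enrich_image_triple(sample))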
102 changes: 22 additions & 80 deletions querent/core/transformers/fixed_entities_set_opensourcellm.py
@@ -101,8 +101,7 @@ def process_tables(self, data: IngestedTables):
    async def process_images(self, data: IngestedImages):
        doc_entity_pairs = []
        doc_entity_pairs_ocr = []
        entities_list = []
        final_entities_list = []
        entity_ocr = []
        number_sentences = 0
        try:
            doc_source = data.doc_source
@@ -111,101 +110,44 @@ async def process_images(self, data: IngestedImages):
                return
            if data.ocr_text:
                ocr_text = ' '.join(data.ocr_text)
            else:
                ocr_text = data.ocr_text

            if data.text:
                clean_text = ' '.join(data.text)
            else:
                clean_text = data.text

            file, content = data.file, clean_text

            ocr_content = ocr_text

            if ocr_content:
                if self.fixed_entities:
                    ocr_content = self.entity_context_extractor.find_entity_sentences(ocr_content)
                content = ' '.join(data.text)
            file = data.file
            ocr_content = ocr_text
            if ocr_content or content:
                ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content)
                for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens:
                    (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                    print("Entities ---------", entities)
                    print("Entities pairs ---------------------------", entity_pairs)
                    if entities:
                        entity_ocr.append(entities)
                    if entity_pairs:
                        doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                    else:
                        continue
                    number_sentences = number_sentences + 1

                print("Doc entity pairs --------", doc_entity_pairs)

                if len(doc_entity_pairs_ocr) == 0 and len(ocr_content) != 0:
                if len(doc_entity_pairs_ocr) >= 1:
                    results = doc_entity_pairs_ocr
                elif len(doc_entity_pairs_ocr) == 0:
                    if content:
                        if self.fixed_entities:
                            content = self.entity_context_extractor.find_entity_sentences(content)
                        tokens = self.ner_llm_instance._tokenize_and_chunk(content)
                        doc_entity_pairs_ocr = self.ner_llm_instance.extract_entities_from_sentence_for_given_sentence(ocr_content, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                        print("doc_entity_pairs_ocr-----------------------", doc_entity_pairs_ocr)
                        for tokenized_sentence, original_sentence, sentence_idx in tokens:
                            # return list of entities from document, and entity pair
                            print("Here inside for loop")
                            (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_chunk(original_sentence, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                            print("Entity pairs found from content", entity_pairs)
                            print("Entities found from content", entities)
                            (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens], self.isConfinedSearch, self.fixed_entities, self.sample_entities)
                            if entity_pairs:
                                doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
                                entities_list.append(entities)
                            number_sentences = number_sentences + 1
                        # process those entities and the OCR entity here
                        # if fixed entities are set, find the most frequently occurring one
                        # if not, find the entity pair where one entity is the OCR text and the other is whichever entity occurs most often or has the higher confidence
                        final_entities_list = self.ner_llm_instance.create_subject_object_sentence_tuples(doc_entity_pairs_ocr, entities_list)

                elif len(ocr_content) == 0:
                    # highest-confidence entity pair from the page text
                    sample_entity_pair = [{'entity': 'Image', 'label': 'image_data', 'score': 1.0, 'start_idx': 1, 'noun_chunk': 'image', 'noun_chunk_length': 1}]
                    final_entities_list = self.ner_llm_instance.create_subject_object_sentence_tuples(sample_entity_pair, entities_list)

                print("Final entities ------", final_entities_list)

                if self.sample_entities:
                    doc_entity_pairs = self.entity_context_extractor.process_entity_types(doc_entities=final_entities_list)
                if doc_entity_pairs and any(doc_entity_pairs):
                    doc_entity_pairs = self.ner_llm_instance.remove_duplicates(final_entities_list)
                filtered_triples = process_data(doc_entity_pairs, file)
                if not filtered_triples:
                    self.logger.debug("No entity pairs")
                    return
                elif not self.skip_inferences:
                    relationships = self.semantic_extractor.process_tokens(filtered_triples)
                    self.logger.debug(f"length of relationships {len(relationships)}")
                    relationships = self.semantictriplefilter.filter_triples(relationships)
                    if len(relationships) > 0:
                        embedding_triples = self.create_emb.generate_embeddings(relationships)
                        if self.sample_relationships:
                            embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
                        for triple in embedding_triples:
                            if not self.termination_event.is_set():
                                graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
                                if graph_json:
                                    current_state = EventState(EventType.Graph, 1.0, graph_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
                                vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
                                if vector_json:
                                    current_state = EventState(EventType.Vector, 1.0, vector_json, file, doc_source=doc_source)
                                    await self.set_state(new_state=current_state)
                            else:
                                return
                        if len(doc_entity_pairs) > 0 and len(entity_ocr) >= 1:
                            results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)]
                        elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0:
                            results = doc_entity_pairs
                        else:
                            return
                else:
                    return filtered_triples, file
                    return
                if results:
                    doc_entity_pairs = self.ner_llm_instance.remove_duplicates(results)
                    filtered_triples = process_data(doc_entity_pairs, file)
                    if self.skip_inferences:
                        return filtered_triples, file
                    else:
                        return
            return
        except Exception as e:
            self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

@@ -102,15 +102,38 @@ async def process_images(self, data: IngestedImages):
            if not GPTLLM.validate_ingested_images(data):
                self.set_termination_event()
                return

            unique_id = str(hash(data.image))
            doc_source = data.doc_source
            relationships = []
            unique_keys = set()
            result = await self.llm_instance.process_images(data)
            if not result:
                return

            return None
            if not result: return
            else:
                filtered_triples, file = result
                for triple in filtered_triples:
                    if not self.termination_event.is_set():
                        updated_data = []
                        entity, info_json, second_entity = triple
                        info = json.loads(info_json)
                        info['subject_type'] = info.pop('entity1_label')
                        info['object_type'] = info.pop('entity2_label')
                        info['predicate'] = "has image"
                        info['predicate_type'] = "has image"
                        info['context_embeddings'] = self.create_emb.embeddings.embed_query(info['context'])
                        updated_json = json.dumps(info)
                        updated_tuple = (entity, updated_json, second_entity)
                        graph_json = TripleToJsonConverter.convert_graphjson(updated_tuple)
                        graph_json['unique_image_id'] = unique_id
                        graph_json = json.dumps(graph_json)
                        if graph_json:
                            current_state = EventState(EventType.Graph, 1.0, graph_json, file, doc_source=doc_source)
                            await self.set_state(new_state=current_state)
                        vector_json = TripleToJsonConverter.convert_vectorjson(updated_tuple)
                        vector_json['unique_image_id'] = unique_id
                        vector_json = json.dumps(vector_json)
                        if vector_json:
                            current_state = EventState(EventType.Vector, 1.0, vector_json, file, doc_source=doc_source)
                            await self.set_state(new_state=current_state)

        except Exception as e:
            self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
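One note on the unique_id = str(hash(data.image)) line shared by both implementations: Python's built-in hash() of str/bytes is randomized per interpreter process, so the resulting id is only stable within a single run. A rough sketch of the difference, assuming data.image holds the raw image bytes (a hashlib digest would stay identical across runs if cross-run deduplication were ever needed):

import hashlib

image_bytes = b"\x89PNG\r\n\x1a\n"  # placeholder payload standing in for data.image

per_run_id = str(hash(image_bytes))                  # what the commit computes; changes between interpreter runs
stable_id = hashlib.sha256(image_bytes).hexdigest()  # content digest; identical across runs

print(per_run_id, stable_id[:16])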
49 changes: 21 additions & 28 deletions querent/kg/ner_helperfunctions/ner_llm_transformer.py
@@ -319,7 +319,6 @@ def extract_fixed_entities_from_chunk(self, chunk: List[str], fixed_entities: List[str],


    def extract_entities_from_sentence_for_given_sentence(self, sentence: str, sentence_idx: int, all_sentences: List[str], fixed_entities_flag: bool, fixed_entities: List[str], entity_types: List[str]):
        print("Extracting entity pair")
        try:
            tokens = self.tokenize_sentence(sentence)
            chunks = self.get_chunks(tokens)
@@ -452,33 +451,27 @@ def find_most_frequent_entity_pair(self, binary_pairs):
"contexts": contexts
}

def create_subject_object_sentence_tuples(self, ocr_entities, entity_list):
# Prepare the list to hold the result tuples
results = []

for single_entity in ocr_entities:

# Iterate through each entity in the list
for entity in entity_list:
# Create a tuple with the single entity as 'subject', the current entity as 'object', and use the 'sentence' from the object entity
if 'sentence' in entity:
result_tuple = (
single_entity,
entity['sentence'],
entity
)

results.append(result_tuple)
else:
# Handle cases where 'sentence' might not be present in the entity dictionary
result_tuple = (
single_entity,
entity,
"No sentence available"
)
results.append(result_tuple)

return results
def filter_matching_entities(self, tuples_nested_list, entities_nested_list):
# Initialize the list to store matching tuples
matched_tuples = []

# Loop through each list of entities
for entities_list in entities_nested_list:
# Loop through each entity dictionary in the current list
for entity_dict in entities_list:
entity_name = entity_dict['entity'] # Extract the entity name

# Loop through each list of tuples
for tuples_list in tuples_nested_list:
# Loop through each tuple in the current list
for tup in tuples_list:
# Check if the entity is in the 1st or 3rd element of the tuple
if entity_name in tup[0] or entity_name in tup[2]:
# Add the tuple to the result list if it's not already included
if tup not in matched_tuples:
matched_tuples.append(tup)

return matched_tuples
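A quick usage sketch of the new filter_matching_entities helper with invented inputs; the body is reproduced as a standalone function so the example runs outside the class, and the entity names and fields are illustrative rather than taken from the repository.

def filter_matching_entities(tuples_nested_list, entities_nested_list):
    # Keep only triples whose subject or object contains an OCR-detected entity name
    matched_tuples = []
    for entities_list in entities_nested_list:
        for entity_dict in entities_list:
            entity_name = entity_dict['entity']
            for tuples_list in tuples_nested_list:
                for tup in tuples_list:
                    if (entity_name in tup[0] or entity_name in tup[2]) and tup not in matched_tuples:
                        matched_tuples.append(tup)
    return matched_tuples

# Hypothetical per-sentence entity pairs and OCR entities
doc_entity_pairs = [[
    ("eocene", '{"context": "..."}', "gulf of mexico"),
    ("porosity", '{"context": "..."}', "sandstone"),
]]
ocr_entities = [[{"entity": "eocene", "label": "GeoTime", "score": 0.98}]]

print(filter_matching_entities(doc_entity_pairs, ocr_entities))
# -> [('eocene', '{"context": "..."}', 'gulf of mexico')]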


