
Commit

removing redundant code
ngupta10 committed May 3, 2024
1 parent 867326c commit e895dba
Showing 7 changed files with 177 additions and 330 deletions.
30 changes: 0 additions & 30 deletions querent/core/base_engine.py
@@ -299,33 +299,3 @@ async def _stop_workers(self):
self.termination_event.set()
except Exception as e:
self.logger.error(f"Error while stopping workers: {e}")

async def get_doc_entity_pairs(self, content):
doc_entity_pairs = []
if content:
if self.fixed_entities:
content = self.entity_context_extractor.find_entity_sentences(content)
if self.fixed_relationships:
content = self.predicate_context_extractor.find_predicate_sentences(content)
tokens = self.ner_llm_instance._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
if entity_pairs:
doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
number_sentences = number_sentences + 1
else:
return

return doc_entity_pairs

async def get_ocr_entity_pairs(self, ocr_content):
entity_ocr = []
doc_entity_pairs_ocr = []
ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content)
for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
if entities:
entity_ocr.append(entities)
if entity_pairs:
doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
number_sentences = number_sentences + 1
26 changes: 15 additions & 11 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -123,6 +123,7 @@ def validate_ingested_images(data: IngestedImages) -> bool:

return True
async def process_images(self, data: IngestedImages):
print("Going to run the function from 000000000000000000")
doc_entity_pairs = []
doc_entity_pairs_ocr = []
entity_ocr = []
@@ -138,21 +139,25 @@ async def process_images(self, data: IngestedImages):
if data.text:
content = ' '.join(data.text)
file = data.file
ocr_content = ocr_text
ocr_content = ocr_text
print("Going to run the function from Mian1111111111", ocr_content)
if ocr_content or content:
(entity_ocr, doc_entity_pairs_ocr) = self.get_ocr_entity_pairs(ocr_content=ocr_content)
print("Going to run the function from Mian", ocr_content)
(entity_ocr, doc_entity_pairs_ocr) = await self.ner_llm_instance.get_entity_pairs(isConfinedSearch= self.isConfinedSearch,
content=ocr_content,
fixed_entities=self.fixed_entities,
sample_entities=self.sample_entities)
print("Results from Main-------------", doc_entity_pairs_ocr)
if len(doc_entity_pairs_ocr) >= 1:
results = doc_entity_pairs_ocr
elif len(doc_entity_pairs_ocr) == 0:
if content:
if self.fixed_entities:
content = self.entity_context_extractor.find_entity_sentences(content)
tokens = self.ner_llm_instance._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
if entity_pairs:
doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
number_sentences = number_sentences + 1
(_, doc_entity_pairs) = await self.ner_llm_instance.get_entity_pairs(isConfinedSearch= self.isConfinedSearch,
content=ocr_content,
fixed_entities=self.fixed_entities,
sample_entities=self.sample_entities)
if len(doc_entity_pairs) > 0 and len(entity_ocr) >=1:
results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)]
elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0:
@@ -166,10 +171,8 @@ async def process_images(self, data: IngestedImages):
return filtered_triples, file
else :
unique_id = str(hash(data.image))
for triple in filtered_triples:
for entity, info_json, second_entity in filtered_triples:
if not self.termination_event.is_set():
updated_data = []
entity, info_json, second_entity = triple
info = json.loads(info_json)
info['subject_type'] = info.pop('entity1_label')
info['object_type'] = info.pop('entity2_label')
@@ -191,6 +194,7 @@ async def process_images(self, data: IngestedImages):
else:
return
except Exception as e:
print("Exception -----------------", e)
self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

async def process_tables(self, data: IngestedTables):
@@ -111,26 +111,15 @@ async def process_images(self, data: IngestedImages):
if not result: return
else:
filtered_triples, file = result
for triple in filtered_triples:
if not self.termination_event.is_set():
updated_data = []
entity, info_json, second_entity = triple
info = json.loads(info_json)
info['subject_type'] = info.pop('entity1_label')
info['object_type'] = info.pop('entity2_label')
info['predicate'] = "has image"
info['predicate_type'] = "has image"
info['context_embeddings'] = self.create_emb.get_embeddings([info['context']])[0]
updated_json = json.dumps(info)
updated_tuple = (entity, updated_json, second_entity)
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(updated_tuple))
if graph_json:
current_state = EventState(event_type=EventType.Graph, timestamp=time.time(), payload=graph_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)
vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(updated_tuple, blob))
if vector_json:
current_state = EventState(event_type=EventType.Vector, timestamp=time.time(), payload=vector_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)
updated_tuple = self.nlp_model.final_ingested_images_tuples(filtered_triples)
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(updated_tuple))
if graph_json:
current_state = EventState(event_type=EventType.Graph, timestamp=time.time(), payload=graph_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)
vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(updated_tuple, blob))
if vector_json:
current_state = EventState(event_type=EventType.Vector, timestamp=time.time(), payload=vector_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)

except Exception as e:
self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
26 changes: 13 additions & 13 deletions querent/ingestors/pdfs/pdf_ingestor_v1.py
Expand Up @@ -18,7 +18,7 @@

import pybase64
import pytesseract
import pdfplumber
# import pdfplumber


class PdfIngestorFactory(IngestorFactory):
@@ -130,18 +130,18 @@ async def extract_and_process_pdf(
f"Getting unknown error while handling this file: {collected_bytes.file} error - {exc}"
) from exc

async def extract_table(self, data):
with pdfplumber.open(io.BytesIO(data.data)) as pdf:
i = 0
for page in pdf.pages:
# Extract tables from the current page
tables = page.extract_tables()
i += 1
# for table in tables:
# if len(table) <= 1:
# continue

# yield IngestedTables(file= data.file, table = table, text=page.extract_text(), error=None, page_num= i)
# async def extract_table(self, data):
# with pdfplumber.open(io.BytesIO(data.data)) as pdf:
# i = 0
# for page in pdf.pages:
# # Extract tables from the current page
# tables = page.extract_tables()
# i += 1
# # for table in tables:
# # if len(table) <= 1:
# # continue

# # yield IngestedTables(file= data.file, table = table, text=page.extract_text(), error=None, page_num= i)

async def extract_img(self, doc, file_path, data, doc_source):
image_page_map = {}
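
Editor's note (not part of the commit): the hunk above comments out the pdfplumber-based table extraction rather than deleting it. For reference, a minimal standalone sketch of the pattern the disabled code followed, assuming the PDF arrives as raw bytes; pdfplumber.open and page.extract_tables are pdfplumber's own APIs, while extract_tables_from_pdf is a hypothetical helper name used only for illustration.

import io

import pdfplumber  # same third-party dependency the disabled code imported


def extract_tables_from_pdf(pdf_bytes: bytes):
    """Open a PDF from raw bytes and collect tables per page, mirroring
    the commented-out extract_table coroutine above."""
    tables_by_page = {}
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            tables = page.extract_tables()
            # The disabled code skipped single-row tables; do the same here.
            tables_by_page[page_num] = [t for t in tables if len(t) > 1]
    return tables_by_page
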
171 changes: 28 additions & 143 deletions querent/kg/ner_helperfunctions/ner_llm_transformer.py
@@ -1,3 +1,4 @@
import json
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
@@ -317,157 +318,15 @@ def extract_fixed_entities_from_chunk(self, chunk: List[str], fixed_entities: Li

return sorted(results, key=lambda x: x['start_idx'])


def extract_entities_from_sentence_for_given_sentence(self, sentence: str, sentence_idx: int, all_sentences: List[str], fixed_entities_flag: bool, fixed_entities: List[str],entity_types: List[str]):
try:
tokens = self.tokenize_sentence(sentence)
chunks = self.get_chunks(tokens)
all_page_entities = []

for chunk in chunks:
if fixed_entities_flag == False:
entities = self.extract_entities_from_chunk(chunk)
else:
entities = self.extract_fixed_entities_from_chunk(chunk,fixed_entities, entity_types)
all_page_entities.append(entities)

return all_page_entities

except Exception as e:
self.logger.error(f"Error extracting entities for an ocr sentence: {e}")

def get_max_score_entity_pair(self, entities_list):
max_score = 0
best_pair = None
for i in range(0, len(entities_list), 2):
pair_score = entities_list[i]['score'] + entities_list[i+1]['score']
if pair_score > max_score:
max_score = pair_score
best_pair = (entities_list[i], entities_list[i+1])
return best_pair

def compare_and_retrieve(self, entities_list, binary_pairs, single_entity):
best_pair = self.get_max_score_entity_pair(entities_list)

found_pair = False
for pair, context in binary_pairs:
if (single_entity['entity'] == best_pair[0]['entity'] or single_entity['entity'] == best_pair[1]['entity']):
found_pair = True
if found_pair and (
(pair[0]['entity'] == best_pair[0]['entity'] and pair[1]['entity'] == best_pair[1]['entity']) or
(pair[1]['entity'] == best_pair[0]['entity'] and pair[0]['entity'] == best_pair[1]['entity'])):
return {
"entity_pair": best_pair,
"context": context
}
if not found_pair:
return {"message": "No matching entity found in the best pair based on the single entity input."}
return None

def find_most_frequent_pair_with_entity(self, binary_pairs, ocr_entities):

result = []
for single_entity in ocr_entities:

# Filter pairs to include only those containing the single entity
filtered_pairs = []
for pair, context in binary_pairs:
if single_entity['entity'] in [pair[0]['entity'], pair[1]['entity']]:
# Since dictionaries are unhashable, convert pair to a tuple of sorted tuples to count occurrences
sorted_pair = tuple(sorted((pair[0]['entity'], pair[1]['entity'])))
filtered_pairs.append((sorted_pair, context))

# Count occurrences of each pair
pair_counter = Counter([pair for pair, _ in filtered_pairs])

# Find the pair with the maximum occurrence
if not pair_counter:
return None

most_frequent_pair, count = pair_counter.most_common(1)[0]

# Retrieve the context information for the most frequent pair
for pair, context in filtered_pairs:
if tuple(sorted((pair[0]['entity'], pair[1]['entity']))) == most_frequent_pair:
"""
"""
result.append({
"most_frequent_pair": most_frequent_pair,
"count": count,
"context": context
})

return result

def find_highest_scoring_pair(self, binary_pairs):
max_score = 0 # Initialize with a value lower than the lowest possible score (scores are usually non-negative).
highest_scoring_pair = None
highest_scoring_context = None

# Iterate through each pair and calculate the total score
for pair, context in binary_pairs:
total_score = pair[0]['score'] + pair[1]['score']
if total_score > max_score:
max_score = total_score
highest_scoring_pair = pair
highest_scoring_context = context

# Return the highest scoring pair along with its context and total score
if highest_scoring_pair:
return {
"highest_scoring_pair": highest_scoring_pair,
"total_score": max_score,
"context": highest_scoring_context
}
else:
return {"message": "No pairs found or all pairs have zero or negative scores."}


def find_most_frequent_entity_pair(self, binary_pairs):
# Initialize a counter to keep track of entity pair occurrences
pair_counter = Counter()

# Iterate through each pair and count each unique pair
for pair, context in binary_pairs:
# Create a canonical form for the pair (sorted by entity name to ensure (A,B) and (B,A) are treated the same)
sorted_pair = tuple(sorted((pair[0]['entity'], pair[1]['entity'])))
pair_counter[sorted_pair] += 1

# Find the pair with the highest occurrence
if not pair_counter:
return None

most_frequent_pair, count = pair_counter.most_common(1)[0]

# Collect all contexts where the most frequent pair appears
contexts = [context for pair, context in binary_pairs if tuple(sorted((pair[0]['entity'], pair[1]['entity']))) == most_frequent_pair]

# Return the most frequent pair along with its occurrence count and all associated contexts
return {
"most_frequent_pair": most_frequent_pair,
"occurrence_count": count,
"contexts": contexts
}

def filter_matching_entities(self, tuples_nested_list, entities_nested_list):
# Initialize the list to store matching tuples
matched_tuples = []

# Loop through each list of entities
for entities_list in entities_nested_list:
# Loop through each entity dictionary in the current list
for entity_dict in entities_list:
entity_name = entity_dict['entity'] # Extract the entity name

# Loop through each list of tuples
entity_name = entity_dict['entity']
for tuples_list in tuples_nested_list:
# Loop through each tuple in the current list
for tup in tuples_list:
# Check if the entity is in the 1st or 3rd element of the tuple
if entity_name in tup[0] or entity_name in tup[2]:
# Add the tuple to the result list if it's not already included
if tup not in matched_tuples:
matched_tuples.append(tup)

@@ -501,6 +360,32 @@ def extract_entities_from_sentence(self, sentence: str, sentence_idx: int, all_s
except Exception as e:
self.logger.error(f"Error extracting entities from sentence: {e}")


async def get_entity_pairs(self, isConfinedSearch, fixed_entities, sample_entities, content):
entity = []
doc_entity_pairs = []
tokens = self._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],isConfinedSearch, fixed_entities, sample_entities)
if entity_pairs:
doc_entity_pairs.append(self.transform_entity_pairs(entity_pairs))
if entities:
entity.append(entities)
return (entities, doc_entity_pairs)

async def final_ingested_images_tuples(self, filtered_triples):
for entity, info_json, second_entity in filtered_triples:
if not self.termination_event.is_set():
info = json.loads(info_json)
info['subject_type'] = info.pop('entity1_label')
info['object_type'] = info.pop('entity2_label')
info['predicate'] = "has image"
info['predicate_type'] = "has image"
info['context_embeddings'] = self.create_emb.get_embeddings([info['context']])[0]
updated_json = json.dumps(info)
updated_tuple = (entity, updated_json, second_entity)
return updated_tuple

def remove_duplicates(self, data):
seen = set()
new_data = []
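
Editor's note (not part of the commit): the new get_entity_pairs helper above centralizes the per-sentence extraction loop that this commit deletes from base_engine.py (get_doc_entity_pairs, get_ocr_entity_pairs) and from bert_ner_opensourcellm.py. Below is a minimal, self-contained sketch of that control flow; tokenize_and_chunk and extract_entities_from_sentence are toy stand-ins for NER_LLM's own methods, not the library implementations, and the real helper additionally runs transform_entity_pairs on each pair list.

import asyncio
from typing import List, Tuple


def tokenize_and_chunk(content: str) -> List[Tuple[List[str], str, int]]:
    # Toy stand-in: split on periods and return (tokens, sentence, index).
    sentences = [s.strip() for s in content.split(".") if s.strip()]
    return [(sentence.split(), sentence, idx) for idx, sentence in enumerate(sentences)]


def extract_entities_from_sentence(sentence: str, sentence_idx: int, all_sentences: List[str]):
    # Toy stand-in: treat capitalized words as entities and pair neighbours.
    entities = [{"entity": word, "score": 1.0, "sentence_idx": sentence_idx}
                for word in sentence.split() if word[:1].isupper()]
    pairs = [(entities[i], entities[i + 1]) for i in range(len(entities) - 1)]
    return entities, pairs


async def get_entity_pairs(content: str):
    # Mirrors the control flow of the new helper: one pass over the
    # tokenized sentences, accumulating entities and entity pairs.
    all_entities, doc_entity_pairs = [], []
    tokens = tokenize_and_chunk(content)
    for _tokenized, original_sentence, sentence_idx in tokens:
        entities, entity_pairs = extract_entities_from_sentence(
            original_sentence, sentence_idx, [t[1] for t in tokens])
        if entity_pairs:
            doc_entity_pairs.append(entity_pairs)
        if entities:
            all_entities.append(entities)
    return all_entities, doc_entity_pairs


if __name__ == "__main__":
    ents, pairs = asyncio.run(
        get_entity_pairs("Oil was found near Houston. Exxon drilled the well."))
    print(ents)
    print(pairs)
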
Binary file added tests/data/image/Untitled 1 (2).pdf
Binary file not shown.