
Commit

removing redundant code
ngupta10 committed May 3, 2024
1 parent 867326c commit e895dba
Showing 7 changed files with 177 additions and 330 deletions.
30 changes: 0 additions & 30 deletions querent/core/base_engine.py
@@ -299,33 +299,3 @@ async def _stop_workers(self):
self.termination_event.set()
except Exception as e:
self.logger.error(f"Error while stopping workers: {e}")

async def get_doc_entity_pairs(self, content):
doc_entity_pairs = []
if content:
if self.fixed_entities:
content = self.entity_context_extractor.find_entity_sentences(content)
if self.fixed_relationships:
content = self.predicate_context_extractor.find_predicate_sentences(content)
tokens = self.ner_llm_instance._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
if entity_pairs:
doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
number_sentences = number_sentences + 1
else:
return

return doc_entity_pairs

async def get_ocr_entity_pairs(self, ocr_content):
entity_ocr = []
doc_entity_pairs_ocr = []
ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content)
for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
if entities:
entity_ocr.append(entities)
if entity_pairs:
doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
number_sentences = number_sentences + 1
26 changes: 15 additions & 11 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -123,6 +123,7 @@ def validate_ingested_images(data: IngestedImages) -> bool:

return True
async def process_images(self, data: IngestedImages):
print("Going to run the function from 000000000000000000")
doc_entity_pairs = []
doc_entity_pairs_ocr = []
entity_ocr = []
@@ -138,21 +139,25 @@ async def process_images(self, data: IngestedImages):
if data.text:
content = ' '.join(data.text)
file = data.file
ocr_content = ocr_text
ocr_content = ocr_text
print("Going to run the function from Mian1111111111", ocr_content)
if ocr_content or content:
(entity_ocr, doc_entity_pairs_ocr) = self.get_ocr_entity_pairs(ocr_content=ocr_content)
print("Going to run the function from Mian", ocr_content)
(entity_ocr, doc_entity_pairs_ocr) = await self.ner_llm_instance.get_entity_pairs(isConfinedSearch= self.isConfinedSearch,
content=ocr_content,
fixed_entities=self.fixed_entities,
sample_entities=self.sample_entities)
print("Results from Main-------------", doc_entity_pairs_ocr)
if len(doc_entity_pairs_ocr) >= 1:
results = doc_entity_pairs_ocr
elif len(doc_entity_pairs_ocr) == 0:
if content:
if self.fixed_entities:
content = self.entity_context_extractor.find_entity_sentences(content)
tokens = self.ner_llm_instance._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities)
if entity_pairs:
doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs))
number_sentences = number_sentences + 1
(_, doc_entity_pairs) = await self.ner_llm_instance.get_entity_pairs(isConfinedSearch= self.isConfinedSearch,
content=ocr_content,
fixed_entities=self.fixed_entities,
sample_entities=self.sample_entities)
if len(doc_entity_pairs) > 0 and len(entity_ocr) >=1:
results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)]
elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0:
@@ -166,10 +171,8 @@ async def process_images(self, data: IngestedImages):
return filtered_triples, file
else :
unique_id = str(hash(data.image))
for triple in filtered_triples:
for entity, info_json, second_entity in filtered_triples:
if not self.termination_event.is_set():
updated_data = []
entity, info_json, second_entity = triple
info = json.loads(info_json)
info['subject_type'] = info.pop('entity1_label')
info['object_type'] = info.pop('entity2_label')
@@ -191,6 +194,7 @@ async def process_images(self, data: IngestedImages):
else:
return
except Exception as e:
print("Exception -----------------", e)
self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

async def process_tables(self, data: IngestedTables):
@@ -111,26 +111,15 @@ async def process_images(self, data: IngestedImages):
if not result: return
else:
filtered_triples, file = result
for triple in filtered_triples:
if not self.termination_event.is_set():
updated_data = []
entity, info_json, second_entity = triple
info = json.loads(info_json)
info['subject_type'] = info.pop('entity1_label')
info['object_type'] = info.pop('entity2_label')
info['predicate'] = "has image"
info['predicate_type'] = "has image"
info['context_embeddings'] = self.create_emb.get_embeddings([info['context']])[0]
updated_json = json.dumps(info)
updated_tuple = (entity, updated_json, second_entity)
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(updated_tuple))
if graph_json:
current_state = EventState(event_type=EventType.Graph, timestamp=time.time(), payload=graph_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)
vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(updated_tuple, blob))
if vector_json:
current_state = EventState(event_type=EventType.Vector, timestamp=time.time(), payload=vector_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)
updated_tuple = self.nlp_model.final_ingested_images_tuples(filtered_triples)
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(updated_tuple))
if graph_json:
current_state = EventState(event_type=EventType.Graph, timestamp=time.time(), payload=graph_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)
vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(updated_tuple, blob))
if vector_json:
current_state = EventState(event_type=EventType.Vector, timestamp=time.time(), payload=vector_json, file=file, doc_source=doc_source, image_id=unique_id)
await self.set_state(new_state=current_state)

except Exception as e:
self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
26 changes: 13 additions & 13 deletions querent/ingestors/pdfs/pdf_ingestor_v1.py
Expand Up @@ -18,7 +18,7 @@

import pybase64
import pytesseract
import pdfplumber
# import pdfplumber


class PdfIngestorFactory(IngestorFactory):
@@ -130,18 +130,18 @@ async def extract_and_process_pdf(
f"Getting unknown error while handling this file: {collected_bytes.file} error - {exc}"
) from exc

async def extract_table(self, data):
with pdfplumber.open(io.BytesIO(data.data)) as pdf:
i = 0
for page in pdf.pages:
# Extract tables from the current page
tables = page.extract_tables()
i += 1
# for table in tables:
# if len(table) <= 1:
# continue

# yield IngestedTables(file= data.file, table = table, text=page.extract_text(), error=None, page_num= i)
# async def extract_table(self, data):
# with pdfplumber.open(io.BytesIO(data.data)) as pdf:
# i = 0
# for page in pdf.pages:
# # Extract tables from the current page
# tables = page.extract_tables()
# i += 1
# # for table in tables:
# # if len(table) <= 1:
# # continue

# # yield IngestedTables(file= data.file, table = table, text=page.extract_text(), error=None, page_num= i)

async def extract_img(self, doc, file_path, data, doc_source):
image_page_map = {}
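
Editor's note (not part of the commit): the hunk above comments out the pdfplumber-based table extraction rather than deleting it. For reference, a minimal standalone sketch of the pattern the disabled code followed, assuming the PDF arrives as raw bytes; pdfplumber.open and page.extract_tables are pdfplumber's own APIs, while extract_tables_from_pdf is a hypothetical helper name used only for illustration.

import io

import pdfplumber  # same third-party dependency the disabled code imported


def extract_tables_from_pdf(pdf_bytes: bytes):
    """Open a PDF from raw bytes and collect tables per page, mirroring
    the commented-out extract_table coroutine above."""
    tables_by_page = {}
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            tables = page.extract_tables()
            # The disabled code skipped single-row tables; do the same here.
            tables_by_page[page_num] = [t for t in tables if len(t) > 1]
    return tables_by_page
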
171 changes: 28 additions & 143 deletions querent/kg/ner_helperfunctions/ner_llm_transformer.py
@@ -1,3 +1,4 @@
import json
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
@@ -317,157 +318,15 @@ def extract_fixed_entities_from_chunk(self, chunk: List[str], fixed_entities: Li

return sorted(results, key=lambda x: x['start_idx'])


def extract_entities_from_sentence_for_given_sentence(self, sentence: str, sentence_idx: int, all_sentences: List[str], fixed_entities_flag: bool, fixed_entities: List[str],entity_types: List[str]):
try:
tokens = self.tokenize_sentence(sentence)
chunks = self.get_chunks(tokens)
all_page_entities = []

for chunk in chunks:
if fixed_entities_flag == False:
entities = self.extract_entities_from_chunk(chunk)
else:
entities = self.extract_fixed_entities_from_chunk(chunk,fixed_entities, entity_types)
all_page_entities.append(entities)

return all_page_entities

except Exception as e:
self.logger.error(f"Error extracting entities for an ocr sentence: {e}")

def get_max_score_entity_pair(self, entities_list):
max_score = 0
best_pair = None
for i in range(0, len(entities_list), 2):
pair_score = entities_list[i]['score'] + entities_list[i+1]['score']
if pair_score > max_score:
max_score = pair_score
best_pair = (entities_list[i], entities_list[i+1])
return best_pair

def compare_and_retrieve(self, entities_list, binary_pairs, single_entity):
best_pair = self.get_max_score_entity_pair(entities_list)

found_pair = False
for pair, context in binary_pairs:
if (single_entity['entity'] == best_pair[0]['entity'] or single_entity['entity'] == best_pair[1]['entity']):
found_pair = True
if found_pair and (
(pair[0]['entity'] == best_pair[0]['entity'] and pair[1]['entity'] == best_pair[1]['entity']) or
(pair[1]['entity'] == best_pair[0]['entity'] and pair[0]['entity'] == best_pair[1]['entity'])):
return {
"entity_pair": best_pair,
"context": context
}
if not found_pair:
return {"message": "No matching entity found in the best pair based on the single entity input."}
return None

def find_most_frequent_pair_with_entity(self, binary_pairs, ocr_entities):

result = []
for single_entity in ocr_entities:

# Filter pairs to include only those containing the single entity
filtered_pairs = []
for pair, context in binary_pairs:
if single_entity['entity'] in [pair[0]['entity'], pair[1]['entity']]:
# Since dictionaries are unhashable, convert pair to a tuple of sorted tuples to count occurrences
sorted_pair = tuple(sorted((pair[0]['entity'], pair[1]['entity'])))
filtered_pairs.append((sorted_pair, context))

# Count occurrences of each pair
pair_counter = Counter([pair for pair, _ in filtered_pairs])

# Find the pair with the maximum occurrence
if not pair_counter:
return None

most_frequent_pair, count = pair_counter.most_common(1)[0]

# Retrieve the context information for the most frequent pair
for pair, context in filtered_pairs:
if tuple(sorted((pair[0]['entity'], pair[1]['entity']))) == most_frequent_pair:
"""
"""
result.append({
"most_frequent_pair": most_frequent_pair,
"count": count,
"context": context
})

return result

def find_highest_scoring_pair(self, binary_pairs):
max_score = 0 # Initialize with a value lower than the lowest possible score (scores are usually non-negative).
highest_scoring_pair = None
highest_scoring_context = None

# Iterate through each pair and calculate the total score
for pair, context in binary_pairs:
total_score = pair[0]['score'] + pair[1]['score']
if total_score > max_score:
max_score = total_score
highest_scoring_pair = pair
highest_scoring_context = context

# Return the highest scoring pair along with its context and total score
if highest_scoring_pair:
return {
"highest_scoring_pair": highest_scoring_pair,
"total_score": max_score,
"context": highest_scoring_context
}
else:
return {"message": "No pairs found or all pairs have zero or negative scores."}


def find_most_frequent_entity_pair(self, binary_pairs):
# Initialize a counter to keep track of entity pair occurrences
pair_counter = Counter()

# Iterate through each pair and count each unique pair
for pair, context in binary_pairs:
# Create a canonical form for the pair (sorted by entity name to ensure (A,B) and (B,A) are treated the same)
sorted_pair = tuple(sorted((pair[0]['entity'], pair[1]['entity'])))
pair_counter[sorted_pair] += 1

# Find the pair with the highest occurrence
if not pair_counter:
return None

most_frequent_pair, count = pair_counter.most_common(1)[0]

# Collect all contexts where the most frequent pair appears
contexts = [context for pair, context in binary_pairs if tuple(sorted((pair[0]['entity'], pair[1]['entity']))) == most_frequent_pair]

# Return the most frequent pair along with its occurrence count and all associated contexts
return {
"most_frequent_pair": most_frequent_pair,
"occurrence_count": count,
"contexts": contexts
}

def filter_matching_entities(self, tuples_nested_list, entities_nested_list):
# Initialize the list to store matching tuples
matched_tuples = []

# Loop through each list of entities
for entities_list in entities_nested_list:
# Loop through each entity dictionary in the current list
for entity_dict in entities_list:
entity_name = entity_dict['entity'] # Extract the entity name

# Loop through each list of tuples
entity_name = entity_dict['entity']
for tuples_list in tuples_nested_list:
# Loop through each tuple in the current list
for tup in tuples_list:
# Check if the entity is in the 1st or 3rd element of the tuple
if entity_name in tup[0] or entity_name in tup[2]:
# Add the tuple to the result list if it's not already included
if tup not in matched_tuples:
matched_tuples.append(tup)

@@ -501,6 +360,32 @@ def extract_entities_from_sentence(self, sentence: str, sentence_idx: int, all_s
except Exception as e:
self.logger.error(f"Error extracting entities from sentence: {e}")


async def get_entity_pairs(self, isConfinedSearch, fixed_entities, sample_entities, content):
entity = []
doc_entity_pairs = []
tokens = self._tokenize_and_chunk(content)
for tokenized_sentence, original_sentence, sentence_idx in tokens:
(entities, entity_pairs,) = self.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],isConfinedSearch, fixed_entities, sample_entities)
if entity_pairs:
doc_entity_pairs.append(self.transform_entity_pairs(entity_pairs))
if entities:
entity.append(entities)
return (entities, doc_entity_pairs)

async def final_ingested_images_tuples(self, filtered_triples):
for entity, info_json, second_entity in filtered_triples:
if not self.termination_event.is_set():
info = json.loads(info_json)
info['subject_type'] = info.pop('entity1_label')
info['object_type'] = info.pop('entity2_label')
info['predicate'] = "has image"
info['predicate_type'] = "has image"
info['context_embeddings'] = self.create_emb.get_embeddings([info['context']])[0]
updated_json = json.dumps(info)
updated_tuple = (entity, updated_json, second_entity)
return updated_tuple

def remove_duplicates(self, data):
seen = set()
new_data = []
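
Editor's note (not part of the commit): the new get_entity_pairs helper above centralizes the per-sentence extraction loop that this commit deletes from base_engine.py (get_doc_entity_pairs, get_ocr_entity_pairs) and from bert_ner_opensourcellm.py. Below is a minimal, self-contained sketch of that control flow; tokenize_and_chunk and extract_entities_from_sentence are toy stand-ins for NER_LLM's own methods, not the library implementations, and the real helper additionally runs transform_entity_pairs on each pair list.

import asyncio
from typing import List, Tuple


def tokenize_and_chunk(content: str) -> List[Tuple[List[str], str, int]]:
    # Toy stand-in: split on periods and return (tokens, sentence, index).
    sentences = [s.strip() for s in content.split(".") if s.strip()]
    return [(sentence.split(), sentence, idx) for idx, sentence in enumerate(sentences)]


def extract_entities_from_sentence(sentence: str, sentence_idx: int, all_sentences: List[str]):
    # Toy stand-in: treat capitalized words as entities and pair neighbours.
    entities = [{"entity": word, "score": 1.0, "sentence_idx": sentence_idx}
                for word in sentence.split() if word[:1].isupper()]
    pairs = [(entities[i], entities[i + 1]) for i in range(len(entities) - 1)]
    return entities, pairs


async def get_entity_pairs(content: str):
    # Mirrors the control flow of the new helper: one pass over the
    # tokenized sentences, accumulating entities and entity pairs.
    all_entities, doc_entity_pairs = [], []
    tokens = tokenize_and_chunk(content)
    for _tokenized, original_sentence, sentence_idx in tokens:
        entities, entity_pairs = extract_entities_from_sentence(
            original_sentence, sentence_idx, [t[1] for t in tokens])
        if entity_pairs:
            doc_entity_pairs.append(entity_pairs)
        if entities:
            all_entities.append(entities)
    return all_entities, doc_entity_pairs


if __name__ == "__main__":
    ents, pairs = asyncio.run(
        get_entity_pairs("Oil was found near Houston. Exxon drilled the well."))
    print(ents)
    print(pairs)
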
Binary file added tests/data/image/Untitled 1 (2).pdf
Binary file not shown.