diff --git a/.gitignore b/.gitignore index b64fdaae..c6035ae6 100644 --- a/.gitignore +++ b/.gitignore @@ -200,3 +200,7 @@ lib/vis-9.1.2/vis-network.css lib/vis-9.1.2/vis-network.min.js tests/data/llm/cleaned_graph_event (copy).csv tests/data/llm/cleaned_graph_event1.csv +graph.png +my_subgraph_data.csv +subgraph_output_2.csv +subgraph_output.csv diff --git a/querent/core/transformers/bert_ner_opensourcellm.py b/querent/core/transformers/bert_ner_opensourcellm.py index 93532423..c4ececa7 100644 --- a/querent/core/transformers/bert_ner_opensourcellm.py +++ b/querent/core/transformers/bert_ner_opensourcellm.py @@ -113,8 +113,92 @@ def validate(self) -> bool: def process_messages(self, data: IngestedMessages): return super().process_messages(data) + @staticmethod + def validate_ingested_images(data: IngestedImages) -> bool: + if data.is_error(): + + return False + + return True async def process_images(self, data: IngestedImages): - return super().process_images(data) + doc_entity_pairs = [] + doc_entity_pairs_ocr = [] + entity_ocr = [] + number_sentences = 0 + try: + doc_source = data.doc_source + if not BERTLLM.validate_ingested_images(data): + self.set_termination_event() + return + if data.ocr_text: + ocr_text = ' '.join(data.ocr_text) + if data.text: + content = ' '.join(data.text) + file = data.file + ocr_content = ocr_text + if ocr_content or content: + ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content) + for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens: + (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities) + if entities: + entity_ocr.append(entities) + if entity_pairs: + doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs)) + number_sentences = number_sentences + 1 + if len(doc_entity_pairs_ocr) >= 1: + results = doc_entity_pairs_ocr + elif len(doc_entity_pairs_ocr) == 0: + if content: + if self.fixed_entities: + content = self.entity_context_extractor.find_entity_sentences(content) + tokens = self.ner_llm_instance._tokenize_and_chunk(content) + for tokenized_sentence, original_sentence, sentence_idx in tokens: + (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities) + if entity_pairs: + doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs)) + number_sentences = number_sentences + 1 + if len(doc_entity_pairs) > 0 and len(entity_ocr) >=1: + results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)] + elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0: + results = doc_entity_pairs + else: + return + if results: + doc_entity_pairs = self.ner_llm_instance.remove_duplicates(results) + filtered_triples = process_data(doc_entity_pairs, file) + if self.skip_inferences: + return filtered_triples, file + else : + unique_id = str(hash(data.image)) + for triple in filtered_triples: + if not self.termination_event.is_set(): + updated_data = [] + entity, info_json, second_entity = triple + info = json.loads(info_json) + info['subject_type'] = info.pop('entity1_label') + info['object_type'] = info.pop('entity2_label') + info['predicate'] = "has image" + info['predicate_type'] = "has image" + info['context_embeddings'] = self.create_emb.embeddings.embed_query(info['context']) + updated_json = json.dumps(info) + updated_tuple = (entity, updated_json, second_entity) + graph_json = TripleToJsonConverter.convert_graphjson(updated_tuple) + graph_json['unique_image_id'] = unique_id + graph_json = json.dumps(graph_json) + if graph_json: + current_state = EventState(EventType.Graph,1.0, graph_json, file, doc_source=doc_source) + await self.set_state(new_state=current_state) + vector_json = TripleToJsonConverter.convert_vectorjson(updated_tuple) + vector_json['unique_image_id'] = unique_id + vector_json = json.dumps(vector_json) + if vector_json: + current_state = EventState(EventType.Vector,1.0, vector_json, file, doc_source=doc_source) + await self.set_state(new_state=current_state) + else: + return + except Exception as e: + print("Exception -----------", e) + self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}") async def process_tables(self, data: IngestedTables): return super().process_tables(data) @@ -206,7 +290,7 @@ async def process_tokens(self, data: IngestedTokens): if not filtered_triples: return elif not self.skip_inferences: - relationships = self.semantic_extractor.process_tokens(filtered_triples) + relationships = self.semantic_extractor.process_tokens(filtered_triples[:1]) relationships = self.semantictriplefilter.filter_triples(relationships) if len(relationships) > 0: embedding_triples = self.create_emb.generate_embeddings(relationships) diff --git a/querent/core/transformers/fixed_entities_set_opensourcellm.py b/querent/core/transformers/fixed_entities_set_opensourcellm.py index dcf318fa..23cd6701 100644 --- a/querent/core/transformers/fixed_entities_set_opensourcellm.py +++ b/querent/core/transformers/fixed_entities_set_opensourcellm.py @@ -101,8 +101,7 @@ def process_tables(self, data: IngestedTables): async def process_images(self, data: IngestedImages): doc_entity_pairs = [] doc_entity_pairs_ocr = [] - entities_list = [] - final_entities_list = [] + entity_ocr = [] number_sentences = 0 try: doc_source = data.doc_source @@ -111,101 +110,44 @@ async def process_images(self, data: IngestedImages): return if data.ocr_text: ocr_text = ' '.join(data.ocr_text) - else: - ocr_text = data.ocr_text - if data.text: - clean_text = ' '.join(data.text) - else: - clean_text = data.text - - file, content = data.file, clean_text - - ocr_content = ocr_text - - if ocr_content: - if self.fixed_entities: - ocr_content = self.entity_context_extractor.find_entity_sentences(ocr_content) + content = ' '.join(data.text) + file = data.file + ocr_content = ocr_text + if ocr_content or content: ocr_tokens = self.ner_llm_instance._tokenize_and_chunk(ocr_content) for tokenized_sentence, original_sentence, sentence_idx in ocr_tokens: (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in ocr_tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities) - print("Entities ---------", entities) - print("Entities pairs ---------------------------", entity_pairs) + if entities: + entity_ocr.append(entities) if entity_pairs: doc_entity_pairs_ocr.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs)) - else: - continue number_sentences = number_sentences + 1 - - print("Doc entity pairs --------", doc_entity_pairs) - - if len(doc_entity_pairs_ocr) == 0 and len(ocr_content) != 0: + if len(doc_entity_pairs_ocr) >= 1: + results = doc_entity_pairs_ocr + elif len(doc_entity_pairs_ocr) == 0: if content: if self.fixed_entities: content = self.entity_context_extractor.find_entity_sentences(content) tokens = self.ner_llm_instance._tokenize_and_chunk(content) - doc_entity_pairs_ocr = self.ner_llm_instance.extract_entities_from_sentence_for_given_sentence(ocr_content, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities) - print("doc_entity_pairs_ocr-----------------------", doc_entity_pairs_ocr) for tokenized_sentence, original_sentence, sentence_idx in tokens: - #return list of entities from document, and entity pair - print("Here in side fo loop") - (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_chunk(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities) - print("Entity pairs found from content", entity_pairs) - print("Entities found from content", entities) + (entities, entity_pairs,) = self.ner_llm_instance.extract_entities_from_sentence(original_sentence, sentence_idx, [s[1] for s in tokens],self.isConfinedSearch, self.fixed_entities, self.sample_entities) if entity_pairs: - doc_entity_pairs.append(self.ner_llm_instance.transform_entity_pairs(entity_pairs)) - entities_list.append(entities) number_sentences = number_sentences + 1 - #process those entities and ocr entity here - #if FE, then find the one most occuring - #if not FE, find the entity pair, where 1 entity is OCR text, and other is any other entity, which is most occuring, or which has higher confidence - final_entities_list = self.ner_llm_instance.create_subject_object_sentence_tuples(doc_entity_pairs_ocr, entities_list) - - - elif len(ocr_content) == 0: - #highest confidence entity pair from page text - sample_entity_pair = [{'entity': 'Image', 'label': 'image_data', 'score': 1.0, 'start_idx': 1, 'noun_chunk': 'image', 'noun_chunk_length': 1}] - final_entities_list = self.ner_llm_instance.create_subject_object_sentence_tuples(sample_entity_pair, entities_list) - - - print("Final entities ------", final_entities_list) - #- - - if self.sample_entities: - doc_entity_pairs = self.entity_context_extractor.process_entity_types(doc_entities=final_entities_list) - if doc_entity_pairs and any(doc_entity_pairs): - doc_entity_pairs = self.ner_llm_instance.remove_duplicates(final_entities_list) - filtered_triples = process_data(doc_entity_pairs, file) - if not filtered_triples: - self.logger.debug("No entity pairs") - return - elif not self.skip_inferences: - relationships = self.semantic_extractor.process_tokens(filtered_triples) - self.logger.debug(f"length of relationships {len(relationships)}") - relationships = self.semantictriplefilter.filter_triples(relationships) - if len(relationships) > 0: - embedding_triples = self.create_emb.generate_embeddings(relationships) - if self.sample_relationships: - embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples) - for triple in embedding_triples: - if not self.termination_event.is_set(): - graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple)) - if graph_json: - current_state = EventState(EventType.Graph,1.0, graph_json, file, doc_source=doc_source) - await self.set_state(new_state=current_state) - vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple)) - if vector_json: - current_state = EventState(EventType.Vector,1.0, vector_json, file, doc_source=doc_source) - await self.set_state(new_state=current_state) - else: - return + if len(doc_entity_pairs) > 0 and len(entity_ocr) >=1: + results = [self.ner_llm_instance.filter_matching_entities(doc_entity_pairs, entity_ocr)] + elif len(doc_entity_pairs) > 0 and len(entity_ocr) == 0: + results = doc_entity_pairs else: - return - else: - return filtered_triples, file + return + if results: + doc_entity_pairs = self.ner_llm_instance.remove_duplicates(results) + filtered_triples = process_data(doc_entity_pairs, file) + if self.skip_inferences: + return filtered_triples, file else: - return + return except Exception as e: self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}") diff --git a/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py b/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py index 7afa77f6..8f39001a 100644 --- a/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py +++ b/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py @@ -102,15 +102,38 @@ async def process_images(self, data: IngestedImages): if not GPTLLM.validate_ingested_images(data): self.set_termination_event() return - + unique_id = str(hash(data.image)) doc_source = data.doc_source relationships = [] unique_keys = set() result = await self.llm_instance.process_images(data) - if not result: - return - - return None + if not result: return + else: + filtered_triples, file = result + for triple in filtered_triples: + if not self.termination_event.is_set(): + updated_data = [] + entity, info_json, second_entity = triple + info = json.loads(info_json) + info['subject_type'] = info.pop('entity1_label') + info['object_type'] = info.pop('entity2_label') + info['predicate'] = "has image" + info['predicate_type'] = "has image" + info['context_embeddings'] = self.create_emb.embeddings.embed_query(info['context']) + updated_json = json.dumps(info) + updated_tuple = (entity, updated_json, second_entity) + graph_json = TripleToJsonConverter.convert_graphjson(updated_tuple) + graph_json['unique_image_id'] = unique_id + graph_json = json.dumps(graph_json) + if graph_json: + current_state = EventState(EventType.Graph,1.0, graph_json, file, doc_source=doc_source) + await self.set_state(new_state=current_state) + vector_json = TripleToJsonConverter.convert_vectorjson(updated_tuple) + vector_json['unique_image_id'] = unique_id + vector_json = json.dumps(vector_json) + if vector_json: + current_state = EventState(EventType.Vector,1.0, vector_json, file, doc_source=doc_source) + await self.set_state(new_state=current_state) except Exception as e: self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}") diff --git a/querent/kg/ner_helperfunctions/ner_llm_transformer.py b/querent/kg/ner_helperfunctions/ner_llm_transformer.py index 2370ff20..c7e166b1 100644 --- a/querent/kg/ner_helperfunctions/ner_llm_transformer.py +++ b/querent/kg/ner_helperfunctions/ner_llm_transformer.py @@ -319,7 +319,6 @@ def extract_fixed_entities_from_chunk(self, chunk: List[str], fixed_entities: Li def extract_entities_from_sentence_for_given_sentence(self, sentence: str, sentence_idx: int, all_sentences: List[str], fixed_entities_flag: bool, fixed_entities: List[str],entity_types: List[str]): - print("Extracting entity pair") try: tokens = self.tokenize_sentence(sentence) chunks = self.get_chunks(tokens) @@ -452,33 +451,27 @@ def find_most_frequent_entity_pair(self, binary_pairs): "contexts": contexts } - def create_subject_object_sentence_tuples(self, ocr_entities, entity_list): - # Prepare the list to hold the result tuples - results = [] - - for single_entity in ocr_entities: - - # Iterate through each entity in the list - for entity in entity_list: - # Create a tuple with the single entity as 'subject', the current entity as 'object', and use the 'sentence' from the object entity - if 'sentence' in entity: - result_tuple = ( - single_entity, - entity['sentence'], - entity - ) - - results.append(result_tuple) - else: - # Handle cases where 'sentence' might not be present in the entity dictionary - result_tuple = ( - single_entity, - entity, - "No sentence available" - ) - results.append(result_tuple) - - return results + def filter_matching_entities(self, tuples_nested_list, entities_nested_list): + # Initialize the list to store matching tuples + matched_tuples = [] + + # Loop through each list of entities + for entities_list in entities_nested_list: + # Loop through each entity dictionary in the current list + for entity_dict in entities_list: + entity_name = entity_dict['entity'] # Extract the entity name + + # Loop through each list of tuples + for tuples_list in tuples_nested_list: + # Loop through each tuple in the current list + for tup in tuples_list: + # Check if the entity is in the 1st or 3rd element of the tuple + if entity_name in tup[0] or entity_name in tup[2]: + # Add the tuple to the result list if it's not already included + if tup not in matched_tuples: + matched_tuples.append(tup) + + return matched_tuples diff --git a/querent/kg/rel_helperfunctions/embedding_store.py b/querent/kg/rel_helperfunctions/embedding_store.py index b87663e8..26350854 100644 --- a/querent/kg/rel_helperfunctions/embedding_store.py +++ b/querent/kg/rel_helperfunctions/embedding_store.py @@ -134,12 +134,6 @@ def get_embeddings(self, texts): else: payload = {"inputs": text} embedding = self.query(payload) - if isinstance(self.embeddings,HuggingFaceEmbeddings) or isinstance(self.embeddings, HuggingFaceInferenceAPIEmbeddings) : - embedding = self.embeddings.embed_query(text) - embeddings.append(embedding) - else: - payload = {"inputs": text} - embedding = self.query(payload) return embeddings except Exception as e: self.logger.error(f"Failed to generate embeddings: {e}") diff --git a/tests/data/Untitled 1.pdf b/tests/data/Untitled 1.pdf new file mode 100644 index 00000000..fc7d5f7e Binary files /dev/null and b/tests/data/Untitled 1.pdf differ diff --git a/tests/data/image/Untitled 1 (1).pdf b/tests/data/image/Untitled 1 (1).pdf new file mode 100644 index 00000000..18fc2209 Binary files /dev/null and b/tests/data/image/Untitled 1 (1).pdf differ diff --git a/tests/data/image/Untitled 1.pdf b/tests/data/image/Untitled 1.pdf new file mode 100644 index 00000000..35c17e31 Binary files /dev/null and b/tests/data/image/Untitled 1.pdf differ diff --git a/tests/workflows/bert_ingested_images_test.py b/tests/workflows/bert_ingested_images_test.py new file mode 100644 index 00000000..9233af89 --- /dev/null +++ b/tests/workflows/bert_ingested_images_test.py @@ -0,0 +1,117 @@ +# import asyncio +# from asyncio import Queue +# import json +# from pathlib import Path +# from querent.callback.event_callback_interface import EventCallbackInterface +# from querent.collectors.fs.fs_collector import FSCollectorFactory +# from querent.common.types.ingested_tokens import IngestedTokens +# from querent.common.types.querent_event import EventState, EventType +# from querent.config.collector.collector_config import FSCollectorConfig +# from querent.common.uri import Uri +# from querent.config.core.llm_config import LLM_Config +# from querent.core.transformers.fixed_entities_set_opensourcellm import Fixed_Entities_LLM +# from querent.ingestors.ingestor_manager import IngestorFactoryManager +# import pytest +# import uuid +# from querent.common.types.file_buffer import FileBuffer +# from querent.core.transformers.bert_ner_opensourcellm import BERTLLM +# from querent.processors.text_cleanup_processor import TextCleanupProcessor +# from querent.querent.resource_manager import ResourceManager +# from querent.querent.querent import Querent +# import time +# # from querent.storage.milvus_vectorevent_storage import MilvusDBConnection +# from querent.config.core.gpt_llm_config import GPTConfig +# from querent.core.transformers.gpt_llm_bert_ner_or_fixed_entities_set_ner import GPTLLM + +# @pytest.mark.asyncio +# async def test_ingest_all_async(): +# # Set up the collectors +# # db_conn = DatabaseConnection(dbname="postgres", +# # user="postgres", +# # password="querent", +# # host="localhost", +# # port="5432") +# # # ml_conn = MilvusDBConnection() +# directories = [ "/home/nishantg/querent-main/querent/tests/data/image"] +# collectors = [ +# FSCollectorFactory().resolve( +# Uri("file://" + str(Path(directory).resolve())), +# FSCollectorConfig(config_source={ +# "id": str(uuid.uuid4()), +# "root_path": directory, +# "name": "Local-config", +# "config": {}, +# "backend": "localfile", +# "uri": "file://", +# }), +# ) +# for directory in directories +# ] + +# # Set up the result queue +# result_queue = asyncio.Queue() +# text_cleanup_processor = TextCleanupProcessor() +# # Create the IngestorFactoryManager +# ingestor_factory_manager = IngestorFactoryManager( +# collectors=collectors, result_queue=result_queue, processors=[text_cleanup_processor] +# ) +# ingest_task = asyncio.create_task(ingestor_factory_manager.ingest_all_async()) +# resource_manager = ResourceManager() +# gpt_llm_config = LLM_Config( +# ner_model_name="botryan96/GeoBERT", +# # rel_model_path="/home/nishantg/Downloads/openhermes-2.5-mistral-7b.Q5_K_M.gguf", +# enable_filtering=True +# # openai_api_key="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" +# ,filter_params={ +# 'score_threshold': 0.5, +# 'attention_score_threshold': 0.1, +# 'similarity_threshold': 0.5, +# 'min_cluster_size': 5, +# 'min_samples': 3, +# 'cluster_persistence_threshold':0.2 +# } +# ,user_context="Query: Your task is to analyze and interpret the context to construct semantic triples. Please Identify the entity which is the subject and the entity which is object based on the context, and determine the meaningful relationship or predicate linking the subject entity to the object entity. Determine whether the entity labels provided match the subject type and object type and correct if needed. Also provide the predicate type. Answer:" +# ,fixed_entities =["modeling", "sonic", "symmetry","isotropy", "Carbonate", "Clastic", "Porosity", "Permeability", "Oil saturation", "Water saturation", "Gas saturation", "Depth", "Size", "Temperature", "Pressure", "Oil viscosity", "Gas-oil ratio", "Water cut", "Recovery factor", "Enhanced recovery technique", "Horizontal drilling", "Hydraulic fracturing", "Water injection", "Gas injection", "Steam injection", "Seismic activity", "Structural deformation", "Faulting", "Cap rock integrity", "Compartmentalization", "Connectivity", "Production rate", "Depletion rate", "Exploration technique", "Drilling technique", "Completion technique", "Environmental impact", "Regulatory compliance", "Economic analysis", "Market analysis", "oil well", "gas well", "oil field", "Gas field", "eagle ford shale", "ghawar", "johan sverdrup", "karachaganak", "maracaibo"], +# sample_entities = ["method","method","method","method", "rock_type", "rock_type", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_characteristic", "reservoir_characteristic", "reservoir_characteristic", "reservoir_characteristic", "reservoir_property", "reservoir_property", "production_metric", "production_metric", "recovery_technique", "drilling_technique", "recovery_technique", "recovery_technique", "recovery_technique", "recovery_technique", "geological_feature", "geological_feature", "geological_feature", "reservoir_feature", "reservoir_feature", "reservoir_feature", "production_metric", "production_metric", "exploration_method", "drilling_method", "completion_method", "environmental_aspect", "regulatory_aspect", "economic_aspect", "economic_aspect", "hydrocarbon_source", "hydrocarbon_source", "hydrocarbon_source", "hydrocarbon_source", "reservoir", "reservoir", "reservoir", "reservoir", "reservoir"] +# , is_confined_search = True +# , huggingface_token = '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' +# ) +# llm_instance = BERTLLM(result_queue, gpt_llm_config) +# class StateChangeCallback(EventCallbackInterface): +# def handle_event(self, event_type: EventType, event_state: EventState): +# # assert event_state.event_type == EventType.Graph +# if event_state['event_type'] == EventType.Graph : +# triple = json.loads(event_state['payload']) +# print("file---------------------",event_state['file'], "----------------", type(event_state['file'])) +# print("triple: {}".format(triple)) +# graph_event_data = { +# 'subject': triple['subject'], +# 'subject_type': triple['subject_type'], +# 'object': triple['object'], +# 'object_type': triple['object_type'], +# 'predicate': triple['predicate'], +# 'predicate_type': triple['predicate_type'], +# 'sentence': triple['sentence'], +# 'document_id': event_state['file'] +# } +# # db_conn.insert_graph_event(graph_event_data) +# assert isinstance(triple['subject'], str) and triple['subject'] +# # else : +# # vector_triple = json.loads(event_state.payload) +# # print("Inside Vector event ---------------------------------", vector_triple) +# # milvus_coll = ml_conn.create_collection(collection_name=vector_triple['namespace'],dim = 384) +# # ml_conn.insert_vector_event(id = vector_triple['id'], embedding= vector_triple['embeddings'], namespace= vector_triple['namespace'], document=event_state.file, collection= milvus_coll ) +# llm_instance.subscribe(EventType.Graph, StateChangeCallback()) +# # llm_instance.subscribe(EventType.Vector, StateChangeCallback()) +# querent = Querent( +# [llm_instance], +# resource_manager=resource_manager, +# ) +# querent_task = asyncio.create_task(querent.start()) +# await asyncio.gather(ingest_task, querent_task) +# # db_conn.close() + +# if __name__ == "__main__": + +# # Run the async function +# asyncio.run(test_ingest_all_async()) diff --git a/tests/workflows/ingested_images_test.py b/tests/workflows/openai_ingested_images_test.py similarity index 75% rename from tests/workflows/ingested_images_test.py rename to tests/workflows/openai_ingested_images_test.py index b395d894..f47280b6 100644 --- a/tests/workflows/ingested_images_test.py +++ b/tests/workflows/openai_ingested_images_test.py @@ -32,7 +32,7 @@ # # host="localhost", # # port="5432") # # # ml_conn = MilvusDBConnection() -# directories = [ "/home/ansh/pyg-trail/testing-ocr"] +# directories = [ "/home/nishantg/querent-main/querent/tests/data/image"] # collectors = [ # FSCollectorFactory().resolve( # Uri("file://" + str(Path(directory).resolve())), @@ -71,8 +71,8 @@ # 'cluster_persistence_threshold':0.2 # } # ,user_context="Query: Your task is to analyze and interpret the context to construct semantic triples. Please Identify the entity which is the subject and the entity which is object based on the context, and determine the meaningful relationship or predicate linking the subject entity to the object entity. Determine whether the entity labels provided match the subject type and object type and correct if needed. Also provide the predicate type. Answer:" -# ,fixed_entities =["Carbonate", "Clastic", "Porosity", "Permeability", "Oil saturation", "Water saturation", "Gas saturation", "Depth", "Size", "Temperature", "Pressure", "Oil viscosity", "Gas-oil ratio", "Water cut", "Recovery factor", "Enhanced recovery technique", "Horizontal drilling", "Hydraulic fracturing", "Water injection", "Gas injection", "Steam injection", "Seismic activity", "Structural deformation", "Faulting", "Cap rock integrity", "Compartmentalization", "Connectivity", "Production rate", "Depletion rate", "Exploration technique", "Drilling technique", "Completion technique", "Environmental impact", "Regulatory compliance", "Economic analysis", "Market analysis", "oil well", "gas well", "oil field", "Gas field", "eagle ford shale", "ghawar", "johan sverdrup", "karachaganak", "maracaibo"], -# sample_entities = ["rock_type", "rock_type", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_characteristic", "reservoir_characteristic", "reservoir_characteristic", "reservoir_characteristic", "reservoir_property", "reservoir_property", "production_metric", "production_metric", "recovery_technique", "drilling_technique", "recovery_technique", "recovery_technique", "recovery_technique", "recovery_technique", "geological_feature", "geological_feature", "geological_feature", "reservoir_feature", "reservoir_feature", "reservoir_feature", "production_metric", "production_metric", "exploration_method", "drilling_method", "completion_method", "environmental_aspect", "regulatory_aspect", "economic_aspect", "economic_aspect", "hydrocarbon_source", "hydrocarbon_source", "hydrocarbon_source", "hydrocarbon_source", "reservoir", "reservoir", "reservoir", "reservoir", "reservoir"] +# ,fixed_entities =["modeling", "sonic", "symmetry","isotropy", "Carbonate", "Clastic", "Porosity", "Permeability", "Oil saturation", "Water saturation", "Gas saturation", "Depth", "Size", "Temperature", "Pressure", "Oil viscosity", "Gas-oil ratio", "Water cut", "Recovery factor", "Enhanced recovery technique", "Horizontal drilling", "Hydraulic fracturing", "Water injection", "Gas injection", "Steam injection", "Seismic activity", "Structural deformation", "Faulting", "Cap rock integrity", "Compartmentalization", "Connectivity", "Production rate", "Depletion rate", "Exploration technique", "Drilling technique", "Completion technique", "Environmental impact", "Regulatory compliance", "Economic analysis", "Market analysis", "oil well", "gas well", "oil field", "Gas field", "eagle ford shale", "ghawar", "johan sverdrup", "karachaganak", "maracaibo"], +# sample_entities = ["method","method","method","method", "rock_type", "rock_type", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_property", "reservoir_characteristic", "reservoir_characteristic", "reservoir_characteristic", "reservoir_characteristic", "reservoir_property", "reservoir_property", "production_metric", "production_metric", "recovery_technique", "drilling_technique", "recovery_technique", "recovery_technique", "recovery_technique", "recovery_technique", "geological_feature", "geological_feature", "geological_feature", "reservoir_feature", "reservoir_feature", "reservoir_feature", "production_metric", "production_metric", "exploration_method", "drilling_method", "completion_method", "environmental_aspect", "regulatory_aspect", "economic_aspect", "economic_aspect", "hydrocarbon_source", "hydrocarbon_source", "hydrocarbon_source", "hydrocarbon_source", "reservoir", "reservoir", "reservoir", "reservoir", "reservoir"] # , is_confined_search = True # , huggingface_token = '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' # )