diff --git a/.gitignore b/.gitignore
index b64fdaae..c6035ae6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -200,3 +200,7 @@ lib/vis-9.1.2/vis-network.css
 lib/vis-9.1.2/vis-network.min.js
 tests/data/llm/cleaned_graph_event (copy).csv
 tests/data/llm/cleaned_graph_event1.csv
+graph.png
+my_subgraph_data.csv
+subgraph_output_2.csv
+subgraph_output.csv
diff --git a/querent/core/transformers/bert_ner_opensourcellm.py b/querent/core/transformers/bert_ner_opensourcellm.py
index 2b8611f2..461b350d 100644
--- a/querent/core/transformers/bert_ner_opensourcellm.py
+++ b/querent/core/transformers/bert_ner_opensourcellm.py
@@ -148,6 +148,7 @@ async def process_tokens(self, data: IngestedTokens):
             content = clean_text
             file = data.get_file_path()
             if content:
+                print("BERT Content -----------------------", content)
                 if self.fixed_entities:
                     content = self.entity_context_extractor.find_entity_sentences(content)
                 tokens = self.ner_llm_instance._tokenize_and_chunk(content)
@@ -158,9 +159,11 @@ async def process_tokens(self, data: IngestedTokens):
                         number_sentences = number_sentences + 1
             else:
                 return
+            print("Doc Entity Pairs -------------------", doc_entity_pairs)
             if self.sample_entities:
                 doc_entity_pairs = self.entity_context_extractor.process_entity_types(doc_entities=doc_entity_pairs)
             if any(doc_entity_pairs):
+                print("Found doc entity pairs")
                 doc_entity_pairs = self.ner_llm_instance.remove_duplicates(doc_entity_pairs)
                 pairs_withattn = self.attn_scores_instance.extract_and_append_attention_weights(doc_entity_pairs)
                 if self.enable_filtering == True and not self.entity_context_extractor and self.count_entity_pairs(pairs_withattn)>1 and not self.predicate_context_extractor:
@@ -184,8 +187,11 @@ async def process_tokens(self, data: IngestedTokens):
                 if not filtered_triples:
                     return
                 elif not self.skip_inferences:
-                    relationships = self.semantic_extractor.process_tokens(filtered_triples, fixed_entities=(len(self.sample_entities) >= 1))
+                    print("Going to run BERT")
+                    relationships = self.semantic_extractor.process_tokens(filtered_triples[:5], fixed_entities=(len(self.sample_entities) >= 1))
+                    print("Found these relationships ----", relationships)
                     relationships = self.semantictriplefilter.filter_triples(relationships)
+                    print("Relationships after filtering ----", relationships)
                     if len(relationships) > 0:
                         if self.fixed_relationships and self.sample_relationships:
                             embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True, generate_embeddings_with_fixed_relationship = True)
@@ -194,6 +200,7 @@ async def process_tokens(self, data: IngestedTokens):
                         else:
                             embedding_triples = self.create_emb.generate_embeddings(relationships)
                         if self.sample_relationships:
+                            print("Inside BERT going to compute scores")
                             embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
                         for triple in embedding_triples:
                             if not self.termination_event.is_set():
@@ -214,4 +221,5 @@ async def process_tokens(self, data: IngestedTokens):
             else:
                 return
         except Exception as e:
+            print("Exception in BERT: -----------------------------", e)
             self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
diff --git a/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py b/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py
index fa210559..5259e2fa 100644
--- a/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py
+++ b/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py
@@ -232,6 +232,7 @@ async def process_triples(self, context, entity1, entity2, entity1_label, entity
             {"role": "user", "content": identify_entity_message},
             {"role": "user", "content": self.user_context},
         ]
+        print("GPT LLM prompt message -------------------------", messages_classify_entity)
         identify_predicate_response = self.generate_response(
             messages_classify_entity,
             "predicate_info"
@@ -292,11 +293,13 @@ async def process_tokens(self, data: IngestedTokens):
             doc_source = data.doc_source
             relationships = []
             unique_keys = set()
+            print("Inside GPT-----------------------")
             result = await self.llm_instance.process_tokens(data)
             if not result: return
             else:
                 filtered_triples, file = result
                 modified_data = GPTLLM.remove_items_from_tuples(filtered_triples)
+                print("Data in GPT------------------------", modified_data[:1])
                 for entity1, context_json, entity2 in modified_data:
                     context_data = json.loads(context_json)
                     context = context_data.get("context", "")
@@ -313,12 +316,15 @@ async def process_tokens(self, data: IngestedTokens):
                         relationships.append(output_tuple)
                 if len(relationships) > 0:
                     if self.fixed_relationships and self.sample_relationships:
+                        print("Both fixed and sample relationships are set -----")
                         embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True, generate_embeddings_with_fixed_relationship = True)
                     elif self.sample_relationships:
+                        print("Only sample relationships are set -----")
                         embedding_triples = self.create_emb.generate_embeddings(relationships, relationship_finder=True)
                     else:
                         embedding_triples = self.create_emb.generate_embeddings(relationships)
                     if self.sample_relationships:
+                        print("Going to compute scores------------------------------")
                         embedding_triples = self.predicate_context_extractor.update_embedding_triples_with_similarity(self.predicate_json_emb, embedding_triples)
                     for triple in embedding_triples:
                         if not self.termination_event.is_set():
@@ -335,6 +341,7 @@ async def process_tokens(self, data: IngestedTokens):
             else:
                 return
         except Exception as e:
+            print("Exception in GPT-----------------------", e)
            self.logger.error(f"Invalid {self.__class__.__name__} configuration. Unable to extract predicates using GPT. {e}")
            raise Exception(f"An error occurred while extracting predicates using GPT: {e}")
diff --git a/querent/kg/ner_helperfunctions/fixed_predicate.py b/querent/kg/ner_helperfunctions/fixed_predicate.py
index 2ad05d12..50a0ffaf 100644
--- a/querent/kg/ner_helperfunctions/fixed_predicate.py
+++ b/querent/kg/ner_helperfunctions/fixed_predicate.py
@@ -161,6 +161,8 @@ def update_embedding_triples_with_similarity(self, predicate_json_emb, embedding
             predicate_emb_list = [item["predicate_emb"] for item in predicate_json_emb if item["predicate_emb"] != "Not Implemented"]
             predicate_emb_matrix = np.array(predicate_emb_list)
             updated_embedding_triples = []
+            print("Embedding Triples with Similarity ----", embedding_triples)
+            print("Predicate Matrix ----", predicate_emb_matrix)
             for triple in embedding_triples:
                 entity, triple_json, study_field = triple
                 triple_data = json.loads(triple_json)
@@ -173,12 +175,14 @@ def update_embedding_triples_with_similarity(self, predicate_json_emb, embedding
                     similarities = cosine_similarity(current_predicate_emb, predicate_emb_matrix)
                     max_similarity_index = np.argmax(similarities)
                     most_similar_predicate_details = predicate_json_emb[max_similarity_index]
+                    print("Similarities -------", similarities)
                     if similarities[0][max_similarity_index] > 0.5:
                         triple_data["predicate_type"] = most_similar_predicate_details["type"]
                         if most_similar_predicate_details["relationship"].lower() != "unlabelled":
                             triple_data["predicate"] = most_similar_predicate_details["relationship"]
                     updated_triple_json = json.dumps(triple_data)
                     updated_embedding_triples.append((entity, updated_triple_json, study_field))
+            print("updated_embedding_triples------------", updated_embedding_triples)
             return updated_embedding_triples
         except Exception as e:
             raise Exception(f"Error processing predicate types: {e}")
diff --git a/setup.py b/setup.py
index 7b05eeff..13b61e9c 100644
--- a/setup.py
+++ b/setup.py
@@ -83,7 +83,7 @@
 setup(
     name="querent",
-    version="3.0.3",
+    version="3.0.4",
     author="Querent AI",
     description="The Asynchronous Data Dynamo and Graph Neural Network Catalyst",
     long_description=long_description,
diff --git a/tests/traverser/__init__.py b/tests/traverser/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/traverser/kge.py b/tests/traverser/kge.py
new file mode 100644
index 00000000..119f9b69
--- /dev/null
+++ b/tests/traverser/kge.py
@@ -0,0 +1,108 @@
+import torch
+import torch.nn as nn
+from transformers import BertModel, BertTokenizer
+import pandas as pd
+from torch.nn.functional import cosine_similarity
+import random
+import numpy as np
+
+# Set seeds for reproducibility
+random.seed(42)
+np.random.seed(42)
+torch.manual_seed(42)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+
+class TextEnhancedKGE(nn.Module):
+    def __init__(self, entity_dim, relation_dim, entity_to_idx, relation_to_idx, bert_model_name='bert-base-uncased'):
+        super(TextEnhancedKGE, self).__init__()
+        self.entity_embeddings = nn.Embedding(len(entity_to_idx), entity_dim)
+        self.relation_embeddings = nn.Embedding(len(relation_to_idx), relation_dim)
+        self.sentence_projection = nn.Linear(768, relation_dim)
+        self.bert_model = BertModel.from_pretrained(bert_model_name)
+        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+        self.score_layer = nn.Linear(entity_dim * 2 + relation_dim, 1)
+        self.combined_projection = nn.Linear(entity_dim * 2 + relation_dim, 768)
+
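+        # sentence_projection maps BERT's 768-dim [CLS] vector into the relation
+        # space; combined_projection maps the concatenated (head, relation +
+        # sentence, tail) vector back to 768 dims so query() can compare it
+        # against a raw BERT query embedding.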
+        nn.init.xavier_uniform_(self.entity_embeddings.weight)
+        nn.init.xavier_uniform_(self.relation_embeddings.weight)
+        nn.init.xavier_uniform_(self.sentence_projection.weight)
+        nn.init.xavier_uniform_(self.combined_projection.weight)
+
+    def sentence_to_embedding(self, sentences):
+        inputs = self.bert_tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=500)
+        outputs = self.bert_model(**inputs)
+        return outputs.last_hidden_state[:, 0, :].squeeze()
+
+    def forward(self, heads, relations, tails, sentences):
+        print("Heads: ", heads)
+        print("Relations: ", relations)
+        print("Tails: ", tails)
+        print("Sentences: ", sentences)
+        head_embeddings = self.entity_embeddings(heads)
+        print("head embeddings", head_embeddings)
+        relation_embeddings = self.relation_embeddings(relations)
+        tail_embeddings = self.entity_embeddings(tails)
+
+        sentence_embeddings = self.sentence_to_embedding(sentences)
+        projected_sentences = self.sentence_projection(sentence_embeddings)
+
+        score = self.calculate_score(head_embeddings, relation_embeddings, tail_embeddings, projected_sentences)
+        return score
+
+    def calculate_score(self, head_embeddings, relation_embeddings, tail_embeddings, sentence_embeddings):
+        combined_embeddings = torch.cat([head_embeddings, relation_embeddings + sentence_embeddings, tail_embeddings], dim=1)
+        return self.score_layer(combined_embeddings)
+
+    def query(self, query_text, heads, relations, tails, sentences):
+        query_embedding = self.sentence_to_embedding([query_text]).unsqueeze(0)
+        sentence_embeddings = self.sentence_to_embedding(sentences)
+        projected_sentences = self.sentence_projection(sentence_embeddings)
+
+        head_embeddings = self.entity_embeddings(heads)
+        relation_embeddings = self.relation_embeddings(relations)
+        tail_embeddings = self.entity_embeddings(tails)
+
+        relation_plus_sentence = relation_embeddings + projected_sentences
+        combined_embeddings = torch.cat([head_embeddings, relation_plus_sentence, tail_embeddings], dim=1)
+
+        if combined_embeddings.shape[-1] != query_embedding.shape[-1]:
+            combined_embeddings = self.combined_projection(combined_embeddings)
+
+        similarities = cosine_similarity(query_embedding, combined_embeddings, dim=-1)
+        return similarities
+
+# Load your data
+data = pd.read_csv('my_subgraph_data.csv')
+print(data.head())
+print(data.columns)
+# Create mappings
+entity_to_idx = {entity: idx for idx, entity in enumerate(pd.concat([data['Node Start'], data['Node End']]).unique())}
+relation_to_idx = {relation: idx for idx, relation in enumerate(data['Relationship Type'].unique())}
+print('entity_to_idx:', entity_to_idx)
+print('relation_to_idx:', relation_to_idx)
+
+# Initialize the model
+model = TextEnhancedKGE(
+    entity_dim=100,
+    relation_dim=100,
+    entity_to_idx=entity_to_idx,
+    relation_to_idx=relation_to_idx
+)
+
+# Prepare data for the model
+heads = torch.LongTensor(data['Node Start'].map(entity_to_idx).values)
+relations = torch.LongTensor(data['Relationship Type'].map(relation_to_idx).values)
+tails = torch.LongTensor(data['Node End'].map(entity_to_idx).values)
+sentences = data['Sentence'].tolist()
+
+# Calculate scores
+scores = model(heads, relations, tails, sentences)
+print(scores)
+
+# User query
+query_text = "How does hydraulic fracturing enhance porosity?"
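+# query() embeds the question with BERT, projects every stored triple into the
+# same 768-dim space, and ranks the triples by cosine similarity to the question.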
+similarities = model.query(query_text, heads, relations, tails, sentences)
+top_matches = similarities.topk(10)  # Get the top 10 matches as per revised requirement
+print(top_matches)
diff --git a/tests/traverser/traverser.py b/tests/traverser/traverser.py
new file mode 100644
index 00000000..bfe2bae9
--- /dev/null
+++ b/tests/traverser/traverser.py
@@ -0,0 +1,235 @@
+from neo4j import GraphDatabase
+import matplotlib.pyplot as plt
+# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
+import csv
+import pandas as pd
+from neo4j import GraphDatabase
+import networkx as nx
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from sentence_transformers import SentenceTransformer
+
+class Neo4jConnection:
+    def __init__(self, uri, user, password):
+        self.__uri = uri
+        self.__user = user
+        self.__password = password
+        self.__driver = None
+        self.model = SentenceTransformer('all-MiniLM-L6-v2')
+        try:
+            self.__driver = GraphDatabase.driver(self.__uri, auth=(user, password))
+        except Exception as e:
+            print("Failed to create the driver:", e)
+
+    def close(self):
+        if self.__driver is not None:
+            self.__driver.close()
+
+    def sentence_to_embedding(self, sentence):
+        """Converts a sentence to an embedding"""
+        return self.model.encode(sentence, convert_to_tensor=False)
+
+    def extract_subgraph_to_csv(self, query, tags, output_file):
+        with self.__driver.session() as session:
+            result = session.run(query, tags=list(tags))
+            with open(output_file, 'w', newline='') as file:
+                writer = csv.writer(file)
+                # Write headers
+                writer.writerow(['Node Start', 'Relationship Type', 'Node End'])
+
+                for record in result:
+                    node_start = record["n"]["name"]
+                    node_end = record["m"]["name"]
+                    if isinstance(record["r"], list):  # Handling multiple relationships
+                        for rel in record["r"]:
+                            writer.writerow([node_start, rel.type, node_end])
+                    else:  # Handling a single relationship
+                        writer.writerow([node_start, record["r"].type, node_end])
+
+    def fetch_triples(self, knowledge):
+        subject, relationship, obj = knowledge.split('-')
+        subject = subject.replace('_', ' ')
+        obj = obj.replace('_', ' ')
+        relationship = relationship.replace('_', ' ')
+        query = """
+        MATCH (n)-[r]->(m)
+        WHERE type(r) = $relationship AND n.name = $subject AND m.name = $object
+        RETURN n.name, m.name
+        """
+        with self.__driver.session() as session:
+            result = session.run(query, subject=subject, relationship=relationship, object=obj)
+            nodes = set()
+            for record in result:
+                nodes.add(record['n.name'])
+                nodes.add(record['m.name'])
+            return list(nodes)
+
+    def explore_connections(self, node_names, query_sentence, top_n=10, similarity_threshold=0.7):
+        query = """MATCH (n)-[r]->(m)
+WHERE n.name IN $node_names AND m.name IN $node_names AND n <> m
+RETURN n, r, m, r.sentence AS sentence, r.document_id AS document_id, r.predicate_type AS predicate_type, r.score AS score ORDER BY r.score DESC"""
+        output = []
+
+        with self.__driver.session() as session:
+            result = session.run(query, node_names=node_names)
+            df = pd.DataFrame([{'Node Start': record['n']['name'],
+                                'Relationship Type': record['r'].type,
+                                'Node End': record['m']['name'],
+                                'Sentence': record['r']['sentence'],
+                                'Document ID': record['r']['document_id'],
+                                'Predicate Type': record['r']['predicate_type'],
+                                'Score': record['r']['score']} for record in result])
+
+            # Sanitize column names
+            df.columns = [col.strip() for col in df.columns]
+
+            # Group by relationship and take top N by score
+            df_top_n = df.groupby(['Node Start', 'Relationship Type', 'Node End']).head(top_n)
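+            # head(top_n) relies on the ORDER BY r.score DESC in the Cypher query,
+            # so the first rows of each group are already the highest-scoring edges.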
+            # Now apply select_dominant_sentence for each group
+            df_top_n_unique = df_top_n.groupby(['Node Start', 'Relationship Type', 'Node End', 'Document ID', 'Predicate Type']).agg({'Sentence': self.select_dominant_sentence, 'Score': 'mean'}).reset_index()
+
+            # Convert sentences to embeddings
+            query_embedding = self.sentence_to_embedding(query_sentence)
+            df_top_n_unique['Embedding'] = df_top_n_unique['Sentence'].apply(self.sentence_to_embedding)
+
+            # Calculate cosine similarity and filter
+            df_top_n_unique['Similarity'] = df_top_n_unique['Embedding'].apply(lambda emb: cosine_similarity([query_embedding], [emb])[0][0])
+            df_filtered = df_top_n_unique[df_top_n_unique['Similarity'] >= similarity_threshold]
+
+            # Convert the DataFrame back to a list of tuples for graph construction
+            for _, row in df_filtered.iterrows():
+                output.append(row.to_dict())
+
+        # Build the graph
+        G = nx.MultiDiGraph()
+        for data in output:
+            node_start = data['Node Start']
+            node_end = data['Node End']
+            relationship = data['Relationship Type']
+            # Add edge to graph with all properties
+            G.add_edge(node_start, node_end, label=relationship, **data)
+
+        return output, G
+
+    def write_to_csv(self, output, filename="output.csv"):
+        with open(filename, 'w', newline='') as file:
+            writer = csv.writer(file)
+            # Include headers for new columns
+            writer.writerow(['Node Start', 'Relationship Type', 'Node End', 'Sentence', 'Document ID', 'Predicate Type', 'Score'])
+
+            for data in output:
+                # Unpacking data stored in the dictionary format into the CSV
+                writer.writerow([data['Node Start'], data['Relationship Type'], data['Node End'],
+                                 data['Sentence'], data['Document ID'], data['Predicate Type'], data['Score']])
+
+    def draw_graph(self, G, filename="graph.png", figsize=(12, 12)):
+        plt.figure(figsize=figsize)
+        pos = nx.kamada_kawai_layout(G)  # For a better spread
+
+        # Generate a color map for the edges
+        edge_labels = nx.get_edge_attributes(G, 'label')
+        unique_labels = set(edge_labels.values())
+        color_map = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))  # Using a color map
+        label_color_map = dict(zip(unique_labels, color_map))
+
+        # Draw the nodes and edges with the color map
+        default_color = 'grey'
+        edge_colors = [label_color_map.get(edge_labels.get(edge, ''), default_color) for edge in G.edges()]
+
+        # Draw nodes and edges using the color map
+        nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=500,
+                edge_color=edge_colors, linewidths=1, font_size=15)
+
+        # Draw edge labels
+        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
+
+        # Save the figure
+        plt.savefig(filename)
+
+    @staticmethod
+    def select_dominant_sentence(sentences):
+        # Ensure it's a list if it's not already
+        if isinstance(sentences, pd.Series):
+            sentences = sentences.tolist()
+
+        # Assuming sentences is now a list of sentence strings
+        dominant_sentence = max(sentences, key=len)  # Start with the longest sentence
+
+        for sentence in sentences:
+            if all(sentence not in other or sentence == other for other in sentences):
+                dominant_sentence = max(dominant_sentence, sentence, key=len)
+
+        return dominant_sentence
+
+
+# Configuration for Neo4j
+neo4j_uri = "neo4j+s://6b6151d7.databases.neo4j.io"  # Change this to your Neo4j instance
+neo4j_user = "neo4j"  # Change to your Neo4j username
+neo4j_password = "m0PKWfVRYrhDUSQsTCqOGBYoGQLmN4d4gkTiOV0r8AE"  # Change to your Neo4j password
+
+# Initialize Neo4j connection
+neo4j_conn = Neo4jConnection(neo4j_uri, neo4j_user, neo4j_password)
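+
+# Sample traversal input: each insight pairs a subject-predicate-object
+# "knowledge" string with the tags and source sentence used to seed the subgraph query.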
+input_data = {
+    "session_id": "4709e58df737433e9df0b52927afb795",
+    "query": "What is eagle ford shale reservoir porosity and permeability ?",
+    "insights": [
+        {
+            "document": "Decline curve analysis of shale oil production_ The case of Eagle Ford.docx",
+            "source": "S64",
+            "knowledge": "carbonate-facilitates-hydraulic_fracturing",
+            "sentence": "4.1 geology the shale formation of eagle ford is of the late cretaceous era, roughly 90 million years old. it has a high carbonate content, up to 70%, which makes it brittle and facilitates hydraulic fracturing (texas rrc, 2014). during the cretaceous time the tectonic movements caused the land masses in the south-east, in the direction of the mexican gulf, to be pressed down.",
+            "tags": "carbonate, hydraulic fracturing, facilitates"
+        },
+        {
+            "document": "Decline curve analysis of shale oil production_ The case of Eagle Ford.docx",
+            "source": "S70",
+            "knowledge": "permeability-affects-porosity",
+            "sentence": "the wells are located in two counties in different parts of the eagle ford region and there is big chance other parameters than the api gravity differ between the counties. such parameters could be permeability, porosity, brittleness (ability to induce fractures) and other geological parameters. if the porosity is higher more water will be used in the hydro-fracturing and more of fracturing water would stay in the reservoir.",
+            "tags": "permeability, porosity, affects"
+        },
+        {
+            "document": "Reservoir Pressure Mapping from Well-Test Data_ An Eagle Ford Example.docx",
+            "source": "S86",
+            "knowledge": "eagle_ford-displays-depth",
+            "sentence": "154 reservoir pressure mapping from well-test data: an eagle ford example summary and conclusions we have presented a novel method to estimate bottom-hole pressures from public-domain data readily available from the state of texas rrc. the values that we have calculated for the eagle ford shale display an increase in depth below ground surface similar to other published reports. similar analyses can be undertaken for any productive formation, in any area, in texas or other areas where similar data sets are publicly available.",
+            "tags": "eagle ford shale, depth, display an increase in depth below ground surface"
+        }
+    ]
+}
+
+# Extract tags
+tags = set()
+for insight in input_data["insights"]:
+    tags.update(insight["tags"].split(", "))
+
+print("Tags for graph query:", tags)
+
+
+output_file = "subgraph_output.csv"
+# Every relationship between the tag nodes
+query1 = """
+    MATCH (n)-[r]->(m)
+    WHERE n.name IN $tags AND m.name IN $tags
+    RETURN n, r, m
+    """
+# subgraph_data = neo4j_conn.extract_subgraph_to_csv(query1, tags, output_file)
+# print(subgraph_data)
+
+knowledge_items = [insight['knowledge'] for insight in input_data['insights']]
+nodes = set()
+for knowledge in knowledge_items:
+    nodes.update(neo4j_conn.fetch_triples(knowledge))
+
+print("Nodes for graph query:", nodes)
+subgraph_data, graph = neo4j_conn.explore_connections(list(nodes), query_sentence="What is eagle ford shale reservoir porosity and permeability ?")
+# print(subgraph_data)
+neo4j_conn.write_to_csv(subgraph_data, "my_subgraph_data.csv")
+neo4j_conn.draw_graph(graph)
+
+neo4j_conn.close()
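+
+# Note: the "my_subgraph_data.csv" written above is the input expected by
+# tests/traverser/kge.py, which scores the extracted triples with the
+# text-enhanced KGE model.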