
Commit

updated readme and example (#314)
* updated readme and example

* updated readme index

* updated assertion
ngupta10 authored Jun 29, 2024
1 parent cfa5344 commit 277b445
Showing 10 changed files with 494 additions and 201 deletions.
344 changes: 155 additions & 189 deletions README.md

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -1,4 +1,5 @@
import json
import uuid
from transformers import AutoConfig, AutoTokenizer
import transformers
import time
@@ -298,7 +299,6 @@ async def process_tokens(self, data: IngestedTokens):
        try:
            doc_entity_pairs = []
            doc_source = data.doc_source

            if not BERTLLM.validate_ingested_tokens(data):
                self.set_termination_event()
                return
@@ -311,7 +311,7 @@ async def process_tokens(self, data: IngestedTokens):
            doc_entity_pairs = self._get_entity_pairs(content)
            if not doc_entity_pairs:
                return

            doc_entity_pairs = self._process_entity_types(doc_entity_pairs)
            if not self.entity_context_extractor and not self.predicate_context_extractor:
                pairs_withattn = self.attn_scores_instance.extract_and_append_attention_weights(doc_entity_pairs)
@@ -341,6 +341,7 @@ def _prepare_content(self, data):
        else:
            content = clean_text
        file = data.get_file_path()

        return content, file

    def _get_entity_pairs(self, content):
@@ -414,8 +415,9 @@ async def _process_embedding_triples(self, embedding_triples, file, doc_source):
        for triple in embedding_triples:
            if self.termination_event.is_set():
                return
            event_id = str(uuid.uuid4())

            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple, event_id=event_id))
            if graph_json:
                current_state = EventState(
                    event_type=EventType.Graph,
@@ -436,7 +438,7 @@ async def _process_embedding_triples(self, embedding_triples, file, doc_source):
                base_weights=[predicate_score, predicate_score, 3],
                normalize_weights=True  # Normalize weights to ensure they sum to 1
            )
            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple=triple, embeddings=final_emb))
            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple=triple, embeddings=final_emb, event_id=event_id))
            if vector_json:
                current_state = EventState(
                    event_type=EventType.Vector,
@@ -172,9 +172,10 @@ def process_tokens(ner_instance : NER_LLM, extractor, filtered_triples, nlp_model
    updated_triples = []
    for subject, predicate_metadata, object in filtered_triples:
        try:
            context = predicate_metadata['current_sentence'].replace("\n", " ")
            context = predicate_metadata['current_sentence'].replace("\n", " ").lower()
            head_positions = ner_instance.find_subword_indices(context, predicate_metadata['entity1_nn_chunk'])
            tail_positions = ner_instance.find_subword_indices(context, predicate_metadata['entity2_nn_chunk'])

            if head_positions[0][0] > tail_positions[0][0]:
                head_entity = {'entity': object, 'noun_chunk': predicate_metadata['entity2_nn_chunk'], 'entity_label': predicate_metadata['entity2_label']}
                tail_entity = {'entity': subject, 'noun_chunk': predicate_metadata['entity1_nn_chunk'], 'entity_label': predicate_metadata['entity1_label']}
@@ -188,20 +189,18 @@ def process_tokens(ner_instance : NER_LLM, extractor, filtered_triples, nlp_model
            attention_matrix = extractor.inference_attention(model_input)
            token_idx_with_word = ner_instance.tokenize_sentence_with_positions(context)
            spacy_doc = nlp_model(context)
            filter = IndividualFilter(True, 0.02, token_idx_with_word, spacy_doc)

            filter = IndividualFilter(True, 0.01, token_idx_with_word, spacy_doc)
            ## HEAD Entity Based Attention Search
            candidate_paths = perform_search(entity_pair.head_entity['start_idx'], attention_matrix, entity_pair, search_candidates=5, require_contiguous=True, max_relation_length=8, num_initial_tokens=extractor.num_start_tokens())
            candidate_paths = remove_duplicates(candidate_paths)
            filtered_results = filter.filter(candidates=candidate_paths, e_pair=entity_pair)
            predicate_he, score_he = get_best_relation(filtered_results)

            ## TAIL Entity Based Attention Search
            candidate_paths = perform_search(entity_pair.tail_entity['start_idx'], attention_matrix, entity_pair, search_candidates=5, require_contiguous=True, max_relation_length=8, num_initial_tokens=extractor.num_start_tokens())
            candidate_paths = remove_duplicates(candidate_paths)
            filtered_results = filter.filter(candidates=candidate_paths, e_pair=entity_pair)
            predicate_te, score_te = get_best_relation(filtered_results)

            if score_he > score_te and (score_he >= 0.1 or score_te >= 0.1):
                triple = create_semantic_triple(head_entity=head_entity['noun_chunk'],
                                                tail_entity=tail_entity['noun_chunk'],
6 changes: 4 additions & 2 deletions querent/kg/rel_helperfunctions/triple_to_json.py
@@ -31,14 +31,15 @@ def _parse_json_str(json_str):
            raise ValueError(f"Error decoding JSON: {e}")

    @staticmethod
    def convert_graphjson(triple):
    def convert_graphjson(triple, event_id=None):
        try:
            subject, json_str, object_ = triple
            predicate_info = TripleToJsonConverter._parse_json_str(json_str)
            if predicate_info is None:
                return {}

            json_object = {
                "event_id": event_id,
                "subject": TripleToJsonConverter._normalize_text(subject, replace_space=True),
                "subject_type": TripleToJsonConverter._normalize_text(predicate_info.get("subject_type", "Unlabeled"), replace_space=True),
                "object": TripleToJsonConverter._normalize_text(object_, replace_space=True),
@@ -67,7 +68,7 @@ def dynamic_weighted_average_embeddings(embeddings, base_weights, normalize_weights
        return weighted_sum

    @staticmethod
    def convert_vectorjson(triple, blob=None, embeddings=None):
    def convert_vectorjson(triple, blob=None, embeddings=None, event_id=None):
        try:
            subject, json_str, object_ = triple
            data = TripleToJsonConverter._parse_json_str(json_str)
@@ -76,6 +77,7 @@ def convert_vectorjson(triple, blob = None, embeddings=None):

            id_format = f"{TripleToJsonConverter._normalize_text(subject, replace_space=True)}-{TripleToJsonConverter._normalize_text(data.get('predicate', ''), replace_space=True)}-{TripleToJsonConverter._normalize_text(object_, replace_space=True)}"
            json_object = {
                "event_id": event_id,
                "id": TripleToJsonConverter._normalize_text(id_format),
                "embeddings": embeddings.tolist(),
                "size": len(embeddings.tolist()),
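The new `event_id` parameter lets the graph payload and the vector payload built from the same triple carry one shared identifier, so the two rows can be joined later (the tutorial below stores both in Postgres). A minimal sketch of the calling pattern; the triple contents and embedding size here are made up for illustration, only the converter signatures come from this diff:

```python
import json
import uuid
import numpy as np

from querent.kg.rel_helperfunctions.triple_to_json import TripleToJsonConverter

# Hypothetical triple: (subject, predicate-metadata JSON string, object).
# Only the "predicate" and "subject_type" keys are visible in this diff;
# the other metadata fields are assumed for illustration.
triple = (
    "coach",
    json.dumps({"predicate": "trains", "subject_type": "person", "object_type": "person"}),
    "athletes",
)
embedding = np.random.rand(384)  # assumed embedding dimension

event_id = str(uuid.uuid4())  # one id shared by both payloads
graph_payload = TripleToJsonConverter.convert_graphjson(triple, event_id=event_id)
vector_payload = TripleToJsonConverter.convert_vectorjson(
    triple=triple, embeddings=embedding, event_id=event_id
)
# Both payloads now carry the same "event_id" key, which the pipeline
# serialises with json.dumps before emitting Graph and Vector events.
```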
Binary file added tests/data/readme_assets/example.pdf
Empty file added tests/tutorial/__init__.py
23 changes: 23 additions & 0 deletions tests/tutorial/docker-compose.yaml
@@ -0,0 +1,23 @@
version: '3'
services:
  postgres:
    image: pgvector/pgvector:pg16
    environment:
      - POSTGRES_USER=querent
      - POSTGRES_PASSWORD=querent
      - POSTGRES_DB=querent_test
    volumes:
      - ./quester/storage/sql/:/docker-entrypoint-initdb.d
    ports:
      - "5432:5432"
    networks:
      - querent
    healthcheck:
      # CMD-SHELL takes a single shell string; separate args would be ignored
      test: ["CMD-SHELL", "pg_isready -d querent_test"]
      interval: 30s
      timeout: 60s
      retries: 5
      start_period: 80s

networks:
  querent:
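Before running the tutorial script, the pgvector-enabled Postgres service above has to be up and reachable. A quick connectivity check, assuming `psycopg2` is installed and the compose stack is running locally:

```python
import psycopg2

# Credentials match the docker-compose service above.
conn = psycopg2.connect(
    dbname="querent_test",
    user="querent",
    password="querent",
    host="localhost",
    port="5432",
)
with conn.cursor() as cur:
    cur.execute("SELECT version();")
    print(cur.fetchone()[0])
    # The pgvector image ships the extension; it still has to be enabled once per database.
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
conn.commit()
conn.close()
```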
102 changes: 102 additions & 0 deletions tests/tutorial/example_fixed_entities.py
@@ -0,0 +1,102 @@
import asyncio
from asyncio import Queue
import json
from pathlib import Path
from querent.callback.event_callback_interface import EventCallbackInterface
from querent.collectors.fs.fs_collector import FSCollectorFactory
from querent.common.types.querent_event import EventState, EventType
from querent.config.collector.collector_config import FSCollectorConfig
from querent.common.uri import Uri
from querent.config.core.llm_config import LLM_Config
from querent.ingestors.ingestor_manager import IngestorFactoryManager
import uuid
import numpy as np
from querent.core.transformers.bert_ner_opensourcellm import BERTLLM
from querent.querent.resource_manager import ResourceManager
from querent.querent.querent import Querent
from postgres_utility import DatabaseManager

async def ingest_all_async():
    # Postgres started via tests/tutorial/docker-compose.yaml
    db_manager = DatabaseManager(
        dbname="querent_test",
        user="querent",
        password="querent",
        host="localhost",
        port="5432"
    )

    db_manager.connect_db()
    db_manager.create_tables()
    directories = ["/home/nishantg/querent-main/querent/tests/data/readme_assets"]
    collectors = [
        FSCollectorFactory().resolve(
            Uri("file://" + str(Path(directory).resolve())),
            FSCollectorConfig(config_source={
                "id": str(uuid.uuid4()),
                "root_path": directory,
                "name": "Local-config",
                "config": {},
                "backend": "localfile",
                "uri": "file://",
            }),
        )
        for directory in directories
    ]

    result_queue = asyncio.Queue()

    ingestor_factory_manager = IngestorFactoryManager(
        collectors=collectors, result_queue=result_queue
    )
    ingest_task = asyncio.create_task(ingestor_factory_manager.ingest_all_async())
    resource_manager = ResourceManager()
    # fixed_entities are the surface forms to confine the search to;
    # sample_entities appear to supply the matching entity type for each, paired by position.
    bert_llm_config = LLM_Config(
        # ner_model_name="English",
        rel_model_type="bert",
        rel_model_path='bert-base-uncased',
        fixed_entities=[
            "university", "greenwood", "liam zheng", "department", "Metroville",
            "Emily Stanton", "Coach", "health", "training", "athletes"
        ],
        sample_entities=[
            "organization", "organization", "person", "department", "city",
            "person", "person", "method", "method", "person"
        ],
        is_confined_search=True
    )
    llm_instance = BERTLLM(result_queue, bert_llm_config)

    # Persist every Graph and Vector event emitted by the LLM into Postgres.
    class StateChangeCallback(EventCallbackInterface):
        def handle_event(self, event_type: EventType, event_state: EventState):
            if event_state['event_type'] == EventType.Graph:
                triple = json.loads(event_state['payload'])
                db_manager.insert_metadata(
                    event_id=triple['event_id'],
                    subject=triple['subject'],
                    subject_type=triple['subject_type'],
                    predicate=triple['predicate'],
                    object=triple['object'],
                    object_type=triple['object_type'],
                    sentence=triple['sentence'],
                    file=event_state['file'],
                    doc_source=event_state['doc_source'],
                    score=triple['score']
                )
            elif event_state['event_type'] == EventType.Vector:
                triple_v = json.loads(event_state['payload'])
                db_manager.insert_embedding(
                    event_id=triple_v['event_id'],
                    embeddings=triple_v['embeddings'],
                )

    llm_instance.subscribe(EventType.Graph, StateChangeCallback())
    llm_instance.subscribe(EventType.Vector, StateChangeCallback())
    querent = Querent(
        [llm_instance],
        resource_manager=resource_manager,
    )
    querent_task = asyncio.create_task(querent.start())
    await asyncio.gather(ingest_task, querent_task)

if __name__ == "__main__":
    asyncio.run(ingest_all_async())
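The script imports `DatabaseManager` from a local `postgres_utility` module that is not rendered above. A hypothetical minimal stand-in that satisfies the calls made in the example (`connect_db`, `create_tables`, `insert_metadata`, `insert_embedding`), sketched with `psycopg2` and a pgvector column; this is an assumption of what that helper might look like, not the shipped module:

```python
# postgres_utility.py -- hypothetical minimal helper for the tutorial, not the real module
import json
import psycopg2


class DatabaseManager:
    def __init__(self, dbname, user, password, host, port):
        self.params = dict(dbname=dbname, user=user, password=password, host=host, port=port)
        self.conn = None

    def connect_db(self):
        self.conn = psycopg2.connect(**self.params)
        self.conn.autocommit = True

    def create_tables(self):
        with self.conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS metadata (
                    event_id TEXT PRIMARY KEY,
                    subject TEXT, subject_type TEXT,
                    predicate TEXT,
                    object TEXT, object_type TEXT,
                    sentence TEXT, file TEXT, doc_source TEXT,
                    score DOUBLE PRECISION
                );
            """)
            cur.execute("""
                CREATE TABLE IF NOT EXISTS embeddings (
                    event_id TEXT PRIMARY KEY,
                    embedding vector
                );
            """)

    def insert_metadata(self, event_id, subject, subject_type, predicate,
                        object, object_type, sentence, file, doc_source, score):
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO metadata VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "
                "ON CONFLICT (event_id) DO NOTHING",
                (event_id, subject, subject_type, predicate, object,
                 object_type, sentence, file, doc_source, score),
            )

    def insert_embedding(self, event_id, embeddings):
        with self.conn.cursor() as cur:
            # pgvector accepts a '[x, y, ...]' literal cast to vector
            cur.execute(
                "INSERT INTO embeddings VALUES (%s, %s::vector) "
                "ON CONFLICT (event_id) DO NOTHING",
                (event_id, json.dumps(list(embeddings))),
            )
```

With the compose stack running, the tutorial can then be launched with `python example_fixed_entities.py` from `tests/tutorial/`, adjusting the hard-coded `directories` path to a local copy of `tests/data/readme_assets` as needed.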

